├── _pkgdown.yml ├── LICENSE ├── .gitignore ├── tests ├── testthat.R └── testthat │ ├── test_step_select_mrmr.R │ ├── test_step_select_infgain.R │ ├── test_step_select_linear.R │ ├── test_step_select_tree.R │ ├── test_step_select_forests.R │ ├── test_step_select_boruta.R │ └── test_step_select_vip.R ├── docs ├── reference │ ├── Rplot001.png │ ├── pipe.html │ ├── top_p.html │ └── index.html ├── pkgdown.yml ├── link.svg ├── sitemap.xml ├── bootstrap-toc.css ├── docsearch.js ├── pkgdown.js ├── LICENSE-text.html ├── 404.html ├── bootstrap-toc.js ├── authors.html ├── LICENSE.html └── pkgdown.css ├── .Rbuildignore ├── R ├── imports.R ├── utils-pipe.R ├── parameters.R ├── recipeselectors.R ├── misc.R ├── step_select_boruta.R ├── step_select_mrmr.R ├── step_select_vip.R ├── step_select_roc.R ├── step_select_carscore.R ├── step_select_xtab.R ├── step_select_infgain.R ├── step_select_tree.R ├── step_select_forests.R └── pull_importances.R ├── man ├── pipe.Rd ├── top_p.Rd ├── recipeselectors.Rd ├── pull_importances.Rd ├── step_select_boruta.Rd ├── step_select_mrmr.Rd ├── step_select_roc.Rd ├── step_select_xtab.Rd ├── step_select_vip.Rd ├── step_select_carscore.Rd ├── step_select_tree.Rd ├── step_select_infgain.Rd ├── step_select_linear.Rd └── step_select_forests.Rd ├── recipeselectors.Rproj ├── LICENSE.md ├── DESCRIPTION ├── NAMESPACE └── README.md /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | destination: docs 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Steven Pawley 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | 
-------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipeselectors) 3 | 4 | test_check("recipeselectors") 5 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevenpawley/recipeselectors/HEAD/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.17.1.1 2 | pkgdown: 2.0.2 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2022-03-23T20:12Z 6 | 7 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | ^\.travis\.yml$ 6 | ^codecov\.yml$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @importFrom tibble tibble as_tibble 3 | #' @importFrom recipes prep bake 4 | #' @importFrom generics tidy 5 | #' @importFrom tune tunable 6 | ## usethis namespace: end 7 | NULL 8 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 
4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /recipeselectors.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_mrmr.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | data("iris") 5 | 6 | test_that("step_select_mrmr, execution", { 7 | skip_if_not_installed("praznik") 8 | 9 | irisX <- iris[-5] 10 | y <- iris$Species 11 | 12 | res <- praznik::MRMR(X = irisX, Y = y, k = 4) 13 | 14 | mrmr_scores <- tibble( 15 | variable = names(res$score), 16 | scores = res$score 17 | ) 18 | 19 | rec <- recipe(Species ~ ., data = iris) 20 | 21 | 
mrmr_rec <- rec %>% 22 | step_select_mrmr(all_predictors(), outcome = "Species", top_p = 2) %>% 23 | prep() 24 | 25 | mrmr_pred <- juice(mrmr_rec) 26 | expect_true(all(names(mrmr_pred)[1:2] %in% mrmr_scores$variable[1:2])) 27 | 28 | expect_equal(mrmr_scores$scores, mrmr_rec$steps[[1]]$scores$score) 29 | }) 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_infgain.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | data("iris") 5 | 6 | test_that("step_select_infgain, execution", { 7 | skip_if_not_installed("FSelectorRcpp") 8 | 9 | irisX <- iris[-5] 10 | y <- iris$Species 11 | 12 | ig_scores <- as_tibble(FSelectorRcpp::information_gain(x = irisX, y = y)) 13 | ig_scores <- ig_scores[order(ig_scores$importance), ] 14 | ig_scores$importance <- rlang::set_names(ig_scores$importance, ig_scores$attributes) 15 | ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ] 16 | 17 | rec <- recipe(Species ~ ., data = iris) 18 | 19 | ig_rec <- rec %>% 20 | step_select_infgain( 21 | all_predictors(), outcome = "Species", type = "infogain", top_p = 2) %>% 22 | prep() 23 | 24 | ig_pred <- juice(ig_rec) 25 | expect_true(all(names(ig_pred)[1:2] %in% ig_scores$attributes[1:2])) 26 | }) 27 | 28 | 29 | -------------------------------------------------------------------------------- /man/top_p.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{top_p} 4 | \alias{top_p} 5 | \title{Parameter functions for feature selection recipes} 6 | \usage{ 7 | 
top_p(range = c(1L, 4L), trans = NULL) 8 | } 9 | \arguments{ 10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and 11 | largest possible values, respectively.} 12 | 13 | \item{trans}{A `trans` object from the `scales` package, such as 14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 15 | the default is used which matches the units used in `range`. If no 16 | transformation, `NULL`.} 17 | } 18 | \value{ 19 | A function with classes "quant_param" and "param" 20 | } 21 | \description{ 22 | Feature selection recipes allow the top-performing features to be selected 23 | using two parameters. `top_p` is for specifying the number of the 24 | top-performing features. 25 | } 26 | \examples{ 27 | top_p(c(3, 10)) 28 | } 29 | -------------------------------------------------------------------------------- /R/parameters.R: -------------------------------------------------------------------------------- 1 | #' Parameter functions for feature selection recipes 2 | #' 3 | #' Feature selection recipes allow the top-performing features to be selected 4 | #' using two parameters. `top_p` is for specifying the number of the 5 | #' top-performing features. 6 | #' 7 | #' @param range A two-element vector holding the _defaults_ for the smallest and 8 | #' largest possible values, respectively. 9 | #' @param trans A `trans` object from the `scales` package, such as 10 | #' `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 11 | #' the default is used which matches the units used in `range`. If no 12 | #' transformation, `NULL`. 
13 | #' 14 | #' @return A function with classes "quant_param" and "param" 15 | #' @export 16 | #' 17 | #' @examples 18 | #' top_p(c(3, 10)) 19 | top_p <- function(range = c(1L, 4L), trans = NULL) { 20 | dials::new_quant_param( 21 | type = "integer", 22 | range = range, 23 | inclusive = c(TRUE, TRUE), 24 | trans = trans, 25 | label = c(top_p = "# Selected Predictors"), 26 | finalize = dials::get_p 27 | ) 28 | } 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Steven Pawley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: recipeselectors 2 | Type: Package 3 | Title: Extra Recipes Steps for Supervised Feature Selection 4 | Version: 0.0.1 5 | Authors@R: 6 | person(given = "Steven", 7 | family = "Pawley", 8 | role = c("aut", "cre"), 9 | email = "dr.stevenpawley@gmail.com") 10 | Maintainer: Steven Pawley 11 | Description: Provides additional steps for supervised feature selection to be 12 | used with the 'recipes' package. 13 | License: MIT + file LICENSE 14 | Encoding: UTF-8 15 | LazyData: true 16 | URL: https://github.com/stevenpawley/recipeselectors 17 | BugReports: https://github.com/stevenpawley/recipeselectors/issues 18 | Depends: 19 | R (>= 2.10), 20 | recipes 21 | Imports: 22 | generics, 23 | tibble, 24 | parsnip, 25 | tune, 26 | dials, 27 | purrr, 28 | rlang (>= 0.1.2), 29 | magrittr, 30 | dplyr, 31 | scales, 32 | pROC, 33 | stats 34 | RoxygenNote: 7.1.2 35 | Suggests: 36 | testthat, 37 | roxygen2, 38 | FSelectorRcpp, 39 | praznik, 40 | ranger, 41 | Boruta, 42 | care, 43 | modeldata, 44 | covr 45 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_linear.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | library(modeldata) 6 | 7 | data("cells") 8 | 9 | test_that("step_select_linear, execution using top_p on binary case", { 10 | rec <- cells %>% 11 | select(-case) %>% 12 | recipe(class ~ .) 
%>% 13 | step_normalize(all_numeric_predictors()) %>% 14 | step_select_linear( 15 | all_predictors(), 16 | outcome = "class", 17 | top_p = 2 18 | ) 19 | 20 | prepped <- prep(rec) 21 | selected <- bake(prepped, new_data = NULL) 22 | 23 | expect_length(names(selected), 3) 24 | }) 25 | 26 | 27 | test_that("step_select_linear, execution using threshold on binary case", { 28 | # test selection by retaining features with scores >= 50th percentile 29 | rec <- cells %>% 30 | select(-case) %>% 31 | recipe(class ~ .) %>% 32 | step_normalize(all_numeric_predictors()) %>% 33 | step_select_linear( 34 | all_predictors(), 35 | outcome = "class", 36 | threshold = 0.99 37 | ) 38 | 39 | prepped <- prep(rec) 40 | selected <- juice(prepped) 41 | 42 | expect_length(names(selected), 2) 43 | }) 44 | -------------------------------------------------------------------------------- /R/recipeselectors.R: -------------------------------------------------------------------------------- 1 | #' recipeselectors: A collection of steps for feature selection to use with the 2 | #' 'recipes' package 3 | #' 4 | #' \pkg{recipeselectors} provides a collection of additional step objects 5 | #' related to feature selection to be used with the 'recipes' package. 6 | #' 7 | #' @examples 8 | #' library(parsnip) 9 | #' library(recipes) 10 | #' library(magrittr) 11 | #' 12 | #' # load the example iris dataset 13 | #' data(iris) 14 | #' 15 | #' # define a base model to use for feature importances 16 | #' base_model <- rand_forest(mode = "classification") %>% 17 | #' set_engine("ranger", importance = "permutation") 18 | #' 19 | #' # create a preprocessing recipe 20 | #' rec <- iris %>% 21 | #' recipe(Species ~ .) 
%>% 22 | #' step_select_vip(all_predictors(), model = base_model, top_p = 2, 23 | #' outcome = "Species") 24 | #' 25 | #' prepped <- prep(rec) 26 | #' 27 | #' # create a model specification 28 | #' clf <- decision_tree(mode = "classification") %>% 29 | #' set_engine("rpart") 30 | #' 31 | #' clf_fitted <- clf %>% 32 | #' fit(Species ~ ., juice(prepped)) 33 | #' 34 | #' @author Steven Pawley, \email{dr.stevenpawley@@gmail.com} 35 | 36 | #' @docType package 37 | #' @name recipeselectors 38 | NULL 39 | -------------------------------------------------------------------------------- /man/recipeselectors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/recipeselectors.R 3 | \docType{package} 4 | \name{recipeselectors} 5 | \alias{recipeselectors} 6 | \title{recipeselectors: A collection of steps for feature selection to use with the 7 | 'recipes' package} 8 | \description{ 9 | \pkg{recipeselectors} provides a collection of additional step objects 10 | related to feature selection to be used with the 'recipes' package. 11 | } 12 | \examples{ 13 | library(parsnip) 14 | library(recipes) 15 | library(magrittr) 16 | 17 | # load the example iris dataset 18 | data(iris) 19 | 20 | # define a base model to use for feature importances 21 | base_model <- rand_forest(mode = "classification") \%>\% 22 | set_engine("ranger", importance = "permutation") 23 | 24 | # create a preprocessing recipe 25 | rec <- iris \%>\% 26 | recipe(Species ~ .) 
\%>\% 27 | step_select_vip(all_predictors(), model = base_model, top_p = 2, 28 | outcome = "Species") 29 | 30 | prepped <- prep(rec) 31 | 32 | # create a model specification 33 | clf <- decision_tree(mode = "classification") \%>\% 34 | set_engine("rpart") 35 | 36 | clf_fitted <- clf \%>\% 37 | fit(Species ~ ., juice(prepped)) 38 | 39 | } 40 | \author{ 41 | Steven Pawley, \email{dr.stevenpawley@gmail.com} 42 | } 43 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_tree.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | data("iris") 6 | 7 | test_that("step_select_tree, execution using top_p", { 8 | skip_if_not_installed("rpart") 9 | 10 | irisX <- iris[-5] 11 | y <- iris$Species 12 | 13 | rec <- iris %>% 14 | recipe(Species ~.) %>% 15 | step_select_tree( 16 | all_predictors(), 17 | outcome = "Species", 18 | engine = "rpart", 19 | top_p = 2 20 | ) 21 | 22 | prepped <- prep(rec) 23 | selected <- juice(prepped) 24 | 25 | expect_length(names(selected), 3) 26 | }) 27 | 28 | 29 | test_that("step_select_tree, execution using threshold", { 30 | skip_if_not_installed("rpart") 31 | 32 | irisX <- iris[-5] 33 | y <- iris$Species 34 | 35 | # test selection by retaining features with scores >= 50th percentile 36 | rec <- iris %>% 37 | recipe(Species ~.) %>% 38 | step_select_tree( 39 | all_predictors(), 40 | outcome = "Species", 41 | threshold = 0.5 42 | ) 43 | 44 | prepped <- prep(rec) 45 | selected <- juice(prepped) 46 | 47 | expect_length(names(selected), 3) 48 | 49 | # test selection by retaining features with scores in 90th percentile 50 | rec <- iris %>% 51 | recipe(Species ~.) 
%>% 52 | step_select_tree( 53 | all_predictors(), 54 | outcome = "Species", 55 | threshold = 0.9 56 | ) 57 | 58 | prepped <- prep(rec) 59 | selected <- juice(prepped) 60 | 61 | expect_length(names(selected), 2) 62 | }) 63 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_forests.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | 6 | data("iris") 7 | 8 | test_that("step_select_forests, execution using top_p", { 9 | skip_if_not_installed("ranger") 10 | 11 | rec <- iris %>% 12 | recipe(Species ~.) %>% 13 | step_select_forests( 14 | all_predictors(), 15 | outcome = "Species", 16 | engine = "ranger", 17 | top_p = 2 18 | ) 19 | 20 | prepped <- prep(rec) 21 | tidy(rec, number = 1) 22 | selected <- juice(prepped) 23 | 24 | expect_length(names(selected), 3) 25 | }) 26 | 27 | 28 | test_that("step_select_forests, execution using threshold", { 29 | skip_if_not_installed("ranger") 30 | 31 | irisX <- iris[-5] 32 | y <- iris$Species 33 | 34 | # test selection by retaining features with scores >= 50th percentile 35 | rec <- iris %>% 36 | recipe(Species ~.) %>% 37 | step_select_forests( 38 | all_predictors(), 39 | outcome = "Species", 40 | threshold = 0.5 41 | ) 42 | 43 | prepped <- prep(rec) 44 | selected <- juice(prepped) 45 | 46 | expect_length(names(selected), 3) 47 | 48 | # test selection by retaining features with scores in 90th percentile 49 | rec <- iris %>% 50 | recipe(Species ~.) 
%>% 51 | step_select_forests( 52 | all_predictors(), 53 | outcome = "Species", 54 | threshold = 0.9 55 | ) 56 | 57 | prepped <- prep(rec) 58 | selected <- juice(prepped) 59 | 60 | expect_length(names(selected), 2) 61 | }) 62 | 63 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_boruta.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(modeldata) 5 | 6 | data("lending_club") 7 | 8 | test_that("step_select_boruta, execution", { 9 | skip_if_not_installed("Boruta") 10 | 11 | # Boruta model results 12 | set.seed(1234) 13 | boruta_mod <- Boruta::Boruta( 14 | x = lending_club[, -23], 15 | y = lending_club$Class 16 | ) 17 | excluded <- names( 18 | boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"] 19 | ) 20 | 21 | # step_select_boruta results 22 | rec <- recipe(Class ~ ., data = lending_club) %>% 23 | step_select_boruta(all_predictors(), outcome = "Class") 24 | set.seed(1234) 25 | prepped <- rec %>% prep() 26 | 27 | # check 28 | expect_equal(excluded, prepped$steps[[1]]$exclude) 29 | expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory) 30 | }) 31 | 32 | 33 | test_that("step_select_boruta, options", { 34 | skip_if_not_installed("Boruta") 35 | 36 | # Boruta model results 37 | set.seed(1234) 38 | boruta_mod <- Boruta::Boruta( 39 | x = lending_club[, -23], 40 | y = lending_club$Class, 41 | getImp = Boruta::getImpRfGini 42 | ) 43 | excluded <- names( 44 | boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"] 45 | ) 46 | 47 | # step_select_boruta results 48 | rec <- recipe(Class ~ ., data = lending_club) %>% 49 | step_select_boruta(all_predictors(), outcome = "Class", 50 | options = list(getImp = Boruta::getImpRfGini)) 51 | set.seed(1234) 52 | prepped <- rec %>% prep() 53 | 54 | # check 55 | expect_equal(excluded, prepped$steps[[1]]$exclude) 56 | 
expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory) 57 | }) 58 | 59 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_vip.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | data("iris") 6 | 7 | test_that("step_select_vip, execution using top_p", { 8 | skip_if_not_installed("ranger") 9 | 10 | irisX <- iris[-5] 11 | y <- iris$Species 12 | 13 | base_model <- rand_forest(mode = "classification") %>% 14 | set_engine("ranger", importance = "permutation") 15 | 16 | rec <- iris %>% 17 | recipe(Species ~.) %>% 18 | step_select_vip( 19 | all_predictors(), 20 | outcome = "Species", 21 | model = base_model, 22 | top_p = 2 23 | ) 24 | 25 | prepped <- prep(rec) 26 | selected <- juice(prepped) 27 | 28 | expect_length(names(selected), 3) 29 | }) 30 | 31 | 32 | test_that("step_select_vip, execution using threshold", { 33 | skip_if_not_installed("ranger") 34 | 35 | irisX <- iris[-5] 36 | y <- iris$Species 37 | 38 | base_model <- rand_forest(mode = "classification") %>% 39 | set_engine("ranger", importance = "permutation") 40 | 41 | # test selection by retaining features with scores >= 50th percentile 42 | rec <- iris %>% 43 | recipe(Species ~.) %>% 44 | step_select_vip( 45 | all_predictors(), 46 | outcome = "Species", 47 | model = base_model, 48 | threshold = 0.5 49 | ) 50 | 51 | prepped <- prep(rec) 52 | selected <- juice(prepped) 53 | 54 | expect_length(names(selected), 3) 55 | 56 | # test selection by retaining features with scores in 90th percentile 57 | rec <- iris %>% 58 | recipe(Species ~.) 
%>% 59 | step_select_vip( 60 | all_predictors(), 61 | outcome = "Species", 62 | model = base_model, 63 | threshold = 0.9 64 | ) 65 | 66 | prepped <- prep(rec) 67 | selected <- juice(prepped) 68 | 69 | expect_length(names(selected), 2) 70 | }) 71 | 72 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /404.html 5 | 6 | 7 | /LICENSE-text.html 8 | 9 | 10 | /LICENSE.html 11 | 12 | 13 | /authors.html 14 | 15 | 16 | /index.html 17 | 18 | 19 | /reference/index.html 20 | 21 | 22 | /reference/pipe.html 23 | 24 | 25 | /reference/pull_importances.html 26 | 27 | 28 | /reference/recipeselectors.html 29 | 30 | 31 | /reference/step_boruta.html 32 | 33 | 34 | /reference/step_idw.html 35 | 36 | 37 | /reference/step_importance.html 38 | 39 | 40 | /reference/step_infgain.html 41 | 42 | 43 | /reference/step_mrmr.html 44 | 45 | 46 | /reference/step_select_boruta.html 47 | 48 | 49 | /reference/step_select_carscore.html 50 | 51 | 52 | /reference/step_select_forests.html 53 | 54 | 55 | /reference/step_select_infgain.html 56 | 57 | 58 | /reference/step_select_linear.html 59 | 60 | 61 | /reference/step_select_mrmr.html 62 | 63 | 64 | /reference/step_select_roc.html 65 | 66 | 67 | /reference/step_select_tree.html 68 | 69 | 70 | /reference/step_select_vip.html 71 | 72 | 73 | /reference/step_select_xtab.html 74 | 75 | 76 | /reference/tidyeval.html 77 | 78 | 79 | /reference/top_p.html 80 | 81 | 82 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | 
nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /man/pull_importances.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pull_importances.R 3 | \name{pull_importances} 4 | \alias{pull_importances} 5 | \title{Pull feature importances from a parsnip fitted model} 6 | \usage{ 7 | pull_importances(object, scaled = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{A `model_fit` object.} 11 | 12 | \item{scaled}{A logical indicating whether to rescale the importances between 13 | 0 and 1. Default is TRUE.} 14 | 15 | \item{...}{A list of other parameters passed to the feature importance 16 | method.} 17 | } 18 | \value{ 19 | tibble 20 | } 21 | \description{ 22 | `pull_importances` is a generic function to extract feature importance scores 23 | or coefficients from a parsnip `model_fit` object and return them as a tibble 24 | with a 'feature' and 'importance' column. This is designed to support the 25 | `step_importance` recipe step. 26 | } 27 | \details{ 28 | Most of the basic models within the parsnip package that support feature 29 | importances are implemented (call `methods(pull_importances)` to list models 30 | that are currently implemented). 
If you need to pull the feature importance scores 31 | from a model that is not currently supported in this package, then you can 32 | add a method to the pull_importances generic function that returns a 33 | two-column tibble: 34 | } 35 | \examples{ 36 | library(parsnip) 37 | 38 | # pull feature importances from a model_fit object 39 | model <- boost_tree(mode = "classification") \%>\% 40 | set_engine("xgboost") 41 | model_fit <- model \%>\% fit(Species ~., iris) 42 | pull_importances(model_fit) 43 | 44 | # create a new pull_importances method 45 | pull_importances._ranger <- function(object, scaled = FALSE, ...) { 46 | # create a call to the ranger::importance function avoiding having to use 47 | # ranger as a dependency 48 | call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit) 49 | scores <- rlang::eval_tidy(call) 50 | 51 | # create a tibble with 'feature' and 'importance' columns 52 | scores <- tibble::tibble( 53 | feature = names(scores), 54 | importance = as.numeric(scores) 55 | ) 56 | # optionally rescale the importance scores 57 | if (isTRUE(scaled)) 58 | scores$importance <- rescale(scores$importance) 59 | 60 | scores 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. 
"?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
# Internal argument checks and filtering helpers shared by the
# `step_select_*()` steps.

# Validate a `threshold` argument.
#
# `NA` is returned untouched. Any other value must be numeric and lie strictly
# inside (0, 1); otherwise an error is raised via `rlang::abort()`.
# Returns `x` unchanged when it is valid.
check_zero_one <- function(x) {
  if (is.na(x)) {
    return(x)
  }
  if (!is.numeric(x)) {
    rlang::abort("`threshold` should be numeric.")
  }
  # Scalar condition: use the short-circuiting `||`, not elementwise `|`.
  if (x >= 1 || x <= 0) {
    rlang::abort("`threshold` should be on (0, 1).")
  }
  x
}

# Validate a `top_p` argument against `n`, the number of candidate features.
#
# `NA` is returned untouched. Non-numeric input is an error. Numeric input is
# coerced to integer; values outside (0, n) trigger a warning and are clamped
# into [1, n - 1]. The lower bound matters: previously only the upper bound was
# applied, so a zero or negative `top_p` was warned about but passed through
# unchanged. When `n` is 1 the result is 1.
# Returns an integer, or the original `NA`.
check_top_p <- function(x, n) {
  if (is.na(x)) {
    return(x)
  }
  if (!is.numeric(x)) {
    rlang::abort("`top_p` should be numeric.")
  }
  x <- as.integer(x)
  if (x >= n || x <= 0) {
    rlang::warn(paste0("`top_p` should be on (0, ", n, ")."))
    x <- as.integer(max(1, min(n - 1, x)))
  }
  x
}

# Abort when neither selection criterion was supplied.
#
# `cl[[1]]` is pasted into the error message as the name of the calling step
# (presumably `cl` is the step's matched call -- confirm against callers).
# Returns `NULL` invisibly when at least one criterion is present.
check_criteria <- function(top_p, threshold, cl) {
  if (is.na(top_p) && is.na(threshold)) {
    msg <- paste0(
      "For `",
      cl[[1]],
      "`, `top_p` and `threshold` cannot both be missing."
    )
    rlang::abort(msg)
  }
  invisible(NULL)
}

# Decide which entries of a named score vector to remove, combining a rank
# criterion (`top_p`) and an absolute score criterion (`threshold`) with "or".
#
# x         Named numeric vector of scores.
# top_p     Number of best-ranked entries to keep, or `NA` to ignore.
# threshold Score cut-off (kept when `>=` if `maximize`, `<=` otherwise), or
#           `NA` to ignore.
# maximize  `TRUE` when larger scores are better.
#
# Returns the names to remove: entries failing both criteria (in ranked
# order), followed by every entry whose score is `NA`.
dual_filter <- function(x, top_p, threshold, maximize) {
  missing_scores <- x[is.na(x)]
  ranked <- sort(x[!is.na(x)])
  if (maximize) {
    ranked <- rev(ranked)
  }
  n_ranked <- length(ranked)

  if (is.na(top_p)) {
    keep_by_rank <- rep(FALSE, n_ranked)
  } else {
    keep_by_rank <- seq_along(ranked) <= top_p
  }

  if (is.na(threshold)) {
    keep_by_score <- rep(FALSE, n_ranked)
  } else if (maximize) {
    keep_by_score <- ranked >= threshold
  } else {
    keep_by_score <- ranked <= threshold
  }

  keep <- keep_by_rank | keep_by_score
  c(names(ranked)[!keep], names(missing_scores))
}

# Decide which entries of a named score vector to remove, using either a
# percentile `threshold` (takes precedence when not `NA`) or the `top_p` best
# entries. Entries with `NA` scores are dropped up front and never reported.
#
# Returns a character vector with the names to remove.
select_percentile <- function(x, top_p, threshold, maximize) {
  x <- x[!is.na(x)]

  if (!is.na(threshold)) {
    cutoff <- stats::quantile(x, threshold)
    # NOTE(review): the two directions are not mirror images -- maximising
    # keeps scores at or above the `threshold` quantile, minimising keeps
    # scores strictly below that same quantile. Left as-is; confirm intent.
    drop <- if (maximize) x < cutoff else x >= cutoff
    return(names(x)[drop])
  }

  ranked <- sort(x, decreasing = maximize)
  # A positional mask instead of `ranked[-seq_len(top_p)]`: with `top_p == 0`
  # the negative index is empty and would select (i.e. remove) nothing.
  names(ranked)[seq_along(ranked) > top_p]
}
= 100), 16 | res = NULL, 17 | skip = FALSE, 18 | id = recipes::rand_id("select_boruta") 19 | ) 20 | 21 | \method{tidy}{step_select_boruta}(x, ...) 22 | } 23 | \arguments{ 24 | \item{recipe}{A recipe object. The step will be added to the sequence of 25 | operations for this recipe.} 26 | 27 | \item{...}{One or more selector functions to choose which variables are 28 | affected by the step. See selections() for more details. For the tidy 29 | method, these are not currently used.} 30 | 31 | \item{outcome}{A character string with the name of the response variable to 32 | use to calculate the feature importance scores.} 33 | 34 | \item{role}{Not used by this step since no new variables are created.} 35 | 36 | \item{trained}{A logical to indicate if the quantities for preprocessing have 37 | been estimated.} 38 | 39 | \item{exclude}{A character vector of predictor names that will be removed 40 | from the data. This will be set when `prep()` is used on the recipe and 41 | should not be set by the user.} 42 | 43 | \item{options}{A list of options to pass to `Boruta::Boruta()`. The defaults 44 | use Boruta's defaults. *Note* that `x` and `y` should not be passed here.} 45 | 46 | \item{res}{The `Boruta::Boruta` object is stored here once this preprocessing 47 | step has been trained by `prep.recipe()`.} 48 | 49 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 50 | bake.recipe()? While all operations are baked when prep.recipe() is run, 51 | some operations may not be able to be conducted on new data (e.g. 52 | processing the outcome variable(s)). Care should be taken when using skip = 53 | TRUE as it may affect the computations for subsequent operations.} 54 | 55 | \item{id}{A character string that is unique to this step to identify it.} 56 | 57 | \item{x}{A `step_select_boruta` object.} 58 | } 59 | \value{ 60 | a `step_select_boruta` object. 
61 | } 62 | \description{ 63 | `step_select_boruta` creates a *specification* of a recipe step that selects a 64 | subset of predictors using the Boruta feature selection approach. 65 | } 66 | \examples{ 67 | library(recipes) 68 | library(parsnip) 69 | 70 | # load the example iris dataset 71 | data(cells, package = "modeldata") 72 | 73 | # create a preprocessing recipe 74 | rec <- 75 | recipe(class ~ ., data = cells[, -1]) \%>\% 76 | step_select_boruta(all_predictors(), outcome = "class") 77 | 78 | prepped <- prep(rec) 79 | 80 | preproc_data <- juice(prepped) 81 | prepped 82 | } 83 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(bake,step_select_boruta) 4 | S3method(bake,step_select_carscore) 5 | S3method(bake,step_select_forests) 6 | S3method(bake,step_select_infgain) 7 | S3method(bake,step_select_linear) 8 | S3method(bake,step_select_mrmr) 9 | S3method(bake,step_select_roc) 10 | S3method(bake,step_select_tree) 11 | S3method(bake,step_select_vip) 12 | S3method(bake,step_select_xtab) 13 | S3method(prep,step_select_boruta) 14 | S3method(prep,step_select_carscore) 15 | S3method(prep,step_select_forests) 16 | S3method(prep,step_select_infgain) 17 | S3method(prep,step_select_linear) 18 | S3method(prep,step_select_mrmr) 19 | S3method(prep,step_select_roc) 20 | S3method(prep,step_select_tree) 21 | S3method(prep,step_select_vip) 22 | S3method(prep,step_select_xtab) 23 | S3method(print,step_select_boruta) 24 | S3method(print,step_select_carscore) 25 | S3method(print,step_select_forests) 26 | S3method(print,step_select_infgain) 27 | S3method(print,step_select_linear) 28 | S3method(print,step_select_mrmr) 29 | S3method(print,step_select_roc) 30 | S3method(print,step_select_tree) 31 | S3method(print,step_select_vip) 32 | S3method(print,step_select_xtab) 33 | 
S3method(pull_importances,"_C5.0") 34 | S3method(pull_importances,"_H2OMultinomialModel") 35 | S3method(pull_importances,"_H2ORegressionModel") 36 | S3method(pull_importances,"_cubist") 37 | S3method(pull_importances,"_earth") 38 | S3method(pull_importances,"_elnet") 39 | S3method(pull_importances,"_glm") 40 | S3method(pull_importances,"_lm") 41 | S3method(pull_importances,"_lognet") 42 | S3method(pull_importances,"_randomForest") 43 | S3method(pull_importances,"_ranger") 44 | S3method(pull_importances,"_rpart") 45 | S3method(pull_importances,"_xgb.Booster") 46 | S3method(pull_importances,default) 47 | S3method(tidy,step_select_boruta) 48 | S3method(tidy,step_select_carscore) 49 | S3method(tidy,step_select_forests) 50 | S3method(tidy,step_select_infgain) 51 | S3method(tidy,step_select_linear) 52 | S3method(tidy,step_select_mrmr) 53 | S3method(tidy,step_select_roc) 54 | S3method(tidy,step_select_tree) 55 | S3method(tidy,step_select_vip) 56 | S3method(tidy,step_select_xtab) 57 | S3method(tunable,step_select_carscore) 58 | S3method(tunable,step_select_forests) 59 | S3method(tunable,step_select_infgain) 60 | S3method(tunable,step_select_linear) 61 | S3method(tunable,step_select_mrmr) 62 | S3method(tunable,step_select_roc) 63 | S3method(tunable,step_select_tree) 64 | S3method(tunable,step_select_vip) 65 | S3method(tunable,step_select_xtab) 66 | export("%>%") 67 | export(pull_importances) 68 | export(step_select_boruta) 69 | export(step_select_carscore) 70 | export(step_select_forests) 71 | export(step_select_infgain) 72 | export(step_select_linear) 73 | export(step_select_mrmr) 74 | export(step_select_roc) 75 | export(step_select_tree) 76 | export(step_select_vip) 77 | export(step_select_xtab) 78 | export(top_p) 79 | importFrom(generics,tidy) 80 | importFrom(magrittr,"%>%") 81 | importFrom(recipes,bake) 82 | importFrom(recipes,prep) 83 | importFrom(recipes,step) 84 | importFrom(tibble,as_tibble) 85 | importFrom(tibble,tibble) 86 | importFrom(tune,tunable) 87 | 
-------------------------------------------------------------------------------- /man/step_select_mrmr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_mrmr.R 3 | \name{step_select_mrmr} 4 | \alias{step_select_mrmr} 5 | \alias{tidy.step_select_mrmr} 6 | \title{Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)} 7 | \usage{ 8 | step_select_mrmr( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | threads = 0, 17 | exclude = NULL, 18 | scores = NULL, 19 | skip = FALSE, 20 | id = recipes::rand_id("select_mrmr") 21 | ) 22 | 23 | \method{tidy}{step_select_mrmr}(x, ...) 24 | } 25 | \arguments{ 26 | \item{recipe}{A recipe object. The step will be added to the sequence of 27 | operations for this recipe} 28 | 29 | \item{...}{One or more selector functions to choose which variables are 30 | affected by the step. See selections() for more details. For the tidy 31 | method, these are not currently used} 32 | 33 | \item{outcome}{A character string specifying the name of response variable 34 | used to evaluate mRMR.} 35 | 36 | \item{role}{Not used by this step since no new variables are created} 37 | 38 | \item{trained}{A logical to indicate if the quantities for preprocessing have 39 | been estimated} 40 | 41 | \item{top_p}{An integer that will be used to select the number of best 42 | scoring features.} 43 | 44 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 45 | of best scoring features to select. Features with scores that are _larger_ 46 | than the specified threshold will be retained, for example `threshold = 47 | 0.9` will retain only predictors with scores in the top 90th percentile. 48 | Note that this overrides `top_p`.} 49 | 50 | \item{threads}{An integer specifying the number of threads to use for 51 | processing. 
The default = 0 uses all available threads.} 52 | 53 | \item{exclude}{A character vector of predictor names that will be removed 54 | from the data. This will be set when `prep()` is used on the recipe and 55 | should not be set by the user.} 56 | 57 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 58 | names of the variables and their mRMR scores. This parameter is only 59 | produced after the recipe has been trained.} 60 | 61 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 62 | bake.recipe()? While all operations are baked when prep.recipe() is run, 63 | some operations may not be able to be conducted on new data (e.g. 64 | processing the outcome variable(s)). Care should be taken when using skip = 65 | TRUE as it may affect the computations for subsequent operations.} 66 | 67 | \item{id}{A character string that is unique to this step to identify it.} 68 | 69 | \item{x}{A `step_select_mrmr` object.} 70 | } 71 | \value{ 72 | A step_select_mrmr object. 73 | } 74 | \description{ 75 | `step_select_mrmr` creates a *specification* of a recipe step that will apply 76 | minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric 77 | data. The top `top_p` scoring features, or features whose scores occur in 78 | the top percentile `threshold` will be retained as new predictors. 79 | } 80 | \details{ 81 | The recipe will stop if both `top_p` and `threshold` are left unspecified. 
/* http://gregfranko.com/blog/jquery-best-practices/ */
(function($) {
  // On DOM ready: wire up the navbar and mark the menu entry that best
  // matches the current page as active.
  $(function() {

    $('.navbar-fixed-top').headroom();

    // Keep the page content clear of the fixed navbar, also after resizes.
    $('body').css('padding-top', $('.navbar').height() + 10);
    $(window).resize(function(){
      $('body').css('padding-top', $('.navbar').height() + 10);
    });

    $('[data-toggle="tooltip"]').tooltip();

    // Pick the navbar link sharing the longest path prefix with this page.
    var cur_path = paths(location.pathname);
    var links = $("#navbar ul li a");
    var max_length = -1;
    var pos = -1;
    for (var i = 0; i < links.length; i++) {
      if (links[i].getAttribute("href") === "#")
        continue;
      // Ignore external links
      if (links[i].host !== location.host)
        continue;

      var nav_path = paths(links[i].pathname);

      var length = prefix_length(nav_path, cur_path);
      if (length > max_length) {
        max_length = length;
        pos = i;
      }
    }

    // Add class to parent <li>, and enclosing <li> if in dropdown
    if (pos >= 0) {
      var menu_anchor = $(links[pos]);
      menu_anchor.parent().addClass("active");
      menu_anchor.closest("li.dropdown").addClass("active");
    }
  });

  // Split a URL pathname into its segments, dropping the leading empty
  // segment and a trailing "index.html" or empty segment.
  function paths(pathname) {
    var pieces = pathname.split("/");
    pieces.shift(); // always starts with /

    var end = pieces[pieces.length - 1];
    if (end === "index.html" || end === "")
      pieces.pop();
    return(pieces);
  }

  // Number of leading segments that needle and haystack have in common.
  // Returns -1 if not found
  function prefix_length(needle, haystack) {
    if (needle.length > haystack.length)
      return(-1);

    // Special case for length-0 haystack, since for loop won't run
    if (haystack.length === 0) {
      return(needle.length === 0 ? 0 : -1);
    }

    for (var i = 0; i < haystack.length; i++) {
      if (needle[i] != haystack[i])
        return(i);
    }

    return(haystack.length);
  }

  /* Clipboard --------------------------*/

  // Show `msg` in the element's tooltip, then put the original title back so
  // the next hover shows the normal text again.
  function changeTooltipMessage(element, msg) {
    var tooltipOriginalTitle=element.getAttribute('data-original-title');
    element.setAttribute('data-original-title', msg);
    $(element).tooltip('show');
    element.setAttribute('data-original-title', tooltipOriginalTitle);
  }

  if(ClipboardJS.isSupported()) {
    $(document).ready(function() {
      // NOTE(review): this markup was an empty string in the reviewed copy,
      // so no button was ever inserted. It is reconstructed here to match the
      // '.btn-copy-ex' and '[data-clipboard-copy]' selectors used below;
      // confirm against the pkgdown template before shipping.
      var copyButton = "<button type='button' class='btn btn-primary btn-copy-ex' title='Copy to clipboard' aria-label='Copy to clipboard' data-toggle='tooltip' data-placement='left auto' data-trigger='hover' data-clipboard-copy><i class='fa fa-copy'></i></button>";

      $("div.sourceCode").addClass("hasCopyButton");

      // Insert copy buttons:
      $(copyButton).prependTo(".hasCopyButton");

      // Initialize tooltips:
      $('.btn-copy-ex').tooltip({container: 'body'});

      // Initialize clipboard:
      var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', {
        text: function(trigger) {
          // Copy the code only: strip the "#>" output lines.
          return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, "");
        }
      });

      clipboardBtnCopies.on('success', function(e) {
        changeTooltipMessage(e.trigger, 'Copied!');
        e.clearSelection();
      });

      // The handler must accept the event: it previously took no argument
      // and read an undefined `e`, throwing instead of showing the hint.
      clipboardBtnCopies.on('error', function(e) {
        changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy');
      });
    });
  }
})(window.jQuery || window.$)
A value of `NA` 43 | implies that this criterion will be ignored.} 44 | 45 | \item{top_p}{An integer that will be used to select the predictors with the 46 | largest ROC AUC values. A value of `NA` implies that this criterion will be 47 | ignored.} 48 | 49 | \item{exclude}{A character vector of predictor names that will be removed 50 | from the data. This will be set when `prep()` is used on the recipe and 51 | should not be set by the user.} 52 | 53 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 54 | bake.recipe()? While all operations are baked when prep.recipe() is run, 55 | some operations may not be able to be conducted on new data (e.g. 56 | processing the outcome variable(s)). Care should be taken when using skip = 57 | TRUE as it may affect the computations for subsequent operations.} 58 | 59 | \item{id}{A character string that is unique to this step to identify it.} 60 | 61 | \item{x}{A `step_select_roc` object.} 62 | } 63 | \value{ 64 | An updated version of `recipe` with the new step 65 | added to the sequence of existing steps (if any). For the 66 | `tidy` method, a tibble with a `terms` column for which predictors were 67 | removed. 68 | } 69 | \description{ 70 | `step_select_roc` creates a *specification* of a recipe step that will 71 | filter predictors using their relationship with the outcome as measured 72 | using a Receiver Operating Characteristic curve. 73 | } 74 | \details{ 75 | The recipe will stop if both `top_p` and `threshold` are left unspecified. 76 | 77 | The ROC AUC will be set to be 1 - AUC if the value is less than 0.50. 
78 | } 79 | \examples{ 80 | data(cells, package = "modeldata") 81 | 82 | rec <- 83 | recipe(class ~ ., data = cells[, -1]) \%>\% 84 | step_select_roc(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) \%>\% 85 | prep() 86 | 87 | rec \%>\% juice(all_predictors()) \%>\% names() 88 | 89 | # Use ROC values to select but always keep at least one: 90 | rec <- 91 | recipe(class ~ ., data = cells[, -1]) \%>\% 92 | step_select_roc(all_predictors(), outcome = "class", top_p = 1, threshold = 0.99) \%>\% 93 | prep() 94 | 95 | rec \%>\% juice(all_predictors()) \%>\% names() 96 | 97 | # in case of missing data... 98 | } 99 | \concept{preprocessing} 100 | \concept{supervised_filter} 101 | \keyword{datagen} 102 | -------------------------------------------------------------------------------- /man/step_select_xtab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_xtab.R 3 | \name{step_select_xtab} 4 | \alias{step_select_xtab} 5 | \alias{tidy.step_select_xtab} 6 | \title{Filter Categorical Predictors using Contingency Tables} 7 | \usage{ 8 | step_select_xtab( 9 | recipe, 10 | ..., 11 | outcome, 12 | role = "predictor", 13 | trained = FALSE, 14 | threshold = NA, 15 | top_p = NA, 16 | exact = FALSE, 17 | fdr = TRUE, 18 | exclude = NULL, 19 | skip = FALSE, 20 | id = recipes::rand_id("select_xtab") 21 | ) 22 | 23 | \method{tidy}{step_select_xtab}(x, ...) 24 | } 25 | \arguments{ 26 | \item{recipe}{A recipe object. The step will be added to the sequence of 27 | operations for this recipe.} 28 | 29 | \item{...}{One or more selector functions to choose which predictors are 30 | affected by the step. See [selections()] for more details. 
For the `tidy` 31 | method, these are not currently used.} 32 | 33 | \item{outcome}{A single character string that specifies a single categorical 34 | variable to be used as the class.} 35 | 36 | \item{role}{For model terms created by this step, what analysis role should 37 | they be assigned? By default, the function assumes that resulting distances 38 | will be used as predictors in a model.} 39 | 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have 41 | been estimated.} 42 | 43 | \item{threshold}{A numeric value, in p-value/FDR units, where predictors with 44 | values _smaller_ than the threshold will be retained. A value of `NA` 45 | implies that this criterion will be ignored.} 46 | 47 | \item{top_p}{An integer that will be used to select the predictors with the 48 | smallest p/FDR values. A value of `NA` implies that this criterion will be 49 | ignored.} 50 | 51 | \item{exact}{Should an exact test be used?} 52 | 53 | \item{fdr}{Should false discovery rates (FDR) be used instead of p-values?} 54 | 55 | \item{exclude}{A character vector of predictor names that will be removed 56 | from the data. This will be set when `prep()` is used on the recipe and 57 | should not be set by the user.} 58 | 59 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 60 | bake.recipe()? While all operations are baked when prep.recipe() is run, 61 | some operations may not be able to be conducted on new data (e.g. 62 | processing the outcome variable(s)). Care should be taken when using skip = 63 | TRUE as it may affect the computations for subsequent operations.} 64 | 65 | \item{id}{A character string that is unique to this step to identify it.} 66 | 67 | \item{x}{A `step_select_xtab` object.} 68 | } 69 | \value{ 70 | An updated version of `recipe` with the new step added to the 71 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 72 | `terms` column for which predictors were removed.
73 | } 74 | \description{ 75 | `step_select_xtab` creates a *specification* of a recipe step that will 76 | filter predictors using their relationship with the outcome as measured 77 | using statistical tests for association. 78 | } 79 | \details{ 80 | The recipe will stop if both `top_p` and `threshold` are left unspecified. If 81 | both are used, they are combined via 'or'. 82 | 83 | The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]). 84 | 85 | Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed. 86 | } 87 | \examples{ 88 | data(attrition, package = "modeldata") 89 | 90 | rec <- 91 | recipe(Attrition ~ ., data = attrition) \%>\% 92 | step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition", 93 | top_p = 1, threshold = 0.001, exact = TRUE) \%>\% 94 | prep() 95 | 96 | rec \%>\% juice(all_nominal(), -all_outcomes()) \%>\% names() 97 | 98 | tidy(rec, number = 1) 99 | 100 | } 101 | \concept{preprocessing} 102 | \concept{supervised_filter} 103 | \keyword{datagen} 104 | -------------------------------------------------------------------------------- /man/step_select_vip.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_vip.R 3 | \name{step_select_vip} 4 | \alias{step_select_vip} 5 | \alias{tidy.step_select_vip} 6 | \title{Feature selection step using a model's feature importance scores or 7 | coefficients} 8 | \usage{ 9 | step_select_vip( 10 | recipe, 11 | ..., 12 | outcome = NULL, 13 | role = "predictor", 14 | trained = FALSE, 15 | model = NULL, 16 | top_p = NA, 17 | threshold = NA, 18 | exclude = NULL, 19 | scores = NULL, 20 | skip = FALSE, 21 | id = recipes::rand_id("select_vip") 22 | ) 23 | 24 | \method{tidy}{step_select_vip}(x, ...) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. 
The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which variables are 31 | affected by the step. See selections() for more details. For the tidy 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A character string with the name of the response variable to 35 | use to calculate the feature importance scores.} 36 | 37 | \item{role}{Not used by this step since no new variables are created.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{model}{A `model_spec` object from `parsnip` that has a feature 43 | importances or coefficients method. The model needs to have an equivalent 44 | `pull_importances` method defined. See `?pull_importances` for how to 45 | define methods for models that are not currently supported.} 46 | 47 | \item{top_p}{An integer with the number of best scoring features to 48 | select.} 49 | 50 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 51 | of best scoring features to select. Features with scores that are _larger_ 52 | than the specified threshold will be retained, for example `threshold = 53 | 0.9` will retain only predictors with scores in the top 90th percentile. 54 | Note that this overrides `top_p`.} 55 | 56 | \item{exclude}{A character vector of predictor names that will be removed 57 | from the data. This will be set when `prep()` is used on the recipe and 58 | should not be set by the user.} 59 | 60 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 61 | names of the variables and their feature importance scores. This parameter 62 | is only produced after the recipe has been trained.} 63 | 64 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 65 | bake.recipe()? 
While all operations are baked when prep.recipe() is run, 66 | some operations may not be able to be conducted on new data (e.g. 67 | processing the outcome variable(s)). Care should be taken when using skip = 68 | TRUE as it may affect the computations for subsequent operations.} 69 | 70 | \item{id}{A character string that is unique to this step to identify it.} 71 | 72 | \item{x}{A `step_select_vip` object.} 73 | } 74 | \value{ 75 | a `step_select_vip` object. 76 | } 77 | \description{ 78 | `step_select_vip` creates a *specification* of a recipe step that selects a 79 | subset of predictors based on the ranking of variable importance provided by 80 | a `parsnip` model specification and the `model` parameter. 81 | } 82 | \examples{ 83 | library(recipes) 84 | library(parsnip) 85 | 86 | # load the example cells dataset 87 | data(cells, package = "modeldata") 88 | 89 | # define a base model to use for feature importances 90 | base_model <- rand_forest(mode = "classification") \%>\% 91 | set_engine("ranger", importance = "permutation") 92 | 93 | # create a preprocessing recipe 94 | rec <- 95 | recipe(class ~ ., data = cells[, -1]) \%>\% 96 | step_select_vip(all_predictors(), outcome = "class", model = base_model, top_p = 10, threshold = 0.9) 97 | 98 | prepped <- prep(rec) 99 | 100 | preproc_data <- juice(prepped) 101 | prepped 102 | } 103 | -------------------------------------------------------------------------------- /man/step_select_carscore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_carscore.R 3 | \name{step_select_carscore} 4 | \alias{step_select_carscore} 5 | \alias{tidy.step_select_carscore} 6 | \title{CAR score feature selection step} 7 | \usage{ 8 | step_select_carscore( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | lambda = NA, 17 | 
diagonal = FALSE, 18 | exclude = NULL, 19 | scores = NULL, 20 | skip = FALSE, 21 | id = recipes::rand_id("select_carscore") 22 | ) 23 | 24 | \method{tidy}{step_select_carscore}(x, ...) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which variables are 31 | affected by the step. See selections() for more details. For the tidy 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A character string with the name of the response variable. 35 | This must refer to a numeric feature for regression.} 36 | 37 | \item{role}{Not used by this step since no new variables are created.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{top_p}{An integer with the number of best scoring features to 43 | select.} 44 | 45 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 46 | of best scoring features to select. Features with scores that are _larger_ 47 | than the specified threshold will be retained, for example `threshold = 48 | 0.9` will retain only predictors with scores in the top 90th percentile. 49 | Note that this overrides `top_p`.} 50 | 51 | \item{lambda}{The correlation shrinkage intensity (range 0-1).} 52 | 53 | \item{diagonal}{For diagonal = FALSE (the default) CAR scores are computed; 54 | otherwise with diagonal = TRUE marginal correlations.} 55 | 56 | \item{exclude}{A character vector of predictor names that will be removed 57 | from the data. This will be set when `prep()` is used on the recipe and 58 | should not be set by the user.} 59 | 60 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 61 | names of the variables and the absolute values of the calculated CAR 62 | scores. 
This parameter is only produced after the recipe has been trained.} 63 | 64 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 65 | bake.recipe()? While all operations are baked when prep.recipe() is run, 66 | some operations may not be able to be conducted on new data (e.g. 67 | processing the outcome variable(s)). Care should be taken when using skip = 68 | TRUE as it may affect the computations for subsequent operations.} 69 | 70 | \item{id}{A character string that is unique to this step to identify it.} 71 | 72 | \item{x}{A `step_select_carscore` object.} 73 | } 74 | \value{ 75 | A step_select_carscore object. 76 | } 77 | \description{ 78 | `step_select_carscore` creates a *specification* of a recipe step that 79 | selects a subset of predictors as part of a regression model based on the 80 | scores of the CAR score algorithm. This step requires the `care` package to be 81 | installed. The top `top_p` scoring features, or features whose scores occur 82 | in the top percentile `threshold` will be retained as new predictors. 83 | } 84 | \details{ 85 | The recipe will stop if both `top_p` and `threshold` are left unspecified. 
86 | } 87 | \examples{ 88 | library(recipes) 89 | 90 | data(car_prices, package = "modeldata") 91 | 92 | rec <- 93 | recipe(Price ~ ., data = car_prices) \%>\% 94 | step_select_carscore(all_predictors(), outcome = "Price", top_p = 5, threshold = 0.7) 95 | 96 | prepped <- prep(rec) 97 | 98 | new_data <- juice(prepped) 99 | prepped 100 | } 101 | \concept{preprocessing} 102 | \concept{supervised_filter} 103 | \keyword{datagen} 104 | -------------------------------------------------------------------------------- /man/step_select_tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_tree.R 3 | \name{step_select_tree} 4 | \alias{step_select_tree} 5 | \alias{tidy.step_select_tree} 6 | \title{Feature selection step using a decision tree importance scores} 7 | \usage{ 8 | step_select_tree( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 | trained = FALSE, 14 | engine = "rpart", 15 | cost_complexity = NULL, 16 | tree_depth = NULL, 17 | min_n = NULL, 18 | top_p = NA, 19 | threshold = NA, 20 | exclude = NULL, 21 | scores = NULL, 22 | skip = FALSE, 23 | id = recipes::rand_id("select_tree") 24 | ) 25 | 26 | \method{tidy}{step_select_tree}(x, ...) 27 | } 28 | \arguments{ 29 | \item{recipe}{A recipe object. The step will be added to the sequence of 30 | operations for this recipe.} 31 | 32 | \item{...}{One or more selector functions to choose which variables are 33 | affected by the step. See selections() for more details. 
For the tidy 34 | method, these are not currently used.} 35 | 36 | \item{outcome}{A character string with the name of the response variable to 37 | use to calculate the feature importance scores.} 38 | 39 | \item{role}{Not used by this step since no new variables are created.} 40 | 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have 42 | been estimated.} 43 | 44 | \item{engine}{A decision_tree engine that is supported by parsnip. 45 | The default is "rpart".} 46 | 47 | \item{cost_complexity}{A positive number for the cost/complexity 48 | parameter (a.k.a. Cp) used by CART models (specific engines only).} 49 | 50 | \item{tree_depth}{An integer for maximum depth of the tree.} 51 | 52 | \item{min_n}{An integer for the minimum number of data points in a node that 53 | are required for the node to be split further.} 54 | 55 | \item{top_p}{An integer with the number of best scoring features to 56 | select.} 57 | 58 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 59 | of best scoring features to select. Features with scores that are _larger_ 60 | than the specified threshold will be retained, for example `threshold = 61 | 0.9` will retain only predictors with scores in the top 90th percentile. 62 | Note that this overrides `top_p`.} 63 | 64 | \item{exclude}{A character vector of predictor names that will be removed 65 | from the data. This will be set when `prep()` is used on the recipe and 66 | should not be set by the user.} 67 | 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 69 | names of the variables and their feature importance scores. This parameter 70 | is only produced after the recipe has been trained.} 71 | 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 73 | bake.recipe()? While all operations are baked when prep.recipe() is run, 74 | some operations may not be able to be conducted on new data (e.g.
75 | processing the outcome variable(s)). Care should be taken when using skip = 76 | TRUE as it may affect the computations for subsequent operations.} 77 | 78 | \item{id}{A character string that is unique to this step to identify it.} 79 | 80 | \item{x}{A `step_select_tree` object.} 81 | } 82 | \value{ 83 | a `step_select_tree` object. 84 | } 85 | \description{ 86 | `step_select_tree` creates a *specification* of a recipe step that selects a 87 | subset of predictors based on the ranking of variable importance provided by 88 | a `parsnip::decision_tree` supported model. 89 | } 90 | \examples{ 91 | library(recipes) 92 | library(parsnip) 93 | 94 | # load the example iris dataset 95 | data(cells, package = "modeldata") 96 | 97 | # create a preprocessing recipe 98 | rec <- 99 | recipe(class ~ ., data = cells[, -1]) \%>\% 100 | step_select_tree(all_predictors(), outcome = "class", top_p = 10, 101 | threshold = 0.9) 102 | 103 | prepped <- prep(rec) 104 | 105 | preproc_data <- juice(prepped) 106 | prepped 107 | } 108 | -------------------------------------------------------------------------------- /man/step_select_infgain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_infgain.R 3 | \name{step_select_infgain} 4 | \alias{step_select_infgain} 5 | \alias{tidy.step_select_infgain} 6 | \title{Information gain feature selection step} 7 | \usage{ 8 | step_select_infgain( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | type = "infogain", 17 | threads = 1, 18 | exclude = NULL, 19 | scores = NULL, 20 | skip = FALSE, 21 | id = recipes::rand_id("select_infgain") 22 | ) 23 | 24 | \method{tidy}{step_select_infgain}(x, ...) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. 
The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which variables are 31 | affected by the step. See selections() for more details. For the tidy 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A character string with the name of the response variable to 35 | use to evaluate information gain value against the predictors.} 36 | 37 | \item{role}{Not used by this step since no new variables are created.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{top_p}{An integer with the number of best scoring features to 43 | select.} 44 | 45 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 46 | of best scoring features to select. Features with scores that are _larger_ 47 | than the specified threshold will be retained, for example `threshold = 48 | 0.9` will retain only predictors with scores in the top 90th percentile. 49 | Note that this overrides `top_p`.} 50 | 51 | \item{type}{A character string specifying the information gain method to use. 52 | One of "infogain", "gainratio", "symuncert". The default is 'infogain'.} 53 | 54 | \item{threads}{An integer specifying the number of threads to use for 55 | processing. The default is 1; setting `threads = 0` uses all available threads.} 56 | 57 | \item{exclude}{A character vector of predictor names that will be removed 58 | from the data. This will be set when `prep()` is used on the recipe and 59 | should not be set by the user.} 60 | 61 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 62 | names of the variables and their information gain scores. This parameter is 63 | only produced after the recipe has been trained.} 64 | 65 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 66 | bake.recipe()?
While all operations are baked when prep.recipe() is run, 67 | some operations may not be able to be conducted on new data (e.g. 68 | processing the outcome variable(s)). Care should be taken when using skip = 69 | TRUE as it may affect the computations for subsequent operations.} 70 | 71 | \item{id}{A character string that is unique to this step to identify it.} 72 | 73 | \item{x}{A `step_select_infgain` object.} 74 | } 75 | \value{ 76 | A step_select_infgain object. 77 | } 78 | \description{ 79 | `step_select_infgain` creates a *specification* of a recipe step that selects a 80 | subset of predictors based on the scores of the information gain algorithm. 81 | This step requires the FSelectorRcpp package to be installed. The top 82 | `top_p` scoring features, or features whose scores occur in the top 83 | percentile `threshold` will be retained as new predictors. 84 | } 85 | \details{ 86 | The recipe will stop if both `top_p` and `threshold` are left unspecified. 87 | } 88 | \examples{ 89 | library(recipes) 90 | 91 | data(cells, package = "modeldata") 92 | 93 | rec <- 94 | recipe(class ~ ., data = cells[, -1]) \%>\% 95 | step_select_infgain(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) 96 | 97 | prepped <- prep(rec) 98 | 99 | new_data <- juice(prepped) 100 | prepped 101 | } 102 | \concept{preprocessing} 103 | \concept{supervised_filter} 104 | \keyword{datagen} 105 | -------------------------------------------------------------------------------- /man/step_select_linear.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_linear.R 3 | \name{step_select_linear} 4 | \alias{step_select_linear} 5 | \alias{tidy.step_select_linear} 6 | \title{Feature selection step using the magnitude of a linear models' coefficients} 7 | \usage{ 8 | step_select_linear( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 
| trained = FALSE, 14 | engine = "glm", 15 | penalty = NULL, 16 | mixture = NULL, 17 | top_p = NA, 18 | threshold = NA, 19 | exclude = NULL, 20 | scores = NULL, 21 | skip = FALSE, 22 | id = recipes::rand_id("select_linear") 23 | ) 24 | 25 | \method{tidy}{step_select_linear}(x, ...) 26 | } 27 | \arguments{ 28 | \item{recipe}{A recipe object. The step will be added to the sequence of 29 | operations for this recipe.} 30 | 31 | \item{...}{One or more selector functions to choose which variables are 32 | affected by the step. See selections() for more details. For the tidy 33 | method, these are not currently used.} 34 | 35 | \item{outcome}{A character string with the name of the response variable to 36 | use to calculate the feature importance scores.} 37 | 38 | \item{role}{Not used by this step since no new variables are created.} 39 | 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have 41 | been estimated.} 42 | 43 | \item{engine}{A linear_reg or logistic_reg engine that is supported by parsnip. 44 | The default is "glm".} 45 | 46 | \item{penalty}{A non-negative number representing the total amount of 47 | regularization (specific engines only).} 48 | 49 | \item{mixture}{A number between zero and one (inclusive) that is the 50 | proportion of L1 regularization (i.e. lasso) in the model. When mixture = 51 | 1, it is a pure lasso model while mixture = 0 indicates that ridge 52 | regression is being used (specific engines only).} 53 | 54 | \item{top_p}{An integer with the number of best scoring features to 55 | select.} 56 | 57 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 58 | of best scoring features to select. Features with scores that are _larger_ 59 | than the specified threshold will be retained, for example `threshold = 60 | 0.9` will retain only predictors with scores in the top 90th percentile.
61 | Note that this overrides `top_p`.} 62 | 63 | \item{exclude}{A character vector of predictor names that will be removed 64 | from the data. This will be set when `prep()` is used on the recipe and 65 | should not be set by the user.} 66 | 67 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 68 | names of the variables and their feature importance scores. This parameter 69 | is only produced after the recipe has been trained.} 70 | 71 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 72 | bake.recipe()? While all operations are baked when prep.recipe() is run, 73 | some operations may not be able to be conducted on new data (e.g. 74 | processing the outcome variable(s)). Care should be taken when using skip = 75 | TRUE as it may affect the computations for subsequent operations.} 76 | 77 | \item{id}{A character string that is unique to this step to identify it.} 78 | 79 | \item{x}{A `step_select_linear` object.} 80 | } 81 | \value{ 82 | a `step_select_linear` object. 83 | } 84 | \description{ 85 | `step_select_linear` creates a *specification* of a recipe step that selects 86 | a subset of predictors based on the ranking of the magnitude of coefficients 87 | provided by a `parsnip::linear_reg` or `parsnip::logistic_reg` model. 
88 | } 89 | \examples{ 90 | library(recipes) 91 | library(parsnip) 92 | 93 | # load the example iris dataset 94 | data(cells, package = "modeldata") 95 | 96 | # create a preprocessing recipe 97 | rec <- 98 | recipe(class ~ ., data = cells[, -1]) \%>\% 99 | step_select_linear(all_predictors(), outcome = "class", top_p = 10, 100 | threshold = 0.9) 101 | 102 | prepped <- prep(rec) 103 | 104 | preproc_data <- juice(prepped) 105 | prepped 106 | } 107 | -------------------------------------------------------------------------------- /man/step_select_forests.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_forests.R 3 | \name{step_select_forests} 4 | \alias{step_select_forests} 5 | \title{Feature selection step using a random forest feature importance scores} 6 | \usage{ 7 | step_select_forests( 8 | recipe, 9 | ..., 10 | outcome = NULL, 11 | role = "predictor", 12 | trained = FALSE, 13 | engine = "ranger", 14 | options = list(importance = "permutation"), 15 | mtry = NULL, 16 | trees = NULL, 17 | min_n = NULL, 18 | top_p = NA, 19 | threshold = NA, 20 | exclude = NULL, 21 | scores = NULL, 22 | skip = FALSE, 23 | id = recipes::rand_id("select_forests") 24 | ) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which variables are 31 | affected by the step. See selections() for more details. 
For the tidy 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A character string with the name of the response variable to 35 | use to calculate the feature importance scores.} 36 | 37 | \item{role}{Not used by this step since no new variables are created.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{engine}{A supported rand_forest engine that is supported by parsnip. 43 | The default is "ranger".} 44 | 45 | \item{options}{A named list of options to pass to the rand_forest engine. For 46 | example, if `engine = 'ranger'` (the default) then options could be 47 | `list(importance = 'permutation')` because a feature importance method needs 48 | to be specified for this engine. This is the default.} 49 | 50 | \item{mtry}{An integer for the number of predictors that will be randomly 51 | sampled at each split when creating the tree models.} 52 | 53 | \item{trees}{An integer for the number of trees contained in the ensemble.} 54 | 55 | \item{min_n}{An integer for the minimum number of data points in a node that 56 | are required for the node to be split further.} 57 | 58 | \item{top_p}{An integer with the number of best scoring features to 59 | select.} 60 | 61 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 62 | of best scoring features to select. Features with scores that are _larger_ 63 | than the specified threshold will be retained, for example `threshold = 64 | 0.9` will retain only predictors with scores in the top 90th percentile. 65 | Note that this overrides `top_p`.} 66 | 67 | \item{exclude}{A character vector of predictor names that will be removed 68 | from the data. This will be set when `prep()` is used on the recipe and 69 | should not be set by the user.} 70 | 71 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 72 | names of the variables and their feature importance scores.
This parameter 73 | is only produced after the recipe has been trained.} 74 | 75 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 76 | bake.recipe()? While all operations are baked when prep.recipe() is run, 77 | some operations may not be able to be conducted on new data (e.g. 78 | processing the outcome variable(s)). Care should be taken when using skip = 79 | TRUE as it may affect the computations for subsequent operations.} 80 | 81 | \item{id}{A character string that is unique to this step to identify it.} 82 | } 83 | \value{ 84 | a `step_select_forests` object. 85 | } 86 | \description{ 87 | `step_select_forests` creates a *specification* of a recipe step that selects 88 | a subset of predictors based on the ranking of variable importance using 89 | a `parsnip::rand_forest` supported model. 90 | } 91 | \examples{ 92 | library(recipes) 93 | library(parsnip) 94 | 95 | # load the example iris dataset 96 | data(cells, package = "modeldata") 97 | 98 | # create a preprocessing recipe 99 | rec <- 100 | recipe(class ~ ., data = cells[, -1]) \%>\% 101 | step_select_forests(all_predictors(), outcome = "class", top_p = 10, 102 | threshold = 0.9) 103 | 104 | prepped <- prep(rec) 105 | 106 | preproc_data <- juice(prepped) 107 | prepped 108 | } 109 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | License • recipeselectors 6 | 7 | 8 |
    9 |
    37 | 38 | 39 | 40 |
    41 |
    42 | 45 | 46 |
    YEAR: 2019
    47 | COPYRIGHT HOLDER: Steven Pawley
    48 | 
    49 | 50 |
    51 | 52 | 55 | 56 |
    57 | 58 | 59 | 60 |
    63 | 64 |
    65 |

    Site built with pkgdown 66 | 2.0.2.

    67 |
    68 | 69 |
    70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • recipeselectors 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | 20 | 21 | 22 | 23 |
    24 |
    58 | 59 | 60 | 61 | 62 |
    63 |
    64 | 67 | 68 | Content not found. Please use links in the navbar. 69 | 70 |
    71 | 72 | 76 | 77 |
    78 | 79 | 80 | 81 |
    85 | 86 |
    87 |

    88 |

    Site built with pkgdown 89 | 2.0.2.

    90 |
    91 | 92 |
    93 |
    94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 | anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + 
anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = 
this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /docs/reference/pipe.html: -------------------------------------------------------------------------------- 1 | 2 | Pipe operator — %>% • recipeselectors 6 | 7 | 8 |
    9 |
    37 | 38 | 39 | 40 |
    41 |
    42 | 47 | 48 |
    49 |

    See magrittr::%>% for details.

    50 |
    51 | 52 |
    53 |
    lhs %>% rhs
    54 |
    55 | 56 | 57 |
    58 | 61 |
    62 | 63 | 64 |
    67 | 68 |
    69 |

    Site built with pkgdown 70 | 2.0.2.

    71 |
    72 | 73 |
    74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /R/step_select_boruta.R: --------------------------------------------------------------------------------
#' Feature selection step using Boruta
#'
#' `step_select_boruta` creates a *specification* of a recipe step that selects a
#' subset of predictors using the Boruta feature selection approach.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#'   operations for this recipe.
#' @param ... One or more selector functions to choose which variables are
#'   affected by the step. See selections() for more details. For the tidy
#'   method, these are not currently used.
#' @param outcome A character string with the name of the response variable to
#'   use to calculate the feature importance scores.
#' @param role Not used by this step since no new variables are created.
#' @param trained A logical to indicate if the quantities for preprocessing have
#'   been estimated.
#' @param exclude A character vector of predictor names that will be removed
#'   from the data. This will be set when `prep()` is used on the recipe and
#'   should not be set by the user.
#' @param options A list of options to pass to `Boruta::Boruta()`. The defaults
#'   use Boruta's defaults. *Note* that `x` and `y` should not be passed here.
#' @param res The `Boruta::Boruta` object is stored here once this preprocessing
#'   step has been trained by `prep.recipe()`. It remains `NULL` when the
#'   selectors match no predictors, because Boruta is not run in that case.
#' @param skip A logical. Should the step be skipped when the recipe is baked by
#'   bake.recipe()? While all operations are baked when prep.recipe() is run,
#'   some operations may not be able to be conducted on new data (e.g.
#'   processing the outcome variable(s)). Care should be taken when using skip =
#'   TRUE as it may affect the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#'
#' @return a `step_select_boruta` object.
#' @export
#' @examples
#' library(recipes)
#' library(parsnip)
#'
#' # load the example cells dataset
#' data(cells, package = "modeldata")
#'
#' # create a preprocessing recipe
#' rec <-
#'  recipe(class ~ ., data = cells[, -1]) %>%
#'  step_select_boruta(all_predictors(), outcome = "class")
#'
#' prepped <- prep(rec)
#'
#' preproc_data <- juice(prepped)
#' prepped
step_select_boruta <- function(
  recipe,
  ...,
  outcome = NULL,
  role = "predictor",
  trained = FALSE,
  exclude = NULL,
  options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100),
  res = NULL,
  skip = FALSE,
  id = recipes::rand_id("select_boruta")) {

  # fail early if the optional Boruta dependency is not installed
  recipes::recipes_pkg_check("Boruta")

  recipes::add_step(
    recipe,
    step_select_boruta_new(
      terms = recipes::ellipse_check(...),
      trained = trained,
      outcome = outcome,
      role = role,
      exclude = exclude,
      options = options,
      res = res,
      skip = skip,
      id = id
    )
  )
}

# wrapper around 'step' function that sets the class of new step objects
#' @importFrom recipes step
step_select_boruta_new <- function(terms, role, trained, outcome, exclude,
                                   options, res, skip, id) {
  recipes::step(
    subclass = "select_boruta",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    exclude = exclude,
    options = options,
    res = res,
    skip = skip,
    id = id
  )
}

# Train the step: run Boruta on the selected predictors against the outcome
# and record the predictors whose final decision is "Rejected".
#' @export
prep.step_select_boruta <- function(x, training, info = NULL, ...) {

  # translate the terms arguments
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # Defaults used when the selectors match no predictors. `res` has to exist
  # on this path as well, otherwise building the trained step below fails
  # with "object 'res' not found".
  res <- NULL
  exclude <- character()

  if (length(x_names) > 0) {

    # build Boruta::Boruta(x = ..., y = ..., <user options>) and evaluate it
    call <- rlang::call2(
      .fn = "Boruta",
      .ns = "Boruta",
      x = rlang::quo(training[, x_names]),
      y = rlang::quo(training[[y_name]]),
      !!!x$options
    )

    res <- rlang::eval_tidy(call)

    exclude <- names(res$finalDecision[res$finalDecision == "Rejected"])
  }

  step_select_boruta_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    exclude = exclude,
    options = x$options,
    res = res,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the columns recorded in `object$exclude`.
#' @export
bake.step_select_boruta <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    # drop = FALSE keeps a data frame even if a single column remains and
    # `new_data` is a base data.frame rather than a tibble
    keep <- !colnames(new_data) %in% object$exclude
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}

# Print a one-line summary; the excluded count is shown once trained.
#' @export
print.step_select_boruta <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Boruta feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

#' @rdname step_select_boruta
#' @param x A `step_select_boruta` object.
#' @export
tidy.step_select_boruta <- function(x, ...) {
  if (recipes::is_trained(x)) {
    # one row per predictor excluded by the trained step
    res <- tibble(terms = x$exclude)
  } else {
    # nothing has been selected yet, so report a single missing term
    res <- tibble(terms = rlang::na_chr)
  }
  res$id <- x$id
  res
}
-------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | Authors and Citation • recipeselectors 6 | 7 | 8 |
    9 |
    37 | 38 | 39 | 40 |
    41 |
    42 |
    43 | 46 | 47 | 48 |
    • 49 |

      Steven Pawley. Author, maintainer. 50 |

      51 |
    • 52 |
    53 |
    54 |
    55 |

    Citation

    56 | Source: DESCRIPTION 57 |
    58 |
    59 | 60 | 61 |

    Pawley S (2022). 62 | recipeselectors: Extra Recipes Steps for Supervised Feature Selection. 63 | R package version 0.0.1, https://github.com/stevenpawley/recipeselectors. 64 |

    65 |
    @Manual{,
    66 |   title = {recipeselectors: Extra Recipes Steps for Supervised Feature Selection},
    67 |   author = {Steven Pawley},
    68 |   year = {2022},
    69 |   note = {R package version 0.0.1},
    70 |   url = {https://github.com/stevenpawley/recipeselectors},
    71 | }
    72 | 73 |
    74 | 75 |
    76 | 77 | 78 | 79 |
    82 | 83 |
    84 |

    Site built with pkgdown 85 | 2.0.2.

    86 |
    87 | 88 |
    89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Note 2 | 3 | The package recipeselectors is changing its name to 'colino'. Continued package development and eventual release to CRAN will occur from the 'https://github.com/stevenpawley/colino' repository. 4 | 5 | # recipeselectors 6 | 7 | The goal of recipeselectors is to provide extra supervised feature selection 8 | steps to be used with the tidymodels recipes package. 9 | 10 | The package is under development. 11 | 12 | ## Installation 13 | 14 | ``` r 15 | devtools::install_github("stevenpawley/recipeselectors") 16 | ``` 17 | 18 | ## Feature Selection Methods 19 | 20 | The following feature selection methods are implemented: 21 | 22 | - `step_select_infgain` provides Information Gain feature selection. This step 23 | requires the `FSelectorRcpp` package to be installed. 24 | 25 | - `step_select_mrmr` provides maximum Relevancy Minimum Redundancy feature 26 | selection. This step requires the `praznik` package to be installed. 27 | 28 | - `step_select_roc` provides ROC-based feature selection based on each 29 | predictor's relationship with the response outcome as measured using a Receiver 30 | Operating Characteristic curve. Thanks to Max Kuhn, along with many other useful 31 | suggestions. 32 | 33 | - `step_select_xtab` provides feature selection using statistical association 34 | (also thanks to Max Kuhn). 35 | 36 | - `step_select_vip` provides model-based selection using feature importance 37 | scores or coefficients. This method allows a `parsnip` model specification to be 38 | used to select a subset of features based on the model's feature importances or 39 | coefficients. See below for details.
Note, that this step will eventually be 40 | deprecated in favor of separate steps that contain the specific models that are 41 | most commonly used for feature selection such as `step_select_forests`, 42 | `step_select_tree` and `step_select_linear`. 43 | 44 | - `step_select_boruta` provides a Boruta feature selection step. 45 | 46 | - `step_select_carscore` provides a CAR score feature selection step for 47 | regression models. This step requires the `care` package to be installed. 48 | 49 | - `step_select_forests`, `step_select_tree`, and `step_select_linear` provide 50 | model-based methods of selecting a subset of features based on the model's 51 | feature importance scores or coefficients. These steps, and potential 52 | `step_select_rules`, `step_select_boost` will replace the `step_select_vip` 53 | method. 54 | 55 | ## Under Development 56 | 57 | Methods that are planned to be added: 58 | 59 | - Relief-based methods (CORElearn package) 60 | 61 | - Ensemble feature selection (EFS package) 62 | 63 | ## Notes on Wrapper Feature Selection Methods 64 | 65 | The focus of `recipeselectors` is to provide extra recipes for filter-based 66 | feature selection. A single wrapper method is also included using the variable 67 | importance scores of selected algorithms for feature selection. 68 | 69 | The `step_select_vip` is designed to work with the `parsnip` package and 70 | requires a base model specification that provides a method of ranking the 71 | importance of features, such as feature importance scores or coefficients, with 72 | one score per feature. The base model is specified in the step using the `model` 73 | parameter. 74 | 75 | A limitation is that the model used in the `step_select_vip` cannot be tuned. 76 | This step will be replaced by a more appropriate structure that allows both 77 | variable selection and tuning for specific model types. 
78 | 79 | The parsnip package does not currently contain a method of pulling feature 80 | importance scores from models that support them. The `recipeselectors` package 81 | provides a generic function `pull_importances` for this purpose that accepts 82 | a fitted parsnip model and returns a tibble with two columns, 'feature' and 83 | 'importance': 84 | 85 | ``` 86 | model <- boost_tree(mode = "classification") %>% 87 | set_engine("xgboost") 88 | 89 | model_fit <- model %>% 90 | fit(Species ~., iris) 91 | 92 | pull_importances(model_fit) 93 | ``` 94 | 95 | Most of the models and 'engines' that provide feature importances are 96 | implemented. In addition, `h2o` models are supported using the `h2oparsnip` 97 | package. Use `methods(pull_importances)` to list models that are currently 98 | implemented. If you need to pull the feature importance scores from a model that is 99 | not currently supported in this package, then you can add a method to the 100 | `pull_importances` generic function that returns a two-column tibble: 101 | 102 | ``` 103 | pull_importances._ranger <- function(object, scaled = FALSE, ...) { 104 | scores <- ranger::importance(object$fit) 105 | 106 | # create a tibble with 'feature' and 'importance' columns 107 | scores <- tibble::tibble( 108 | feature = names(scores), 109 | importance = as.numeric(scores) 110 | ) 111 | 112 | # optionally rescale the importance scores 113 | if (scaled) 114 | scores$importance <- scales::rescale(scores$importance) 115 | scores 116 | } 117 | ``` 118 | 119 | An example of using the `step_select_vip` function: 120 | 121 | ``` 122 | library(parsnip) 123 | library(recipes) 124 | library(magrittr) 125 | 126 | # load the example iris dataset 127 | data(iris) 128 | 129 | # define a base model to use for feature importances 130 | base_model <- rand_forest(mode = "classification") %>% 131 | set_engine("ranger", importance = "permutation") 132 | 133 | # create a preprocessing recipe 134 | rec <- iris %>% 135 | recipe(Species ~ .)
%>% 136 | step_select_vip(all_predictors(), model = base_model, top_p = 2, 137 | outcome = "Species") 138 | 139 | prepped <- prep(rec) 140 | 141 | # create a model specification 142 | clf <- decision_tree(mode = "classification") %>% 143 | set_engine("rpart") 144 | 145 | clf_fitted <- clf %>% 146 | fit(Species ~ ., juice(prepped)) 147 | ``` 148 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | MIT License • recipeselectors 6 | 7 | 8 |
    9 |
    37 | 38 | 39 | 40 |
    41 |
    42 | 45 | 46 |
    47 | 48 |

    Copyright (c) 2019 Steven Pawley

    49 |

    Permission is hereby granted, free of charge, to any person obtaining 50 | a copy of this software and associated documentation files (the 51 | “Software”), to deal in the Software without restriction, including 52 | without limitation the rights to use, copy, modify, merge, publish, 53 | distribute, sublicense, and/or sell copies of the Software, and to 54 | permit persons to whom the Software is furnished to do so, subject to 55 | the following conditions:

    56 |

    The above copyright notice and this permission notice shall be 57 | included in all copies or substantial portions of the Software.

    58 |

    THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 59 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 60 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 61 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 62 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 63 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 64 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    65 |
    66 | 67 |
    68 | 69 | 72 | 73 |
    74 | 75 | 76 | 77 |
    80 | 81 |
    82 |

    Site built with pkgdown 83 | 2.0.2.

    84 |
    85 | 86 |
    87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /R/step_select_mrmr.R: -------------------------------------------------------------------------------- 1 | #' Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR) 2 | #' 3 | #' `step_select_mrmr` creates a *specification* of a recipe step that will apply 4 | #' minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric 5 | #' data. The top `top_p` scoring features, or features whose scores occur in 6 | #' the top percentile `threshold` will be retained as new predictors. 7 | #' 8 | #' @param recipe A recipe object. The step will be added to the sequence of 9 | #' operations for this recipe 10 | #' @param ... One or more selector functions to choose which variables are 11 | #' affected by the step. See selections() for more details. For the tidy 12 | #' method, these are not currently used 13 | #' @param role Not used by this step since no new variables are created 14 | #' @param trained A logical to indicate if the quantities for preprocessing have 15 | #' been estimated 16 | #' @param outcome A character string specifying the name of response variable 17 | #' used to evaluate mRMR. 18 | #' @param top_p An integer that will be used to select the number of best 19 | #' scoring features. 20 | #' @param threshold A numeric value between 0 and 1 representing the percentile 21 | #' of best scoring features to select. Features with scores that are _larger_ 22 | #' than the specified threshold will be retained, for example `threshold = 23 | #' 0.9` will retain only predictors with scores in the top 90th percentile. 24 | #' Note that this overrides `top_p`. 25 | #' @param threads An integer specifying the number of threads to use for 26 | #' processing. The default = 0 uses all available threads. 27 | #' @param exclude A character vector of predictor names that will be removed 28 | #' from the data. 
This will be set when `prep()` is used on the recipe and 29 | #' should not be set by the user. 30 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 31 | #' names of the variables and their mRMR scores. This parameter is only 32 | #' produced after the recipe has been trained. 33 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 34 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 35 | #' some operations may not be able to be conducted on new data (e.g. 36 | #' processing the outcome variable(s)). Care should be taken when using skip = 37 | #' TRUE as it may affect the computations for subsequent operations. 38 | #' @param id A character string that is unique to this step to identify it. 39 | #' @return A step_select_mrmr object. 40 | #' @keywords datagen 41 | #' @concept preprocessing 42 | #' @concept supervised_filter 43 | #' @export 44 | #' @details 45 | #' 46 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified. 47 | #' 48 | #' @examples 49 | #' library(recipes) 50 | #' 51 | #' data(cells, package = "modeldata") 52 | #' 53 | #' rec <- 54 | #' recipe(class ~ ., data = cells[, -1]) %>% 55 | #' step_select_mrmr(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) 56 | #' 57 | #' prepped <- prep(rec) 58 | #' 59 | #' new_data <- juice(prepped) 60 | #' prepped 61 | step_select_mrmr <- function( 62 | recipe, ..., 63 | outcome = NULL, 64 | role = NA, 65 | trained = FALSE, 66 | top_p = NA, 67 | threshold = NA, 68 | threads = 0, 69 | exclude = NULL, 70 | scores = NULL, 71 | skip = FALSE, 72 | id = recipes::rand_id("select_mrmr")) { 73 | 74 | recipes::recipes_pkg_check("praznik") 75 | 76 | terms <- recipes::ellipse_check(...) 
77 | 78 | recipes::add_step( 79 | recipe, 80 | step_select_mrmr_new( 81 | terms = terms, 82 | trained = trained, 83 | outcome = outcome, 84 | role = role, 85 | top_p = top_p, 86 | threshold = threshold, 87 | threads = threads, 88 | exclude = exclude, 89 | scores = scores, 90 | skip = skip, 91 | id = id 92 | ) 93 | ) 94 | } 95 | 96 | step_select_mrmr_new <- function(terms, role, trained, outcome, top_p, 97 | threshold, threads, exclude, scores, skip, 98 | id) { 99 | recipes::step( 100 | subclass = "select_mrmr", 101 | terms = terms, 102 | role = role, 103 | trained = trained, 104 | outcome = outcome, 105 | top_p = top_p, 106 | threshold = threshold, 107 | threads = threads, 108 | exclude = exclude, 109 | scores = scores, 110 | skip = skip, 111 | id = id 112 | ) 113 | } 114 | 115 | #' @export 116 | prep.step_select_mrmr <- function(x, training, info = NULL, ...) { 117 | # extract response and predictor names 118 | y_name <- recipes::terms_select(x$outcome, info = info) 119 | y_name <- y_name[1] 120 | x_names <- recipes::terms_select(terms = x$terms, info = info) 121 | 122 | # check criteria 123 | check_criteria(x$top_p, x$threshold, match.call()) 124 | check_zero_one(x$threshold) 125 | x$top_p <- check_top_p(x$top_p, length(x_names)) 126 | 127 | if (length(x_names) > 0) { 128 | 129 | call <- rlang::call2( 130 | .fn = "MRMR", 131 | .ns = "praznik", 132 | X = rlang::quo(training[, x_names]), 133 | Y = rlang::quo(training[[y_name]]), 134 | k = length(x_names), 135 | threads = x$threads 136 | ) 137 | 138 | res <- rlang::eval_tidy(call) 139 | 140 | res <- tibble( 141 | variable = names(res$selection), 142 | score = res$score 143 | ) 144 | 145 | exclude <- 146 | select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE) 147 | 148 | } else { 149 | exclude <- character() 150 | } 151 | 152 | step_select_mrmr_new( 153 | terms = x$terms, 154 | trained = TRUE, 155 | role = x$role, 156 | outcome = y_name, 157 | top_p = x$top_p, 158 | threshold = x$threshold, 159 | 
threads = x$threads, 160 | exclude = exclude, 161 | scores = res, 162 | skip = x$skip, 163 | id = x$id 164 | ) 165 | } 166 | 167 | #' @export 168 | bake.step_select_mrmr <- function(object, new_data, ...) { 169 | if (length(object$exclude) > 0) { 170 | new_data <- new_data[, !(colnames(new_data) %in% object$exclude)] 171 | } 172 | as_tibble(new_data) 173 | } 174 | 175 | #' @export 176 | print.step_select_mrmr <- function(x, width = max(20, options()$width - 30), ...) { 177 | cat("mRMR feature selection") 178 | 179 | if(recipes::is_trained(x)) { 180 | n <- length(x$exclude) 181 | cat(paste0(" (", n, " excluded)")) 182 | } 183 | cat("\n") 184 | 185 | invisible(x) 186 | } 187 | 188 | #' @rdname step_select_mrmr 189 | #' @param x A `step_select_mrmr` object. 190 | #' @export 191 | tidy.step_select_mrmr <- function(x, ...) { 192 | if (recipes::is_trained(x)) { 193 | res <- tibble(terms = x$exclude) 194 | } else { 195 | term_names <- recipes::sel2char(x$terms) 196 | res <- tibble(terms = rlang::na_chr) 197 | } 198 | res$id <- x$id 199 | res 200 | } 201 | 202 | #' @export 203 | tunable.step_select_mrmr <- function(x, ...) { 204 | tibble( 205 | name = c("top_p", "threshold"), 206 | call_info = list( 207 | list(pkg = "recipeselectors", fun = "top_p"), 208 | list(pkg = "dials", fun = "threshold", range = c(0, 1)) 209 | ), 210 | source = "recipe", 211 | component = "step_select_mrmr", 212 | component_id = x$id 213 | ) 214 | } 215 | -------------------------------------------------------------------------------- /docs/reference/top_p.html: -------------------------------------------------------------------------------- 1 | 2 | Parameter functions for feature selection recipes — top_p • recipeselectors 8 | 9 | 10 |
    11 |
    39 | 40 | 41 | 42 |
    43 |
    44 | 49 | 50 |
    51 |

    Feature selection recipes allow the top-performing features to be selected 52 | using two parameters. `top_p` is for specifying the number of the 53 | top-performing features.

    54 |
    55 | 56 |
    57 |
    top_p(range = c(1L, 4L), trans = NULL)
    58 |
    59 | 60 |
    61 |

    Arguments

    62 |
    range
    63 |

    A two-element vector holding the _defaults_ for the smallest and 64 | largest possible values, respectively.

    65 |
    trans
    66 |

    A `trans` object from the `scales` package, such as 67 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 68 | the default is used which matches the units used in `range`. If no 69 | transformation, `NULL`.

    70 |
    71 |
    72 |

    Value

    73 |

    A function with classes "quant_param" and "param"

    74 |
    75 | 76 |
    77 |

    Examples

    78 |
    top_p(c(3, 10))
     79 | #> # Selected Predictors (quantitative)
     80 | #> Range: [3, 10]
     81 | 
    82 |
    83 |
    84 | 87 |
    88 | 89 | 90 |
    93 | 94 |
    95 |

    Site built with pkgdown 96 | 2.0.2.

    97 |
    98 | 99 |
    100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /R/step_select_vip.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using a model's feature importance scores or 2 | #' coefficients 3 | #' 4 | #' `step_select_vip` creates a *specification* of a recipe step that selects a 5 | #' subset of predictors based on the ranking of variable importance provided by 6 | #' a `parsnip` model specification and the `model` parameter 7 | #' 8 | #' @param recipe A recipe object. The step will be added to the sequence of 9 | #' operations for this recipe. 10 | #' @param ... One or more selector functions to choose which variables are 11 | #' affected by the step. See selections() for more details. For the tidy 12 | #' method, these are not currently used. 13 | #' @param outcome A character string with the name of the response variable to 14 | #' use to calculate the feature importance scores. 15 | #' @param role Not used by this step since no new variables are created. 16 | #' @param trained A logical to indicate if the quantities for preprocessing have 17 | #' been estimated. 18 | #' @param model A `model_spec` object from `parsnip` that has a feature 19 | #' importances or coefficients method. The model needs to have an equivalent 20 | #' `pull_importances` method defined. See `?pull_importances` for how to 21 | #' define methods for models that are not currently supported. 22 | #' @param top_p An integer with the number of best scoring features to 23 | #' select. 24 | #' @param threshold A numeric value between 0 and 1 representing the percentile 25 | #' of best scoring features to select. Features with scores that are _larger_ 26 | #' than the specified threshold will be retained, for example `threshold = 27 | #' 0.9` will retain only predictors with scores in the top 90th percentile. 28 | #' Note that this overrides `top_p`. 
29 | #' @param exclude A character vector of predictor names that will be removed 30 | #' from the data. This will be set when `prep()` is used on the recipe and 31 | #' should not be set by the user. 32 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 33 | #' names of the variables and their feature importance scores. This parameter 34 | #' is only produced after the recipe has been trained. 35 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 36 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 37 | #' some operations may not be able to be conducted on new data (e.g. 38 | #' processing the outcome variable(s)). Care should be taken when using skip = 39 | #' TRUE as it may affect the computations for subsequent operations. 40 | #' @param id A character string that is unique to this step to identify it. 41 | #' 42 | #' @return a `step_select_vip` object. 43 | #' @export 44 | #' @examples 45 | #' library(recipes) 46 | #' library(parsnip) 47 | #' 48 | #' # load the example iris dataset 49 | #' data(cells, package = "modeldata") 50 | #' 51 | #' # define a base model to use for feature importances 52 | #' base_model <- rand_forest(mode = "classification") %>% 53 | #' set_engine("ranger", importance = "permutation") 54 | #' 55 | #' # create a preprocessing recipe 56 | #' rec <- 57 | #' recipe(class ~ ., data = cells[, -1]) %>% 58 | #' step_select_vip(all_predictors(), outcome = "class", model = base_model, top_p = 10, threshold = 0.9) 59 | #' 60 | #' prepped <- prep(rec) 61 | #' 62 | #' preproc_data <- juice(prepped) 63 | #' prepped 64 | step_select_vip <- function( 65 | recipe, 66 | ..., 67 | outcome = NULL, 68 | role = "predictor", 69 | trained = FALSE, 70 | model = NULL, 71 | top_p = NA, 72 | threshold = NA, 73 | exclude = NULL, 74 | scores = NULL, 75 | skip = FALSE, 76 | id = recipes::rand_id("select_vip")) { 77 | 78 | if (missing(model)) 79 | rlang::abort("Model argument 
should be a `parsnip` model specification") 80 | 81 | recipes::add_step( 82 | recipe, 83 | step_select_vip_new( 84 | terms = recipes::ellipse_check(...), 85 | trained = trained, 86 | outcome = outcome, 87 | role = role, 88 | model = model, 89 | top_p = top_p, 90 | threshold = threshold, 91 | exclude = exclude, 92 | scores = scores, 93 | skip = skip, 94 | id = id 95 | ) 96 | ) 97 | } 98 | 99 | # wrapper around 'step' function that sets the class of new step objects 100 | #' @importFrom recipes step 101 | step_select_vip_new <- function(terms, role, trained, outcome, model, top_p, 102 | threshold, exclude, scores, skip, id) { 103 | recipes::step( 104 | subclass = "select_vip", 105 | terms = terms, 106 | role = role, 107 | trained = trained, 108 | outcome = outcome, 109 | model = model, 110 | top_p = top_p, 111 | threshold = threshold, 112 | exclude = exclude, 113 | scores = scores, 114 | skip = skip, 115 | id = id 116 | ) 117 | } 118 | 119 | #' @export 120 | prep.step_select_vip <- function(x, training, info = NULL, ...) 
{ 121 | 122 | # translate the terms arguments 123 | x_names <- recipes::terms_select(terms = x$terms, info = info) 124 | y_name <- recipes::terms_select(x$outcome, info = info) 125 | y_name <- y_name[1] 126 | 127 | # check criteria 128 | check_criteria(x$top_p, x$threshold, match.call()) 129 | check_zero_one(x$threshold) 130 | x$top_p <- check_top_p(x$top_p, length(x_names)) 131 | 132 | if (length(x_names) > 0) { 133 | # fit initial model 134 | X <- training[, x_names] 135 | y <- training[[y_name]] 136 | 137 | initial_model <- parsnip::fit_xy(x$model, X, y) 138 | res <- pull_importances(initial_model) 139 | names(res) <- c("variable", "score") 140 | res$score <- rlang::set_names(res$score, res$variable) 141 | 142 | exclude <- 143 | select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE) 144 | 145 | } else { 146 | exclude <- character() 147 | } 148 | 149 | step_select_vip_new( 150 | terms = x$terms, 151 | trained = TRUE, 152 | role = x$role, 153 | outcome = y_name, 154 | model = x$model, 155 | top_p = x$top_p, 156 | threshold = x$threshold, 157 | exclude = exclude, 158 | scores = res, 159 | skip = x$skip, 160 | id = x$id 161 | ) 162 | } 163 | 164 | #' @export 165 | bake.step_select_vip <- function(object, new_data, ...) { 166 | if (length(object$exclude) > 0) { 167 | new_data <- new_data[, !colnames(new_data) %in% object$exclude] 168 | } 169 | as_tibble(new_data) 170 | } 171 | 172 | #' @export 173 | print.step_select_vip <- function(x, width = max(20, options()$width - 30), ...) { 174 | cat("Variable importance feature selection") 175 | 176 | if(recipes::is_trained(x)) { 177 | n <- length(x$exclude) 178 | cat(paste0(" (", n, " excluded)")) 179 | } 180 | cat("\n") 181 | 182 | invisible(x) 183 | } 184 | 185 | #' @rdname step_select_vip 186 | #' @param x A `step_select_vip` object. 187 | #' @export 188 | tidy.step_select_vip <- function(x, ...) 
{ 189 | if (recipes::is_trained(x)) { 190 | res <- tibble(terms = x$exclude) 191 | } else { 192 | term_names <- recipes::sel2char(x$terms) 193 | res <- tibble(terms = rlang::na_chr) 194 | } 195 | res$id <- x$id 196 | res 197 | } 198 | 199 | #' @export 200 | tunable.step_select_vip <- function(x, ...) { 201 | tibble( 202 | name = c("top_p", "threshold"), 203 | call_info = list( 204 | list(pkg = "recipeselectors", fun = "top_p"), 205 | list(pkg = "dials", fun = "threshold", range = c(0, 1)) 206 | ), 207 | source = "recipe", 208 | component = "step_select_vip", 209 | component_id = x$id 210 | ) 211 | } 212 | -------------------------------------------------------------------------------- /R/step_select_roc.R: -------------------------------------------------------------------------------- 1 | #' Filter Numeric Predictors using ROC Curve 2 | #' 3 | #' `step_select_roc` creates a *specification* of a recipe step that will 4 | #' filter predictors using their relationship with the outcome as measured 5 | #' using a Receiver Operating Characteristic curve. 6 | #' 7 | #' @param recipe A recipe object. The step will be added to the sequence of 8 | #' operations for this recipe. 9 | #' @param ... One or more selector functions to choose which predictors are 10 | #' affected by the step. See [selections()] for more details. For the `tidy` 11 | #' method, these are not currently used. 12 | #' @param outcome A single character string that specifies a single categorical 13 | #' variable to be used as the class. 14 | #' @param role For model terms created by this step, what analysis role should 15 | #' they be assigned?. By default, the function assumes that resulting distances 16 | #' will be used as predictors in a model. 17 | #' @param threshold A numeric value, in AUC units, where predictors with ROC 18 | #' AUC values _larger_ than the threshold will be retained. A value of `NA` 19 | #' implies that this criterion will be ignored. 
20 | #' @param top_p An integer that will be used to select the predictors with the 21 | #' largest ROC AUC values. A value of `NA` implies that this criterion will be 22 | #' ignored. 23 | #' @param exclude A character vector of predictor names that will be removed 24 | #' from the data. This will be set when `prep()` is used on the recipe and 25 | #' should not be set by the user. 26 | #' @param trained A logical to indicate if the quantities for preprocessing have 27 | #' been estimated. 28 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 29 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 30 | #' some operations may not be able to be conducted on new data (e.g. 31 | #' processing the outcome variable(s)). Care should be taken when using skip = 32 | #' TRUE as it may affect the computations for subsequent operations. 33 | #' @param id A character string that is unique to this step to identify it. 34 | #' @return An updated version of `recipe` with the new step 35 | #' added to the sequence of existing steps (if any). For the 36 | #' `tidy` method, a tibble with a `terms` column for which predictors were 37 | #' removed. 38 | #' @keywords datagen 39 | #' @concept preprocessing 40 | #' @concept supervised_filter 41 | #' @export 42 | #' @details 43 | #' 44 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified. 45 | #' 46 | #' The ROC AUC will be set to be 1 - AUC if the value is less than 0.50. 
47 | #' @examples 48 | #' data(cells, package = "modeldata") 49 | #' 50 | #' rec <- 51 | #' recipe(class ~ ., data = cells[, -1]) %>% 52 | #' step_select_roc(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9) %>% 53 | #' prep() 54 | #' 55 | #' rec %>% juice(all_predictors()) %>% names() 56 | #' 57 | #' # Use ROC values to select but always keep at least one: 58 | #' rec <- 59 | #' recipe(class ~ ., data = cells[, -1]) %>% 60 | #' step_select_roc(all_predictors(), outcome = "class", top_p = 1, threshold = 0.99) %>% 61 | #' prep() 62 | #' 63 | #' rec %>% juice(all_predictors()) %>% names() 64 | #' 65 | #' # in case of missing data... 66 | step_select_roc <- function(recipe, 67 | ..., 68 | outcome, 69 | role = "predictor", 70 | trained = FALSE, 71 | threshold = NA, 72 | top_p = NA, 73 | exclude = NULL, 74 | skip = FALSE, 75 | id = recipes::rand_id("select_roc")) { 76 | recipes::add_step( 77 | recipe, 78 | step_select_roc_new( 79 | terms = recipes::ellipse_check(...), 80 | outcome = outcome, 81 | role = role, 82 | trained = trained, 83 | threshold = threshold, 84 | top_p = top_p, 85 | exclude = exclude, 86 | skip = skip, 87 | id = id 88 | ) 89 | ) 90 | } 91 | 92 | step_select_roc_new <- 93 | function(terms, outcome, role, trained, threshold, top_p, exclude, skip, id) { 94 | recipes::step( 95 | subclass = "select_roc", 96 | terms = terms, 97 | outcome = outcome, 98 | role = role, 99 | trained = trained, 100 | threshold = threshold, 101 | top_p = top_p, 102 | exclude = exclude, 103 | skip = skip, 104 | id = id 105 | ) 106 | } 107 | 108 | roc_calc <- function(x, y) { 109 | suppressMessages( 110 | suppressWarnings( 111 | { 112 | if (length(levels(y)) == 2) { 113 | res <- try(pROC::roc(y, x, direction = "auto"), silent = TRUE) 114 | } else { 115 | res <- try(pROC::multiclass.roc(y, x, direction = "auto"), silent = TRUE) 116 | } 117 | } 118 | ) 119 | ) 120 | 121 | if (inherits(res, "try-error")) { 122 | res <- NA_real_ 123 | } else { 124 | res <- 
unname(pROC::auc(res)) 125 | } 126 | res 127 | } 128 | 129 | #' @export 130 | prep.step_select_roc <- function(x, training, info = NULL, ...) { 131 | y_name <- recipes::terms_select(x$outcome, info = info) 132 | y_name <- x$outcome[1] 133 | recipes::check_type(training[, y_name], quant = FALSE) 134 | x_names <- recipes::terms_select(x$terms, info = info, empty_fun = I) 135 | 136 | if(length(x_names) > 0) { 137 | 138 | recipes::check_type(training[, x_names]) 139 | 140 | # check criteria 141 | check_criteria(x$top_p, x$threshold, match.call()) 142 | check_zero_one(x$threshold) 143 | x$top_p <- check_top_p(x$top_p, length(x_names)) 144 | 145 | # filter 146 | scores <- purrr::map_dbl(training[, x_names], ~ roc_calc(.x, training[[y_name]])) 147 | exclude_chr <- dual_filter(scores, x$top_p, x$threshold, maximize = TRUE) 148 | } else { 149 | exclude_chr <- character() 150 | } 151 | 152 | step_select_roc_new( 153 | terms = x$terms, 154 | outcome = x$outcome, 155 | role = x$role, 156 | trained = TRUE, 157 | threshold = x$threshold, 158 | top_p = x$top_p, 159 | exclude = exclude_chr, 160 | skip = x$skip, 161 | id = x$id 162 | ) 163 | } 164 | 165 | #' @export 166 | bake.step_select_roc <- function(object, new_data, ...) { 167 | if (length(object$exclude) > 0) { 168 | new_data <- new_data %>% dplyr::select(-dplyr::one_of(object$exclude)) 169 | } 170 | new_data 171 | } 172 | 173 | #' @export 174 | print.step_select_roc <- function(x, width = max(20, options()$width - 30), ...) { 175 | cat("ROC curve feature selection") 176 | 177 | if(recipes::is_trained(x)) { 178 | n <- length(x$exclude) 179 | cat(paste0(" (", n, " excluded)")) 180 | } 181 | cat("\n") 182 | 183 | invisible(x) 184 | } 185 | 186 | #' @rdname step_select_roc 187 | #' @param x A `step_select_roc` object. 188 | #' @export 189 | tidy.step_select_roc <- function(x, ...) 
{ 190 | if (recipes::is_trained(x)) { 191 | res <- tibble(terms = x$exclude) 192 | } else { 193 | term_names <- recipes::sel2char(x$terms) 194 | res <- tibble(terms = rlang::na_chr) 195 | } 196 | res$id <- x$id 197 | res 198 | } 199 | 200 | #' @export 201 | tunable.step_select_roc <- function(x, ...) { 202 | tibble::tibble( 203 | name = c("top_p", "threshold"), 204 | call_info = list( 205 | list(pkg = "recipeselectors", fun = "top_p"), 206 | list(pkg = "dials", fun = "threshold", range = c(0, 1)) 207 | ), 208 | source = "recipe", 209 | component = "step_select_roc", 210 | component_id = x$id 211 | ) 212 | } 213 | -------------------------------------------------------------------------------- /R/step_select_carscore.R: -------------------------------------------------------------------------------- 1 | #' CAR score feature selection step 2 | #' 3 | #' `step_select_carscore` creates a *specification* of a recipe step that 4 | #' selects a subset of predictors as part of a regression model based on the 5 | #' scores of the CAR score algorithm. This step requires the `care` package to be 6 | #' installed. The top `top_p` scoring features, or features whose scores occur 7 | #' in the top percentile `threshold` will be retained as new predictors. 8 | #' 9 | #' @param recipe A recipe object. The step will be added to the sequence of 10 | #' operations for this recipe. 11 | #' @param ... One or more selector functions to choose which variables are 12 | #' affected by the step. See selections() for more details. For the tidy 13 | #' method, these are not currently used. 14 | #' @param role Not used by this step since no new variables are created. 15 | #' @param trained A logical to indicate if the quantities for preprocessing have 16 | #' been estimated. 17 | #' @param lambda The correlation shrinkage intensity (range 0-1).
18 | #' @param diagonal For diagonal = FALSE (the default) CAR scores are computed; 19 | #' otherwise with diagonal = TRUE marginal correlations. 20 | #' @param outcome A character string with the name of the response variable. 21 | #' This must refer to a numeric feature for regression. 22 | #' @param top_p An integer with the number of best scoring features to 23 | #' select. 24 | #' @param threshold A numeric value between 0 and 1 representing the percentile 25 | #' of best scoring features to select. Features with scores that are _larger_ 26 | #' than the specified threshold will be retained, for example `threshold = 27 | #' 0.9` will retain only predictors with scores in the top 90th percentile. 28 | #' Note that this overrides `top_p`. 29 | #' @param exclude A character vector of predictor names that will be removed 30 | #' from the data. This will be set when `prep()` is used on the recipe and 31 | #' should not be set by the user. 32 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 33 | #' names of the variables and the absolute values of the calculated CAR 34 | #' scores. This parameter is only produced after the recipe has been trained. 35 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 36 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 37 | #' some operations may not be able to be conducted on new data (e.g. 38 | #' processing the outcome variable(s)). Care should be taken when using skip = 39 | #' TRUE as it may affect the computations for subsequent operations. 40 | #' @param id A character string that is unique to this step to identify it. 41 | #' @return A step_select_carscore object. 42 | #' @export 43 | #' @keywords datagen 44 | #' @concept preprocessing 45 | #' @concept supervised_filter 46 | #' @export 47 | #' @details 48 | #' 49 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified. 
#'
#' @examples
#' library(recipes)
#'
#' data(car_prices, package = "modeldata")
#'
#' rec <-
#'  recipe(Price ~ ., data = car_prices) %>%
#'  step_select_carscore(all_predictors(), outcome = "Price", top_p = 5, threshold = 0.7)
#'
#' prepped <- prep(rec)
#'
#' new_data <- juice(prepped)
#' prepped
step_select_carscore <- function(
  recipe, ...,
  outcome = NULL,
  role = NA,
  trained = FALSE,
  top_p = NA,
  threshold = NA,
  lambda = NA,
  diagonal = FALSE,
  exclude = NULL,
  scores = NULL,
  skip = FALSE,
  id = recipes::rand_id("select_carscore")) {

  # the scores are computed by care::carscore() during prep()
  recipes::recipes_pkg_check("care")

  terms <- recipes::ellipse_check(...)

  recipes::add_step(
    recipe,
    step_select_carscore_new(
      terms = terms,
      trained = trained,
      outcome = outcome,
      role = role,
      top_p = top_p,
      threshold = threshold,
      lambda = lambda,
      diagonal = diagonal,
      exclude = exclude,
      scores = scores,
      skip = skip,
      id = id
    )
  )
}


# wrapper around 'step' function that sets the class of new step objects
step_select_carscore_new <- function(terms, role, trained, outcome, top_p,
                                     threshold, lambda, diagonal, exclude, scores,
                                     skip, id) {
  recipes::step(
    subclass = "select_carscore",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    top_p = top_p,
    threshold = threshold,
    lambda = lambda,
    diagonal = diagonal,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}


# Train the step: score the selected predictors against the outcome with
# care::carscore() and record which predictors to drop.
#' @export
prep.step_select_carscore <- function(x, training, info = NULL, ...) {

  # extract response and predictor names
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  recipes::check_type(training[, y_name], quant = TRUE)
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # defaults used when no predictors were selected, so that `scores` is always
  # defined when the trained step is rebuilt below
  res <- tibble(variable = character(), score = numeric())
  exclude <- character()

  # CAR scores (or marginal correlations when diagonal = TRUE)
  if (length(x_names) > 0) {

    # only forward `lambda` when the user supplied a value
    extra_args <- list()

    if (!is.na(x$lambda)) {
      extra_args$lambda <- x$lambda
    }

    carscore_call <- rlang::call2(
      .fn = "carscore",
      .ns = "care",
      Xtrain = training[, x_names],
      Ytrain = training[, y_name],
      diagonal = x$diagonal,
      !!!extra_args
    )

    car_scores <- rlang::eval_tidy(carscore_call)

    # features are ranked on the absolute value of their score
    res <- tibble(
      variable = names(car_scores),
      score = abs(car_scores)
    )

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_carscore_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    lambda = x$lambda,
    diagonal = x$diagonal,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the excluded columns from `new_data`.
#' @export
bake.step_select_carscore <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}

# Print a one-line summary; once trained, include the number of excluded
# predictors. Returns `x` invisibly.
#' @export
print.step_select_carscore <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Carscore feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

#' @rdname step_select_carscore
#' @param x A `step_select_carscore` object.
#' @export
tidy.step_select_carscore <- function(x, ...) {
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    # nothing has been excluded before training
    res <- tibble(terms = rlang::na_chr)
  }
  res$id <- x$id
  res
}

# Declare `top_p` and `threshold` as the tunable parameters of this step.
#' @export
tunable.step_select_carscore <- function(x, ...) {
  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(
      list(pkg = "recipeselectors", fun = "top_p"),
      list(pkg = "dials", fun = "threshold", range = c(0, 1))
    ),
    source = "recipe",
    component = "step_select_carscore",
    component_id = x$id
  )
}

-------------------------------------------------------------------------------- /R/step_select_xtab.R: -------------------------------------------------------------------------------- 1 | #' Filter Categorical Predictors using Contingency Tables 2 | #' 3 | #' `step_select_xtab` creates a *specification* of a recipe step that will 4 | #' filter predictors using their relationship with the outcome as measured 5 | #' using statistical tests for association. 6 | #' 7 | #' @param recipe A recipe object. The step will be added to the sequence of 8 | #' operations for this recipe. 9 | #' @param ... One or more selector functions to choose which predictors are 10 | #' affected by the step. See [selections()] for more details. For the `tidy` 11 | #' method, these are not currently used. 12 | #' @param outcome A single character string that specifies a single categorical 13 | #' variable to be used as the class.
14 | #' @param role For model terms created by this step, what analysis role should 15 | #' they be assigned? By default, the function assumes that resulting distances 16 | #' will be used as predictors in a model. 17 | #' @param threshold A numeric value, in p-value/FDR units, where predictors with 18 | #' values _smaller_ than the threshold will be retained. A value of `NA` 19 | #' implies that this criterion will be ignored. 20 | #' @param top_p An integer that will be used to select the predictors with the 21 | #' smallest p/FDR values. A value of `NA` implies that this criterion will be 22 | #' ignored. 23 | #' @param exact Should an exact test be used? 24 | #' @param fdr Should false discovery rates (FDR) be used instead of p-values? 25 | #' @param exclude A character vector of predictor names that will be removed 26 | #' from the data. This will be set when `prep()` is used on the recipe and 27 | #' should not be set by the user. 28 | #' @param trained A logical to indicate if the quantities for preprocessing have 29 | #' been estimated. 30 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 31 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 32 | #' some operations may not be able to be conducted on new data (e.g. 33 | #' processing the outcome variable(s)). Care should be taken when using skip = 34 | #' TRUE as it may affect the computations for subsequent operations. 35 | #' @param id A character string that is unique to this step to identify it. 36 | #' @return An updated version of `recipe` with the new step added to the 37 | #' sequence of existing steps (if any). For the `tidy` method, a tibble with a 38 | #' `terms` column for which predictors were removed. 39 | #' @keywords datagen 40 | #' @concept preprocessing 41 | #' @concept supervised_filter 42 | #' @export 43 | #' @details 44 | #' 45 | #' The recipe will stop if both `top_p` and `threshold` are left unspecified.
#' If both are used, they are combined via 'or'.
#'
#' The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]).
#'
#' Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed.
#' @examples
#' data(attrition, package = "modeldata")
#'
#' rec <-
#'   recipe(Attrition ~ ., data = attrition) %>%
#'   step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition",
#'                    top_p = 1, threshold = 0.001, exact = TRUE) %>%
#'   prep()
#'
#' rec %>% juice(all_nominal(), -all_outcomes()) %>% names()
#'
#' tidy(rec, number = 1)
#'
step_select_xtab <- function(recipe,
                             ...,
                             outcome,
                             role = "predictor",
                             trained = FALSE,
                             threshold = NA,
                             top_p = NA,
                             exact = FALSE,
                             fdr = TRUE,
                             exclude = NULL,
                             skip = FALSE,
                             id = recipes::rand_id("select_xtab")) {
  terms <- recipes::ellipse_check(...)

  recipes::add_step(
    recipe,
    step_select_xtab_new(
      terms = terms,
      outcome = outcome,
      role = role,
      trained = trained,
      threshold = threshold,
      top_p = top_p,
      exact = exact,
      fdr = fdr,
      exclude = exclude,
      skip = skip,
      id = id
    )
  )
}

# wrapper around 'step' function that sets the class of new step objects
step_select_xtab_new <-
  function(terms, outcome, role, trained, threshold, top_p, exact, fdr,
           exclude, skip, id) {
    recipes::step(
      subclass = "select_xtab",
      terms = terms,
      outcome = outcome,
      role = role,
      trained = trained,
      threshold = threshold,
      top_p = top_p,
      exact = exact,
      fdr = fdr,
      exclude = exclude,
      skip = skip,
      id = id
    )
  }

# p-value of the association between `x` and `y`, computed on their
# contingency table with Fisher's exact test (`exact = TRUE`) or the
# chi-squared test. Warnings are suppressed; if the test errors, `NA_real_`
# is returned instead.
tbl_calc <- function(x, y, exact) {
  xtab <- table(x, y)
  assoc_test <- if (exact) stats::fisher.test else stats::chisq.test

  tryCatch(
    suppressWarnings(assoc_test(xtab)$p.value),
    error = function(e) NA_real_
  )
}

# Train the step: compute one p-value (optionally FDR-adjusted) per selected
# predictor and record which predictors to drop.
#' @export
prep.step_select_xtab <- function(x, training, info = NULL, ...) {
  # use the resolved outcome name, as the sibling select steps do
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]
  recipes::check_type(training[, y_name], quant = FALSE)
  x_names <- recipes::terms_select(x$terms, info = info, empty_fun = I)

  exclude_chr <- character()

  if (length(x_names) > 0) {

    recipes::check_type(training[, x_names], quant = FALSE)

    # check criteria
    check_criteria(x$top_p, x$threshold, match.call())
    check_zero_one(x$threshold)
    x$top_p <- check_top_p(x$top_p, length(x_names))

    # filter
    outcome_values <- training[[y_name]]
    scores <- purrr::map_dbl(
      training[, x_names],
      ~ tbl_calc(.x, outcome_values, exact = x$exact)
    )
    scores <- sort(scores, na.last = TRUE)
    if (x$fdr) {
      scores <- stats::p.adjust(scores, method = "BH")
    }

    # smaller p-values are better, hence maximize = FALSE
    exclude_chr <- dual_filter(scores, x$top_p, x$threshold, maximize = FALSE)
  }

  step_select_xtab_new(
    terms = x$terms,
    outcome = x$outcome,
    role = x$role,
    trained = TRUE,
    threshold = x$threshold,
    top_p = x$top_p,
    exact = x$exact,
    fdr = x$fdr,
    exclude = exclude_chr,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the excluded columns from `new_data`.
#' @export
bake.step_select_xtab <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    new_data <- new_data %>% dplyr::select(-dplyr::one_of(object$exclude))
  }
  new_data
}

# Print a one-line summary; once trained, include the number of excluded
# predictors. Returns `x` invisibly.
#' @export
print.step_select_xtab <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Association test feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

#' @rdname step_select_xtab
#' @param x A `step_select_xtab` object.
#' @export
tidy.step_select_xtab <- function(x, ...) {
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    # nothing has been excluded before training
    res <- tibble(terms = rlang::na_chr)
  }
  res$id <- x$id
  res
}

# Declare `top_p` and `threshold` as the tunable parameters of this step.
#' @export
tunable.step_select_xtab <- function(x, ...) {
  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(
      list(pkg = "recipeselectors", fun = "top_p"),
      # NOTE(review): this range is negative, while prep() validates the
      # threshold with check_zero_one(). It looks like a log10-scale range
      # supplied without a transformation - confirm against dials::threshold().
      list(pkg = "dials", fun = "threshold", range = c(-10, -1))
    ),
    source = "recipe",
    component = "step_select_xtab",
    component_id = x$id
  )
}

-------------------------------------------------------------------------------- /R/step_select_infgain.R: -------------------------------------------------------------------------------- 1 | #' Information gain feature selection step 2 | #' 3 | #' `step_select_infgain` creates a *specification* of a recipe step that selects a 4 | #' subset of predictors based on the scores of the information gain algorithm. 5 | #' This step requires the FSelectorRcpp package to be installed. The top 6 | #' `top_p` scoring features, or features whose scores occur in the top 7 | #' percentile `threshold` will be retained as new predictors. 8 | #' 9 | #' @param recipe A recipe object. The step will be added to the sequence of 10 | #' operations for this recipe. 11 | #' @param ... One or more selector functions to choose which variables are 12 | #' affected by the step. See selections() for more details. For the tidy 13 | #' method, these are not currently used.
14 | #' @param role Not used by this step since no new variables are created. 15 | #' @param trained A logical to indicate if the quantities for preprocessing have 16 | #' been estimated. 17 | #' @param type A character string specifying the information gain method to use. 18 | #' One of "infogain", "gainratio", "symuncert". The default is 'infogain'. 19 | #' @param outcome A character string with the name of the response variable to 20 | #' use to evaluate information gain value against the predictors. 21 | #' @param top_p An integer with the number of best scoring features to 22 | #' select. 23 | #' @param threshold A numeric value between 0 and 1 representing the percentile 24 | #' of best scoring features to select. Features with scores that are _larger_ 25 | #' than the specified threshold will be retained, for example `threshold = 26 | #' 0.9` will retain only predictors with scores in the top 90th percentile. 27 | #' Note that this overrides `top_p`. 28 | #' @param threads An integer specifying the number of threads to use for 29 | #' processing. The default is 1; a value of 0 uses all available threads. 30 | #' @param exclude A character vector of predictor names that will be removed 31 | #' from the data. This will be set when `prep()` is used on the recipe and 32 | #' should not be set by the user. 33 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 34 | #' names of the variables and their information gain scores. This parameter is 35 | #' only produced after the recipe has been trained. 36 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 37 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 38 | #' some operations may not be able to be conducted on new data (e.g. 39 | #' processing the outcome variable(s)). Care should be taken when using skip = 40 | #' TRUE as it may affect the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#' @return A step_select_infgain object.
#' @keywords datagen
#' @concept preprocessing
#' @concept supervised_filter
#' @export
#' @details
#'
#' The recipe will stop if both `top_p` and `threshold` are left unspecified.
#'
#' @examples
#' library(recipes)
#'
#' data(cells, package = "modeldata")
#'
#' rec <-
#'  recipe(class ~ ., data = cells[, -1]) %>%
#'  step_select_infgain(all_predictors(), outcome = "class", top_p = 10, threshold = 0.9)
#'
#' prepped <- prep(rec)
#'
#' new_data <- juice(prepped)
#' prepped
step_select_infgain <- function(
  recipe, ...,
  outcome = NULL,
  role = NA,
  trained = FALSE,
  top_p = NA,
  threshold = NA,
  type = "infogain",
  threads = 1,
  exclude = NULL,
  scores = NULL,
  skip = FALSE,
  id = recipes::rand_id("select_infgain")) {

  # the scores are computed by FSelectorRcpp::information_gain() during prep()
  recipes::recipes_pkg_check("FSelectorRcpp")

  terms <- recipes::ellipse_check(...)

  recipes::add_step(
    recipe,
    step_select_infgain_new(
      terms = terms,
      trained = trained,
      outcome = outcome,
      role = role,
      top_p = top_p,
      threshold = threshold,
      type = type,
      threads = threads,
      exclude = exclude,
      scores = scores,
      skip = skip,
      id = id
    )
  )
}


# wrapper around 'step' function that sets the class of new step objects
step_select_infgain_new <- function(terms, role, trained, outcome, top_p,
                                    threshold, type, threads, exclude, scores,
                                    skip, id) {
  recipes::step(
    subclass = "select_infgain",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    top_p = top_p,
    threshold = threshold,
    type = type,
    threads = threads,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}


# Train the step: score the selected predictors against the outcome with
# FSelectorRcpp::information_gain() and record which predictors to drop.
#' @export
prep.step_select_infgain <- function(x, training, info = NULL, ...) {
  # extract response and predictor names
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # defaults used when no predictors were selected, so that `scores` is always
  # defined when the trained step is rebuilt below
  res <- tibble(variable = character(), score = numeric())
  exclude <- character()

  # information gain
  if (length(x_names) > 0) {

    rhs <- paste0(x_names, collapse = " + ")
    f <- stats::as.formula(paste(y_name, "~", rhs))

    ig_call <- rlang::call2(
      .fn = "information_gain",
      .ns = "FSelectorRcpp",
      formula = f,
      data = rlang::quo(training),
      type = x$type,
      threads = x$threads,
      discIntegers = TRUE,
      equal = FALSE
    )

    res <- as_tibble(rlang::eval_tidy(ig_call))
    res <- rlang::set_names(res, c("variable", "score"))
    # name the scores after their variables before selecting on them
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_infgain_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    type = x$type,
    threads = x$threads,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the excluded columns from `new_data`.
#' @export
bake.step_select_infgain <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}

# Print a one-line summary; once trained, include the number of excluded
# predictors. Returns `x` invisibly.
#' @export
print.step_select_infgain <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Information Gain feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

#' @rdname step_select_infgain
#' @param x A `step_select_infgain` object.
#' @export
tidy.step_select_infgain <- function(x, ...) {
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    # nothing has been excluded before training
    res <- tibble(terms = rlang::na_chr)
  }
  res$id <- x$id
  res
}

# Declare `top_p` and `threshold` as the tunable parameters of this step.
#' @export
tunable.step_select_infgain <- function(x, ...) {
  tibble::tibble(
    name = c("top_p", "threshold"),
    call_info = list(
      list(pkg = "recipeselectors", fun = "top_p"),
      list(pkg = "dials", fun = "threshold", range = c(0, 1))
    ),
    source = "recipe",
    component = "step_select_infgain",
    component_id = x$id
  )
}

-------------------------------------------------------------------------------- /docs/reference/index.html: -------------------------------------------------------------------------------- 1 | 2 | Function reference • recipeselectors 6 | 7 | 8 |
    9 |
    37 | 38 | 39 | 40 |
    41 |
    42 | 45 | 46 | 50 | 53 | 54 | 57 | 59 | 62 | 63 | 66 | 67 | 70 | 71 | 74 | 75 | 78 | 79 | 82 | 83 | 86 | 87 | 90 | 91 | 94 | 96 | 99 | 100 | 103 | 104 |
    47 |

    All functions

    48 |

    49 |
    51 |

    pull_importances()

    52 |

    Pull feature importances from a parsnip fitted model

    55 |

    recipeselectors

    56 |

    recipeselectors: A collection of steps for feature selection to use with the 58 | 'recipes' package

    60 |

    step_select_boruta() tidy(<step_select_boruta>)

    61 |

    Feature selection step using Boruta

    64 |

    step_select_carscore() tidy(<step_select_carscore>)

    65 |

    Information gain feature selection step

    68 |

    step_select_forests() tidy(<step_select_forests>)

    69 |

    Feature selection step using a random forest feature importance scores

    72 |

    step_select_infgain() tidy(<step_select_infgain>)

    73 |

    Information gain feature selection step

    76 |

    step_select_linear() tidy(<step_select_linear>)

    77 |

    Feature selection step using the magnitude of a linear models' coefficients

    80 |

    step_select_mrmr() tidy(<step_select_mrmr>)

    81 |

    Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)

    84 |

    step_select_roc() tidy(<step_select_roc>)

    85 |

    Filter Numeric Predictors using ROC Curve

    88 |

    step_select_tree() tidy(<step_select_tree>)

    89 |

    Feature selection step using a decision tree importance scores

    92 |

    step_select_vip() tidy(<step_select_vip>)

    93 |

    Feature selection step using a model's feature importance scores or 95 | coefficients

    97 |

    step_select_xtab() tidy(<step_select_xtab>)

    98 |

    Filter Categorical Predictors using Contingency Tables

    101 |

    top_p()

    102 |

    Parameter functions for feature selection recipes

    105 | 106 | 109 |
    110 | 111 | 112 |
    122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /R/step_select_tree.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using a decision tree importance scores 2 | #' 3 | #' `step_select_tree` creates a *specification* of a recipe step that selects a 4 | #' subset of predictors based on the ranking of variable importance provided by 5 | #' a `parsnip::decision_tree` supported model. 6 | #' 7 | #' @param recipe A recipe object. The step will be added to the sequence of 8 | #' operations for this recipe. 9 | #' @param ... One or more selector functions to choose which variables are 10 | #' affected by the step. See selections() for more details. For the tidy 11 | #' method, these are not currently used. 12 | #' @param outcome A character string with the name of the response variable to 13 | #' use to calculate the feature importance scores. 14 | #' @param role Not used by this step since no new variables are created. 15 | #' @param trained A logical to indicate if the quantities for preprocessing have 16 | #' been estimated. 17 | #' @param engine A decision_tree engine that is supported by parsnip. 18 | #' The default is "rpart". 19 | #' @param top_p An integer with the number of best scoring features to 20 | #' select. 21 | #' @param cost_complexity A positive number for the cost/complexity 22 | #' parameter (a.k.a. Cp) used by CART models (specific engines only). 23 | #' @param tree_depth An integer for maximum depth of the tree. 24 | #' @param min_n An integer for the minimum number of data points in a node that 25 | #' are required for the node to be split further. 26 | #' @param threshold A numeric value between 0 and 1 representing the percentile 27 | #' of best scoring features to select.
Features with scores that are _larger_ 28 | #' than the specified threshold will be retained, for example `threshold = 29 | #' 0.9` will retain only predictors with scores in the top 90th percentile. 30 | #' Note that this overrides `top_p`. 31 | #' @param exclude A character vector of predictor names that will be removed 32 | #' from the data. This will be set when `prep()` is used on the recipe and 33 | #' should not be set by the user. 34 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 35 | #' names of the variables and their feature importance scores. This parameter 36 | #' is only produced after the recipe has been trained. 37 | #' @param skip A logical. Should the step be skipped when the recipe is baked by 38 | #' bake.recipe()? While all operations are baked when prep.recipe() is run, 39 | #' some operations may not be able to be conducted on new data (e.g. 40 | #' processing the outcome variable(s)). Care should be taken when using skip = 41 | #' TRUE as it may affect the computations for subsequent operations. 42 | #' @param id A character string that is unique to this step to identify it. 43 | #' 44 | #' @return a `step_select_tree` object. 
#' @export
#' @examples
#' library(recipes)
#' library(parsnip)
#'
#' # load the example cells dataset
#' data(cells, package = "modeldata")
#'
#' # create a preprocessing recipe
#' rec <-
#'  recipe(class ~ ., data = cells[, -1]) %>%
#'  step_select_tree(all_predictors(), outcome = "class", top_p = 10,
#'                   threshold = 0.9)
#'
#' prepped <- prep(rec)
#'
#' preproc_data <- juice(prepped)
#' prepped
step_select_tree <- function(
  recipe,
  ...,
  outcome = NULL,
  role = "predictor",
  trained = FALSE,
  engine = "rpart",
  cost_complexity = NULL,
  tree_depth = NULL,
  min_n = NULL,
  top_p = NA,
  threshold = NA,
  exclude = NULL,
  scores = NULL,
  skip = FALSE,
  id = recipes::rand_id("select_tree")) {

  # only engines registered with parsnip for decision_tree() are accepted
  engines <- parsnip::show_engines("decision_tree")$engine

  if (!engine %in% engines) {
    rlang::abort(
      paste("Engine argument should be one of", paste(engines, collapse = ", "))
    )
  }

  terms <- recipes::ellipse_check(...)

  recipes::add_step(
    recipe,
    step_select_tree_new(
      terms = terms,
      trained = trained,
      outcome = outcome,
      role = role,
      engine = engine,
      cost_complexity = cost_complexity,
      tree_depth = tree_depth,
      min_n = min_n,
      top_p = top_p,
      threshold = threshold,
      exclude = exclude,
      scores = scores,
      skip = skip,
      id = id
    )
  )
}

# wrapper around 'step' function that sets the class of new step objects
#' @importFrom recipes step
step_select_tree_new <- function(terms, role, trained, outcome, engine,
                                 top_p, cost_complexity, tree_depth, min_n,
                                 threshold, exclude, scores, skip, id) {
  recipes::step(
    subclass = "select_tree",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    engine = engine,
    cost_complexity = cost_complexity,
    tree_depth = tree_depth,
    min_n = min_n,
    top_p = top_p,
    threshold = threshold,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}

# Train the step: fit a parsnip decision tree on the selected predictors, pull
# its feature importances and record which predictors to drop.
#' @export
prep.step_select_tree <- function(x, training, info = NULL, ...) {

  # translate the terms arguments
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # defaults used when no predictors were selected, so that `scores` is always
  # defined when the trained step is rebuilt below
  res <- tibble(variable = character(), score = numeric())
  exclude <- character()

  if (length(x_names) > 0) {
    # fit initial model
    X <- training[, x_names]
    y <- training[[y_name]]

    # is.numeric() covers integer as well as double outcomes;
    # inherits(y, "numeric") is FALSE for integer vectors
    model_mode <- if (is.numeric(y)) "regression" else "classification"

    model_args <- list(
      cost_complexity = x$cost_complexity,
      tree_depth = x$tree_depth,
      min_n = x$min_n
    )

    model_spec <-
      parsnip::make_call("decision_tree", args = model_args, ns = "parsnip")

    model_spec <-
      rlang::eval_tidy(model_spec) %>%
      parsnip::set_mode(model_mode) %>%
      parsnip::set_engine(x$engine)

    initial_model <- parsnip::fit_xy(model_spec, X, y)
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    # name the scores after their variables before selecting on them
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)
  }

  step_select_tree_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    engine = x$engine,
    cost_complexity = x$cost_complexity,
    tree_depth = x$tree_depth,
    min_n = x$min_n,
    top_p = x$top_p,
    threshold = x$threshold,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the excluded columns from `new_data`.
#' @export
bake.step_select_tree <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }

  as_tibble(new_data)
}

# Print a one-line summary; once trained, include the number of excluded
# predictors. Returns `x` invisibly.
#' @export
print.step_select_tree <- function(x, width = max(20, options()$width - 30),
                                   ...) {
  cat("Variable importance feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

#' @rdname step_select_tree
#' @param x A `step_select_tree` object.
#' @export
tidy.step_select_tree <- function(x, ...) {
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    # before training, report the selectors the step was created with
    res <- tibble(terms = recipes::sel2char(x$terms))
  }
  res$id <- x$id
  res
}

# Declare the selection criteria and the tree hyperparameters as tunable.
#' @export
tunable.step_select_tree <- function(x, ...) {
  tibble(
    name = c("top_p", "threshold", "cost_complexity", "tree_depth", "min_n"),
    call_info = list(
      list(pkg = "recipeselectors", fun = "top_p"),
      list(pkg = "dials", fun = "threshold", range = c(0, 1)),
      list(pkg = "dials", fun = "cost_complexity", range = c(-10, -1),
           trans = scales::log10_trans()),
      list(pkg = "dials", fun = "tree_depth", range = c(1L, 15L)),
      list(pkg = "dials", fun = "min_n", range = c(2L, 40L))
    ),
    source = "recipe",
    component = "step_select_tree",
    component_id = x$id
  )
}

-------------------------------------------------------------------------------- /docs/pkgdown.css: -------------------------------------------------------------------------------- 1 | /* Sticky footer */ 2 | 3 | /** 4 | * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ 5 | * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css 6 | * 7 | * .Site -> body > .container 8 | * .Site-content -> body > .container .row 9 | * .footer -> footer 10 | * 11 | * Key idea seems to be to ensure that .container and __all its parents__ 12 | * have height set to 100% 13 | * 14 | */ 15 | 16 | html, body { 17 | height: 100%; 18 | } 19 | 20 | body { 21 | position: relative; 22 | } 23 | 24 | body > .container { 25 | display: flex; 26 | height: 100%; 27 | flex-direction: column; 28 | } 29 | 30 | body > .container .row { 31 | flex: 1 0 auto; 32 | } 33 | 34 | footer { 35 | margin-top: 45px; 36 | padding: 35px 0 36px; 37 | border-top: 1px solid #e5e5e5; 38 | color: #666; 39 | display: flex; 40 | flex-shrink: 0; 41 | } 42 | footer p { 43 | margin-bottom: 0; 44 | } 45 | footer div { 46 | flex: 1; 47 | } 48 | footer .pkgdown { 49 | text-align: right; 50 | } 51 | footer p { 52 | margin-bottom: 0; 53 | } 54 | 55 | img.icon { 56 | float: right; 57 | } 58 | 59 | /* Ensure in-page images don't run outside their container */ 60 |
.contents img { 61 | max-width: 100%; 62 | height: auto; 63 | } 64 | 65 | /* Fix bug in bootstrap (only seen in firefox) */ 66 | summary { 67 | display: list-item; 68 | } 69 | 70 | /* Typographic tweaking ---------------------------------*/ 71 | 72 | .contents .page-header { 73 | margin-top: calc(-60px + 1em); 74 | } 75 | 76 | dd { 77 | margin-left: 3em; 78 | } 79 | 80 | /* Section anchors ---------------------------------*/ 81 | 82 | a.anchor { 83 | display: none; 84 | margin-left: 5px; 85 | width: 20px; 86 | height: 20px; 87 | 88 | background-image: url(./link.svg); 89 | background-repeat: no-repeat; 90 | background-size: 20px 20px; 91 | background-position: center center; 92 | } 93 | 94 | h1:hover .anchor, 95 | h2:hover .anchor, 96 | h3:hover .anchor, 97 | h4:hover .anchor, 98 | h5:hover .anchor, 99 | h6:hover .anchor { 100 | display: inline-block; 101 | } 102 | 103 | /* Fixes for fixed navbar --------------------------*/ 104 | 105 | .contents h1, .contents h2, .contents h3, .contents h4 { 106 | padding-top: 60px; 107 | margin-top: -40px; 108 | } 109 | 110 | /* Navbar submenu --------------------------*/ 111 | 112 | .dropdown-submenu { 113 | position: relative; 114 | } 115 | 116 | .dropdown-submenu>.dropdown-menu { 117 | top: 0; 118 | left: 100%; 119 | margin-top: -6px; 120 | margin-left: -1px; 121 | border-radius: 0 6px 6px 6px; 122 | } 123 | 124 | .dropdown-submenu:hover>.dropdown-menu { 125 | display: block; 126 | } 127 | 128 | .dropdown-submenu>a:after { 129 | display: block; 130 | content: " "; 131 | float: right; 132 | width: 0; 133 | height: 0; 134 | border-color: transparent; 135 | border-style: solid; 136 | border-width: 5px 0 5px 5px; 137 | border-left-color: #cccccc; 138 | margin-top: 5px; 139 | margin-right: -10px; 140 | } 141 | 142 | .dropdown-submenu:hover>a:after { 143 | border-left-color: #ffffff; 144 | } 145 | 146 | .dropdown-submenu.pull-left { 147 | float: none; 148 | } 149 | 150 | .dropdown-submenu.pull-left>.dropdown-menu { 151 | left: 
-100%; 152 | margin-left: 10px; 153 | border-radius: 6px 0 6px 6px; 154 | } 155 | 156 | /* Sidebar --------------------------*/ 157 | 158 | #pkgdown-sidebar { 159 | margin-top: 30px; 160 | position: -webkit-sticky; 161 | position: sticky; 162 | top: 70px; 163 | } 164 | 165 | #pkgdown-sidebar h2 { 166 | font-size: 1.5em; 167 | margin-top: 1em; 168 | } 169 | 170 | #pkgdown-sidebar h2:first-child { 171 | margin-top: 0; 172 | } 173 | 174 | #pkgdown-sidebar .list-unstyled li { 175 | margin-bottom: 0.5em; 176 | } 177 | 178 | /* bootstrap-toc tweaks ------------------------------------------------------*/ 179 | 180 | /* All levels of nav */ 181 | 182 | nav[data-toggle='toc'] .nav > li > a { 183 | padding: 4px 20px 4px 6px; 184 | font-size: 1.5rem; 185 | font-weight: 400; 186 | color: inherit; 187 | } 188 | 189 | nav[data-toggle='toc'] .nav > li > a:hover, 190 | nav[data-toggle='toc'] .nav > li > a:focus { 191 | padding-left: 5px; 192 | color: inherit; 193 | border-left: 1px solid #878787; 194 | } 195 | 196 | nav[data-toggle='toc'] .nav > .active > a, 197 | nav[data-toggle='toc'] .nav > .active:hover > a, 198 | nav[data-toggle='toc'] .nav > .active:focus > a { 199 | padding-left: 5px; 200 | font-size: 1.5rem; 201 | font-weight: 400; 202 | color: inherit; 203 | border-left: 2px solid #878787; 204 | } 205 | 206 | /* Nav: second level (shown on .active) */ 207 | 208 | nav[data-toggle='toc'] .nav .nav { 209 | display: none; /* Hide by default, but at >768px, show it */ 210 | padding-bottom: 10px; 211 | } 212 | 213 | nav[data-toggle='toc'] .nav .nav > li > a { 214 | padding-left: 16px; 215 | font-size: 1.35rem; 216 | } 217 | 218 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 219 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 220 | padding-left: 15px; 221 | } 222 | 223 | nav[data-toggle='toc'] .nav .nav > .active > a, 224 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 225 | nav[data-toggle='toc'] .nav .nav > .active:focus > a { 226 | padding-left: 15px; 227 | 
font-weight: 500; 228 | font-size: 1.35rem; 229 | } 230 | 231 | /* orcid ------------------------------------------------------------------- */ 232 | 233 | .orcid { 234 | font-size: 16px; 235 | color: #A6CE39; 236 | /* margins are required by official ORCID trademark and display guidelines */ 237 | margin-left:4px; 238 | margin-right:4px; 239 | vertical-align: middle; 240 | } 241 | 242 | /* Reference index & topics ----------------------------------------------- */ 243 | 244 | .ref-index th {font-weight: normal;} 245 | 246 | .ref-index td {vertical-align: top; min-width: 100px} 247 | .ref-index .icon {width: 40px;} 248 | .ref-index .alias {width: 40%;} 249 | .ref-index-icons .alias {width: calc(40% - 40px);} 250 | .ref-index .title {width: 60%;} 251 | 252 | .ref-arguments th {text-align: right; padding-right: 10px;} 253 | .ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} 254 | .ref-arguments .name {width: 20%;} 255 | .ref-arguments .desc {width: 80%;} 256 | 257 | /* Nice scrolling for wide elements --------------------------------------- */ 258 | 259 | table { 260 | display: block; 261 | overflow: auto; 262 | } 263 | 264 | /* Syntax highlighting ---------------------------------------------------- */ 265 | 266 | pre, code, pre code { 267 | background-color: #f8f8f8; 268 | color: #333; 269 | } 270 | pre, pre code { 271 | white-space: pre-wrap; 272 | word-break: break-all; 273 | overflow-wrap: break-word; 274 | } 275 | 276 | pre { 277 | border: 1px solid #eee; 278 | } 279 | 280 | pre .img, pre .r-plt { 281 | margin: 5px 0; 282 | } 283 | 284 | pre .img img, pre .r-plt img { 285 | background-color: #fff; 286 | } 287 | 288 | code a, pre a { 289 | color: #375f84; 290 | } 291 | 292 | a.sourceLine:hover { 293 | text-decoration: none; 294 | } 295 | 296 | .fl {color: #1514b5;} 297 | .fu {color: #000000;} /* function */ 298 | .ch,.st {color: #036a07;} /* string */ 299 | .kw {color: #264D66;} /* keyword */ 300 | .co {color: #888888;} /* comment */ 
301 | 302 | .error {font-weight: bolder;} 303 | .warning {font-weight: bolder;} 304 | 305 | /* Clipboard --------------------------*/ 306 | 307 | .hasCopyButton { 308 | position: relative; 309 | } 310 | 311 | .btn-copy-ex { 312 | position: absolute; 313 | right: 0; 314 | top: 0; 315 | visibility: hidden; 316 | } 317 | 318 | .hasCopyButton:hover button.btn-copy-ex { 319 | visibility: visible; 320 | } 321 | 322 | /* headroom.js ------------------------ */ 323 | 324 | .headroom { 325 | will-change: transform; 326 | transition: transform 200ms linear; 327 | } 328 | .headroom--pinned { 329 | transform: translateY(0%); 330 | } 331 | .headroom--unpinned { 332 | transform: translateY(-100%); 333 | } 334 | 335 | /* mark.js ----------------------------*/ 336 | 337 | mark { 338 | background-color: rgba(255, 255, 51, 0.5); 339 | border-bottom: 2px solid rgba(255, 153, 51, 0.3); 340 | padding: 1px; 341 | } 342 | 343 | /* vertical spacing after htmlwidgets */ 344 | .html-widget { 345 | margin-bottom: 10px; 346 | } 347 | 348 | /* fontawesome ------------------------ */ 349 | 350 | .fab { 351 | font-family: "Font Awesome 5 Brands" !important; 352 | } 353 | 354 | /* don't display links in code chunks when printing */ 355 | /* source: https://stackoverflow.com/a/10781533 */ 356 | @media print { 357 | code a:link:after, code a:visited:after { 358 | content: ""; 359 | } 360 | } 361 | 362 | /* Section anchors --------------------------------- 363 | Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 364 | */ 365 | 366 | div.csl-bib-body { } 367 | div.csl-entry { 368 | clear: both; 369 | } 370 | .hanging-indent div.csl-entry { 371 | margin-left:2em; 372 | text-indent:-2em; 373 | } 374 | div.csl-left-margin { 375 | min-width:2em; 376 | float:left; 377 | } 378 | div.csl-right-inline { 379 | margin-left:2em; 380 | padding-left:1em; 381 | } 382 | div.csl-indent { 383 | margin-left: 2em; 384 | } 385 | 
# --------------------------------------------------------------------------------
# /R/step_select_forests.R:
# --------------------------------------------------------------------------------
#' Feature selection step using random forest feature importance scores
#'
#' `step_select_forests` creates a *specification* of a recipe step that selects
#' a subset of predictors based on the ranking of variable importance using
#' a `parsnip::rand_forest` supported model.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#'   operations for this recipe.
#' @param ... One or more selector functions to choose which variables are
#'   affected by the step. See selections() for more details. For the tidy
#'   method, these are not currently used.
#' @param outcome A character string with the name of the response variable to
#'   use to calculate the feature importance scores.
#' @param role Not used by this step since no new variables are created.
#' @param trained A logical to indicate if the quantities for preprocessing have
#'   been estimated.
#' @param engine A rand_forest engine that is supported by parsnip. The default
#'   is "ranger".
#' @param options A named list of options to pass to the rand_forest engine. For
#'   example, if `engine = 'ranger'` (the default) then options could be
#'   `list(importance = 'permutation')` because a feature importance method
#'   needs to be specified for this engine. This is the default.
#' @param top_p An integer with the number of best scoring features to
#'   select.
#' @param mtry An integer for the number of predictors that will be randomly
#'   sampled at each split when creating the tree models.
#' @param trees An integer for the number of trees contained in the ensemble.
#' @param min_n An integer for the minimum number of data points in a node that
#'   are required for the node to be split further.
#' @param threshold A numeric value between 0 and 1 representing the percentile
#'   of best scoring features to select. Features with scores that are _larger_
#'   than the specified threshold will be retained, for example `threshold =
#'   0.9` will retain only predictors with scores in the top 90th percentile.
#'   Note that this overrides `top_p`.
#' @param exclude A character vector of predictor names that will be removed
#'   from the data. This will be set when `prep()` is used on the recipe and
#'   should not be set by the user.
#' @param scores A tibble with 'variable' and 'score' columns containing the
#'   names of the variables and their feature importance scores. This parameter
#'   is only produced after the recipe has been trained.
#' @param skip A logical. Should the step be skipped when the recipe is baked by
#'   bake.recipe()? While all operations are baked when prep.recipe() is run,
#'   some operations may not be able to be conducted on new data (e.g.
#'   processing the outcome variable(s)). Care should be taken when using skip =
#'   TRUE as it may affect the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#'
#' @return a `step_select_forests` object.
#' @export
#' @examples
#' library(recipes)
#' library(parsnip)
#'
#' # load the example cells dataset
#' data(cells, package = "modeldata")
#'
#' # create a preprocessing recipe
#' rec <-
#'   recipe(class ~ ., data = cells[, -1]) %>%
#'   step_select_forests(all_predictors(), outcome = "class", top_p = 10,
#'                       threshold = 0.9)
#'
#' prepped <- prep(rec)
#'
#' preproc_data <- juice(prepped)
#' prepped
step_select_forests <- function(
    recipe,
    ...,
    outcome = NULL,
    role = "predictor",
    trained = FALSE,
    engine = "ranger",
    options = list(importance = "permutation"),
    mtry = NULL,
    trees = NULL,
    min_n = NULL,
    top_p = NA,
    threshold = NA,
    exclude = NULL,
    scores = NULL,
    skip = FALSE,
    id = recipes::rand_id("select_forests")) {

  # look the registered rand_forest engines up once and reuse the result for
  # both the membership test and the error message
  engines <- parsnip::show_engines("rand_forest")$engine

  if (!engine %in% engines) {
    rlang::abort(
      paste("Engine argument should be one of", paste(engines, collapse = ", "))
    )
  }

  recipes::add_step(
    recipe,
    step_select_forests_new(
      terms = recipes::ellipse_check(...),
      trained = trained,
      outcome = outcome,
      role = role,
      engine = engine,
      options = options,
      mtry = mtry,
      trees = trees,
      min_n = min_n,
      top_p = top_p,
      threshold = threshold,
      exclude = exclude,
      scores = scores,
      skip = skip,
      id = id
    )
  )
}

# wrapper around 'step' function that sets the class of new step objects
#' @importFrom recipes step
step_select_forests_new <- function(terms, role, trained, outcome, engine,
                                    options, top_p, mtry, trees, min_n,
                                    threshold, exclude, scores, skip, id) {
  recipes::step(
    subclass = "select_forests",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    engine = engine,
    options = options,
    mtry = mtry,
    trees = trees,
    min_n = min_n,
    top_p = top_p,
    threshold = threshold,
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}

# Train the step: fit a rand_forest model on the selected predictors, pull its
# importance scores and record which predictors to drop at bake time.
#' @export
prep.step_select_forests <- function(x, training, info = NULL, ...) {
  # translate the terms arguments
  x_names <- recipes::terms_select(terms = x$terms, info = info)
  y_name <- recipes::terms_select(x$outcome, info = info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  if (length(x_names) > 0) {
    # fit initial model
    X <- training[, x_names]
    y <- training[[y_name]]

    # `inherits(y, "numeric")` is FALSE for integer vectors, which sent integer
    # outcomes down the classification path; is.numeric() covers both integer
    # and double. The condition is scalar, so plain if/else is used.
    model_mode <- if (is.numeric(y)) "regression" else "classification"

    model_args <- list(
      trees = x$trees,
      mtry = x$mtry,
      min_n = x$min_n
    )

    model_spec <-
      parsnip::make_call("rand_forest", args = model_args, ns = "parsnip")

    model_spec <-
      rlang::eval_tidy(model_spec) %>%
      parsnip::set_mode(model_mode) %>%
      parsnip::set_engine(x$engine, !!!x$options)

    initial_model <- parsnip::fit_xy(model_spec, X, y)
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      select_percentile(res$score, x$top_p, x$threshold, maximize = TRUE)

  } else {
    exclude <- character()
    # `res` is passed to the constructor below, so it must exist on this path
    # too; previously this branch failed with "object 'res' not found"
    res <- tibble(variable = character(), score = numeric())
  }

  step_select_forests_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    engine = x$engine,
    options = x$options,
    mtry = x$mtry,
    trees = x$trees,
    min_n = x$min_n,
    top_p = x$top_p,
    threshold = x$threshold,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}

# Apply the trained step: drop the columns recorded in `object$exclude`.
#' @export
bake.step_select_forests <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep <- !colnames(new_data) %in% object$exclude
    # drop = FALSE keeps a one-column result rectangular when `new_data` is a
    # plain data.frame rather than a tibble
    new_data <- new_data[, keep, drop = FALSE]
  }

  as_tibble(new_data)
}

# Print a one-line summary; the excluded count is only shown once trained.
#' @export
print.step_select_forests <- function(x, width = max(20, options()$width - 30),
                                      ...) {
  cat("Variable importance feature selection")

  if (recipes::is_trained(x)) {
    n <- length(x$exclude)
    cat(paste0(" (", n, " excluded)"))
  }
  cat("\n")

  invisible(x)
}

# Once trained, `terms` lists the excluded predictors; before training it
# lists the selectors given to the step.
#' @rdname tidy.recipe
#' @param x A `step_select_forests` object.
#' @export
tidy.step_select_forests <- function(x, ...) {
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)

  } else {
    term_names <- recipes::sel2char(x$terms)
    res <- tibble(terms = term_names)
  }
  res$id <- x$id
  res
}

# Declare the tunable arguments of the step and the parameter objects that
# describe them.
#' @export
tunable.step_select_forests <- function(x, ...) {
  tibble(
    name = c("top_p", "threshold", "mtry", "trees", "min_n"),
    call_info = list(
      list(pkg = "recipeselectors", fun = "top_p"),
      list(pkg = "dials", fun = "threshold", range = c(0, 1)),
      list(pkg = "dials", fun = "mtry", range = c(1L, dials::unknown())),
      list(pkg = "dials", fun = "trees", range = c(1L, 2000L)),
      list(pkg = "dials", fun = "min_n", range = c(2L, 40L))
    ),
    source = "recipe",
    component = "step_select_forests",
    component_id = x$id
  )
}
# --------------------------------------------------------------------------------
# /R/pull_importances.R:
# --------------------------------------------------------------------------------
#' Pull feature importances from a parsnip fitted model
#'
#' `pull_importances` is a generic function to extract feature importance scores
#' or coefficients from a parsnip `model_fit` object and return them as a tibble
#' with a 'feature' and 'importance' column. This is designed to support the
#' feature selection recipe steps in this package (for example
#' `step_select_forests`).
#'
#' Most of the basic models within the parsnip package that support feature
#' importances are implemented (call `methods(pull_importances)` to list models
#' that are currently implemented). If you need to pull the feature importance
#' scores from a model that is not currently supported in this package, then
#' you can add a method to the pull_importances generic function which returns
#' a two-column tibble (see the examples).
#'
#' @param object A `model_fit` object.
#' @param scaled A logical indicating whether to rescale the importances between
#'   0 and 100. The default is TRUE, except for the coefficient-based methods
#'   (lm, glm and glmnet fits) where it is FALSE.
#' @param ... A list of other parameters passed to the feature importance
#'   method.
#'
#' @return A tibble with `feature` and `importance` columns.
#' @export
#'
#' @examples
#' library(parsnip)
#'
#' # pull feature importances from a model_fit object
#' model <- boost_tree(mode = "classification") %>%
#'   set_engine("xgboost")
#' model_fit <- model %>% fit(Species ~., iris)
#' pull_importances(model_fit)
#'
#' # create a new pull_importances method
#' pull_importances._ranger <- function(object, scaled = FALSE, ...) {
#'   # create a call to the ranger::importance function avoiding having to use
#'   # ranger as a dependency
#'   call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
#'   scores <- rlang::eval_tidy(call)
#'
#'   # create a tibble with 'feature' and 'importance' columns
#'   scores <- tibble::tibble(
#'     feature = names(scores),
#'     importance = as.numeric(scores)
#'   )
#'
#'   # optionally rescale the importance scores (`rescale` is an internal,
#'   # unexported helper of this package; substitute your own scaling)
#'   if (isTRUE(scaled))
#'     scores$importance <- rescale(scores$importance)
#'
#'   scores
#' }
pull_importances <- function(object, scaled = TRUE, ...) {
  UseMethod("pull_importances", object)
}


# Min-max rescale `x` onto 0-100.
#
# NA entries are ignored when finding the range and stay NA in the result.
# Empty or all-NA input is returned unchanged. When every observed value is
# identical the range is zero and the formula would give NaN, so those values
# are all mapped to the midpoint (50) instead.
rescale <- function(x) {
  observed <- x[!is.na(x)]

  if (length(observed) == 0L) {
    return(x)
  }

  lowest <- min(observed)
  span <- max(observed) - lowest

  if (span == 0) {
    x[!is.na(x)] <- 50
    return(x)
  }

  (x - lowest) / span * 100
}


# Shared implementation for the H2O model classes: reads the `variable` and
# `relative_importance` columns of the h2o.varimp() result.
h2o_importances <- function(object, scaled) {
  call <- rlang::call2(.fn = "h2o.varimp", .ns = "h2o", object = object$fit)
  varimp <- rlang::eval_tidy(call)

  scores <-
    tibble(feature = varimp$variable, importance = varimp$relative_importance)

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}


# Shared implementation for lm/glm fits: importances are the model
# coefficients, taken in absolute value only when rescaling is requested.
coef_importances <- function(object, scaled, intercept) {
  coefs <- stats::coefficients(object$fit)
  scores <- tibble(feature = names(coefs), importance = coefs)

  if (!intercept) {
    scores <- scores[scores$feature != "(Intercept)", ]
  }

  if (scaled) {
    scores$importance <- rescale(abs(scores$importance))
  }

  scores
}


# Shared implementation for glmnet fits (elnet/lognet): importances are the
# coefficients at the requested `penalty`, falling back to the penalty stored
# in the model specification.
glmnet_importances <- function(object, scaled, intercept, penalty) {
  if (is.null(penalty)) {
    # eval_tidy() returns plain values unchanged and unwraps a quosure, in
    # case the specification stores the argument in that form
    penalty <- rlang::eval_tidy(object$spec$args$penalty)
  }

  if (is.null(penalty)) {
    rlang::abort(
      paste(
        "model specification was not fitted using a `penalty` value.",
        "`penalty` should be supplied to the `pull_importances` method"
      )
    )
  }

  # extract the coefficients once and reuse them for names and values
  coefs <- stats::coef(object$fit, s = penalty)
  scores <- tibble(feature = rownames(coefs), importance = coefs[, 1])

  if (!intercept) {
    scores <- scores[scores$feature != "(Intercept)", ]
  }

  if (scaled) {
    scores$importance <- rescale(abs(scores$importance))
  }

  scores
}


# Fallback for unsupported model classes: emits a message and returns NULL
# invisibly (the return value of message()).
#' @export
pull_importances.default <- function(object, scaled = TRUE, ...) {
  message(paste(
    "No method for pulling feature importances is defined for",
    class(object)[1]
  ))
}


# `type` names the column of the xgboost::xgb.importance() result to use.
#' @export
pull_importances._xgb.Booster <-
  function(object,
           scaled = TRUE,
           type = "Gain",
           ...) {

    call <- rlang::call2(
      .fn = "xgb.importance",
      .ns = "xgboost",
      model = object$fit
    )
    importances <- rlang::eval_tidy(call)

    # an unknown `type` used to yield NULL and silently drop the importance
    # column from the result
    if (!type %in% names(importances)) {
      rlang::abort(
        paste0(
          "`type` should be one of: ",
          paste(setdiff(names(importances), "Feature"), collapse = ", ")
        )
      )
    }

    scores <-
      tibble(feature = importances$Feature, importance = importances[[type]])

    if (scaled) {
      scores$importance <- rescale(scores$importance)
    }

    scores
  }

# Extra arguments in `...` are forwarded to C50::C5imp(); when none are given
# the "usage" metric is requested.
#' @export
pull_importances._C5.0 <- function(object, scaled = TRUE, ...) {
  others <- list(...)

  if (length(others) == 0) {
    others$metric <- "usage"
  }

  call <-
    rlang::call2(.fn = "C5imp", .ns = "C50", object = object$fit, !!!others)
  importances <- rlang::eval_tidy(call)

  scores <-
    tibble(feature = rownames(importances), importance = importances$Overall)

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}

#' @export
pull_importances._H2OMultinomialModel <-
  function(object, scaled = TRUE, ...) {
    h2o_importances(object, scaled)
  }

#' @export
pull_importances._H2ORegressionModel <-
  function(object, scaled = TRUE, ...) {
    h2o_importances(object, scaled)
  }

#' @export
pull_importances._ranger <- function(object, scaled = TRUE, ...) {
  call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
  importances <- rlang::eval_tidy(call)

  scores <- tibble(
    feature = names(importances),
    importance = as.numeric(importances)
  )

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}

# Reads the `usage` table stored on the fitted object.
#' @export
pull_importances._cubist <- function(object, scaled = TRUE, ...) {
  usage <- object$fit$usage

  scores <- tibble(feature = usage$Variable, importance = usage$Model)

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}

# Uses the "rss" column of the earth::evimp() result.
#' @export
pull_importances._earth <- function(object, scaled = TRUE, ...) {
  call <- rlang::call2(.fn = "evimp", .ns = "earth", object = object$fit)
  importances <- rlang::eval_tidy(call)

  scores <- tibble(
    feature = rownames(importances),
    importance = importances[, "rss"]
  )

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}

#' @export
pull_importances._lm <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           ...) {
    coef_importances(object, scaled, intercept)
  }

#' @export
pull_importances._glm <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           ...) {
    coef_importances(object, scaled, intercept)
  }

#' @export
pull_importances._elnet <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           penalty = NULL,
           ...) {
    glmnet_importances(object, scaled, intercept, penalty)
  }

#' @export
pull_importances._lognet <-
  function(object,
           scaled = FALSE,
           intercept = FALSE,
           penalty = NULL,
           ...) {
    glmnet_importances(object, scaled, intercept, penalty)
  }

#' @export
pull_importances._randomForest <-
  function(object, scaled = TRUE, ...) {
    importances <- object$fit$importance

    # `importance` is a matrix, so storing it whole produced a matrix column
    # instead of one score per feature. Take a single column as a vector; the
    # last one is used here.
    # NOTE(review): the last column is presumed to be the node-impurity
    # measure in every randomForest fit - confirm against the package docs.
    scores <- tibble(
      feature = rownames(importances),
      importance = as.numeric(importances[, ncol(importances)])
    )

    if (scaled) {
      scores$importance <- rescale(scores$importance)
    }

    scores
  }

#' @export
pull_importances._rpart <- function(object, scaled = TRUE, ...) {
  importances <- object$fit$variable.importance

  # a NULL `variable.importance` would otherwise drop both columns from the
  # tibble; return an empty two-column tibble instead
  if (is.null(importances)) {
    importances <- numeric()
  }

  scores <- tibble(
    feature = as.character(names(importances)),
    importance = importances
  )

  if (scaled) {
    scores$importance <- rescale(scores$importance)
  }

  scores
}

# stan?
# surv?
# --------------------------------------------------------------------------------