├── .github ├── .gitignore └── workflows │ ├── pkgdown.yaml │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── LICENSE ├── tests ├── testthat.R └── testthat │ ├── test-step_select_relief.R │ ├── test_step_select_mrmr.R │ ├── test_step_select_linear.R │ ├── test_dual_filter.R │ ├── test_discretize_var.R │ ├── test_step_select_tree.R │ ├── test_step_select_boruta.R │ ├── test_step_select_vip.R │ ├── test_step_select_infgain.R │ ├── test_step_select_forests.R │ └── test_step_select_fcbf.R ├── .gitignore ├── _pkgdown.yml ├── docs ├── reference │ ├── Rplot001.png │ └── pipe.html ├── pkgdown.yml ├── link.svg ├── sitemap.xml ├── pkgdown.js ├── LICENSE-text.html ├── 404.html ├── LICENSE.html └── authors.html ├── .Rbuildignore ├── R ├── utils-pipe.R ├── imports.R ├── tidy_filter_step.R ├── colino-package.R ├── parameters.R ├── misc.R ├── step_select_boruta.R ├── step_select_roc.R ├── step_select_mrmr.R ├── step_select_vip.R ├── step_select_xtab.R ├── step_select_carscore.R ├── step_select_infgain.R ├── step_select_tree.R └── step_select_relief.R ├── man ├── pipe.Rd ├── entropy.Rd ├── top_p.Rd ├── cutoff.Rd ├── colino.Rd ├── dual_filter.Rd ├── required_pkgs.colino.Rd ├── pull_importances.Rd ├── step_select_aov.Rd ├── step_select_boruta.Rd ├── step_select_roc.Rd ├── step_select_xtab.Rd ├── step_select_mrmr.Rd ├── step_select_carscore.Rd ├── step_select_fcbf.Rd ├── step_select_linear.Rd ├── step_select_vip.Rd ├── step_select_tree.Rd ├── step_select_infgain.Rd ├── step_select_forests.Rd └── step_select_relief.Rd ├── colino.Rproj ├── LICENSE.md ├── DESCRIPTION └── NAMESPACE /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Steven Pawley 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(colino) 3 | 4 | test_check("colino") 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | docs 7 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://stevenpawley.github.io/colino/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stevenpawley/colino/HEAD/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^README\.Rmd$ 5 | ^\.travis\.yml$ 6 | ^codecov\.yml$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | ^\.github$ 11 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.19.2 2 | pkgdown: 2.0.6 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2022-11-21T21:35Z 6 | urls: 7 | reference: https://stevenpawley.github.io/colino/reference 8 | article: https://stevenpawley.github.io/colino/articles 9 | 10 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | ## usethis namespace: start 2 | #' @importFrom tibble tibble as_tibble 3 | #' @importFrom recipes prep bake 4 | #' @importFrom generics tidy required_pkgs 5 | #' @importFrom tune tunable 6 | #' @importFrom stats aov as.formula 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/tidy_filter_step.R: -------------------------------------------------------------------------------- 1 | tidy_filter_step <- function(x, type = "terms") { 2 | if (recipes::is_trained(x)) { 3 | if (type == "terms") { 4 | res <- tibble(terms = x$exclude) 5 | } else if (type == "scores") { 6 | res <- x$scores 7 | res <- res[order(res$score, decreasing = TRUE), ] 8 | } 9 | 10 | } else { 11 | res <- tibble(terms = rlang::na_chr) 12 | } 13 | res$id <- x$id 14 | res 15 | } 16 | -------------------------------------------------------------------------------- /colino.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /man/entropy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{entropy} 4 | \alias{entropy} 5 | \title{Parameter functions for feature selection recipes} 6 | \usage{ 7 | entropy(values = values_entropy) 8 | } 9 | \arguments{ 10 | \item{values}{A character string of possible values. See `values_entropy` for 11 | possible values.} 12 | } 13 | \value{ 14 | A function with classes "qual_param" and "param" 15 | } 16 | \description{ 17 | Entropy-based feature selection methods can be applied using several methods 18 | to calculate the entropy formula. `entropy` is for specifying the type of 19 | entropy-based filter that is used. 20 | } 21 | \examples{ 22 | entropy('infogain') 23 | } 24 | -------------------------------------------------------------------------------- /tests/testthat/test-step_select_relief.R: -------------------------------------------------------------------------------- 1 | test_that("step_select_relief", { 2 | skip_if_not_installed("FSelectorRcpp") 3 | 4 | # FSelectorRcpp method 5 | set.seed(1234) 6 | raw <- FSelectorRcpp::relief( 7 | formula = Species ~ ., 8 | data = iris, 9 | neighboursCount = 5, 10 | sampleSize = 10 11 | ) 12 | raw <- setNames(raw, c("variable", "score")) 13 | raw <- raw[order(raw$score, decreasing = TRUE), ] 14 | 15 | # test recipe 16 | rec <- 17 | recipe(Species ~ ., iris) %>% 18 | step_select_relief(all_predictors(), outcome = "Species", top_p = 2) 19 | 20 | set.seed(1234) 21 | prepped <- prep(rec) 22 | expect_equal(as.numeric(prepped$steps[[1]]$scores$score), raw$score) 23 | 24 | new_data <- bake(prepped, new_data = NULL) 25 | expect_equal(ncol(new_data), 3) 26 | }) 27 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_mrmr.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | data("iris") 5 | 6 | test_that("step_select_mrmr, execution", { 7 | skip_if_not_installed("praznik") 8 | 9 | irisX <- iris[-5] 10 | y <- iris$Species 11 | 12 | res <- praznik::MRMR(X = irisX, Y = y, k = 4) 13 | 14 | mrmr_scores <- tibble( 15 | variable = names(res$score), 16 | scores = res$score 17 | ) 18 | 19 | rec <- recipe(Species ~ ., data = iris) 20 | 21 | mrmr_rec <- rec %>% 22 | step_select_mrmr(all_predictors(), outcome = "Species", top_p = 2) %>% 23 | prep() 24 | 25 | mrmr_pred <- juice(mrmr_rec) 26 | expect_true(all(names(mrmr_pred)[1:2] %in% mrmr_scores$variable[1:2])) 27 | 28 | expect_equal(mrmr_scores$scores, mrmr_rec$steps[[1]]$scores$score) 29 | }) 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/link.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /man/top_p.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{top_p} 4 | \alias{top_p} 5 | \title{Parameter functions for feature selection recipes} 6 | \usage{ 7 | top_p(range = c(1L, 4L), trans = NULL) 8 | } 9 | \arguments{ 10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and 11 | largest possible values, respectively.} 12 | 13 | \item{trans}{A `trans` object from the `scales` package, such as 14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 15 | the default is used which matches the units used in `range`. If no 16 | transformation, `NULL`.} 17 | } 18 | \value{ 19 | A function with classes "quant_param" and "param" 20 | } 21 | \description{ 22 | Feature selection recipes allow the top-performing features to be selected 23 | using three parameters. `top_p` is for specifying the number of the 24 | top-performing features. 25 | } 26 | \examples{ 27 | top_p(c(3, 10)) 28 | } 29 | -------------------------------------------------------------------------------- /man/cutoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parameters.R 3 | \name{cutoff} 4 | \alias{cutoff} 5 | \title{Parameter functions for feature selection recipes} 6 | \usage{ 7 | cutoff(range = c(dials::unknown(), dials::unknown()), trans = NULL) 8 | } 9 | \arguments{ 10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and 11 | largest possible values, respectively.} 12 | 13 | \item{trans}{A `trans` object from the `scales` package, such as 14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 15 | the default is used which matches the units used in `range`. If no 16 | transformation, `NULL`.} 17 | } 18 | \value{ 19 | A function with classes "quant_param" and "param" 20 | } 21 | \description{ 22 | Feature selection recipes allow the top-performing features to be selected 23 | using three parameters. `cutoff` is for selecting features using the absolute 24 | value in the filter methods scores. 25 | } 26 | \examples{ 27 | cutoff(c(3.5, 15)) 28 | } 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2019 Steven Pawley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_linear.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | library(modeldata) 6 | 7 | data("cells") 8 | 9 | test_that("step_select_linear, execution using top_p on binary case", { 10 | rec <- cells %>% 11 | select(-case) %>% 12 | recipe(class ~ .) %>% 13 | step_normalize(all_numeric_predictors()) %>% 14 | step_select_linear( 15 | all_predictors(), 16 | outcome = "class", 17 | top_p = 2 18 | ) 19 | 20 | prepped <- prep(rec) 21 | selected <- bake(prepped, new_data = NULL) 22 | 23 | expect_length(names(selected), 3) 24 | }) 25 | 26 | 27 | test_that("step_select_linear, execution using threshold on binary case", { 28 | # test selection by retaining features with scores >= 50th percentile 29 | rec <- cells %>% 30 | select(-case) %>% 31 | recipe(class ~ .) %>% 32 | step_normalize(all_numeric_predictors()) %>% 33 | step_select_linear( 34 | all_predictors(), 35 | outcome = "class", 36 | threshold = 0.99 37 | ) 38 | 39 | prepped <- prep(rec) 40 | selected <- juice(prepped) 41 | 42 | expect_length(names(selected), 2) 43 | }) 44 | -------------------------------------------------------------------------------- /R/colino-package.R: -------------------------------------------------------------------------------- 1 | #' colino: A collection of steps for feature selection to use with the 2 | #' 'recipes' package 3 | #' 4 | #' \pkg{colino} provides a collection of additional step objects 5 | #' related to feature selection to be used with the 'recipes' package. 6 | #' 7 | #' @examples 8 | #' library(parsnip) 9 | #' library(recipes) 10 | #' library(magrittr) 11 | #' 12 | #' # load the example iris dataset 13 | #' data(iris) 14 | #' 15 | #' # define a base model to use for feature importances 16 | #' base_model <- rand_forest(mode = "classification") %>% 17 | #' set_engine("ranger", importance = "permutation") 18 | #' 19 | #' # create a preprocessing recipe 20 | #' rec <- iris %>% 21 | #' recipe(Species ~ .) %>% 22 | #' step_select_vip(all_predictors(), model = base_model, top_p = 2, 23 | #' outcome = "Species") 24 | #' 25 | #' prepped <- prep(rec) 26 | #' 27 | #' # create a model specification 28 | #' clf <- decision_tree(mode = "classification") %>% 29 | #' set_engine("rpart") 30 | #' 31 | #' clf_fitted <- clf %>% 32 | #' fit(Species ~ ., juice(prepped)) 33 | #' 34 | #' @author Steven Pawley, \email{dr.stevenpawley@@gmail.com} 35 | 36 | #' @name colino 37 | #' @keywords internal 38 | "_PACKAGE" 39 | 40 | -------------------------------------------------------------------------------- /tests/testthat/test_dual_filter.R: -------------------------------------------------------------------------------- 1 | test_that("test dual filter", { 2 | scores <- c(feature1 = 0.25, feature2 = 0.1, feature3 = 0.5, feature4 = 0.9) 3 | 4 | # excludes features 1 and 2 5 | excluded <- dual_filter(scores, top_p = 2, threshold = NA, cutoff = NA, maximize = TRUE) 6 | expect_setequal(excluded, c("feature1", "feature2")) 7 | 8 | # excludes feature 2 (score is < cutoff and not in top two features) 9 | excluded <- dual_filter(scores, top_p = 2, threshold = NA, cutoff = 0.2, maximize = TRUE) 10 | expect_equal(excluded, "feature2") 11 | 12 | # expect error if both top_p and threshold are used 13 | expect_error( 14 | dual_filter(scores, top_p = 2, threshold = 0.5, cutoff = 0.2, maximize = TRUE), 15 | regexp = "mutually exclusive" 16 | ) 17 | 18 | # excludes features 1-3 because their absolute scores are less than cutoff 19 | excluded <- dual_filter(scores, top_p = NA, threshold = NA, cutoff = 0.7, maximize = TRUE) 20 | expect_setequal(excluded, c("feature1", "feature2", "feature3")) 21 | 22 | # excludes features 4 because their absolute scores are greater than cutoff 23 | excluded <- dual_filter(scores, top_p = NA, threshold = NA, cutoff = 0.7, maximize = FALSE) 24 | expect_equal(excluded, "feature4") 25 | }) 26 | -------------------------------------------------------------------------------- /tests/testthat/test_discretize_var.R: -------------------------------------------------------------------------------- 1 | test_that("discretize_var returns expected values", { 2 | expect_equal(discretize_var(c(8, 7, 2, 5, NA, 3, 1), cutpoint = 0.5), 3 | as.factor(c('h', 'h', 'l', 'h', NA, 'l', 'l'))) 4 | 5 | expect_equal(discretize_var(c(1, 1, 1, 1, 1, 21), cutpoint = 0.5), 6 | as.factor(c('l', 'l', 'l', 'l', 'l', 'h'))) 7 | 8 | expect_equal(discretize_var(1:50, cutpoint = 0.5), 9 | as.factor(c(rep('l', 25), rep('h', 25)))) 10 | 11 | expect_equal(discretize_var(as.numeric(c(NA, NA, NA, NA, NA)), cutpoint = 0.5), 12 | as.factor(c(NA, NA, NA, NA, NA))) 13 | }) 14 | 15 | test_that("discretize_var rejects bad feature input", { 16 | expect_error(discretize_var(c(NA, NULL), cutpoint = 0.5), 17 | "Feature must be numeric to discretize") 18 | 19 | expect_error(discretize_var(c('putty', 'grass', 'grass'), cutpoint = 0.5), 20 | "Feature must be numeric to discretize") 21 | 22 | expect_error(discretize_var(data.frame(x = 1:50), cutpoint = 0.5), 23 | "Feature must be numeric to discretize") 24 | 25 | expect_error(discretize_var(list(1, 2, 3, 4), cutpoint = 0.5), 26 | "Feature must be numeric to discretize") 27 | }) 28 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@4.1.4 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /man/colino.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/colino-package.R 3 | \docType{package} 4 | \name{colino} 5 | \alias{colino-package} 6 | \alias{colino} 7 | \title{colino: A collection of steps for feature selection to use with the 8 | 'recipes' package} 9 | \description{ 10 | \pkg{colino} provides a collection of additional step objects 11 | related to feature selection to be used with the 'recipes' package. 12 | } 13 | \examples{ 14 | library(parsnip) 15 | library(recipes) 16 | library(magrittr) 17 | 18 | # load the example iris dataset 19 | data(iris) 20 | 21 | # define a base model to use for feature importances 22 | base_model <- rand_forest(mode = "classification") \%>\% 23 | set_engine("ranger", importance = "permutation") 24 | 25 | # create a preprocessing recipe 26 | rec <- iris \%>\% 27 | recipe(Species ~ .) \%>\% 28 | step_select_vip(all_predictors(), model = base_model, top_p = 2, 29 | outcome = "Species") 30 | 31 | prepped <- prep(rec) 32 | 33 | # create a model specification 34 | clf <- decision_tree(mode = "classification") \%>\% 35 | set_engine("rpart") 36 | 37 | clf_fitted <- clf \%>\% 38 | fit(Species ~ ., juice(prepped)) 39 | 40 | } 41 | \seealso{ 42 | Useful links: 43 | \itemize{ 44 | \item \url{https://stevenpawley.github.io/colino} 45 | \item Report bugs at \url{https://github.com/stevenpawley/colino/issues} 46 | } 47 | 48 | } 49 | \author{ 50 | Steven Pawley, \email{dr.stevenpawley@gmail.com} 51 | } 52 | \keyword{internal} 53 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_tree.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | data("iris") 6 | 7 | test_that("step_select_tree, execution using top_p", { 8 | skip_if_not_installed("rpart") 9 | 10 | irisX <- iris[-5] 11 | y <- iris$Species 12 | 13 | rec <- iris %>% 14 | recipe(Species ~.) %>% 15 | step_select_tree( 16 | all_predictors(), 17 | outcome = "Species", 18 | engine = "rpart", 19 | top_p = 2 20 | ) 21 | 22 | prepped <- prep(rec) 23 | selected <- juice(prepped) 24 | 25 | expect_length(names(selected), 3) 26 | }) 27 | 28 | 29 | test_that("step_select_tree, execution using threshold", { 30 | skip_if_not_installed("rpart") 31 | 32 | irisX <- iris[-5] 33 | y <- iris$Species 34 | 35 | # test selection by retaining features with scores >= 50th percentile 36 | rec <- iris %>% 37 | recipe(Species ~.) %>% 38 | step_select_tree( 39 | all_predictors(), 40 | outcome = "Species", 41 | threshold = 0.5 42 | ) 43 | 44 | prepped <- prep(rec) 45 | selected <- juice(prepped) 46 | 47 | expect_length(names(selected), 3) 48 | 49 | # test selection by retaining features with scores in 90th percentile 50 | rec <- iris %>% 51 | recipe(Species ~.) %>% 52 | step_select_tree( 53 | all_predictors(), 54 | outcome = "Species", 55 | threshold = 0.9 56 | ) 57 | 58 | prepped <- prep(rec) 59 | selected <- juice(prepped) 60 | 61 | expect_length(names(selected), 2) 62 | }) 63 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 52 | -------------------------------------------------------------------------------- /man/dual_filter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/misc.R 3 | \name{dual_filter} 4 | \alias{dual_filter} 5 | \title{Select features using `top_p` or `threshold`.} 6 | \usage{ 7 | dual_filter(x, top_p, threshold, cutoff, maximize) 8 | } 9 | \arguments{ 10 | \item{x}{a named numeric vector of scores per feature} 11 | 12 | \item{top_p}{an integer specifying the number of top-performing features to 13 | retain} 14 | 15 | \item{threshold}{a numeric with percentile of top-performing features to 16 | retain. For example, `threshold = 0.9` will only retain features that are 17 | in the top 90th percentile. A smaller value of threshold will select 18 | more features.} 19 | 20 | \item{cutoff}{a numeric with the value that represents the cutoff in the 21 | scores in `x` by which to retain/discard features.} 22 | 23 | \item{maximize}{logical to indicate whether `top_p`, `threshold` and `cutoff` 24 | are used to keep features where high scores = 'best' (maximize = TRUE) or 25 | where low scores = 'best' (maximize = FALSE).} 26 | } 27 | \value{ 28 | character vector of feature names to exclude 29 | } 30 | \description{ 31 | Feature selection using either the `top_p` or `threshold` features OR 32 | `cutoff` where cutoff refers to the absolute numeric value of the feature 33 | importance scores. 34 | } 35 | \details{ 36 | `dual_filter` selects feature that are selected using either (`top_p`, 37 | `threshold`) or `cutoff` or both. If top_p/threshold and cutoff are both used 38 | then features are selected using OR. For example, if top_p selects features 1 39 | & 2, and threshold selects features 1 & 3, then the selected features = 40 | 1,2,3. 41 | } 42 | \keyword{internal} 43 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_boruta.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(modeldata) 5 | 6 | data("lending_club") 7 | 8 | test_that("step_select_boruta, execution", { 9 | skip_if_not_installed("Boruta") 10 | 11 | # Boruta model results 12 | set.seed(1234) 13 | boruta_mod <- Boruta::Boruta( 14 | x = lending_club[, -23], 15 | y = lending_club$Class 16 | ) 17 | excluded <- names( 18 | boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"] 19 | ) 20 | 21 | # step_select_boruta results 22 | rec <- recipe(Class ~ ., data = lending_club) %>% 23 | step_select_boruta(all_predictors(), outcome = "Class") 24 | set.seed(1234) 25 | prepped <- rec %>% prep() 26 | 27 | # check 28 | expect_equal(excluded, prepped$steps[[1]]$exclude) 29 | expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory) 30 | }) 31 | 32 | 33 | test_that("step_select_boruta, options", { 34 | skip_if_not_installed("Boruta") 35 | 36 | # Boruta model results 37 | set.seed(1234) 38 | boruta_mod <- Boruta::Boruta( 39 | x = lending_club[, -23], 40 | y = lending_club$Class, 41 | getImp = Boruta::getImpRfGini 42 | ) 43 | excluded <- names( 44 | boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"] 45 | ) 46 | 47 | # step_select_boruta results 48 | rec <- 49 | recipe(Class ~ ., data = lending_club) %>% 50 | step_select_boruta( 51 | all_predictors(), 52 | outcome = "Class", 53 | options = list(getImp = Boruta::getImpRfGini) 54 | ) 55 | set.seed(1234) 56 | prepped <- rec %>% prep() 57 | 58 | # check 59 | expect_equal(excluded, tidy(prepped, number = 1)$terms) 60 | expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory) 61 | }) 62 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: colino 2 | Type: Package 3 | Title: Recipes Steps for Supervised Filter-Based Feature Selection 4 | Version: 0.0.1 5 | Authors@R: 6 | c(person(given = "Steven", 7 | family = "Pawley", 8 | role = c("aut", "cre"), 9 | email = "dr.stevenpawley@gmail.com"), 10 | person(given = "Max", 11 | family = "Kuhn", 12 | role = c("aut"), 13 | email = "max@rstudio.com"), 14 | person(given = "Rowan", 15 | family = "Jacques-Hamilton", 16 | role = c("aut"), 17 | email = "rowan.jacques.hamilton@gmail.com"), 18 | person(given = "Byron", 19 | family = "Jaeger", 20 | role = c("aut"), 21 | email = "bjaeger@wakehealth.edu")) 22 | Maintainer: Steven Pawley 23 | Description: Provides supervised selection methods to be used as preprocessing 24 | steps alongside the 'recipes' package. These steps represent filter-based 25 | methods where the features are ranked according to the feature selection 26 | method and a subset of features are retained. 27 | License: MIT + file LICENSE 28 | Encoding: UTF-8 29 | URL: https://stevenpawley.github.io/colino 30 | BugReports: https://github.com/stevenpawley/colino/issues 31 | Depends: 32 | R (>= 2.10), 33 | recipes 34 | Imports: 35 | generics, 36 | tibble, 37 | parsnip, 38 | tune, 39 | dials, 40 | purrr, 41 | rlang (>= 0.1.2), 42 | magrittr, 43 | dplyr, 44 | scales, 45 | pROC, 46 | stats 47 | RoxygenNote: 7.3.2 48 | Suggests: 49 | testthat, 50 | roxygen2, 51 | FSelectorRcpp, 52 | praznik, 53 | ranger, 54 | Boruta, 55 | care, 56 | modeldata, 57 | covr, 58 | bonsai, 59 | aorsf, 60 | xgboost 61 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_vip.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | data("iris") 6 | 7 | test_that("step_select_vip, execution using top_p", { 8 | skip_if_not_installed("ranger") 9 | 10 | irisX <- iris[-5] 11 | y <- iris$Species 12 | 13 | base_model <- rand_forest(mode = "classification") %>% 14 | set_engine("ranger", importance = "permutation") 15 | 16 | rec <- iris %>% 17 | recipe(Species ~.) %>% 18 | step_select_vip( 19 | all_predictors(), 20 | outcome = "Species", 21 | model = base_model, 22 | top_p = 2 23 | ) 24 | 25 | prepped <- prep(rec) 26 | selected <- juice(prepped) 27 | 28 | expect_length(names(selected), 3) 29 | }) 30 | 31 | 32 | test_that("step_select_vip, execution using threshold", { 33 | skip_if_not_installed("ranger") 34 | 35 | irisX <- iris[-5] 36 | y <- iris$Species 37 | 38 | base_model <- rand_forest(mode = "classification") %>% 39 | set_engine("ranger", importance = "permutation") 40 | 41 | # test selection by retaining features with scores >= 50th percentile 42 | rec <- iris %>% 43 | recipe(Species ~.) %>% 44 | step_select_vip( 45 | all_predictors(), 46 | outcome = "Species", 47 | model = base_model, 48 | threshold = 0.5 49 | ) 50 | 51 | prepped <- prep(rec) 52 | selected <- juice(prepped) 53 | 54 | expect_length(names(selected), 3) 55 | 56 | # test selection by retaining features with scores in 90th percentile 57 | rec <- iris %>% 58 | recipe(Species ~.) %>% 59 | step_select_vip( 60 | all_predictors(), 61 | outcome = "Species", 62 | model = base_model, 63 | threshold = 0.9 64 | ) 65 | 66 | prepped <- prep(rec) 67 | selected <- juice(prepped) 68 | 69 | expect_length(names(selected), 2) 70 | }) 71 | 72 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | covr::to_cobertura(cov) 38 | shell: Rscript {0} 39 | 40 | - uses: codecov/codecov-action@v4 41 | with: 42 | # Fail if error if not on PR, or if on PR and token is given 43 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 44 | file: ./cobertura.xml 45 | plugin: noop 46 | disable_search: true 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | - name: Show testthat output 50 | if: always() 51 | run: | 52 | ## -------------------------------------------------------------------- 53 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 54 | shell: bash 55 | 56 | - name: Upload test results 57 | if: failure() 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: coverage-test-failures 61 | path: ${{ runner.temp }}/package 62 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_infgain.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(modeldata) 5 | 6 | data("iris") 7 | 8 | test_that("step_select_infgain, classification", { 9 | skip_if_not_installed("FSelectorRcpp") 10 | 11 | irisX <- iris[-5] 12 | y <- iris$Species 13 | 14 | ig_scores <- as_tibble(FSelectorRcpp::information_gain(x = irisX, y = y)) 15 | ig_scores <- ig_scores[order(ig_scores$importance), ] 16 | ig_scores$importance <- rlang::set_names(ig_scores$importance, ig_scores$attributes) 17 | ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ] 18 | 19 | rec <- recipe(Species ~ ., data = iris) 20 | 21 | ig_rec <- rec %>% 22 | step_select_infgain( 23 | all_predictors(), outcome = "Species", type = "infogain", top_p = 2) %>% 24 | prep() 25 | 26 | ig_pred <- juice(ig_rec) 27 | expect_true(all(names(ig_pred)[1:2] %in% ig_scores$attributes[1:2])) 28 | }) 29 | 30 | 31 | test_that("step_select_infgain, regression", { 32 | skip_if_not_installed("FSelectorRcpp") 33 | data("biomass", package = "modeldata") 34 | 35 | X <- as.data.frame(biomass[, -c(1:2, 8)]) 36 | y <- biomass$HHV 37 | 38 | ig_scores <- 39 | as_tibble(FSelectorRcpp::information_gain(x = X, y = y, equal = TRUE)) 40 | ig_scores <- ig_scores[order(ig_scores$importance), ] 41 | ig_scores$importance <- rlang::set_names(ig_scores$importance, ig_scores$attributes) 42 | ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ] 43 | 44 | ig_rec <- 45 | recipe(HHV ~ ., data = biomass[, -(1:2)]) %>% 46 | step_select_infgain( 47 | all_predictors(), 48 | outcome = "HHV", 49 | type = "infogain", 50 | top_p = 2) %>% 51 | prep() 52 | 53 | ig_pred <- bake(ig_rec, new_data = NULL) 54 | expect_equal(names(ig_pred)[1:2], ig_scores$attributes[1:2]) 55 | 56 | tidyed_scores <- tidy(ig_rec, number = 1, type = "scores") 57 | tidyed_scores <- tidyed_scores[, -3] 58 | expect_equal(tidyed_scores$variable, ig_scores$attributes) 59 | expect_equal(tidyed_scores$score, ig_scores$importance) 60 | }) 61 | -------------------------------------------------------------------------------- /man/required_pkgs.colino.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_aov.R, R/step_select_boruta.R, 3 | % R/step_select_carscore.R, R/step_select_fcbf.R, R/step_select_forests.R, 4 | % R/step_select_infgain.R, R/step_select_linear.R, R/step_select_mrmr.R, 5 | % R/step_select_relief.R, R/step_select_roc.R, R/step_select_tree.R, 6 | % R/step_select_vip.R, R/step_select_xtab.R 7 | \name{required_pkgs.step_select_aov} 8 | \alias{required_pkgs.step_select_aov} 9 | \alias{required_pkgs.step_select_boruta} 10 | \alias{required_pkgs.step_select_carscore} 11 | \alias{required_pkgs.step_select_fcbf} 12 | \alias{required_pkgs.step_select_forests} 13 | \alias{required_pkgs.step_select_infgain} 14 | \alias{required_pkgs.step_select_linear} 15 | \alias{required_pkgs.step_select_mrmr} 16 | \alias{required_pkgs.step_select_relief} 17 | \alias{required_pkgs.step_select_roc} 18 | \alias{required_pkgs.step_select_tree} 19 | \alias{required_pkgs.step_select_vip} 20 | \alias{required_pkgs.step_select_xtab} 21 | \title{S3 methods for tracking which additional packages are needed for steps.} 22 | \usage{ 23 | \method{required_pkgs}{step_select_aov}(x, ...) 24 | 25 | \method{required_pkgs}{step_select_boruta}(x, ...) 26 | 27 | \method{required_pkgs}{step_select_carscore}(x, ...) 28 | 29 | \method{required_pkgs}{step_select_fcbf}(x, ...) 30 | 31 | \method{required_pkgs}{step_select_forests}(x, ...) 32 | 33 | \method{required_pkgs}{step_select_infgain}(x, ...) 34 | 35 | \method{required_pkgs}{step_select_linear}(x, ...) 36 | 37 | \method{required_pkgs}{step_select_mrmr}(x, ...) 38 | 39 | \method{required_pkgs}{step_select_relief}(x, ...) 40 | 41 | \method{required_pkgs}{step_select_roc}(x, ...) 42 | 43 | \method{required_pkgs}{step_select_tree}(x, ...) 44 | 45 | \method{required_pkgs}{step_select_vip}(x, ...) 46 | 47 | \method{required_pkgs}{step_select_xtab}(x, ...) 48 | } 49 | \arguments{ 50 | \item{x}{A recipe step} 51 | } 52 | \value{ 53 | A character vector 54 | } 55 | \description{ 56 | Recipe-adjacent packages always list themselves as a required package so that 57 | the steps can function properly within parallel processing schemes. 58 | } 59 | \keyword{internal} 60 | -------------------------------------------------------------------------------- /man/pull_importances.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pull_importances.R 3 | \name{pull_importances} 4 | \alias{pull_importances} 5 | \title{Pull feature importances from a parsnip fitted model} 6 | \usage{ 7 | pull_importances(object, scaled = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{A `model_fit` object.} 11 | 12 | \item{scaled}{A logical indicating whether to rescale the importances between 13 | 0 and 1. Default is TRUE.} 14 | 15 | \item{...}{A list of other parameters passed to the feature importance 16 | method.} 17 | } 18 | \value{ 19 | tibble 20 | } 21 | \description{ 22 | `pull_importances` is a generic function to extract feature importance scores 23 | or coefficients from a parsnip `model_fit` object and return them as a tibble 24 | with a 'feature' and 'importance' column. This is designed to support the 25 | `step_importance` recipe step. 26 | } 27 | \details{ 28 | Most of the basic models within the parsnip package that support feature 29 | importances are implemented (call `methods(pull_importances)` to list models 30 | that are currently implemented). If need to pull the feature importance 31 | scores from a model that is not currently supported in this package, then you 32 | can add a class to the pull_importances generic function which returns a 33 | two-column tibble: 34 | } 35 | \examples{ 36 | library(parsnip) 37 | 38 | # pull feature importances from a model_fit object 39 | model <- boost_tree(mode = "classification") \%>\% 40 | set_engine("xgboost") 41 | model_fit <- model \%>\% fit(Species ~., iris) 42 | pull_importances(model_fit) 43 | 44 | # create a new pull_importances method 45 | pull_importances._ranger <- function(object, scaled = FALSE, ...) { 46 | # create a call to the ranger::importance function avoiding having to use 47 | # ranger as a dependency 48 | call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit) 49 | scores <- rlang::eval_tidy(call) 50 | 51 | # create a tibble with 'feature' and 'importance' columns 52 | scores <- tibble::tibble( 53 | feature = names(scores), 54 | importance = as.numeric(scores) 55 | ) 56 | # optionally rescale the importance scores 57 | if (isTRUE(scaled)) 58 | scores$importance <- rescale(scores$importance) 59 | 60 | scores 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_forests.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(recipes) 3 | library(tibble) 4 | library(parsnip) 5 | 6 | data("iris") 7 | 8 | test_that("step_select_forests, execution using top_p", { 9 | skip_if_not_installed("ranger") 10 | 11 | rec <- iris %>% 12 | recipe(Species ~.) %>% 13 | step_select_forests( 14 | all_predictors(), 15 | outcome = "Species", 16 | engine = "ranger", 17 | top_p = 2 18 | ) 19 | 20 | prepped <- prep(rec) 21 | tidy(rec, number = 1) 22 | selected <- juice(prepped) 23 | 24 | expect_length(names(selected), 3) 25 | }) 26 | 27 | 28 | test_that("step_select_forests, execution using threshold", { 29 | skip_if_not_installed("ranger") 30 | 31 | irisX <- iris[-5] 32 | y <- iris$Species 33 | 34 | # test selection by retaining features with scores >= 50th percentile 35 | rec <- iris %>% 36 | recipe(Species ~.) %>% 37 | step_select_forests( 38 | all_predictors(), 39 | outcome = "Species", 40 | threshold = 0.5 41 | ) 42 | 43 | prepped <- prep(rec) 44 | selected <- juice(prepped) 45 | 46 | expect_length(names(selected), 3) 47 | 48 | # test selection by retaining features with scores in 90th percentile 49 | rec <- iris %>% 50 | recipe(Species ~.) %>% 51 | step_select_forests( 52 | all_predictors(), 53 | outcome = "Species", 54 | threshold = 0.9 55 | ) 56 | 57 | prepped <- prep(rec) 58 | selected <- juice(prepped) 59 | 60 | expect_length(names(selected), 2) 61 | }) 62 | 63 | test_that( 64 | desc = "step_select_forests, execution using aorsf", 65 | code = { 66 | 67 | skip_if_not_installed('aorsf') 68 | skip_if_not_installed('bonsai') 69 | 70 | library(bonsai) 71 | 72 | irisX <- iris[-5] 73 | y <- iris$Species 74 | 75 | # test selection by retaining features with scores >= 50th percentile 76 | rec <- iris %>% 77 | recipe(Species ~.) %>% 78 | step_select_forests( 79 | all_predictors(), 80 | outcome = "Species", 81 | threshold = 0.5, 82 | engine = 'aorsf' 83 | ) 84 | 85 | prepped <- prep(rec) 86 | selected <- juice(prepped) 87 | 88 | expect_length(names(selected), 3) 89 | 90 | # test selection by retaining features with scores in 90th percentile 91 | rec <- iris %>% 92 | recipe(Species ~.) %>% 93 | step_select_forests( 94 | all_predictors(), 95 | outcome = "Species", 96 | threshold = 0.9 97 | ) 98 | 99 | prepped <- prep(rec) 100 | selected <- juice(prepped) 101 | 102 | expect_length(names(selected), 2) 103 | 104 | } 105 | ) 106 | 107 | 108 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://stevenpawley.github.io/colino/404.html 5 | 6 | 7 | https://stevenpawley.github.io/colino/LICENSE-text.html 8 | 9 | 10 | https://stevenpawley.github.io/colino/LICENSE.html 11 | 12 | 13 | https://stevenpawley.github.io/colino/authors.html 14 | 15 | 16 | https://stevenpawley.github.io/colino/index.html 17 | 18 | 19 | https://stevenpawley.github.io/colino/reference/colino.html 20 | 21 | 22 | https://stevenpawley.github.io/colino/reference/cutoff.html 23 | 24 | 25 | https://stevenpawley.github.io/colino/reference/dual_filter.html 26 | 27 | 28 | https://stevenpawley.github.io/colino/reference/entropy.html 29 | 30 | 31 | https://stevenpawley.github.io/colino/reference/index.html 32 | 33 | 34 | https://stevenpawley.github.io/colino/reference/pipe.html 35 | 36 | 37 | https://stevenpawley.github.io/colino/reference/pull_importances.html 38 | 39 | 40 | https://stevenpawley.github.io/colino/reference/required_pkgs.embed.html 41 | 42 | 43 | https://stevenpawley.github.io/colino/reference/step_select_aov.html 44 | 45 | 46 | https://stevenpawley.github.io/colino/reference/step_select_boruta.html 47 | 48 | 49 | https://stevenpawley.github.io/colino/reference/step_select_carscore.html 50 | 51 | 52 | https://stevenpawley.github.io/colino/reference/step_select_fcbf.html 53 | 54 | 55 | https://stevenpawley.github.io/colino/reference/step_select_forests.html 56 | 57 | 58 | https://stevenpawley.github.io/colino/reference/step_select_infgain.html 59 | 60 | 61 | https://stevenpawley.github.io/colino/reference/step_select_linear.html 62 | 63 | 64 | https://stevenpawley.github.io/colino/reference/step_select_mrmr.html 65 | 66 | 67 | https://stevenpawley.github.io/colino/reference/step_select_relief.html 68 | 69 | 70 | https://stevenpawley.github.io/colino/reference/step_select_roc.html 71 | 72 | 73 | https://stevenpawley.github.io/colino/reference/step_select_tree.html 74 | 75 | 76 | https://stevenpawley.github.io/colino/reference/step_select_vip.html 77 | 78 | 79 | https://stevenpawley.github.io/colino/reference/step_select_xtab.html 80 | 81 | 82 | https://stevenpawley.github.io/colino/reference/top_p.html 83 | 84 | 85 | -------------------------------------------------------------------------------- /R/parameters.R: -------------------------------------------------------------------------------- 1 | #' Parameter functions for feature selection recipes 2 | #' 3 | #' Feature selection recipes allow the top-performing features to be selected 4 | #' using three parameters. `top_p` is for specifying the number of the 5 | #' top-performing features. 6 | #' 7 | #' @param range A two-element vector holding the _defaults_ for the smallest and 8 | #' largest possible values, respectively. 9 | #' @param trans A `trans` object from the `scales` package, such as 10 | #' `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 11 | #' the default is used which matches the units used in `range`. If no 12 | #' transformation, `NULL`. 13 | #' 14 | #' @return A function with classes "quant_param" and "param" 15 | #' @export 16 | #' 17 | #' @examples 18 | #' top_p(c(3, 10)) 19 | top_p <- function(range = c(1L, 4L), trans = NULL) { 20 | dials::new_quant_param( 21 | type = "integer", 22 | range = range, 23 | inclusive = c(TRUE, TRUE), 24 | trans = trans, 25 | label = c(top_p = "# Selected Predictors"), 26 | finalize = dials::get_p 27 | ) 28 | } 29 | 30 | #' Parameter functions for feature selection recipes 31 | #' 32 | #' Feature selection recipes allow the top-performing features to be selected 33 | #' using three parameters. `cutoff` is for selecting features using the absolute 34 | #' value in the filter methods scores. 35 | #' 36 | #' @param range A two-element vector holding the _defaults_ for the smallest and 37 | #' largest possible values, respectively. 38 | #' @param trans A `trans` object from the `scales` package, such as 39 | #' `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided, 40 | #' the default is used which matches the units used in `range`. If no 41 | #' transformation, `NULL`. 42 | #' 43 | #' @return A function with classes "quant_param" and "param" 44 | #' @export 45 | #' 46 | #' @examples 47 | #' cutoff(c(3.5, 15)) 48 | cutoff <- function(range = c(dials::unknown(), dials::unknown()), trans = NULL) { 49 | dials::new_quant_param( 50 | type = "double", 51 | range = range, 52 | inclusive = c(FALSE, FALSE), 53 | trans = trans, 54 | label = c(cutoff = "Absolute cutoff threshold for the feature scores") 55 | ) 56 | } 57 | 58 | values_entropy <- c("infogain", "gainratio", "symuncert") 59 | 60 | 61 | #' Parameter functions for feature selection recipes 62 | #' 63 | #' Entropy-based feature selection methods can be applied using several methods 64 | #' to calculate the entropy formula. `entropy` is for specifying the type of 65 | #' entropy-based filter that is used. 66 | #' 67 | #' @param values A character string of possible values. See `values_entropy` for 68 | #' possible values. 69 | #' 70 | #' @return A function with classes "qual_param" and "param" 71 | #' @export 72 | #' 73 | #' @examples 74 | #' entropy('infogain') 75 | entropy <- function(values = values_entropy) { 76 | dials::new_qual_param( 77 | type = "character", 78 | values = values, 79 | label = c(entropy = "Method used for entropy-based feature selection"), 80 | finalize = NULL 81 | ) 82 | } 83 | -------------------------------------------------------------------------------- /man/step_select_aov.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_aov.R 3 | \name{step_select_aov} 4 | \alias{step_select_aov} 5 | \alias{tidy.step_select_aov} 6 | \title{Filter Categorical Predictors using the ANOVA F-Test} 7 | \usage{ 8 | step_select_aov( 9 | recipe, 10 | ..., 11 | outcome, 12 | role = "predictor", 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | exclude = NULL, 18 | skip = FALSE, 19 | id = recipes::rand_id("select_aov") 20 | ) 21 | 22 | \method{tidy}{step_select_aov}(x, ...) 23 | } 24 | \arguments{ 25 | \item{recipe}{A recipe object. The step will be added to the sequence of 26 | operations for this recipe.} 27 | 28 | \item{...}{One or more selector functions to choose which predictors are 29 | affected by the step. See [selections()] for more details. For the `tidy` 30 | method, these are not currently used.} 31 | 32 | \item{outcome}{A single character string that specifies a single numeric 33 | variable.} 34 | 35 | \item{role}{For model terms created by this step, what analysis role should 36 | they be assigned? By default, the function assumes that resulting distances 37 | will be used as predictors in a model.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{top_p}{An integer that will be used to select the `top_p` predictors 43 | with the smallest p-values. A value of `NA` implies that this criterion 44 | will be ignored.} 45 | 46 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 47 | of best scoring features to select. For example `threshold = 0.9` will 48 | retain only predictors with scores in the top 90th percentile and a smaller 49 | threshold will select more features. Note that `top_p` and `threshold` are 50 | mutually exclusive but either can be used in conjunction with `cutoff` to 51 | select the top-ranked features and those that are smaller than the cutoff 52 | value.} 53 | 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 55 | with _larger_ than the cutoff will be retained. A value of `NA` implies 56 | that this criterion will be ignored.} 57 | 58 | \item{exclude}{A character vector of predictor names that will be removed 59 | from the data. This will be set when `prep()` is used on the recipe and 60 | should not be set by the user.} 61 | 62 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 63 | bake.recipe()? While all operations are baked when prep.recipe() is run, 64 | some operations may not be able to be conducted on new data (e.g. 65 | processing the outcome variable(s)). Care should be taken when using skip = 66 | TRUE as it may affect the computations for subsequent operations.} 67 | 68 | \item{id}{A character string that is unique to this step to identify it.} 69 | 70 | \item{x}{A `step_select_aov` object.} 71 | } 72 | \value{ 73 | An updated version of `recipe` with the new step added to the 74 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 75 | `terms` column for which predictors were removed. 76 | } 77 | \description{ 78 | `step_select_aov` creates a *specification* of a recipe step that will filter 79 | predictors using their relationship with a numerical outcome as measured 80 | using an ANOVA F-test. 81 | } 82 | \details{ 83 | The recipe will stop if both `top_p`, `threshold` or `cutoff` are left 84 | unspecified. If both are used, they are combined via 'or'. 85 | } 86 | \examples{ 87 | data(ames, package = "modeldata") 88 | 89 | rec <- 90 | recipe(Sale_Price ~ ., data = ames) \%>\% 91 | step_select_aov( 92 | all_nominal(), 93 | -all_outcomes(), 94 | outcome = "Sale_Price", 95 | top_p = 1, 96 | cutoff = -log10(0.01) 97 | ) \%>\% 98 | prep() 99 | 100 | rec \%>\% 101 | juice(all_nominal()) \%>\% 102 | names() 103 | 104 | tidy(rec, number = 1) 105 | } 106 | \concept{preprocessing} 107 | \concept{supervised_filter} 108 | -------------------------------------------------------------------------------- /man/step_select_boruta.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_boruta.R 3 | \name{step_select_boruta} 4 | \alias{step_select_boruta} 5 | \alias{tidy.step_select_boruta} 6 | \title{Feature selection step using Boruta} 7 | \usage{ 8 | step_select_boruta( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 | trained = FALSE, 14 | exclude = NULL, 15 | options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100), 16 | res = NULL, 17 | skip = FALSE, 18 | id = recipes::rand_id("select_boruta") 19 | ) 20 | 21 | \method{tidy}{step_select_boruta}(x, type = "terms", ...) 22 | } 23 | \arguments{ 24 | \item{recipe}{A recipe object. The step will be added to the sequence of 25 | operations for this recipe.} 26 | 27 | \item{...}{One or more selector functions to choose which predictors are 28 | affected by the step. See [selections()] for more details. For the `tidy` 29 | method, these are not currently used.} 30 | 31 | \item{outcome}{A character string with the name of the response variable to 32 | use to calculate the feature importance scores.} 33 | 34 | \item{role}{Not used by this step since no new variables are created.} 35 | 36 | \item{trained}{A logical to indicate if the quantities for preprocessing have 37 | been estimated.} 38 | 39 | \item{exclude}{A character vector of predictor names that will be removed 40 | from the data. This will be set when `prep()` is used on the recipe and 41 | should not be set by the user.} 42 | 43 | \item{options}{A list of options to pass to `Boruta::Boruta()`. The defaults 44 | use Boruta's defaults. *Note* that `x` and `y` should not be passed here.} 45 | 46 | \item{res}{The `Boruta::Boruta` object is stored here once this preprocessing 47 | step has been trained by `prep.recipe()`.} 48 | 49 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 50 | bake.recipe()? While all operations are baked when prep.recipe() is run, 51 | some operations may not be able to be conducted on new data (e.g. 52 | processing the outcome variable(s)). Care should be taken when using skip = 53 | TRUE as it may affect the computations for subsequent operations.} 54 | 55 | \item{id}{A character string that is unique to this step to identify it.} 56 | 57 | \item{x}{A `step_select_boruta` object.} 58 | 59 | \item{type}{A character with either 'terms' (the default) to return a 60 | tibble containing the variables that have been removed by the filter step, 61 | or 'scores' to return the scores for each variable.} 62 | } 63 | \value{ 64 | An updated version of `recipe` with the new step added to the 65 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 66 | `terms` column for which predictors were removed. 67 | } 68 | \description{ 69 | `step_select_boruta` creates a *specification* of a recipe step that selects 70 | a subset of predictors using the Boruta feature selection approach. 71 | } 72 | \details{ 73 | The Boruta algorithm technically is a wrapper approach that uses random 74 | forests to test whether the feature importance scores obtained on the 75 | original data are higher than best of the scores obtained when the variables 76 | are randomly permuted. These permuted features are termed 'shadow' features. 77 | If the scores for any original feature are higher than the best of the scores 78 | for the randomly permuted features, then this is marked as a 'hit'. Features 79 | are confirmed or rejected based on a confidence threshold (default is p = 80 | 0.01) applied to the tails of the binomial distribution with p = 0.5. 81 | Features that do not fall within the lower (reject) or upper (accept) tails 82 | of the distribution are labelled as 'tentative'. Rejected features are 83 | dropped from the feature set and the procedure is repeated until no more 84 | 'tentative' features exist, or that a maximum number of runs are reached. 85 | } 86 | \examples{ 87 | library(recipes) 88 | library(parsnip) 89 | 90 | # load the example iris dataset 91 | data(cells, package = "modeldata") 92 | 93 | # create a preprocessing recipe 94 | rec <- 95 | recipe(class ~ ., data = cells[, -1]) \%>\% 96 | step_select_boruta(all_predictors(), outcome = "class") 97 | 98 | prepped <- prep(rec) 99 | 100 | preproc_data <- juice(prepped) 101 | prepped 102 | } 103 | -------------------------------------------------------------------------------- /man/step_select_roc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_roc.R 3 | \name{step_select_roc} 4 | \alias{step_select_roc} 5 | \alias{tidy.step_select_roc} 6 | \title{Filter Numeric Predictors using ROC Curve} 7 | \usage{ 8 | step_select_roc( 9 | recipe, 10 | ..., 11 | outcome, 12 | role = "predictor", 13 | trained = FALSE, 14 | threshold = NA, 15 | top_p = NA, 16 | cutoff = NA, 17 | exclude = NULL, 18 | skip = FALSE, 19 | id = recipes::rand_id("select_roc") 20 | ) 21 | 22 | \method{tidy}{step_select_roc}(x, ...) 23 | } 24 | \arguments{ 25 | \item{recipe}{A recipe object. The step will be added to the sequence of 26 | operations for this recipe.} 27 | 28 | \item{...}{One or more selector functions to choose which predictors are 29 | affected by the step. See [selections()] for more details. For the `tidy` 30 | method, these are not currently used.} 31 | 32 | \item{outcome}{A single character string that specifies a single categorical 33 | variable to be used as the class.} 34 | 35 | \item{role}{For model terms created by this step, what analysis role should 36 | they be assigned?. By default, the function assumes that resulting distances 37 | will be used as predictors in a model.} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 43 | of best scoring features to select. For example `threshold = 0.9` will 44 | retain only predictors with scores in the top 90th percentile and a smaller 45 | threshold will select more features. Note that `top_p` and `threshold` are 46 | mutually exclusive but either can be used in conjunction with `cutoff` to 47 | select the top-ranked features and those that are smaller than the cutoff 48 | value.} 49 | 50 | \item{top_p}{An integer that will be used to select the `top_p` predictors 51 | with the smallest p-values. A value of `NA` implies that this criterion 52 | will be ignored.} 53 | 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 55 | with _larger_ than the cutoff will be retained. A value of `NA` implies 56 | that this criterion will be ignored.} 57 | 58 | \item{exclude}{A character vector of predictor names that will be removed 59 | from the data. This will be set when `prep()` is used on the recipe and 60 | should not be set by the user.} 61 | 62 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 63 | bake.recipe()? While all operations are baked when prep.recipe() is run, 64 | some operations may not be able to be conducted on new data (e.g. 65 | processing the outcome variable(s)). Care should be taken when using skip = 66 | TRUE as it may affect the computations for subsequent operations.} 67 | 68 | \item{id}{A character string that is unique to this step to identify it.} 69 | 70 | \item{x}{A `step_select_roc` object.} 71 | } 72 | \value{ 73 | An updated version of `recipe` with the new step added to the 74 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 75 | `terms` column for which predictors were removed. 76 | } 77 | \description{ 78 | `step_select_roc` creates a *specification* of a recipe step that will 79 | filter predictors using their relationship with the outcome as measured 80 | using a Receiver Operating Characteristic curve. 81 | } 82 | \details{ 83 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 84 | unspecified. 85 | 86 | The ROC AUC will be set to be 1 - AUC if the value is less than 0.50. 87 | } 88 | \examples{ 89 | data(cells, package = "modeldata") 90 | 91 | rec <- 92 | recipe(class ~ ., data = cells[, -1]) \%>\% 93 | step_select_roc(all_predictors(), outcome = "class", top_p = 10, cutoff = 0.9) \%>\% 94 | prep() 95 | 96 | rec \%>\% bake(all_predictors(), new_data = NULL) \%>\% names() 97 | 98 | # Use ROC values to select but always keep at least one: 99 | rec <- 100 | recipe(class ~ ., data = cells[, -1]) \%>\% 101 | step_select_roc( 102 | all_predictors(), 103 | outcome = "class", 104 | top_p = 1, 105 | cutoff = 0.99 106 | ) \%>\% 107 | prep() 108 | 109 | rec \%>\% juice(all_predictors()) \%>\% names() 110 | } 111 | \concept{preprocessing} 112 | \concept{supervised_filter} 113 | \keyword{datagen} 114 | -------------------------------------------------------------------------------- /man/step_select_xtab.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_xtab.R 3 | \name{step_select_xtab} 4 | \alias{step_select_xtab} 5 | \alias{tidy.step_select_xtab} 6 | \title{Filter Categorical Predictors using Contingency Tables} 7 | \usage{ 8 | step_select_xtab( 9 | recipe, 10 | ..., 11 | outcome, 12 | role = "predictor", 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | exact = FALSE, 18 | fdr = TRUE, 19 | exclude = NULL, 20 | skip = FALSE, 21 | id = recipes::rand_id("select_xtab") 22 | ) 23 | 24 | \method{tidy}{step_select_xtab}(x, ...) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which predictors are 31 | affected by the step. See [selections()] for more details. For the `tidy` 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A single character string that specifies a single categorical 35 | variable to be used as the class.} 36 | 37 | \item{role}{For model terms created by this step, what analysis role should 38 | they be assigned?. By default, the function assumes that resulting distances 39 | will be used as predictors in a model.} 40 | 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have 42 | been estimated.} 43 | 44 | \item{top_p}{An integer that will be used to select the `top_p` predictors 45 | with the smallest p-values. A value of `NA` implies that this criterion 46 | will be ignored.} 47 | 48 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 49 | of best scoring features to select. For example `threshold = 0.9` will 50 | retain only predictors with scores in the top 90th percentile and a smaller 51 | threshold will select more features. Note that `top_p` and `threshold` are 52 | mutually exclusive but either can be used in conjunction with `cutoff` to 53 | select the top-ranked features and those that are smaller than the cutoff 54 | value.} 55 | 56 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 57 | with _larger_ than the cutoff will be retained. A value of `NA` implies 58 | that this criterion will be ignored.} 59 | 60 | \item{exact}{Should an exact test be used?} 61 | 62 | \item{fdr}{Should false discovery rates (FDR) be used instead of p-values?} 63 | 64 | \item{exclude}{A character vector of predictor names that will be removed 65 | from the data. This will be set when `prep()` is used on the recipe and 66 | should not be set by the user.} 67 | 68 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 69 | bake.recipe()? While all operations are baked when prep.recipe() is run, 70 | some operations may not be able to be conducted on new data (e.g. 71 | processing the outcome variable(s)). Care should be taken when using skip = 72 | TRUE as it may affect the computations for subsequent operations.} 73 | 74 | \item{id}{A character string that is unique to this step to identify it.} 75 | 76 | \item{x}{A `step_select_xtab` object.} 77 | } 78 | \value{ 79 | An updated version of `recipe` with the new step added to the 80 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 81 | `terms` column for which predictors were removed. 82 | } 83 | \description{ 84 | `step_select_xtab` creates a *specification* of a recipe step that will 85 | filter predictors using their relationship with the outcome as measured 86 | using statistical tests for association. 87 | } 88 | \details{ 89 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 90 | unspecified. If both are used, they are combined via 'or'. 91 | 92 | The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]). 93 | 94 | Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed. 95 | } 96 | \examples{ 97 | data(attrition, package = "modeldata") 98 | 99 | rec <- 100 | recipe(Attrition ~ ., data = attrition) \%>\% 101 | step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition", 102 | top_p = 1, cutoff = 0.001, exact = TRUE) \%>\% 103 | prep() 104 | 105 | rec \%>\% juice(all_nominal(), -all_outcomes()) \%>\% names() 106 | 107 | tidy(rec, number = 1) 108 | } 109 | \concept{preprocessing} 110 | \concept{supervised_filter} 111 | \keyword{datagen} 112 | -------------------------------------------------------------------------------- /man/step_select_mrmr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_mrmr.R 3 | \name{step_select_mrmr} 4 | \alias{step_select_mrmr} 5 | \alias{tidy.step_select_mrmr} 6 | \title{Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)} 7 | \usage{ 8 | step_select_mrmr( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | threads = 0, 18 | exclude = NULL, 19 | scores = NULL, 20 | skip = FALSE, 21 | id = recipes::rand_id("select_mrmr") 22 | ) 23 | 24 | \method{tidy}{step_select_mrmr}(x, type = "terms", ...) 25 | } 26 | \arguments{ 27 | \item{recipe}{A recipe object. The step will be added to the sequence of 28 | operations for this recipe.} 29 | 30 | \item{...}{One or more selector functions to choose which predictors are 31 | affected by the step. See [selections()] for more details. For the `tidy` 32 | method, these are not currently used.} 33 | 34 | \item{outcome}{A character string specifying the name of response variable 35 | used to evaluate mRMR.} 36 | 37 | \item{role}{Not used by this step since no new variables are created} 38 | 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have 40 | been estimated.} 41 | 42 | \item{top_p}{An integer that will be used to select the `top_p` predictors 43 | with the smallest p-values. A value of `NA` implies that this criterion 44 | will be ignored.} 45 | 46 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 47 | of best scoring features to select. For example `threshold = 0.9` will 48 | retain only predictors with scores in the top 90th percentile and a smaller 49 | threshold will select more features. Note that `top_p` and `threshold` are 50 | mutually exclusive but either can be used in conjunction with `cutoff` to 51 | select the top-ranked features and those that are smaller than the cutoff 52 | value.} 53 | 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 55 | with _larger_ than the cutoff will be retained. A value of `NA` implies 56 | that this criterion will be ignored.} 57 | 58 | \item{threads}{An integer specifying the number of threads to use for 59 | processing. The default = 0 uses all available threads.} 60 | 61 | \item{exclude}{A character vector of predictor names that will be removed 62 | from the data. This will be set when `prep()` is used on the recipe and 63 | should not be set by the user.} 64 | 65 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 66 | names of the variables and their mRMR scores. This parameter is only 67 | produced after the recipe has been trained.} 68 | 69 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 70 | bake.recipe()? While all operations are baked when prep.recipe() is run, 71 | some operations may not be able to be conducted on new data (e.g. 72 | processing the outcome variable(s)). Care should be taken when using skip = 73 | TRUE as it may affect the computations for subsequent operations.} 74 | 75 | \item{id}{A character string that is unique to this step to identify it.} 76 | 77 | \item{x}{A `step_select_mrmr` object.} 78 | 79 | \item{type}{A character with either 'terms' (the default) to return a 80 | tibble containing the variables that have been removed by the filter step, 81 | or 'scores' to return the scores for each variable.} 82 | } 83 | \value{ 84 | An updated version of `recipe` with the new step added to the 85 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 86 | `terms` column for which predictors were removed. 87 | } 88 | \description{ 89 | `step_select_mrmr` creates a *specification* of a recipe step that will apply 90 | minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric 91 | data. The top `top_p` scoring features, or features whose scores occur in the 92 | top percentile `threshold` will be retained as new predictors. 93 | } 94 | \details{ 95 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 96 | unspecified. 97 | } 98 | \examples{ 99 | library(recipes) 100 | 101 | data(cells, package = "modeldata") 102 | 103 | rec <- 104 | recipe(class ~ ., data = cells[, -1]) \%>\% 105 | step_select_mrmr( 106 | all_predictors(), 107 | outcome = "class", 108 | top_p = 10 109 | ) 110 | 111 | prepped <- prep(rec) 112 | 113 | new_data <- bake(prepped, new_data = NULL) 114 | prepped 115 | } 116 | \concept{preprocessing} 117 | \concept{supervised_filter} 118 | \keyword{datagen} 119 | -------------------------------------------------------------------------------- /man/step_select_carscore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_carscore.R 3 | \name{step_select_carscore} 4 | \alias{step_select_carscore} 5 | \alias{tidy.step_select_carscore} 6 | \title{Feature selection step using the CAR score algorithm} 7 | \usage{ 8 | step_select_carscore( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | lambda = NA, 18 | diagonal = FALSE, 19 | exclude = NULL, 20 | scores = NULL, 21 | skip = FALSE, 22 | id = recipes::rand_id("select_carscore") 23 | ) 24 | 25 | \method{tidy}{step_select_carscore}(x, type = "terms", ...) 26 | } 27 | \arguments{ 28 | \item{recipe}{A recipe object. The step will be added to the sequence of 29 | operations for this recipe.} 30 | 31 | \item{...}{One or more selector functions to choose which predictors are 32 | affected by the step. See [selections()] for more details. For the `tidy` 33 | method, these are not currently used.} 34 | 35 | \item{outcome}{A character string with the name of the response variable. 36 | This must refer to a numeric feature for regression.} 37 | 38 | \item{role}{Not used by this step since no new variables are created.} 39 | 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have 41 | been estimated.} 42 | 43 | \item{top_p}{An integer that will be used to select the `top_p` predictors 44 | with the smallest p-values. A value of `NA` implies that this criterion 45 | will be ignored.} 46 | 47 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 48 | of best scoring features to select. For example `threshold = 0.9` will 49 | retain only predictors with scores in the top 90th percentile and a smaller 50 | threshold will select more features. Note that `top_p` and `threshold` are 51 | mutually exclusive but either can be used in conjunction with `cutoff` to 52 | select the top-ranked features and those that are smaller than the cutoff 53 | value.} 54 | 55 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 56 | with _larger_ than the cutoff will be retained. A value of `NA` implies 57 | that this criterion will be ignored.} 58 | 59 | \item{lambda}{The correlation shrinkage intensity (range 0-1).} 60 | 61 | \item{diagonal}{For diagonal = FALSE (the default) CAR scores are computed; 62 | otherwise with diagonal = TRUE marginal correlations.} 63 | 64 | \item{exclude}{A character vector of predictor names that will be removed 65 | from the data. This will be set when `prep()` is used on the recipe and 66 | should not be set by the user.} 67 | 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 69 | names of the variables and the absolute values of the calculated CAR 70 | scores. This parameter is only produced after the recipe has been trained.} 71 | 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 73 | bake.recipe()? While all operations are baked when prep.recipe() is run, 74 | some operations may not be able to be conducted on new data (e.g. 75 | processing the outcome variable(s)). Care should be taken when using skip = 76 | TRUE as it may affect the computations for subsequent operations.} 77 | 78 | \item{id}{A character string that is unique to this step to identify it.} 79 | 80 | \item{x}{A `step_select_carscore` object.} 81 | 82 | \item{type}{A character with either 'terms' (the default) to return a 83 | tibble containing the variables that have been removed by the filter step, 84 | or 'scores' to return the scores for each variable.} 85 | } 86 | \value{ 87 | An updated version of `recipe` with the new step added to the 88 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 89 | `terms` column for which predictors were removed. 90 | } 91 | \description{ 92 | `step_select_carscore` creates a *specification* of a recipe step that 93 | selects a subset of predictors as part of a regression model based on the 94 | scores of the CAR score algorithm. This step requires the `care` package to be 95 | installed. The top `top_p` scoring features, or features whose scores occur 96 | in the top percentile `threshold` will be retained as new predictors. 97 | } 98 | \details{ 99 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 100 | unspecified. 101 | } 102 | \examples{ 103 | library(recipes) 104 | 105 | data(car_prices, package = "modeldata") 106 | 107 | rec <- 108 | recipe(Price ~ ., data = car_prices) \%>\% 109 | step_select_carscore( 110 | all_predictors(), 111 | outcome = "Price", 112 | top_p = 5, 113 | cutoff = 0.7 114 | ) 115 | 116 | prepped <- prep(rec) 117 | 118 | new_data <- bake(prepped, new_data = NULL) 119 | prepped 120 | } 121 | \concept{preprocessing} 122 | \concept{supervised_filter} 123 | \keyword{datagen} 124 | -------------------------------------------------------------------------------- /man/step_select_fcbf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_fcbf.R 3 | \name{step_select_fcbf} 4 | \alias{step_select_fcbf} 5 | \title{Fast Correlation Based Filter for Feature Selection} 6 | \usage{ 7 | step_select_fcbf( 8 | recipe, 9 | ..., 10 | threshold = 0.025, 11 | outcome = NA, 12 | cutpoint = 0.5, 13 | features_retained = NA, 14 | removals = NULL, 15 | role = NA, 16 | trained = FALSE, 17 | skip = FALSE, 18 | id = rand_id("select_fcbf") 19 | ) 20 | } 21 | \arguments{ 22 | \item{recipe}{A recipe object. The step will be added to the sequence of 23 | operations for this recipe.} 24 | 25 | \item{...}{One or more selector functions to choose which predictors are 26 | affected by the step. See [selections()] for more details. For the `tidy` 27 | method, these are not currently used.} 28 | 29 | \item{threshold}{A numeric value between 0 and 1 representing the symmetrical 30 | uncertainty threshold used by the FCBF algorithm. Lower thresholds allow 31 | more features to be selected.} 32 | 33 | \item{outcome}{A character string specifying the name of the response 34 | variable. Automatically inferred from the recipe (if possible) when not 35 | specified by the user.} 36 | 37 | \item{cutpoint}{A numeric value between 0 and 1 representing the quantile at 38 | which to split numeric features into binary nominal features. e.g. 0.5 = 39 | median split. See details for more information on discretization} 40 | 41 | \item{features_retained}{A tibble containing the features that were retained 42 | by the FCBF algorithm. This parameter is only produced after the recipe has 43 | been trained and should not be specified by the user} 44 | 45 | \item{removals}{A tibble containing the features that were removed 46 | by the FCBF algorithm. This parameter is only produced after the recipe has 47 | been trained, and should not be specified by the user} 48 | 49 | \item{role}{Not used for this step since new variables are not created.} 50 | 51 | \item{trained}{A logical to indicate if the quantities for preprocessing have 52 | been estimated.} 53 | 54 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 55 | bake.recipe()? While all operations are baked when prep.recipe() is run, 56 | some operations may not be able to be conducted on new data (e.g. 57 | processing the outcome variable(s)). Care should be taken when using skip = 58 | TRUE as it may affect the computations for subsequent operations.} 59 | 60 | \item{id}{A character string that is unique to this step to identify it.} 61 | } 62 | \value{ 63 | An updated version of `recipe` with the new step added to the 64 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 65 | `terms` column for which predictors were removed. 66 | } 67 | \description{ 68 | `step_select_fcbf` creates a *specification* of a recipe step that selects a 69 | subset of predictors using the FCBF algorithm. The number of features 70 | retained depends on the `threshold` parameter: a lower threshold 71 | selects more features. 72 | } 73 | \details{ 74 | This function implements the fast correlation-based filter (FCBF) 75 | algorithm as described in Yu & Liu (2003). FCBF selects features that 76 | have high correlation to the outcome, and low correlation to other features. 77 | 78 | Symmetrical uncertainty (SU) is used to indicate the degree of correlation 79 | between predictors and the outcome. A threshold value for SU must be 80 | specified, and smaller threshold values will result in more features being 81 | selected by the algorithm. Appropriate thresholds are data-dependent, so 82 | different threshold values may need to be explored. It is not possible to 83 | specify an exact number of features that should be retained 84 | 85 | The algorithm requires categorical features, so continuous features are 86 | discretized using a binary split (split at the median by default). 87 | Discretization is only used within the feature selection algorithm, 88 | selected features are then retained in their original continuous form for 89 | further processing. 90 | 91 | The FCBF algorithm is implemented by the Bioconductor package 'FCBF', which 92 | can be installed with BiocManager::install("FCBF") 93 | } 94 | \examples{ 95 | \dontrun{ 96 | library(recipes) 97 | library(colino) 98 | 99 | # Load the example iris dataset 100 | data("iris") 101 | 102 | # Create a preprocessing recipe including FCBF 103 | my_recipe <- recipe(Species ~ ., data = iris) \%>\% 104 | step_select_fcbf(all_predictors(), threshold = 0.001) 105 | 106 | . prepped <- prep(my_recipe, iris) 107 | new_data <- bake(prepped, new_data = iris) 108 | prepped 109 | } 110 | } 111 | \references{ 112 | Yu, L. and Liu, H. (2003); Feature Selection for High-Dimensional 113 | Data A Fast Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. 114 | Learn. (ICML-2003), Washington DC, 2003. 115 | } 116 | -------------------------------------------------------------------------------- /man/step_select_linear.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_linear.R 3 | \name{step_select_linear} 4 | \alias{step_select_linear} 5 | \alias{tidy.step_select_linear} 6 | \title{Feature selection step using the magnitude of a linear models' coefficients} 7 | \usage{ 8 | step_select_linear( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 | trained = FALSE, 14 | engine = "glm", 15 | penalty = NULL, 16 | mixture = NULL, 17 | top_p = NA, 18 | threshold = NA, 19 | cutoff = NA, 20 | exclude = NULL, 21 | scores = NULL, 22 | skip = FALSE, 23 | id = recipes::rand_id("select_linear") 24 | ) 25 | 26 | \method{tidy}{step_select_linear}(x, type = "terms", ...) 27 | } 28 | \arguments{ 29 | \item{recipe}{A recipe object. The step will be added to the sequence of 30 | operations for this recipe.} 31 | 32 | \item{...}{One or more selector functions to choose which predictors are 33 | affected by the step. See [selections()] for more details. For the `tidy` 34 | method, these are not currently used.} 35 | 36 | \item{outcome}{A character string with the name of the response variable to 37 | use to calculate the feature importance scores.} 38 | 39 | \item{role}{Not used by this step since no new variables are created.} 40 | 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have 42 | been estimated.} 43 | 44 | \item{engine}{A supported rand_forest engine that is supported by parsnip. 45 | The default is "glm".} 46 | 47 | \item{penalty}{A non-negative number representing the total amount of 48 | regularization (specific engines only).} 49 | 50 | \item{mixture}{A number between zero and one (inclusive) that is the 51 | proportion of L1 regularization (i.e. lasso) in the model. When mixture = 52 | 1, it is a pure lasso model while mixture = 0 indicates that ridge 53 | regression is being used (specific engines only).} 54 | 55 | \item{top_p}{An integer that will be used to select the `top_p` predictors 56 | with the smallest p-values. A value of `NA` implies that this criterion 57 | will be ignored.} 58 | 59 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 60 | of best scoring features to select. For example `threshold = 0.9` will 61 | retain only predictors with scores in the top 90th percentile and a smaller 62 | threshold will select more features. Note that `top_p` and `threshold` are 63 | mutually exclusive but either can be used in conjunction with `cutoff` to 64 | select the top-ranked features and those that are smaller than the cutoff 65 | value.} 66 | 67 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 68 | with _larger_ than the cutoff will be retained. A value of `NA` implies 69 | that this criterion will be ignored.} 70 | 71 | \item{exclude}{A character vector of predictor names that will be removed 72 | from the data. This will be set when `prep()` is used on the recipe and 73 | should not be set by the user.} 74 | 75 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 76 | names of the variables and their feature importance scores. This parameter 77 | is only produced after the recipe has been trained.} 78 | 79 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 80 | bake.recipe()? While all operations are baked when prep.recipe() is run, 81 | some operations may not be able to be conducted on new data (e.g. 82 | processing the outcome variable(s)). Care should be taken when using skip = 83 | TRUE as it may affect the computations for subsequent operations.} 84 | 85 | \item{id}{A character string that is unique to this step to identify it.} 86 | 87 | \item{x}{A `step_select_linear` object.} 88 | 89 | \item{type}{A character with either 'terms' (the default) to return a 90 | tibble containing the variables that have been removed by the filter step, 91 | or 'scores' to return the scores for each variable.} 92 | } 93 | \value{ 94 | An updated version of `recipe` with the new step added to the 95 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 96 | `terms` column for which predictors were removed. 97 | } 98 | \description{ 99 | `step_select_linear` creates a *specification* of a recipe step that selects 100 | a subset of predictors based on the ranking of the magnitude of coefficients 101 | provided by a `parsnip::linear_reg` or `parsnip::logistic_reg` model. 102 | } 103 | \examples{ 104 | library(recipes) 105 | library(parsnip) 106 | 107 | # load the example iris dataset 108 | data(cells, package = "modeldata") 109 | 110 | # create a preprocessing recipe 111 | rec <- 112 | recipe(class ~ ., data = cells[, -1]) \%>\% 113 | step_select_linear( 114 | all_predictors(), 115 | outcome = "class", 116 | threshold = 0.9 117 | ) 118 | 119 | prepped <- prep(rec) 120 | 121 | preproc_data <- bake(prepped, new_data = NULL) 122 | prepped 123 | } 124 | -------------------------------------------------------------------------------- /man/step_select_vip.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_vip.R 3 | \name{step_select_vip} 4 | \alias{step_select_vip} 5 | \alias{tidy.step_select_vip} 6 | \title{Feature selection step using a model's feature importance scores or 7 | coefficients} 8 | \usage{ 9 | step_select_vip( 10 | recipe, 11 | ..., 12 | outcome = NULL, 13 | role = "predictor", 14 | trained = FALSE, 15 | model = NULL, 16 | top_p = NA, 17 | threshold = NA, 18 | cutoff = NA, 19 | exclude = NULL, 20 | scores = NULL, 21 | skip = FALSE, 22 | id = recipes::rand_id("select_vip") 23 | ) 24 | 25 | \method{tidy}{step_select_vip}(x, type = "terms", ...) 26 | } 27 | \arguments{ 28 | \item{recipe}{A recipe object. The step will be added to the sequence of 29 | operations for this recipe.} 30 | 31 | \item{...}{One or more selector functions to choose which predictors are 32 | affected by the step. See [selections()] for more details. For the `tidy` 33 | method, these are not currently used.} 34 | 35 | \item{outcome}{A character string with the name of the response variable to 36 | use to calculate the feature importance scores.} 37 | 38 | \item{role}{Not used by this step since no new variables are created.} 39 | 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have 41 | been estimated.} 42 | 43 | \item{model}{A `model_spec` object from `parsnip` that has a feature 44 | importances or coefficients method. The model needs to have an equivalent 45 | `pull_importances` method defined. See `?pull_importances` for how to 46 | define methods for models that are not currently supported.} 47 | 48 | \item{top_p}{An integer that will be used to select the `top_p` predictors 49 | with the smallest p-values. A value of `NA` implies that this criterion 50 | will be ignored.} 51 | 52 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 53 | of best scoring features to select. For example `threshold = 0.9` will 54 | retain only predictors with scores in the top 90th percentile and a smaller 55 | threshold will select more features. Note that `top_p` and `threshold` are 56 | mutually exclusive but either can be used in conjunction with `cutoff` to 57 | select the top-ranked features and those that are smaller than the cutoff 58 | value.} 59 | 60 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 61 | with _larger_ than the cutoff will be retained. A value of `NA` implies 62 | that this criterion will be ignored.} 63 | 64 | \item{exclude}{A character vector of predictor names that will be removed 65 | from the data. This will be set when `prep()` is used on the recipe and 66 | should not be set by the user.} 67 | 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 69 | names of the variables and their feature importance scores. This parameter 70 | is only produced after the recipe has been trained.} 71 | 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 73 | bake.recipe()? While all operations are baked when prep.recipe() is run, 74 | some operations may not be able to be conducted on new data (e.g. 75 | processing the outcome variable(s)). Care should be taken when using skip = 76 | TRUE as it may affect the computations for subsequent operations.} 77 | 78 | \item{id}{A character string that is unique to this step to identify it.} 79 | 80 | \item{x}{A `step_select_vip` object} 81 | 82 | \item{type}{A character with either 'terms' (the default) to return a 83 | tibble containing the variables that have been removed by the filter step, 84 | or 'scores' to return the scores for each variable.} 85 | } 86 | \value{ 87 | An updated version of `recipe` with the new step added to the 88 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 89 | `terms` column for which predictors were removed. 90 | } 91 | \description{ 92 | `step_select_vip` creates a *specification* of a recipe step that selects a 93 | subset of predictors based on the ranking of variable importance provided by 94 | a `parsnip` model specification and the `model` parameter 95 | } 96 | \details{ 97 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 98 | unspecified. 99 | } 100 | \examples{ 101 | library(recipes) 102 | library(parsnip) 103 | 104 | # load the example cells dataset 105 | data(cells, package = "modeldata") 106 | 107 | # define a base model to use for feature importances 108 | base_model <- rand_forest(mode = "classification") \%>\% 109 | set_engine("ranger", importance = "permutation") 110 | 111 | # create a preprocessing recipe 112 | rec <- 113 | recipe(class ~ ., data = cells[, -1]) \%>\% 114 | step_select_vip( 115 | all_predictors(), 116 | outcome = "class", 117 | model = base_model, 118 | top_p = 10 119 | ) 120 | 121 | prepped <- prep(rec) 122 | 123 | preproc_data <- juice(prepped) 124 | prepped 125 | } 126 | -------------------------------------------------------------------------------- /man/step_select_tree.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_tree.R 3 | \name{step_select_tree} 4 | \alias{step_select_tree} 5 | \alias{tidy.step_select_tree} 6 | \title{Feature selection step using a decision tree importance scores} 7 | \usage{ 8 | step_select_tree( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 | trained = FALSE, 14 | engine = "rpart", 15 | cost_complexity = NULL, 16 | tree_depth = NULL, 17 | min_n = NULL, 18 | top_p = NA, 19 | threshold = NA, 20 | cutoff = NA, 21 | exclude = NULL, 22 | scores = NULL, 23 | skip = FALSE, 24 | id = recipes::rand_id("select_tree") 25 | ) 26 | 27 | \method{tidy}{step_select_tree}(x, type = "terms", ...) 28 | } 29 | \arguments{ 30 | \item{recipe}{A recipe object. The step will be added to the sequence of 31 | operations for this recipe.} 32 | 33 | \item{...}{One or more selector functions to choose which predictors are 34 | affected by the step. See [selections()] for more details. For the `tidy` 35 | method, these are not currently used.} 36 | 37 | \item{outcome}{A character string with the name of the response variable to 38 | use to calculate the feature importance scores.} 39 | 40 | \item{role}{Not used by this step since no new variables are created.} 41 | 42 | \item{trained}{A logical to indicate if the quantities for preprocessing have 43 | been estimated.} 44 | 45 | \item{engine}{A supported rand_forest engine that is supported by parsnip. 46 | The default is "rpart".} 47 | 48 | \item{cost_complexity}{A positive number for the the cost/complexity 49 | parameter (a.k.a. Cp) used by CART models (specific engines only).} 50 | 51 | \item{tree_depth}{An integer for maximum depth of the tree.} 52 | 53 | \item{min_n}{An integer for the minimum number of data points in a node that 54 | are required for the node to be split further.} 55 | 56 | \item{top_p}{An integer that will be used to select the `top_p` predictors 57 | with the smallest p-values. A value of `NA` implies that this criterion 58 | will be ignored.} 59 | 60 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 61 | of best scoring features to select. For example `threshold = 0.9` will 62 | retain only predictors with scores in the top 90th percentile and a smaller 63 | threshold will select more features. Note that `top_p` and `threshold` are 64 | mutually exclusive but either can be used in conjunction with `cutoff` to 65 | select the top-ranked features and those that are smaller than the cutoff 66 | value.} 67 | 68 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 69 | with _larger_ than the cutoff will be retained. A value of `NA` implies 70 | that this criterion will be ignored.} 71 | 72 | \item{exclude}{A character vector of predictor names that will be removed 73 | from the data. This will be set when `prep()` is used on the recipe and 74 | should not be set by the user.} 75 | 76 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 77 | names of the variables and their feature importance scores. This parameter 78 | is only produced after the recipe has been trained.} 79 | 80 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 81 | bake.recipe()? While all operations are baked when prep.recipe() is run, 82 | some operations may not be able to be conducted on new data (e.g. 83 | processing the outcome variable(s)). Care should be taken when using skip = 84 | TRUE as it may affect the computations for subsequent operations.} 85 | 86 | \item{id}{A character string that is unique to this step to identify it.} 87 | 88 | \item{x}{A `step_select_tree` object.} 89 | 90 | \item{type}{A character with either 'terms' (the default) to return a 91 | tibble containing the variables that have been removed by the filter step, 92 | or 'scores' to return the scores for each variable.} 93 | } 94 | \value{ 95 | An updated version of `recipe` with the new step added to the 96 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 97 | `terms` column for which predictors were removed. 98 | } 99 | \description{ 100 | `step_select_tree` creates a *specification* of a recipe step that selects a 101 | subset of predictors based on the ranking of variable importance provided by 102 | a `parsnip::decision_tree` supported model. 103 | } 104 | \details{ 105 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 106 | unspecified. 107 | } 108 | \examples{ 109 | library(recipes) 110 | library(parsnip) 111 | 112 | # load the example cells dataset 113 | data(cells, package = "modeldata") 114 | 115 | # create a preprocessing recipe 116 | rec <- 117 | recipe(class ~ ., data = cells[, -1]) \%>\% 118 | step_select_tree(all_predictors(), outcome = "class", top_p = 10) 119 | 120 | prepped <- prep(rec) 121 | 122 | preproc_data <- bake(prepped, new_data = NULL) 123 | prepped 124 | } 125 | -------------------------------------------------------------------------------- /man/step_select_infgain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_infgain.R 3 | \name{step_select_infgain} 4 | \alias{step_select_infgain} 5 | \alias{tidy.step_select_infgain} 6 | \title{Information gain feature selection step} 7 | \usage{ 8 | step_select_infgain( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | type = "infogain", 18 | nbins = 5, 19 | threads = 1, 20 | exclude = NULL, 21 | scores = NULL, 22 | skip = FALSE, 23 | id = recipes::rand_id("select_infgain") 24 | ) 25 | 26 | \method{tidy}{step_select_infgain}(x, type = "terms", ...) 27 | } 28 | \arguments{ 29 | \item{recipe}{A recipe object. The step will be added to the sequence of 30 | operations for this recipe.} 31 | 32 | \item{...}{One or more selector functions to choose which predictors are 33 | affected by the step. See [selections()] for more details. For the `tidy` 34 | method, these are not currently used.} 35 | 36 | \item{outcome}{A character string with the name of the response variable to 37 | use to evaluate information gain value against the predictors.} 38 | 39 | \item{role}{Not used by this step since no new variables are created.} 40 | 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have 42 | been estimated.} 43 | 44 | \item{top_p}{An integer that will be used to select the `top_p` predictors 45 | with the smallest p-values. A value of `NA` implies that this criterion 46 | will be ignored.} 47 | 48 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 49 | of best scoring features to select. For example `threshold = 0.9` will 50 | retain only predictors with scores in the top 90th percentile and a smaller 51 | threshold will select more features. Note that `top_p` and `threshold` are 52 | mutually exclusive but either can be used in conjunction with `cutoff` to 53 | select the top-ranked features and those that are smaller than the cutoff 54 | value.} 55 | 56 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 57 | with _larger_ than the cutoff will be retained. A value of `NA` implies 58 | that this criterion will be ignored.} 59 | 60 | \item{type}{A character with either 'terms' (the default) to return a 61 | tibble containing the variables that have been removed by the filter step, 62 | or 'scores' to return the scores for each variable.} 63 | 64 | \item{nbins}{An integer specifying the number of bins for discretization. 65 | Only used if the outcome of a continuous variable for regression. The 66 | default is 'nbins = 5'.} 67 | 68 | \item{threads}{An integer specifying the number of threads to use for 69 | processing. The default = 0 uses all available threads.} 70 | 71 | \item{exclude}{A character vector of predictor names that will be removed 72 | from the data. This will be set when `prep()` is used on the recipe and 73 | should not be set by the user.} 74 | 75 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 76 | names of the variables and their information gain scores. This parameter is 77 | only produced after the recipe has been trained.} 78 | 79 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 80 | bake.recipe()? While all operations are baked when prep.recipe() is run, 81 | some operations may not be able to be conducted on new data (e.g. 82 | processing the outcome variable(s)). Care should be taken when using skip = 83 | TRUE as it may affect the computations for subsequent operations.} 84 | 85 | \item{id}{A character string that is unique to this step to identify it.} 86 | 87 | \item{x}{A `step_select_infgain` object.} 88 | } 89 | \value{ 90 | An updated version of `recipe` with the new step added to the 91 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 92 | `terms` column for which predictors were removed. 93 | } 94 | \description{ 95 | `step_select_infgain` creates a *specification* of a recipe step that selects a 96 | subset of predictors based on the scores of the information gain algorithm. 97 | This step requires the FSelectorRcpp package to be installed. The top 98 | `top_p` scoring features, or features whose scores occur in the top 99 | percentile `threshold` will be retained as new predictors. 100 | } 101 | \details{ 102 | The recipe will stop if both `top_p`, `threshold` and `cutoff` are left 103 | unspecified. 104 | } 105 | \examples{ 106 | library(recipes) 107 | 108 | data(cells, package = "modeldata") 109 | 110 | rec <- 111 | recipe(class ~ ., data = cells[, -1]) \%>\% 112 | step_select_infgain( 113 | all_predictors(), 114 | outcome = "class", 115 | threshold = 0.9, 116 | id = "infgain" 117 | ) 118 | 119 | prepped <- prep(rec) 120 | 121 | new_data <- juice(prepped) 122 | prepped 123 | } 124 | \concept{preprocessing} 125 | \concept{supervised_filter} 126 | \keyword{datagen} 127 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(bake,step_select_aov) 4 | S3method(bake,step_select_boruta) 5 | S3method(bake,step_select_carscore) 6 | S3method(bake,step_select_fcbf) 7 | S3method(bake,step_select_forests) 8 | S3method(bake,step_select_infgain) 9 | S3method(bake,step_select_linear) 10 | S3method(bake,step_select_mrmr) 11 | S3method(bake,step_select_relief) 12 | S3method(bake,step_select_roc) 13 | S3method(bake,step_select_tree) 14 | S3method(bake,step_select_vip) 15 | S3method(bake,step_select_xtab) 16 | S3method(prep,step_select_aov) 17 | S3method(prep,step_select_boruta) 18 | S3method(prep,step_select_carscore) 19 | S3method(prep,step_select_fcbf) 20 | S3method(prep,step_select_forests) 21 | S3method(prep,step_select_infgain) 22 | S3method(prep,step_select_linear) 23 | S3method(prep,step_select_mrmr) 24 | S3method(prep,step_select_relief) 25 | S3method(prep,step_select_roc) 26 | S3method(prep,step_select_tree) 27 | S3method(prep,step_select_vip) 28 | S3method(prep,step_select_xtab) 29 | S3method(print,step_select_aov) 30 | S3method(print,step_select_boruta) 31 | S3method(print,step_select_carscore) 32 | S3method(print,step_select_fcbf) 33 | S3method(print,step_select_forests) 34 | S3method(print,step_select_infgain) 35 | S3method(print,step_select_linear) 36 | S3method(print,step_select_mrmr) 37 | S3method(print,step_select_relief) 38 | S3method(print,step_select_roc) 39 | S3method(print,step_select_tree) 40 | S3method(print,step_select_vip) 41 | S3method(print,step_select_xtab) 42 | S3method(pull_importances,"_C5.0") 43 | S3method(pull_importances,"_H2OMultinomialModel") 44 | S3method(pull_importances,"_H2ORegressionModel") 45 | S3method(pull_importances,"_ObliqueForestClassification") 46 | S3method(pull_importances,"_ObliqueForestRegression") 47 | S3method(pull_importances,"_ObliqueForestSurvival") 48 | S3method(pull_importances,"_cubist") 49 | S3method(pull_importances,"_earth") 50 | S3method(pull_importances,"_elnet") 51 | S3method(pull_importances,"_glm") 52 | S3method(pull_importances,"_lm") 53 | S3method(pull_importances,"_lognet") 54 | S3method(pull_importances,"_randomForest") 55 | S3method(pull_importances,"_ranger") 56 | S3method(pull_importances,"_rpart") 57 | S3method(pull_importances,"_xgb.Booster") 58 | S3method(pull_importances,default) 59 | S3method(required_pkgs,step_select_aov) 60 | S3method(required_pkgs,step_select_boruta) 61 | S3method(required_pkgs,step_select_carscore) 62 | S3method(required_pkgs,step_select_fcbf) 63 | S3method(required_pkgs,step_select_forests) 64 | S3method(required_pkgs,step_select_infgain) 65 | S3method(required_pkgs,step_select_linear) 66 | S3method(required_pkgs,step_select_mrmr) 67 | S3method(required_pkgs,step_select_relief) 68 | S3method(required_pkgs,step_select_roc) 69 | S3method(required_pkgs,step_select_tree) 70 | S3method(required_pkgs,step_select_vip) 71 | S3method(required_pkgs,step_select_xtab) 72 | S3method(tidy,step_select_aov) 73 | S3method(tidy,step_select_boruta) 74 | S3method(tidy,step_select_carscore) 75 | S3method(tidy,step_select_forests) 76 | S3method(tidy,step_select_infgain) 77 | S3method(tidy,step_select_linear) 78 | S3method(tidy,step_select_mrmr) 79 | S3method(tidy,step_select_relief) 80 | S3method(tidy,step_select_roc) 81 | S3method(tidy,step_select_tree) 82 | S3method(tidy,step_select_vip) 83 | S3method(tidy,step_select_xtab) 84 | S3method(tunable,step_select_aov) 85 | S3method(tunable,step_select_carscore) 86 | S3method(tunable,step_select_forests) 87 | S3method(tunable,step_select_infgain) 88 | S3method(tunable,step_select_linear) 89 | S3method(tunable,step_select_mrmr) 90 | S3method(tunable,step_select_relief) 91 | S3method(tunable,step_select_roc) 92 | S3method(tunable,step_select_tree) 93 | S3method(tunable,step_select_vip) 94 | S3method(tunable,step_select_xtab) 95 | export("%>%") 96 | export(cutoff) 97 | export(entropy) 98 | export(pull_importances) 99 | export(step_select_aov) 100 | export(step_select_boruta) 101 | export(step_select_carscore) 102 | export(step_select_fcbf) 103 | export(step_select_forests) 104 | export(step_select_infgain) 105 | export(step_select_linear) 106 | export(step_select_mrmr) 107 | export(step_select_relief) 108 | export(step_select_roc) 109 | export(step_select_tree) 110 | export(step_select_vip) 111 | export(step_select_xtab) 112 | export(top_p) 113 | importFrom(dplyr,filter) 114 | importFrom(dplyr,pull) 115 | importFrom(generics,required_pkgs) 116 | importFrom(generics,tidy) 117 | importFrom(magrittr,"%>%") 118 | importFrom(recipes,add_step) 119 | importFrom(recipes,bake) 120 | importFrom(recipes,prep) 121 | importFrom(recipes,print_step) 122 | importFrom(recipes,rand_id) 123 | importFrom(recipes,recipes_eval_select) 124 | importFrom(recipes,recipes_pkg_check) 125 | importFrom(recipes,step) 126 | importFrom(rlang,.data) 127 | importFrom(rlang,enquos) 128 | importFrom(stats,aov) 129 | importFrom(stats,as.formula) 130 | importFrom(tibble,as_tibble) 131 | importFrom(tibble,tibble) 132 | importFrom(tune,tunable) 133 | -------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('nav.navbar').headroom(); 6 | 7 | Toc.init({ 8 | $nav: $("#toc"), 9 | $scope: $("main h2, main h3, main h4, main h5, main h6") 10 | }); 11 | 12 | if ($('#toc').length) { 13 | $('body').scrollspy({ 14 | target: '#toc', 15 | offset: $("nav.navbar").outerHeight() + 1 16 | }); 17 | } 18 | 19 | // Activate popovers 20 | $('[data-bs-toggle="popover"]').popover({ 21 | container: 'body', 22 | html: true, 23 | trigger: 'focus', 24 | placement: "top", 25 | sanitize: false, 26 | }); 27 | 28 | $('[data-bs-toggle="tooltip"]').tooltip(); 29 | 30 | /* Clipboard --------------------------*/ 31 | 32 | function changeTooltipMessage(element, msg) { 33 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 34 | element.setAttribute('data-original-title', msg); 35 | $(element).tooltip('show'); 36 | element.setAttribute('data-original-title', tooltipOriginalTitle); 37 | } 38 | 39 | if(ClipboardJS.isSupported()) { 40 | $(document).ready(function() { 41 | var copyButton = ""; 42 | 43 | $("div.sourceCode").addClass("hasCopyButton"); 44 | 45 | // Insert copy buttons: 46 | $(copyButton).prependTo(".hasCopyButton"); 47 | 48 | // Initialize tooltips: 49 | $('.btn-copy-ex').tooltip({container: 'body'}); 50 | 51 | // Initialize clipboard: 52 | var clipboard = new ClipboardJS('[data-clipboard-copy]', { 53 | text: function(trigger) { 54 | return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); 55 | } 56 | }); 57 | 58 | clipboard.on('success', function(e) { 59 | changeTooltipMessage(e.trigger, 'Copied!'); 60 | e.clearSelection(); 61 | }); 62 | 63 | clipboard.on('error', function() { 64 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 65 | }); 66 | 67 | }); 68 | } 69 | 70 | /* Search marking --------------------------*/ 71 | var url = new URL(window.location.href); 72 | var toMark = url.searchParams.get("q"); 73 | var mark = new Mark("main#main"); 74 | if (toMark) { 75 | mark.mark(toMark, { 76 | accuracy: { 77 | value: "complementary", 78 | limiters: [",", ".", ":", "/"], 79 | } 80 | }); 81 | } 82 | 83 | /* Search --------------------------*/ 84 | /* Adapted from https://github.com/rstudio/bookdown/blob/2d692ba4b61f1e466c92e78fd712b0ab08c11d31/inst/resources/bs4_book/bs4_book.js#L25 */ 85 | // Initialise search index on focus 86 | var fuse; 87 | $("#search-input").focus(async function(e) { 88 | if (fuse) { 89 | return; 90 | } 91 | 92 | $(e.target).addClass("loading"); 93 | var response = await fetch($("#search-input").data("search-index")); 94 | var data = await response.json(); 95 | 96 | var options = { 97 | keys: ["what", "text", "code"], 98 | ignoreLocation: true, 99 | threshold: 0.1, 100 | includeMatches: true, 101 | includeScore: true, 102 | }; 103 | fuse = new Fuse(data, options); 104 | 105 | $(e.target).removeClass("loading"); 106 | }); 107 | 108 | // Use algolia autocomplete 109 | var options = { 110 | autoselect: true, 111 | debug: true, 112 | hint: false, 113 | minLength: 2, 114 | }; 115 | var q; 116 | async function searchFuse(query, callback) { 117 | await fuse; 118 | 119 | var items; 120 | if (!fuse) { 121 | items = []; 122 | } else { 123 | q = query; 124 | var results = fuse.search(query, { limit: 20 }); 125 | items = results 126 | .filter((x) => x.score <= 0.75) 127 | .map((x) => x.item); 128 | if (items.length === 0) { 129 | items = [{dir:"Sorry 😿",previous_headings:"",title:"No results found.",what:"No results found.",path:window.location.href}]; 130 | } 131 | } 132 | callback(items); 133 | } 134 | $("#search-input").autocomplete(options, [ 135 | { 136 | name: "content", 137 | source: searchFuse, 138 | templates: { 139 | suggestion: (s) => { 140 | if (s.title == s.what) { 141 | return `${s.dir} >
${s.title}
`; 142 | } else if (s.previous_headings == "") { 143 | return `${s.dir} >
${s.title}
> ${s.what}`; 144 | } else { 145 | return `${s.dir} >
${s.title}
> ${s.previous_headings} > ${s.what}`; 146 | } 147 | }, 148 | }, 149 | }, 150 | ]).on('autocomplete:selected', function(event, s) { 151 | window.location.href = s.path + "?q=" + q + "#" + s.id; 152 | }); 153 | }); 154 | })(window.jQuery || window.$) 155 | 156 | 157 | -------------------------------------------------------------------------------- /man/step_select_forests.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_forests.R 3 | \name{step_select_forests} 4 | \alias{step_select_forests} 5 | \alias{tidy.step_select_forests} 6 | \title{Feature selection step using a random forest feature importance scores} 7 | \usage{ 8 | step_select_forests( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = "predictor", 13 | trained = FALSE, 14 | engine = "ranger", 15 | options = list(importance = "permutation"), 16 | mtry = NULL, 17 | trees = NULL, 18 | min_n = NULL, 19 | top_p = NA, 20 | threshold = NA, 21 | cutoff = NA, 22 | exclude = NULL, 23 | scores = NULL, 24 | skip = FALSE, 25 | id = recipes::rand_id("select_forests") 26 | ) 27 | 28 | \method{tidy}{step_select_forests}(x, type = "terms", ...) 29 | } 30 | \arguments{ 31 | \item{recipe}{A recipe object. The step will be added to the sequence of 32 | operations for this recipe.} 33 | 34 | \item{...}{One or more selector functions to choose which predictors are 35 | affected by the step. See [selections()] for more details. For the `tidy` 36 | method, these are not currently used.} 37 | 38 | \item{outcome}{A character string with the name of the response variable to 39 | use to calculate the feature importance scores.} 40 | 41 | \item{role}{Not used by this step since no new variables are created.} 42 | 43 | \item{trained}{A logical to indicate if the quantities for preprocessing have 44 | been estimated.} 45 | 46 | \item{engine}{A supported rand_forest engine that is supported by parsnip. 47 | The default is "ranger".} 48 | 49 | \item{options}{A named list of options to pass to the rand_forest engine. For 50 | example, if `engine = 'ranger'` (the default) then options could be 51 | `list(permutation = 'importance`) because a feature importance method needs 52 | to be specified for this engine. This is the default.} 53 | 54 | \item{mtry}{An integer for the number of predictors that will be randomly 55 | sampled at each split when creating the tree models.} 56 | 57 | \item{trees}{An integer for the number of trees contained in the ensemble.} 58 | 59 | \item{min_n}{An integer for the minimum number of data points in a node that 60 | are required for the node to be split further.} 61 | 62 | \item{top_p}{An integer that will be used to select the `top_p` predictors 63 | with the smallest p-values. A value of `NA` implies that this criterion 64 | will be ignored.} 65 | 66 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 67 | of best scoring features to select. For example `threshold = 0.9` will 68 | retain only predictors with scores in the top 90th percentile and a smaller 69 | threshold will select more features. Note that `top_p` and `threshold` are 70 | mutually exclusive but either can be used in conjunction with `cutoff` to 71 | select the top-ranked features and those that are smaller than the cutoff 72 | value.} 73 | 74 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 75 | with _larger_ than the cutoff will be retained. A value of `NA` implies 76 | that this criterion will be ignored.} 77 | 78 | \item{exclude}{A character vector of predictor names that will be removed 79 | from the data. This will be set when `prep()` is used on the recipe and 80 | should not be set by the user.} 81 | 82 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 83 | names of the variables and their feature importance scores. This parameter 84 | is only produced after the recipe has been trained.} 85 | 86 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 87 | bake.recipe()? While all operations are baked when prep.recipe() is run, 88 | some operations may not be able to be conducted on new data (e.g. 89 | processing the outcome variable(s)). Care should be taken when using skip = 90 | TRUE as it may affect the computations for subsequent operations.} 91 | 92 | \item{id}{A character string that is unique to this step to identify it.} 93 | 94 | \item{x}{A `step_select_forests` object.} 95 | 96 | \item{type}{A character with either 'terms' (the default) to return a 97 | tibble containing the variables that have been removed by the filter step, 98 | or 'scores' to return the scores for each variable.} 99 | } 100 | \value{ 101 | An updated version of `recipe` with the new step added to the 102 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 103 | `terms` column for which predictors were removed. 104 | } 105 | \description{ 106 | `step_select_forests` creates a *specification* of a recipe step that selects 107 | a subset of predictors based on the ranking of variable importance using a 108 | `parsnip::rand_forest` supported model. 109 | } 110 | \examples{ 111 | library(recipes) 112 | library(parsnip) 113 | 114 | # load the example iris dataset 115 | data(cells, package = "modeldata") 116 | 117 | # create a preprocessing recipe 118 | rec <- 119 | recipe(class ~ ., data = cells[, -1]) \%>\% 120 | step_select_forests(all_predictors(), outcome = "class", top_p = 10, 121 | cutoff = 0.9) 122 | 123 | prepped <- prep(rec) 124 | 125 | preproc_data <- juice(prepped) 126 | prepped 127 | } 128 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | License • colino 6 | Skip to contents 7 | 8 | 9 |
38 |
39 |
43 | 44 |
YEAR: 2019
45 | COPYRIGHT HOLDER: Steven Pawley
46 | 
47 | 48 |
49 | 50 | 51 |
54 | 55 | 58 | 59 |
60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | check_zero_one <- function(x) { 2 | if (is.na(x)) { 3 | return(x) 4 | } else { 5 | if (is.numeric(x)) { 6 | if (x >= 1 | x <= 0) { 7 | rlang::abort("`threshold` should be on (0, 1).") 8 | } 9 | } else { 10 | rlang::abort("`threshold` should be numeric.") 11 | } 12 | } 13 | return(x) 14 | } 15 | 16 | check_top_p <- function(x, n) { 17 | # checks on x (top_p) and n (number of features) 18 | if (is.na(x)) { 19 | return(x) 20 | } 21 | 22 | if (!is.numeric(x)) { 23 | rlang::abort("`top_p` should be numeric.") 24 | } 25 | 26 | if (!is.integer(x)) { 27 | x <- as.integer(x) 28 | } 29 | 30 | msg <- paste0("`top_p` should be on (1, ", n, ") based on the number of features available.") 31 | 32 | # return top_n = all features if top_n > n 33 | if (x >= n) { 34 | rlang::warn(msg) 35 | x <- min(n - 1, x) 36 | 37 | # return a single feature if top_p < 1 38 | } else if (x < 1) { 39 | rlang::warn(msg) 40 | x <- 1 41 | } 42 | 43 | return(x) 44 | } 45 | 46 | check_criteria <- function(top_p, threshold, cl) { 47 | if (is.na(top_p) & is.na(threshold)) { 48 | msg <- paste0( 49 | "For `", 50 | cl[[1]], 51 | "`, `top_p` and `threshold` cannot both be missing." 52 | ) 53 | rlang::abort(msg) 54 | } 55 | invisible(NULL) 56 | } 57 | 58 | #' Select features using `top_p` or `threshold`. 59 | #' 60 | #' Feature selection using either the `top_p` or `threshold` features OR 61 | #' `cutoff` where cutoff refers to the absolute numeric value of the feature 62 | #' importance scores. 63 | #' 64 | #' @details 65 | #' `dual_filter` selects feature that are selected using either (`top_p`, 66 | #' `threshold`) or `cutoff` or both. If top_p/threshold and cutoff are both used 67 | #' then features are selected using OR. For example, if top_p selects features 1 68 | #' & 2, and threshold selects features 1 & 3, then the selected features = 69 | #' 1,2,3. 70 | #' 71 | #' @param x a named numeric vector of scores per feature 72 | #' @param top_p an integer specifying the number of top-performing features to 73 | #' retain 74 | #' @param threshold a numeric with percentile of top-performing features to 75 | #' retain. For example, `threshold = 0.9` will only retain features that are 76 | #' in the top 90th percentile. A smaller value of threshold will select 77 | #' more features. 78 | #' @param cutoff a numeric with the value that represents the cutoff in the 79 | #' scores in `x` by which to retain/discard features. 80 | #' @param maximize logical to indicate whether `top_p`, `threshold` and `cutoff` 81 | #' are used to keep features where high scores = 'best' (maximize = TRUE) or 82 | #' where low scores = 'best' (maximize = FALSE). 83 | #' 84 | #' @return character vector of feature names to exclude 85 | #' @keywords internal 86 | dual_filter <- function(x, top_p, threshold, cutoff, maximize) { 87 | if (!is.na(top_p) & !is.na(threshold)) { 88 | rlang::abort("`top_p` and `threshold` are mutually exclusive") 89 | } 90 | 91 | na_x <- x[is.na(x)] 92 | x <- x[!is.na(x)] 93 | x <- sort(x, decreasing = maximize) 94 | 95 | p <- length(x) 96 | 97 | # assign logical selection variable using top_p 98 | if (!is.na(top_p)) { 99 | top_p_lgl <- seq_along(x) <= top_p 100 | } else { 101 | top_p_lgl <- rep(FALSE, p) 102 | } 103 | 104 | # assign logical selection variable using threshold 105 | if (!is.na(threshold)) { 106 | p_to_exceed <- stats::quantile(x, threshold) 107 | 108 | if (maximize) { 109 | threshold_lgl <- x >= p_to_exceed 110 | } else { 111 | threshold_lgl <- x < p_to_exceed 112 | } 113 | 114 | } else { 115 | threshold_lgl <- rep(FALSE, p) 116 | } 117 | 118 | # assign logical selection variable using cutoff 119 | if (!is.na(cutoff)) { 120 | if (maximize) { 121 | cutoff_lgl <- x >= cutoff 122 | } else { 123 | cutoff_lgl <- x <= cutoff 124 | } 125 | 126 | } else { 127 | cutoff_lgl <- rep(FALSE, p) 128 | } 129 | 130 | keep_lgl <- top_p_lgl | threshold_lgl | cutoff_lgl 131 | excluded <- c(names(x)[!keep_lgl], names(na_x)) 132 | 133 | return(excluded) 134 | } 135 | 136 | check_outcome <- function(y) { 137 | ifelse(inherits(y, "factor"), "classification", "regression") 138 | } 139 | 140 | get_outcome <- function(x, training, info) { 141 | if (!all(is.na(x$outcome))) { 142 | if (!all(is.character(x$outcome))) { 143 | rlang::abort("Outcome variable must be supplied as a character string") 144 | } 145 | 146 | outcome_col <- x$outcome 147 | 148 | } else { 149 | outcome_col <- info %>% 150 | dplyr::filter(.data$role == 'outcome') %>% 151 | dplyr::pull("variable") 152 | } 153 | 154 | if (length(outcome_col) > 1) { 155 | msg <- paste( 156 | "Multiple outcome variables are present in the recipe.", 157 | "Only a single outcome variable can be accepted by any `step_select` functions.", 158 | "Please supply the outcome variable using the `outcome` argument" 159 | ) 160 | rlang::abort(msg) 161 | } 162 | 163 | if (length(outcome_col) < 1) { 164 | msg <- paste( 165 | "An outcome variable was not found.", 166 | "Please ensure an outcome variable is specified." 167 | ) 168 | rlang::abort(msg) 169 | } 170 | 171 | if (!outcome_col %in% names(training)) { 172 | rlang::abort(paste0("Outcome variable '", outcome_col, "' not found")) 173 | } 174 | 175 | return(outcome_col) 176 | } 177 | -------------------------------------------------------------------------------- /man/step_select_relief.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_select_relief.R 3 | \name{step_select_relief} 4 | \alias{step_select_relief} 5 | \alias{tidy.step_select_relief} 6 | \title{Feature selection step using the Relief algorithm} 7 | \usage{ 8 | step_select_relief( 9 | recipe, 10 | ..., 11 | outcome = NULL, 12 | role = NA, 13 | trained = FALSE, 14 | top_p = NA, 15 | threshold = NA, 16 | cutoff = NA, 17 | neighbors = 5, 18 | sample_size = 10, 19 | exclude = NULL, 20 | scores = NULL, 21 | skip = FALSE, 22 | id = recipes::rand_id("select_relief") 23 | ) 24 | 25 | \method{tidy}{step_select_relief}(x, ...) 26 | } 27 | \arguments{ 28 | \item{recipe}{A recipe object. The step will be added to the sequence of 29 | operations for this recipe.} 30 | 31 | \item{...}{One or more selector functions to choose which predictors are 32 | affected by the step. See [selections()] for more details. For the `tidy` 33 | method, these are not currently used.} 34 | 35 | \item{outcome}{A character string with the name of the response variable to 36 | use to evaluate information gain value against the predictors.} 37 | 38 | \item{role}{Not used by this step since no new variables are created.} 39 | 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have 41 | been estimated.} 42 | 43 | \item{top_p}{An integer that will be used to select the `top_p` predictors 44 | with the smallest p-values. A value of `NA` implies that this criterion 45 | will be ignored.} 46 | 47 | \item{threshold}{A numeric value between 0 and 1 representing the percentile 48 | of best scoring features to select. For example `threshold = 0.9` will 49 | retain only predictors with scores in the top 90th percentile and a smaller 50 | threshold will select more features. Note that `top_p` and `threshold` are 51 | mutually exclusive but either can be used in conjunction with `cutoff` to 52 | select the top-ranked features and those that are smaller than the cutoff 53 | value.} 54 | 55 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors 56 | with _larger_ than the cutoff will be retained. A value of `NA` implies 57 | that this criterion will be ignored.} 58 | 59 | \item{neighbors}{An integer with the number of neighbors for find for each 60 | sampled instance. Default is 5.} 61 | 62 | \item{sample_size}{An integer with the number of instances to sample. Default 63 | is 10.} 64 | 65 | \item{exclude}{A character vector of predictor names that will be removed 66 | from the data. This will be set when `prep()` is used on the recipe and 67 | should not be set by the user.} 68 | 69 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the 70 | names of the variables and their information gain scores. This parameter is 71 | only produced after the recipe has been trained.} 72 | 73 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by 74 | bake.recipe()? While all operations are baked when prep.recipe() is run, 75 | some operations may not be able to be conducted on new data (e.g. 76 | processing the outcome variable(s)). Care should be taken when using skip = 77 | TRUE as it may affect the computations for subsequent operations.} 78 | 79 | \item{id}{A character string that is unique to this step to identify it.} 80 | 81 | \item{x}{A `step_select_relief` object.} 82 | } 83 | \value{ 84 | An updated version of `recipe` with the new step added to the 85 | sequence of existing steps (if any). For the `tidy` method, a tibble with a 86 | `terms` column for which predictors were removed. 87 | } 88 | \description{ 89 | Relief-based algorithms use nearest neighbors of randomly sampled 90 | observations (without replacement) to derive feature weights/scores that 91 | describe the relevance of each feature to the target variable. The feature 92 | weights represent the differences between the normalized feature values from 93 | each randomly sampled observation and a neighboring observation. If the 94 | neighboring observation's class is the same as the sampled observation 95 | (termed a 'hit') but the feature values are different, then this reduces the 96 | score on the basis that widely varying feature values for the same class are 97 | not desirable. Conversely, if a neighboring observation's class is different 98 | from the sampled observation (termed a 'miss') and the feature values are 99 | different, then this increases the score on the basis that observations of 100 | different classes are widely separated by their feature values. The feature 101 | weights / scores range from -1 (worst) to +1 (best). 102 | } 103 | \details{ 104 | `step_select_relief` creates a *specification* of a recipe step that selects 105 | a subset of predictors based on the scores of the relief algorithm. This step 106 | requires the FSinR package to be installed. The top `top_p` scoring features, 107 | or features whose scores occur in the top percentile `threshold` will be 108 | retained as new predictors. 109 | 110 | 111 | 112 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 113 | unspecified. 114 | } 115 | \examples{ 116 | \dontrun{ 117 | library(recipes) 118 | 119 | data(cells, package = "modeldata") 120 | 121 | rec <- recipe(class ~ ., data = cells[, -1]) \%>\% 122 | step_select_relief( 123 | all_predictors(), 124 | outcome = "class", 125 | top_p = 10 126 | ) 127 | 128 | prepped <- prep(rec) 129 | new_data <- bake(prepped, new_data = NULL) 130 | prepped 131 | } 132 | } 133 | \concept{preprocessing} 134 | \concept{supervised_filter} 135 | \keyword{datagen} 136 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Page not found (404) • colino 9 | 10 | 11 | 12 | 13 | 14 | 18 | 19 | 20 | Skip to contents 21 | 22 | 23 |
57 |
58 |
62 | 63 | Content not found. Please use links in the navbar. 64 | 65 |
66 |
67 | 68 | 69 |
73 | 74 | 78 | 79 |
80 |
81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /R/step_select_boruta.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using Boruta 2 | #' 3 | #' `step_select_boruta` creates a *specification* of a recipe step that selects 4 | #' a subset of predictors using the Boruta feature selection approach. 5 | #' 6 | #' The Boruta algorithm technically is a wrapper approach that uses random 7 | #' forests to test whether the feature importance scores obtained on the 8 | #' original data are higher than best of the scores obtained when the variables 9 | #' are randomly permuted. These permuted features are termed 'shadow' features. 10 | #' If the scores for any original feature are higher than the best of the scores 11 | #' for the randomly permuted features, then this is marked as a 'hit'. Features 12 | #' are confirmed or rejected based on a confidence threshold (default is p = 13 | #' 0.01) applied to the tails of the binomial distribution with p = 0.5. 14 | #' Features that do not fall within the lower (reject) or upper (accept) tails 15 | #' of the distribution are labelled as 'tentative'. Rejected features are 16 | #' dropped from the feature set and the procedure is repeated until no more 17 | #' 'tentative' features exist, or that a maximum number of runs are reached. 18 | #' 19 | #' @inheritParams step_select_aov 20 | #' @inherit step_select_aov return 21 | #' @param outcome A character string with the name of the response variable to 22 | #' use to calculate the feature importance scores. 23 | #' @param role Not used by this step since no new variables are created. 24 | #' @param options A list of options to pass to `Boruta::Boruta()`. The defaults 25 | #' use Boruta's defaults. *Note* that `x` and `y` should not be passed here. 26 | #' @param res The `Boruta::Boruta` object is stored here once this preprocessing 27 | #' step has been trained by `prep.recipe()`. 28 | #' 29 | #' @export 30 | #' @examples 31 | #' library(recipes) 32 | #' library(parsnip) 33 | #' 34 | #' # load the example iris dataset 35 | #' data(cells, package = "modeldata") 36 | #' 37 | #' # create a preprocessing recipe 38 | #' rec <- 39 | #' recipe(class ~ ., data = cells[, -1]) %>% 40 | #' step_select_boruta(all_predictors(), outcome = "class") 41 | #' 42 | #' prepped <- prep(rec) 43 | #' 44 | #' preproc_data <- juice(prepped) 45 | #' prepped 46 | step_select_boruta <- function( 47 | recipe, 48 | ..., 49 | outcome = NULL, 50 | role = "predictor", 51 | trained = FALSE, 52 | exclude = NULL, 53 | options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100), 54 | res = NULL, 55 | skip = FALSE, 56 | id = recipes::rand_id("select_boruta")) { 57 | 58 | recipes::recipes_pkg_check("Boruta") 59 | 60 | recipes::add_step( 61 | recipe, 62 | step_select_boruta_new( 63 | terms = recipes::ellipse_check(...), 64 | trained = trained, 65 | outcome = outcome, 66 | role = role, 67 | exclude = exclude, 68 | options = options, 69 | res = res, 70 | skip = skip, 71 | id = id 72 | ) 73 | ) 74 | } 75 | 76 | # wrapper around 'step' function that sets the class of new step objects 77 | #' @importFrom recipes step 78 | step_select_boruta_new <- function(terms, role, trained, outcome, exclude, 79 | options, res, skip, id) { 80 | recipes::step( 81 | subclass = "select_boruta", 82 | terms = terms, 83 | role = role, 84 | trained = trained, 85 | outcome = outcome, 86 | exclude = exclude, 87 | options = options, 88 | res = res, 89 | skip = skip, 90 | id = id 91 | ) 92 | } 93 | 94 | #' @export 95 | prep.step_select_boruta <- function(x, training, info = NULL, ...) { 96 | 97 | # translate the terms arguments 98 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 99 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 100 | y_name <- y_name[1] 101 | 102 | if (length(x_names) > 0) { 103 | 104 | call <- rlang::call2( 105 | .fn = "Boruta", 106 | .ns = "Boruta", 107 | x = rlang::quo(training[, x_names]), 108 | y = rlang::quo(training[[y_name]]), 109 | !!!x$options 110 | ) 111 | 112 | res <- rlang::eval_tidy(call) 113 | 114 | exclude <- names(res$finalDecision[res$finalDecision == "Rejected"]) 115 | 116 | } else { 117 | exclude <- character() 118 | } 119 | 120 | step_select_boruta_new( 121 | terms = x$terms, 122 | trained = TRUE, 123 | role = x$role, 124 | outcome = y_name, 125 | exclude = exclude, 126 | options = x$options, 127 | res = res, 128 | skip = x$skip, 129 | id = x$id 130 | ) 131 | } 132 | 133 | #' @export 134 | bake.step_select_boruta <- function(object, new_data, ...) { 135 | if (length(object$exclude) > 0) { 136 | new_data <- new_data[, !colnames(new_data) %in% object$exclude] 137 | } 138 | as_tibble(new_data) 139 | } 140 | 141 | #' @export 142 | print.step_select_boruta <- 143 | function(x, width = max(20, options()$width - 30), ...) { 144 | cat("Boruta feature selection") 145 | 146 | if (recipes::is_trained(x)) { 147 | n <- length(x$exclude) 148 | cat(paste0(" (", n, " excluded)")) 149 | } 150 | cat("\n") 151 | 152 | invisible(x) 153 | } 154 | 155 | #' @rdname step_select_boruta 156 | #' @param x A `step_select_boruta` object. 157 | #' @param type A character with either 'terms' (the default) to return a 158 | #' tibble containing the variables that have been removed by the filter step, 159 | #' or 'scores' to return the scores for each variable. 160 | #' @export 161 | tidy.step_select_boruta <- function(x, type = "terms", ...) { 162 | tidy_filter_step(x, type) 163 | } 164 | 165 | #' @rdname required_pkgs.colino 166 | #' @export 167 | required_pkgs.step_select_boruta <- function(x, ...) { 168 | c("colino", "Boruta") 169 | } 170 | -------------------------------------------------------------------------------- /docs/reference/pipe.html: -------------------------------------------------------------------------------- 1 | 2 | Pipe operator — %>% • colino 6 | Skip to contents 7 | 8 | 9 |
38 |
39 |
44 | 45 |
46 |

See magrittr::%>% for details.

47 |
48 | 49 |
50 |

Usage

51 |
lhs %>% rhs
52 |
53 | 54 | 55 |
56 | 57 | 58 |
61 | 62 | 65 | 66 |
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /R/step_select_roc.R: -------------------------------------------------------------------------------- 1 | #' Filter Numeric Predictors using ROC Curve 2 | #' 3 | #' `step_select_roc` creates a *specification* of a recipe step that will 4 | #' filter predictors using their relationship with the outcome as measured 5 | #' using a Receiver Operating Characteristic curve. 6 | #' 7 | #' @inheritParams step_select_aov 8 | #' @inherit step_select_aov return 9 | #' @param outcome A single character string that specifies a single categorical 10 | #' variable to be used as the class. 11 | #' @param role For model terms created by this step, what analysis role should 12 | #' they be assigned?. By default, the function assumes that resulting distances 13 | #' will be used as predictors in a model. 14 | #' 15 | #' @keywords datagen 16 | #' @concept preprocessing 17 | #' @concept supervised_filter 18 | #' @export 19 | #' @details 20 | #' 21 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 22 | #' unspecified. 23 | #' 24 | #' The ROC AUC will be set to be 1 - AUC if the value is less than 0.50. 25 | #' 26 | #' @examples 27 | #' data(cells, package = "modeldata") 28 | #' 29 | #' rec <- 30 | #' recipe(class ~ ., data = cells[, -1]) %>% 31 | #' step_select_roc(all_predictors(), outcome = "class", top_p = 10, cutoff = 0.9) %>% 32 | #' prep() 33 | #' 34 | #' rec %>% bake(all_predictors(), new_data = NULL) %>% names() 35 | #' 36 | #' # Use ROC values to select but always keep at least one: 37 | #' rec <- 38 | #' recipe(class ~ ., data = cells[, -1]) %>% 39 | #' step_select_roc( 40 | #' all_predictors(), 41 | #' outcome = "class", 42 | #' top_p = 1, 43 | #' cutoff = 0.99 44 | #' ) %>% 45 | #' prep() 46 | #' 47 | #' rec %>% juice(all_predictors()) %>% names() 48 | step_select_roc <- 49 | function(recipe, ..., outcome, role = "predictor", trained = FALSE, 50 | threshold = NA, top_p = NA, cutoff = NA, exclude = NULL, 51 | skip = FALSE, id = recipes::rand_id("select_roc")) { 52 | recipes::add_step( 53 | recipe, 54 | step_select_roc_new( 55 | terms = recipes::ellipse_check(...), 56 | outcome = outcome, 57 | role = role, 58 | trained = trained, 59 | top_p = top_p, 60 | threshold = threshold, 61 | cutoff = cutoff, 62 | exclude = exclude, 63 | skip = skip, 64 | id = id 65 | ) 66 | ) 67 | } 68 | 69 | step_select_roc_new <- 70 | function(terms, outcome, role, trained, top_p, threshold, cutoff, exclude, 71 | skip, id) { 72 | recipes::step( 73 | subclass = "select_roc", 74 | terms = terms, 75 | outcome = outcome, 76 | role = role, 77 | trained = trained, 78 | top_p = top_p, 79 | threshold = threshold, 80 | cutoff = cutoff, 81 | exclude = exclude, 82 | skip = skip, 83 | id = id 84 | ) 85 | } 86 | 87 | roc_calc <- function(x, y) { 88 | suppressMessages( 89 | suppressWarnings( 90 | { 91 | if (length(levels(y)) == 2) { 92 | res <- try(pROC::roc(y, x, direction = "auto"), 93 | silent = TRUE) 94 | } else { 95 | res <- try(pROC::multiclass.roc(y, x, direction = "auto"), 96 | silent = TRUE) 97 | } 98 | } 99 | ) 100 | ) 101 | 102 | if (inherits(res, "try-error")) { 103 | res <- NA_real_ 104 | } else { 105 | res <- unname(pROC::auc(res)) 106 | } 107 | res 108 | } 109 | 110 | #' @export 111 | prep.step_select_roc <- function(x, training, info = NULL, ...) { 112 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 113 | y_name <- x$outcome[1] 114 | recipes::check_type(training[, y_name], quant = FALSE) 115 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 116 | 117 | if (length(x_names) > 0) { 118 | 119 | recipes::check_type(training[, x_names]) 120 | 121 | # check criteria 122 | check_criteria(x$top_p, x$threshold, match.call()) 123 | check_zero_one(x$threshold) 124 | x$top_p <- check_top_p(x$top_p, length(x_names)) 125 | 126 | # filter 127 | scores <- purrr::map_dbl(training[, x_names], ~ roc_calc(.x, training[[y_name]])) 128 | exclude_chr <- dual_filter(scores, x$top_p, x$threshold, x$cutoff, 129 | maximize = TRUE) 130 | } else { 131 | exclude_chr <- character() 132 | } 133 | 134 | step_select_roc_new( 135 | terms = x$terms, 136 | outcome = x$outcome, 137 | role = x$role, 138 | trained = TRUE, 139 | top_p = x$top_p, 140 | threshold = x$threshold, 141 | cutoff = x$cutoff, 142 | exclude = exclude_chr, 143 | skip = x$skip, 144 | id = x$id 145 | ) 146 | } 147 | 148 | #' @export 149 | bake.step_select_roc <- function(object, new_data, ...) { 150 | if (length(object$exclude) > 0) { 151 | new_data <- new_data %>% 152 | dplyr::select(-dplyr::one_of(object$exclude)) 153 | } 154 | new_data 155 | } 156 | 157 | #' @export 158 | print.step_select_roc <- 159 | function(x, width = max(20, options()$width - 30), ...) { 160 | cat("ROC curve feature selection") 161 | 162 | if (recipes::is_trained(x)) { 163 | n <- length(x$exclude) 164 | cat(paste0(" (", n, " excluded)")) 165 | } 166 | cat("\n") 167 | 168 | invisible(x) 169 | } 170 | 171 | #' @rdname step_select_roc 172 | #' @param x A `step_select_roc` object. 173 | #' @export 174 | tidy.step_select_roc <- function(x, ...) { 175 | tidy_filter_step(x, type = "terms") 176 | } 177 | 178 | #' @export 179 | tunable.step_select_roc <- function(x, ...) { 180 | tibble::tibble( 181 | name = c("top_p", "threshold", "cutoff"), 182 | call_info = list( 183 | list(pkg = "colino", fun = "top_p"), 184 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 185 | list(pkg = "colino", fun = "cutoff") 186 | ), 187 | source = "recipe", 188 | component = "step_select_roc", 189 | component_id = x$id 190 | ) 191 | } 192 | 193 | #' @rdname required_pkgs.colino 194 | #' @export 195 | required_pkgs.step_select_roc <- function(x, ...) { 196 | c("colino", "pROC") 197 | } 198 | -------------------------------------------------------------------------------- /R/step_select_mrmr.R: -------------------------------------------------------------------------------- 1 | #' Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR) 2 | #' 3 | #' `step_select_mrmr` creates a *specification* of a recipe step that will apply 4 | #' minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric 5 | #' data. The top `top_p` scoring features, or features whose scores occur in the 6 | #' top percentile `threshold` will be retained as new predictors. 7 | #' 8 | #' @inheritParams step_select_aov 9 | #' @inherit step_select_aov return 10 | #' @param role Not used by this step since no new variables are created 11 | #' @param outcome A character string specifying the name of response variable 12 | #' used to evaluate mRMR. 13 | #' @param threads An integer specifying the number of threads to use for 14 | #' processing. The default = 0 uses all available threads. 15 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 16 | #' names of the variables and their mRMR scores. This parameter is only 17 | #' produced after the recipe has been trained. 18 | #' 19 | #' @keywords datagen 20 | #' @concept preprocessing 21 | #' @concept supervised_filter 22 | #' @export 23 | #' @details 24 | #' 25 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 26 | #' unspecified. 27 | #' 28 | #' @examples 29 | #' library(recipes) 30 | #' 31 | #' data(cells, package = "modeldata") 32 | #' 33 | #' rec <- 34 | #' recipe(class ~ ., data = cells[, -1]) %>% 35 | #' step_select_mrmr( 36 | #' all_predictors(), 37 | #' outcome = "class", 38 | #' top_p = 10 39 | #' ) 40 | #' 41 | #' prepped <- prep(rec) 42 | #' 43 | #' new_data <- bake(prepped, new_data = NULL) 44 | #' prepped 45 | step_select_mrmr <- function( 46 | recipe, ..., 47 | outcome = NULL, 48 | role = NA, 49 | trained = FALSE, 50 | top_p = NA, 51 | threshold = NA, 52 | cutoff = NA, 53 | threads = 0, 54 | exclude = NULL, 55 | scores = NULL, 56 | skip = FALSE, 57 | id = recipes::rand_id("select_mrmr")) { 58 | 59 | recipes::recipes_pkg_check("praznik") 60 | 61 | terms <- recipes::ellipse_check(...) 62 | 63 | recipes::add_step( 64 | recipe, 65 | step_select_mrmr_new( 66 | terms = terms, 67 | trained = trained, 68 | outcome = outcome, 69 | role = role, 70 | top_p = top_p, 71 | threshold = threshold, 72 | cutoff = cutoff, 73 | threads = threads, 74 | exclude = exclude, 75 | scores = scores, 76 | skip = skip, 77 | id = id 78 | ) 79 | ) 80 | } 81 | 82 | step_select_mrmr_new <- 83 | function(terms, role, trained, outcome, top_p, threshold, cutoff, threads, 84 | exclude, scores, skip, id) { 85 | recipes::step( 86 | subclass = "select_mrmr", 87 | terms = terms, 88 | role = role, 89 | trained = trained, 90 | outcome = outcome, 91 | top_p = top_p, 92 | threshold = threshold, 93 | cutoff = cutoff, 94 | threads = threads, 95 | exclude = exclude, 96 | scores = scores, 97 | skip = skip, 98 | id = id 99 | ) 100 | } 101 | 102 | #' @export 103 | prep.step_select_mrmr <- function(x, training, info = NULL, ...) { 104 | # extract response and predictor names 105 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 106 | y_name <- y_name[1] 107 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 108 | 109 | # check criteria 110 | check_criteria(x$top_p, x$threshold, match.call()) 111 | check_zero_one(x$threshold) 112 | x$top_p <- check_top_p(x$top_p, length(x_names)) 113 | 114 | if (length(x_names) > 0) { 115 | 116 | call <- rlang::call2( 117 | .fn = "MRMR", 118 | .ns = "praznik", 119 | X = rlang::quo(training[, x_names]), 120 | Y = rlang::quo(training[[y_name]]), 121 | k = length(x_names), 122 | threads = x$threads 123 | ) 124 | 125 | res <- rlang::eval_tidy(call) 126 | 127 | res <- tibble( 128 | variable = names(res$selection), 129 | score = res$score 130 | ) 131 | 132 | exclude <- 133 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 134 | 135 | } else { 136 | exclude <- character() 137 | } 138 | 139 | step_select_mrmr_new( 140 | terms = x$terms, 141 | trained = TRUE, 142 | role = x$role, 143 | outcome = y_name, 144 | top_p = x$top_p, 145 | threshold = x$threshold, 146 | cutoff = x$cutoff, 147 | threads = x$threads, 148 | exclude = exclude, 149 | scores = res, 150 | skip = x$skip, 151 | id = x$id 152 | ) 153 | } 154 | 155 | #' @export 156 | bake.step_select_mrmr <- function(object, new_data, ...) { 157 | if (length(object$exclude) > 0) { 158 | new_data <- new_data[, !(colnames(new_data) %in% object$exclude)] 159 | } 160 | as_tibble(new_data) 161 | } 162 | 163 | #' @export 164 | print.step_select_mrmr <- 165 | function(x, width = max(20, options()$width - 30), ...) { 166 | cat("mRMR feature selection") 167 | 168 | if (recipes::is_trained(x)) { 169 | n <- length(x$exclude) 170 | cat(paste0(" (", n, " excluded)")) 171 | } 172 | cat("\n") 173 | 174 | invisible(x) 175 | } 176 | 177 | #' @rdname step_select_mrmr 178 | #' @param x A `step_select_mrmr` object. 179 | #' @param type A character with either 'terms' (the default) to return a 180 | #' tibble containing the variables that have been removed by the filter step, 181 | #' or 'scores' to return the scores for each variable. 182 | #' @export 183 | tidy.step_select_mrmr <- function(x, type = "terms", ...) { 184 | tidy_filter_step(x, type) 185 | } 186 | 187 | #' @export 188 | tunable.step_select_mrmr <- function(x, ...) { 189 | tibble( 190 | name = c("top_p", "threshold", "cutoff"), 191 | call_info = list( 192 | list(pkg = "colino", fun = "top_p"), 193 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 194 | list(pkg = "colino", fun = "cutoff") 195 | ), 196 | source = "recipe", 197 | component = "step_select_mrmr", 198 | component_id = x$id 199 | ) 200 | } 201 | 202 | #' @rdname required_pkgs.colino 203 | #' @export 204 | required_pkgs.step_select_mrmr <- function(x, ...) { 205 | c("colino", "praznik") 206 | } 207 | -------------------------------------------------------------------------------- /tests/testthat/test_step_select_fcbf.R: -------------------------------------------------------------------------------- 1 | library(recipes) 2 | 3 | data(iris) 4 | 5 | test_that("basic usage: expected columns retrieved", { 6 | skip_if_not_installed("FCBF") 7 | 8 | my_iris <- iris 9 | my_iris[['lglfeat']] <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE) 10 | my_iris[['partial_NAfeat']] <- c(2, 3, 6, 4, 3, NA) 11 | 12 | rec <- 13 | recipe(Species ~ ., data = my_iris) %>% 14 | step_select_fcbf(all_predictors(), threshold = 0.001) 15 | 16 | rec_p <- prep(rec, training = my_iris) 17 | 18 | iris_bake <- bake(rec_p, new_data = my_iris) 19 | 20 | expect_equal(names(iris_bake), 21 | c("Sepal.Width", "Petal.Width", "Species")) 22 | }) 23 | 24 | test_that("warns/breaks if not enough predictors are provided", { 25 | skip_if_not_installed("FCBF") 26 | 27 | # warn if one usable predictor is provided to fcbf 28 | rec1 <- 29 | recipe(Species ~ Sepal.Length, iris) %>% 30 | step_select_fcbf(all_predictors()) 31 | 32 | expect_warning(prep(rec1, training = iris), "Only one usable") 33 | 34 | # stop if no usable predictors are provided to fcbf 35 | rec2 <- 36 | recipe(Species ~ ., iris[, 'Species', drop = FALSE]) %>% 37 | step_select_fcbf(all_predictors()) 38 | 39 | expect_error(prep(rec2, training = iris), "No usable predictors") 40 | }) 41 | 42 | test_that("step_select_fcbf rejects bad threshold or cutpoint argument input", { 43 | skip_if_not_installed("FCBF") 44 | 45 | rec <- recipe(Species ~ ., data = iris) 46 | 47 | expect_error( 48 | rec %>% 49 | step_select_fcbf(threshold = 1.5) %>% 50 | prep(), 51 | "(0, 1)" 52 | ) 53 | 54 | expect_error( 55 | rec %>% 56 | step_select_fcbf(threshold = NA) %>% 57 | prep(), 58 | "No usable" 59 | ) 60 | 61 | expect_error( 62 | rec %>% 63 | step_select_fcbf(threshold = 0) %>% 64 | prep(), 65 | "(0, 1)" 66 | ) 67 | 68 | expect_error( 69 | rec %>% 70 | step_select_fcbf(threshold = "median") %>% 71 | prep(), 72 | "should be numeric" 73 | ) 74 | 75 | expect_error( 76 | rec %>% 77 | step_select_fcbf(threshold = TRUE) %>% 78 | prep(), 79 | "should be numeric" 80 | ) 81 | 82 | expect_error( 83 | rec %>% 84 | step_select_fcbf(threshold = -0.01) %>% 85 | prep(), 86 | "(0, 1)" 87 | ) 88 | 89 | error_cut <- "`cutpoint` must be a number between 0-1" 90 | expect_error(rec %>% step_select_fcbf(cutpoint = 1.5), error_cut) 91 | expect_error(rec %>% step_select_fcbf(cutpoint = NA), error_cut) 92 | expect_error(rec %>% step_select_fcbf(cutpoint = 0), error_cut) 93 | expect_error(rec %>% step_select_fcbf(cutpoint = "median"), error_cut) 94 | expect_error(rec %>% step_select_fcbf(cutpoint = TRUE), error_cut) 95 | expect_error(rec %>% step_select_fcbf(cutpoint = -0.01), error_cut) 96 | }) 97 | 98 | # return warning if NA columns are provided 99 | test_that("NA columns get removed with warning", { 100 | skip_if_not_installed("FCBF") 101 | 102 | na_vec <- rep(NA, 10) 103 | na_dat <- tibble( 104 | out = rep(c("A", "B"), 5), 105 | f1 = as.character(na_vec), 106 | f2 = as.numeric(na_vec), 107 | f3 = na_vec, 108 | f4 = c(1, 4, 32, 6, 4, 23, 44, 54, 23, 6), 109 | f5 = c(1:10) 110 | ) 111 | 112 | inpt_cols <- c('f1', 'f2', 'f3', 'f4', 'f5') 113 | 114 | rec <- 115 | recipe(out ~ f1 + f2 + f3 + f4 + f5, data = na_dat) %>% 116 | step_select_fcbf(all_predictors()) 117 | 118 | expect_warning(remove_NA_cols(inpt_cols, na_dat), "3 features were full") 119 | expect_warning(prep(rec, na_dat), "3 features were full") 120 | }) 121 | 122 | 123 | # return error if outcome is not provided, or not in expected format 124 | test_that("bad outcome variables handled correctly", { 125 | skip_if_not_installed("FCBF") 126 | 127 | # no outcome variable specified in recipe 128 | rec <- 129 | recipe(iris) %>% 130 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 131 | step_select_fcbf(all_predictors()) 132 | 133 | expect_error(prep(rec, iris), "outcome variable was not found") 134 | 135 | # code works if outcome = argument supplied, despite no outcome in the recipe 136 | rec2 <- 137 | recipe(iris) %>% 138 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 139 | step_select_fcbf(all_predictors(), outcome = "Species") 140 | 141 | expect_equal( 142 | prep(rec2, iris) %>% bake(iris) %>% names, 143 | c("Sepal.Width", "Petal.Length", "Petal.Width", "Species") 144 | ) 145 | 146 | # outcome supplied in unexpected format 147 | rec3 <- 148 | recipe(iris) %>% 149 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 150 | step_select_fcbf(all_predictors(), outcome = 5) 151 | 152 | expect_error(prep(rec3, iris), "supplied as a character") 153 | 154 | rec4 <- 155 | recipe(iris) %>% 156 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 157 | step_select_fcbf(all_predictors(), outcome = c("Species", "Petal.Length")) 158 | 159 | expect_error(prep(rec4, iris), "single outcome variable can be") 160 | 161 | rec5 <- 162 | recipe(iris) %>% 163 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 164 | step_select_fcbf(all_predictors(), outcome = TRUE) 165 | 166 | expect_error(prep(rec5, iris), "supplied as a character") 167 | 168 | rec6 <- 169 | recipe(iris) %>% 170 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 171 | step_select_fcbf(all_predictors(), outcome = 'doesnt_exist') 172 | 173 | expect_error(prep(rec6, iris), "not found") 174 | 175 | rec7 <- 176 | recipe(iris) %>% 177 | update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = 'predictor') %>% 178 | step_select_fcbf(all_predictors(), outcome = NA) 179 | 180 | expect_error(prep(rec7, iris), "outcome variable was not found") 181 | }) 182 | 183 | # Test if user provides columns by name rather than using tidyselect helpers 184 | test_that("function works if user provides columns by name", { 185 | skip_if_not_installed("FCBF") 186 | 187 | rec <- 188 | recipe(Species ~ . , iris) %>% 189 | step_select_fcbf(c("Petal.Length", "Sepal.Length")) 190 | 191 | expect_equal( 192 | prep(rec, iris) %>% bake(iris) %>% names, 193 | c("Sepal.Width", "Petal.Length", "Petal.Width", "Species") 194 | ) 195 | }) 196 | -------------------------------------------------------------------------------- /docs/LICENSE.html: -------------------------------------------------------------------------------- 1 | 2 | MIT License • colino 6 | Skip to contents 7 | 8 | 9 |
38 |
39 |
43 | 44 |
45 | 46 |

Copyright (c) 2019 Steven Pawley

47 |

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

48 |

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

49 |

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

50 |
51 | 52 |
53 | 54 | 55 |
58 | 59 | 62 | 63 |
64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /R/step_select_vip.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using a model's feature importance scores or 2 | #' coefficients 3 | #' 4 | #' `step_select_vip` creates a *specification* of a recipe step that selects a 5 | #' subset of predictors based on the ranking of variable importance provided by 6 | #' a `parsnip` model specification and the `model` parameter 7 | #' 8 | #' @inheritParams step_select_aov 9 | #' @inherit step_select_aov return 10 | #' @param outcome A character string with the name of the response variable to 11 | #' use to calculate the feature importance scores. 12 | #' @param role Not used by this step since no new variables are created. 13 | #' @param model A `model_spec` object from `parsnip` that has a feature 14 | #' importances or coefficients method. The model needs to have an equivalent 15 | #' `pull_importances` method defined. See `?pull_importances` for how to 16 | #' define methods for models that are not currently supported. 17 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 18 | #' names of the variables and their feature importance scores. This parameter 19 | #' is only produced after the recipe has been trained. 20 | #' 21 | #' @export 22 | #' 23 | #' @details 24 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 25 | #' unspecified. 26 | #' 27 | #' @examples 28 | #' library(recipes) 29 | #' library(parsnip) 30 | #' 31 | #' # load the example cells dataset 32 | #' data(cells, package = "modeldata") 33 | #' 34 | #' # define a base model to use for feature importances 35 | #' base_model <- rand_forest(mode = "classification") %>% 36 | #' set_engine("ranger", importance = "permutation") 37 | #' 38 | #' # create a preprocessing recipe 39 | #' rec <- 40 | #' recipe(class ~ ., data = cells[, -1]) %>% 41 | #' step_select_vip( 42 | #' all_predictors(), 43 | #' outcome = "class", 44 | #' model = base_model, 45 | #' top_p = 10 46 | #' ) 47 | #' 48 | #' prepped <- prep(rec) 49 | #' 50 | #' preproc_data <- juice(prepped) 51 | #' prepped 52 | step_select_vip <- function( 53 | recipe, 54 | ..., 55 | outcome = NULL, 56 | role = "predictor", 57 | trained = FALSE, 58 | model = NULL, 59 | top_p = NA, 60 | threshold = NA, 61 | cutoff = NA, 62 | exclude = NULL, 63 | scores = NULL, 64 | skip = FALSE, 65 | id = recipes::rand_id("select_vip")) { 66 | 67 | if (missing(model)) 68 | rlang::abort("Model argument should be a `parsnip` model specification") 69 | 70 | recipes::add_step( 71 | recipe, 72 | step_select_vip_new( 73 | terms = recipes::ellipse_check(...), 74 | trained = trained, 75 | outcome = outcome, 76 | role = role, 77 | model = model, 78 | top_p = top_p, 79 | threshold = threshold, 80 | cutoff = cutoff, 81 | exclude = exclude, 82 | scores = scores, 83 | skip = skip, 84 | id = id 85 | ) 86 | ) 87 | } 88 | 89 | # wrapper around 'step' function that sets the class of new step objects 90 | #' @importFrom recipes step 91 | step_select_vip_new <- function(terms, role, trained, outcome, model, top_p, 92 | threshold, cutoff, exclude, scores, skip, id) { 93 | recipes::step( 94 | subclass = "select_vip", 95 | terms = terms, 96 | role = role, 97 | trained = trained, 98 | outcome = outcome, 99 | model = model, 100 | top_p = top_p, 101 | threshold = threshold, 102 | cutoff = cutoff, 103 | exclude = exclude, 104 | scores = scores, 105 | skip = skip, 106 | id = id 107 | ) 108 | } 109 | 110 | #' @export 111 | prep.step_select_vip <- function(x, training, info = NULL, ...) { 112 | 113 | # translate the terms arguments 114 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 115 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 116 | y_name <- y_name[1] 117 | 118 | # check criteria 119 | check_criteria(x$top_p, x$threshold, match.call()) 120 | check_zero_one(x$threshold) 121 | x$top_p <- check_top_p(x$top_p, length(x_names)) 122 | 123 | if (length(x_names) > 0) { 124 | # fit initial model 125 | X <- training[, x_names] 126 | y <- training[[y_name]] 127 | 128 | initial_model <- parsnip::fit_xy(x$model, X, y) 129 | res <- pull_importances(initial_model) 130 | names(res) <- c("variable", "score") 131 | res$score <- rlang::set_names(res$score, res$variable) 132 | 133 | exclude <- 134 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 135 | 136 | } else { 137 | exclude <- character() 138 | } 139 | 140 | step_select_vip_new( 141 | terms = x$terms, 142 | trained = TRUE, 143 | role = x$role, 144 | outcome = y_name, 145 | model = x$model, 146 | top_p = x$top_p, 147 | threshold = x$threshold, 148 | cutoff = x$cutoff, 149 | exclude = exclude, 150 | scores = res, 151 | skip = x$skip, 152 | id = x$id 153 | ) 154 | } 155 | 156 | #' @export 157 | bake.step_select_vip <- function(object, new_data, ...) { 158 | if (length(object$exclude) > 0) { 159 | new_data <- new_data[, !colnames(new_data) %in% object$exclude] 160 | } 161 | as_tibble(new_data) 162 | } 163 | 164 | #' @export 165 | print.step_select_vip <- 166 | function(x, width = max(20, options()$width - 30), ...) { 167 | cat("Variable importance feature selection") 168 | 169 | if (recipes::is_trained(x)) { 170 | n <- length(x$exclude) 171 | cat(paste0(" (", n, " excluded)")) 172 | } 173 | cat("\n") 174 | 175 | invisible(x) 176 | } 177 | 178 | #' @rdname step_select_vip 179 | #' @param x A `step_select_vip` object 180 | #' @param type A character with either 'terms' (the default) to return a 181 | #' tibble containing the variables that have been removed by the filter step, 182 | #' or 'scores' to return the scores for each variable. 183 | #' @export 184 | tidy.step_select_vip <- function(x, type = "terms", ...) { 185 | tidy_filter_step(x, type) 186 | } 187 | 188 | #' @export 189 | tunable.step_select_vip <- function(x, ...) { 190 | tibble( 191 | name = c("top_p", "threshold", "cutoff"), 192 | call_info = list( 193 | list(pkg = "colino", fun = "top_p"), 194 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 195 | list(pkg = "colino", fun = "cutoff") 196 | ), 197 | source = "recipe", 198 | component = "step_select_vip", 199 | component_id = x$id 200 | ) 201 | } 202 | 203 | #' @rdname required_pkgs.colino 204 | #' @export 205 | required_pkgs.step_select_vip <- function(x, ...) { 206 | c("colino") 207 | } 208 | -------------------------------------------------------------------------------- /R/step_select_xtab.R: -------------------------------------------------------------------------------- 1 | #' Filter Categorical Predictors using Contingency Tables 2 | #' 3 | #' `step_select_xtab` creates a *specification* of a recipe step that will 4 | #' filter predictors using their relationship with the outcome as measured 5 | #' using statistical tests for association. 6 | #' 7 | #' @inheritParams step_select_aov 8 | #' @inherit step_select_aov return 9 | #' @param outcome A single character string that specifies a single categorical 10 | #' variable to be used as the class. 11 | #' @param role For model terms created by this step, what analysis role should 12 | #' they be assigned?. By default, the function assumes that resulting distances 13 | #' will be used as predictors in a model. 14 | #' @param exact Should an exact test be used? 15 | #' @param fdr Should false discovery rates (FDR) be used instead of p-values? 16 | #' 17 | #' @keywords datagen 18 | #' @concept preprocessing 19 | #' @concept supervised_filter 20 | #' @export 21 | #' @details 22 | #' 23 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 24 | #' unspecified. If both are used, they are combined via 'or'. 25 | #' 26 | #' The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]). 27 | #' 28 | #' Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed. 29 | #' @examples 30 | #' data(attrition, package = "modeldata") 31 | #' 32 | #' rec <- 33 | #' recipe(Attrition ~ ., data = attrition) %>% 34 | #' step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition", 35 | #' top_p = 1, cutoff = 0.001, exact = TRUE) %>% 36 | #' prep() 37 | #' 38 | #' rec %>% juice(all_nominal(), -all_outcomes()) %>% names() 39 | #' 40 | #' tidy(rec, number = 1) 41 | step_select_xtab <- function(recipe, 42 | ..., 43 | outcome, 44 | role = "predictor", 45 | trained = FALSE, 46 | top_p = NA, 47 | threshold = NA, 48 | cutoff = NA, 49 | exact = FALSE, 50 | fdr = TRUE, 51 | exclude = NULL, 52 | skip = FALSE, 53 | id = recipes::rand_id("select_xtab")) { 54 | recipes::add_step( 55 | recipe, 56 | step_select_xtab_new( 57 | terms = recipes::ellipse_check(...), 58 | outcome = outcome, 59 | role = role, 60 | trained = trained, 61 | top_p = top_p, 62 | threshold = threshold, 63 | cutoff = cutoff, 64 | exact = exact, 65 | fdr = fdr, 66 | exclude = exclude, 67 | skip = skip, 68 | id = id 69 | ) 70 | ) 71 | } 72 | 73 | step_select_xtab_new <- 74 | function(terms, outcome, role, trained, top_p, threshold, cutoff, exact, fdr, 75 | exclude, skip, id) { 76 | recipes::step( 77 | subclass = "select_xtab", 78 | terms = terms, 79 | outcome = outcome, 80 | role = role, 81 | trained = trained, 82 | top_p = top_p, 83 | threshold = threshold, 84 | cutoff = cutoff, 85 | exact = exact, 86 | fdr = fdr, 87 | exclude = exclude, 88 | skip = skip, 89 | id = id 90 | ) 91 | } 92 | 93 | tbl_calc <- function(x, y, exact) { 94 | xtab <- table(x, y) 95 | if (exact) { 96 | res <- suppressWarnings(try(stats::fisher.test(xtab)$p.value, silent = TRUE)) 97 | } else { 98 | res <- suppressWarnings(try(stats::chisq.test(xtab)$p.value, silent = TRUE)) 99 | } 100 | if (inherits(res, "try-error")) { 101 | res <- NA_real_ 102 | } 103 | res 104 | } 105 | 106 | #' @export 107 | prep.step_select_xtab <- function(x, training, info = NULL, ...) { 108 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 109 | y_name <- x$outcome[1] 110 | recipes::check_type(training[, y_name], quant = FALSE) 111 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 112 | 113 | if (length(x_names) > 0) { 114 | 115 | recipes::check_type(training[, x_names], quant = FALSE) 116 | 117 | # check criteria 118 | check_criteria(x$top_p, x$threshold, match.call()) 119 | check_zero_one(x$threshold) 120 | x$top_p <- check_top_p(x$top_p, length(x_names)) 121 | 122 | # filter 123 | scores <- purrr::map_dbl(training[, x_names], 124 | ~ tbl_calc(.x, training[[y_name]], exact = x$exact)) 125 | scores <- sort(scores, na.last = TRUE) 126 | if (x$fdr) { 127 | scores <- stats::p.adjust(scores, method = "BH") 128 | } 129 | 130 | exclude_chr <- dual_filter(scores, x$top_p, x$threshold, x$cutoff, maximize = FALSE) 131 | } else { 132 | exclude_chr <- character() 133 | } 134 | 135 | step_select_xtab_new( 136 | terms = x$terms, 137 | outcome = x$outcome, 138 | role = x$role, 139 | trained = TRUE, 140 | top_p = x$top_p, 141 | threshold = x$threshold, 142 | cutoff = x$cutoff, 143 | exact = x$exact, 144 | fdr = x$fdr, 145 | exclude = exclude_chr, 146 | skip = x$skip, 147 | id = x$id 148 | ) 149 | } 150 | 151 | #' @export 152 | bake.step_select_xtab <- function(object, new_data, ...) { 153 | if (length(object$exclude) > 0) { 154 | new_data <- new_data %>% dplyr::select(-dplyr::one_of(object$exclude)) 155 | } 156 | new_data 157 | } 158 | 159 | #' @export 160 | print.step_select_xtab <- 161 | function(x, width = max(20, options()$width - 30), ...) { 162 | cat("Association test feature selection") 163 | 164 | if (recipes::is_trained(x)) { 165 | n <- length(x$exclude) 166 | cat(paste0(" (", n, " excluded)")) 167 | } 168 | cat("\n") 169 | 170 | invisible(x) 171 | } 172 | 173 | #' @rdname step_select_xtab 174 | #' @param x A `step_select_xtab` object. 175 | #' @export 176 | tidy.step_select_xtab <- function(x, ...) { 177 | tidy_filter_step(x, type = "terms") 178 | } 179 | 180 | #' @export 181 | tunable.step_select_xtab <- function(x, ...) { 182 | tibble::tibble( 183 | name = c("top_p", "threshold", "cutoff"), 184 | call_info = list( 185 | list(pkg = "colino", fun = "top_p"), 186 | list(pkg = "dials", fun = "threshold", range = c(-10, -1)), 187 | list(pkg = "colino", fun = "cutoff") 188 | ), 189 | source = "recipe", 190 | component = "step_select_xtab", 191 | component_id = x$id 192 | ) 193 | } 194 | 195 | #' @rdname required_pkgs.colino 196 | #' @export 197 | required_pkgs.step_select_xtab <- function(x, ...) { 198 | c("colino") 199 | } 200 | -------------------------------------------------------------------------------- /docs/authors.html: -------------------------------------------------------------------------------- 1 | 2 | Authors and Citation • colino 6 | Skip to contents 7 | 8 | 9 |
38 |
39 |
42 | 43 |
44 |

Authors

45 | 46 |
  • 47 |

    Steven Pawley. Author, maintainer. 48 |

    49 |
  • 50 |
  • 51 |

    Max Kuhn. Author. 52 |

    53 |
  • 54 |
  • 55 |

    Rowan Jacques-Hamilton. Author. 56 |

    57 |
  • 58 |
59 | 60 |
61 |

Citation

62 |

Source: DESCRIPTION

63 | 64 |

Pawley S, Kuhn M, Jacques-Hamilton R (2022). 65 | colino: Recipes Steps for Supervised Filter-Based Feature Selection. 66 | R package version 0.0.1, https://stevenpawley.github.io/colino. 67 |

68 |
@Manual{,
69 |   title = {colino: Recipes Steps for Supervised Filter-Based Feature Selection},
70 |   author = {Steven Pawley and Max Kuhn and Rowan Jacques-Hamilton},
71 |   year = {2022},
72 |   note = {R package version 0.0.1},
73 |   url = {https://stevenpawley.github.io/colino},
74 | }
75 |
76 |
78 | 79 | 80 |
89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /R/step_select_carscore.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using the CAR score algorithm 2 | #' 3 | #' `step_select_carscore` creates a *specification* of a recipe step that 4 | #' selects a subset of predictors as part of a regression model based on the 5 | #' scores of the CAR score algorithm. This step requires the `care` package to be 6 | #' installed. The top `top_p` scoring features, or features whose scores occur 7 | #' in the top percentile `threshold` will be retained as new predictors. 8 | #' 9 | #' @inheritParams step_select_aov 10 | #' @inherit step_select_aov return 11 | #' @param role Not used by this step since no new variables are created. 12 | #' @param lambda The correlation shrinkage intensity (range 0-1). 13 | #' @param diagonal For diagonal = FALSE (the default) CAR scores are computed; 14 | #' otherwise with diagonal = TRUE marginal correlations. 15 | #' @param outcome A character string with the name of the response variable. 16 | #' This must refer to a numeric feature for regression. 17 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 18 | #' names of the variables and the absolute values of the calculated CAR 19 | #' scores. This parameter is only produced after the recipe has been trained. 20 | #' @export 21 | #' @keywords datagen 22 | #' @concept preprocessing 23 | #' @concept supervised_filter 24 | #' @export 25 | #' @details 26 | #' 27 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 28 | #' unspecified. 29 | #' 30 | #' @examples 31 | #' library(recipes) 32 | #' 33 | #' data(car_prices, package = "modeldata") 34 | #' 35 | #' rec <- 36 | #' recipe(Price ~ ., data = car_prices) %>% 37 | #' step_select_carscore( 38 | #' all_predictors(), 39 | #' outcome = "Price", 40 | #' top_p = 5, 41 | #' cutoff = 0.7 42 | #' ) 43 | #' 44 | #' prepped <- prep(rec) 45 | #' 46 | #' new_data <- bake(prepped, new_data = NULL) 47 | #' prepped 48 | step_select_carscore <- function( 49 | recipe, ..., 50 | outcome = NULL, 51 | role = NA, 52 | trained = FALSE, 53 | top_p = NA, 54 | threshold = NA, 55 | cutoff = NA, 56 | lambda = NA, 57 | diagonal = FALSE, 58 | exclude = NULL, 59 | scores = NULL, 60 | skip = FALSE, 61 | id = recipes::rand_id("select_carscore")) { 62 | 63 | recipes::recipes_pkg_check("care") 64 | 65 | terms <- recipes::ellipse_check(...) 66 | 67 | recipes::add_step( 68 | recipe, 69 | step_select_carscore_new( 70 | terms = terms, 71 | trained = trained, 72 | outcome = outcome, 73 | role = role, 74 | top_p = top_p, 75 | threshold = threshold, 76 | cutoff = cutoff, 77 | lambda = lambda, 78 | diagonal = diagonal, 79 | exclude = exclude, 80 | scores = scores, 81 | skip = skip, 82 | id = id 83 | ) 84 | ) 85 | } 86 | 87 | 88 | # wrapper around 'step' function that sets the class of new step objects 89 | step_select_carscore_new <- 90 | function(terms, role, trained, outcome, top_p, threshold, cutoff, lambda, 91 | diagonal, exclude, scores, skip, id) { 92 | recipes::step( 93 | subclass = "select_carscore", 94 | terms = terms, 95 | role = role, 96 | trained = trained, 97 | outcome = outcome, 98 | top_p = top_p, 99 | threshold = threshold, 100 | cutoff = cutoff, 101 | lambda = lambda, 102 | diagonal = diagonal, 103 | exclude = exclude, 104 | scores = scores, 105 | skip = skip, 106 | id = id 107 | ) 108 | } 109 | 110 | 111 | #' @export 112 | prep.step_select_carscore <- function(x, training, info = NULL, ...) { 113 | 114 | # extract response and predictor names 115 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 116 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 117 | y_name <- y_name[1] 118 | 119 | # check criteria 120 | recipes::check_type(training[, y_name], quant = TRUE) 121 | check_criteria(x$top_p, x$threshold, match.call()) 122 | check_zero_one(x$threshold) 123 | x$top_p <- check_top_p(x$top_p, length(x_names)) 124 | 125 | # information gain 126 | if (length(x_names) > 0) { 127 | 128 | args <- list() 129 | 130 | if (!is.na(x$lambda)) 131 | args$lambda <- x$lambda 132 | 133 | call <- rlang::call2( 134 | .fn = "carscore", 135 | .ns = "care", 136 | Xtrain = training[, x_names], 137 | Ytrain = training[, y_name], 138 | diagonal = x$diagonal, 139 | !!!args 140 | ) 141 | 142 | res <- rlang::eval_tidy(call) 143 | 144 | res <- tibble( 145 | variable = names(res), 146 | score = abs(res) 147 | ) 148 | 149 | exclude <- 150 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 151 | 152 | } else { 153 | exclude <- character() 154 | } 155 | 156 | step_select_carscore_new( 157 | terms = x$terms, 158 | trained = TRUE, 159 | role = x$role, 160 | outcome = y_name, 161 | top_p = x$top_p, 162 | threshold = x$threshold, 163 | cutoff = x$cutoff, 164 | lambda = x$lambda, 165 | diagonal = x$diagonal, 166 | exclude = exclude, 167 | scores = res, 168 | skip = x$skip, 169 | id = x$id 170 | ) 171 | } 172 | 173 | #' @export 174 | bake.step_select_carscore <- function(object, new_data, ...) { 175 | if (length(object$exclude > 0)) { 176 | new_data <- new_data[, !(colnames(new_data) %in% object$exclude)] 177 | } 178 | as_tibble(new_data) 179 | } 180 | 181 | #' @export 182 | print.step_select_carscore <- 183 | function(x, width = max(20, options()$width - 30), ...) { 184 | cat("Carscore feature selection") 185 | 186 | if (recipes::is_trained(x)) { 187 | n <- length(x$exclude) 188 | cat(paste0(" (", n, " excluded)")) 189 | } 190 | cat("\n") 191 | 192 | invisible(x) 193 | } 194 | 195 | #' @rdname step_select_carscore 196 | #' @param x A `step_select_carscore` object. 197 | #' @param type A character with either 'terms' (the default) to return a 198 | #' tibble containing the variables that have been removed by the filter step, 199 | #' or 'scores' to return the scores for each variable. 200 | #' @export 201 | tidy.step_select_carscore <- function(x, type = "terms", ...) { 202 | tidy_filter_step(x, type) 203 | } 204 | 205 | #' @export 206 | tunable.step_select_carscore <- function(x, ...) { 207 | tibble::tibble( 208 | name = c("top_p", "threshold", "cutoff"), 209 | call_info = list( 210 | list(pkg = "colino", fun = "top_p"), 211 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 212 | list(pkg = "colino", fun = "cutoff") 213 | ), 214 | source = "recipe", 215 | component = "step_select_carscore", 216 | component_id = x$id 217 | ) 218 | } 219 | 220 | #' @rdname required_pkgs.colino 221 | #' @export 222 | required_pkgs.step_select_carscore <- function(x, ...) { 223 | c("colino", "care") 224 | } 225 | -------------------------------------------------------------------------------- /R/step_select_infgain.R: -------------------------------------------------------------------------------- 1 | #' Information gain feature selection step 2 | #' 3 | #' `step_select_infgain` creates a *specification* of a recipe step that selects a 4 | #' subset of predictors based on the scores of the information gain algorithm. 5 | #' This step requires the FSelectorRcpp package to be installed. The top 6 | #' `top_p` scoring features, or features whose scores occur in the top 7 | #' percentile `threshold` will be retained as new predictors. 8 | #' 9 | #' @inheritParams step_select_aov 10 | #' @inherit step_select_aov return 11 | #' @param role Not used by this step since no new variables are created. 12 | #' @param type A character string specifying the information gain method to use. 13 | #' One of "infogain", "gainratio", "symuncert". The default is 'infogain'. 14 | #' @param outcome A character string with the name of the response variable to 15 | #' use to evaluate information gain value against the predictors. 16 | #' @param type The entropy measure. One of c("infogain", "gainratio", 17 | #' "symuncert"). The default is 'infogain'. 18 | #' @param threads An integer specifying the number of threads to use for 19 | #' processing. The default = 0 uses all available threads. 20 | #' @param nbins An integer specifying the number of bins for discretization. 21 | #' Only used if the outcome of a continuous variable for regression. The 22 | #' default is 'nbins = 5'. 23 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 24 | #' names of the variables and their information gain scores. This parameter is 25 | #' only produced after the recipe has been trained. 26 | #' 27 | #' @export 28 | #' @keywords datagen 29 | #' @concept preprocessing 30 | #' @concept supervised_filter 31 | #' @export 32 | #' @details 33 | #' 34 | #' The recipe will stop if both `top_p`, `threshold` and `cutoff` are left 35 | #' unspecified. 36 | #' 37 | #' @examples 38 | #' library(recipes) 39 | #' 40 | #' data(cells, package = "modeldata") 41 | #' 42 | #' rec <- 43 | #' recipe(class ~ ., data = cells[, -1]) %>% 44 | #' step_select_infgain( 45 | #' all_predictors(), 46 | #' outcome = "class", 47 | #' threshold = 0.9, 48 | #' id = "infgain" 49 | #' ) 50 | #' 51 | #' prepped <- prep(rec) 52 | #' 53 | #' new_data <- juice(prepped) 54 | #' prepped 55 | step_select_infgain <- function( 56 | recipe, ..., 57 | outcome = NULL, 58 | role = NA, 59 | trained = FALSE, 60 | top_p = NA, 61 | threshold = NA, 62 | cutoff = NA, 63 | type = "infogain", 64 | nbins = 5, 65 | threads = 1, 66 | exclude = NULL, 67 | scores = NULL, 68 | skip = FALSE, 69 | id = recipes::rand_id("select_infgain")) { 70 | 71 | recipes::recipes_pkg_check("FSelectorRcpp") 72 | 73 | terms <- recipes::ellipse_check(...) 74 | 75 | recipes::add_step( 76 | recipe, 77 | step_select_infgain_new( 78 | terms = terms, 79 | trained = trained, 80 | outcome = outcome, 81 | role = role, 82 | top_p = top_p, 83 | threshold = threshold, 84 | cutoff = cutoff, 85 | type = type, 86 | threads = threads, 87 | nbins = nbins, 88 | exclude = exclude, 89 | scores = scores, 90 | skip = skip, 91 | id = id 92 | ) 93 | ) 94 | } 95 | 96 | 97 | # wrapper around 'step' function that sets the class of new step objects 98 | step_select_infgain_new <- function(terms, role, trained, outcome, top_p, 99 | threshold, cutoff, type, threads, nbins, 100 | exclude, scores, skip, id) { 101 | recipes::step( 102 | subclass = "select_infgain", 103 | terms = terms, 104 | role = role, 105 | trained = trained, 106 | outcome = outcome, 107 | top_p = top_p, 108 | threshold = threshold, 109 | cutoff = cutoff, 110 | type = type, 111 | threads = threads, 112 | nbins = nbins, 113 | exclude = exclude, 114 | scores = scores, 115 | skip = skip, 116 | id = id 117 | ) 118 | } 119 | 120 | 121 | #' @export 122 | prep.step_select_infgain <- function(x, training, info = NULL, ...) { 123 | # extract response and predictor names 124 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 125 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 126 | y_name <- y_name[1] 127 | 128 | # check criteria 129 | check_criteria(x$top_p, x$threshold, match.call()) 130 | check_zero_one(x$threshold) 131 | x$top_p <- check_top_p(x$top_p, length(x_names)) 132 | 133 | # information gain 134 | if (length(x_names) > 0) { 135 | 136 | f <- stats::as.formula(paste(y_name, "~", paste0(x_names, collapse = " + "))) 137 | model_mode <- check_outcome(training[[y_name]]) 138 | equal <- model_mode == "regression" 139 | 140 | ig_call <- rlang::call2( 141 | .fn = "information_gain", 142 | .ns = "FSelectorRcpp", 143 | formula = f, 144 | data = rlang::quo(training), 145 | type = x$type, 146 | threads = x$threads, 147 | discIntegers = TRUE, 148 | equal = equal, 149 | nbins = x$nbins 150 | ) 151 | 152 | res <- rlang::eval_tidy(ig_call) 153 | res <- as_tibble(res) 154 | res <- rlang::set_names(res, c("variable", "score")) 155 | res$score <- rlang::set_names(res$score, res$variable) 156 | 157 | exclude <- 158 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 159 | 160 | } else { 161 | exclude <- character() 162 | } 163 | 164 | step_select_infgain_new( 165 | terms = x$terms, 166 | trained = TRUE, 167 | role = x$role, 168 | outcome = y_name, 169 | top_p = x$top_p, 170 | threshold = x$threshold, 171 | cutoff = x$cutoff, 172 | type = x$type, 173 | threads = x$threads, 174 | nbins = x$nbins, 175 | exclude = exclude, 176 | scores = res, 177 | skip = x$skip, 178 | id = x$id 179 | ) 180 | } 181 | 182 | #' @export 183 | bake.step_select_infgain <- function(object, new_data, ...) { 184 | if (length(object$exclude > 0)) { 185 | new_data <- new_data[, !(colnames(new_data) %in% object$exclude)] 186 | } 187 | as_tibble(new_data) 188 | } 189 | 190 | #' @export 191 | print.step_select_infgain <- 192 | function(x, width = max(20, options()$width - 30), ...) { 193 | cat("Information Gain feature selection") 194 | 195 | if (recipes::is_trained(x)) { 196 | n <- length(x$exclude) 197 | cat(paste0(" (", n, " excluded)")) 198 | } 199 | cat("\n") 200 | 201 | invisible(x) 202 | } 203 | 204 | #' @rdname step_select_infgain 205 | #' @param x A `step_select_infgain` object. 206 | #' @param type A character with either 'terms' (the default) to return a 207 | #' tibble containing the variables that have been removed by the filter step, 208 | #' or 'scores' to return the scores for each variable. 209 | #' @export 210 | tidy.step_select_infgain <- function(x, type = "terms", ...) { 211 | tidy_filter_step(x, type) 212 | } 213 | 214 | #' @export 215 | tunable.step_select_infgain <- function(x, ...) { 216 | tibble::tibble( 217 | name = c("top_p", "entropy", "threshold", "cutoff"), 218 | call_info = list( 219 | list(pkg = "colino", fun = "top_p"), 220 | list(pkg = "colino", fun = "entropy", values = values_entropy), 221 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 222 | list(pkg = "colino", fun = "cutoff") 223 | ), 224 | source = "recipe", 225 | component = "step_select_infgain", 226 | component_id = x$id 227 | ) 228 | } 229 | 230 | #' @rdname required_pkgs.colino 231 | #' @export 232 | required_pkgs.step_select_infgain <- function(x, ...) { 233 | c("colino", "FSelectorRcpp") 234 | } 235 | -------------------------------------------------------------------------------- /R/step_select_tree.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using a decision tree importance scores 2 | #' 3 | #' `step_select_tree` creates a *specification* of a recipe step that selects a 4 | #' subset of predictors based on the ranking of variable importance provided by 5 | #' a `parsnip::decision_tree` supported model. 6 | #' 7 | #' @inheritParams step_select_aov 8 | #' @inherit step_select_aov return 9 | #' @param outcome A character string with the name of the response variable to 10 | #' use to calculate the feature importance scores. 11 | #' @param role Not used by this step since no new variables are created. 12 | #' @param engine A supported rand_forest engine that is supported by parsnip. 13 | #' The default is "rpart". 14 | #' @param cost_complexity A positive number for the the cost/complexity 15 | #' parameter (a.k.a. Cp) used by CART models (specific engines only). 16 | #' @param tree_depth An integer for maximum depth of the tree. 17 | #' @param min_n An integer for the minimum number of data points in a node that 18 | #' are required for the node to be split further. 19 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 20 | #' names of the variables and their feature importance scores. This parameter 21 | #' is only produced after the recipe has been trained. 22 | #' 23 | #' @export 24 | #' 25 | #' @details 26 | #' 27 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 28 | #' unspecified. 29 | #' 30 | #' @examples 31 | #' library(recipes) 32 | #' library(parsnip) 33 | #' 34 | #' # load the example cells dataset 35 | #' data(cells, package = "modeldata") 36 | #' 37 | #' # create a preprocessing recipe 38 | #' rec <- 39 | #' recipe(class ~ ., data = cells[, -1]) %>% 40 | #' step_select_tree(all_predictors(), outcome = "class", top_p = 10) 41 | #' 42 | #' prepped <- prep(rec) 43 | #' 44 | #' preproc_data <- bake(prepped, new_data = NULL) 45 | #' prepped 46 | step_select_tree <- function( 47 | recipe, 48 | ..., 49 | outcome = NULL, 50 | role = "predictor", 51 | trained = FALSE, 52 | engine = "rpart", 53 | cost_complexity = NULL, 54 | tree_depth = NULL, 55 | min_n = NULL, 56 | top_p = NA, 57 | threshold = NA, 58 | cutoff = NA, 59 | exclude = NULL, 60 | scores = NULL, 61 | skip = FALSE, 62 | id = recipes::rand_id("select_tree")) { 63 | 64 | engines <- parsnip::show_engines("decision_tree")$engine 65 | 66 | if (!engine %in% engines) { 67 | rlang::abort( 68 | paste("Engine argument should be one of", paste(engines, collapse = ", ")) 69 | ) 70 | } 71 | 72 | recipes::add_step( 73 | recipe, 74 | step_select_tree_new( 75 | terms = recipes::ellipse_check(...), 76 | trained = trained, 77 | outcome = outcome, 78 | role = role, 79 | engine = engine, 80 | cost_complexity = cost_complexity, 81 | tree_depth = tree_depth, 82 | min_n = min_n, 83 | top_p = top_p, 84 | threshold = threshold, 85 | cutoff = cutoff, 86 | exclude = exclude, 87 | scores = scores, 88 | skip = skip, 89 | id = id 90 | ) 91 | ) 92 | } 93 | 94 | # wrapper around 'step' function that sets the class of new step objects 95 | #' @importFrom recipes step 96 | step_select_tree_new <- function(terms, role, trained, outcome, engine, 97 | top_p, threshold, cutoff, cost_complexity, 98 | tree_depth, min_n, exclude, scores, skip, id) { 99 | recipes::step( 100 | subclass = "select_tree", 101 | terms = terms, 102 | role = role, 103 | trained = trained, 104 | outcome = outcome, 105 | engine = engine, 106 | cost_complexity = cost_complexity, 107 | tree_depth = tree_depth, 108 | min_n = min_n, 109 | top_p = top_p, 110 | threshold = threshold, 111 | cutoff = cutoff, 112 | exclude = exclude, 113 | scores = scores, 114 | skip = skip, 115 | id = id 116 | ) 117 | } 118 | 119 | #' @export 120 | prep.step_select_tree <- function(x, training, info = NULL, ...) { 121 | 122 | # translate the terms arguments 123 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 124 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 125 | y_name <- y_name[1] 126 | 127 | # check criteria 128 | check_criteria(x$top_p, x$threshold, match.call()) 129 | check_zero_one(x$threshold) 130 | x$top_p <- check_top_p(x$top_p, length(x_names)) 131 | 132 | if (length(x_names) > 0) { 133 | # fit initial model 134 | X <- training[, x_names] 135 | y <- training[[y_name]] 136 | 137 | model_mode <- check_outcome(y) 138 | 139 | model_args <- list( 140 | cost_complexity = x$cost_complexity, 141 | tree_depth = x$tree_depth, 142 | min_n = x$min_n 143 | ) 144 | 145 | model_spec <- 146 | parsnip::make_call("decision_tree", args = model_args, ns = "parsnip") 147 | 148 | model_spec <- 149 | rlang::eval_tidy(model_spec) %>% 150 | parsnip::set_mode(model_mode) %>% 151 | parsnip::set_engine(x$engine) 152 | 153 | initial_model <- parsnip::fit_xy(model_spec, X, y) 154 | res <- pull_importances(initial_model) 155 | names(res) <- c("variable", "score") 156 | res$score <- rlang::set_names(res$score, res$variable) 157 | 158 | exclude <- 159 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 160 | 161 | } else { 162 | exclude <- character() 163 | } 164 | 165 | step_select_tree_new( 166 | terms = x$terms, 167 | trained = TRUE, 168 | role = x$role, 169 | outcome = y_name, 170 | engine = x$engine, 171 | cost_complexity = x$cost_complexity, 172 | tree_depth = x$tree_depth, 173 | min_n = x$min_n, 174 | top_p = x$top_p, 175 | threshold = x$threshold, 176 | cutoff = x$cutoff, 177 | exclude = exclude, 178 | scores = res, 179 | skip = x$skip, 180 | id = x$id 181 | ) 182 | } 183 | 184 | #' @export 185 | bake.step_select_tree <- function(object, new_data, ...) { 186 | if (length(object$exclude) > 0) { 187 | new_data <- new_data[, !colnames(new_data) %in% object$exclude] 188 | } 189 | 190 | as_tibble(new_data) 191 | } 192 | 193 | #' @export 194 | print.step_select_tree <- 195 | function(x, width = max(20, options()$width - 30), 196 | ...) { 197 | cat("Variable importance feature selection") 198 | 199 | if (recipes::is_trained(x)) { 200 | n <- length(x$exclude) 201 | cat(paste0(" (", n, " excluded)")) 202 | } 203 | cat("\n") 204 | 205 | invisible(x) 206 | } 207 | 208 | #' @rdname step_select_tree 209 | #' @param x A `step_select_tree` object. 210 | #' @param type A character with either 'terms' (the default) to return a 211 | #' tibble containing the variables that have been removed by the filter step, 212 | #' or 'scores' to return the scores for each variable. 213 | #' @export 214 | tidy.step_select_tree <- function(x, type = "terms", ...) { 215 | tidy_filter_step(x, type) 216 | } 217 | 218 | #' @export 219 | tunable.step_select_tree <- function(x, ...) { 220 | tibble( 221 | name = c("top_p", "threshold", "cutoff", "cost_complexity", "tree_depth", "min_n"), 222 | call_info = list( 223 | list(pkg = "colino", fun = "top_p"), 224 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 225 | list(pkg = "colino", fun = "cutoff"), 226 | list(pkg = "dials", fun = "cost_complexity", range = c(-10, -1), 227 | trans = scales::log10_trans()), 228 | list(pkg = "dials", fun = "tree_depth", range = c(1L, 15L)), 229 | list(pkg = "dials", fun = "min_n", range = c(2L, 40L)) 230 | ), 231 | source = "recipe", 232 | component = "step_select_tree", 233 | component_id = x$id 234 | ) 235 | } 236 | 237 | #' @rdname required_pkgs.colino 238 | #' @export 239 | required_pkgs.step_select_tree <- function(x, ...) { 240 | c("colino") 241 | } 242 | -------------------------------------------------------------------------------- /R/step_select_relief.R: -------------------------------------------------------------------------------- 1 | #' Feature selection step using the Relief algorithm 2 | #' 3 | #' Relief-based algorithms use nearest neighbors of randomly sampled 4 | #' observations (without replacement) to derive feature weights/scores that 5 | #' describe the relevance of each feature to the target variable. The feature 6 | #' weights represent the differences between the normalized feature values from 7 | #' each randomly sampled observation and a neighboring observation. If the 8 | #' neighboring observation's class is the same as the sampled observation 9 | #' (termed a 'hit') but the feature values are different, then this reduces the 10 | #' score on the basis that widely varying feature values for the same class are 11 | #' not desirable. Conversely, if a neighboring observation's class is different 12 | #' from the sampled observation (termed a 'miss') and the feature values are 13 | #' different, then this increases the score on the basis that observations of 14 | #' different classes are widely separated by their feature values. The feature 15 | #' weights / scores range from -1 (worst) to +1 (best). 16 | #' 17 | #' `step_select_relief` creates a *specification* of a recipe step that selects 18 | #' a subset of predictors based on the scores of the relief algorithm. This step 19 | #' requires the FSinR package to be installed. The top `top_p` scoring features, 20 | #' or features whose scores occur in the top percentile `threshold` will be 21 | #' retained as new predictors. 22 | #' 23 | #' @inheritParams step_select_aov 24 | #' @inherit step_select_aov return 25 | #' @param role Not used by this step since no new variables are created. 26 | #' @param outcome A character string with the name of the response variable to 27 | #' use to evaluate information gain value against the predictors. 28 | #' @param neighbors An integer with the number of neighbors for find for each 29 | #' sampled instance. Default is 5. 30 | #' @param sample_size An integer with the number of instances to sample. Default 31 | #' is 10. 32 | #' @param scores A tibble with 'variable' and 'scores' columns containing the 33 | #' names of the variables and their information gain scores. This parameter is 34 | #' only produced after the recipe has been trained. 35 | #' 36 | #' @export 37 | #' @keywords datagen 38 | #' @concept preprocessing 39 | #' @concept supervised_filter 40 | #' @export 41 | #' @details 42 | #' 43 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left 44 | #' unspecified. 45 | #' 46 | #' @examples 47 | #' \dontrun{ 48 | #' library(recipes) 49 | #' 50 | #' data(cells, package = "modeldata") 51 | #' 52 | #' rec <- recipe(class ~ ., data = cells[, -1]) %>% 53 | #' step_select_relief( 54 | #' all_predictors(), 55 | #' outcome = "class", 56 | #' top_p = 10 57 | #' ) 58 | #' 59 | #' prepped <- prep(rec) 60 | #' new_data <- bake(prepped, new_data = NULL) 61 | #' prepped 62 | #' } 63 | step_select_relief <- function( 64 | recipe, ..., 65 | outcome = NULL, 66 | role = NA, 67 | trained = FALSE, 68 | top_p = NA, 69 | threshold = NA, 70 | cutoff = NA, 71 | neighbors = 5, 72 | sample_size = 10, 73 | exclude = NULL, 74 | scores = NULL, 75 | skip = FALSE, 76 | id = recipes::rand_id("select_relief")) { 77 | 78 | recipes::recipes_pkg_check("FSinR") 79 | 80 | if (neighbors <= 0) 81 | rlang::abort("`neighbors` must be greater than zero") 82 | 83 | if (sample_size <= 0) 84 | rlang::abort("'sample_size' must be greater than zero") 85 | 86 | terms <- recipes::ellipse_check(...) 87 | 88 | recipes::add_step( 89 | recipe, 90 | step_select_relief_new( 91 | terms = terms, 92 | trained = trained, 93 | outcome = outcome, 94 | role = role, 95 | top_p = top_p, 96 | threshold = threshold, 97 | cutoff = cutoff, 98 | neighbors = neighbors, 99 | sample_size = sample_size, 100 | exclude = exclude, 101 | scores = scores, 102 | skip = skip, 103 | id = id 104 | ) 105 | ) 106 | } 107 | 108 | 109 | # wrapper around 'step' function that sets the class of new step objects 110 | step_select_relief_new <- 111 | function(terms, role, trained, outcome, top_p, threshold, cutoff, neighbors, 112 | sample_size, exclude, scores, skip, id) { 113 | recipes::step( 114 | subclass = "select_relief", 115 | terms = terms, 116 | role = role, 117 | trained = trained, 118 | outcome = outcome, 119 | top_p = top_p, 120 | threshold = threshold, 121 | cutoff = cutoff, 122 | neighbors = neighbors, 123 | sample_size = sample_size, 124 | exclude = exclude, 125 | scores = scores, 126 | skip = skip, 127 | id = id 128 | ) 129 | } 130 | 131 | 132 | #' @export 133 | prep.step_select_relief <- function(x, training, info = NULL, ...) { 134 | x_names <- recipes::recipes_eval_select(x$terms, training, info) 135 | y_name <- recipes::recipes_eval_select(x$outcome, training, info) 136 | y_name <- y_name[1] 137 | 138 | # check criteria 139 | check_criteria(x$top_p, x$threshold, match.call()) 140 | check_zero_one(x$threshold) 141 | x$top_p <- check_top_p(x$top_p, length(x_names)) 142 | 143 | # feature selection 144 | if (length(x_names) > 0) { 145 | call_func <- rlang::call2( 146 | .fn = "relief", 147 | .ns = "FSelectorRcpp", 148 | x = rlang::quo(as.data.frame(training[, x_names])), 149 | y = rlang::quo(training[[y_name]]), 150 | neighboursCount = x$neighbors, 151 | sampleSize = x$sample_size 152 | ) 153 | res <- rlang::eval_tidy(call_func) 154 | res <- as_tibble(res) 155 | res <- rlang::set_names(res, c("variable", "score")) 156 | res$score <- rlang::set_names(res$score, res$variable) 157 | res <- res[order(res$score, decreasing = TRUE), ] 158 | 159 | exclude <- 160 | dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE) 161 | 162 | } else { 163 | exclude <- character() 164 | } 165 | 166 | step_select_relief_new( 167 | terms = x$terms, 168 | trained = TRUE, 169 | role = x$role, 170 | outcome = y_name, 171 | top_p = x$top_p, 172 | threshold = x$threshold, 173 | cutoff = x$cutoff, 174 | neighbors = x$neighbors, 175 | sample_size = x$sample_size, 176 | exclude = exclude, 177 | scores = res, 178 | skip = x$skip, 179 | id = x$id 180 | ) 181 | } 182 | 183 | #' @export 184 | bake.step_select_relief <- function(object, new_data, ...) { 185 | if (length(object$exclude > 0)) { 186 | new_data <- new_data[, !(colnames(new_data) %in% object$exclude)] 187 | } 188 | as_tibble(new_data) 189 | } 190 | 191 | #' @export 192 | print.step_select_relief <- function(x, width = max(20, options()$width - 30), ...) { 193 | cat("Relief feature selection") 194 | 195 | if (recipes::is_trained(x)) { 196 | n <- length(x$exclude) 197 | cat(paste0(" (", n, " excluded)")) 198 | } 199 | cat("\n") 200 | 201 | invisible(x) 202 | } 203 | 204 | #' @rdname step_select_relief 205 | #' @param x A `step_select_relief` object. 206 | #' @export 207 | tidy.step_select_relief <- function(x, ...) { 208 | if (recipes::is_trained(x)) { 209 | res <- tibble(terms = x$exclude) 210 | } else { 211 | term_names <- recipes::sel2char(x$terms) 212 | res <- tibble(terms = rlang::na_chr) 213 | } 214 | res$id <- x$id 215 | res 216 | } 217 | 218 | #' @export 219 | tunable.step_select_relief <- function(x, ...) { 220 | tibble::tibble( 221 | name = c("top_p", "threshold", "cutoff"), 222 | call_info = list( 223 | list(pkg = "colino", fun = "top_p"), 224 | list(pkg = "dials", fun = "threshold", range = c(0, 1)), 225 | list(pkg = "colino", fun = "cutoff") 226 | ), 227 | source = "recipe", 228 | component = "step_select_relief", 229 | component_id = x$id 230 | ) 231 | } 232 | 233 | #' @rdname required_pkgs.colino 234 | #' @export 235 | required_pkgs.step_select_relief <- function(x, ...) { 236 | c("colino", "FSinR") 237 | } 238 | --------------------------------------------------------------------------------