├── .github
    ├── .gitignore
    └── workflows
    │   ├── pkgdown.yaml
    │   ├── R-CMD-check.yaml
    │   └── test-coverage.yaml
├── LICENSE
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-step_select_relief.R
    │   ├── test_step_select_mrmr.R
    │   ├── test_step_select_linear.R
    │   ├── test_dual_filter.R
    │   ├── test_discretize_var.R
    │   ├── test_step_select_tree.R
    │   ├── test_step_select_boruta.R
    │   ├── test_step_select_vip.R
    │   ├── test_step_select_infgain.R
    │   ├── test_step_select_forests.R
    │   └── test_step_select_fcbf.R
├── .gitignore
├── _pkgdown.yml
├── docs
    ├── reference
    │   ├── Rplot001.png
    │   └── pipe.html
    ├── pkgdown.yml
    ├── link.svg
    ├── sitemap.xml
    ├── pkgdown.js
    ├── LICENSE-text.html
    ├── 404.html
    ├── LICENSE.html
    └── authors.html
├── .Rbuildignore
├── R
    ├── utils-pipe.R
    ├── imports.R
    ├── tidy_filter_step.R
    ├── colino-package.R
    ├── parameters.R
    ├── misc.R
    ├── step_select_boruta.R
    ├── step_select_roc.R
    ├── step_select_mrmr.R
    ├── step_select_vip.R
    ├── step_select_xtab.R
    ├── step_select_carscore.R
    ├── step_select_infgain.R
    ├── step_select_tree.R
    └── step_select_relief.R
├── man
    ├── pipe.Rd
    ├── entropy.Rd
    ├── top_p.Rd
    ├── cutoff.Rd
    ├── colino.Rd
    ├── dual_filter.Rd
    ├── required_pkgs.colino.Rd
    ├── pull_importances.Rd
    ├── step_select_aov.Rd
    ├── step_select_boruta.Rd
    ├── step_select_roc.Rd
    ├── step_select_xtab.Rd
    ├── step_select_mrmr.Rd
    ├── step_select_carscore.Rd
    ├── step_select_fcbf.Rd
    ├── step_select_linear.Rd
    ├── step_select_vip.Rd
    ├── step_select_tree.Rd
    ├── step_select_infgain.Rd
    ├── step_select_forests.Rd
    └── step_select_relief.Rd
├── colino.Rproj
├── LICENSE.md
├── DESCRIPTION
└── NAMESPACE


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Steven Pawley
3 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(colino)
3 | 
4 | test_check("colino")
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | docs
7 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://stevenpawley.github.io/colino/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 


--------------------------------------------------------------------------------
/docs/reference/Rplot001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevenpawley/colino/HEAD/docs/reference/Rplot001.png


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^LICENSE\.md$
 4 | ^README\.Rmd$
 5 | ^\.travis\.yml$
 6 | ^codecov\.yml$
 7 | ^_pkgdown\.yml$
 8 | ^docs$
 9 | ^pkgdown$
10 | ^\.github$
11 | 


--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
 1 | #' Pipe operator
 2 | #'
 3 | #' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
 4 | #'
 5 | #' @name %>%
 6 | #' @rdname pipe
 7 | #' @keywords internal
 8 | #' @export
 9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 | 


--------------------------------------------------------------------------------
/docs/pkgdown.yml:
--------------------------------------------------------------------------------
 1 | pandoc: 2.19.2
 2 | pkgdown: 2.0.6
 3 | pkgdown_sha: ~
 4 | articles: {}
 5 | last_built: 2022-11-21T21:35Z
 6 | urls:
 7 |   reference: https://stevenpawley.github.io/colino/reference
 8 |   article: https://stevenpawley.github.io/colino/articles
 9 | 
10 | 


--------------------------------------------------------------------------------
/R/imports.R:
--------------------------------------------------------------------------------
1 | ## usethis namespace: start
2 | #' @importFrom tibble tibble as_tibble
3 | #' @importFrom recipes prep bake
4 | #' @importFrom generics tidy required_pkgs
5 | #' @importFrom tune tunable
6 | #' @importFrom stats aov as.formula
7 | ## usethis namespace: end
8 | NULL
9 | 


--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils-pipe.R
 3 | \name{\%>\%}
 4 | \alias{\%>\%}
 5 | \title{Pipe operator}
 6 | \usage{
 7 | lhs \%>\% rhs
 8 | }
 9 | \description{
10 | See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 | 


--------------------------------------------------------------------------------
/R/tidy_filter_step.R:
--------------------------------------------------------------------------------
 1 | # Build the output of `tidy()` for the filter-based selection steps.
 2 | #
 3 | # Arguments:
 4 | #   x    A step object; `x$exclude`, `x$scores` and `x$id` are read.
 5 | #   type Either "terms" (default) or "scores".
 6 | #
 7 | # Returns, with the step `id` added as a column:
 8 | #   * untrained step: a one-row tibble with `terms = NA_character_`;
 9 | #   * trained, type = "terms": a tibble of the names in `x$exclude`;
10 | #   * trained, type = "scores": `x$scores` ordered by decreasing `score`.
11 | tidy_filter_step <- function(x, type = "terms") {
12 |   # Validate up front. Previously an unsupported `type` on a trained step
13 |   # left `res` undefined and failed with "object 'res' not found".
14 |   type <- match.arg(type, c("terms", "scores"))
15 | 
16 |   if (!recipes::is_trained(x)) {
17 |     # nothing has been estimated yet, so return a placeholder row
18 |     res <- tibble(terms = rlang::na_chr)
19 |   } else if (type == "terms") {
20 |     res <- tibble(terms = x$exclude)
21 |   } else {
22 |     res <- x$scores
23 |     res <- res[order(res$score, decreasing = TRUE), ]
24 |   }
25 | 
26 |   res$id <- x$id
27 |   res
28 | }
29 | 


--------------------------------------------------------------------------------
/colino.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageCheckArgs: --as-cran
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/man/entropy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/parameters.R
 3 | \name{entropy}
 4 | \alias{entropy}
 5 | \title{Parameter functions for feature selection recipes}
 6 | \usage{
 7 | entropy(values = values_entropy)
 8 | }
 9 | \arguments{
10 | \item{values}{A character string of possible values. See `values_entropy` for
11 | possible values.}
12 | }
13 | \value{
14 | A function with classes "qual_param" and "param"
15 | }
16 | \description{
17 | Entropy-based feature selection methods can be applied using several methods
18 | to calculate the entropy formula. `entropy` is for specifying the type of
19 | entropy-based filter that is used.
20 | }
21 | \examples{
22 | entropy('infogain')
23 | }
24 | 


--------------------------------------------------------------------------------
/tests/testthat/test-step_select_relief.R:
--------------------------------------------------------------------------------
 1 | test_that("step_select_relief", {
 2 |   skip_if_not_installed("FSelectorRcpp")
 3 | 
 4 |   # reference scores computed by calling FSelectorRcpp::relief() directly
 5 |   set.seed(1234)
 6 |   raw <- FSelectorRcpp::relief(
 7 |     formula = Species ~ .,
 8 |     data = iris,
 9 |     neighboursCount = 5,
10 |     sampleSize = 10
11 |   )
12 |   raw <- setNames(raw, c("variable", "score"))
13 |   raw <- raw[order(raw$score, decreasing = TRUE), ]
14 | 
15 |   # same data through the recipe step; prep() is run under the same seed
16 |   rec <-
17 |     recipe(Species ~ ., iris) %>%
18 |     step_select_relief(all_predictors(), outcome = "Species", top_p = 2)
19 | 
20 |   set.seed(1234)
21 |   prepped <- prep(rec)
22 |   expect_equal(as.numeric(prepped$steps[[1]]$scores$score), raw$score)
23 | 
24 |   new_data <- bake(prepped, new_data = NULL)
25 |   expect_equal(ncol(new_data), 3)
26 | })
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_mrmr.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | data("iris")
 5 | 
 6 | test_that("step_select_mrmr, execution", {
 7 |   skip_if_not_installed("praznik")
 8 | 
 9 |   irisX <- iris[-5]
10 |   y <- iris$Species
11 | 
12 |   res <- praznik::MRMR(X = irisX, Y = y, k = 4)
13 | 
14 |   mrmr_scores <- tibble(
15 |     variable = names(res$score),
16 |     scores = res$score
17 |   )
18 | 
19 |   rec <- recipe(Species ~ ., data = iris)
20 | 
21 |   mrmr_rec <- rec %>%
22 |     step_select_mrmr(all_predictors(), outcome = "Species", top_p = 2) %>%
23 |     prep()
24 | 
25 |   mrmr_pred <- juice(mrmr_rec)
26 |   expect_true(all(names(mrmr_pred)[1:2] %in% mrmr_scores$variable[1:2]))
27 | 
28 |   expect_equal(mrmr_scores$scores, mrmr_rec$steps[[1]]$scores$score)
29 | })
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/docs/link.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 19.2.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 20 20" style="enable-background:new 0 0 20 20;" xml:space="preserve">
 5 | <style type="text/css">
 6 | 	.st0{fill:#75AADB;}
 7 | </style>
 8 | <path class="st0" d="M4,11.3h1.3v1.3H4c-2,0-4-2.3-4-4.7s2.1-4.7,4-4.7h5.3c1.9,0,4,2.3,4,4.7c0,1.9-1.2,3.6-2.7,4.3v-1.5
 9 | 	C11.4,10.2,12,9.1,12,8c0-1.7-1.4-3.3-2.7-3.3H4C2.7,4.7,1.3,6.3,1.3,8S2.7,11.3,4,11.3z M16,7.3h-1.3v1.3H16c1.3,0,2.7,1.6,2.7,3.3
10 | 	s-1.4,3.3-2.7,3.3h-5.3C9.4,15.3,8,13.7,8,12c0-1.1,0.6-2.2,1.3-2.8V7.7C7.9,8.4,6.7,10.1,6.7,12c0,2.4,2.1,4.7,4,4.7H16
11 | 	c1.9,0,4-2.3,4-4.7S18,7.3,16,7.3z"/>
12 | </svg>
13 | 


--------------------------------------------------------------------------------
/man/top_p.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/parameters.R
 3 | \name{top_p}
 4 | \alias{top_p}
 5 | \title{Parameter functions for feature selection recipes}
 6 | \usage{
 7 | top_p(range = c(1L, 4L), trans = NULL)
 8 | }
 9 | \arguments{
10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and
11 | largest possible values, respectively.}
12 | 
13 | \item{trans}{A `trans` object from the `scales` package, such as
14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
15 | the default is used which matches the units used in `range`. If no
16 | transformation, `NULL`.}
17 | }
18 | \value{
19 | A function with classes "quant_param" and "param"
20 | }
21 | \description{
22 | Feature selection recipes allow the top-performing features to be selected
23 | using three parameters. `top_p` is for specifying the number of the
24 | top-performing features.
25 | }
26 | \examples{
27 | top_p(c(3, 10))
28 | }
29 | 


--------------------------------------------------------------------------------
/man/cutoff.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/parameters.R
 3 | \name{cutoff}
 4 | \alias{cutoff}
 5 | \title{Parameter functions for feature selection recipes}
 6 | \usage{
 7 | cutoff(range = c(dials::unknown(), dials::unknown()), trans = NULL)
 8 | }
 9 | \arguments{
10 | \item{range}{A two-element vector holding the _defaults_ for the smallest and
11 | largest possible values, respectively.}
12 | 
13 | \item{trans}{A `trans` object from the `scales` package, such as
14 | `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
15 | the default is used which matches the units used in `range`. If no
16 | transformation, `NULL`.}
17 | }
18 | \value{
19 | A function with classes "quant_param" and "param"
20 | }
21 | \description{
22 | Feature selection recipes allow the top-performing features to be selected
23 | using three parameters. `cutoff` is for selecting features using the absolute
24 | value in the filter methods scores.
25 | }
26 | \examples{
27 | cutoff(c(3.5, 15))
28 | }
29 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2019 Steven Pawley
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_linear.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | library(modeldata)
 6 | 
 7 | data("cells")
 8 | 
 9 | test_that("step_select_linear, execution using top_p on binary case", {
10 |   rec <- cells %>%
11 |     select(-case) %>%
12 |     recipe(class ~ .) %>%
13 |     step_normalize(all_numeric_predictors()) %>%
14 |     step_select_linear(
15 |       all_predictors(),
16 |       outcome = "class",
17 |       top_p = 2
18 |     )
19 | 
20 |   prepped <- prep(rec)
21 |   selected <- bake(prepped, new_data = NULL)
22 | 
23 |   expect_length(names(selected), 3)
24 | })
25 | 
26 | 
27 | test_that("step_select_linear, execution using threshold on binary case", {
28 |   # test selection by retaining features with scores in the 99th percentile
29 |   rec <- cells %>%
30 |     select(-case) %>%
31 |     recipe(class ~ .) %>%
32 |     step_normalize(all_numeric_predictors()) %>%
33 |     step_select_linear(
34 |       all_predictors(),
35 |       outcome = "class",
36 |       threshold = 0.99
37 |     )
38 | 
39 |   prepped <- prep(rec)
40 |   selected <- juice(prepped)
41 | 
42 |   expect_length(names(selected), 2)
43 | })
44 | 


--------------------------------------------------------------------------------
/R/colino-package.R:
--------------------------------------------------------------------------------
 1 | #' colino: A collection of steps for feature selection to use with the
 2 | #' 'recipes' package
 3 | #'
 4 | #' \pkg{colino} provides a collection of additional step objects
 5 | #' related to feature selection to be used with the 'recipes' package.
 6 | #'
 7 | #' @examples
 8 | #' library(parsnip)
 9 | #' library(recipes)
10 | #' library(magrittr)
11 | #'
12 | #' # load the example iris dataset
13 | #' data(iris)
14 | #'
15 | #' # define a base model to use for feature importances
16 | #' base_model <- rand_forest(mode = "classification") %>%
17 | #'     set_engine("ranger", importance = "permutation")
18 | #'
19 | #' # create a preprocessing recipe
20 | #' rec <- iris %>%
21 | #'  recipe(Species ~ .) %>%
22 | #'  step_select_vip(all_predictors(), model = base_model, top_p = 2,
23 | #'                  outcome = "Species")
24 | #'
25 | #' prepped <- prep(rec)
26 | #'
27 | #' # create a model specification
28 | #' clf <- decision_tree(mode = "classification") %>%
29 | #'     set_engine("rpart")
30 | #'
31 | #' clf_fitted <- clf %>%
32 | #'     fit(Species ~ ., juice(prepped))
33 | #'
34 | #' @author Steven Pawley, \email{dr.stevenpawley@@gmail.com}
35 | 
36 | #' @name colino
37 | #' @keywords internal
38 | "_PACKAGE"
39 | 
40 | 


--------------------------------------------------------------------------------
/tests/testthat/test_dual_filter.R:
--------------------------------------------------------------------------------
 1 | test_that("test dual filter", {
 2 |   scores <- c(feature1 = 0.25, feature2 = 0.1, feature3 = 0.5, feature4 = 0.9)
 3 | 
 4 |   # top_p = 2 keeps the two highest scores, so features 1 and 2 are excluded
 5 |   excluded <- dual_filter(scores, top_p = 2, threshold = NA, cutoff = NA, maximize = TRUE)
 6 |   expect_setequal(excluded, c("feature1", "feature2"))
 7 | 
 8 |   # excludes only feature 2 (score is < cutoff and not in the top two features)
 9 |   excluded <- dual_filter(scores, top_p = 2, threshold = NA, cutoff = 0.2, maximize = TRUE)
10 |   expect_equal(excluded, "feature2")
11 | 
12 |   # expect error if both top_p and threshold are used
13 |   expect_error(
14 |     dual_filter(scores, top_p = 2, threshold = 0.5, cutoff = 0.2, maximize = TRUE),
15 |     regexp = "mutually exclusive"
16 |   )
17 | 
18 |   # excludes features 1-3 because their scores are less than the cutoff
19 |   excluded <- dual_filter(scores, top_p = NA, threshold = NA, cutoff = 0.7, maximize = TRUE)
20 |   expect_setequal(excluded, c("feature1", "feature2", "feature3"))
21 | 
22 |   # maximize = FALSE: excludes feature 4 because its score is greater than the cutoff
23 |   excluded <- dual_filter(scores, top_p = NA, threshold = NA, cutoff = 0.7, maximize = FALSE)
24 |   expect_equal(excluded, "feature4")
25 | })
26 | 


--------------------------------------------------------------------------------
/tests/testthat/test_discretize_var.R:
--------------------------------------------------------------------------------
 1 | test_that("discretize_var returns expected values", {
 2 |   expect_equal(discretize_var(c(8, 7, 2, 5, NA, 3, 1), cutpoint = 0.5),
 3 |                as.factor(c('h', 'h', 'l', 'h', NA, 'l', 'l')))
 4 | 
 5 |   expect_equal(discretize_var(c(1, 1, 1, 1, 1, 21), cutpoint = 0.5),
 6 |                as.factor(c('l', 'l', 'l', 'l', 'l', 'h')))
 7 | 
 8 |   expect_equal(discretize_var(1:50, cutpoint = 0.5),
 9 |                as.factor(c(rep('l', 25), rep('h', 25))))
10 | 
11 |   expect_equal(discretize_var(as.numeric(c(NA, NA, NA, NA, NA)), cutpoint = 0.5),
12 |                as.factor(c(NA, NA, NA, NA, NA)))
13 | })
14 | 
15 | test_that("discretize_var rejects bad feature input", {
16 |   expect_error(discretize_var(c(NA, NULL), cutpoint = 0.5),
17 |                "Feature must be numeric to discretize")
18 | 
19 |   expect_error(discretize_var(c('putty', 'grass', 'grass'), cutpoint = 0.5),
20 |                "Feature must be numeric to discretize")
21 | 
22 |   expect_error(discretize_var(data.frame(x = 1:50), cutpoint = 0.5),
23 |                "Feature must be numeric to discretize")
24 | 
25 |   expect_error(discretize_var(list(1, 2, 3, 4), cutpoint = 0.5),
26 |                "Feature must be numeric to discretize")
27 | })
28 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | #
 4 | # Builds the pkgdown site on pushes, pull requests, releases and manual runs;
 5 | # the deploy step is skipped for pull requests.
 6 | on:
 7 |   push:
 8 |     branches: [main, master]
 9 |   pull_request:
10 |     branches: [main, master]
11 |   release:
12 |     types: [published]
13 |   workflow_dispatch:
14 | 
15 | name: pkgdown
16 | 
17 | jobs:
18 |   pkgdown:
19 |     runs-on: ubuntu-latest
20 |     # Only restrict concurrency for non-PR jobs
21 |     concurrency:
22 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
23 |     env:
24 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
25 |     steps:
26 |       # same major version as the checkout step in R-CMD-check.yaml
27 |       - uses: actions/checkout@v4
28 | 
29 |       - uses: r-lib/actions/setup-pandoc@v2
30 | 
31 |       - uses: r-lib/actions/setup-r@v2
32 |         with:
33 |           use-public-rspm: true
34 | 
35 |       - uses: r-lib/actions/setup-r-dependencies@v2
36 |         with:
37 |           extra-packages: any::pkgdown, local::.
38 |           needs: website
39 | 
40 |       - name: Build site
41 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
42 |         shell: Rscript {0}
43 | 
44 |       - name: Deploy to GitHub pages 🚀
45 |         if: github.event_name != 'pull_request'
46 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
47 |         with:
48 |           clean: false
49 |           branch: gh-pages
50 |           folder: docs
51 | 


--------------------------------------------------------------------------------
/man/colino.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/colino-package.R
 3 | \docType{package}
 4 | \name{colino}
 5 | \alias{colino-package}
 6 | \alias{colino}
 7 | \title{colino: A collection of steps for feature selection to use with the
 8 | 'recipes' package}
 9 | \description{
10 | \pkg{colino} provides a collection of additional step objects
11 | related to feature selection to be used with the 'recipes' package.
12 | }
13 | \examples{
14 | library(parsnip)
15 | library(recipes)
16 | library(magrittr)
17 | 
18 | # load the example iris dataset
19 | data(iris)
20 | 
21 | # define a base model to use for feature importances
22 | base_model <- rand_forest(mode = "classification") \%>\%
23 |     set_engine("ranger", importance = "permutation")
24 | 
25 | # create a preprocessing recipe
26 | rec <- iris \%>\%
27 |  recipe(Species ~ .) \%>\%
28 |  step_select_vip(all_predictors(), model = base_model, top_p = 2,
29 |                  outcome = "Species")
30 | 
31 | prepped <- prep(rec)
32 | 
33 | # create a model specification
34 | clf <- decision_tree(mode = "classification") \%>\%
35 |     set_engine("rpart")
36 | 
37 | clf_fitted <- clf \%>\%
38 |     fit(Species ~ ., juice(prepped))
39 | 
40 | }
41 | \seealso{
42 | Useful links:
43 | \itemize{
44 |   \item \url{https://stevenpawley.github.io/colino}
45 |   \item Report bugs at \url{https://github.com/stevenpawley/colino/issues}
46 | }
47 | 
48 | }
49 | \author{
50 | Steven Pawley, \email{dr.stevenpawley@gmail.com}
51 | }
52 | \keyword{internal}
53 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_tree.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | data("iris")
 6 | 
 7 | # Smoke tests for step_select_tree(): each test preps a recipe on iris and
 8 | # checks how many columns are left after selection.
 9 | 
10 | test_that("step_select_tree, execution using top_p", {
11 |   skip_if_not_installed("rpart")
12 | 
13 |   rec <- iris %>%
14 |     recipe(Species ~ .) %>%
15 |     step_select_tree(
16 |       all_predictors(),
17 |       outcome = "Species",
18 |       engine = "rpart",
19 |       top_p = 2
20 |     )
21 | 
22 |   prepped <- prep(rec)
23 |   selected <- juice(prepped)
24 | 
25 |   # top_p = 2: two predictors are expected alongside the outcome column
26 |   expect_length(names(selected), 3)
27 | })
28 | 
29 | 
30 | test_that("step_select_tree, execution using threshold", {
31 |   skip_if_not_installed("rpart")
32 | 
33 |   # test selection by retaining features with scores >= 50th percentile
34 |   rec <- iris %>%
35 |     recipe(Species ~ .) %>%
36 |     step_select_tree(
37 |       all_predictors(),
38 |       outcome = "Species",
39 |       threshold = 0.5
40 |     )
41 | 
42 |   prepped <- prep(rec)
43 |   selected <- juice(prepped)
44 | 
45 |   expect_length(names(selected), 3)
46 | 
47 |   # test selection by retaining features with scores in 90th percentile
48 |   rec <- iris %>%
49 |     recipe(Species ~ .) %>%
50 |     step_select_tree(
51 |       all_predictors(),
52 |       outcome = "Species",
53 |       threshold = 0.9
54 |     )
55 | 
56 |   prepped <- prep(rec)
57 |   selected <- juice(prepped)
58 | 
59 |   expect_length(names(selected), 2)
60 | })
61 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 | 
 8 | name: R-CMD-check.yaml
 9 | 
10 | permissions: read-all
11 | 
12 | jobs:
13 |   R-CMD-check:
14 |     runs-on: ${{ matrix.config.os }}
15 | 
16 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
17 | 
18 |     strategy:
19 |       fail-fast: false
20 |       matrix:
21 |         config:
22 |           - {os: macos-latest,   r: 'release'}
23 |           - {os: windows-latest, r: 'release'}
24 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
25 |           - {os: ubuntu-latest,   r: 'release'}
26 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
27 | 
28 |     env:
29 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 |       R_KEEP_PKG_SOURCE: yes
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 | 
35 |       - uses: r-lib/actions/setup-pandoc@v2
36 | 
37 |       - uses: r-lib/actions/setup-r@v2
38 |         with:
39 |           r-version: ${{ matrix.config.r }}
40 |           http-user-agent: ${{ matrix.config.http-user-agent }}
41 |           use-public-rspm: true
42 | 
43 |       - uses: r-lib/actions/setup-r-dependencies@v2
44 |         with:
45 |           extra-packages: any::rcmdcheck
46 |           needs: check
47 | 
48 |       - uses: r-lib/actions/check-r-package@v2
49 |         with:
50 |           upload-snapshots: true
51 |           build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
52 | 


--------------------------------------------------------------------------------
/man/dual_filter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/misc.R
 3 | \name{dual_filter}
 4 | \alias{dual_filter}
 5 | \title{Select features using `top_p` or `threshold`.}
 6 | \usage{
 7 | dual_filter(x, top_p, threshold, cutoff, maximize)
 8 | }
 9 | \arguments{
10 | \item{x}{a named numeric vector of scores per feature}
11 | 
12 | \item{top_p}{an integer specifying the number of top-performing features to
13 | retain}
14 | 
15 | \item{threshold}{a numeric with percentile of top-performing features to
16 | retain. For example, `threshold = 0.9` will only retain features that are
17 | in the top 90th percentile. A smaller value of threshold will select
18 | more features.}
19 | 
20 | \item{cutoff}{a numeric with the value that represents the cutoff in the
21 | scores in `x` by which to retain/discard features.}
22 | 
23 | \item{maximize}{logical to indicate whether `top_p`, `threshold` and `cutoff`
24 | are used to keep features where high scores = 'best' (maximize = TRUE) or
25 | where low scores = 'best' (maximize = FALSE).}
26 | }
27 | \value{
28 | character vector of feature names to exclude
29 | }
30 | \description{
31 | Feature selection using either the `top_p` or `threshold` features OR
32 | `cutoff` where cutoff refers to the absolute numeric value of the feature
33 | importance scores.
34 | }
35 | \details{
36 | `dual_filter` selects feature that are selected using either (`top_p`,
37 | `threshold`) or `cutoff` or both. If top_p/threshold and cutoff are both used
38 | then features are selected using OR. For example, if top_p selects features 1
39 | & 2, and threshold selects features 1 & 3, then the selected features =
40 | 1,2,3.
41 | }
42 | \keyword{internal}
43 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_boruta.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(modeldata)
 5 | 
 6 | data("lending_club")
 7 | 
 8 | test_that("step_select_boruta, execution", {
 9 |   skip_if_not_installed("Boruta")
10 | 
11 |   # reference fit: Boruta run directly on every column except the outcome
12 |   set.seed(1234)
13 |   boruta_mod <- Boruta::Boruta(
14 |     x = lending_club[, setdiff(names(lending_club), "Class")],
15 |     y = lending_club$Class
16 |   )
17 |   excluded <- names(
18 |     boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"]
19 |   )
20 | 
21 |   # the same selection through the recipe step, prepped under the same seed
22 |   rec <- recipe(Class ~ ., data = lending_club) %>%
23 |     step_select_boruta(all_predictors(), outcome = "Class")
24 |   set.seed(1234)
25 |   prepped <- rec %>% prep()
26 | 
27 |   # rejected features and importance history must match the reference fit
28 |   expect_equal(excluded, prepped$steps[[1]]$exclude)
29 |   expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory)
30 | })
31 | 
32 | 
33 | test_that("step_select_boruta, options", {
34 |   skip_if_not_installed("Boruta")
35 | 
36 |   # reference fit with a non-default importance function (getImpRfGini)
37 |   set.seed(1234)
38 |   boruta_mod <- Boruta::Boruta(
39 |     x = lending_club[, setdiff(names(lending_club), "Class")],
40 |     y = lending_club$Class,
41 |     getImp = Boruta::getImpRfGini
42 |   )
43 |   excluded <- names(
44 |     boruta_mod$finalDecision[boruta_mod$finalDecision == "Rejected"]
45 |   )
46 | 
47 |   # the same importance function handed to the step through `options`
48 |   rec <-
49 |     recipe(Class ~ ., data = lending_club) %>%
50 |     step_select_boruta(
51 |       all_predictors(),
52 |       outcome = "Class",
53 |       options = list(getImp = Boruta::getImpRfGini)
54 |     )
55 |   set.seed(1234)
56 |   prepped <- rec %>% prep()
57 | 
58 |   # tidy() terms and importance history must match the reference fit
59 |   expect_equal(excluded, tidy(prepped, number = 1)$terms)
60 |   expect_equal(boruta_mod$ImpHistory, prepped$steps[[1]]$res$ImpHistory)
61 | })
62 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: colino
 2 | Type: Package
 3 | Title: Recipes Steps for Supervised Filter-Based Feature Selection
 4 | Version: 0.0.1
 5 | Authors@R: 
 6 |     c(person(given = "Steven",
 7 |            family = "Pawley",
 8 |            role = c("aut", "cre"),
 9 |            email = "dr.stevenpawley@gmail.com"),
10 |     person(given = "Max",
11 |            family = "Kuhn",
12 |            role = c("aut"),
13 |            email = "max@rstudio.com"),
14 |     person(given = "Rowan",
15 |            family = "Jacques-Hamilton",
16 |            role = c("aut"),
17 |            email = "rowan.jacques.hamilton@gmail.com"),
18 |     person(given = "Byron",
19 |            family = "Jaeger",
20 |            role = c("aut"),
21 |            email = "bjaeger@wakehealth.edu"))
22 | Maintainer: Steven Pawley <dr.stevenpawley@gmail.com>
23 | Description: Provides supervised selection methods to be used as preprocessing 
24 |     steps alongside the 'recipes' package. These steps represent filter-based
25 |     methods where the features are ranked according to the feature selection 
26 |     method and a subset of features are retained.
27 | License: MIT + file LICENSE
28 | Encoding: UTF-8
29 | URL: https://stevenpawley.github.io/colino
30 | BugReports: https://github.com/stevenpawley/colino/issues
31 | Depends:
32 |      R (>= 2.10),
33 |      recipes
34 | Imports:
35 |     generics,
36 |     tibble,
37 |     parsnip,
38 |     tune,
39 |     dials,
40 |     purrr,
41 |     rlang (>= 0.1.2),
42 |     magrittr,
43 |     dplyr,
44 |     scales,
45 |     pROC,
46 |     stats
47 | RoxygenNote: 7.3.2
48 | Suggests: 
49 |     testthat,
50 |     roxygen2,
51 |     FSelectorRcpp,
52 |     praznik,
53 |     ranger,
54 |     Boruta,
55 |     care,
56 |     modeldata,
57 |     covr,
58 |     bonsai,
59 |     aorsf,
60 |     xgboost
61 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_vip.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(parsnip)
 5 | data("iris")
 6 | 
 7 | test_that("step_select_vip, execution using top_p", {
 8 |   skip_if_not_installed("ranger")
 9 | 
10 |   # Rank predictors with permutation importance from a ranger random forest.
11 |   # top_p = 2 should keep the two best predictors, so the baked data has
12 |   # three columns once the Species outcome is counted.
13 |   base_model <- rand_forest(mode = "classification") %>%
14 |     set_engine("ranger", importance = "permutation")
15 | 
16 |   rec <- iris %>%
17 |     recipe(Species ~ .) %>%
18 |     step_select_vip(
19 |       all_predictors(),
20 |       outcome = "Species",
21 |       model = base_model,
22 |       top_p = 2
23 |     )
24 | 
25 |   prepped <- prep(rec)
26 |   selected <- juice(prepped)
27 | 
28 |   expect_length(names(selected), 3)
29 | })
30 | 
31 | 
32 | test_that("step_select_vip, execution using threshold", {
33 |   skip_if_not_installed("ranger")
34 | 
35 |   # Scores come from permutation importance of a ranger random forest; each
36 |   # recipe below keeps predictors at or above the requested score percentile
37 |   # and the expected column counts include the Species outcome.
38 |   base_model <- rand_forest(mode = "classification") %>%
39 |     set_engine("ranger", importance = "permutation")
40 | 
41 |   # test selection by retaining features with scores >= 50th percentile
42 |   rec <- iris %>%
43 |     recipe(Species ~ .) %>%
44 |     step_select_vip(
45 |       all_predictors(),
46 |       outcome = "Species",
47 |       model = base_model,
48 |       threshold = 0.5
49 |     )
50 | 
51 |   prepped <- prep(rec)
52 |   selected <- juice(prepped)
53 | 
54 |   expect_length(names(selected), 3)
55 | 
56 |   # test selection by retaining features with scores in 90th percentile
57 |   rec <- iris %>%
58 |     recipe(Species ~ .) %>%
59 |     step_select_vip(
60 |       all_predictors(),
61 |       outcome = "Species",
62 |       model = base_model,
63 |       threshold = 0.9
64 |     )
65 | 
66 |   prepped <- prep(rec)
67 |   selected <- juice(prepped)
68 | 
69 |   expect_length(names(selected), 2)
70 | })
71 | 
72 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 | 
 8 | name: test-coverage.yaml
 9 | 
10 | permissions: read-all
11 | 
12 | jobs:
13 |   test-coverage:
14 |     runs-on: ubuntu-latest
15 |     env:
16 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       - uses: r-lib/actions/setup-r@v2
22 |         with:
23 |           use-public-rspm: true
24 | 
25 |       - uses: r-lib/actions/setup-r-dependencies@v2
26 |         with:
27 |           extra-packages: any::covr, any::xml2
28 |           needs: coverage
29 | 
30 |       - name: Test coverage
31 |         run: |
32 |           cov <- covr::package_coverage(
33 |             quiet = FALSE,
34 |             clean = FALSE,
35 |             install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
36 |           )
37 |           covr::to_cobertura(cov)
38 |         shell: Rscript {0}
39 | 
40 |       - uses: codecov/codecov-action@v4
41 |         with:
42 |           # Fail if error if not on PR, or if on PR and token is given
43 |           fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
44 |           file: ./cobertura.xml
45 |           plugin: noop
46 |           disable_search: true
47 |           token: ${{ secrets.CODECOV_TOKEN }}
48 | 
49 |       - name: Show testthat output
50 |         if: always()
51 |         run: |
52 |           ## --------------------------------------------------------------------
53 |           find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
54 |         shell: bash
55 | 
56 |       - name: Upload test results
57 |         if: failure()
58 |         uses: actions/upload-artifact@v4
59 |         with:
60 |           name: coverage-test-failures
61 |           path: ${{ runner.temp }}/package
62 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_infgain.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | library(recipes)
 3 | library(tibble)
 4 | library(modeldata)
 5 | 
 6 | data("iris")
 7 | 
 8 | test_that("step_select_infgain, classification", {
 9 |   skip_if_not_installed("FSelectorRcpp")
10 | 
11 |   irisX <- iris[-5]
12 |   y <- iris$Species
13 |   # Reference ranking: information gain computed directly with FSelectorRcpp
14 |   ig_scores <- as_tibble(FSelectorRcpp::information_gain(x = irisX, y = y))
15 |   ig_scores <- ig_scores[order(ig_scores$importance), ]
16 |   ig_scores$importance <- rlang::set_names(ig_scores$importance, ig_scores$attributes)
17 |   ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ]
18 |   # ig_scores is now sorted from highest to lowest importance
19 |   rec <- recipe(Species ~ ., data = iris)
20 | 
21 |   ig_rec <- rec %>%
22 |     step_select_infgain(
23 |       all_predictors(), outcome = "Species", type = "infogain", top_p = 2) %>%
24 |     prep()
25 |   # The first two baked columns must be the two top-ranked attributes
26 |   ig_pred <- juice(ig_rec)
27 |   expect_true(all(names(ig_pred)[1:2] %in% ig_scores$attributes[1:2]))
28 | })
29 | 
30 | 
31 | test_that("step_select_infgain, regression", {
32 |   skip_if_not_installed("FSelectorRcpp")
33 |   data("biomass", package = "modeldata")
34 |   # NOTE(review): assumes columns 1:2 are identifiers and column 8 is HHV - confirm
35 |   X <- as.data.frame(biomass[, -c(1:2, 8)])
36 |   y <- biomass$HHV
37 |   # Reference ranking computed directly, sorted from highest to lowest score
38 |   ig_scores <-
39 |     as_tibble(FSelectorRcpp::information_gain(x = X, y = y, equal = TRUE))
40 |   ig_scores <- ig_scores[order(ig_scores$importance), ]
41 |   ig_scores$importance <- rlang::set_names(ig_scores$importance, ig_scores$attributes)
42 |   ig_scores <- ig_scores[order(ig_scores$importance, decreasing = TRUE), ]
43 | 
44 |   ig_rec <-
45 |     recipe(HHV ~ ., data = biomass[, -(1:2)]) %>%
46 |     step_select_infgain(
47 |       all_predictors(),
48 |       outcome = "HHV",
49 |       type = "infogain",
50 |       top_p = 2) %>%
51 |     prep()
52 |   # Baked data must start with the two top-ranked predictors, in rank order
53 |   ig_pred <- bake(ig_rec, new_data = NULL)
54 |   expect_equal(names(ig_pred)[1:2], ig_scores$attributes[1:2])
55 |   # tidy(type = "scores") must report the same ranking and scores
56 |   tidyed_scores <- tidy(ig_rec, number = 1, type = "scores")
57 |   tidyed_scores <- tidyed_scores[, -3]  # NOTE(review): presumably drops the step id column - confirm
58 |   expect_equal(tidyed_scores$variable, ig_scores$attributes)
59 |   expect_equal(tidyed_scores$score, ig_scores$importance)
60 | })
61 | 


--------------------------------------------------------------------------------
/man/required_pkgs.colino.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/step_select_aov.R, R/step_select_boruta.R,
 3 | %   R/step_select_carscore.R, R/step_select_fcbf.R, R/step_select_forests.R,
 4 | %   R/step_select_infgain.R, R/step_select_linear.R, R/step_select_mrmr.R,
 5 | %   R/step_select_relief.R, R/step_select_roc.R, R/step_select_tree.R,
 6 | %   R/step_select_vip.R, R/step_select_xtab.R
 7 | \name{required_pkgs.step_select_aov}
 8 | \alias{required_pkgs.step_select_aov}
 9 | \alias{required_pkgs.step_select_boruta}
10 | \alias{required_pkgs.step_select_carscore}
11 | \alias{required_pkgs.step_select_fcbf}
12 | \alias{required_pkgs.step_select_forests}
13 | \alias{required_pkgs.step_select_infgain}
14 | \alias{required_pkgs.step_select_linear}
15 | \alias{required_pkgs.step_select_mrmr}
16 | \alias{required_pkgs.step_select_relief}
17 | \alias{required_pkgs.step_select_roc}
18 | \alias{required_pkgs.step_select_tree}
19 | \alias{required_pkgs.step_select_vip}
20 | \alias{required_pkgs.step_select_xtab}
21 | \title{S3 methods for tracking which additional packages are needed for steps.}
22 | \usage{
23 | \method{required_pkgs}{step_select_aov}(x, ...)
24 | 
25 | \method{required_pkgs}{step_select_boruta}(x, ...)
26 | 
27 | \method{required_pkgs}{step_select_carscore}(x, ...)
28 | 
29 | \method{required_pkgs}{step_select_fcbf}(x, ...)
30 | 
31 | \method{required_pkgs}{step_select_forests}(x, ...)
32 | 
33 | \method{required_pkgs}{step_select_infgain}(x, ...)
34 | 
35 | \method{required_pkgs}{step_select_linear}(x, ...)
36 | 
37 | \method{required_pkgs}{step_select_mrmr}(x, ...)
38 | 
39 | \method{required_pkgs}{step_select_relief}(x, ...)
40 | 
41 | \method{required_pkgs}{step_select_roc}(x, ...)
42 | 
43 | \method{required_pkgs}{step_select_tree}(x, ...)
44 | 
45 | \method{required_pkgs}{step_select_vip}(x, ...)
46 | 
47 | \method{required_pkgs}{step_select_xtab}(x, ...)
48 | }
49 | \arguments{
50 | \item{x}{A recipe step}
51 | }
52 | \value{
53 | A character vector
54 | }
55 | \description{
56 | Recipe-adjacent packages always list themselves as a required package so that
57 | the steps can function properly within parallel processing schemes.
58 | }
59 | \keyword{internal}
60 | 


--------------------------------------------------------------------------------
/man/pull_importances.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/pull_importances.R
 3 | \name{pull_importances}
 4 | \alias{pull_importances}
 5 | \title{Pull feature importances from a parsnip fitted model}
 6 | \usage{
 7 | pull_importances(object, scaled = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A `model_fit` object.}
11 | 
12 | \item{scaled}{A logical indicating whether to rescale the importances between
13 | 0 and 1. Default is TRUE.}
14 | 
15 | \item{...}{A list of other parameters passed to the feature importance
16 | method.}
17 | }
18 | \value{
19 | tibble
20 | }
21 | \description{
22 | `pull_importances` is a generic function to extract feature importance scores
23 | or coefficients from a parsnip `model_fit` object and return them as a tibble
24 | with a 'feature' and 'importance' column. This is designed to support the
25 | `step_importance` recipe step.
26 | }
27 | \details{
28 | Most of the basic models within the parsnip package that support feature
29 | importances are implemented (call `methods(pull_importances)` to list models
30 | that are currently implemented). If you need to pull the feature importance
31 | scores from a model that is not currently supported in this package, then you
32 | can add a method to the pull_importances generic function which returns a
33 | two-column tibble:
34 | }
35 | \examples{
36 | library(parsnip)
37 | 
38 | # pull feature importances from a model_fit object
39 | model <- boost_tree(mode = "classification") \%>\%
40 |     set_engine("xgboost")
41 | model_fit <- model \%>\% fit(Species ~., iris)
42 | pull_importances(model_fit)
43 | 
44 | # create a new pull_importances method
45 | pull_importances._ranger <- function(object, scaled = FALSE, ...) {
46 |     # create a call to the ranger::importance function avoiding having to use
47 |     # ranger as a dependency
48 |     call <- rlang::call2(.fn = "importance", .ns = "ranger", x = object$fit)
49 |     scores <- rlang::eval_tidy(call)
50 | 
51 |     # create a tibble with 'feature' and 'importance' columns
52 |     scores <- tibble::tibble(
53 |       feature = names(scores),
54 |       importance = as.numeric(scores)
55 |     )
56 |     # optionally rescale the importance scores
57 |     if (isTRUE(scaled))
58 |       scores$importance <- rescale(scores$importance)
59 | 
60 |     scores
61 | }
62 | }
63 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_forests.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | library(recipes)
  3 | library(tibble)
  4 | library(parsnip)
  5 | 
  6 | data("iris")
  7 | 
  8 | test_that("step_select_forests, execution using top_p", {
  9 |   skip_if_not_installed("ranger")
 10 | 
 11 |   rec <- iris %>%
 12 |     recipe(Species ~ .) %>%
 13 |     step_select_forests(
 14 |       all_predictors(),
 15 |       outcome = "Species",
 16 |       engine = "ranger",
 17 |       top_p = 2
 18 |     )
 19 |   # Train the step, then check both the tidy() summary and the baked columns
 20 |   prepped <- prep(rec)
 21 |   expect_s3_class(tidy(prepped, number = 1), "tbl_df")
 22 |   selected <- juice(prepped)
 23 |   # two retained predictors plus the Species outcome
 24 |   expect_length(names(selected), 3)
 25 | })
 26 | 
 27 | 
 28 | test_that("step_select_forests, execution using threshold", {
 29 |   skip_if_not_installed("ranger")
 30 | 
 31 |   # No `engine` is passed, so both recipes run with the step's default engine.
 32 |   # Expected column counts include the Species outcome.
 33 | 
 34 |   # test selection by retaining features with scores >= 50th percentile
 35 |   rec <- iris %>%
 36 |     recipe(Species ~ .) %>%
 37 |     step_select_forests(
 38 |       all_predictors(),
 39 |       outcome = "Species",
 40 |       threshold = 0.5
 41 |     )
 42 | 
 43 |   prepped <- prep(rec)
 44 |   selected <- juice(prepped)
 45 | 
 46 |   expect_length(names(selected), 3)
 47 | 
 48 |   # test selection by retaining features with scores in 90th percentile
 49 |   rec <- iris %>%
 50 |     recipe(Species ~ .) %>%
 51 |     step_select_forests(
 52 |       all_predictors(),
 53 |       outcome = "Species",
 54 |       threshold = 0.9
 55 |     )
 56 | 
 57 |   prepped <- prep(rec)
 58 |   selected <- juice(prepped)
 59 | 
 60 |   expect_length(names(selected), 2)
 61 | })
 62 | 
 63 | test_that(
 64 |   desc = "step_select_forests, execution using aorsf",
 65 |   code = {
 66 | 
 67 |     skip_if_not_installed('aorsf')
 68 |     skip_if_not_installed('bonsai')
 69 | 
 70 |     library(bonsai)
 71 | 
 72 |     # Both recipes below must request the aorsf engine explicitly; otherwise
 73 |     # the step falls back to its default engine and aorsf is never exercised.
 74 | 
 75 |     # test selection by retaining features with scores >= 50th percentile
 76 |     rec <- iris %>%
 77 |       recipe(Species ~ .) %>%
 78 |       step_select_forests(
 79 |         all_predictors(),
 80 |         outcome = "Species",
 81 |         threshold = 0.5,
 82 |         engine = 'aorsf'
 83 |       )
 84 | 
 85 |     prepped <- prep(rec)
 86 |     selected <- juice(prepped)
 87 | 
 88 |     expect_length(names(selected), 3)
 89 | 
 90 |     # test selection by retaining features with scores in 90th percentile
 91 |     rec <- iris %>%
 92 |       recipe(Species ~ .) %>%
 93 |       step_select_forests(
 94 |         all_predictors(),
 95 |         outcome = "Species",
 96 |         threshold = 0.9,
 97 |         engine = 'aorsf'
 98 |       )
 99 | 
100 |     prepped <- prep(rec)
101 |     selected <- juice(prepped)
102 | 
103 |     expect_length(names(selected), 2)
104 |   }
105 | )
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 3 |   <url>
 4 |     <loc>https://stevenpawley.github.io/colino/404.html</loc>
 5 |   </url>
 6 |   <url>
 7 |     <loc>https://stevenpawley.github.io/colino/LICENSE-text.html</loc>
 8 |   </url>
 9 |   <url>
10 |     <loc>https://stevenpawley.github.io/colino/LICENSE.html</loc>
11 |   </url>
12 |   <url>
13 |     <loc>https://stevenpawley.github.io/colino/authors.html</loc>
14 |   </url>
15 |   <url>
16 |     <loc>https://stevenpawley.github.io/colino/index.html</loc>
17 |   </url>
18 |   <url>
19 |     <loc>https://stevenpawley.github.io/colino/reference/colino.html</loc>
20 |   </url>
21 |   <url>
22 |     <loc>https://stevenpawley.github.io/colino/reference/cutoff.html</loc>
23 |   </url>
24 |   <url>
25 |     <loc>https://stevenpawley.github.io/colino/reference/dual_filter.html</loc>
26 |   </url>
27 |   <url>
28 |     <loc>https://stevenpawley.github.io/colino/reference/entropy.html</loc>
29 |   </url>
30 |   <url>
31 |     <loc>https://stevenpawley.github.io/colino/reference/index.html</loc>
32 |   </url>
33 |   <url>
34 |     <loc>https://stevenpawley.github.io/colino/reference/pipe.html</loc>
35 |   </url>
36 |   <url>
37 |     <loc>https://stevenpawley.github.io/colino/reference/pull_importances.html</loc>
38 |   </url>
39 |   <url>
40 |     <loc>https://stevenpawley.github.io/colino/reference/required_pkgs.embed.html</loc>
41 |   </url>
42 |   <url>
43 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_aov.html</loc>
44 |   </url>
45 |   <url>
46 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_boruta.html</loc>
47 |   </url>
48 |   <url>
49 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_carscore.html</loc>
50 |   </url>
51 |   <url>
52 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_fcbf.html</loc>
53 |   </url>
54 |   <url>
55 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_forests.html</loc>
56 |   </url>
57 |   <url>
58 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_infgain.html</loc>
59 |   </url>
60 |   <url>
61 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_linear.html</loc>
62 |   </url>
63 |   <url>
64 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_mrmr.html</loc>
65 |   </url>
66 |   <url>
67 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_relief.html</loc>
68 |   </url>
69 |   <url>
70 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_roc.html</loc>
71 |   </url>
72 |   <url>
73 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_tree.html</loc>
74 |   </url>
75 |   <url>
76 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_vip.html</loc>
77 |   </url>
78 |   <url>
79 |     <loc>https://stevenpawley.github.io/colino/reference/step_select_xtab.html</loc>
80 |   </url>
81 |   <url>
82 |     <loc>https://stevenpawley.github.io/colino/reference/top_p.html</loc>
83 |   </url>
84 | </urlset>
85 | 


--------------------------------------------------------------------------------
/R/parameters.R:
--------------------------------------------------------------------------------
 1 | #' Parameter functions for feature selection recipes
 2 | #'
 3 | #' Feature selection recipes allow the top-performing features to be selected
 4 | #' using three parameters. `top_p` specifies how many of the top-performing
 5 | #' features are retained.
 6 | #'
 7 | #' @param range A two-element vector holding the _defaults_ for the smallest and
 8 | #'   largest possible values, respectively.
 9 | #' @param trans A `trans` object from the `scales` package, such as
10 | #'   `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
11 | #'   the default is used which matches the units used in `range`. If no
12 | #'   transformation, `NULL`.
13 | #'
14 | #' @return A dials parameter object with classes "quant_param" and "param"
15 | #' @export
16 | #'
17 | #' @examples
18 | #' top_p(c(3, 10))
19 | top_p <- function(range = c(1L, 4L), trans = NULL) {
20 |   param_label <- c(top_p = "# Selected Predictors")
21 | 
22 |   dials::new_quant_param(
23 |     type = "integer",
24 |     range = range, inclusive = rep(TRUE, 2L), trans = trans,
25 |     label = param_label,
26 |     finalize = dials::get_p
27 |   )
28 | }
29 | 
30 | #' Parameter functions for feature selection recipes
31 | #'
32 | #' Feature selection recipes allow the top-performing features to be selected
33 | #' using three parameters. `cutoff` selects features by applying an absolute
34 | #' threshold to the scores produced by the filter method.
35 | #'
36 | #' @param range A two-element vector holding the _defaults_ for the smallest and
37 | #'   largest possible values, respectively.
38 | #' @param trans A `trans` object from the `scales` package, such as
39 | #'   `scales::log10_trans()` or `scales::reciprocal_trans()`. If not provided,
40 | #'   the default is used which matches the units used in `range`. If no
41 | #'   transformation, `NULL`.
42 | #'
43 | #' @return A dials parameter object with classes "quant_param" and "param"
44 | #' @export
45 | #'
46 | #' @examples
47 | #' cutoff(c(3.5, 15))
48 | cutoff <- function(range = c(dials::unknown(), dials::unknown()),
49 |                    trans = NULL) {
50 |   param_label <- c(cutoff = "Absolute cutoff threshold for the feature scores")
51 |   dials::new_quant_param(
52 |     type = "double",
53 |     range = range, inclusive = rep(FALSE, 2L), trans = trans,
54 |     label = param_label
55 |   )
56 | }
57 | 
58 | values_entropy <- c("infogain", "gainratio", "symuncert")
59 | 
60 | 
61 | #' Parameter functions for feature selection recipes
62 | #'
63 | #' Entropy-based feature selection can score features with several variants
64 | #' of the entropy formula. `entropy` specifies which type of entropy-based
65 | #' filter is used.
66 | #'
67 | #' @param values A character vector of possible values. See `values_entropy`
68 | #'   for the default set.
69 | #'
70 | #' @return A dials parameter object with classes "qual_param" and "param"
71 | #' @export
72 | #'
73 | #' @examples
74 | #' entropy('infogain')
75 | entropy <- function(values = values_entropy) {
76 |   param_label <- c(entropy = "Method used for entropy-based feature selection")
77 |   dials::new_qual_param(
78 |     type = "character",
79 |     values = values,
80 |     label = param_label, finalize = NULL
81 |   )
82 | }
83 | 


--------------------------------------------------------------------------------
/man/step_select_aov.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_aov.R
  3 | \name{step_select_aov}
  4 | \alias{step_select_aov}
  5 | \alias{tidy.step_select_aov}
  6 | \title{Filter Categorical Predictors using the ANOVA F-Test}
  7 | \usage{
  8 | step_select_aov(
  9 |   recipe,
 10 |   ...,
 11 |   outcome,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   exclude = NULL,
 18 |   skip = FALSE,
 19 |   id = recipes::rand_id("select_aov")
 20 | )
 21 | 
 22 | \method{tidy}{step_select_aov}(x, ...)
 23 | }
 24 | \arguments{
 25 | \item{recipe}{A recipe object. The step will be added to the sequence of
 26 | operations for this recipe.}
 27 | 
 28 | \item{...}{One or more selector functions to choose which predictors are
 29 | affected by the step. See [selections()] for more details. For the `tidy`
 30 | method, these are not currently used.}
 31 | 
 32 | \item{outcome}{A single character string that specifies a single numeric
 33 | variable.}
 34 | 
 35 | \item{role}{For model terms created by this step, what analysis role should
 36 | they be assigned? By default, the function assumes that resulting distances
 37 | will be used as predictors in a model.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 43 | with the smallest p-values. A value of `NA` implies that this criterion
 44 | will be ignored.}
 45 | 
 46 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 47 | of best scoring features to select. For example `threshold = 0.9` will
 48 | retain only predictors with scores in the top 90th percentile and a smaller
 49 | threshold will select more features. Note that `top_p` and `threshold` are
 50 | mutually exclusive but either can be used in conjunction with `cutoff` to
 51 | select the top-ranked features and those that are smaller than the cutoff
 52 | value.}
 53 | 
 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 55 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 56 | that this criterion will be ignored.}
 57 | 
 58 | \item{exclude}{A character vector of predictor names that will be removed
 59 | from the data. This will be set when `prep()` is used on the recipe and
 60 | should not be set by the user.}
 61 | 
 62 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 63 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 64 | some operations may not be able to be conducted on new data (e.g.
 65 | processing the outcome variable(s)). Care should be taken when using skip =
 66 | TRUE as it may affect the computations for subsequent operations.}
 67 | 
 68 | \item{id}{A character string that is unique to this step to identify it.}
 69 | 
 70 | \item{x}{A `step_select_aov` object.}
 71 | }
 72 | \value{
 73 | An updated version of `recipe` with the new step added to the
 74 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 75 |  `terms` column for which predictors were removed.
 76 | }
 77 | \description{
 78 | `step_select_aov` creates a *specification* of a recipe step that will filter
 79 | predictors using their relationship with a numerical outcome as measured
 80 | using an ANOVA F-test.
 81 | }
 82 | \details{
 83 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 84 | unspecified. If more than one is used, they are combined via 'or'.
 85 | }
 86 | \examples{
 87 | data(ames, package = "modeldata")
 88 | 
 89 | rec <-
 90 |   recipe(Sale_Price ~ ., data = ames) \%>\%
 91 |   step_select_aov(
 92 |     all_nominal(),
 93 |     -all_outcomes(),
 94 |     outcome = "Sale_Price",
 95 |     top_p = 1,
 96 |     cutoff = -log10(0.01)
 97 |   ) \%>\%
 98 |   prep()
 99 | 
100 | rec \%>\%
101 |   juice(all_nominal()) \%>\%
102 |   names()
103 | 
104 | tidy(rec, number = 1)
105 | }
106 | \concept{preprocessing}
107 | \concept{supervised_filter}
108 | 


--------------------------------------------------------------------------------
/man/step_select_boruta.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_boruta.R
  3 | \name{step_select_boruta}
  4 | \alias{step_select_boruta}
  5 | \alias{tidy.step_select_boruta}
  6 | \title{Feature selection step using Boruta}
  7 | \usage{
  8 | step_select_boruta(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   exclude = NULL,
 15 |   options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100),
 16 |   res = NULL,
 17 |   skip = FALSE,
 18 |   id = recipes::rand_id("select_boruta")
 19 | )
 20 | 
 21 | \method{tidy}{step_select_boruta}(x, type = "terms", ...)
 22 | }
 23 | \arguments{
 24 | \item{recipe}{A recipe object. The step will be added to the sequence of
 25 | operations for this recipe.}
 26 | 
 27 | \item{...}{One or more selector functions to choose which predictors are
 28 | affected by the step. See [selections()] for more details. For the `tidy`
 29 | method, these are not currently used.}
 30 | 
 31 | \item{outcome}{A character string with the name of the response variable to
 32 | use to calculate the feature importance scores.}
 33 | 
 34 | \item{role}{Not used by this step since no new variables are created.}
 35 | 
 36 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 37 | been estimated.}
 38 | 
 39 | \item{exclude}{A character vector of predictor names that will be removed
 40 | from the data. This will be set when `prep()` is used on the recipe and
 41 | should not be set by the user.}
 42 | 
 43 | \item{options}{A list of options to pass to `Boruta::Boruta()`. The defaults
 44 | use Boruta's defaults. *Note* that `x` and `y` should not be passed here.}
 45 | 
 46 | \item{res}{The `Boruta::Boruta` object is stored here once this preprocessing
 47 | step has been trained by `prep.recipe()`.}
 48 | 
 49 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 50 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 51 | some operations may not be able to be conducted on new data (e.g.
 52 | processing the outcome variable(s)). Care should be taken when using skip =
 53 | TRUE as it may affect the computations for subsequent operations.}
 54 | 
 55 | \item{id}{A character string that is unique to this step to identify it.}
 56 | 
 57 | \item{x}{A `step_select_boruta` object.}
 58 | 
 59 | \item{type}{A character with either 'terms' (the default) to return a
 60 | tibble containing the variables that have been removed by the filter step,
 61 | or 'scores' to return the scores for each variable.}
 62 | }
 63 | \value{
 64 | An updated version of `recipe` with the new step added to the
 65 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 66 |  `terms` column for which predictors were removed.
 67 | }
 68 | \description{
 69 | `step_select_boruta` creates a *specification* of a recipe step that selects
 70 | a subset of predictors using the Boruta feature selection approach.
 71 | }
 72 | \details{
 73 | The Boruta algorithm technically is a wrapper approach that uses random
 74 | forests to test whether the feature importance scores obtained on the
 75 | original data are higher than best of the scores obtained when the variables
 76 | are randomly permuted. These permuted features are termed 'shadow' features.
 77 | If the scores for any original feature are higher than the best of the scores
 78 | for the randomly permuted features, then this is marked as a 'hit'. Features
 79 | are confirmed or rejected based on a confidence threshold (default is p =
 80 | 0.01) applied to the tails of the binomial distribution with p = 0.5.
 81 | Features that do not fall within the lower (reject) or upper (accept) tails
 82 | of the distribution are labelled as 'tentative'. Rejected features are
 83 | dropped from the feature set and the procedure is repeated until no more
 84 | 'tentative' features exist, or that a maximum number of runs are reached.
 85 | }
 86 | \examples{
 87 | library(recipes)
 88 | library(parsnip)
 89 | 
 90 | # load the example cells dataset
 91 | data(cells, package = "modeldata")
 92 | 
 93 | # create a preprocessing recipe
 94 | rec <-
 95 |  recipe(class ~ ., data = cells[, -1]) \%>\%
 96 |  step_select_boruta(all_predictors(), outcome = "class")
 97 | 
 98 | prepped <- prep(rec)
 99 | 
100 | preproc_data <- juice(prepped)
101 | prepped
102 | }
103 | 


--------------------------------------------------------------------------------
/man/step_select_roc.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_roc.R
  3 | \name{step_select_roc}
  4 | \alias{step_select_roc}
  5 | \alias{tidy.step_select_roc}
  6 | \title{Filter Numeric Predictors using ROC Curve}
  7 | \usage{
  8 | step_select_roc(
  9 |   recipe,
 10 |   ...,
 11 |   outcome,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   threshold = NA,
 15 |   top_p = NA,
 16 |   cutoff = NA,
 17 |   exclude = NULL,
 18 |   skip = FALSE,
 19 |   id = recipes::rand_id("select_roc")
 20 | )
 21 | 
 22 | \method{tidy}{step_select_roc}(x, ...)
 23 | }
 24 | \arguments{
 25 | \item{recipe}{A recipe object. The step will be added to the sequence of
 26 | operations for this recipe.}
 27 | 
 28 | \item{...}{One or more selector functions to choose which predictors are
 29 | affected by the step. See [selections()] for more details. For the `tidy`
 30 | method, these are not currently used.}
 31 | 
 32 | \item{outcome}{A single character string that specifies a single categorical
 33 | variable to be used as the class.}
 34 | 
 35 | \item{role}{For model terms created by this step, what analysis role should
 36 | they be assigned? By default, the function assumes that resulting distances
 37 | will be used as predictors in a model.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 43 | of best scoring features to select. For example `threshold = 0.9` will
 44 | retain only predictors with scores in the top 90th percentile and a smaller
 45 | threshold will select more features. Note that `top_p` and `threshold` are
 46 | mutually exclusive but either can be used in conjunction with `cutoff` to
 47 | select the top-ranked features and those that are smaller than the cutoff
 48 | value.}
 49 | 
 50 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 51 | with the smallest p-values. A value of `NA` implies that this criterion
 52 | will be ignored.}
 53 | 
 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 55 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 56 | that this criterion will be ignored.}
 57 | 
 58 | \item{exclude}{A character vector of predictor names that will be removed
 59 | from the data. This will be set when `prep()` is used on the recipe and
 60 | should not be set by the user.}
 61 | 
 62 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 63 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 64 | some operations may not be able to be conducted on new data (e.g.
 65 | processing the outcome variable(s)). Care should be taken when using skip =
 66 | TRUE as it may affect the computations for subsequent operations.}
 67 | 
 68 | \item{id}{A character string that is unique to this step to identify it.}
 69 | 
 70 | \item{x}{A `step_select_roc` object.}
 71 | }
 72 | \value{
 73 | An updated version of `recipe` with the new step added to the
 74 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 75 |  `terms` column for which predictors were removed.
 76 | }
 77 | \description{
 78 | `step_select_roc` creates a *specification* of a recipe step that will
 79 |  filter predictors using their relationship with the outcome as measured
 80 |  using a Receiver Operating Characteristic curve.
 81 | }
 82 | \details{
 83 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 84 | unspecified.
 85 | 
 86 | The ROC AUC will be set to be 1 - AUC if the value is less than 0.50.
 87 | }
 88 | \examples{
 89 | data(cells, package = "modeldata")
 90 | 
 91 | rec <-
 92 |   recipe(class ~ ., data = cells[, -1]) \%>\%
 93 |   step_select_roc(all_predictors(), outcome = "class", top_p = 10, cutoff = 0.9) \%>\%
 94 |   prep()
 95 | 
 96 | rec \%>\% bake(all_predictors(), new_data = NULL) \%>\% names()
 97 | 
 98 | # Use ROC values to select but always keep at least one:
 99 | rec <-
100 |   recipe(class ~ ., data = cells[, -1]) \%>\%
101 |   step_select_roc(
102 |     all_predictors(),
103 |     outcome = "class",
104 |     top_p = 1,
105 |     cutoff = 0.99
106 |   ) \%>\%
107 |   prep()
108 | 
109 | rec \%>\% juice(all_predictors()) \%>\% names()
110 | }
111 | \concept{preprocessing}
112 | \concept{supervised_filter}
113 | \keyword{datagen}
114 | 


--------------------------------------------------------------------------------
/man/step_select_xtab.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_xtab.R
  3 | \name{step_select_xtab}
  4 | \alias{step_select_xtab}
  5 | \alias{tidy.step_select_xtab}
  6 | \title{Filter Categorical Predictors using Contingency Tables}
  7 | \usage{
  8 | step_select_xtab(
  9 |   recipe,
 10 |   ...,
 11 |   outcome,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   exact = FALSE,
 18 |   fdr = TRUE,
 19 |   exclude = NULL,
 20 |   skip = FALSE,
 21 |   id = recipes::rand_id("select_xtab")
 22 | )
 23 | 
 24 | \method{tidy}{step_select_xtab}(x, ...)
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which predictors are
 31 | affected by the step. See [selections()] for more details. For the `tidy`
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A single character string that specifies a single categorical
 35 | variable to be used as the class.}
 36 | 
 37 | \item{role}{For model terms created by this step, what analysis role should
 38 | they be assigned? By default, the function assumes that resulting distances
 39 | will be used as predictors in a model.}
 40 | 
 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 42 | been estimated.}
 43 | 
 44 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 45 | with the smallest p-values. A value of `NA` implies that this criterion
 46 | will be ignored.}
 47 | 
 48 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 49 | of best scoring features to select. For example `threshold = 0.9` will
 50 | retain only predictors with scores in the top 90th percentile and a smaller
 51 | threshold will select more features. Note that `top_p` and `threshold` are
 52 | mutually exclusive but either can be used in conjunction with `cutoff` to
 53 | select the top-ranked features and those that are smaller than the cutoff
 54 | value.}
 55 | 
 56 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 57 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 58 | that this criterion will be ignored.}
 59 | 
 60 | \item{exact}{Should an exact test be used?}
 61 | 
 62 | \item{fdr}{Should false discovery rates (FDR) be used instead of p-values?}
 63 | 
 64 | \item{exclude}{A character vector of predictor names that will be removed
 65 | from the data. This will be set when `prep()` is used on the recipe and
 66 | should not be set by the user.}
 67 | 
 68 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 69 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 70 | some operations may not be able to be conducted on new data (e.g.
 71 | processing the outcome variable(s)). Care should be taken when using skip =
 72 | TRUE as it may affect the computations for subsequent operations.}
 73 | 
 74 | \item{id}{A character string that is unique to this step to identify it.}
 75 | 
 76 | \item{x}{A `step_select_xtab` object.}
 77 | }
 78 | \value{
 79 | An updated version of `recipe` with the new step added to the
 80 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 81 |  `terms` column for which predictors were removed.
 82 | }
 83 | \description{
 84 | `step_select_xtab` creates a *specification* of a recipe step that will
 85 |  filter predictors using their relationship with the outcome as measured
 86 |  using statistical tests for association.
 87 | }
 88 | \details{
 89 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 90 | unspecified. If both are used, they are combined via 'or'.
 91 | 
 92 | The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]).
 93 | 
 94 | Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed.
 95 | }
 96 | \examples{
 97 | data(attrition, package = "modeldata")
 98 | 
 99 | rec <-
100 |   recipe(Attrition ~ ., data = attrition) \%>\%
101 |   step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition",
102 |                    top_p = 1, cutoff = 0.001, exact = TRUE) \%>\%
103 |   prep()
104 | 
105 | rec \%>\% juice(all_nominal(), -all_outcomes()) \%>\% names()
106 | 
107 | tidy(rec, number = 1)
108 | }
109 | \concept{preprocessing}
110 | \concept{supervised_filter}
111 | \keyword{datagen}
112 | 


--------------------------------------------------------------------------------
/man/step_select_mrmr.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_mrmr.R
  3 | \name{step_select_mrmr}
  4 | \alias{step_select_mrmr}
  5 | \alias{tidy.step_select_mrmr}
  6 | \title{Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)}
  7 | \usage{
  8 | step_select_mrmr(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   threads = 0,
 18 |   exclude = NULL,
 19 |   scores = NULL,
 20 |   skip = FALSE,
 21 |   id = recipes::rand_id("select_mrmr")
 22 | )
 23 | 
 24 | \method{tidy}{step_select_mrmr}(x, type = "terms", ...)
 25 | }
 26 | \arguments{
 27 | \item{recipe}{A recipe object. The step will be added to the sequence of
 28 | operations for this recipe.}
 29 | 
 30 | \item{...}{One or more selector functions to choose which predictors are
 31 | affected by the step. See [selections()] for more details. For the `tidy`
 32 | method, these are not currently used.}
 33 | 
 34 | \item{outcome}{A character string specifying the name of response variable
 35 | used to evaluate mRMR.}
 36 | 
 37 | \item{role}{Not used by this step since no new variables are created.}
 38 | 
 39 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 40 | been estimated.}
 41 | 
 42 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 43 | with the smallest p-values. A value of `NA` implies that this criterion
 44 | will be ignored.}
 45 | 
 46 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 47 | of best scoring features to select. For example `threshold = 0.9` will
 48 | retain only predictors with scores in the top 90th percentile and a smaller
 49 | threshold will select more features. Note that `top_p` and `threshold` are
 50 | mutually exclusive but either can be used in conjunction with `cutoff` to
 51 | select the top-ranked features and those that are smaller than the cutoff
 52 | value.}
 53 | 
 54 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 55 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 56 | that this criterion will be ignored.}
 57 | 
 58 | \item{threads}{An integer specifying the number of threads to use for
 59 | processing. The default = 0 uses all available threads.}
 60 | 
 61 | \item{exclude}{A character vector of predictor names that will be removed
 62 | from the data. This will be set when `prep()` is used on the recipe and
 63 | should not be set by the user.}
 64 | 
 65 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 66 | names of the variables and their mRMR scores. This parameter is only
 67 | produced after the recipe has been trained.}
 68 | 
 69 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 70 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 71 | some operations may not be able to be conducted on new data (e.g.
 72 | processing the outcome variable(s)). Care should be taken when using skip =
 73 | TRUE as it may affect the computations for subsequent operations.}
 74 | 
 75 | \item{id}{A character string that is unique to this step to identify it.}
 76 | 
 77 | \item{x}{A `step_select_mrmr` object.}
 78 | 
 79 | \item{type}{A character with either 'terms' (the default) to return a
 80 | tibble containing the variables that have been removed by the filter step,
 81 | or 'scores' to return the scores for each variable.}
 82 | }
 83 | \value{
 84 | An updated version of `recipe` with the new step added to the
 85 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 86 |  `terms` column for which predictors were removed.
 87 | }
 88 | \description{
 89 | `step_select_mrmr` creates a *specification* of a recipe step that will apply
 90 | minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric
 91 | data. The top `top_p` scoring features, or features whose scores occur in the
 92 | top percentile `threshold` will be retained as new predictors.
 93 | }
 94 | \details{
 95 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 96 | unspecified.
 97 | }
 98 | \examples{
 99 | library(recipes)
100 | 
101 | data(cells, package = "modeldata")
102 | 
103 | rec <-
104 |  recipe(class ~ ., data = cells[, -1]) \%>\%
105 |  step_select_mrmr(
106 |    all_predictors(),
107 |    outcome = "class",
108 |    top_p = 10
109 |  )
110 | 
111 | prepped <- prep(rec)
112 | 
113 | new_data <- bake(prepped, new_data = NULL)
114 | prepped
115 | }
116 | \concept{preprocessing}
117 | \concept{supervised_filter}
118 | \keyword{datagen}
119 | 


--------------------------------------------------------------------------------
/man/step_select_carscore.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_carscore.R
  3 | \name{step_select_carscore}
  4 | \alias{step_select_carscore}
  5 | \alias{tidy.step_select_carscore}
  6 | \title{Feature selection step using the CAR score algorithm}
  7 | \usage{
  8 | step_select_carscore(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   lambda = NA,
 18 |   diagonal = FALSE,
 19 |   exclude = NULL,
 20 |   scores = NULL,
 21 |   skip = FALSE,
 22 |   id = recipes::rand_id("select_carscore")
 23 | )
 24 | 
 25 | \method{tidy}{step_select_carscore}(x, type = "terms", ...)
 26 | }
 27 | \arguments{
 28 | \item{recipe}{A recipe object. The step will be added to the sequence of
 29 | operations for this recipe.}
 30 | 
 31 | \item{...}{One or more selector functions to choose which predictors are
 32 | affected by the step. See [selections()] for more details. For the `tidy`
 33 | method, these are not currently used.}
 34 | 
 35 | \item{outcome}{A character string with the name of the response variable.
 36 | This must refer to a numeric feature for regression.}
 37 | 
 38 | \item{role}{Not used by this step since no new variables are created.}
 39 | 
 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 41 | been estimated.}
 42 | 
 43 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 44 | with the smallest p-values. A value of `NA` implies that this criterion
 45 | will be ignored.}
 46 | 
 47 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 48 | of best scoring features to select. For example `threshold = 0.9` will
 49 | retain only predictors with scores in the top 90th percentile and a smaller
 50 | threshold will select more features. Note that `top_p` and `threshold` are
 51 | mutually exclusive but either can be used in conjunction with `cutoff` to
 52 | select the top-ranked features and those that are smaller than the cutoff
 53 | value.}
 54 | 
 55 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 56 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 57 | that this criterion will be ignored.}
 58 | 
 59 | \item{lambda}{The correlation shrinkage intensity (range 0-1).}
 60 | 
 61 | \item{diagonal}{For diagonal = FALSE (the default) CAR scores are computed;
 62 | otherwise with diagonal = TRUE marginal correlations.}
 63 | 
 64 | \item{exclude}{A character vector of predictor names that will be removed
 65 | from the data. This will be set when `prep()` is used on the recipe and
 66 | should not be set by the user.}
 67 | 
 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 69 | names of the variables and the absolute values of the calculated CAR
 70 | scores. This parameter is only produced after the recipe has been trained.}
 71 | 
 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 73 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 74 | some operations may not be able to be conducted on new data (e.g.
 75 | processing the outcome variable(s)). Care should be taken when using skip =
 76 | TRUE as it may affect the computations for subsequent operations.}
 77 | 
 78 | \item{id}{A character string that is unique to this step to identify it.}
 79 | 
 80 | \item{x}{A `step_select_carscore` object.}
 81 | 
 82 | \item{type}{A character with either 'terms' (the default) to return a
 83 | tibble containing the variables that have been removed by the filter step,
 84 | or 'scores' to return the scores for each variable.}
 85 | }
 86 | \value{
 87 | An updated version of `recipe` with the new step added to the
 88 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 89 |  `terms` column for which predictors were removed.
 90 | }
 91 | \description{
 92 | `step_select_carscore` creates a *specification* of a recipe step that
 93 | selects a subset of predictors as part of a regression model based on the
 94 | scores of the CAR score algorithm. This step requires the `care` package to be
 95 | installed. The top `top_p` scoring features, or features whose scores occur
 96 | in the top percentile `threshold` will be retained as new predictors.
 97 | }
 98 | \details{
 99 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
100 | unspecified.
101 | }
102 | \examples{
103 | library(recipes)
104 | 
105 | data(car_prices, package = "modeldata")
106 | 
107 | rec <-
108 |  recipe(Price ~ ., data = car_prices) \%>\%
109 |  step_select_carscore(
110 |    all_predictors(),
111 |    outcome = "Price",
112 |    top_p = 5,
113 |    cutoff = 0.7
114 |  )
115 | 
116 | prepped <- prep(rec)
117 | 
118 | new_data <- bake(prepped, new_data = NULL)
119 | prepped
120 | }
121 | \concept{preprocessing}
122 | \concept{supervised_filter}
123 | \keyword{datagen}
124 | 


--------------------------------------------------------------------------------
/man/step_select_fcbf.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_fcbf.R
  3 | \name{step_select_fcbf}
  4 | \alias{step_select_fcbf}
  5 | \title{Fast Correlation Based Filter for Feature Selection}
  6 | \usage{
  7 | step_select_fcbf(
  8 |   recipe,
  9 |   ...,
 10 |   threshold = 0.025,
 11 |   outcome = NA,
 12 |   cutpoint = 0.5,
 13 |   features_retained = NA,
 14 |   removals = NULL,
 15 |   role = NA,
 16 |   trained = FALSE,
 17 |   skip = FALSE,
 18 |   id = rand_id("select_fcbf")
 19 | )
 20 | }
 21 | \arguments{
 22 | \item{recipe}{A recipe object. The step will be added to the sequence of
 23 | operations for this recipe.}
 24 | 
 25 | \item{...}{One or more selector functions to choose which predictors are
 26 | affected by the step. See [selections()] for more details. For the `tidy`
 27 | method, these are not currently used.}
 28 | 
 29 | \item{threshold}{A numeric value between 0 and 1 representing the symmetrical
 30 | uncertainty threshold used by the FCBF algorithm. Lower thresholds allow
 31 | more features to be selected.}
 32 | 
 33 | \item{outcome}{A character string specifying the name of the response
 34 | variable. Automatically inferred from the recipe (if possible) when not
 35 | specified by the user.}
 36 | 
 37 | \item{cutpoint}{A numeric value between 0 and 1 representing the quantile at
 38 | which to split numeric features into binary nominal features. e.g. 0.5 =
 39 | median split. See details for more information on discretization.}
 40 | 
 41 | \item{features_retained}{A tibble containing the features that were retained
 42 | by the FCBF algorithm. This parameter is only produced after the recipe has
 43 | been trained and should not be specified by the user.}
 44 | 
 45 | \item{removals}{A tibble containing the features that were removed
 46 | by the FCBF algorithm. This parameter is only produced after the recipe has
 47 | been trained, and should not be specified by the user.}
 48 | 
 49 | \item{role}{Not used for this step since new variables are not created.}
 50 | 
 51 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 52 | been estimated.}
 53 | 
 54 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 55 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 56 | some operations may not be able to be conducted on new data (e.g.
 57 | processing the outcome variable(s)). Care should be taken when using skip =
 58 | TRUE as it may affect the computations for subsequent operations.}
 59 | 
 60 | \item{id}{A character string that is unique to this step to identify it.}
 61 | }
 62 | \value{
 63 | An updated version of `recipe` with the new step added to the
 64 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 65 |  `terms` column for which predictors were removed.
 66 | }
 67 | \description{
 68 | `step_select_fcbf` creates a *specification* of a recipe step that selects a
 69 | subset of predictors using the FCBF algorithm. The number of features
 70 | retained depends on the `threshold` parameter: a lower threshold
 71 | selects more features.
 72 | }
 73 | \details{
 74 | This function implements the fast correlation-based filter (FCBF)
 75 | algorithm as described in Yu & Liu (2003). FCBF selects features that
 76 | have high correlation to the outcome, and low correlation to other features.
 77 | 
 78 | Symmetrical uncertainty (SU) is used to indicate the degree of correlation
 79 | between predictors and the outcome. A threshold value for SU must be
 80 | specified, and smaller threshold values will result in more features being
 81 | selected by the algorithm. Appropriate thresholds are data-dependent, so
 82 | different threshold values may need to be explored. It is not possible to
 83 | specify an exact number of features that should be retained.
 84 | 
 85 | The algorithm requires categorical features, so continuous features are
 86 | discretized using a binary split (split at the median by default).
 87 | Discretization is only used within the feature selection algorithm,
 88 | selected features are then retained in their original continuous form for
 89 | further processing.
 90 | 
 91 | The FCBF algorithm is implemented by the Bioconductor package 'FCBF', which
 92 | can be installed with `BiocManager::install("FCBF")`.
 93 | }
 94 | \examples{
 95 | \dontrun{
 96 | library(recipes)
 97 | library(colino)
 98 | 
 99 | # Load the example iris dataset
100 | data("iris")
101 | 
102 | # Create a preprocessing recipe including FCBF
103 | my_recipe <- recipe(Species ~ ., data = iris) \%>\%
104 |   step_select_fcbf(all_predictors(), threshold = 0.001)
105 | 
106 |  prepped <- prep(my_recipe, iris)
107 |  new_data <- bake(prepped, new_data = iris)
108 |  prepped
109 | }
110 | }
111 | \references{
112 | Yu, L. and Liu, H. (2003); Feature Selection for High-Dimensional
113 |   Data: A Fast Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach.
114 |   Learn. (ICML-2003), Washington DC, 2003.
115 | }
116 | 


--------------------------------------------------------------------------------
/man/step_select_linear.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_linear.R
  3 | \name{step_select_linear}
  4 | \alias{step_select_linear}
  5 | \alias{tidy.step_select_linear}
  6 | \title{Feature selection step using the magnitude of a linear model's coefficients}
  7 | \usage{
  8 | step_select_linear(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   engine = "glm",
 15 |   penalty = NULL,
 16 |   mixture = NULL,
 17 |   top_p = NA,
 18 |   threshold = NA,
 19 |   cutoff = NA,
 20 |   exclude = NULL,
 21 |   scores = NULL,
 22 |   skip = FALSE,
 23 |   id = recipes::rand_id("select_linear")
 24 | )
 25 | 
 26 | \method{tidy}{step_select_linear}(x, type = "terms", ...)
 27 | }
 28 | \arguments{
 29 | \item{recipe}{A recipe object. The step will be added to the sequence of
 30 | operations for this recipe.}
 31 | 
 32 | \item{...}{One or more selector functions to choose which predictors are
 33 | affected by the step. See [selections()] for more details. For the `tidy`
 34 | method, these are not currently used.}
 35 | 
 36 | \item{outcome}{A character string with the name of the response variable to
 37 | use to calculate the feature importance scores.}
 38 | 
 39 | \item{role}{Not used by this step since no new variables are created.}
 40 | 
 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 42 | been estimated.}
 43 | 
 44 | \item{engine}{A linear model engine that is supported by parsnip (see
 45 | `parsnip::linear_reg` and `parsnip::logistic_reg`). The default is "glm".}
 46 | 
 47 | \item{penalty}{A non-negative number representing the total amount of
 48 | regularization (specific engines only).}
 49 | 
 50 | \item{mixture}{A number between zero and one (inclusive) that is the
 51 | proportion of L1 regularization (i.e. lasso) in the model. When mixture =
 52 | 1, it is a pure lasso model while mixture = 0 indicates that ridge
 53 | regression is being used (specific engines only).}
 54 | 
 55 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 56 | with the smallest p-values. A value of `NA` implies that this criterion
 57 | will be ignored.}
 58 | 
 59 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 60 | of best scoring features to select. For example `threshold = 0.9` will
 61 | retain only predictors with scores in the top 90th percentile and a smaller
 62 | threshold will select more features. Note that `top_p` and `threshold` are
 63 | mutually exclusive but either can be used in conjunction with `cutoff` to
 64 | select the top-ranked features and those that are smaller than the cutoff
 65 | value.}
 66 | 
 67 | \item{cutoff}{A numeric value, in -log10(p-value) units, where predictors
 68 | with values _larger_ than the cutoff will be retained. A value of `NA` implies
 69 | that this criterion will be ignored.}
 70 | 
 71 | \item{exclude}{A character vector of predictor names that will be removed
 72 | from the data. This will be set when `prep()` is used on the recipe and
 73 | should not be set by the user.}
 74 | 
 75 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 76 | names of the variables and their feature importance scores. This parameter
 77 | is only produced after the recipe has been trained.}
 78 | 
 79 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 80 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 81 | some operations may not be able to be conducted on new data (e.g.
 82 | processing the outcome variable(s)). Care should be taken when using skip =
 83 | TRUE as it may affect the computations for subsequent operations.}
 84 | 
 85 | \item{id}{A character string that is unique to this step to identify it.}
 86 | 
 87 | \item{x}{A `step_select_linear` object.}
 88 | 
 89 | \item{type}{A character with either 'terms' (the default) to return a
 90 | tibble containing the variables that have been removed by the filter step,
 91 | or 'scores' to return the scores for each variable.}
 92 | }
 93 | \value{
 94 | An updated version of `recipe` with the new step added to the
 95 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 96 |  `terms` column for which predictors were removed.
 97 | }
 98 | \description{
 99 | `step_select_linear` creates a *specification* of a recipe step that selects
100 | a subset of predictors based on the ranking of the magnitude of coefficients
101 | provided by a `parsnip::linear_reg` or `parsnip::logistic_reg` model.
102 | }
103 | \examples{
104 | library(recipes)
105 | library(parsnip)
106 | 
107 | # load the example cells dataset
108 | data(cells, package = "modeldata")
109 | 
110 | # create a preprocessing recipe
111 | rec <-
112 |  recipe(class ~ ., data = cells[, -1]) \%>\%
113 |  step_select_linear(
114 |    all_predictors(),
115 |    outcome = "class",
116 |    threshold = 0.9
117 |  )
118 | 
119 | prepped <- prep(rec)
120 | 
121 | preproc_data <- bake(prepped, new_data = NULL)
122 | prepped
123 | }
124 | 


--------------------------------------------------------------------------------
/man/step_select_vip.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_vip.R
  3 | \name{step_select_vip}
  4 | \alias{step_select_vip}
  5 | \alias{tidy.step_select_vip}
  6 | \title{Feature selection step using a model's feature importance scores or
  7 | coefficients}
  8 | \usage{
  9 | step_select_vip(
 10 |   recipe,
 11 |   ...,
 12 |   outcome = NULL,
 13 |   role = "predictor",
 14 |   trained = FALSE,
 15 |   model = NULL,
 16 |   top_p = NA,
 17 |   threshold = NA,
 18 |   cutoff = NA,
 19 |   exclude = NULL,
 20 |   scores = NULL,
 21 |   skip = FALSE,
 22 |   id = recipes::rand_id("select_vip")
 23 | )
 24 | 
 25 | \method{tidy}{step_select_vip}(x, type = "terms", ...)
 26 | }
 27 | \arguments{
 28 | \item{recipe}{A recipe object. The step will be added to the sequence of
 29 | operations for this recipe.}
 30 | 
 31 | \item{...}{One or more selector functions to choose which predictors are
 32 | affected by the step. See [selections()] for more details. For the `tidy`
 33 | method, these are not currently used.}
 34 | 
 35 | \item{outcome}{A character string with the name of the response variable to
 36 | use to calculate the feature importance scores.}
 37 | 
 38 | \item{role}{Not used by this step since no new variables are created.}
 39 | 
 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 41 | been estimated.}
 42 | 
 43 | \item{model}{A `model_spec` object from `parsnip` that has a feature
 44 | importances or coefficients method. The model needs to have an equivalent
 45 | `pull_importances` method defined. See `?pull_importances` for how to
 46 | define methods for models that are not currently supported.}
 47 | 
 48 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 49 | with the highest feature importance scores. A value of `NA` implies that
 50 | this criterion will be ignored.}
 51 | 
 52 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 53 | of best scoring features to select. For example `threshold = 0.9` will
 54 | retain only predictors with scores in the top 90th percentile and a smaller
 55 | threshold will select more features. Note that `top_p` and `threshold` are
 56 | mutually exclusive but either can be used in conjunction with `cutoff` to
 57 | select the top-ranked features and those that are larger than the cutoff
 58 | value.}
 59 | 
 60 | \item{cutoff}{A numeric value, in the units of the importance scores, where
 61 | predictors with scores _larger_ than the cutoff will be retained. A value of
 62 | `NA` implies that this criterion will be ignored.}
 63 | 
 64 | \item{exclude}{A character vector of predictor names that will be removed
 65 | from the data. This will be set when `prep()` is used on the recipe and
 66 | should not be set by the user.}
 67 | 
 68 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 69 | names of the variables and their feature importance scores. This parameter
 70 | is only produced after the recipe has been trained.}
 71 | 
 72 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 73 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 74 | some operations may not be able to be conducted on new data (e.g.
 75 | processing the outcome variable(s)). Care should be taken when using skip =
 76 | TRUE as it may affect the computations for subsequent operations.}
 77 | 
 78 | \item{id}{A character string that is unique to this step to identify it.}
 79 | 
 80 | \item{x}{A `step_select_vip` object}
 81 | 
 82 | \item{type}{A character with either 'terms' (the default) to return a
 83 | tibble containing the variables that have been removed by the filter step,
 84 | or 'scores' to return the scores for each variable.}
 85 | }
 86 | \value{
 87 | An updated version of `recipe` with the new step added to the
 88 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 89 |  `terms` column for which predictors were removed.
 90 | }
 91 | \description{
 92 | `step_select_vip` creates a *specification* of a recipe step that selects a
 93 | subset of predictors based on the ranking of variable importance provided by
 94 | the `parsnip` model specification supplied in the `model` parameter.
 95 | }
 96 | \details{
 97 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 98 | unspecified.
 99 | }
100 | \examples{
101 | library(recipes)
102 | library(parsnip)
103 | 
104 | # load the example cells dataset
105 | data(cells, package = "modeldata")
106 | 
107 | # define a base model to use for feature importances
108 | base_model <- rand_forest(mode = "classification") \%>\%
109 |     set_engine("ranger", importance = "permutation")
110 | 
111 | # create a preprocessing recipe
112 | rec <-
113 |  recipe(class ~ ., data = cells[, -1]) \%>\%
114 |  step_select_vip(
115 |    all_predictors(),
116 |    outcome = "class",
117 |    model = base_model,
118 |    top_p = 10
119 |  )
120 | 
121 | prepped <- prep(rec)
122 | 
123 | preproc_data <- juice(prepped)
124 | prepped
125 | }
126 | 


--------------------------------------------------------------------------------
/man/step_select_tree.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_tree.R
  3 | \name{step_select_tree}
  4 | \alias{step_select_tree}
  5 | \alias{tidy.step_select_tree}
  6 | \title{Feature selection step using decision tree importance scores}
  7 | \usage{
  8 | step_select_tree(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   engine = "rpart",
 15 |   cost_complexity = NULL,
 16 |   tree_depth = NULL,
 17 |   min_n = NULL,
 18 |   top_p = NA,
 19 |   threshold = NA,
 20 |   cutoff = NA,
 21 |   exclude = NULL,
 22 |   scores = NULL,
 23 |   skip = FALSE,
 24 |   id = recipes::rand_id("select_tree")
 25 | )
 26 | 
 27 | \method{tidy}{step_select_tree}(x, type = "terms", ...)
 28 | }
 29 | \arguments{
 30 | \item{recipe}{A recipe object. The step will be added to the sequence of
 31 | operations for this recipe.}
 32 | 
 33 | \item{...}{One or more selector functions to choose which predictors are
 34 | affected by the step. See [selections()] for more details. For the `tidy`
 35 | method, these are not currently used.}
 36 | 
 37 | \item{outcome}{A character string with the name of the response variable to
 38 | use to calculate the feature importance scores.}
 39 | 
 40 | \item{role}{Not used by this step since no new variables are created.}
 41 | 
 42 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 43 | been estimated.}
 44 | 
 45 | \item{engine}{A decision_tree engine that is supported by parsnip.
 46 | The default is "rpart".}
 47 | 
 48 | \item{cost_complexity}{A positive number for the cost/complexity
 49 | parameter (a.k.a. Cp) used by CART models (specific engines only).}
 50 | 
 51 | \item{tree_depth}{An integer for maximum depth of the tree.}
 52 | 
 53 | \item{min_n}{An integer for the minimum number of data points in a node that
 54 | are required for the node to be split further.}
 55 | 
 56 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 57 | with the highest feature importance scores. A value of `NA` implies that
 58 | this criterion will be ignored.}
 59 | 
 60 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 61 | of best scoring features to select. For example `threshold = 0.9` will
 62 | retain only predictors with scores in the top 90th percentile and a smaller
 63 | threshold will select more features. Note that `top_p` and `threshold` are
 64 | mutually exclusive but either can be used in conjunction with `cutoff` to
 65 | select the top-ranked features and those that are larger than the cutoff
 66 | value.}
 67 | 
 68 | \item{cutoff}{A numeric value, in the units of the importance scores, where
 69 | predictors with scores _larger_ than the cutoff will be retained. A value of
 70 | `NA` implies that this criterion will be ignored.}
 71 | 
 72 | \item{exclude}{A character vector of predictor names that will be removed
 73 | from the data. This will be set when `prep()` is used on the recipe and
 74 | should not be set by the user.}
 75 | 
 76 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 77 | names of the variables and their feature importance scores. This parameter
 78 | is only produced after the recipe has been trained.}
 79 | 
 80 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 81 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 82 | some operations may not be able to be conducted on new data (e.g.
 83 | processing the outcome variable(s)). Care should be taken when using skip =
 84 | TRUE as it may affect the computations for subsequent operations.}
 85 | 
 86 | \item{id}{A character string that is unique to this step to identify it.}
 87 | 
 88 | \item{x}{A `step_select_tree` object.}
 89 | 
 90 | \item{type}{A character with either 'terms' (the default) to return a
 91 | tibble containing the variables that have been removed by the filter step,
 92 | or 'scores' to return the scores for each variable.}
 93 | }
 94 | \value{
 95 | An updated version of `recipe` with the new step added to the
 96 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 97 |  `terms` column for which predictors were removed.
 98 | }
 99 | \description{
100 | `step_select_tree` creates a *specification* of a recipe step that selects a
101 | subset of predictors based on the ranking of variable importance provided by
102 | a `parsnip::decision_tree` supported model.
103 | }
104 | \details{
105 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
106 | unspecified.
107 | }
108 | \examples{
109 | library(recipes)
110 | library(parsnip)
111 | 
112 | # load the example cells dataset
113 | data(cells, package = "modeldata")
114 | 
115 | # create a preprocessing recipe
116 | rec <-
117 |  recipe(class ~ ., data = cells[, -1]) \%>\%
118 |  step_select_tree(all_predictors(), outcome = "class", top_p = 10)
119 | 
120 | prepped <- prep(rec)
121 | 
122 | preproc_data <- bake(prepped, new_data = NULL)
123 | prepped
124 | }
125 | 


--------------------------------------------------------------------------------
/man/step_select_infgain.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_infgain.R
  3 | \name{step_select_infgain}
  4 | \alias{step_select_infgain}
  5 | \alias{tidy.step_select_infgain}
  6 | \title{Information gain feature selection step}
  7 | \usage{
  8 | step_select_infgain(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   type = "infogain",
 18 |   nbins = 5,
 19 |   threads = 1,
 20 |   exclude = NULL,
 21 |   scores = NULL,
 22 |   skip = FALSE,
 23 |   id = recipes::rand_id("select_infgain")
 24 | )
 25 | 
 26 | \method{tidy}{step_select_infgain}(x, type = "terms", ...)
 27 | }
 28 | \arguments{
 29 | \item{recipe}{A recipe object. The step will be added to the sequence of
 30 | operations for this recipe.}
 31 | 
 32 | \item{...}{One or more selector functions to choose which predictors are
 33 | affected by the step. See [selections()] for more details. For the `tidy`
 34 | method, these are not currently used.}
 35 | 
 36 | \item{outcome}{A character string with the name of the response variable to
 37 | use to evaluate information gain value against the predictors.}
 38 | 
 39 | \item{role}{Not used by this step since no new variables are created.}
 40 | 
 41 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 42 | been estimated.}
 43 | 
 44 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 45 | with the highest information gain scores. A value of `NA` implies that this
 46 | criterion will be ignored.}
 47 | 
 48 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 49 | of best scoring features to select. For example `threshold = 0.9` will
 50 | retain only predictors with scores in the top 90th percentile and a smaller
 51 | threshold will select more features. Note that `top_p` and `threshold` are
 52 | mutually exclusive but either can be used in conjunction with `cutoff` to
 53 | select the top-ranked features and those that are larger than the cutoff
 54 | value.}
 55 | 
 56 | \item{cutoff}{A numeric value, in the units of the information gain scores,
 57 | where predictors with scores _larger_ than the cutoff will be retained. A
 58 | value of `NA` implies that this criterion will be ignored.}
 59 | 
 60 | \item{type}{For the `tidy` method, a character with either 'terms' (the
 61 | default) to return a tibble containing the variables that have been removed
 62 | by the filter step, or 'scores' to return the scores for each variable.}
 63 | 
 64 | \item{nbins}{An integer specifying the number of bins for discretization.
 65 | Only used if the outcome is a continuous variable (regression). The
 66 | default is `nbins = 5`.}
 67 | 
 68 | \item{threads}{An integer specifying the number of threads to use for
 69 | processing. The default is `threads = 1`; 0 uses all available threads.}
 70 | 
 71 | \item{exclude}{A character vector of predictor names that will be removed
 72 | from the data. This will be set when `prep()` is used on the recipe and
 73 | should not be set by the user.}
 74 | 
 75 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 76 | names of the variables and their information gain scores. This parameter is
 77 | only produced after the recipe has been trained.}
 78 | 
 79 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 80 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 81 | some operations may not be able to be conducted on new data (e.g.
 82 | processing the outcome variable(s)). Care should be taken when using skip =
 83 | TRUE as it may affect the computations for subsequent operations.}
 84 | 
 85 | \item{id}{A character string that is unique to this step to identify it.}
 86 | 
 87 | \item{x}{A `step_select_infgain` object.}
 88 | }
 89 | \value{
 90 | An updated version of `recipe` with the new step added to the
 91 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 92 |  `terms` column for which predictors were removed.
 93 | }
 94 | \description{
 95 | `step_select_infgain` creates a *specification* of a recipe step that selects a
 96 | subset of predictors based on the scores of the information gain algorithm.
 97 | This step requires the FSelectorRcpp package to be installed. The top
 98 | `top_p` scoring features, or features whose scores occur in the top
 99 | percentile `threshold` will be retained as new predictors.
100 | }
101 | \details{
102 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
103 | unspecified.
104 | }
105 | \examples{
106 | library(recipes)
107 | 
108 | data(cells, package = "modeldata")
109 | 
110 | rec <-
111 |  recipe(class ~ ., data = cells[, -1]) \%>\%
112 |  step_select_infgain(
113 |    all_predictors(),
114 |    outcome = "class",
115 |    threshold = 0.9,
116 |    id = "infgain"
117 |  )
118 | 
119 | prepped <- prep(rec)
120 | 
121 | new_data <- juice(prepped)
122 | prepped
123 | }
124 | \concept{preprocessing}
125 | \concept{supervised_filter}
126 | \keyword{datagen}
127 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | # Generated by roxygen2: do not edit by hand
  2 | 
  3 | S3method(bake,step_select_aov)
  4 | S3method(bake,step_select_boruta)
  5 | S3method(bake,step_select_carscore)
  6 | S3method(bake,step_select_fcbf)
  7 | S3method(bake,step_select_forests)
  8 | S3method(bake,step_select_infgain)
  9 | S3method(bake,step_select_linear)
 10 | S3method(bake,step_select_mrmr)
 11 | S3method(bake,step_select_relief)
 12 | S3method(bake,step_select_roc)
 13 | S3method(bake,step_select_tree)
 14 | S3method(bake,step_select_vip)
 15 | S3method(bake,step_select_xtab)
 16 | S3method(prep,step_select_aov)
 17 | S3method(prep,step_select_boruta)
 18 | S3method(prep,step_select_carscore)
 19 | S3method(prep,step_select_fcbf)
 20 | S3method(prep,step_select_forests)
 21 | S3method(prep,step_select_infgain)
 22 | S3method(prep,step_select_linear)
 23 | S3method(prep,step_select_mrmr)
 24 | S3method(prep,step_select_relief)
 25 | S3method(prep,step_select_roc)
 26 | S3method(prep,step_select_tree)
 27 | S3method(prep,step_select_vip)
 28 | S3method(prep,step_select_xtab)
 29 | S3method(print,step_select_aov)
 30 | S3method(print,step_select_boruta)
 31 | S3method(print,step_select_carscore)
 32 | S3method(print,step_select_fcbf)
 33 | S3method(print,step_select_forests)
 34 | S3method(print,step_select_infgain)
 35 | S3method(print,step_select_linear)
 36 | S3method(print,step_select_mrmr)
 37 | S3method(print,step_select_relief)
 38 | S3method(print,step_select_roc)
 39 | S3method(print,step_select_tree)
 40 | S3method(print,step_select_vip)
 41 | S3method(print,step_select_xtab)
 42 | S3method(pull_importances,"_C5.0")
 43 | S3method(pull_importances,"_H2OMultinomialModel")
 44 | S3method(pull_importances,"_H2ORegressionModel")
 45 | S3method(pull_importances,"_ObliqueForestClassification")
 46 | S3method(pull_importances,"_ObliqueForestRegression")
 47 | S3method(pull_importances,"_ObliqueForestSurvival")
 48 | S3method(pull_importances,"_cubist")
 49 | S3method(pull_importances,"_earth")
 50 | S3method(pull_importances,"_elnet")
 51 | S3method(pull_importances,"_glm")
 52 | S3method(pull_importances,"_lm")
 53 | S3method(pull_importances,"_lognet")
 54 | S3method(pull_importances,"_randomForest")
 55 | S3method(pull_importances,"_ranger")
 56 | S3method(pull_importances,"_rpart")
 57 | S3method(pull_importances,"_xgb.Booster")
 58 | S3method(pull_importances,default)
 59 | S3method(required_pkgs,step_select_aov)
 60 | S3method(required_pkgs,step_select_boruta)
 61 | S3method(required_pkgs,step_select_carscore)
 62 | S3method(required_pkgs,step_select_fcbf)
 63 | S3method(required_pkgs,step_select_forests)
 64 | S3method(required_pkgs,step_select_infgain)
 65 | S3method(required_pkgs,step_select_linear)
 66 | S3method(required_pkgs,step_select_mrmr)
 67 | S3method(required_pkgs,step_select_relief)
 68 | S3method(required_pkgs,step_select_roc)
 69 | S3method(required_pkgs,step_select_tree)
 70 | S3method(required_pkgs,step_select_vip)
 71 | S3method(required_pkgs,step_select_xtab)
 72 | S3method(tidy,step_select_aov)
 73 | S3method(tidy,step_select_boruta)
 74 | S3method(tidy,step_select_carscore)
 75 | S3method(tidy,step_select_forests)
 76 | S3method(tidy,step_select_infgain)
 77 | S3method(tidy,step_select_linear)
 78 | S3method(tidy,step_select_mrmr)
 79 | S3method(tidy,step_select_relief)
 80 | S3method(tidy,step_select_roc)
 81 | S3method(tidy,step_select_tree)
 82 | S3method(tidy,step_select_vip)
 83 | S3method(tidy,step_select_xtab)
 84 | S3method(tunable,step_select_aov)
 85 | S3method(tunable,step_select_carscore)
 86 | S3method(tunable,step_select_forests)
 87 | S3method(tunable,step_select_infgain)
 88 | S3method(tunable,step_select_linear)
 89 | S3method(tunable,step_select_mrmr)
 90 | S3method(tunable,step_select_relief)
 91 | S3method(tunable,step_select_roc)
 92 | S3method(tunable,step_select_tree)
 93 | S3method(tunable,step_select_vip)
 94 | S3method(tunable,step_select_xtab)
 95 | export("%>%")
 96 | export(cutoff)
 97 | export(entropy)
 98 | export(pull_importances)
 99 | export(step_select_aov)
100 | export(step_select_boruta)
101 | export(step_select_carscore)
102 | export(step_select_fcbf)
103 | export(step_select_forests)
104 | export(step_select_infgain)
105 | export(step_select_linear)
106 | export(step_select_mrmr)
107 | export(step_select_relief)
108 | export(step_select_roc)
109 | export(step_select_tree)
110 | export(step_select_vip)
111 | export(step_select_xtab)
112 | export(top_p)
113 | importFrom(dplyr,filter)
114 | importFrom(dplyr,pull)
115 | importFrom(generics,required_pkgs)
116 | importFrom(generics,tidy)
117 | importFrom(magrittr,"%>%")
118 | importFrom(recipes,add_step)
119 | importFrom(recipes,bake)
120 | importFrom(recipes,prep)
121 | importFrom(recipes,print_step)
122 | importFrom(recipes,rand_id)
123 | importFrom(recipes,recipes_eval_select)
124 | importFrom(recipes,recipes_pkg_check)
125 | importFrom(recipes,step)
126 | importFrom(rlang,.data)
127 | importFrom(rlang,enquos)
128 | importFrom(stats,aov)
129 | importFrom(stats,as.formula)
130 | importFrom(tibble,as_tibble)
131 | importFrom(tibble,tibble)
132 | importFrom(tune,tunable)
133 | 


--------------------------------------------------------------------------------
/docs/pkgdown.js:
--------------------------------------------------------------------------------
  1 | /* http://gregfranko.com/blog/jquery-best-practices/ */
  2 | (function($) {
  3 |   $(function() {
  4 | 
  5 |     $('nav.navbar').headroom();  // attach the headroom plugin to the navbar
  6 | 
  7 |     Toc.init({  // build the table of contents inside #toc from the headings in <main>
  8 |       $nav: $("#toc"),
  9 |       $scope: $("main h2, main h3, main h4, main h5, main h6")
 10 |     });
 11 | 
 12 |     if ($('#toc').length) {  // enable scrollspy only when the page has a #toc element
 13 |       $('body').scrollspy({
 14 |         target: '#toc',
 15 |         offset: $("nav.navbar").outerHeight() + 1  // navbar height plus one pixel
 16 |       });
 17 |     }
 18 | 
 19 |     // Activate popovers on every element marked data-bs-toggle="popover"
 20 |     $('[data-bs-toggle="popover"]').popover({
 21 |       container: 'body',
 22 |       html: true,
 23 |       trigger: 'focus',
 24 |       placement: "top",
 25 |       sanitize: false,  // NOTE(review): presumably disables HTML sanitizing of popover content; confirm against Bootstrap docs
 26 |     });
 27 | 
 28 |     $('[data-bs-toggle="tooltip"]').tooltip();  // activate tooltips on elements marked data-bs-toggle="tooltip"
 29 | 
 30 |   /* Clipboard --------------------------*/
 31 | 
 32 |   function changeTooltipMessage(element, msg) {  // show `msg` in the tooltip of `element`, then put the original title back
 33 |     var tooltipOriginalTitle=element.getAttribute('data-original-title');  // remember the current title
 34 |     element.setAttribute('data-original-title', msg);  // temporarily swap in the message
 35 |     $(element).tooltip('show');
 36 |     element.setAttribute('data-original-title', tooltipOriginalTitle);  // restore the attribute for later hovers
 37 |   }
 38 | 
 39 |   if(ClipboardJS.isSupported()) {
 40 |     $(document).ready(function() {
 41 |       var copyButton = "<button type='button' class='btn btn-primary btn-copy-ex' title='Copy to clipboard' aria-label='Copy to clipboard' data-toggle='tooltip' data-placement='left' data-trigger='hover' data-clipboard-copy><i class='fa fa-copy'></i></button>";
 42 | 
 43 |       $("div.sourceCode").addClass("hasCopyButton");
 44 | 
 45 |       // Insert copy buttons:
 46 |       $(copyButton).prependTo(".hasCopyButton");
 47 | 
 48 |       // Initialize tooltips:
 49 |       $('.btn-copy-ex').tooltip({container: 'body'});
 50 | 
 51 |       // Initialize clipboard:
 52 |       var clipboard = new ClipboardJS('[data-clipboard-copy]', {
 53 |         text: function(trigger) {
 54 |           return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, "");
 55 |         }
 56 |       });
 57 | 
 58 |       clipboard.on('success', function(e) {
 59 |         changeTooltipMessage(e.trigger, 'Copied!');
 60 |         e.clearSelection();
 61 |       });
 62 | 
 63 |       clipboard.on('error', function() {
 64 |         changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy');
 65 |       });
 66 | 
 67 |     });
 68 |   }
 69 | 
 70 |     /* Search marking --------------------------*/
 71 |     var url = new URL(window.location.href);
 72 |     var toMark = url.searchParams.get("q");  // search term passed in the ?q= query parameter, if any
 73 |     var mark = new Mark("main#main");  // mark.js instance scoped to the main content element
 74 |     if (toMark) {  // highlight only when a search term is present
 75 |       mark.mark(toMark, {
 76 |         accuracy: {
 77 |           value: "complementary",
 78 |           limiters: [",", ".", ":", "/"],
 79 |         }
 80 |       });
 81 |     }
 82 | 
 83 |   /* Search --------------------------*/
 84 |   /* Adapted from https://github.com/rstudio/bookdown/blob/2d692ba4b61f1e466c92e78fd712b0ab08c11d31/inst/resources/bs4_book/bs4_book.js#L25 */
 85 |     // Initialise search index on focus
 86 |   var fuse;  // Fuse search index; stays undefined until the search box is first focused
 87 |   $("#search-input").focus(async function(e) {
 88 |     if (fuse) {  // index already built: nothing to do
 89 |       return;
 90 |     }
 91 | 
 92 |     $(e.target).addClass("loading");
 93 |     var response = await fetch($("#search-input").data("search-index"));  // URL comes from the input's data-search-index attribute
 94 |     var data = await response.json();
 95 | 
 96 |     var options = {
 97 |       keys: ["what", "text", "code"],  // fields of each index entry that are searched
 98 |       ignoreLocation: true,
 99 |       threshold: 0.1,
100 |       includeMatches: true,
101 |       includeScore: true,  // searchFuse filters results on this score
102 |     };
103 |     fuse = new Fuse(data, options);
104 | 
105 |     $(e.target).removeClass("loading");
106 |   });
107 | 
108 |   // Use algolia autocomplete
109 |   var options = {  // options passed to the autocomplete plugin below
110 |     autoselect: true,
111 |     debug: true,
112 |     hint: false,
113 |     minLength: 2,
114 |   };
115 |   var q;  // last query searched; appended to the URL when a suggestion is selected
116 | async function searchFuse(query, callback) {  // autocomplete source: passes matching index entries to `callback`
117 |   await fuse;  // NOTE(review): `fuse` is undefined or a Fuse instance here, not a promise, so this does not wait for the index to load
118 | 
119 |   var items;
120 |   if (!fuse) {  // index not built yet: report no suggestions
121 |     items = [];
122 |   } else {
123 |     q = query;
124 |     var results = fuse.search(query, { limit: 20 });
125 |     items = results
126 |       .filter((x) => x.score <= 0.75)  // keep only results whose score is at most 0.75
127 |       .map((x) => x.item);
128 |     if (items.length === 0) {  // placeholder entry that links back to the current page
129 |       items = [{dir:"Sorry 😿",previous_headings:"",title:"No results found.",what:"No results found.",path:window.location.href}];
130 |     }
131 |   }
132 |   callback(items);
133 | }
134 |   $("#search-input").autocomplete(options, [
135 |     {
136 |       name: "content",
137 |       source: searchFuse,
138 |       templates: {
139 |         suggestion: (s) => {
140 |           if (s.title == s.what) {
141 |             return `${s.dir} >	<div class="search-details"> ${s.title}</div>`;
142 |           } else if (s.previous_headings == "") {
143 |             return `${s.dir} >	<div class="search-details"> ${s.title}</div> > ${s.what}`;
144 |           } else {
145 |             return `${s.dir} >	<div class="search-details"> ${s.title}</div> > ${s.previous_headings} > ${s.what}`;
146 |           }
147 |         },
148 |       },
149 |     },
150 |   ]).on('autocomplete:selected', function(event, s) {
151 |     window.location.href = s.path + "?q=" + q + "#" + s.id;
152 |   });
153 |   });
154 | })(window.jQuery || window.$)
155 | 
156 | 
157 | 


--------------------------------------------------------------------------------
/man/step_select_forests.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_forests.R
  3 | \name{step_select_forests}
  4 | \alias{step_select_forests}
  5 | \alias{tidy.step_select_forests}
  6 | \title{Feature selection step using random forest feature importance scores}
  7 | \usage{
  8 | step_select_forests(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = "predictor",
 13 |   trained = FALSE,
 14 |   engine = "ranger",
 15 |   options = list(importance = "permutation"),
 16 |   mtry = NULL,
 17 |   trees = NULL,
 18 |   min_n = NULL,
 19 |   top_p = NA,
 20 |   threshold = NA,
 21 |   cutoff = NA,
 22 |   exclude = NULL,
 23 |   scores = NULL,
 24 |   skip = FALSE,
 25 |   id = recipes::rand_id("select_forests")
 26 | )
 27 | 
 28 | \method{tidy}{step_select_forests}(x, type = "terms", ...)
 29 | }
 30 | \arguments{
 31 | \item{recipe}{A recipe object. The step will be added to the sequence of
 32 | operations for this recipe.}
 33 | 
 34 | \item{...}{One or more selector functions to choose which predictors are
 35 | affected by the step. See [selections()] for more details. For the `tidy`
 36 | method, these are not currently used.}
 37 | 
 38 | \item{outcome}{A character string with the name of the response variable to
 39 | use to calculate the feature importance scores.}
 40 | 
 41 | \item{role}{Not used by this step since no new variables are created.}
 42 | 
 43 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 44 | been estimated.}
 45 | 
 46 | \item{engine}{A rand_forest engine that is supported by parsnip.
 47 | The default is "ranger".}
 48 | 
 49 | \item{options}{A named list of options to pass to the rand_forest engine. For
 50 | example, if `engine = 'ranger'` (the default) then options could be
 51 | `list(importance = 'permutation')` because a feature importance method needs
 52 | to be specified for this engine. This is the default.}
 53 | 
 54 | \item{mtry}{An integer for the number of predictors that will be randomly
 55 | sampled at each split when creating the tree models.}
 56 | 
 57 | \item{trees}{An integer for the number of trees contained in the ensemble.}
 58 | 
 59 | \item{min_n}{An integer for the minimum number of data points in a node that
 60 | are required for the node to be split further.}
 61 | 
 62 | \item{top_p}{An integer that will be used to select the `top_p` predictors
 63 | with the highest feature importance scores. A value of `NA` implies that
 64 | this criterion will be ignored.}
 65 | 
 66 | \item{threshold}{A numeric value between 0 and 1 representing the percentile
 67 | of best scoring features to select. For example `threshold = 0.9` will
 68 | retain only predictors with scores in the top 90th percentile and a smaller
 69 | threshold will select more features. Note that `top_p` and `threshold` are
 70 | mutually exclusive but either can be used in conjunction with `cutoff` to
 71 | select the top-ranked features and those that are larger than the cutoff
 72 | value.}
 73 | 
 74 | \item{cutoff}{A numeric value, in the units of the importance scores, where
 75 | predictors with scores _larger_ than the cutoff will be retained. A value of
 76 | `NA` implies that this criterion will be ignored.}
 77 | 
 78 | \item{exclude}{A character vector of predictor names that will be removed
 79 | from the data. This will be set when `prep()` is used on the recipe and
 80 | should not be set by the user.}
 81 | 
 82 | \item{scores}{A tibble with 'variable' and 'scores' columns containing the
 83 | names of the variables and their feature importance scores. This parameter
 84 | is only produced after the recipe has been trained.}
 85 | 
 86 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 87 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 88 | some operations may not be able to be conducted on new data (e.g.
 89 | processing the outcome variable(s)). Care should be taken when using skip =
 90 | TRUE as it may affect the computations for subsequent operations.}
 91 | 
 92 | \item{id}{A character string that is unique to this step to identify it.}
 93 | 
 94 | \item{x}{A `step_select_forests` object.}
 95 | 
 96 | \item{type}{A character with either 'terms' (the default) to return a
 97 | tibble containing the variables that have been removed by the filter step,
 98 | or 'scores' to return the scores for each variable.}
 99 | }
100 | \value{
101 | An updated version of `recipe` with the new step added to the
102 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
103 |  `terms` column for which predictors were removed.
104 | }
105 | \description{
106 | `step_select_forests` creates a *specification* of a recipe step that selects
107 | a subset of predictors based on the ranking of variable importance using a
108 | `parsnip::rand_forest` supported model.
109 | }
110 | \examples{
111 | library(recipes)
112 | library(parsnip)
113 | 
114 | # load the example cells dataset
115 | data(cells, package = "modeldata")
116 | 
117 | # create a preprocessing recipe
118 | rec <-
119 |  recipe(class ~ ., data = cells[, -1]) \%>\%
120 |  step_select_forests(all_predictors(), outcome = "class", top_p = 10,
121 |                      cutoff = 0.9)
122 | 
123 | prepped <- prep(rec)
124 | 
125 | preproc_data <- juice(prepped)
126 | prepped
127 | }
128 | 


--------------------------------------------------------------------------------
/docs/LICENSE-text.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>License • colino</title><script src="deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="deps/bootstrap-5.1.3/bootstrap.min.css" rel="stylesheet"><script src="deps/bootstrap-5.1.3/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script 
src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="pkgdown.js"></script><meta property="og:title" content="License"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body>
 6 |     <a href="#main" class="visually-hidden-focusable">Skip to contents</a>
 7 |     
 8 | 
 9 |     <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light"><div class="container">
10 |     
11 |     <a class="navbar-brand me-2" href="index.html">colino</a>
12 | 
13 |     <small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">0.0.1</small>
14 | 
15 |     
16 |     <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
17 |       <span class="navbar-toggler-icon"></span>
18 |     </button>
19 | 
20 |     <div id="navbar" class="collapse navbar-collapse ms-3">
21 |       <ul class="navbar-nav me-auto"><li class="nav-item">
22 |   <a class="nav-link" href="reference/index.html">Reference</a>
23 | </li>
24 |       </ul><form class="form-inline my-2 my-lg-0" role="search">
25 |         <input type="search" class="form-control me-sm-2" aria-label="Toggle navigation" name="search-input" data-search-index="search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
26 | 
27 |       <ul class="navbar-nav"><li class="nav-item">
28 |   <a class="external-link nav-link" href="https://github.com/stevenpawley/colino/" aria-label="github">
29 |     <span class="fab fa fab fa-github fa-lg"></span>
30 |      
31 |   </a>
32 | </li>
33 |       </ul></div>
34 | 
35 |     
36 |   </div>
37 | </nav><div class="container template-title-body">
38 | <div class="row">
39 |   <main id="main" class="col-md-9"><div class="page-header">
40 |       <img src="" class="logo" alt=""><h1>License</h1>
41 |       
42 |     </div>
43 | 
44 | <pre>YEAR: 2019
45 | COPYRIGHT HOLDER: Steven Pawley
46 | </pre>
47 | 
48 |   </main></div>
49 | 
50 | 
51 |     <footer><div class="pkgdown-footer-left">
52 |   <p></p><p>Developed by Steven Pawley, Max Kuhn, Rowan Jacques-Hamilton.</p>
53 | </div>
54 | 
55 | <div class="pkgdown-footer-right">
56 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
57 | </div>
58 | 
59 |     </footer></div>
60 | 
61 |   
62 | 
63 |   
64 | 
65 |   </body></html>
66 | 
67 | 


--------------------------------------------------------------------------------
/R/misc.R:
--------------------------------------------------------------------------------
  1 | check_zero_one <- function(x) {
  2 |   if (is.na(x)) {
  3 |     return(x)
  4 |   } else {
  5 |     if (is.numeric(x)) {
  6 |       if (x >= 1 | x <= 0) {
  7 |         rlang::abort("`threshold` should be on (0, 1).")
  8 |       }
  9 |     } else {
 10 |       rlang::abort("`threshold` should be numeric.")
 11 |     }
 12 |   }
 13 |   return(x)
 14 | }
 15 | 
 16 | check_top_p <- function(x, n) {
 17 |   # checks on x (top_p) and n (number of features)
 18 |   if (is.na(x)) {
 19 |     return(x)
 20 |   }
 21 | 
 22 |   if (!is.numeric(x)) {
 23 |     rlang::abort("`top_p` should be numeric.")
 24 |   }
 25 | 
 26 |   if (!is.integer(x)) {
 27 |     x <- as.integer(x)
 28 |   }
 29 | 
 30 |   msg <- paste0("`top_p` should be on (1, ", n, ") based on the number of features available.")
 31 | 
 32 |   # return top_n = all features if top_n > n
 33 |   if (x >= n) {
 34 |     rlang::warn(msg)
 35 |     x <- min(n - 1, x)
 36 | 
 37 |   # return a single feature if top_p < 1
 38 |   } else if (x < 1) {
 39 |     rlang::warn(msg)
 40 |     x <- 1
 41 |   }
 42 | 
 43 |   return(x)
 44 | }
 45 | 
 46 | check_criteria <- function(top_p, threshold, cl) {
 47 |   if (is.na(top_p) & is.na(threshold)) {
 48 |     msg <- paste0(
 49 |       "For `",
 50 |       cl[[1]],
 51 |       "`, `top_p` and `threshold` cannot both be missing."
 52 |     )
 53 |     rlang::abort(msg)
 54 |   }
 55 |   invisible(NULL)
 56 | }
 57 | 
 58 | #' Select features using `top_p` or `threshold`.
 59 | #'
 60 | #' Feature selection using either the `top_p` or `threshold` features OR
 61 | #' `cutoff` where cutoff refers to the absolute numeric value of the feature
 62 | #' importance scores.
 63 | #'
 64 | #' @details
 65 | #' `dual_filter` selects feature that are selected using either (`top_p`,
 66 | #' `threshold`) or `cutoff` or both. If top_p/threshold and cutoff are both used
 67 | #' then features are selected using OR. For example, if top_p selects features 1
 68 | #' & 2, and threshold selects features 1 & 3, then the selected features =
 69 | #' 1,2,3.
 70 | #'
 71 | #' @param x a named numeric vector of scores per feature
 72 | #' @param top_p an integer specifying the number of top-performing features to
 73 | #'   retain
 74 | #' @param threshold a numeric with percentile of top-performing features to
 75 | #'   retain. For example, `threshold = 0.9` will only retain features that are
 76 | #'   in the top 90th percentile. A smaller value of threshold will select
 77 | #'   more features.
 78 | #' @param cutoff a numeric with the value that represents the cutoff in the
 79 | #'   scores in `x` by which to retain/discard features.
 80 | #' @param maximize logical to indicate whether `top_p`, `threshold` and `cutoff`
 81 | #'   are used to keep features where high scores = 'best' (maximize = TRUE) or
 82 | #'   where low scores = 'best' (maximize = FALSE).
 83 | #'
 84 | #' @return character vector of feature names to exclude
 85 | #' @keywords internal
 86 | dual_filter <- function(x, top_p, threshold, cutoff, maximize) {
 87 |   if (!is.na(top_p) & !is.na(threshold)) {
 88 |     rlang::abort("`top_p` and `threshold` are mutually exclusive")
 89 |   }
 90 | 
 91 |   na_x <- x[is.na(x)]
 92 |   x <- x[!is.na(x)]
 93 |   x <- sort(x, decreasing = maximize)
 94 | 
 95 |   p <- length(x)
 96 | 
 97 |   # assign logical selection variable using top_p
 98 |   if (!is.na(top_p)) {
 99 |     top_p_lgl <- seq_along(x) <= top_p
100 |   } else {
101 |     top_p_lgl <- rep(FALSE, p)
102 |   }
103 | 
104 |   # assign logical selection variable using threshold
105 |   if (!is.na(threshold)) {
106 |     p_to_exceed <- stats::quantile(x, threshold)
107 | 
108 |     if (maximize) {
109 |       threshold_lgl <- x >= p_to_exceed
110 |     } else {
111 |       threshold_lgl <- x < p_to_exceed
112 |     }
113 | 
114 |   } else {
115 |     threshold_lgl <- rep(FALSE, p)
116 |   }
117 | 
118 |   # assign logical selection variable using cutoff
119 |   if (!is.na(cutoff)) {
120 |     if (maximize) {
121 |       cutoff_lgl <- x >= cutoff
122 |     } else {
123 |       cutoff_lgl <- x <= cutoff
124 |     }
125 | 
126 |   } else {
127 |     cutoff_lgl <- rep(FALSE, p)
128 |   }
129 | 
130 |   keep_lgl <- top_p_lgl | threshold_lgl | cutoff_lgl
131 |   excluded <- c(names(x)[!keep_lgl], names(na_x))
132 | 
133 |   return(excluded)
134 | }
135 | 
# Infer the modelling mode from the outcome vector `y`: a factor means
# "classification", anything else means "regression".
check_outcome <- function(y) {
  if (inherits(y, "factor")) {
    "classification"
  } else {
    "regression"
  }
}
139 | 
# Resolve the name of the single outcome column for a selection step.
#
# `x` is the step object; when `x$outcome` holds a non-missing value it must be
# a character string and is used directly. Otherwise the outcome is looked up
# in `info` by taking the `variable` entries whose `role` is "outcome".
# `training` is the training data; the resolved name must be one of its
# column names. Aborts when zero or several outcomes are found, or when the
# outcome is not a column of `training`. Returns the outcome column name.
get_outcome <- function(x, training, info) {
  outcome_given <- !all(is.na(x$outcome))

  if (outcome_given) {
    if (!all(is.character(x$outcome))) {
      rlang::abort("Outcome variable must be supplied as a character string")
    }

    outcome_col <- x$outcome

  } else {
    # fall back to the variable(s) carrying the outcome role
    outcome_rows <- dplyr::filter(info, .data$role == "outcome")
    outcome_col <- dplyr::pull(outcome_rows, "variable")
  }

  n_outcomes <- length(outcome_col)

  if (n_outcomes > 1) {
    rlang::abort(
      paste(
        "Multiple outcome variables are present in the recipe.",
        "Only a single outcome variable can be accepted by any `step_select` functions.",
        "Please supply the outcome variable using the `outcome` argument"
      )
    )
  }

  if (n_outcomes < 1) {
    rlang::abort(
      paste(
        "An outcome variable was not found.",
        "Please ensure an outcome variable is specified."
      )
    )
  }

  # exactly one candidate remains at this point
  outcome_missing <- !outcome_col %in% names(training)
  if (outcome_missing) {
    rlang::abort(paste0("Outcome variable '", outcome_col, "' not found"))
  }

  return(outcome_col)
}
177 | 


--------------------------------------------------------------------------------
/man/step_select_relief.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/step_select_relief.R
  3 | \name{step_select_relief}
  4 | \alias{step_select_relief}
  5 | \alias{tidy.step_select_relief}
  6 | \title{Feature selection step using the Relief algorithm}
  7 | \usage{
  8 | step_select_relief(
  9 |   recipe,
 10 |   ...,
 11 |   outcome = NULL,
 12 |   role = NA,
 13 |   trained = FALSE,
 14 |   top_p = NA,
 15 |   threshold = NA,
 16 |   cutoff = NA,
 17 |   neighbors = 5,
 18 |   sample_size = 10,
 19 |   exclude = NULL,
 20 |   scores = NULL,
 21 |   skip = FALSE,
 22 |   id = recipes::rand_id("select_relief")
 23 | )
 24 | 
 25 | \method{tidy}{step_select_relief}(x, ...)
 26 | }
 27 | \arguments{
 28 | \item{recipe}{A recipe object. The step will be added to the sequence of
 29 | operations for this recipe.}
 30 | 
 31 | \item{...}{One or more selector functions to choose which predictors are
 32 | affected by the step. See [selections()] for more details. For the `tidy`
 33 | method, these are not currently used.}
 34 | 
\item{outcome}{A character string with the name of the response variable to
use to evaluate the Relief scores of the predictors.}
 37 | 
 38 | \item{role}{Not used by this step since no new variables are created.}
 39 | 
 40 | \item{trained}{A logical to indicate if the quantities for preprocessing have
 41 | been estimated.}
 42 | 
\item{top_p}{An integer that will be used to select the `top_p` predictors
with the highest Relief scores. A value of `NA` implies that this criterion
will be ignored.}
 46 | 
\item{threshold}{A numeric value between 0 and 1 representing the percentile
of best scoring features to select. For example `threshold = 0.9` will
retain only predictors with scores in the top 90th percentile and a smaller
threshold will select more features. Note that `top_p` and `threshold` are
mutually exclusive but either can be used in conjunction with `cutoff` to
select the top-ranked features and those whose scores are larger than the
cutoff value.}
 54 | 
\item{cutoff}{A numeric value, in units of the Relief score, where predictors
with scores _larger_ than the cutoff will be retained. A value of `NA` implies
that this criterion will be ignored.}
 58 | 
\item{neighbors}{An integer with the number of neighbors to find for each
sampled instance. Default is 5.}
 61 | 
 62 | \item{sample_size}{An integer with the number of instances to sample. Default
 63 | is 10.}
 64 | 
 65 | \item{exclude}{A character vector of predictor names that will be removed
 66 | from the data. This will be set when `prep()` is used on the recipe and
 67 | should not be set by the user.}
 68 | 
\item{scores}{A tibble with 'variable' and 'scores' columns containing the
names of the variables and their Relief scores. This parameter is
only produced after the recipe has been trained.}
 72 | 
 73 | \item{skip}{A logical. Should the step be skipped when the recipe is baked by
 74 | bake.recipe()? While all operations are baked when prep.recipe() is run,
 75 | some operations may not be able to be conducted on new data (e.g.
 76 | processing the outcome variable(s)). Care should be taken when using skip =
 77 | TRUE as it may affect the computations for subsequent operations.}
 78 | 
 79 | \item{id}{A character string that is unique to this step to identify it.}
 80 | 
 81 | \item{x}{A `step_select_relief` object.}
 82 | }
 83 | \value{
 84 | An updated version of `recipe` with the new step added to the
 85 |  sequence of existing steps (if any). For the `tidy` method, a tibble with a
 86 |  `terms` column for which predictors were removed.
 87 | }
 88 | \description{
 89 | Relief-based algorithms use nearest neighbors of randomly sampled
 90 | observations (without replacement) to derive feature weights/scores that
 91 | describe the relevance of each feature to the target variable. The feature
 92 | weights represent the differences between the normalized feature values from
 93 | each randomly sampled observation and a neighboring observation. If the
 94 | neighboring observation's class is the same as the sampled observation
 95 | (termed a 'hit') but the feature values are different, then this reduces the
 96 | score on the basis that widely varying feature values for the same class are
 97 | not desirable. Conversely, if a neighboring observation's class is different
 98 | from the sampled observation (termed a 'miss') and the feature values are
 99 | different, then this increases the score on the basis that observations of
100 | different classes are widely separated by their feature values. The feature
101 | weights / scores range from -1 (worst) to +1 (best).
102 | }
103 | \details{
104 | `step_select_relief` creates a *specification* of a recipe step that selects
105 | a subset of predictors based on the scores of the relief algorithm. This step
106 | requires the FSinR package to be installed. The top `top_p` scoring features,
107 | or features whose scores occur in the top percentile `threshold` will be
108 | retained as new predictors.
109 | 
110 | 
111 | 
112 | The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
113 | unspecified.
114 | }
115 | \examples{
116 | \dontrun{
117 | library(recipes)
118 | 
119 | data(cells, package = "modeldata")
120 | 
121 | rec <- recipe(class ~ ., data = cells[, -1]) \%>\%
122 |   step_select_relief(
123 |     all_predictors(),
124 |     outcome = "class",
125 |     top_p = 10
126 |   )
127 | 
128 |   prepped <- prep(rec)
129 |   new_data <- bake(prepped, new_data = NULL)
130 |   prepped
131 | }
132 | }
133 | \concept{preprocessing}
134 | \concept{supervised_filter}
135 | \keyword{datagen}
136 | 


--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en">
 3 | <head>
 4 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
 5 | <meta charset="utf-8">
 6 | <meta http-equiv="X-UA-Compatible" content="IE=edge">
 7 | <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
 8 | <title>Page not found (404) • colino</title>
 9 | <script src="https://stevenpawley.github.io/colino/deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
10 | <link href="https://stevenpawley.github.io/colino/deps/bootstrap-5.1.3/bootstrap.min.css" rel="stylesheet">
11 | <script src="https://stevenpawley.github.io/colino/deps/bootstrap-5.1.3/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous">
12 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous">
13 | <!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="https://stevenpawley.github.io/colino/pkgdown.js"></script><meta property="og:title" content="Page not found (404)">
14 | <!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
15 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
16 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
17 | <![endif]-->
18 | </head>
19 | <body>
20 |     <a href="https://stevenpawley.github.io/colino/#main" class="visually-hidden-focusable">Skip to contents</a>
21 |     
22 | 
23 |     <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light"><div class="container">
24 |     
25 |     <a class="navbar-brand me-2" href="https://stevenpawley.github.io/colino/index.html">colino</a>
26 | 
27 |     <small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">0.0.1</small>
28 | 
29 |     
30 |     <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
31 |       <span class="navbar-toggler-icon"></span>
32 |     </button>
33 | 
34 |     <div id="navbar" class="collapse navbar-collapse ms-3">
35 |       <ul class="navbar-nav me-auto">
36 | <li class="nav-item">
37 |   <a class="nav-link" href="https://stevenpawley.github.io/colino/reference/index.html">Reference</a>
38 | </li>
39 |       </ul>
40 | <form class="form-inline my-2 my-lg-0" role="search">
41 |         <input type="search" class="form-control me-sm-2" aria-label="Toggle navigation" name="search-input" data-search-index="search.json" id="search-input" placeholder="Search for" autocomplete="off">
42 | </form>
43 | 
44 |       <ul class="navbar-nav">
45 | <li class="nav-item">
46 |   <a class="external-link nav-link" href="https://github.com/stevenpawley/colino/" aria-label="github">
47 |     <span class="fab fa fab fa-github fa-lg"></span>
48 |      
49 |   </a>
50 | </li>
51 |       </ul>
52 | </div>
53 | 
54 |     
55 |   </div>
56 | </nav><div class="container template-title-body">
57 | <div class="row">
58 |   <main id="main" class="col-md-9"><div class="page-header">
59 |       <img src="https://stevenpawley.github.io/colino/" class="logo" alt=""><h1>Page not found (404)</h1>
60 |       
61 |     </div>
62 | 
63 | Content not found. Please use links in the navbar.
64 | 
65 |   </main>
66 | </div>
67 | 
68 | 
69 |     <footer><div class="pkgdown-footer-left">
70 |   <p></p>
71 | <p>Developed by Steven Pawley, Max Kuhn, Rowan Jacques-Hamilton.</p>
72 | </div>
73 | 
74 | <div class="pkgdown-footer-right">
75 |   <p></p>
76 | <p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
77 | </div>
78 | 
79 |     </footer>
80 | </div>
81 | 
82 |   
83 | 
84 |   
85 | 
86 |   </body>
87 | </html>
88 | 


--------------------------------------------------------------------------------
/R/step_select_boruta.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using Boruta
  2 | #'
  3 | #' `step_select_boruta` creates a *specification* of a recipe step that selects
  4 | #' a subset of predictors using the Boruta feature selection approach.
  5 | #'
  6 | #' The Boruta algorithm technically is a wrapper approach that uses random
  7 | #' forests to test whether the feature importance scores obtained on the
  8 | #' original data are higher than best of the scores obtained when the variables
  9 | #' are randomly permuted. These permuted features are termed 'shadow' features.
 10 | #' If the scores for any original feature are higher than the best of the scores
 11 | #' for the randomly permuted features, then this is marked as a 'hit'. Features
 12 | #' are confirmed or rejected based on a confidence threshold (default is p =
 13 | #' 0.01) applied to the tails of the binomial distribution with p = 0.5.
 14 | #' Features that do not fall within the lower (reject) or upper (accept) tails
 15 | #' of the distribution are labelled as 'tentative'. Rejected features are
 16 | #' dropped from the feature set and the procedure is repeated until no more
 17 | #' 'tentative' features exist, or that a maximum number of runs are reached.
 18 | #'
 19 | #' @inheritParams step_select_aov
 20 | #' @inherit step_select_aov return
 21 | #' @param outcome A character string with the name of the response variable to
 22 | #'   use to calculate the feature importance scores.
 23 | #' @param role Not used by this step since no new variables are created.
 24 | #' @param options A list of options to pass to `Boruta::Boruta()`. The defaults
 25 | #'   use Boruta's defaults. *Note* that `x` and `y` should not be passed here.
 26 | #' @param res The `Boruta::Boruta` object is stored here once this preprocessing
 27 | #'   step has been trained by `prep.recipe()`.
 28 | #'
 29 | #' @export
 30 | #' @examples
 31 | #' library(recipes)
 32 | #' library(parsnip)
 33 | #'
 34 | #' # load the example iris dataset
 35 | #' data(cells, package = "modeldata")
 36 | #'
 37 | #' # create a preprocessing recipe
 38 | #' rec <-
 39 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 40 | #'  step_select_boruta(all_predictors(), outcome = "class")
 41 | #'
 42 | #' prepped <- prep(rec)
 43 | #'
 44 | #' preproc_data <- juice(prepped)
 45 | #' prepped
 46 | step_select_boruta <- function(
 47 |   recipe,
 48 |   ...,
 49 |   outcome = NULL,
 50 |   role = "predictor",
 51 |   trained = FALSE,
 52 |   exclude = NULL,
 53 |   options = list(pValue = 0.01, mcAdj = TRUE, maxRuns = 100),
 54 |   res = NULL,
 55 |   skip = FALSE,
 56 |   id = recipes::rand_id("select_boruta")) {
 57 | 
 58 |   recipes::recipes_pkg_check("Boruta")
 59 | 
 60 |   recipes::add_step(
 61 |     recipe,
 62 |     step_select_boruta_new(
 63 |       terms = recipes::ellipse_check(...),
 64 |       trained = trained,
 65 |       outcome = outcome,
 66 |       role = role,
 67 |       exclude = exclude,
 68 |       options = options,
 69 |       res = res,
 70 |       skip = skip,
 71 |       id = id
 72 |     )
 73 |   )
 74 | }
 75 | 
# Low-level constructor for `step_select_boruta` objects: a thin wrapper around
# `recipes::step()` that sets the class of new step objects via
# `subclass = "select_boruta"`.
#
# Every argument is stored on the step under the same name. It is called both
# by `step_select_boruta()` (untrained specification) and by
# `prep.step_select_boruta()` (trained step carrying `exclude` and `res`).
#' @importFrom recipes step
step_select_boruta_new <- function(terms, role, trained, outcome, exclude,
                                   options, res, skip, id) {
  recipes::step(
    subclass = "select_boruta",
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    exclude = exclude,
    options = options,
    res = res,
    skip = skip,
    id = id
  )
}
 93 | 
 94 | #' @export
 95 | prep.step_select_boruta <- function(x, training, info = NULL, ...) {
 96 | 
 97 |   # translate the terms arguments
 98 |   x_names <- recipes::recipes_eval_select(x$terms, training, info)
 99 |   y_name <- recipes::recipes_eval_select(x$outcome, training, info)
100 |   y_name <- y_name[1]
101 | 
102 |   if (length(x_names) > 0) {
103 | 
104 |     call <- rlang::call2(
105 |       .fn = "Boruta",
106 |       .ns = "Boruta",
107 |       x = rlang::quo(training[, x_names]),
108 |       y = rlang::quo(training[[y_name]]),
109 |       !!!x$options
110 |     )
111 | 
112 |     res <- rlang::eval_tidy(call)
113 | 
114 |     exclude <- names(res$finalDecision[res$finalDecision == "Rejected"])
115 | 
116 |   } else {
117 |     exclude <- character()
118 |   }
119 | 
120 |   step_select_boruta_new(
121 |     terms = x$terms,
122 |     trained = TRUE,
123 |     role = x$role,
124 |     outcome = y_name,
125 |     exclude = exclude,
126 |     options = x$options,
127 |     res = res,
128 |     skip = x$skip,
129 |     id = x$id
130 |   )
131 | }
132 | 
# Apply the trained Boruta step: drop every column of `new_data` whose name is
# listed in `object$exclude` and return the result as a tibble.
#' @export
bake.step_select_boruta <- function(object, new_data, ...) {
  if (length(object$exclude) > 0) {
    keep_cols <- !colnames(new_data) %in% object$exclude
    # drop = FALSE keeps a plain data.frame rectangular when a single column
    # remains (otherwise it would collapse to a bare vector)
    new_data <- new_data[, keep_cols, drop = FALSE]
  }
  as_tibble(new_data)
}
140 | 
# Print a one-line summary of the step; once the step is trained the number of
# excluded predictors is appended. Returns `x` invisibly.
#' @export
print.step_select_boruta <-
  function(x, width = max(20, options()$width - 30), ...) {
    cat("Boruta feature selection")

    # the exclusion count only exists after prep()
    trained_suffix <- ""
    if (recipes::is_trained(x)) {
      trained_suffix <- paste0(" (", length(x$exclude), " excluded)")
    }
    cat(trained_suffix, "\n", sep = "")

    invisible(x)
  }
154 | 
#' @rdname step_select_boruta
#' @param x A `step_select_boruta` object.
#' @param type A character with either 'terms' (the default) to return a
#'   tibble containing the variables that have been removed by the filter step,
#'   or 'scores' to return the scores for each variable.
#' @export
tidy.step_select_boruta <- function(x, type = "terms", ...) {
  # All work is delegated to `tidy_filter_step()`, which is defined outside
  # this file.
  # NOTE(review): the Boruta step object has no `scores` field (see
  # `step_select_boruta_new()`); confirm that `tidy_filter_step()` handles
  # `type = "scores"` sensibly for this step.
  tidy_filter_step(x, type)
}
164 | 
# Packages that must be installed for this step to be used: colino itself
# plus Boruta.
#' @rdname required_pkgs.colino
#' @export
required_pkgs.step_select_boruta <- function(x, ...) {
  step_pkgs <- c("colino", "Boruta")
  step_pkgs
}
170 | 


--------------------------------------------------------------------------------
/docs/reference/pipe.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="description" content="See magrittr::%&amp;gt;% for details."><title>Pipe operator — %&gt;% • colino</title><script src="../deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="../deps/bootstrap-5.1.3/bootstrap.min.css" rel="stylesheet"><script src="../deps/bootstrap-5.1.3/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" 
integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="../pkgdown.js"></script><meta property="og:title" content="Pipe operator — %&gt;%"><meta property="og:description" content="See magrittr::%&amp;gt;% for details."><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body>
 6 |     <a href="#main" class="visually-hidden-focusable">Skip to contents</a>
 7 |     
 8 | 
 9 |     <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light"><div class="container">
10 |     
11 |     <a class="navbar-brand me-2" href="../index.html">colino</a>
12 | 
13 |     <small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">0.0.1</small>
14 | 
15 |     
16 |     <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
17 |       <span class="navbar-toggler-icon"></span>
18 |     </button>
19 | 
20 |     <div id="navbar" class="collapse navbar-collapse ms-3">
21 |       <ul class="navbar-nav me-auto"><li class="active nav-item">
22 |   <a class="nav-link" href="../reference/index.html">Reference</a>
23 | </li>
24 |       </ul><form class="form-inline my-2 my-lg-0" role="search">
25 |         <input type="search" class="form-control me-sm-2" aria-label="Toggle navigation" name="search-input" data-search-index="../search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
26 | 
27 |       <ul class="navbar-nav"><li class="nav-item">
28 |   <a class="external-link nav-link" href="https://github.com/stevenpawley/colino/" aria-label="github">
29 |     <span class="fab fa fab fa-github fa-lg"></span>
30 |      
31 |   </a>
32 | </li>
33 |       </ul></div>
34 | 
35 |     
36 |   </div>
37 | </nav><div class="container template-reference-topic">
38 | <div class="row">
39 |   <main id="main" class="col-md-9"><div class="page-header">
40 |       <img src="" class="logo" alt=""><h1>Pipe operator</h1>
41 |       <small class="dont-index">Source: <a href="https://github.com/stevenpawley/colino/blob/HEAD/R/utils-pipe.R" class="external-link"><code>R/utils-pipe.R</code></a></small>
42 |       <div class="d-none name"><code>pipe.Rd</code></div>
43 |     </div>
44 | 
45 |     <div class="ref-description section level2">
46 |     <p>See <code>magrittr::%&gt;%</code> for details.</p>
47 |     </div>
48 | 
49 |     <div class="section level2">
50 |     <h2 id="ref-usage">Usage<a class="anchor" aria-label="anchor" href="#ref-usage"></a></h2>
51 |     <div class="sourceCode"><pre class="sourceCode r"><code><span><span class="va">lhs</span> <span class="op">%&gt;%</span> <span class="va">rhs</span></span></code></pre></div>
52 |     </div>
53 | 
54 | 
55 |   </main></div>
56 | 
57 | 
58 |     <footer><div class="pkgdown-footer-left">
59 |   <p></p><p>Developed by Steven Pawley, Max Kuhn, Rowan Jacques-Hamilton.</p>
60 | </div>
61 | 
62 | <div class="pkgdown-footer-right">
63 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
64 | </div>
65 | 
66 |     </footer></div>
67 | 
68 |   
69 | 
70 |   
71 | 
72 |   </body></html>
73 | 
74 | 


--------------------------------------------------------------------------------
/R/step_select_roc.R:
--------------------------------------------------------------------------------
  1 | #' Filter Numeric Predictors using ROC Curve
  2 | #'
  3 | #' `step_select_roc` creates a *specification* of a recipe step that will
  4 | #'  filter predictors using their relationship with the outcome as measured
  5 | #'  using a Receiver Operating Characteristic curve.
  6 | #'
  7 | #' @inheritParams step_select_aov
  8 | #' @inherit step_select_aov return
  9 | #' @param outcome A single character string that specifies a single categorical
 10 | #'  variable to be used as the class.
 11 | #' @param role Not used by this step since no new variables are created. The
 12 | #'  value is only stored on the step object, for consistency with other
 13 | #'  recipe steps.
 14 | #'
 15 | #' @keywords datagen
 16 | #' @concept preprocessing
 17 | #' @concept supervised_filter
 18 | #' @export
 19 | #' @details
 20 | #'
 21 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 22 | #' unspecified.
 23 | #'
 24 | #' The ROC AUC will be set to be 1 - AUC if the value is less than 0.50.
 25 | #'
 26 | #' @examples
 27 | #' data(cells, package = "modeldata")
 28 | #'
 29 | #' rec <-
 30 | #'   recipe(class ~ ., data = cells[, -1]) %>%
 31 | #'   step_select_roc(all_predictors(), outcome = "class", top_p = 10, cutoff = 0.9) %>%
 32 | #'   prep()
 33 | #'
 34 | #' rec %>% bake(all_predictors(), new_data = NULL) %>% names()
 35 | #'
 36 | #' # Use ROC values to select but always keep at least one:
 37 | #' rec <-
 38 | #'   recipe(class ~ ., data = cells[, -1]) %>%
 39 | #'   step_select_roc(
 40 | #'     all_predictors(),
 41 | #'     outcome = "class",
 42 | #'     top_p = 1,
 43 | #'     cutoff = 0.99
 44 | #'   ) %>%
 45 | #'   prep()
 46 | #'
 47 | #' rec %>% juice(all_predictors()) %>% names()
 48 | step_select_roc <- function(recipe, ..., outcome, role = "predictor",
 49 |                             trained = FALSE, threshold = NA, top_p = NA,
 50 |                             cutoff = NA, exclude = NULL, skip = FALSE,
 51 |                             id = recipes::rand_id("select_roc")) {
 52 |   # Bundle the selector terms and all settings into a step object, then
 53 |   # append that step to the recipe.
 54 |   new_step <- step_select_roc_new(
 55 |     terms = recipes::ellipse_check(...),
 56 |     outcome = outcome,
 57 |     role = role,
 58 |     trained = trained,
 59 |     top_p = top_p,
 60 |     threshold = threshold,
 61 |     cutoff = cutoff,
 62 |     exclude = exclude,
 63 |     skip = skip,
 64 |     id = id
 65 |   )
 66 |   recipes::add_step(recipe, new_step)
 67 | }
 68 | 
 69 | # Internal constructor for `select_roc` steps.
 70 | #
 71 | # Passes every field, exactly as supplied, to `recipes::step()` together with
 72 | # the subclass name. It is called when the step is first added to a recipe
 73 | # and again by `prep()`, which rebuilds the step with `trained = TRUE` and
 74 | # the computed `exclude`.
 75 | step_select_roc_new <- function(terms, outcome, role, trained, top_p,
 76 |                                 threshold, cutoff, exclude, skip, id) {
 77 |   recipes::step(
 78 |     subclass = "select_roc",
 79 |     terms = terms, outcome = outcome,
 80 |     role = role, trained = trained,
 81 |     top_p = top_p, threshold = threshold,
 82 |     cutoff = cutoff, exclude = exclude,
 83 |     skip = skip, id = id
 84 |   )
 85 | }
 86 | 
 87 | # Score a single predictor `x` by its ROC AUC against the outcome `y`.
 88 | #
 89 | # An outcome with exactly two levels is scored with `pROC::roc()`; any other
 90 | # outcome goes through `pROC::multiclass.roc()`. Messages and warnings raised
 91 | # while fitting are silenced, and a failed fit gives `NA_real_`.
 92 | roc_calc <- function(x, y) {
 93 |   two_class <- length(levels(y)) == 2
 94 | 
 95 |   fit <- suppressMessages(suppressWarnings(try(
 96 |     if (two_class) {
 97 |       pROC::roc(y, x, direction = "auto")
 98 |     } else {
 99 |       pROC::multiclass.roc(y, x, direction = "auto")
100 |     },
101 |     silent = TRUE
102 |   )))
103 | 
104 |   if (inherits(fit, "try-error")) {
105 |     return(NA_real_)
106 |   }
107 |   unname(pROC::auc(fit))
108 | }
109 | 
110 | #' @export
111 | prep.step_select_roc <- function(x, training, info = NULL, ...) {
112 |   # Use the outcome name resolved against the training data (first match), as
113 |   # the other step_select_* prep methods do, instead of the raw `x$outcome`.
114 |   y_name <- recipes::recipes_eval_select(x$outcome, training, info)[1]
115 |   recipes::check_type(training[, y_name], quant = FALSE)
116 |   x_names <- recipes::recipes_eval_select(x$terms, training, info)
117 | 
118 |   if (length(x_names) > 0) {
119 |     recipes::check_type(training[, x_names])
120 |     # check criteria
121 |     check_criteria(x$top_p, x$threshold, match.call())
122 |     check_zero_one(x$threshold)
123 |     x$top_p <- check_top_p(x$top_p, length(x_names))
124 | 
125 |     # score each predictor by ROC AUC, then pick the columns to exclude
126 |     y <- training[[y_name]]
127 |     scores <- purrr::map_dbl(training[, x_names], ~ roc_calc(.x, y))
128 |     exclude_chr <- dual_filter(scores, x$top_p, x$threshold, x$cutoff,
129 |                                maximize = TRUE)
130 |   } else {
131 |     exclude_chr <- character()
132 |   }
133 | 
134 |   step_select_roc_new(
135 |     terms = x$terms,
136 |     outcome = x$outcome,
137 |     role = x$role,
138 |     trained = TRUE,
139 |     top_p = x$top_p,
140 |     threshold = x$threshold,
141 |     cutoff = x$cutoff,
142 |     exclude = exclude_chr,
143 |     skip = x$skip,
144 |     id = x$id
145 |   )
146 | }
147 | 
148 | #' @export
149 | bake.step_select_roc <- function(object, new_data, ...) {
150 |   # any_of() replaces the superseded one_of(); absent names are skipped.
151 |   if (length(object$exclude) > 0) {
152 |     new_data <- dplyr::select(new_data, -dplyr::any_of(object$exclude))
153 |   }
154 |   new_data
155 | }
156 | 
157 | #' @export
158 | print.step_select_roc <-
159 |   function(x, width = max(20, options()$width - 30), ...) {
160 |     # Title first; the excluded count is appended only for a trained step.
161 |     # `width` is accepted but never read.
162 |     cat("ROC curve feature selection")
163 | 
164 |     trained_note <- if (recipes::is_trained(x)) {
165 |       paste0(" (", length(x$exclude), " excluded)")
166 |     }
167 |     cat(trained_note, "\n", sep = "")
168 |     invisible(x)
169 |   }
170 | 
171 | #' @rdname step_select_roc
172 | #' @param x A `step_select_roc` object.
173 | #' @export
174 | tidy.step_select_roc <- function(x, ...) {
175 |   tidy_filter_step(x, type = "terms") # fixed type; `...` is not forwarded
176 | }
177 | 
178 | #' @export
179 | tunable.step_select_roc <- function(x, ...) {
180 |   # One entry per tunable argument, in the same order as `name` below.
181 |   param_calls <- list(
182 |     list(pkg = "colino", fun = "top_p"),
183 |     list(pkg = "dials", fun = "threshold", range = c(0, 1)),
184 |     list(pkg = "colino", fun = "cutoff")
185 |   )
186 |   tibble::tibble(
187 |     name = c("top_p", "threshold", "cutoff"),
188 |     call_info = param_calls,
189 |     source = "recipe", component = "step_select_roc", component_id = x$id
190 |   )
191 | }
192 | 
193 | #' @rdname required_pkgs.colino
194 | #' @export
195 | required_pkgs.step_select_roc <- function(x, ...) {
196 |   c("colino", "pROC") # pROC provides the ROC/AUC functions used by roc_calc()
197 | }
198 | 


--------------------------------------------------------------------------------
/R/step_select_mrmr.R:
--------------------------------------------------------------------------------
  1 | #' Apply minimum Redundancy Maximum Relevance Feature Selection (mRMR)
  2 | #'
  3 | #' `step_select_mrmr` creates a *specification* of a recipe step that will apply
  4 | #' minimum Redundancy Maximum Relevance Feature Selection (mRMR) to numeric
  5 | #' data. The top `top_p` scoring features, or features whose scores occur in the
  6 | #' top percentile `threshold` will be retained as new predictors.
  7 | #'
  8 | #' @inheritParams step_select_aov
  9 | #' @inherit step_select_aov return
 10 | #' @param role Not used by this step since no new variables are created
 11 | #' @param outcome A character string specifying the name of response variable
 12 | #'   used to evaluate mRMR.
 13 | #' @param threads An integer specifying the number of threads to use for
 14 | #'   processing. The default = 0 uses all available threads.
 15 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 16 | #'   names of the variables and their mRMR scores. This parameter is only
 17 | #'   produced after the recipe has been trained.
 18 | #'
 19 | #' @keywords datagen
 20 | #' @concept preprocessing
 21 | #' @concept supervised_filter
 22 | #' @export
 23 | #' @details
 24 | #'
 25 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 26 | #' unspecified.
 27 | #'
 28 | #' @examples
 29 | #' library(recipes)
 30 | #'
 31 | #' data(cells, package = "modeldata")
 32 | #'
 33 | #' rec <-
 34 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 35 | #'  step_select_mrmr(
 36 | #'    all_predictors(),
 37 | #'    outcome = "class",
 38 | #'    top_p = 10
 39 | #'  )
 40 | #'
 41 | #' prepped <- prep(rec)
 42 | #'
 43 | #' new_data <- bake(prepped, new_data = NULL)
 44 | #' prepped
 45 | step_select_mrmr <- function(
 46 |   recipe, ...,
 47 |   outcome = NULL,
 48 |   role = NA,
 49 |   trained = FALSE,
 50 |   top_p = NA,
 51 |   threshold = NA,
 52 |   cutoff = NA,
 53 |   threads = 0,
 54 |   exclude = NULL,
 55 |   scores = NULL,
 56 |   skip = FALSE,
 57 |   id = recipes::rand_id("select_mrmr")) {
 58 |   # praznik supplies the MRMR routine used when the step is prepped; check
 59 |   # for it before doing anything else.
 60 |   recipes::recipes_pkg_check("praznik")
 61 | 
 62 |   # Bundle the selector terms and all settings into a step object, then
 63 |   # append that step to the recipe.
 64 |   terms <- recipes::ellipse_check(...)
 65 |   new_step <- step_select_mrmr_new(
 66 |     terms = terms,
 67 |     trained = trained,
 68 |     outcome = outcome,
 69 |     role = role,
 70 |     top_p = top_p,
 71 |     threshold = threshold,
 72 |     cutoff = cutoff,
 73 |     threads = threads,
 74 |     exclude = exclude,
 75 |     scores = scores,
 76 |     skip = skip,
 77 |     id = id
 78 |   )
 79 |   recipes::add_step(recipe, new_step)
 80 | }
 81 | 
 82 | # Internal constructor for `select_mrmr` steps.
 83 | #
 84 | # Passes every field, exactly as supplied, to `recipes::step()` together with
 85 | # the subclass name. It is called when the step is first added to a recipe
 86 | # and again by `prep()`, which rebuilds the step with `trained = TRUE`, the
 87 | # computed `exclude` and the score table.
 88 | step_select_mrmr_new <- function(terms, role, trained, outcome,
 89 |                                  top_p, threshold, cutoff, threads,
 90 |                                  exclude, scores, skip, id) {
 91 |   recipes::step(
 92 |     subclass = "select_mrmr",
 93 |     terms = terms, role = role,
 94 |     trained = trained, outcome = outcome,
 95 |     top_p = top_p, threshold = threshold,
 96 |     cutoff = cutoff, threads = threads,
 97 |     exclude = exclude, scores = scores,
 98 |     skip = skip, id = id
 99 |   )
100 | }
101 | 
102 | #' @export
103 | prep.step_select_mrmr <- function(x, training, info = NULL, ...) {
104 |   # extract response and predictor names
105 |   y_name <- recipes::recipes_eval_select(x$outcome, training, info)[1]
106 |   x_names <- recipes::recipes_eval_select(x$terms, training, info)
107 | 
108 |   # check criteria
109 |   check_criteria(x$top_p, x$threshold, match.call())
110 |   check_zero_one(x$threshold)
111 |   x$top_p <- check_top_p(x$top_p, length(x_names))
112 | 
113 |   if (length(x_names) > 0) {
114 |     # Rank every selected predictor with praznik::MRMR (k = all of them).
115 |     call <- rlang::call2(
116 |       .fn = "MRMR",
117 |       .ns = "praznik",
118 |       X = rlang::quo(training[, x_names]),
119 |       Y = rlang::quo(training[[y_name]]),
120 |       k = length(x_names),
121 |       threads = x$threads
122 |     )
123 |     mrmr <- rlang::eval_tidy(call)
124 | 
125 |     # Name the scores by variable, as the vip step does, before filtering.
126 |     res <- tibble(
127 |       variable = names(mrmr$selection),
128 |       score = rlang::set_names(mrmr$score, names(mrmr$selection))
129 |     )
130 |     exclude <-
131 |       dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)
132 |   } else {
133 |     # `res` must exist on this path too: it is stored as `scores` below and
134 |     # was previously undefined when no predictors were selected.
135 |     exclude <- character()
136 |     res <- tibble(variable = character(), score = numeric())
137 |   }
138 | 
139 |   step_select_mrmr_new(
140 |     terms = x$terms,
141 |     trained = TRUE,
142 |     role = x$role,
143 |     outcome = y_name,
144 |     top_p = x$top_p,
145 |     threshold = x$threshold,
146 |     cutoff = x$cutoff,
147 |     threads = x$threads,
148 |     exclude = exclude,
149 |     scores = res,
150 |     skip = x$skip,
151 |     id = x$id
152 |   )
153 | }
154 | 
155 | #' @export
156 | bake.step_select_mrmr <- function(object, new_data, ...) {
157 |   # Keep every column not flagged during prep. `drop = FALSE` guarantees a
158 |   # data frame even when a plain data.frame is left with a single column.
159 |   keep <- !(colnames(new_data) %in% object$exclude)
160 |   as_tibble(new_data[, keep, drop = FALSE])
161 | }
162 | 
163 | #' @export
164 | print.step_select_mrmr <-
165 |   function(x, width = max(20, options()$width - 30), ...) {
166 |     # Title first; the excluded count is appended only for a trained step.
167 |     # `width` is accepted but never read.
168 |     cat("mRMR feature selection")
169 | 
170 |     trained_note <- if (recipes::is_trained(x)) {
171 |       paste0(" (", length(x$exclude), " excluded)")
172 |     }
173 |     cat(trained_note, "\n", sep = "")
174 |     invisible(x)
175 |   }
176 | 
177 | #' @rdname step_select_mrmr
178 | #' @param x A `step_select_mrmr` object.
179 | #' @param type A character string: either 'terms' (the default) to return a
180 | #'   tibble containing the variables that have been removed by the filter step,
181 | #'   or 'scores' to return the scores for each variable.
182 | #' @export
183 | tidy.step_select_mrmr <- function(x, type = "terms", ...) {
184 |   tidy_filter_step(x, type) # `...` is not forwarded
185 | }
186 | 
187 | #' @export
188 | tunable.step_select_mrmr <- function(x, ...) {
189 |   # One entry per tunable argument, in the same order as `name` below.
190 |   param_calls <- list(
191 |     list(pkg = "colino", fun = "top_p"),
192 |     list(pkg = "dials", fun = "threshold", range = c(0, 1)),
193 |     list(pkg = "colino", fun = "cutoff")
194 |   )
195 |   tibble(
196 |     name = c("top_p", "threshold", "cutoff"),
197 |     call_info = param_calls,
198 |     source = "recipe", component = "step_select_mrmr", component_id = x$id
199 |   )
200 | }
201 | 
202 | #' @rdname required_pkgs.colino
203 | #' @export
204 | required_pkgs.step_select_mrmr <- function(x, ...) {
205 |   c("colino", "praznik") # praznik provides the MRMR() routine called in prep()
206 | }
207 | 


--------------------------------------------------------------------------------
/tests/testthat/test_step_select_fcbf.R:
--------------------------------------------------------------------------------
  1 | library(recipes)
  2 | 
  3 | data(iris)
  4 | 
  5 | test_that("basic usage: expected columns retrieved", {
  6 |   skip_if_not_installed("FCBF")
  7 | 
  8 |   # iris plus a logical feature and a numeric feature containing NAs; both
  9 |   # six-element patterns are recycled over the rows of iris
 10 |   my_iris <- iris
 11 |   my_iris[["lglfeat"]] <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE)
 12 |   my_iris[["partial_NAfeat"]] <- c(2, 3, 6, 4, 3, NA)
 13 | 
 14 |   baked <- recipe(Species ~ ., data = my_iris) %>%
 15 |     step_select_fcbf(all_predictors(), threshold = 0.001) %>%
 16 |     prep(training = my_iris) %>%
 17 |     bake(new_data = my_iris)
 18 | 
 19 |   expect_equal(
 20 |     names(baked), c("Sepal.Width", "Petal.Width", "Species")
 21 |   )
 22 | })
 23 | 
 24 | test_that("warns/breaks if not enough predictors are provided", {
 25 |   skip_if_not_installed("FCBF")
 26 | 
 27 |   # a single usable predictor: prep() should warn
 28 |   one_predictor <- recipe(Species ~ Sepal.Length, iris) %>%
 29 |     step_select_fcbf(all_predictors())
 30 |   expect_warning(
 31 |     prep(one_predictor, training = iris),
 32 |     "Only one usable"
 33 |   )
 34 | 
 35 |   # only the outcome column is in the recipe data: prep() should stop
 36 |   species_only <- iris[, "Species", drop = FALSE]
 37 |   no_predictors <- recipe(Species ~ ., species_only) %>%
 38 |     step_select_fcbf(all_predictors())
 39 |   expect_error(prep(no_predictors, training = iris), "No usable predictors")
 40 | })
 41 | 
 42 | test_that("step_select_fcbf rejects bad threshold or cutpoint argument input", {
 43 |   skip_if_not_installed("FCBF")
 44 | 
 45 |   rec <- recipe(Species ~ ., data = iris)
 46 | 
 47 |   # threshold: each bad value must make creating and prepping the step fail
 48 |   # with the matching message
 49 | 
 50 |   # above 1
 51 |   expect_error(
 52 |     prep(step_select_fcbf(rec, threshold = 1.5)),
 53 |     "(0, 1)"
 54 |   )
 55 | 
 56 |   # missing value
 57 |   expect_error(
 58 |     prep(step_select_fcbf(rec, threshold = NA)),
 59 |     "No usable"
 60 |   )
 61 | 
 62 |   # exactly zero
 63 |   expect_error(
 64 |     prep(step_select_fcbf(rec, threshold = 0)),
 65 |     "(0, 1)"
 66 |   )
 67 | 
 68 |   # character
 69 |   expect_error(
 70 |     prep(step_select_fcbf(rec, threshold = "median")),
 71 |     "should be numeric"
 72 |   )
 73 | 
 74 |   # logical
 75 |   expect_error(
 76 |     prep(step_select_fcbf(rec, threshold = TRUE)),
 77 |     "should be numeric"
 78 |   )
 79 | 
 80 |   # negative
 81 |   expect_error(
 82 |     prep(step_select_fcbf(rec, threshold = -0.01)),
 83 |     "(0, 1)"
 84 |   )
 85 | 
 86 |   # cutpoint: each bad value must already fail when the step is created
 87 |   # (there is no prep() call in these expectations)
 88 |   error_cut <- "`cutpoint` must be a number between 0-1"
 89 | 
 90 |   expect_error(step_select_fcbf(rec, cutpoint = 1.5), error_cut)
 91 |   expect_error(step_select_fcbf(rec, cutpoint = NA), error_cut)
 92 |   expect_error(step_select_fcbf(rec, cutpoint = 0), error_cut)
 93 |   expect_error(step_select_fcbf(rec, cutpoint = "median"), error_cut)
 94 |   expect_error(step_select_fcbf(rec, cutpoint = TRUE), error_cut)
 95 |   expect_error(step_select_fcbf(rec, cutpoint = -0.01), error_cut)
 96 | })
 97 | 
 98 | # NA-only columns should be dropped with a warning
 99 | test_that("NA columns get removed with warning", {
100 |   skip_if_not_installed("FCBF")
101 | 
102 |   # f1-f3 hold nothing but NA (character, numeric, logical); f4/f5 are usable
103 |   n_rows <- 10
104 |   na_dat <- tibble(
105 |     out = rep(c("A", "B"), n_rows / 2),
106 |     f1 = rep(NA_character_, n_rows),
107 |     f2 = rep(NA_real_, n_rows),
108 |     f3 = rep(NA, n_rows),
109 |     f4 = c(1, 4, 32, 6, 4, 23, 44, 54, 23, 6),
110 |     f5 = 1:10
111 |   )
112 |   feature_names <- paste0("f", 1:5)
113 | 
114 |   # the helper and the full prep() should both report the three NA columns
115 |   expect_warning(remove_NA_cols(feature_names, na_dat), "3 features were full")
116 | 
117 |   rec <- recipe(out ~ f1 + f2 + f3 + f4 + f5, data = na_dat) %>%
118 |     step_select_fcbf(all_predictors())
119 |   expect_warning(prep(rec, na_dat), "3 features were full")
120 | })
121 | 
122 | 
123 | # return error if outcome is not provided, or not in expected format
124 | test_that("bad outcome variables handled correctly", {
125 |   skip_if_not_installed("FCBF")
126 | 
127 |   # every case starts from the same recipe: three columns given the predictor
128 |   # role and no outcome declared
129 |   base_rec <- recipe(iris) %>%
130 |     update_role(Sepal.Length, Sepal.Width, Petal.Length, new_role = "predictor")
131 | 
132 |   # no outcome variable specified in recipe
133 |   rec <- base_rec %>%
134 |     step_select_fcbf(all_predictors())
135 |   expect_error(
136 |     prep(rec, iris), "outcome variable was not found"
137 |   )
138 | 
139 |   # code works if outcome = argument supplied, despite no outcome in the recipe
140 |   rec2 <- base_rec %>%
141 |     step_select_fcbf(all_predictors(), outcome = "Species")
142 |   expect_equal(
143 |     prep(rec2, iris) %>% bake(iris) %>% names,
144 |     c("Sepal.Width", "Petal.Length", "Petal.Width", "Species")
145 |   )
146 | 
147 |   # outcome supplied in unexpected format: a number
148 |   rec3 <- base_rec %>%
149 |     step_select_fcbf(all_predictors(), outcome = 5)
150 |   expect_error(
151 |     prep(rec3, iris), "supplied as a character"
152 |   )
153 | 
154 |   # more than one outcome name
155 |   rec4 <- base_rec %>%
156 |     step_select_fcbf(all_predictors(), outcome = c("Species", "Petal.Length"))
157 |   expect_error(
158 |     prep(rec4, iris), "single outcome variable can be"
159 |   )
160 | 
161 |   # a logical
162 |   rec5 <- base_rec %>%
163 |     step_select_fcbf(all_predictors(), outcome = TRUE)
164 |   expect_error(
165 |     prep(rec5, iris), "supplied as a character"
166 |   )
167 | 
168 |   # a column name that is not in the data
169 |   rec6 <- base_rec %>%
170 |     step_select_fcbf(all_predictors(), outcome = "doesnt_exist")
171 |   expect_error(
172 |     prep(rec6, iris), "not found"
173 |   )
174 | 
175 |   # NA
176 |   rec7 <- base_rec %>%
177 |     step_select_fcbf(all_predictors(), outcome = NA)
178 |   expect_error(
179 |     prep(rec7, iris), "outcome variable was not found"
180 |   )
181 | })
182 | 
183 | # Columns may be given by name instead of through tidyselect helpers
184 | test_that("function works if user provides columns by name", {
185 |   skip_if_not_installed("FCBF")
186 | 
187 |   baked_names <- recipe(Species ~ ., iris) %>%
188 |     step_select_fcbf(c("Petal.Length", "Sepal.Length")) %>%
189 |     prep(iris) %>%
190 |     bake(iris) %>%
191 |     names()
192 |   expect_equal(
193 |     baked_names, c("Sepal.Width", "Petal.Length", "Petal.Width", "Species")
194 |   )
195 | })
196 | 


--------------------------------------------------------------------------------
/docs/LICENSE.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>MIT License • colino</title><script src="deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="deps/bootstrap-5.1.3/bootstrap.min.css" rel="stylesheet"><script src="deps/bootstrap-5.1.3/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" crossorigin="anonymous"></script><script 
src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="pkgdown.js"></script><meta property="og:title" content="MIT License"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body>
 6 |     <a href="#main" class="visually-hidden-focusable">Skip to contents</a>
 7 |     
 8 | 
 9 |     <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light"><div class="container">
10 |     
11 |     <a class="navbar-brand me-2" href="index.html">colino</a>
12 | 
13 |     <small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">0.0.1</small>
14 | 
15 |     
16 |     <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
17 |       <span class="navbar-toggler-icon"></span>
18 |     </button>
19 | 
20 |     <div id="navbar" class="collapse navbar-collapse ms-3">
21 |       <ul class="navbar-nav me-auto"><li class="nav-item">
22 |   <a class="nav-link" href="reference/index.html">Reference</a>
23 | </li>
24 |       </ul><form class="form-inline my-2 my-lg-0" role="search">
25 |         <input type="search" class="form-control me-sm-2" aria-label="Toggle navigation" name="search-input" data-search-index="search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
26 | 
27 |       <ul class="navbar-nav"><li class="nav-item">
28 |   <a class="external-link nav-link" href="https://github.com/stevenpawley/colino/" aria-label="github">
29 |     <span class="fab fa fab fa-github fa-lg"></span>
30 |      
31 |   </a>
32 | </li>
33 |       </ul></div>
34 | 
35 |     
36 |   </div>
37 | </nav><div class="container template-title-body">
38 | <div class="row">
39 |   <main id="main" class="col-md-9"><div class="page-header">
40 |       <img src="" class="logo" alt=""><h1>MIT License</h1>
41 |       <small class="dont-index">Source: <a href="https://github.com/stevenpawley/colino/blob/HEAD/LICENSE.md" class="external-link"><code>LICENSE.md</code></a></small>
42 |     </div>
43 | 
44 | <div id="mit-license" class="section level1">
45 | 
46 | <p>Copyright (c) 2019 Steven Pawley</p>
47 | <p>Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:</p>
48 | <p>The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.</p>
49 | <p>THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.</p>
50 | </div>
51 | 
52 |   </main></div>
53 | 
54 | 
55 |     <footer><div class="pkgdown-footer-left">
56 |   <p></p><p>Developed by Steven Pawley, Max Kuhn, Rowan Jacques-Hamilton.</p>
57 | </div>
58 | 
59 | <div class="pkgdown-footer-right">
60 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
61 | </div>
62 | 
63 |     </footer></div>
64 | 
65 |   
66 | 
67 |   
68 | 
69 |   </body></html>
70 | 
71 | 


--------------------------------------------------------------------------------
/R/step_select_vip.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using a model's feature importance scores or
  2 | #' coefficients
  3 | #'
  4 | #' `step_select_vip` creates a *specification* of a recipe step that selects a
  5 | #' subset of predictors based on the ranking of variable importance provided by
  6 | #' a `parsnip` model specification and the `model` parameter
  7 | #'
  8 | #' @inheritParams step_select_aov
  9 | #' @inherit step_select_aov return
 10 | #' @param outcome A character string with the name of the response variable to
 11 | #'   use to calculate the feature importance scores.
 12 | #' @param role Not used by this step since no new variables are created.
 13 | #' @param model A `model_spec` object from `parsnip` that has a feature
 14 | #'   importances or coefficients method. The model needs to have an equivalent
 15 | #'   `pull_importances` method defined. See `?pull_importances` for how to
 16 | #'   define methods for models that are not currently supported.
 17 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 18 | #'   names of the variables and their feature importance scores. This parameter
 19 | #'   is only produced after the recipe has been trained.
 20 | #'
 21 | #' @export
 22 | #'
 23 | #' @details
 24 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 25 | #' unspecified.
 26 | #'
 27 | #' @examples
 28 | #' library(recipes)
 29 | #' library(parsnip)
 30 | #'
 31 | #' # load the example cells dataset
 32 | #' data(cells, package = "modeldata")
 33 | #'
 34 | #' # define a base model to use for feature importances
 35 | #' base_model <- rand_forest(mode = "classification") %>%
 36 | #'     set_engine("ranger", importance = "permutation")
 37 | #'
 38 | #' # create a preprocessing recipe
 39 | #' rec <-
 40 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 41 | #'  step_select_vip(
 42 | #'    all_predictors(),
 43 | #'    outcome = "class",
 44 | #'    model = base_model,
 45 | #'    top_p = 10
 46 | #'  )
 47 | #'
 48 | #' prepped <- prep(rec)
 49 | #'
 50 | #' preproc_data <- juice(prepped)
 51 | #' prepped
 52 | step_select_vip <- function(
 53 |   recipe,
 54 |   ...,
 55 |   outcome = NULL,
 56 |   role = "predictor",
 57 |   trained = FALSE,
 58 |   model = NULL,
 59 |   top_p = NA,
 60 |   threshold = NA,
 61 |   cutoff = NA,
 62 |   exclude = NULL,
 63 |   scores = NULL,
 64 |   skip = FALSE,
 65 |   id = recipes::rand_id("select_vip")) {
 66 |   # `model` defaults to NULL, so test the value: this rejects both an omitted
 67 |   # argument and an explicit `model = NULL`.
 68 |   if (is.null(model)) {
 69 |     rlang::abort("Model argument should be a `parsnip` model specification")
 70 |   }
 71 | 
 72 |   new_step <- step_select_vip_new(
 73 |     terms = recipes::ellipse_check(...),
 74 |     trained = trained,
 75 |     outcome = outcome,
 76 |     role = role,
 77 |     model = model,
 78 |     top_p = top_p,
 79 |     threshold = threshold,
 80 |     cutoff = cutoff,
 81 |     exclude = exclude,
 82 |     scores = scores,
 83 |     skip = skip,
 84 |     id = id
 85 |   )
 86 |   recipes::add_step(recipe, new_step)
 87 | }
 88 | 
 89 | # wrapper around 'step' function that sets the class of new step objects
 90 | #' @importFrom recipes step
 91 | step_select_vip_new <- function(terms, role, trained, outcome, model, top_p,
 92 |                                 threshold, cutoff, exclude, scores, skip, id) {
 93 |   recipes::step(
 94 |     subclass = "select_vip",
 95 |     terms = terms,
 96 |     role = role,
 97 |     trained = trained,
 98 |     outcome = outcome,
 99 |     model = model,
100 |     top_p = top_p,
101 |     threshold = threshold,
102 |     cutoff = cutoff,
103 |     exclude = exclude,
104 |     scores = scores,
105 |     skip = skip,
106 |     id = id
107 |   )
108 | }
109 | 
110 | #' @export
prep.step_select_vip <- function(x, training, info = NULL, ...) {

  # resolve the predictor and outcome selections into column names;
  # only the first outcome name is used
  x_names <- recipes::recipes_eval_select(x$terms, training, info)
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  if (length(x_names) > 0) {
    # fit the user-supplied model on the selected predictors
    X <- training[, x_names]
    y <- training[[y_name]]

    initial_model <- parsnip::fit_xy(x$model, X, y)

    # importance table; the score column is named by variable because
    # dual_filter() is handed the scores as a named vector
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)

  } else {
    # no predictors were selected: nothing to exclude, and an empty score
    # table so that `scores = res` below does not hit an undefined object
    exclude <- character()
    res <- tibble::tibble(variable = character(), score = numeric())
  }

  step_select_vip_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    model = x$model,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
155 | 
156 | #' @export
bake.step_select_vip <- function(object, new_data, ...) {
  # drop the columns that were marked for exclusion when the step was trained
  if (length(object$exclude) > 0) {
    keep <- !colnames(new_data) %in% object$exclude
    # drop = FALSE keeps a data frame even when a single column remains
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
163 | 
164 | #' @export
print.step_select_vip <-
  function(x, width = max(20, options()$width - 30), ...) {
    # one-line summary; the exclusion count is appended once the step is trained
    cat("Variable importance feature selection")

    if (recipes::is_trained(x)) {
      cat(" (", length(x$exclude), " excluded)", sep = "")
    }
    cat("\n")

    # return the step invisibly, as print methods conventionally do
    invisible(x)
  }
177 | 
178 | #' @rdname step_select_vip
179 | #' @param x A `step_select_vip` object
180 | #' @param type A character with either 'terms' (the default) to return a
181 | #'   tibble containing the variables that have been removed by the filter step,
182 | #'   or 'scores' to return the scores for each variable.
183 | #' @export
tidy.step_select_vip <- function(x, type = "terms", ...) {
  # delegate to the tidier shared by the filter steps
  tidy_filter_step(x, type = type)
}
187 | 
188 | #' @export
tunable.step_select_vip <- function(x, ...) {
  # one call_info entry per tunable argument, in the same order as the names
  param_names <- c("top_p", "threshold", "cutoff")
  param_calls <- list(
    list(pkg = "colino", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(pkg = "colino", fun = "cutoff")
  )

  tibble(
    name = param_names,
    call_info = param_calls,
    source = "recipe",
    component = "step_select_vip",
    component_id = x$id
  )
}
202 | 
203 | #' @rdname required_pkgs.colino
204 | #' @export
required_pkgs.step_select_vip <- function(x, ...) {
  # package names reported as required by this step
  "colino"
}
208 | 


--------------------------------------------------------------------------------
/R/step_select_xtab.R:
--------------------------------------------------------------------------------
  1 | #' Filter Categorical Predictors using Contingency Tables
  2 | #'
  3 | #' `step_select_xtab` creates a *specification* of a recipe step that will
  4 | #'  filter predictors using their relationship with the outcome as measured
  5 | #'  using statistical tests for association.
  6 | #'
  7 | #' @inheritParams step_select_aov
  8 | #' @inherit step_select_aov return
  9 | #' @param outcome A single character string that specifies a single categorical
 10 | #'  variable to be used as the class.
 11 | #' @param role Not used by this step since no new variables are created;
 12 | #'  the step only removes predictors. The value is stored on the step and
 13 | #'  passed through unchanged when the recipe is trained.
 14 | #' @param exact Should an exact test be used?
 15 | #' @param fdr Should false discovery rates (FDR) be used instead of p-values?
 16 | #'
 17 | #' @keywords datagen
 18 | #' @concept preprocessing
 19 | #' @concept supervised_filter
 20 | #' @export
 21 | #' @details
 22 | #'
 23 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 24 | #' unspecified. If more than one is used, they are combined via 'or'.
 25 | #'
 26 | #' The Benjamini-Hochberg FDR correction is used (see [stats::p.adjust()]).
 27 | #'
 28 | #' Warnings from [stats::chisq.test()] and [stats::fisher.test()] are suppressed.
 29 | #' @examples
 30 | #' data(attrition, package = "modeldata")
 31 | #'
 32 | #' rec <-
 33 | #'   recipe(Attrition ~ ., data = attrition) %>%
 34 | #'   step_select_xtab(all_nominal(), -all_outcomes(), outcome = "Attrition",
 35 | #'                    top_p = 1, cutoff = 0.001, exact = TRUE) %>%
 36 | #'   prep()
 37 | #'
 38 | #' rec %>% juice(all_nominal(), -all_outcomes()) %>% names()
 39 | #'
 40 | #' tidy(rec, number = 1)
 41 | step_select_xtab <- function(recipe,
 42 |                              ...,
 43 |                              outcome,
 44 |                              role = "predictor",
 45 |                              trained = FALSE,
 46 |                              top_p = NA,
 47 |                              threshold = NA,
 48 |                              cutoff = NA,
 49 |                              exact = FALSE,
 50 |                              fdr = TRUE,
 51 |                              exclude = NULL,
 52 |                              skip = FALSE,
 53 |                              id = recipes::rand_id("select_xtab")) {
 54 |   recipes::add_step(
 55 |     recipe,
 56 |     step_select_xtab_new(
 57 |       terms = recipes::ellipse_check(...),
 58 |       outcome = outcome,
 59 |       role = role,
 60 |       trained = trained,
 61 |       top_p = top_p,
 62 |       threshold = threshold,
 63 |       cutoff = cutoff,
 64 |       exact = exact,
 65 |       fdr = fdr,
 66 |       exclude = exclude,
 67 |       skip = skip,
 68 |       id = id
 69 |     )
 70 |   )
 71 | }
 72 | 
 73 | step_select_xtab_new <-
 74 |   function(terms, outcome, role, trained, top_p, threshold, cutoff, exact, fdr,
 75 |            exclude, skip, id) {
 76 |     recipes::step(
 77 |       subclass = "select_xtab",
 78 |       terms = terms,
 79 |       outcome = outcome,
 80 |       role = role,
 81 |       trained = trained,
 82 |       top_p = top_p,
 83 |       threshold = threshold,
 84 |       cutoff = cutoff,
 85 |       exact = exact,
 86 |       fdr = fdr,
 87 |       exclude = exclude,
 88 |       skip = skip,
 89 |       id = id
 90 |     )
 91 |   }
 92 | 
 93 | tbl_calc <- function(x, y, exact) {
 94 |   xtab <- table(x, y)
 95 |   if (exact) {
 96 |     res <- suppressWarnings(try(stats::fisher.test(xtab)$p.value, silent = TRUE))
 97 |   } else {
 98 |     res <- suppressWarnings(try(stats::chisq.test(xtab)$p.value, silent = TRUE))
 99 |   }
100 |   if (inherits(res, "try-error")) {
101 |     res <- NA_real_
102 |   }
103 |   res
104 | }
105 | 
106 | #' @export
prep.step_select_xtab <- function(x, training, info = NULL, ...) {
  # resolve the outcome selection against the training data and use the
  # evaluated column name (first one only) rather than discarding it
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]
  recipes::check_type(training[, y_name], quant = FALSE)
  x_names <- recipes::recipes_eval_select(x$terms, training, info)

  if (length(x_names) > 0) {

    recipes::check_type(training[, x_names], quant = FALSE)

    # check criteria
    check_criteria(x$top_p, x$threshold, match.call())
    check_zero_one(x$threshold)
    x$top_p <- check_top_p(x$top_p, length(x_names))

    # p-value of the association test between each predictor and the outcome
    scores <- purrr::map_dbl(training[, x_names],
                             ~ tbl_calc(.x, training[[y_name]], exact = x$exact))
    # ascending order, with predictors whose test failed (NA) placed last
    scores <- sort(scores, na.last = TRUE)
    if (x$fdr) {
      scores <- stats::p.adjust(scores, method = "BH")
    }

    # smaller p-values are better, hence maximize = FALSE
    exclude_chr <- dual_filter(scores, x$top_p, x$threshold, x$cutoff, maximize = FALSE)
  } else {
    exclude_chr <- character()
  }

  step_select_xtab_new(
    terms = x$terms,
    outcome = x$outcome,
    role = x$role,
    trained = TRUE,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    exact = x$exact,
    fdr = x$fdr,
    exclude = exclude_chr,
    skip = x$skip,
    id = x$id
  )
}
150 | 
151 | #' @export
bake.step_select_xtab <- function(object, new_data, ...) {
  # nothing was marked for exclusion: hand the data back untouched
  if (length(object$exclude) == 0) {
    return(new_data)
  }

  # otherwise remove the excluded columns
  dplyr::select(new_data, -dplyr::one_of(object$exclude))
}
158 | 
159 | #' @export
print.step_select_xtab <-
  function(x, width = max(20, options()$width - 30), ...) {
    # one-line summary; the exclusion count is appended once the step is trained
    cat("Association test feature selection")

    if (recipes::is_trained(x)) {
      cat(" (", length(x$exclude), " excluded)", sep = "")
    }
    cat("\n")

    # return the step invisibly, as print methods conventionally do
    invisible(x)
  }
172 | 
173 | #' @rdname step_select_xtab
174 | #' @param x A `step_select_xtab` object.
175 | #' @export
tidy.step_select_xtab <- function(x, ...) {
  # this step always requests the "terms" view from the shared tidier
  view <- "terms"
  tidy_filter_step(x, type = view)
}
179 | 
180 | #' @export
tunable.step_select_xtab <- function(x, ...) {
  # one call_info entry per tunable argument, in the same order as `name`
  tibble::tibble(
    name = c("top_p", "threshold", "cutoff"),
    call_info = list(
      list(pkg = "colino", fun = "top_p"),
      # prep() passes `threshold` through check_zero_one(), and the other
      # filter steps tune it over c(0, 1); the previous c(-10, -1) range
      # disagreed with both
      list(pkg = "dials", fun = "threshold", range = c(0, 1)),
      list(pkg = "colino", fun = "cutoff")
    ),
    source = "recipe",
    component = "step_select_xtab",
    component_id = x$id
  )
}
194 | 
195 | #' @rdname required_pkgs.colino
196 | #' @export
required_pkgs.step_select_xtab <- function(x, ...) {
  # package names reported as required by this step
  "colino"
}
200 | 


--------------------------------------------------------------------------------
/docs/authors.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <!-- Generated by pkgdown: do not edit by hand --><html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><title>Authors and Citation • colino</title><script src="deps/jquery-3.6.0/jquery-3.6.0.min.js"></script><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link href="deps/bootstrap-5.1.3/bootstrap.min.css" rel="stylesheet"><script src="deps/bootstrap-5.1.3/bootstrap.bundle.min.js"></script><!-- Font Awesome icons --><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/all.min.css" integrity="sha256-mmgLkCYLUQbXn0B1SRqzHar6dCnv9oZFPEC1g1cwlkk=" crossorigin="anonymous"><link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.12.1/css/v4-shims.min.css" integrity="sha256-wZjR52fzng1pJHwx4aV2AO3yyTOXrcDW7jBpJtTwVxw=" crossorigin="anonymous"><!-- bootstrap-toc --><script src="https://cdn.rawgit.com/afeld/bootstrap-toc/v1.0.1/dist/bootstrap-toc.min.js"></script><!-- headroom.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/headroom.min.js" integrity="sha256-AsUX4SJE1+yuDu5+mAVzJbuYNPHj/WroHuZ8Ir/CkE0=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/headroom/0.11.0/jQuery.headroom.min.js" integrity="sha256-ZX/yNShbjqsohH1k95liqY9Gd8uOiE1S4vZc+9KQ1K4=" crossorigin="anonymous"></script><!-- clipboard.js --><script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.6/clipboard.min.js" integrity="sha256-inc5kl9MA1hkeYUt+EC3BhlIgyp/2jDIyBLS6k3UxPI=" crossorigin="anonymous"></script><!-- search --><script src="https://cdnjs.cloudflare.com/ajax/libs/fuse.js/6.4.6/fuse.js" integrity="sha512-zv6Ywkjyktsohkbp9bb45V6tEMoWhzFzXis+LrMehmJZZSys19Yxf1dopHx7WzIKxr5tK2dVcYmaCk2uqdjF4A==" 
crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/autocomplete.js/0.38.0/autocomplete.jquery.min.js" integrity="sha512-GU9ayf+66Xx2TmpxqJpliWbT5PiGYxpaG8rfnBEk1LL8l1KGkRShhngwdXK1UgqhAzWpZHSiYPc09/NwDQIGyg==" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mark.js/8.11.1/mark.min.js" integrity="sha512-5CYOlHXGh6QpOFA/TeTylKLWfB3ftPsde7AnmhuitiTX4K5SqCLBeKro6sPS8ilsz1Q4NRx3v8Ko2IBiszzdww==" crossorigin="anonymous"></script><!-- pkgdown --><script src="pkgdown.js"></script><meta property="og:title" content="Authors and Citation"><!-- mathjax --><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js" integrity="sha256-nvJJv9wWKEm88qvoQl9ekL2J+k/RWIsaSScxxlsrv8k=" crossorigin="anonymous"></script><script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/config/TeX-AMS-MML_HTMLorMML.js" integrity="sha256-84DKXVJXs0/F8OTMzX4UR909+jtl4G7SPypPavF+GfA=" crossorigin="anonymous"></script><!--[if lt IE 9]>
 3 | <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
 4 | <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
 5 | <![endif]--></head><body>
 6 |     <a href="#main" class="visually-hidden-focusable">Skip to contents</a>
 7 |     
 8 | 
 9 |     <nav class="navbar fixed-top navbar-light navbar-expand-lg bg-light"><div class="container">
10 |     
11 |     <a class="navbar-brand me-2" href="index.html">colino</a>
12 | 
13 |     <small class="nav-text text-muted me-auto" data-bs-toggle="tooltip" data-bs-placement="bottom" title="">0.0.1</small>
14 | 
15 |     
16 |     <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbar" aria-controls="navbar" aria-expanded="false" aria-label="Toggle navigation">
17 |       <span class="navbar-toggler-icon"></span>
18 |     </button>
19 | 
20 |     <div id="navbar" class="collapse navbar-collapse ms-3">
21 |       <ul class="navbar-nav me-auto"><li class="nav-item">
22 |   <a class="nav-link" href="reference/index.html">Reference</a>
23 | </li>
24 |       </ul><form class="form-inline my-2 my-lg-0" role="search">
25 |         <input type="search" class="form-control me-sm-2" aria-label="Toggle navigation" name="search-input" data-search-index="search.json" id="search-input" placeholder="Search for" autocomplete="off"></form>
26 | 
27 |       <ul class="navbar-nav"><li class="nav-item">
28 |   <a class="external-link nav-link" href="https://github.com/stevenpawley/colino/" aria-label="github">
29 |     <span class="fab fa fab fa-github fa-lg"></span>
30 |      
31 |   </a>
32 | </li>
33 |       </ul></div>
34 | 
35 |     
36 |   </div>
37 | </nav><div class="container template-citation-authors">
38 | <div class="row">
39 |   <main id="main" class="col-md-9"><div class="page-header">
40 |       <img src="" class="logo" alt=""><h1>Authors and Citation</h1>
41 |     </div>
42 | 
43 |     <div class="section level2 citation">
44 |       <h2>Authors</h2>
45 |       
46 |       <ul class="list-unstyled"><li>
47 |           <p><strong>Steven Pawley</strong>. Author, maintainer. 
48 |           </p>
49 |         </li>
50 |         <li>
51 |           <p><strong>Max Kuhn</strong>. Author. 
52 |           </p>
53 |         </li>
54 |         <li>
55 |           <p><strong>Rowan Jacques-Hamilton</strong>. Author. 
56 |           </p>
57 |         </li>
58 |       </ul></div>
59 | 
60 |     <div class="section level2 citation">
61 |       <h2 id="citation">Citation</h2>
62 |       <p><small class="dont-index">Source: <a href="https://github.com/stevenpawley/colino/blob/HEAD/DESCRIPTION" class="external-link"><code>DESCRIPTION</code></a></small></p>
63 | 
64 |       <p>Pawley S, Kuhn M, Jacques-Hamilton R (2022).
65 | <em>colino: Recipes Steps for Supervised Filter-Based Feature Selection</em>.
66 | R package version 0.0.1, <a href="https://stevenpawley.github.io/colino">https://stevenpawley.github.io/colino</a>. 
67 | </p>
68 |       <pre>@Manual{,
69 |   title = {colino: Recipes Steps for Supervised Filter-Based Feature Selection},
70 |   author = {Steven Pawley and Max Kuhn and Rowan Jacques-Hamilton},
71 |   year = {2022},
72 |   note = {R package version 0.0.1},
73 |   url = {https://stevenpawley.github.io/colino},
74 | }</pre>
75 |     </div>
76 |   </main><aside class="col-md-3"><nav id="toc"><h2>On this page</h2>
77 |     </nav></aside></div>
78 | 
79 | 
80 |     <footer><div class="pkgdown-footer-left">
81 |   <p></p><p>Developed by Steven Pawley, Max Kuhn, Rowan Jacques-Hamilton.</p>
82 | </div>
83 | 
84 | <div class="pkgdown-footer-right">
85 |   <p></p><p>Site built with <a href="https://pkgdown.r-lib.org/" class="external-link">pkgdown</a> 2.0.6.</p>
86 | </div>
87 | 
88 |     </footer></div>
89 | 
90 |   
91 | 
92 |   
93 | 
94 |   </body></html>
95 | 
96 | 


--------------------------------------------------------------------------------
/R/step_select_carscore.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using the CAR score algorithm
  2 | #'
  3 | #' `step_select_carscore` creates a *specification* of a recipe step that
  4 | #' selects a subset of predictors as part of a regression model based on the
  5 | #' scores of the CAR score algorithm. This step requires the `care` package to be
  6 | #' installed. The top `top_p` scoring features, or features whose scores occur
  7 | #' in the top percentile `threshold` will be retained as new predictors.
  8 | #'
  9 | #' @inheritParams step_select_aov
 10 | #' @inherit step_select_aov return
 11 | #' @param role Not used by this step since no new variables are created.
 12 | #' @param lambda The correlation shrinkage intensity (range 0-1).
 13 | #' @param diagonal For diagonal = FALSE (the default) CAR scores are computed;
 14 | #'   otherwise with diagonal = TRUE marginal correlations.
 15 | #' @param outcome A character string with the name of the response variable.
 16 | #'   This must refer to a numeric feature for regression.
 17 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 18 | #'   names of the variables and the absolute values of the calculated CAR
 19 | #'   scores. This parameter is only produced after the recipe has been trained.
 20 | #' @export
 21 | #' @keywords datagen
 22 | #' @concept preprocessing
 23 | #' @concept supervised_filter
 24 | #' @export
 25 | #' @details
 26 | #'
 27 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 28 | #' unspecified.
 29 | #'
 30 | #' @examples
 31 | #' library(recipes)
 32 | #'
 33 | #' data(car_prices, package = "modeldata")
 34 | #'
 35 | #' rec <-
 36 | #'  recipe(Price ~ ., data = car_prices) %>%
 37 | #'  step_select_carscore(
 38 | #'    all_predictors(),
 39 | #'    outcome = "Price",
 40 | #'    top_p = 5,
 41 | #'    cutoff = 0.7
 42 | #'  )
 43 | #'
 44 | #' prepped <- prep(rec)
 45 | #'
 46 | #' new_data <- bake(prepped, new_data = NULL)
 47 | #' prepped
 48 | step_select_carscore <- function(
 49 |   recipe, ...,
 50 |   outcome = NULL,
 51 |   role = NA,
 52 |   trained = FALSE,
 53 |   top_p = NA,
 54 |   threshold = NA,
 55 |   cutoff = NA,
 56 |   lambda = NA,
 57 |   diagonal = FALSE,
 58 |   exclude = NULL,
 59 |   scores = NULL,
 60 |   skip = FALSE,
 61 |   id = recipes::rand_id("select_carscore")) {
 62 | 
 63 |   recipes::recipes_pkg_check("care")
 64 | 
 65 |   terms <- recipes::ellipse_check(...)
 66 | 
 67 |   recipes::add_step(
 68 |     recipe,
 69 |     step_select_carscore_new(
 70 |       terms = terms,
 71 |       trained = trained,
 72 |       outcome = outcome,
 73 |       role = role,
 74 |       top_p = top_p,
 75 |       threshold = threshold,
 76 |       cutoff = cutoff,
 77 |       lambda = lambda,
 78 |       diagonal = diagonal,
 79 |       exclude = exclude,
 80 |       scores = scores,
 81 |       skip = skip,
 82 |       id = id
 83 |     )
 84 |   )
 85 | }
 86 | 
 87 | 
 88 | # wrapper around 'step' function that sets the class of new step objects
 89 | step_select_carscore_new <-
 90 |   function(terms, role, trained, outcome, top_p, threshold, cutoff, lambda,
 91 |            diagonal, exclude, scores, skip, id) {
 92 |     recipes::step(
 93 |       subclass = "select_carscore",
 94 |       terms = terms,
 95 |       role = role,
 96 |       trained = trained,
 97 |       outcome = outcome,
 98 |       top_p = top_p,
 99 |       threshold = threshold,
100 |       cutoff = cutoff,
101 |       lambda = lambda,
102 |       diagonal = diagonal,
103 |       exclude = exclude,
104 |       scores = scores,
105 |       skip = skip,
106 |       id = id
107 |     )
108 |   }
109 | 
110 | 
111 | #' @export
prep.step_select_carscore <- function(x, training, info = NULL, ...) {

  # extract response and predictor names; only the first outcome is used
  x_names <- recipes::recipes_eval_select(x$terms, training, info)
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]

  # check criteria (the outcome must be quantitative)
  recipes::check_type(training[, y_name], quant = TRUE)
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # CAR scores
  if (length(x_names) > 0) {

    # `lambda` is only added to the call when the user supplied a value;
    # otherwise the argument is left out entirely
    args <- list()

    if (!is.na(x$lambda))
      args$lambda <- x$lambda

    call <- rlang::call2(
      .fn = "carscore",
      .ns = "care",
      Xtrain = training[, x_names],
      Ytrain = training[, y_name],
      diagonal = x$diagonal,
      !!!args
    )

    res <- rlang::eval_tidy(call)

    # rank on absolute values of the returned scores
    res <- tibble(
      variable = names(res),
      score = abs(res)
    )

    exclude <-
      dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)

  } else {
    # no predictors were selected: nothing to exclude, and an empty score
    # table so that `scores = res` below does not hit an undefined object
    exclude <- character()
    res <- tibble(variable = character(), score = numeric())
  }

  step_select_carscore_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    lambda = x$lambda,
    diagonal = x$diagonal,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
172 | 
173 | #' @export
bake.step_select_carscore <- function(object, new_data, ...) {
  # drop the columns that were marked for exclusion when the step was trained
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    # drop = FALSE keeps a data frame even when a single column remains
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
180 | 
181 | #' @export
print.step_select_carscore <-
  function(x, width = max(20, options()$width - 30), ...) {
    # one-line summary; the exclusion count is appended once the step is trained
    cat("Carscore feature selection")

    if (recipes::is_trained(x)) {
      cat(" (", length(x$exclude), " excluded)", sep = "")
    }
    cat("\n")

    # return the step invisibly, as print methods conventionally do
    invisible(x)
  }
194 | 
195 | #' @rdname step_select_carscore
196 | #' @param x A `step_select_carscore` object.
197 | #' @param type A character with either 'terms' (the default) to return a
198 | #'   tibble containing the variables that have been removed by the filter step,
199 | #'   or 'scores' to return the scores for each variable.
200 | #' @export
tidy.step_select_carscore <- function(x, type = "terms", ...) {
  # delegate to the tidier shared by the filter steps
  tidy_filter_step(x, type = type)
}
204 | 
205 | #' @export
tunable.step_select_carscore <- function(x, ...) {
  # one call_info entry per tunable argument, in the same order as the names
  param_names <- c("top_p", "threshold", "cutoff")
  param_calls <- list(
    list(pkg = "colino", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(pkg = "colino", fun = "cutoff")
  )

  tibble::tibble(
    name = param_names,
    call_info = param_calls,
    source = "recipe",
    component = "step_select_carscore",
    component_id = x$id
  )
}
219 | 
220 | #' @rdname required_pkgs.colino
221 | #' @export
required_pkgs.step_select_carscore <- function(x, ...) {
  # package names reported as required by this step
  pkgs <- c("colino", "care")
  pkgs
}
225 | 


--------------------------------------------------------------------------------
/R/step_select_infgain.R:
--------------------------------------------------------------------------------
  1 | #' Information gain feature selection step
  2 | #'
  3 | #' `step_select_infgain` creates a *specification* of a recipe step that selects a
  4 | #' subset of predictors based on the scores of the information gain algorithm.
  5 | #' This step requires the FSelectorRcpp package to be installed. The top
  6 | #' `top_p` scoring features, or features whose scores occur in the top
  7 | #' percentile `threshold` will be retained as new predictors.
  8 | #'
  9 | #' @inheritParams step_select_aov
 10 | #' @inherit step_select_aov return
 11 | #' @param role Not used by this step since no new variables are created.
 12 | #' @param type A character string specifying the information gain method to use.
 13 | #'   One of "infogain", "gainratio", "symuncert". The default is 'infogain'.
 14 | #' @param outcome A character string with the name of the response variable to
 15 | #'   use to evaluate information gain value against the predictors.
 16 | #' @param type The entropy measure. One of c("infogain", "gainratio",
 17 | #'   "symuncert"). The default is 'infogain'.
 18 | #' @param threads An integer specifying the number of threads to use for
 19 | #'   processing. The default = 0 uses all available threads.
 20 | #' @param nbins An integer specifying the number of bins for discretization.
 21 | #'   Only used if the outcome is a continuous variable (regression). The
 22 | #'   default is 'nbins = 5'.
 23 | #' @param scores A tibble with 'variable' and 'score' columns containing the
 24 | #'   names of the variables and their information gain scores. This parameter is
 25 | #'   only produced after the recipe has been trained.
 26 | #'
 27 | #' @export
 28 | #' @keywords datagen
 29 | #' @concept preprocessing
 30 | #' @concept supervised_filter
 31 | #' @export
 32 | #' @details
 33 | #'
 34 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 35 | #' unspecified.
 36 | #'
 37 | #' @examples
 38 | #' library(recipes)
 39 | #'
 40 | #' data(cells, package = "modeldata")
 41 | #'
 42 | #' rec <-
 43 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 44 | #'  step_select_infgain(
 45 | #'    all_predictors(),
 46 | #'    outcome = "class",
 47 | #'    threshold = 0.9,
 48 | #'    id = "infgain"
 49 | #'  )
 50 | #'
 51 | #' prepped <- prep(rec)
 52 | #'
 53 | #' new_data <- juice(prepped)
 54 | #' prepped
 55 | step_select_infgain <- function(
 56 |   recipe, ...,
 57 |   outcome = NULL,
 58 |   role = NA,
 59 |   trained = FALSE,
 60 |   top_p = NA,
 61 |   threshold = NA,
 62 |   cutoff = NA,
 63 |   type = "infogain",
 64 |   nbins = 5,
 65 |   threads = 1,
 66 |   exclude = NULL,
 67 |   scores = NULL,
 68 |   skip = FALSE,
 69 |   id = recipes::rand_id("select_infgain")) {
 70 | 
 71 |   recipes::recipes_pkg_check("FSelectorRcpp")
 72 | 
 73 |   terms <- recipes::ellipse_check(...)
 74 | 
 75 |   recipes::add_step(
 76 |     recipe,
 77 |     step_select_infgain_new(
 78 |       terms = terms,
 79 |       trained = trained,
 80 |       outcome = outcome,
 81 |       role = role,
 82 |       top_p = top_p,
 83 |       threshold = threshold,
 84 |       cutoff = cutoff,
 85 |       type = type,
 86 |       threads = threads,
 87 |       nbins = nbins,
 88 |       exclude = exclude,
 89 |       scores = scores,
 90 |       skip = skip,
 91 |       id = id
 92 |     )
 93 |   )
 94 | }
 95 | 
 96 | 
 97 | # wrapper around 'step' function that sets the class of new step objects
 98 | step_select_infgain_new <- function(terms, role, trained, outcome, top_p,
 99 |                                     threshold, cutoff, type, threads, nbins,
100 |                                     exclude, scores, skip, id) {
101 |   recipes::step(
102 |     subclass = "select_infgain",
103 |     terms = terms,
104 |     role = role,
105 |     trained = trained,
106 |     outcome = outcome,
107 |     top_p = top_p,
108 |     threshold = threshold,
109 |     cutoff = cutoff,
110 |     type = type,
111 |     threads = threads,
112 |     nbins = nbins,
113 |     exclude = exclude,
114 |     scores = scores,
115 |     skip = skip,
116 |     id = id
117 |   )
118 | }
119 | 
120 | 
121 | #' @export
prep.step_select_infgain <- function(x, training, info = NULL, ...) {
  # extract response and predictor names; only the first outcome is used
  x_names <- recipes::recipes_eval_select(x$terms, training, info)
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # information gain
  if (length(x_names) > 0) {

    # outcome ~ predictor_1 + predictor_2 + ...
    f <- stats::as.formula(paste(y_name, "~", paste0(x_names, collapse = " + ")))
    model_mode <- check_outcome(training[[y_name]])
    # `equal` is switched on only when check_outcome() reports "regression"
    equal <- model_mode == "regression"

    ig_call <- rlang::call2(
      .fn = "information_gain",
      .ns = "FSelectorRcpp",
      formula = f,
      data = rlang::quo(training),
      type = x$type,
      threads = x$threads,
      discIntegers = TRUE,
      equal = equal,
      nbins = x$nbins
    )

    # score table; the score column is named by variable because
    # dual_filter() is handed the scores as a named vector
    res <- rlang::eval_tidy(ig_call)
    res <- as_tibble(res)
    res <- rlang::set_names(res, c("variable", "score"))
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)

  } else {
    # no predictors were selected: nothing to exclude, and an empty score
    # table so that `scores = res` below does not hit an undefined object
    exclude <- character()
    res <- tibble::tibble(variable = character(), score = numeric())
  }

  step_select_infgain_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    type = x$type,
    threads = x$threads,
    nbins = x$nbins,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
181 | 
182 | #' @export
bake.step_select_infgain <- function(object, new_data, ...) {
  # drop the columns that were marked for exclusion when the step was trained
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    # drop = FALSE keeps a data frame even when a single column remains
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
189 | 
190 | #' @export
print.step_select_infgain <-
  function(x, width = max(20, options()$width - 30), ...) {
    # one-line summary; the exclusion count is appended once the step is trained
    cat("Information Gain feature selection")

    if (recipes::is_trained(x)) {
      cat(" (", length(x$exclude), " excluded)", sep = "")
    }
    cat("\n")

    # return the step invisibly, as print methods conventionally do
    invisible(x)
  }
203 | 
#' @rdname step_select_infgain
#' @param x A `step_select_infgain` object.
#' @param type A character, either 'terms' (the default) to return a tibble
#'   containing the variables that have been removed by the filter step, or
#'   'scores' to return the scores for each variable.
#' @export
tidy.step_select_infgain <- function(x, type = "terms", ...) {
  # hand the step and the requested summary type to the package's filter-step
  # tidy helper and return whatever it produces
  filter_summary <- tidy_filter_step(x, type)
  filter_summary
}
213 | 
#' @export
tunable.step_select_infgain <- function(x, ...) {
  # Each entry of `name` has to be an argument of the step, since tuning tools
  # pair these rows with the arguments that were marked with `tune()`. The
  # entropy measure is held in the step's `type` argument, so that row is named
  # "type" (it used to be "entropy", which is not an argument of this step)
  # while still pointing at the `colino::entropy()` parameter object.
  tibble::tibble(
    name = c("top_p", "type", "threshold", "cutoff"),
    call_info = list(
      list(pkg = "colino", fun = "top_p"),
      list(pkg = "colino", fun = "entropy", values = values_entropy),
      list(pkg = "dials", fun = "threshold", range = c(0, 1)),
      list(pkg = "colino", fun = "cutoff")
    ),
    source = "recipe",
    component = "step_select_infgain",
    component_id = x$id
  )
}
229 | 
#' @rdname required_pkgs.colino
#' @export
required_pkgs.step_select_infgain <- function(x, ...) {
  # colino itself plus the package whose `information_gain` is called in prep
  pkgs <- c("colino", "FSelectorRcpp")
  pkgs
}
235 | 


--------------------------------------------------------------------------------
/R/step_select_tree.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using a decision tree importance scores
  2 | #'
  3 | #' `step_select_tree` creates a *specification* of a recipe step that selects a
  4 | #' subset of predictors based on the ranking of variable importance provided by
  5 | #' a `parsnip::decision_tree` supported model.
  6 | #'
  7 | #' @inheritParams step_select_aov
  8 | #' @inherit step_select_aov return
  9 | #' @param outcome A character string with the name of the response variable to
 10 | #'   use to calculate the feature importance scores.
 11 | #' @param role Not used by this step since no new variables are created.
 12 | #' @param engine A supported rand_forest engine that is supported by parsnip.
 13 | #'   The default is "rpart".
 14 | #' @param cost_complexity A positive number for the the cost/complexity
 15 | #'   parameter (a.k.a. Cp) used by CART models (specific engines only).
 16 | #' @param tree_depth An integer for maximum depth of the tree.
 17 | #' @param min_n An integer for the minimum number of data points in a node that
 18 | #'   are required for the node to be split further.
 19 | #' @param scores A tibble with 'variable' and 'scores' columns containing the
 20 | #'   names of the variables and their feature importance scores. This parameter
 21 | #'   is only produced after the recipe has been trained.
 22 | #'
 23 | #' @export
 24 | #'
 25 | #' @details
 26 | #'
 27 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 28 | #' unspecified.
 29 | #'
 30 | #' @examples
 31 | #' library(recipes)
 32 | #' library(parsnip)
 33 | #'
 34 | #' # load the example cells dataset
 35 | #' data(cells, package = "modeldata")
 36 | #'
 37 | #' # create a preprocessing recipe
 38 | #' rec <-
 39 | #'  recipe(class ~ ., data = cells[, -1]) %>%
 40 | #'  step_select_tree(all_predictors(), outcome = "class", top_p = 10)
 41 | #'
 42 | #' prepped <- prep(rec)
 43 | #'
 44 | #' preproc_data <- bake(prepped, new_data = NULL)
 45 | #' prepped
 46 | step_select_tree <- function(
 47 |     recipe,
 48 |     ...,
 49 |     outcome = NULL,
 50 |     role = "predictor",
 51 |     trained = FALSE,
 52 |     engine = "rpart",
 53 |     cost_complexity = NULL,
 54 |     tree_depth = NULL,
 55 |     min_n = NULL,
 56 |     top_p = NA,
 57 |     threshold = NA,
 58 |     cutoff = NA,
 59 |     exclude = NULL,
 60 |     scores = NULL,
 61 |     skip = FALSE,
 62 |     id = recipes::rand_id("select_tree")) {
 63 | 
 64 |   engines <- parsnip::show_engines("decision_tree")$engine
 65 | 
 66 |   if (!engine %in% engines) {
 67 |     rlang::abort(
 68 |       paste("Engine argument should be one of", paste(engines, collapse = ", "))
 69 |     )
 70 |   }
 71 | 
 72 |   recipes::add_step(
 73 |     recipe,
 74 |     step_select_tree_new(
 75 |       terms = recipes::ellipse_check(...),
 76 |       trained = trained,
 77 |       outcome = outcome,
 78 |       role = role,
 79 |       engine = engine,
 80 |       cost_complexity = cost_complexity,
 81 |       tree_depth = tree_depth,
 82 |       min_n = min_n,
 83 |       top_p = top_p,
 84 |       threshold = threshold,
 85 |       cutoff = cutoff,
 86 |       exclude = exclude,
 87 |       scores = scores,
 88 |       skip = skip,
 89 |       id = id
 90 |     )
 91 |   )
 92 | }
 93 | 
 94 | # wrapper around 'step' function that sets the class of new step objects
 95 | #' @importFrom recipes step
 96 | step_select_tree_new <- function(terms, role, trained, outcome, engine,
 97 |                                  top_p, threshold, cutoff, cost_complexity,
 98 |                                  tree_depth, min_n, exclude, scores, skip, id) {
 99 |   recipes::step(
100 |     subclass = "select_tree",
101 |     terms = terms,
102 |     role = role,
103 |     trained = trained,
104 |     outcome = outcome,
105 |     engine = engine,
106 |     cost_complexity = cost_complexity,
107 |     tree_depth = tree_depth,
108 |     min_n = min_n,
109 |     top_p = top_p,
110 |     threshold = threshold,
111 |     cutoff = cutoff,
112 |     exclude = exclude,
113 |     scores = scores,
114 |     skip = skip,
115 |     id = id
116 |   )
117 | }
118 | 
#' @export
prep.step_select_tree <- function(x, training, info = NULL, ...) {

  # translate the terms arguments
  x_names <- recipes::recipes_eval_select(x$terms, training, info)
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # Results for the case where no predictors were selected. `res` has to exist
  # on that path as well because it is stored in the trained step below;
  # leaving it undefined made prep fail with "object 'res' not found".
  exclude <- character()
  res <- tibble::tibble(variable = character(), score = numeric())

  if (length(x_names) > 0) {
    # fit initial model
    X <- training[, x_names]
    y <- training[[y_name]]

    model_mode <- check_outcome(y)

    model_args <- list(
      cost_complexity = x$cost_complexity,
      tree_depth = x$tree_depth,
      min_n = x$min_n
    )

    # build the `parsnip::decision_tree()` call, then set its mode and engine
    model_spec <-
      parsnip::make_call("decision_tree", args = model_args, ns = "parsnip")

    model_spec <-
      rlang::eval_tidy(model_spec) %>%
      parsnip::set_mode(model_mode) %>%
      parsnip::set_engine(x$engine)

    # importance scores of the fitted model, as a named score vector
    initial_model <- parsnip::fit_xy(model_spec, X, y)
    res <- pull_importances(initial_model)
    names(res) <- c("variable", "score")
    res$score <- rlang::set_names(res$score, res$variable)

    exclude <-
      dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)
  }

  step_select_tree_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    engine = x$engine,
    cost_complexity = x$cost_complexity,
    tree_depth = x$tree_depth,
    min_n = x$min_n,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
183 | 
#' @export
bake.step_select_tree <- function(object, new_data, ...) {
  # Remove the columns that were flagged in `object$exclude` when the step was
  # prepped. `drop = FALSE` keeps a data frame even when only a single column
  # survives the filter.
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }

  as_tibble(new_data)
}
192 | 
#' @export
print.step_select_tree <-
  function(x, width = max(20, options()$width - 30), ...) {
    # One-line summary of the step; the number of excluded columns is appended
    # only once the step has been trained.
    header <- "Variable importance feature selection"
    cat(header)

    if (recipes::is_trained(x)) {
      n_excluded <- length(x$exclude)
      suffix <- paste0(" (", n_excluded, " excluded)")
      cat(suffix)
    }
    cat("\n")

    # return the step invisibly so that printing can be chained
    invisible(x)
  }
207 | 
#' @rdname step_select_tree
#' @param x A `step_select_tree` object.
#' @param type A character, either 'terms' (the default) to return a tibble
#'   containing the variables that have been removed by the filter step, or
#'   'scores' to return the scores for each variable.
#' @export
tidy.step_select_tree <- function(x, type = "terms", ...) {
  # hand the step and the requested summary type to the package's filter-step
  # tidy helper and return whatever it produces
  filter_summary <- tidy_filter_step(x, type)
  filter_summary
}
217 | 
#' @export
tunable.step_select_tree <- function(x, ...) {
  # tunable argument names, in the same order as `param_info` below
  param_names <- c(
    "top_p", "threshold", "cutoff",
    "cost_complexity", "tree_depth", "min_n"
  )

  # where each parameter object comes from, plus any overridden defaults
  param_info <- list(
    list(pkg = "colino", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(pkg = "colino", fun = "cutoff"),
    list(
      pkg = "dials",
      fun = "cost_complexity",
      range = c(-10, -1),
      trans = scales::log10_trans()
    ),
    list(pkg = "dials", fun = "tree_depth", range = c(1L, 15L)),
    list(pkg = "dials", fun = "min_n", range = c(2L, 40L))
  )

  tibble(
    name = param_names,
    call_info = param_info,
    source = "recipe",
    component = "step_select_tree",
    component_id = x$id
  )
}
236 | 
#' @rdname required_pkgs.colino
#' @export
required_pkgs.step_select_tree <- function(x, ...) {
  # only colino itself is reported for this step
  pkgs <- "colino"
  pkgs
}
242 | 


--------------------------------------------------------------------------------
/R/step_select_relief.R:
--------------------------------------------------------------------------------
  1 | #' Feature selection step using the Relief algorithm
  2 | #'
  3 | #' Relief-based algorithms use nearest neighbors of randomly sampled
  4 | #' observations (without replacement) to derive feature weights/scores that
  5 | #' describe the relevance of each feature to the target variable. The feature
  6 | #' weights represent the differences between the normalized feature values from
  7 | #' each randomly sampled observation and a neighboring observation. If the
  8 | #' neighboring observation's class is the same as the sampled observation
  9 | #' (termed a 'hit') but the feature values are different, then this reduces the
 10 | #' score on the basis that widely varying feature values for the same class are
 11 | #' not desirable. Conversely, if a neighboring observation's class is different
 12 | #' from the sampled observation (termed a 'miss') and the feature values are
 13 | #' different, then this increases the score on the basis that observations of
 14 | #' different classes are widely separated by their feature values. The feature
 15 | #' weights / scores range from -1 (worst) to +1 (best).
 16 | #'
 17 | #' `step_select_relief` creates a *specification* of a recipe step that selects
 18 | #' a subset of predictors based on the scores of the relief algorithm. This step
 19 | #' requires the FSinR package to be installed. The top `top_p` scoring features,
 20 | #' or features whose scores occur in the top percentile `threshold` will be
 21 | #' retained as new predictors.
 22 | #'
 23 | #' @inheritParams step_select_aov
 24 | #' @inherit step_select_aov return
 25 | #' @param role Not used by this step since no new variables are created.
 26 | #' @param outcome A character string with the name of the response variable to
 27 | #'   use to evaluate information gain value against the predictors.
 28 | #' @param neighbors An integer with the number of neighbors for find for each
 29 | #'   sampled instance. Default is 5.
 30 | #' @param sample_size An integer with the number of instances to sample. Default
 31 | #'   is 10.
 32 | #' @param scores A tibble with 'variable' and 'scores' columns containing the
 33 | #'   names of the variables and their information gain scores. This parameter is
 34 | #'   only produced after the recipe has been trained.
 35 | #'
 36 | #' @export
 37 | #' @keywords datagen
 38 | #' @concept preprocessing
 39 | #' @concept supervised_filter
 40 | #' @export
 41 | #' @details
 42 | #'
 43 | #' The recipe will stop if all of `top_p`, `threshold` and `cutoff` are left
 44 | #' unspecified.
 45 | #'
 46 | #' @examples
 47 | #' \dontrun{
 48 | #' library(recipes)
 49 | #'
 50 | #' data(cells, package = "modeldata")
 51 | #'
 52 | #' rec <- recipe(class ~ ., data = cells[, -1]) %>%
 53 | #'   step_select_relief(
 54 | #'     all_predictors(),
 55 | #'     outcome = "class",
 56 | #'     top_p = 10
 57 | #'   )
 58 | #'
 59 | #'   prepped <- prep(rec)
 60 | #'   new_data <- bake(prepped, new_data = NULL)
 61 | #'   prepped
 62 | #' }
 63 | step_select_relief <- function(
 64 |     recipe, ...,
 65 |     outcome = NULL,
 66 |     role = NA,
 67 |     trained = FALSE,
 68 |     top_p = NA,
 69 |     threshold = NA,
 70 |     cutoff = NA,
 71 |     neighbors = 5,
 72 |     sample_size = 10,
 73 |     exclude = NULL,
 74 |     scores = NULL,
 75 |     skip = FALSE,
 76 |     id = recipes::rand_id("select_relief")) {
 77 | 
 78 |   recipes::recipes_pkg_check("FSinR")
 79 | 
 80 |   if (neighbors <= 0)
 81 |     rlang::abort("`neighbors` must be greater than zero")
 82 | 
 83 |   if (sample_size <= 0)
 84 |     rlang::abort("'sample_size' must be greater than zero")
 85 | 
 86 |   terms <- recipes::ellipse_check(...)
 87 | 
 88 |   recipes::add_step(
 89 |     recipe,
 90 |     step_select_relief_new(
 91 |       terms = terms,
 92 |       trained = trained,
 93 |       outcome = outcome,
 94 |       role = role,
 95 |       top_p = top_p,
 96 |       threshold = threshold,
 97 |       cutoff = cutoff,
 98 |       neighbors = neighbors,
 99 |       sample_size = sample_size,
100 |       exclude = exclude,
101 |       scores = scores,
102 |       skip = skip,
103 |       id = id
104 |     )
105 |   )
106 | }
107 | 
108 | 
# Internal constructor: passes every field of the step on to `recipes::step()`
# under the "select_relief" subclass.
step_select_relief_new <- function(
    terms, role, trained, outcome,
    top_p, threshold, cutoff,
    neighbors, sample_size,
    exclude, scores, skip, id) {
  recipes::step(
    subclass = "select_relief",
    # selection and bookkeeping
    terms = terms,
    role = role,
    trained = trained,
    outcome = outcome,
    # filter criteria
    top_p = top_p,
    threshold = threshold,
    cutoff = cutoff,
    # relief settings
    neighbors = neighbors,
    sample_size = sample_size,
    # results filled in by prep
    exclude = exclude,
    scores = scores,
    skip = skip,
    id = id
  )
}
130 | 
131 | 
#' @export
prep.step_select_relief <- function(x, training, info = NULL, ...) {
  # resolve the predictor selection and the outcome to column names
  x_names <- recipes::recipes_eval_select(x$terms, training, info)
  y_name <- recipes::recipes_eval_select(x$outcome, training, info)
  y_name <- y_name[1]

  # check criteria
  check_criteria(x$top_p, x$threshold, match.call())
  check_zero_one(x$threshold)
  x$top_p <- check_top_p(x$top_p, length(x_names))

  # Results for the case where no predictors were selected. `res` has to exist
  # on that path as well because it is stored in the trained step below;
  # leaving it undefined made prep fail with "object 'res' not found".
  exclude <- character()
  res <- tibble::tibble(variable = character(), score = numeric())

  # feature selection
  if (length(x_names) > 0) {
    # build and evaluate the call to `relief` in the FSelectorRcpp namespace
    call_func <- rlang::call2(
      .fn = "relief",
      .ns = "FSelectorRcpp",
      x = rlang::quo(as.data.frame(training[, x_names])),
      y = rlang::quo(training[[y_name]]),
      neighboursCount = x$neighbors,
      sampleSize = x$sample_size
    )
    res <- rlang::eval_tidy(call_func)

    # two-column tibble with a named score vector, best scores first
    res <- as_tibble(res)
    res <- rlang::set_names(res, c("variable", "score"))
    res$score <- rlang::set_names(res$score, res$variable)
    res <- res[order(res$score, decreasing = TRUE), ]

    exclude <-
      dual_filter(res$score, x$top_p, x$threshold, x$cutoff, maximize = TRUE)
  }

  step_select_relief_new(
    terms = x$terms,
    trained = TRUE,
    role = x$role,
    outcome = y_name,
    top_p = x$top_p,
    threshold = x$threshold,
    cutoff = x$cutoff,
    neighbors = x$neighbors,
    sample_size = x$sample_size,
    exclude = exclude,
    scores = res,
    skip = x$skip,
    id = x$id
  )
}
182 | 
#' @export
bake.step_select_relief <- function(object, new_data, ...) {
  # Remove the columns that were flagged in `object$exclude` when the step was
  # prepped. The emptiness test is made on `object$exclude` itself (the
  # comparison with zero used to sit inside `length()`), and `drop = FALSE`
  # keeps a data frame even when only a single column survives the filter.
  if (length(object$exclude) > 0) {
    keep <- !(colnames(new_data) %in% object$exclude)
    new_data <- new_data[, keep, drop = FALSE]
  }
  as_tibble(new_data)
}
190 | 
#' @export
print.step_select_relief <-
  function(x, width = max(20, options()$width - 30), ...) {
    # One-line summary of the step; the number of excluded columns is appended
    # only once the step has been trained.
    header <- "Relief feature selection"
    cat(header)

    if (recipes::is_trained(x)) {
      n_excluded <- length(x$exclude)
      suffix <- paste0(" (", n_excluded, " excluded)")
      cat(suffix)
    }
    cat("\n")

    # return the step invisibly so that printing can be chained
    invisible(x)
  }
203 | 
#' @rdname step_select_relief
#' @param x A `step_select_relief` object.
#' @export
tidy.step_select_relief <- function(x, ...) {
  # A trained step reports the columns it excludes; an untrained step reports
  # a single missing value. The unused `sel2char()` result that used to be
  # computed on the untrained path has been removed.
  if (recipes::is_trained(x)) {
    res <- tibble(terms = x$exclude)
  } else {
    res <- tibble(terms = rlang::na_chr)
  }
  res$id <- x$id
  res
}
217 | 
#' @export
tunable.step_select_relief <- function(x, ...) {
  # tunable argument names, in the same order as `param_info` below
  param_names <- c("top_p", "threshold", "cutoff")

  # where each parameter object comes from, plus any overridden defaults
  param_info <- list(
    list(pkg = "colino", fun = "top_p"),
    list(pkg = "dials", fun = "threshold", range = c(0, 1)),
    list(pkg = "colino", fun = "cutoff")
  )

  tibble::tibble(
    name = param_names,
    call_info = param_info,
    source = "recipe",
    component = "step_select_relief",
    component_id = x$id
  )
}
232 | 
#' @rdname required_pkgs.colino
#' @export
required_pkgs.step_select_relief <- function(x, ...) {
  # The relief scores are computed by `relief` from FSelectorRcpp when the
  # step is prepped, so that package is reported (previously "FSinR", which
  # the step never calls).
  c("colino", "FSelectorRcpp")
}
238 | 


--------------------------------------------------------------------------------