├── .Rbuildignore
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   └── issue_template.md
│   └── workflows
│       ├── issue.yml
│       ├── stale-actions.yml
│       └── tic.yml
├── .gitignore
├── DESCRIPTION
├── Dockerfile
├── NAMESPACE
├── NEWS.md
├── R
│   ├── correlation_of_multiple_predictors.R
│   ├── create_folds_and_data_split.R
│   ├── feature_selection.R
│   ├── plot_feature_selection.R
│   ├── utils.R
│   └── wrapper_feature_selection.R
├── README.md
├── man
│   ├── DataSplit.Rd
│   ├── add_probs_dfs.Rd
│   ├── barplot_feat_select.Rd
│   ├── class_folds.Rd
│   ├── feature_selection.Rd
│   ├── func_correlation.Rd
│   ├── func_replace_NAs.Rd
│   ├── func_shuffle.Rd
│   ├── normalized.Rd
│   ├── regr_folds.Rd
│   ├── remove_duplic_func.Rd
│   ├── second_func_cor.Rd
│   └── wrapper_feat_select.Rd
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── helper-function_for_tests.R
│       ├── test-add_probability_matrices_OR_data_frames.R
│       ├── test-barplot_feat_select.R
│       ├── test-classification_folds.R
│       ├── test-correlation_function.R
│       ├── test-data_split_function.R
│       ├── test-feature_selection.R
│       ├── test-regression_folds.R
│       ├── test-secondary_functions_func_correlation.R
│       ├── test-shuffle_data.R
│       └── test-wrapper_feature_selection.R
└── tic.R
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^Dockerfile$
2 | ^\.git$
3 | ^\.github$
4 | ^\.ccache$
5 | ^\.github$
6 | ^tic\.R$
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | # For more info see: https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser
2 |
3 | blank_issues_enabled: true
4 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/issue_template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report or feature request
3 | about: Describe a bug you've encountered or make a case for a new feature
4 | ---
5 |
6 | Please briefly describe your problem and what output you expect. If you have a question, you also have the option of asking it here (but I'm flexible if it's not too complicated)
7 |
8 | Please include a minimal reproducible example
9 |
10 | Please give a brief description of the problem
11 |
12 | Please add your Operating System (e.g., Windows 10, macOS, Linux) and the R version that you use (e.g., 3.6.2)
13 | 
14 | If my package uses Python (via 'reticulate') then please also add the Python version (e.g., Python 3.8) and the 'reticulate' version (e.g., 1.18.0)
15 |
--------------------------------------------------------------------------------
/.github/workflows/issue.yml:
--------------------------------------------------------------------------------
1 | # For more info see: https://github.com/Renato66/auto-label
2 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret
3 |
4 | name: Labeling new issue
5 | on:
6 | issues:
7 | types: ['opened']
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: Renato66/auto-label@v2
13 | with:
14 | repo-token: ${{ secrets.GITHUB_TOKEN }}
15 | ignore-comments: true
16 | labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help"]}'
17 | labels-not-allowed: '["good first issue"]'
18 | default-labels: '["help wanted"]'
19 |
--------------------------------------------------------------------------------
/.github/workflows/stale-actions.yml:
--------------------------------------------------------------------------------
1 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret
2 |
3 | name: "Mark or close stale issues and PRs"
4 |
5 | on:
6 | schedule:
7 | - cron: "00 * * * *"
8 |
9 | jobs:
10 | stale:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/stale@v3
14 | with:
15 | repo-token: ${{ secrets.GITHUB_TOKEN }}
16 | days-before-stale: 12
17 | days-before-close: 7
18 | stale-issue-message: "This is Robo-lampros because the Human-lampros is lazy. This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs. Feel free to re-open a closed issue and the Human-lampros will respond."
19 | stale-pr-message: "This is Robo-lampros because the Human-lampros is lazy. This PR has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs."
20 | close-issue-message: "This issue was automatically closed because of being stale. Feel free to re-open a closed issue and the Human-lampros will respond."
21 | close-pr-message: "This PR was automatically closed because of being stale."
22 | stale-pr-label: "stale"
23 | stale-issue-label: "stale"
24 | exempt-issue-labels: "bug,enhancement,pinned,security,pending,work_in_progress"
25 | exempt-pr-labels: "bug,enhancement,pinned,security,pending,work_in_progress"
26 |
--------------------------------------------------------------------------------
/.github/workflows/tic.yml:
--------------------------------------------------------------------------------
1 | ## tic GitHub Actions template: linux-macos-windows-deploy
2 | ## revision date: 2020-12-11
3 | on:
4 | workflow_dispatch:
5 | push:
6 | pull_request:
7 | # for now, CRON jobs only run on the default branch of the repo (i.e. usually on master)
8 | schedule:
9 | # * is a special character in YAML so you have to quote this string
10 | - cron: "0 4 * * *"
11 |
12 | name: tic
13 |
14 | jobs:
15 | all:
16 | runs-on: ${{ matrix.config.os }}
17 |
18 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
19 |
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | config:
24 | # use a different tic template type if you do not want to build on all listed platforms
25 | - { os: macOS-latest, r: "release", pkgdown: "true", latex: "true" }
26 | - { os: ubuntu-latest, r: "devel" }
27 | - { os: ubuntu-latest, r: "release" }
28 |
29 | env:
30 | # otherwise remotes::fun() errors cause the build to fail. Example: Unavailability of binaries
31 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
32 | CRAN: ${{ matrix.config.cran }}
33 | # make sure to run `tic::use_ghactions_deploy()` to set up deployment
34 | TIC_DEPLOY_KEY: ${{ secrets.TIC_DEPLOY_KEY }}
35 | # prevent rgl issues because no X11 display is available
36 | RGL_USE_NULL: true
37 | # if you use bookdown or blogdown, replace "PKGDOWN" by the respective
38 | # capitalized term. This also might need to be done in tic.R
39 | BUILD_PKGDOWN: ${{ matrix.config.pkgdown }}
40 | # macOS >= 10.15.4 linking
41 | SDKROOT: /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk
42 | # use GITHUB_TOKEN from GitHub to workaround rate limits in {remotes}
43 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
44 |
45 | steps:
46 | - uses: actions/checkout@v3
47 |
48 | - uses: r-lib/actions/setup-r@v2
49 | with:
50 | r-version: ${{ matrix.config.r }}
51 | Ncpus: 4
52 |
53 | # LaTeX. Installation time:
54 | # Linux: ~ 1 min
55 | # macOS: ~ 1 min 30s
56 | # Windows: never finishes
57 | - uses: r-lib/actions/setup-tinytex@v2
58 | if: matrix.config.latex == 'true'
59 |
60 | - uses: r-lib/actions/setup-pandoc@v2
61 |
62 | # set date/week for use in cache creation
63 | # https://github.community/t5/GitHub-Actions/How-to-set-and-access-a-Workflow-variable/m-p/42970
64 | # - cache R packages daily
65 | - name: "[Cache] Prepare daily timestamp for cache"
66 | if: runner.os != 'Windows'
67 | id: date
68 | run: echo "::set-output name=date::$(date '+%d-%m')"
69 |
70 | - name: "[Cache] Cache R packages"
71 | if: runner.os != 'Windows'
72 | uses: pat-s/always-upload-cache@v2.1.3
73 | with:
74 | path: ${{ env.R_LIBS_USER }}
75 | key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}
76 | restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}
77 |
78 | # for some strange Windows reason this step and the next one need to be decoupled
79 | - name: "[Stage] Prepare"
80 | run: |
81 | Rscript -e "if (!requireNamespace('remotes')) install.packages('remotes', type = 'source')"
82 | Rscript -e "if (getRversion() < '3.2' && !requireNamespace('curl')) install.packages('curl', type = 'source')"
83 |
84 | - name: "[Stage] [Linux] Install curl and libgit2"
85 | if: runner.os == 'Linux'
86 | run: sudo apt install libcurl4-openssl-dev libgit2-dev
87 |
88 | - name: "[Stage] [macOS] Install libgit2"
89 | if: runner.os == 'macOS'
90 | run: brew install libgit2
91 |
92 | - name: "[Stage] [macOS] Install system libs for pkgdown"
93 | if: runner.os == 'macOS' && matrix.config.pkgdown != ''
94 | run: brew install harfbuzz fribidi
95 |
96 | - name: "[Stage] [Linux] Install system libs for pkgdown"
97 | if: runner.os == 'Linux' && matrix.config.pkgdown != ''
98 | run: sudo apt install libharfbuzz-dev libfribidi-dev
99 |
100 | - name: "[Stage] Install"
101 | if: matrix.config.os != 'macOS-latest' || matrix.config.r != 'devel'
102 | run: Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()"
103 |
104 | # macOS devel needs its own stage because we need to work with an option to suppress the usage of binaries
105 | - name: "[Stage] Prepare & Install (macOS-devel)"
106 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'devel'
107 | run: |
108 | echo -e 'options(Ncpus = 4, pkgType = "source", repos = structure(c(CRAN = "https://cloud.r-project.org/")))' > $HOME/.Rprofile
109 | Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()"
110 |
111 | - name: "[Stage] Script"
112 | run: Rscript -e 'tic::script()'
113 |
114 | - name: "[Stage] After Success"
115 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release'
116 | run: Rscript -e "tic::after_success()"
117 |
118 | - name: "[Stage] Upload R CMD check artifacts"
119 | if: failure()
120 | uses: actions/upload-artifact@v2.2.1
121 | with:
122 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results
123 | path: check
124 | - name: "[Stage] Before Deploy"
125 | run: |
126 | Rscript -e "tic::before_deploy()"
127 |
128 | - name: "[Stage] Deploy"
129 | run: Rscript -e "tic::deploy()"
130 |
131 | - name: "[Stage] After Deploy"
132 | run: Rscript -e "tic::after_deploy()"
133 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docs/
2 | .Rhistory
3 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: FeatureSelection
2 | Type: Package
3 | Title: Feature extraction and selection based on 'glmnet', 'xgboost' and 'ranger'
4 | Version: 1.0.0
5 | Date: 2024-08-09
6 | Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546")))
7 | Maintainer: Lampros Mouselimis <mouselimislampros@gmail.com>
8 | BugReports: https://github.com/mlampros/FeatureSelection/issues
9 | URL: https://github.com/mlampros/FeatureSelection
10 | Description: Feature extraction and selection based on 'glmnet', 'xgboost' and 'ranger' R packages. This package allows also the plotting of selected features and observing the correlation of multiple predictors.
11 | Depends:
12 | R(>= 3.3.0)
13 | Imports:
14 |     doParallel,
15 |     parallel,
16 |     data.table,
17 |     glmnet,
18 |     ranger,
19 |     xgboost,
20 |     Matrix,
21 |     magrittr,
22 |     utils,
23 |     stats,
24 |     graphics,
25 |     grDevices,
26 |     rlang
27 | Suggests:
28 |     testthat,
29 |     covr
30 | SystemRequirements: update: apt-get -y update (deb)
31 | License: GPL-3
32 | Encoding: UTF-8
33 | RoxygenNote: 7.3.0
34 | 
--------------------------------------------------------------------------------
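
A note on installation: the Dockerfile below installs the package from GitHub via 'remotes'. A minimal equivalent sketch in an R session (assuming the 'remotes' package is already installed):

    # install FeatureSelection from GitHub together with its dependencies
    remotes::install_github('mlampros/FeatureSelection', upgrade = 'always', dependencies = TRUE)
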
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rocker/rstudio:devel
2 |
3 | LABEL maintainer='Lampros Mouselimis'
4 |
5 | RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \
6 | apt-get install -y make zlib1g-dev libssl-dev libcurl4-openssl-dev && \
7 | apt-get install -y sudo && \
8 | apt-get -y update && \
9 | R -e "install.packages(c( 'doParallel', 'data.table', 'glmnet', 'ranger', 'xgboost', 'Matrix', 'magrittr', 'utils', 'stats', 'graphics', 'grDevices', 'rlang', 'testthat', 'covr', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \
10 | R -e "remotes::install_github('mlampros/FeatureSelection', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \
11 | apt-get autoremove -y && \
12 | apt-get clean
13 |
14 | ENV USER rstudio
15 |
16 |
17 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(DataSplit)
4 | export(add_probs_dfs)
5 | export(barplot_feat_select)
6 | export(class_folds)
7 | export(feature_selection)
8 | export(func_correlation)
9 | export(func_shuffle)
10 | export(normalized)
11 | export(regr_folds)
12 | export(wrapper_feat_select)
13 | importFrom(Matrix,Matrix)
14 | importFrom(Matrix,colSums)
15 | importFrom(data.table,as.data.table)
16 | importFrom(doParallel,registerDoParallel)
17 | importFrom(glmnet,cv.glmnet)
18 | importFrom(grDevices,dev.cur)
19 | importFrom(grDevices,dev.off)
20 | importFrom(graphics,barplot)
21 | importFrom(graphics,par)
22 | importFrom(magrittr,"%>%")
23 | importFrom(ranger,ranger)
24 | importFrom(rlang,.data)
25 | importFrom(stats,as.formula)
26 | importFrom(stats,cor)
27 | importFrom(stats,median)
28 | importFrom(stats,na.omit)
29 | importFrom(utils,combn)
30 | importFrom(xgboost,xgb.DMatrix)
31 | importFrom(xgboost,xgb.importance)
32 | importFrom(xgboost,xgb.train)
33 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 |
2 | ## FeatureSelection
3 |
4 | * **09-08-2024**: I replaced **dplyr** with **data.table** because the **dplyr** functions 'summarise_each_' and 'funs' gave deprecation warnings
5 | * **19-05-2021**: I replaced **doMC** with **doParallel** because **doMC** does not work on both **Unix** and **Windows** OS (applies only to **'glmnet-lasso'** method if number of threads > 1)
6 | * **03-02-2020**:
7 | + Updated the R files so that *Feature Selection* works with the newest versions of the imported R packages
8 | + Adjusted the tests
9 | + Added Dockerfile and docker image
10 | + Updated the README.md and .travis.yml files
11 | * **18-05-2016**: I added tests and code-coverage
12 |
--------------------------------------------------------------------------------
/R/correlation_of_multiple_predictors.R:
--------------------------------------------------------------------------------
1 | #' secondary function ( used in the func_correlation )
2 | #'
3 | #' this is a secondary function that is used in the func_correlation
4 | #'
5 | #' @keywords internal
6 | #' @importFrom stats na.omit
7 |
8 | # use this function if more than one predictor output
9 | second_func_cor = function(dat_frame) {
10 |
11 | lst = list()
12 |
13 | for (i in 1:dim(dat_frame)[2]) {
14 |
15 | coln = colnames(dat_frame)
16 | temp_df = data.frame(dat_frame[, i], row.names = rownames(dat_frame))
17 | colnames(temp_df) = coln[i]
18 | temp_df[temp_df == 0.0] = NA
19 | lst[[i]] = stats::na.omit(temp_df)
20 | }
21 |
22 | return(lst)
23 | }
24 |
25 |
26 | #' function to remove duplicated pairs of predictors ( used in the func_correlation )
27 | #'
28 | #' this is a secondary function that is used in the func_correlation
29 | #'
30 | #' @keywords internal
31 |
32 |
33 | remove_duplic_func = function(sublist) {
34 |
35 | vec_col = vec_row = vec_prob = rep(NA, dim(sublist)[1])
36 |
37 | for (j in 1:dim(sublist)[1]) {
38 |
39 | vec_col[j] = colnames(sublist)
40 | vec_row[j] = rownames(sublist)[j]
41 | vec_prob[j] = sublist[j, 1]
42 | }
43 |
44 | data.frame(predictor1 = vec_row, predictor2 = vec_col, prob = vec_prob)
45 | }
46 |
47 |
48 | #' find correlated variables
49 | #'
50 | #' This function takes a data frame or a matrix and returns either a data frame or a list of data frames with correlated variables
51 | #'
52 | #' @param data either a data frame or a matrix
53 | #' @param target either a string (name of the predictor/response in the data set) or a vector of strings (predictor/response names of the data set)
54 | #' @param correlation_thresh a float indicating the correlation threshold
55 | #' @param use_obs one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"
56 | #' @param correlation_method one of "pearson", "kendall", "spearman"
57 | #' @return either a data frame or a list of data frames
58 | #' @details
59 | #' This function takes a data frame or a matrix and returns the correlated variables of the data
60 | #' @export
61 | #' @importFrom stats cor na.omit
62 |
63 |
64 | func_correlation = function(data, target = NULL, correlation_thresh = NULL, use_obs = NULL, correlation_method = NULL) {
65 |
66 | if (!inherits(data, c("data.frame", "matrix"))) stop('the data should be either a data frame or a matrix')
67 |   if (sum(unlist(lapply(1:dim(data)[2], function(x) is.factor(data[, x]) || is.character(data[, x])))) > 0) stop(simpleError('data must be numeric'))
68 | if (is.null(correlation_method)) stop(simpleError('use one of "pearson", "kendall", "spearman" as correlation method'))
69 | if (is.null(use_obs)) stop(simpleError('use one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs" as use_obs'))
70 | if (is.null(correlation_thresh) || correlation_thresh <= 0.0 || correlation_thresh >= 1.0) stop(simpleError('the correlation_thresh should be greater than 0.0 and less than 1.0'))
71 |
72 | df = data.frame(stats::cor(data, use = use_obs, method = correlation_method))
73 |
74 | if (length(target) > 1) { # in case that target is a vector with multiple predictors, it returns a matrix with those predictors above the correlation_thresh
75 |
76 | df[upper.tri(df)] = diag(df) = 0.0
77 | df[df < correlation_thresh] = 0.0
78 | df = df[-as.vector(which(rowSums(df) == 0.0)), -as.vector(which(colSums(df) == 0.0))] # negate to remove all sparse columns and rows
79 |
80 | if (length(which(!is.na(match(colnames(df), target)))) == 1) { # exception if one of the column-names or row-names do not appear in the end-df
81 |
82 | filt = data.frame(row.names = rownames(df), df[, which(!is.na(match(colnames(df), target)))])
83 | colnames(filt) = colnames(df)[which(!is.na(match(colnames(df), target)))]
84 | filt[filt == 0.0] = NA
85 | filt = stats::na.omit(filt)
86 | }
87 |
88 | else {
89 |
90 | filt = df[, which(!is.na(match(colnames(df), target)))]
91 | }
92 | }
93 |
94 | else if (is.null(target)) {
95 |
96 | df[upper.tri(df)] = diag(df) = 0.0 # return all predictors in case that is.null(target)
97 | df[df < correlation_thresh] = 0.0
98 | filt = df[-as.vector(which(rowSums(df) == 0.0)), -as.vector(which(colSums(df) == 0.0))] # negate to remove all sparse columns and rows
99 | }
100 |
101 | else {
102 |
103 | df_names = data.frame(df[, target], row.names = rownames(df)) # in case where target is a single string, I get a single column data.frame out
104 | colnames(df_names) = target
105 | df_names[rownames(df_names) == target, ] = NA # remove the 'target' column-name
106 | df_names = stats::na.omit(df_names)
107 | filt = subset(df_names, df_names[, target] >= correlation_thresh)
108 | filt = data.frame(features = rownames(filt), filt[, 1])
109 | filt = filt[order(filt[, 2], decreasing = TRUE), ]
110 | filt = data.frame(filt[, 2], row.names = filt[, 1])
111 | colnames(filt) = c(target)
112 | }
113 |
114 | if (dim(filt)[2] > 1) {
115 |
116 | out = second_func_cor(filt)
117 | out = out[unlist(lapply(out, function(x) dim(x)[1] != 0))]
118 | return(list(out_list = out, out_df = do.call(rbind, lapply(out, function(x) remove_duplic_func(x)))))}
119 |
120 | else {
121 |
122 | return(filt)
123 | }
124 | }
125 |
126 |
127 |
--------------------------------------------------------------------------------
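
Note: func_correlation is exported but has no roxygen example. A minimal usage sketch on the numeric columns of iris (the threshold and method are illustrative choices, not recommendations):

    data(iris)

    # single target: a one-column data frame of predictors whose correlation
    # with 'Sepal.Length' is >= 0.7, sorted in decreasing order
    res_single = func_correlation(iris[, -5], target = 'Sepal.Length',
                                  correlation_thresh = 0.7,
                                  use_obs = 'complete.obs',
                                  correlation_method = 'pearson')

    # no target: all correlated predictor pairs above the threshold
    # (returned as a list of data frames plus a combined pairs data frame)
    res_all = func_correlation(iris[, -5],
                               correlation_thresh = 0.7,
                               use_obs = 'complete.obs',
                               correlation_method = 'pearson')
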
/R/create_folds_and_data_split.R:
--------------------------------------------------------------------------------
1 | #' stratified folds (in classification)
2 | #'
3 | #' this function creates stratified folds in binary and multiclass classification
4 | #'
5 | #' @param folds is an integer specifying the number of folds
6 | #' @param RESP is the response variable
7 | #' @param shuffle is a boolean specifying if the vector of indices should be shuffled or not
8 | #' @return a list of indices
9 | #' @export
10 | #' @importFrom utils combn
11 | #' @examples
12 | #'
13 | #' \dontrun{
14 | #'
15 | #' data(iris)
16 | #'
17 | #' y = as.factor(iris[, 5])
18 | #'
19 | #' folds = class_folds(10, y, shuffle = TRUE)
20 | #' }
21 |
22 |
23 | class_folds = function(folds, RESP, shuffle = FALSE) {
24 |
25 | if (!is.factor(RESP)) {
26 |
27 | stop(simpleError("RESP must be a factor"))
28 | }
29 |
30 | clas = lapply(unique(RESP), function(x) which(RESP == x))
31 |
32 | len = lapply(clas, function(x) length(x))
33 |
34 | samp_vec = rep(1/folds, folds)
35 |
36 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x])))
37 |
38 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))
39 |
40 | spl = suppressWarnings(split(1:length(RESP), repl))
41 |
42 | sort_names = paste0('fold_', 1:folds)
43 |
44 | spl = spl[sort_names]
45 |
46 | if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) {
47 |
48 |     warning('the folds are not equally split') # this warning appears when the per-class counts are not exactly divisible by the number of folds
49 | }
50 |
51 | if (shuffle == TRUE) {
52 |
53 | spl = lapply(spl, function(x) func_shuffle(x)) # the indices of the unique levels will be shuffled
54 | }
55 |
56 | ind = t(utils::combn(1:folds, 2))
57 |
58 |   ind1 = apply(ind, 1, function(x) length(intersect(spl[[x[1]]], spl[[x[2]]])))
59 |
60 | if (sum(ind1) > 0) {
61 |
62 | stop(simpleError("there is an intersection between the resulted indexes of the folds"))
63 | }
64 |
65 | if (length(unlist(spl)) != length(RESP)) {
66 |
67 |     stop(simpleError("the number of items in the folds is not equal to the number of response items"))
68 | }
69 |
70 | spl
71 | }
72 |
73 |
74 |
75 |
76 | #' create folds (in regression)
77 | #'
78 | #' this function creates both stratified and non-stratified folds in regression
79 | #'
80 | #' @param folds is an integer specifying the number of folds
81 | #' @param RESP is the response variable
82 | #' @param stratified is a boolean specifying if the folds should be stratfied
83 | #' @return a list of indices
84 | #' @export
85 | #' @examples
86 | #'
87 | #' \dontrun{
88 | #'
89 | #' data(iris)
90 | #'
91 | #' y = iris[, 1]
92 | #'
93 | #' folds = regr_folds(5, y, stratified = FALSE)
94 | #'
95 | #' }
96 |
97 |
98 | regr_folds = function(folds, RESP, stratified = FALSE) {
99 |
100 | if (is.factor(RESP)) {
101 |
102 |     stop(simpleError("this function is meant for regression; for classification use the 'class_folds' function"))
103 | }
104 |
105 | samp_vec = rep(1/folds, folds)
106 |
107 | sort_names = paste0('fold_', 1:folds)
108 |
109 |
110 | if (stratified == TRUE) {
111 |
112 | stratif = cut(RESP, breaks = folds)
113 |
114 | clas = lapply(unique(stratif), function(x) which(stratif == x))
115 |
116 | len = lapply(clas, function(x) length(x))
117 |
118 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x])))
119 |
120 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))
121 |
122 | spl = suppressWarnings(split(1:length(RESP), repl))}
123 |
124 | else {
125 |
126 | prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x])))
127 |
128 | repl = func_shuffle(unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))))
129 |
130 | spl = suppressWarnings(split(1:length(RESP), repl))
131 | }
132 |
133 | spl = spl[sort_names]
134 |
135 | if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) {
136 |
137 |     warning('the folds are not equally split') # this warning appears when the length of the response is not exactly divisible by the number of folds
138 | }
139 |
140 | if (length(unlist(spl)) != length(RESP)) {
141 |
142 |     stop(simpleError("the length of the splits is not equal to the length of the response"))
143 | }
144 |
145 | spl
146 | }
147 |
148 |
149 |
150 | #' partition of data (train-test-split)
151 | #'
152 | #' @param y is a numeric vector (response variable)
153 | #' @param TrainRatio is the proportion of train-data after the partition (a number greater than 0 and less than 1)
154 | #' @param regression is a boolean (TRUE, FALSE) indicating if it's a regression or classification task
155 | #' @param shuffle is a boolean (TRUE, FALSE) indicating if the data should be shuffled or not (by default 5 times)
156 | #' @param seed an integer specifying the random seed
157 | #' @return a list of indices (train-test)
158 | #' @export
159 | #' @examples
160 | #'
161 | #' \dontrun{
162 | #'
163 | #' data(iris)
164 | #'
165 | #' y = iris[, 1]
166 | #'
167 | #' split = DataSplit(y, TrainRatio = 0.75, regression = TRUE, shuffle = FALSE)
168 | #'
169 | #' }
170 |
171 |
172 | DataSplit = function(y, TrainRatio = 0.75, regression = TRUE, shuffle = FALSE, seed = 1) {
173 |
174 |   if (TrainRatio >= 1.0 || TrainRatio <= 0.0) stop('TrainRatio should be a number greater than 0.0 and less than 1.0')
175 |
176 | if (regression) {
177 | set.seed(seed)
178 | idx_train = sample(1:length(y), size = round(TrainRatio * length(y)))
179 | idx_test = setdiff(1:length(y), idx_train)
180 | }
181 |
182 | if (!regression) {
183 | clas = lapply(unique(y), function(x) which(y == x))
184 | set.seed(seed)
185 | idx_train = unlist(lapply(clas, function(x) sample(x, size = round(TrainRatio * length(x)))))
186 | idx_test = setdiff(1:length(y), idx_train)
187 | }
188 |
189 | if (shuffle) {
190 |
191 | for (i in c(1:5)){ idx_train = sample(idx_train, length(idx_train)) }
192 |
193 | for (i in c(1:5)){ idx_test = sample(idx_test, length(idx_test)) }
194 | }
195 |
196 | list(idx_train = idx_train, idx_test = idx_test)
197 | }
198 |
--------------------------------------------------------------------------------
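
A short sanity check of the fold and split helpers above, using iris (150 rows, 3 balanced classes; exact values depend on the seed):

    data(iris)
    y = as.factor(iris[, 5])

    folds = class_folds(5, y, shuffle = TRUE)
    sapply(folds, length)          # 5 folds with 30 indices each
    table(y[folds$fold_1])         # roughly 10 observations of each class per fold

    split = DataSplit(iris[, 1], TrainRatio = 0.75, regression = TRUE, seed = 1)
    length(split$idx_train)        # about 112 train indices (0.75 * 150)
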
/R/feature_selection.R:
--------------------------------------------------------------------------------
1 |
2 | utils::globalVariables(c("%>%",
3 | ".",
4 | ".N",
5 | ".SD",
6 | "coefficients",
7 | "predict")) # Keep 'predict' as a global variable. It appears both in 'stats' and 'glmnet' however I can not specify 'predict.cv.glmnet' because the function does not appear in the >= 3.0.0 version of the package (I receive an error otherwise)
8 |
9 |
10 | #' Feature selection
11 | #'
12 | #' This function uses three different methods (glmnet, xgboost, ranger) in order to select important features.
13 | #'
14 | #' @param X a sparse Matrix, a matrix or a data frame
15 | #' @param y a vector of length equal to the number of rows of X representing the response variable
16 | #' @param method one of 'glmnet-lasso', 'xgboost', 'ranger'
17 | #' @param params_glmnet a list of parameters for the glmnet model
18 | #' @param params_xgboost a list of parameters for the xgboost model
19 | #' @param params_ranger a list of parameters for the ranger model
20 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency")
21 | #' @param CV_folds a number specifying the number of folds for cross validation
22 | #' @param stratified_regr a boolean determining if the folds in regression should be stratified
23 | #' @param scale_coefs_glmnet if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)
24 | #' @param cores_glmnet an integer determining the number of cores to register in glmnet
25 | #' @param verbose a boolean specifying if progress information should be printed
26 | #' @return a data frame with the most important features
27 | #' @author Lampros Mouselimis
28 | #'
29 | #' @details
30 | #'
31 | #' This function returns the important features using one of the glmnet, xgboost or ranger algorithms. The glmnet algorithm can take either a sparse matrix, a matrix or a data frame
32 | #' and returns a data frame with non-zero coefficients. The xgboost algorithm can take either a sparse matrix, a matrix or a data frame and returns the importance of the features in the form
33 | #' of a data frame; furthermore, it is possible to sort the features using one of the "Gain", "Cover" or "Frequency" methods. The ranger algorithm can take either a matrix or a data frame
34 | #' and returns the important features using one of the 'impurity' or 'permutation' methods.
35 | #'
36 | #' @export
37 | #' @importFrom glmnet cv.glmnet
38 | #' @importFrom data.table as.data.table
39 | #' @importFrom doParallel registerDoParallel
40 | #' @importFrom xgboost xgb.DMatrix xgb.train xgb.importance
41 | #' @importFrom ranger ranger
42 | #' @importFrom stats as.formula
43 | #' @importFrom Matrix colSums Matrix
44 | #' @importFrom magrittr %>%
45 | #' @importFrom rlang .data
46 | #'
47 | #' @examples
48 | #'
49 | #' \dontrun{
50 | #'
51 | #' #...........
52 | #' # regression
53 | #' #...........
54 | #'
55 | #' data(iris)
56 | #'
57 | #' X = iris[, -5]
58 | #' y = X[, 1]
59 | #' X = X[, -1]
60 | #'
61 | #' params_glmnet = list(alpha = 1,
62 | #' family = 'gaussian',
63 | #' nfolds = 3,
64 | #' parallel = TRUE)
65 | #'
66 | #' res = feature_selection(X,
67 | #' y,
68 | #' method = 'glmnet-lasso',
69 | #' params_glmnet = params_glmnet,
70 | #' CV_folds = 5,
71 | #' cores_glmnet = 5)
72 | #'
73 | #' #......................
74 | #' # binary classification
75 | #' #......................
76 | #'
77 | #' y = iris[, 5]
78 | #' y = as.character(y)
79 | #' y[y == 'setosa'] = 'virginica'
80 | #' X = iris[, -5]
81 | #'
82 | #' params_ranger = list(write.forest = TRUE,
83 | #' probability = TRUE,
84 | #' num.threads = 6,
85 | #' num.trees = 50,
86 | #' verbose = FALSE,
87 | #' classification = TRUE,
88 | #' mtry = 2,
89 | #' min.node.size = 5,
90 | #' importance = 'impurity')
91 | #'
92 | #' res = feature_selection(X,
93 | #' y,
94 | #' method = 'ranger',
95 | #' params_ranger = params_ranger,
96 | #' CV_folds = 5)
97 | #'
98 | #' #..........................
99 | #' # multiclass classification
100 | #' #..........................
101 | #'
102 | #' y = iris[, 5]
103 | #' multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
104 | #' X = iris[, -5]
105 | #'
106 | #' params_xgboost = list(params = list("objective" = "multi:softprob",
107 | #' "bst:eta" = 0.35,
108 | #' "subsample" = 0.65,
109 | #' "num_class" = 3,
110 | #' "max_depth" = 6,
111 | #' "colsample_bytree" = 0.65,
112 | #' "nthread" = 2),
113 | #' nrounds = 50,
114 | #' print.every.n = 50,
115 | #' verbose = 0,
116 | #' maximize = FALSE)
117 | #'
118 | #' res = feature_selection(X,
119 | #' multiclass_xgboost,
120 | #' method = 'xgboost',
121 | #' params_xgboost = params_xgboost,
122 | #' CV_folds = 5)
123 | #' }
124 |
125 |
126 | feature_selection = function(X,
127 | y,
128 | method = NULL,
129 | params_glmnet = NULL,
130 | params_xgboost = NULL,
131 | params_ranger = NULL,
132 | xgb_sort = NULL,
133 | CV_folds = 5,
134 | stratified_regr = FALSE,
135 | scale_coefs_glmnet = FALSE,
136 | cores_glmnet = NULL,
137 | verbose = FALSE) {
138 |
139 | if (is.null(method)) stop("use method = .. to select one of the available methods : xgboost, glmnet-lasso, ranger")
140 | if (CV_folds < 1) stop("CV_folds should be >= 1")
141 |
142 | if (method == 'glmnet-lasso' && CV_folds == 1) {
143 |
144 | if (verbose) {
145 | cat('=====================================================================', '\n')
146 | cat('glmnet feature selection, starts...', '\n')
147 | cat('=====================================================================', '\n')
148 | }
149 |
150 | if (params_glmnet$family == 'binomial' || params_glmnet$family == 'multinomial') {
151 |
152 | y = as.factor(y)
153 | }
154 |
155 | isna = as.vector(Matrix::colSums(is.na(X))) # replace the NA-values of each column with the median
156 |
157 | if (sum(isna) > 0) {
158 |
159 | if (verbose) {
160 | cat('\n')
161 | cat('Missing values present in glmnet-lasso. They will be replaced with the median.', '\n')
162 | cat('\n')
163 | }
164 |
165 | X = func_replace_NAs(X, which(isna > 0))
166 | }
167 |
168 | Feature = colnames(X)
169 |
170 | if (is.data.frame(X)) {
171 |
172 | X <- as.matrix(X)
173 | }
174 |
175 | else if (is.matrix(X) || (inherits(X, 'dgCMatrix'))) {
176 |
177 | X = X
178 | }
179 |
180 | else {
181 |
182 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix"))
183 | }
184 |
185 | # scale the explanatory variables as explained here : http://stats.stackexchange.com/questions/14853/variable-importance-from-glmnet
186 | # [ exclude from scaling those predictors that have less than 2 unique values, OTHERWISE error ]
187 |     if (scale_coefs_glmnet) { excl = which(as.vector(apply(X, 2, function(x) length(unique(x)))) < 2); if (length(excl) > 0) { X[, -excl] = scale(X[, -excl]) } else { X = scale(X) } }   # if no column is excluded, scale them all ('X[, -integer(0)]' would select none)
188 |
189 | params_glmnet[['x']] = X
190 | params_glmnet[['y']] = y
191 | if (scale_coefs_glmnet) params_glmnet[['standardize']] = FALSE # after using scale() ensure that the variables won't be standardized prior to fitting the model
192 |
193 | cv = do.call(glmnet::cv.glmnet, params_glmnet)
194 |
195 | pr = predict(cv, type = 'coefficients', s = cv$lambda.min)
196 |
197 | if (is.factor(y)) { # in case of classification glmnet returns importance in form of a sparse matrix
198 |
199 | if (length(unique(y)) == 2) { # in case of binary-classification it returns a single column
200 |
201 | df1 = as.matrix(pr)[-1, ]
202 | df1 = data.frame(features = names(df1), importance = as.vector(df1))
203 |
204 | if (scale_coefs_glmnet) {
205 | df1[, 2] = abs(df1[, 2])
206 | df1 = df1[order(df1[, 2], decreasing = TRUE), ]
207 | }
208 | }
209 |
210 | if (length(unique(y)) > 2) { # in case of multiclass classification it returns a sparse matrix for each class separately
211 |
212 | df1 = do.call(rbind, lapply(pr, function(x) as.matrix(x)[-1, ]))
213 | df1 = colMeans(df1)
214 |
215 | if (any(df1 == 0.0)) {
216 |
217 | df1 = df1[-which(df1 == 0L)] # remove zero-coefficient predictors
218 | }
219 |
220 | df1 = data.frame(features = names(df1), importance = as.vector(df1))
221 | if (scale_coefs_glmnet) {
222 | df1[, 2] = abs(df1[, 2]) # after scaling, I take the absolute value in order to plot the important features [ this because many of them have high negative values -- meaning high impact on the response ]
223 | df1 = df1[order(df1[, 2], decreasing = TRUE), ]
224 | }
225 | }
226 | }
227 |
228 | else {
229 |
230 | df = data.frame(Feature, coefficients = pr[2:length(pr)], stringsAsFactors = FALSE)
231 |
232 | df1 = subset(df, df[,2] != 0)
233 | if (scale_coefs_glmnet) {
234 | df1[, 2] = abs(df1[, 2])
235 | df1 = df1[order(df1[, 2], decreasing = TRUE), ]
236 | }
237 | }
238 |
239 | return(df1)
240 | }
241 |
242 | else if (method == 'glmnet-lasso' && CV_folds > 1) {
243 |
244 | if (params_glmnet$parallel && !is.null(cores_glmnet)) {
245 |
246 | if (.Platform$OS.type == "unix") {
247 | doParallel::registerDoParallel(cores = cores_glmnet)
248 | }
249 |
250 | if (.Platform$OS.type == "windows") {
251 | cl = parallel::makePSOCKcluster(cores_glmnet)
252 | doParallel::registerDoParallel(cl = cl) # compared to unix, ".. if not specified, on Windows a three worker cluster is created and used .." [ see also: https://stackoverflow.com/a/45122448/8302386 ]
253 | }
254 | }
255 |
256 | if (verbose) {
257 | cat('=====================================================================', '\n')
258 | cat('glmnet feature selection, starts...', '\n')
259 | cat('=====================================================================', '\n')
260 | }
261 |
262 | if (params_glmnet$family == 'binomial' || params_glmnet$family == 'multinomial') {
263 |
264 | y = as.factor(y)
265 | }
266 |
267 | if (is.factor(y)) {
268 |
269 | folds = class_folds(CV_folds, y, shuffle = TRUE)
270 | }
271 |
272 | else {
273 |
274 | folds = regr_folds(CV_folds, y, stratified = stratified_regr)
275 | }
276 |
277 | get_all_feat = list()
278 |
279 | for (i in 1:CV_folds) {
280 |
281 | if (verbose) {
282 | cat('--------------------------------------------------------------------', '\n')
283 | cat('Fold ', i, '\n')
284 | cat('--------------------------------------------------------------------', '\n')
285 | }
286 |
287 | X_folds = X[unlist(folds[-i]), ]
288 | y_folds = y[unlist(folds[-i])]
289 |
290 | isna = as.vector(Matrix::colSums(is.na(X_folds))) # replace the NA-values of each column with the median
291 |
292 | if (sum(isna) > 0) {
293 |
294 | if (verbose) {
295 | cat('\n')
296 | cat('Missing values present in glmnet-lasso. They will be replaced with the median.', '\n')
297 | cat('\n')
298 | }
299 |
300 | X_folds = func_replace_NAs(X_folds, which(isna > 0))
301 | }
302 |
303 | Feature = colnames(X_folds)
304 |
305 | if (is.data.frame(X_folds)) {
306 |
307 | X_folds <- as.matrix(X_folds)
308 | }
309 |
310 |     else if (is.matrix(X_folds) || (inherits(X_folds, 'dgCMatrix'))) {
311 |
312 | X_folds = X_folds
313 | }
314 |
315 | else {
316 |
317 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix"))
318 | }
319 |
320 | # scale the explanatory variables as explained here : http://stats.stackexchange.com/questions/14853/variable-importance-from-glmnet
321 | # [ exclude from scaling those predictors that have less than 2 unique values, OTHERWISE error ]
322 |     if (scale_coefs_glmnet) { excl = which(as.vector(apply(X_folds, 2, function(x) length(unique(x)))) < 2); if (length(excl) > 0) { X_folds[, -excl] = scale(X_folds[, -excl]) } else { X_folds = scale(X_folds) } }   # if no column is excluded, scale them all ('X_folds[, -integer(0)]' would select none)
323 |
324 | params_glmnet[['x']] = X_folds
325 | params_glmnet[['y']] = y_folds
326 | if (scale_coefs_glmnet) params_glmnet[['standardize']] = FALSE # after using scale() ensure that the variables won't be standardized prior to fitting the model
327 |
328 | cv = do.call(glmnet::cv.glmnet, params_glmnet)
329 |
330 | pr = predict(cv, type = 'coefficients', s = cv$lambda.min)
331 |
332 | if (is.factor(y)) { # in case of classification glmnet returns importance in form of a sparse matrix
333 |
334 | if (length(unique(y)) == 2) { # in case of binary-classification it returns a single column
335 |
336 | get_all_feat[[i]] = as.matrix(pr)[-1, ]
337 | }
338 |
339 | if (length(unique(y)) > 2) { # in case of multiclass classification it returns a sparse matrix for each class separately
340 |
341 | get_all_feat[[i]] = do.call(rbind, lapply(pr, function(x) as.matrix(x)[-1, ]))
342 |
343 | gc()
344 | }
345 | }
346 |
347 | else {
348 |
349 | df = data.frame(Feature, coefficients = pr[2:length(pr)], stringsAsFactors = FALSE)
350 |
351 | df1 = subset(df, df[,2] != 0)
352 |
353 | get_all_feat[[i]] = df1
354 |
355 | gc()
356 | }
357 | }
358 |
359 | if (is.factor(y)) {
360 |
361 | if (length(unique(y)) == 2) {
362 |
363 | tbl_x = colMeans(data.frame(do.call(rbind, get_all_feat)))
364 | tbl_x = data.frame(features = names(tbl_x), importance = as.vector(tbl_x))
365 | if (scale_coefs_glmnet) {
366 | tbl_x[, 2] = abs(tbl_x[, 2])
367 | tbl_x = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ]
368 | }
369 | }
370 |
371 | if (length(unique(y)) > 2) {
372 |
373 | df1 = data.frame(add_probs_dfs(get_all_feat), row.names = rownames(get_all_feat[[1]]))
374 | df1 = colMeans(df1)
375 |
376 | if (any(df1 == 0.0)) {
377 |
378 | df1 = df1[-which(df1 == 0L)] # remove zero-coefficient predictors
379 | }
380 |
381 | tbl_x = data.frame(features = names(df1), importance = as.vector(df1))
382 | if (scale_coefs_glmnet) {
383 | tbl_x[, 2] = abs(tbl_x[, 2]) # after scaling, I take the absolute value in order to plot the important features [ this because many of them have high negative values -- meaning high impact on the response ]
384 | tbl_x = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ]
385 | }
386 | }
387 | }
388 |
389 | else {
390 |
391 | all_feat = data.frame(do.call('rbind', get_all_feat)) |>
392 | data.table::as.data.table()
393 |
394 | tbl_x = all_feat[, .(coefficients = mean(coefficients, na.rm = TRUE),
395 | Frequency = .N),
396 | by = 'Feature'] |>
397 | as.data.frame()
398 |
399 | if (scale_coefs_glmnet) tbl_x[, 2] = abs(tbl_x[, 2])
400 | tbl_x = tbl_x[order(tbl_x$Frequency, tbl_x$coefficients, decreasing = TRUE),] # the data.frame in 'glmnet-lasso' is sorted by Frequency (default)
401 | }
402 |
403 | if (params_glmnet$parallel && !is.null(cores_glmnet)) {
404 |
405 | if (.Platform$OS.type == "windows") {
406 | parallel::stopCluster(cl = cl)
407 | }
408 | }
409 |
410 | return(tbl_x)
411 | }
412 |
413 | else if (method == 'xgboost' && CV_folds == 1) {
414 |
415 | if (verbose) {
416 | cat('=====================================================================', '\n')
417 | cat('xgboost feature selection, starts...', '\n')
418 | cat('=====================================================================', '\n')
419 | }
420 |
421 | if (is.data.frame(X)) {
422 |
423 | dtrain <- xgboost::xgb.DMatrix(data = as.matrix(X), label = y, missing = NA)
424 | }
425 |
426 | else if (is.matrix(X) || (inherits(X, 'dgCMatrix'))) {
427 |
428 | dtrain <- xgboost::xgb.DMatrix(data = X, label = y, missing = NA)
429 | }
430 |
431 | else {
432 |
433 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix"))
434 | }
435 |
436 | params_xgboost[['watchlist']] = list(train = dtrain)
437 | params_xgboost[['data']] = dtrain
438 |
439 | bst = suppressWarnings(do.call(xgboost::xgb.train, params_xgboost))
440 |
441 | tbl1 <- data.frame(xgboost::xgb.importance(colnames(X), model = bst))
442 |
443 | if (is.null(xgb_sort) || (xgb_sort == 'Frequency')) {
444 |
445 | tbl1 = tbl1[order(tbl1$Frequency, decreasing = TRUE),]
446 | }
447 |
448 | else if (xgb_sort == 'Gain') {
449 |
450 | tbl1 = tbl1[order(tbl1$Gain, decreasing = TRUE),]
451 | }
452 |
453 | else if (xgb_sort == 'Cover') {
454 |
455 | tbl1 = tbl1[order(tbl1$Cover, decreasing = TRUE),]
456 | }
457 |
458 | return(tbl1)
459 | }
460 |
461 | else if (method == 'xgboost' && CV_folds > 1) {
462 |
463 | if (verbose) {
464 | cat('=====================================================================', '\n')
465 | cat('xgboost feature selection, starts...', '\n')
466 | cat('=====================================================================', '\n')
467 | }
468 |
469 | if (length(unique(y)) == 2 || ("num_class" %in% names(params_xgboost$params))) {
470 |
471 | folds = class_folds(CV_folds, as.factor(y), shuffle = TRUE)
472 | }
473 |
474 | else {
475 |
476 | folds = regr_folds(CV_folds, y, stratified = stratified_regr)
477 | }
478 |
479 | get_all_feat = list()
480 |
481 | for (i in 1:CV_folds) {
482 |
483 | if (verbose) {
484 | cat('--------------------------------------------------------------------', '\n')
485 | cat('Fold ', i, '\n')
486 | cat('--------------------------------------------------------------------', '\n')
487 | }
488 |
489 | X_folds = X[unlist(folds[-i]), ]
490 |
491 | y_folds = y[unlist(folds[-i])]
492 |
493 | if (is.data.frame(X_folds)) {
494 |
495 | dtrain <- xgboost::xgb.DMatrix(data = as.matrix(X_folds), label = y_folds, missing = NA)
496 | }
497 |
498 | else if (is.matrix(X_folds) || (inherits(X_folds, 'dgCMatrix'))) {
499 |
500 | dtrain <- xgboost::xgb.DMatrix(data = X_folds, label = y_folds, missing = NA)
501 | }
502 |
503 | else {
504 |
505 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix"))
506 | }
507 |
508 | params_xgboost[['watchlist']] = list(train = dtrain)
509 | params_xgboost[['data']] = dtrain
510 |
511 | bst = suppressWarnings(do.call(xgboost::xgb.train, params_xgboost))
512 |
513 | get_all_feat[[i]] <- data.frame(xgboost::xgb.importance(colnames(X_folds), model = bst))
514 |
515 | gc()
516 | }
517 |
518 | tbl_x = data.frame(do.call('rbind', get_all_feat)) |>
519 | data.table::as.data.table()
520 |
521 | tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |>
522 | as.data.frame()
523 |
524 | if (is.null(xgb_sort) || (xgb_sort == 'Frequency')) {
525 |
526 | tbl1 = tbl1[order(tbl1$Frequency, decreasing = TRUE),]
527 | }
528 |
529 | else if (xgb_sort == 'Gain') {
530 |
531 | tbl1 = tbl1[order(tbl1$Gain, decreasing = TRUE),]
532 | }
533 |
534 | else if (xgb_sort == 'Cover') {
535 |
536 | tbl1 = tbl1[order(tbl1$Cover, decreasing = TRUE),]
537 | }
538 |
539 | return(tbl1)
540 | }
541 |
542 | else if (method == 'ranger' && CV_folds == 1) {
543 |
544 | if (!(is.matrix(X) || is.data.frame(X))) {
545 |
546 | stop(simpleError("X must be either a data.frame or a matrix"))
547 | }
548 |
549 | if (params_ranger$classification) {
550 |
551 | y = as.factor(y)
552 | }
553 |
554 | if (verbose) {
555 | cat('=====================================================================', '\n')
556 | cat('ranger feature selection, starts...', '\n')
557 | cat('=====================================================================', '\n')
558 | }
559 |
560 | if (!"dependent.variable.name" %in% names(params_ranger)) {
561 |
562 | form = stats::as.formula(paste0(paste0('y ~ '), paste(colnames(X), collapse = '+')))
563 |
564 | params_ranger[['formula']] = form
565 |
566 | #dat = data.frame(y = y, X)}
567 | }
568 |
569 | dat = cbind(y = y, X) # include y in the data so that it works with or without the 'dependent.variable.name'
570 |
571 | # else {
572 | #
573 | # dat = X
574 | # }
575 |
576 | params_ranger[['data']] = dat
577 |
578 | fit = do.call(ranger::ranger, params_ranger)
579 |
580 | tbl_x = data.frame(names(fit$variable.importance), as.vector(fit$variable.importance))
581 | colnames(tbl_x) = c('Feature', params_ranger$importance)
582 |
583 | tbl1 = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ]
584 |
585 | return(tbl1)
586 | }
587 |
588 | else if (method == 'ranger' && CV_folds > 1) {
589 |
590 | if (verbose) {
591 | cat('=====================================================================', '\n')
592 | cat('ranger feature selection, starts...', '\n')
593 | cat('=====================================================================', '\n')
594 | }
595 |
596 | if (params_ranger$classification) {
597 |
598 | y = as.factor(y)
599 | }
600 |
601 | if (is.factor(y)) {
602 |
603 | folds = class_folds(CV_folds, y, shuffle = TRUE)
604 | }
605 |
606 | else {
607 |
608 | folds = regr_folds(CV_folds, y, stratified = stratified_regr)
609 | }
610 |
611 | get_all_feat = list()
612 |
613 | for (i in 1:CV_folds) {
614 |
615 | if (!(is.matrix(X) || is.data.frame(X))) {
616 |
617 | stop(simpleError("X must be either a data.frame or a matrix"))
618 | }
619 |
620 | if (verbose) {
621 | cat('--------------------------------------------------------------------', '\n')
622 | cat('Fold ', i, '\n')
623 | cat('--------------------------------------------------------------------', '\n')
624 | }
625 |
626 | X_folds = X[unlist(folds[-i]), ]
627 |
628 | y_folds = y[unlist(folds[-i])]
629 |
630 | if (!"dependent.variable.name" %in% names(params_ranger)) {
631 |
632 | form = stats::as.formula(paste0(paste0('y ~ '), paste(colnames(X_folds), collapse = '+')))
633 |
634 | params_ranger[['formula']] = form
635 |
636 | #dat = data.frame(y = y_folds, X_folds)}
637 | }
638 |
639 | dat = cbind(y = y_folds, X_folds) # include y in the data so that it works with or without the 'dependent.variable.name'
640 |
641 | # else {
642 | #
643 | # dat = X_folds
644 | # }
645 |
646 | params_ranger[['data']] = dat
647 |
648 | fit = do.call(ranger::ranger, params_ranger)
649 |
650 | tbl_x = data.frame(names(fit$variable.importance), as.vector(fit$variable.importance))
651 | colnames(tbl_x) = c('Feature', params_ranger$importance)
652 |
653 | get_all_feat[[i]] <- tbl_x
654 |
655 | gc()
656 | }
657 |
658 | tbl_x = data.frame(do.call('rbind', get_all_feat)) |>
659 | data.table::as.data.table()
660 |
661 | tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |>
662 | as.data.frame()
663 |
664 | tbl1 = tbl1[order(tbl1[, 2], decreasing = TRUE), ]
665 |
666 | return(tbl1)
667 | }
668 | }
669 |
670 |
671 |
--------------------------------------------------------------------------------
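
Putting the pieces together, an end-to-end regression sketch with the 'ranger' method (the parameter values are illustrative, adapted from the roxygen examples above, and not tuned):

    data(iris)
    X = iris[, -c(1, 5)]           # numeric predictors
    y = iris[, 1]                  # numeric response, so regression folds are used

    params_ranger = list(num.trees = 100,
                         classification = FALSE,
                         write.forest = TRUE,
                         importance = 'permutation')

    res = feature_selection(X, y,
                            method = 'ranger',
                            params_ranger = params_ranger,
                            CV_folds = 3)

    head(res)    # features sorted by permutation importance averaged over the folds
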
/R/plot_feature_selection.R:
--------------------------------------------------------------------------------
1 |
2 | #' plot important features
3 | #'
4 | #' This function takes the result of the feature_selection function or the wrapper_feat_select function and returns a barplot with the important features
5 | #'
6 | #' @param obj either a data frame or a list from the functions : feature_selection, wrapper_feat_select
7 | #' @param params_barplot a list of parameters needed for plotting the important features
8 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency")
9 | #' @return a barplot with the important features of each method
10 | #' @details
11 | #' This function takes a data frame (from the feature_selection function) or a list (from the wrapper_feat_select function) and returns a barplot of the important features.
12 | #' If union was set to TRUE in the params_features list of the wrapper_feat_select function, it also plots the averaged importance across all methods
13 | #' @export
14 | #' @importFrom graphics barplot par
15 | #' @importFrom grDevices dev.cur dev.off
16 | #' @examples
17 | #'
18 | #' \dontrun{
19 | #'
20 | #' data(iris)
21 | #'
22 | #' X = iris[, -5]
23 | #' y = X[, 1]
24 | #' X = X[, -1]
25 | #'
26 | #' #.............................
27 | #' # plot of the wrapper function
28 | #' #.............................
29 | #'
30 | #' feat = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
31 | #' CV_folds = 10, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
32 | #'
33 | #' params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
34 | #'
35 | #' barplot_feat_select(feat, params_barplot, xgb_sort = NULL)
36 | #'
37 | #'
38 | #' #.......................................
39 | #' # plot of the feature_selection function
40 | #' #.......................................
41 | #'
42 | #' res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = 5)
43 | #'
44 | #'
45 | #' params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
46 | #'
47 | #' barplot_feat_select(res, params_barplot, xgb_sort = NULL)
48 | #'
49 | #' }
50 |
51 |
52 | barplot_feat_select = function(obj, params_barplot, xgb_sort = NULL) {
53 |
54 |
55 | if (is.null(params_barplot$keep_features)) {
56 |
57 | stop(simpleError("specify a maximum number of features to be plotted using the parameter keep_features in the params_barplot list"))
58 | }
59 |
60 | if (is.data.frame(obj)) {
61 |
62 | if (grDevices::dev.cur() != 1) {
63 |
64 | grDevices::dev.off() # reset par()
65 | }
66 |
67 | graphics::par(las = 2) # make label text perpendicular to axis
68 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin.
69 |
70 | if (dim(obj)[2] == 3) { # glmnet-lasso , CV_folds > 1
71 |
72 | graphics::barplot(obj[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1],
73 |
74 | cex.names = params_barplot$cex.names)}
75 |
76 | else if (dim(obj)[2] == 4) { # xgboost
77 |
78 | if (is.null(xgb_sort) || xgb_sort == 'Frequency') {
79 |
80 | obj = obj[order(obj[, 4], decreasing = TRUE), ]
81 | obj = obj[, c(1, 4)]}
82 |
83 | else if (xgb_sort == 'Cover') {
84 |
85 | obj = obj[order(obj[, 3], decreasing = TRUE), ]
86 | obj = obj[, c(1, 3)]}
87 |
88 | else if (xgb_sort == 'Gain') {
89 |
90 | obj = obj[order(obj[, 2], decreasing = TRUE), ]
91 | obj = obj[, c(1, 2)]}
92 |
93 | else {
94 |
95 | stop(simpleError("not a valid method for xgb_sort"))
96 | }
97 |
98 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1],
99 |
100 | cex.names = params_barplot$cex.names)}
101 |
102 | else if (dim(obj)[2] == 2 && (colnames(obj)[2] %in% c('impurity', 'permutation'))) {
103 |
104 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1],
105 |
106 | cex.names = params_barplot$cex.names)}
107 |
108 | else if (dim(obj)[2] == 2) { # glmnet-lasso, CV_folds = 1
109 |
110 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1],
111 |
112 | cex.names = params_barplot$cex.names)
113 | }
114 | }
115 |
116 | else {
117 |
118 | if (length(names(obj)) < 3 && ("all_feat" %in% names(obj))) { # union = TRUE
119 |
120 | if (grDevices::dev.cur() != 1) {
121 |
122 | grDevices::dev.off() # reset par()
123 | }
124 |
125 | graphics::par(las = 2) # make label text perpendicular to axis
126 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin.
127 |
128 | graphics::par(mfrow = c(1, length(names(obj$all_feat)) + 1))
129 |
130 | if (is.null(xgb_sort)) xgb_sort = 'Frequency'
131 |
132 | for (i in names(obj$all_feat)) {
133 |
134 | if (i == 'glmnet-lasso') {
135 |
136 | obj_g = obj$all_feat[['glmnet-lasso']]
137 |
138 | graphics::barplot(obj_g[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj_g[params_barplot$keep_features:1, 1],
139 |
140 | cex.names = params_barplot$cex.names)}
141 |
142 | else if (i == 'xgboost') {
143 |
144 | obj_x = obj$all_feat[['xgboost']]
145 | obj_x = obj_x[order(obj_x[, xgb_sort], decreasing = TRUE), ]
146 | tmp_xgb = obj_x[params_barplot$keep_features:1, xgb_sort]
147 |
148 | graphics::barplot(tmp_xgb, main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj_x[params_barplot$keep_features:1, 1],
149 |
150 | cex.names = params_barplot$cex.names)}
151 |
152 | else if (i == 'ranger') {
153 |
154 | obj_r = obj$all_feat[['ranger']]
155 |
156 | graphics::barplot(obj_r[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj_r[params_barplot$keep_features:1, 1],
157 |
158 | cex.names = params_barplot$cex.names)
159 | }
160 | }
161 |
162 | graphics::barplot(obj$union_feat[params_barplot$keep_features:1, 2], main = "union-importance",
163 |
164 | horiz = params_barplot$horiz, names.arg = obj$union_feat[params_barplot$keep_features:1, 1], cex.names = params_barplot$cex.names)
165 | }
166 |
167 | else if (length(names(obj)) < 3 && (sum(c("glmnet-lasso", "xgboost", "ranger") %in% names(obj)) > 0)){ # union = FALSE
168 |
169 | if (grDevices::dev.cur() != 1) {
170 |
171 | grDevices::dev.off() # reset par()
172 | }
173 |
174 | graphics::par(las = 2) # make label text perpendicular to axis
175 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin.
176 |
177 | graphics::par(mfrow = c(1, length(names(obj))))
178 |
179 | if (is.null(xgb_sort)) xgb_sort = 'Frequency'
180 |
181 | for (i in names(obj)) {
182 |
183 | if (i == 'glmnet-lasso') {
184 |
185 | obj_g = obj[['glmnet-lasso']]
186 |
187 | graphics::barplot(obj_g[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj_g[params_barplot$keep_features:1, 1],
188 |
189 | cex.names = params_barplot$cex.names)}
190 |
191 | else if (i == 'xgboost') {
192 |
193 | obj_x = obj[['xgboost']]
194 | obj_x = obj_x[order(obj_x[, xgb_sort], decreasing = TRUE), ]
195 | tmp_xgb = obj_x[params_barplot$keep_features:1, xgb_sort]
196 |
197 | graphics::barplot(tmp_xgb, main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj_x[params_barplot$keep_features:1, 1],
198 |
199 | cex.names = params_barplot$cex.names)}
200 |
201 | else if (i == 'ranger') {
202 |
203 | obj_r = obj[['ranger']]
204 |
205 | graphics::barplot(obj_r[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj_r[params_barplot$keep_features:1, 1],
206 |
207 | cex.names = params_barplot$cex.names)
208 | }
209 |
210 | else {
211 |
212 | stop('invalid method')
213 | }
214 | }
215 | }
216 | }
217 | }
218 |
219 |
--------------------------------------------------------------------------------
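
The branching above distinguishes the plain data frame returned by feature_selection from the list returned by wrapper_feat_select (which carries an `all_feat` / `union_feat` pair when union = TRUE). A minimal call sketch for the data-frame case, mirroring the package's own examples and tests (parameter values are illustrative only):

```R
library(FeatureSelection)

data(iris)
X = iris[, -5]
y = X[, 1]
X = X[, -1]

params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = FALSE)

# a data frame of non-zero coefficients, handled by the data-frame branch of barplot_feat_select()
res = feature_selection(X, y, method = 'glmnet-lasso',
                        params_glmnet = params_glmnet, CV_folds = 5)

params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
barplot_feat_select(res, params_barplot, xgb_sort = NULL)
```
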
/R/utils.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | #' secondary function to replace NAs
4 | #'
5 | #' @keywords internal
6 | #'
7 | #' @importFrom stats median
8 |
9 | func_replace_NAs = function(data, which_isna) {
10 |
11 | for (i in which_isna) {
12 |
13 | tmp_median = stats::median(data[, i], na.rm = T)
14 |
15 | data[which(is.na(data[, i])), i] = tmp_median
16 | }
17 |
18 | return(data)
19 | }
20 |
21 |
22 |
23 | #' shuffle data
24 | #'
25 | #' this function shuffles the items of a vector
26 | #'
27 | #' @param vec is a vector of indices
28 | #' @param times is an integer specifying how many times the vector should be shuffled
29 | #' @return shuffled indices of a vector
30 | #' @export
31 |
32 |
33 | func_shuffle = function(vec, times = 10) {
34 |
35 |   for (i in 1:times) {
36 |
37 |     vec = sample(vec, length(vec)) # re-sample the previous result, so that repeated shuffles accumulate and 'times' has an effect
38 |   }
39 |   vec
40 | }
41 |
42 |
43 | #' addition of probability-data-frames
44 | #'
45 | #' this function takes a number of probability data frames and returns their average (mainly used in multi-class classification).
46 | #'
47 | #' @param PREDS_LST is a list of data frames
48 | #' @return average of a number of data frames
49 | #' @export
50 |
51 |
52 | add_probs_dfs = function(PREDS_LST) {
53 |
54 | if (!inherits(PREDS_LST, "list")) stop("PREDS_LST must be a list")
55 |
56 | r = all(unlist(lapply(PREDS_LST, nrow)) == unlist(lapply(PREDS_LST, nrow))[1])
57 | c = all(unlist(lapply(PREDS_LST, ncol)) == unlist(lapply(PREDS_LST, ncol))[1])
58 |
59 | if (!all(c(r,c))) stop("the dimensions of the included data.frames or matrices differ")
60 |
61 | init_df = data.frame(matrix(rep(0, dim(PREDS_LST[[1]])[1]*dim(PREDS_LST[[1]])[2]), nrow = dim(PREDS_LST[[1]])[1], ncol = dim(PREDS_LST[[1]])[2]))
62 |
63 | for (i in 1:length(PREDS_LST)) {
64 |
65 | init_df = init_df + PREDS_LST[[i]]
66 | }
67 |
68 | init_df = init_df/length(PREDS_LST)
69 | colnames(init_df) = colnames(PREDS_LST[[1]])
70 |
71 | return(as.matrix(init_df))
72 | }
73 |
74 |
75 | #' normalize data
76 | #'
77 | #' this function normalizes the feature importance values of the algorithms, so that they are on the same scale across algorithms
78 | #'
79 | #' @param x is a numeric vector
80 | #' @return normalized data in form of a vector
81 | #' @export
82 |
83 |
84 | normalized = function(x) {
85 |
86 | out = (x - min(x))/(max(x) - min(x))
87 |
88 | out
89 | }
90 |
91 |
92 |
--------------------------------------------------------------------------------
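
Quick usage sketches for the helpers above; the add_probs_dfs() values mirror the package's unit tests:

```R
library(FeatureSelection)

# normalized() rescales a numeric vector to [0, 1]
normalized(c(2, 4, 6, 10))        # 0.00 0.25 0.50 1.00

# add_probs_dfs() averages equally-sized matrices / data frames element-wise
m1 = matrix(rep(1, 25), 5, 5)
m2 = matrix(rep(2, 25), 5, 5)
m3 = matrix(rep(3, 25), 5, 5)
add_probs_dfs(list(m1, m2, m3))   # a 5 x 5 matrix filled with 2's

# func_shuffle() returns a random permutation of the input vector
func_shuffle(1:10, times = 5)
```
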
/R/wrapper_feature_selection.R:
--------------------------------------------------------------------------------
1 |
2 | #' Wraps all three methods
3 | #'
4 | #' This function is a wrapper for the feature_selection function
5 | #'
6 | #' @param X a sparse Matrix, a matrix or a data frame
7 | #' @param y a vector representing the response variable
8 | #' @param params_glmnet a list of parameters for the glmnet model
9 | #' @param params_xgboost a list of parameters for the xgboost model
10 | #' @param params_ranger a list of parameters for the ranger model
11 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" (defaults to "Frequency")
12 | #' @param CV_folds a number specifying the number of folds for cross validation
13 | #' @param stratified_regr a boolean determining if the folds in regression should be stratified
14 | #' @param scale_coefs_glmnet if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)
15 | #' @param cores_glmnet an integer determining the number of cores to register in glmnet
16 | #' @param params_features is a list of parameters for the wrapper function
17 | #' @param verbose outputs info
18 | #' @return a list containing the important features of each method. If union in the params_features list is enabled, then it also returns the average importance of all methods.
19 | #'
20 | #' @details
21 | #' This function returns the importance of the specified methods and, if union in the params_features list is TRUE, it also returns the average importance of all methods.
22 | #' Furthermore, the user can limit the number of features using the keep_number_feat parameter of the params_features list.
23 | #'
24 | #' @export
25 | #' @importFrom data.table as.data.table
26 | #' @importFrom magrittr %>%
27 | #' @importFrom rlang .data
28 | #'
29 | #' @examples
30 | #'
31 | #' \dontrun{
32 | #'
33 | #' #...........
34 | #' # regression
35 | #' #...........
36 | #'
37 | #' data(iris)
38 | #'
39 | #' X = iris[, -5]
40 | #' y = X[, 1]
41 | #' X = X[, -1]
42 | #'
43 | #' params_glmnet = list(alpha = 1,
44 | #' family = 'gaussian',
45 | #' nfolds = 3,
46 | #' parallel = TRUE)
47 | #'
48 | #'
49 | #' params_xgboost = list( params = list("objective" = "reg:linear",
50 | #' "bst:eta" = 0.01,
51 | #' "subsample" = 0.65,
52 | #' "max_depth" = 5,
53 | #' "colsample_bytree" = 0.65,
54 | #' "nthread" = 2),
55 | #' nrounds = 100,
56 | #' print.every.n = 50,
57 | #' verbose = 0,
58 | #' maximize = FALSE)
59 | #'
60 | #'
61 | #' params_ranger = list(probability = FALSE,
62 | #' num.trees = 100,
63 | #' verbose = TRUE,
64 | #' classification = FALSE,
65 | #' mtry = 3,
66 | #' min.node.size = 10,
67 | #' num.threads = 2,
68 | #' importance = 'permutation')
69 | #'
70 | #'
71 | #' params_features = list(keep_number_feat = NULL,
72 | #' union = TRUE)
73 | #'
74 | #' feat = wrapper_feat_select(X,
75 | #' y,
76 | #' params_glmnet = params_glmnet,
77 | #' params_xgboost = params_xgboost,
78 | #' params_ranger = params_ranger,
79 | #' xgb_sort = NULL,
80 | #' CV_folds = 10,
81 | #' stratified_regr = FALSE,
82 | #' cores_glmnet = 2,
83 | #' params_features = params_features)
84 | #' }
85 |
86 |
87 | wrapper_feat_select = function(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = NULL, xgb_sort = NULL, CV_folds = 5,
88 |
89 | stratified_regr = FALSE, scale_coefs_glmnet = FALSE, cores_glmnet = NULL, params_features = NULL, verbose = FALSE) {
90 |
91 | method = c('glmnet-lasso', 'xgboost', 'ranger')
92 | met = list(params_glmnet, params_xgboost, params_ranger)
93 | met1 = unlist(lapply(met, function(x) !is.null(x)))
94 |
95 | if (sum(met1) == 0) stop('at least one of the methods should be non-null')
96 |
97 | method = method[met1]
98 |
99 |   #if (!is.null(params_ranger) && ("dependent.variable.name" %in% names(params_ranger)) && sum(met1) > 1) stop("ranger cannot be used simultaneously with the other methods if 'dependent.variable.name' is in the params_ranger")
100 |
101 | if (CV_folds < 2) {
102 |
103 | stop(simpleError("CV_folds should be > 1"))
104 | }
105 |
106 | if (('ranger' %in% method) && (params_ranger$importance == 'none')) {
107 |
108 | stop(simpleError("enable importance in ranger using one of the arguments 'impurity' or 'permutation'"))
109 | }
110 |
111 | if (params_features$union == TRUE && length(method) == 1) {
112 |
113 | stop(simpleError("run union = TRUE only in case of more than one method"))
114 | }
115 |
116 | out_meth = list()
117 |
118 | for (meth in method) {
119 |
120 | out_meth[[meth]] = feature_selection(X, y, method = meth, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = xgb_sort,
121 |
122 | CV_folds = CV_folds, stratified_regr = stratified_regr, scale_coefs_glmnet = scale_coefs_glmnet, cores_glmnet = cores_glmnet, verbose = verbose)
123 | }
124 |
125 |
126 | if (is.null(params_features$keep_number_feat)) {
127 |
128 | out = out_meth}
129 |
130 | else {
131 |
132 | out = lapply(out_meth, function(x) x[1:params_features$keep_number_feat, ])
133 | }
134 |
135 | if (params_features$union == TRUE) {
136 |
137 | out_union = list()
138 |
139 | for (k in method) {
140 |
141 | if (k == 'glmnet-lasso' && (!params_glmnet$family %in% c('binomial', 'multinomial'))) {
142 |
143 | un_glmnet = out_meth[[k]][, -2] # exclude coefficients column
144 |       un_glmnet[, 2] = normalized(un_glmnet[, 2]) # normalize the frequency so that it is on the same scale as the other algorithms
145 | colnames(un_glmnet) = c('features', 'importance')
146 | un_glmnet[, 1] = as.character(un_glmnet[, 1])
147 | out_union[[k]] = un_glmnet}
148 |
149 | else if (k == 'glmnet-lasso' && params_glmnet$family %in% c('binomial', 'multinomial')) {
150 |
151 | un_glmnet = out_meth[[k]]
152 | un_glmnet[, 2] = normalized(un_glmnet[, 2])
153 | colnames(un_glmnet) = c('features', 'importance')
154 | un_glmnet[, 1] = as.character(un_glmnet[, 1])
155 | out_union[[k]] = un_glmnet}
156 |
157 | else if (k == 'xgboost') {
158 |
159 | if (is.null(xgb_sort)) {
160 |
161 | un_xgboost = out_meth[[k]][, c(1, 4)]
162 | un_xgboost[, 2] = normalized(un_xgboost[, 2])
163 | colnames(un_xgboost) = c('features', 'importance')
164 | un_xgboost[, 1] = as.character(un_xgboost[, 1])
165 | out_union[[k]] = un_xgboost}
166 |
167 | else {
168 |
169 | un_xgboost = data.frame(features = out_meth[[k]][, 1], importance = out_meth[[k]][, xgb_sort])
170 | un_xgboost[, 2] = normalized(un_xgboost[, 2])
171 | un_xgboost[, 1] = as.character(un_xgboost[, 1])
172 | out_union[[k]] = un_xgboost}
173 | }
174 |
175 | else if (k == 'ranger') {
176 |
177 | un_ranger = out_meth[[k]]
178 | un_ranger[, 2] = normalized(un_ranger[, 2])
179 | colnames(un_ranger) = c('features', 'importance')
180 | un_ranger[, 1] = as.character(un_ranger[, 1])
181 | out_union[[k]] = un_ranger}
182 | }
183 |
184 | modify_lst = lapply(out_union, function(x) data.frame(feature = x$features, rank = normalized(length(x$features):1)))
185 |
186 | modify_lst1 = data.frame(do.call(rbind, modify_lst)) |>
187 | data.table::as.data.table()
188 |
189 | tbl_x = modify_lst1[, .(importance = sum(rank, na.rm = TRUE),
190 | Frequency = .N),
191 | by = 'feature'] |>
192 | as.data.frame()
193 |
194 | tbl1 = tbl_x[order(tbl_x$importance, decreasing = TRUE), ]
195 |
196 | tbl1$importance = normalized(tbl1$importance)
197 |
198 | return(list(all_feat = out, union_feat = tbl1))}
199 |
200 | else {
201 |
202 | return(all_feat = out)
203 | }
204 | }
205 |
206 |
--------------------------------------------------------------------------------
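
The union step above turns each method's ordering into normalized ranks, sums the ranks per feature and records in how many methods a feature survived. A standalone sketch of that aggregation in base R, using two hypothetical per-method rankings (best feature first):

```R
library(FeatureSelection)   # for normalized()

ranking_a = c('petal_len', 'petal_wid', 'sepal_wid')
ranking_b = c('petal_wid', 'petal_len', 'sepal_wid')

rank_df = function(feats) data.frame(feature = feats, rank = normalized(length(feats):1))

merged = rbind(rank_df(ranking_a), rank_df(ranking_b))

agg = aggregate(rank ~ feature, data = merged, FUN = sum)          # summed normalized ranks
agg$Frequency = as.integer(table(merged$feature)[agg$feature])     # number of methods that kept the feature
agg = agg[order(agg$rank, decreasing = TRUE), ]
agg$rank = normalized(agg$rank)                                    # rescale the final importance to [0, 1]
agg
```
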
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://github.com/mlampros/FeatureSelection/actions)
3 | [](https://codecov.io/github/mlampros/FeatureSelection?branch=master)
4 |
5 | [](https://hub.docker.com/r/mlampros/featureselection)
6 |
7 |
8 |
9 | #### Feature Selection in R using glmnet-lasso, xgboost and ranger
10 |
11 |
12 |
13 | This R package wraps **glmnet-lasso**, **xgboost** and **ranger** to perform feature selection. After downloading, use ? to read info about each function (e.g., ?feature_selection). More details can be found in the blog-post (http://mlampros.github.io/2016/02/14/feature-selection/). To download the latest version from Github use,
14 |
15 |
16 |
17 | ```R
18 | remotes::install_github('mlampros/FeatureSelection')
19 |
20 | ```
21 |
22 |
23 |
24 | **Package Updates**:
25 |
26 | * Currently there is a new version of *glmnet* (3.0.0) with new functionality (*relax*, *trace*, *assess*, *bigGlm*); however, it requires R version 3.6.0 or later (see the [new vignette](https://cran.r-project.org/web/packages/glmnet/vignettes/relax.pdf) for more information).
27 | * In the *ranger* R package the *ranger::importance_pvalues()* function was added
28 | * Currently, the recommended approach for feature selection is [SHAP](https://github.com/slundberg/shap)
29 |
30 |
31 |
32 |
33 | **UPDATE 03-02-2020**
34 |
35 |
36 |
37 | **Docker images** of the *FeatureSelection* package are available to download from my [dockerhub](https://hub.docker.com/r/mlampros/featureselection) account. The images come with *Rstudio* and the *R-development* version (latest) installed. The whole process was tested on Ubuntu 18.04. To **pull** & **run** the image do the following,
38 |
39 |
40 |
41 | ```sh
42 |
43 | docker pull mlampros/featureselection:rstudiodev
44 |
45 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 mlampros/featureselection:rstudiodev
46 |
47 | ```
48 |
49 |
50 |
51 | The user can also **bind** a home directory / folder to the image to use its files by specifying the **-v** command,
52 |
53 |
54 |
55 | ```sh
56 |
57 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 -v /home/YOUR_DIR:/home/rstudio/YOUR_DIR mlampros/featureselection:rstudiodev
58 |
59 |
60 | ```
61 |
62 |
63 |
64 | In the latter case you might first have to grant write permission to the **YOUR_DIR** directory (not always necessary) using,
65 |
66 |
67 |
68 | ```sh
69 |
70 | chmod -R 777 /home/YOUR_DIR
71 |
72 |
73 | ```
74 |
75 |
76 |
77 | The **USER** defaults to *rstudio* but you have to give your **PASSWORD** of preference (see [www.rocker-project.org](https://www.rocker-project.org/) for more information).
78 |
79 |
80 |
81 | Open your web-browser and, depending on where the docker image was *built / run*, give,
82 |
83 |
84 |
85 | **1st. Option** on your personal computer,
86 |
87 |
88 |
89 | ```
90 | http://0.0.0.0:8787
91 |
92 | ```
93 |
94 |
95 |
96 | **2nd. Option** on a cloud instance,
97 |
98 |
99 |
100 | ```
101 | http://Public DNS:8787
102 |
103 | ```
104 |
105 |
106 |
107 | to access the Rstudio console, where you can enter your username and password.
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
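
A quick-start complement to the README above; a minimal sketch based on the documented examples (see ?feature_selection for the full parameter lists):

```R
library(FeatureSelection)

data(iris)
X = iris[, -c(1, 5)]   # predictors
y = iris[, 1]          # response (Sepal.Length)

params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = FALSE)

res = feature_selection(X, y, method = 'glmnet-lasso',
                        params_glmnet = params_glmnet, CV_folds = 5)
head(res)
```
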
/man/DataSplit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/create_folds_and_data_split.R
3 | \name{DataSplit}
4 | \alias{DataSplit}
5 | \title{partition of data (train-test-split)}
6 | \usage{
7 | DataSplit(y, TrainRatio = 0.75, regression = TRUE, shuffle = FALSE, seed = 1)
8 | }
9 | \arguments{
10 | \item{y}{is a numeric vector (response variable)}
11 |
12 | \item{TrainRatio}{is the percentage of train-data after the partition}
13 |
14 | \item{regression}{is a boolean (TRUE, FALSE) indicating if it's a regression or classification task}
15 |
16 | \item{shuffle}{is a boolean (TRUE, FALSE) indicating if the data should be shuffled or not (by default 5 times)}
17 |
18 | \item{seed}{an integer specifying the random seed}
19 | }
20 | \value{
21 | a list of indices (train-test)
22 | }
23 | \description{
24 | partition of data (train-test-split)
25 | }
26 | \examples{
27 |
28 | \dontrun{
29 |
30 | data(iris)
31 |
32 | y = iris[, 1]
33 |
34 | split = DataSplit(y, TrainRatio = 0.75, regression = FALSE, shuffle = FALSE)
35 |
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/man/add_probs_dfs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{add_probs_dfs}
4 | \alias{add_probs_dfs}
5 | \title{addition of probability-data-frames}
6 | \usage{
7 | add_probs_dfs(PREDS_LST)
8 | }
9 | \arguments{
10 | \item{PREDS_LST}{is a list of data frames}
11 | }
12 | \value{
13 | average of a number of data frames
14 | }
15 | \description{
16 | this function takes a number of probability data frames and returns their average (mainly used in multi-class classification).
17 | }
18 |
--------------------------------------------------------------------------------
/man/barplot_feat_select.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot_feature_selection.R
3 | \name{barplot_feat_select}
4 | \alias{barplot_feat_select}
5 | \title{plot important features}
6 | \usage{
7 | barplot_feat_select(obj, params_barplot, xgb_sort = NULL)
8 | }
9 | \arguments{
10 | \item{obj}{either a data frame or a list from the functions : feature_selection, wrapper_feat_select}
11 |
12 | \item{params_barplot}{a list of parameters needed for plotting the important features}
13 |
14 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" (defaults to "Frequency")}
15 | }
16 | \value{
17 | a barplot with the important features of each method
18 | }
19 | \description{
20 | This function takes the result of the feature_selection function or the wrapper_feat_select function and returns a barplot with the important features
21 | }
22 | \details{
23 | This function takes a data frame (from the feature_selection function) or a list (from the wrapper_feat_select function) and returns a barplot of the important features.
24 | If union was TRUE in the params_features list of wrapper_feat_select, it also plots the average importance of all methods
25 | }
26 | \examples{
27 |
28 | \dontrun{
29 |
30 | data(iris)
31 |
32 | X = iris[, -5]
33 | y = X[, 1]
34 | X = X[, -1]
35 |
36 | #.............................
37 | # plot of the wrapper function
38 | #.............................
39 |
40 | feat = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
41 | CV_folds = 10, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
42 |
43 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
44 |
45 | barplot_feat_select(feat, params_barplot, xgb_sort = NULL)
46 |
47 |
48 | #.......................................
49 | # plot of the feature_selection function
50 | #.......................................
51 |
52 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = 5)
53 |
54 |
55 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
56 |
57 | barplot_feat_select(res, params_barplot, xgb_sort = NULL)
58 |
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/man/class_folds.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/create_folds_and_data_split.R
3 | \name{class_folds}
4 | \alias{class_folds}
5 | \title{stratified folds (in classification)}
6 | \usage{
7 | class_folds(folds, RESP, shuffle = FALSE)
8 | }
9 | \arguments{
10 | \item{folds}{is an integer specifying the number of folds}
11 |
12 | \item{RESP}{is the response variable}
13 |
14 | \item{shuffle}{is a boolean specifying if the vector of indices should be shuffled or not}
15 | }
16 | \value{
17 | a list of indices
18 | }
19 | \description{
20 | this function creates stratified folds in binary and multiclass classification
21 | }
22 | \examples{
23 |
24 | \dontrun{
25 |
26 | data(iris)
27 |
28 | y = as.factor(iris[, 5])
29 |
30 | folds = class_folds(10, y, shuffle = TRUE)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/man/feature_selection.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/feature_selection.R
3 | \name{feature_selection}
4 | \alias{feature_selection}
5 | \title{Feature selection}
6 | \usage{
7 | feature_selection(
8 | X,
9 | y,
10 | method = NULL,
11 | params_glmnet = NULL,
12 | params_xgboost = NULL,
13 | params_ranger = NULL,
14 | xgb_sort = NULL,
15 | CV_folds = 5,
16 | stratified_regr = FALSE,
17 | scale_coefs_glmnet = FALSE,
18 | cores_glmnet = NULL,
19 | verbose = FALSE
20 | )
21 | }
22 | \arguments{
23 | \item{X}{a sparse Matrix, a matrix or a data frame}
24 |
25 | \item{y}{a vector representing the response variable}
26 |
27 | \item{method}{one of 'glmnet-lasso', 'xgboost', 'ranger'}
28 |
29 | \item{params_glmnet}{a list of parameters for the glmnet model}
30 |
31 | \item{params_xgboost}{a list of parameters for the xgboost model}
32 |
33 | \item{params_ranger}{a list of parameters for the ranger model}
34 |
35 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" (defaults to "Frequency")}
36 |
37 | \item{CV_folds}{a number specifying the number of folds for cross validation}
38 |
39 | \item{stratified_regr}{a boolean determining if the folds in regression should be stratified}
40 |
41 | \item{scale_coefs_glmnet}{if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)}
42 |
43 | \item{cores_glmnet}{an integer determining the number of cores to register in glmnet}
44 |
45 | \item{verbose}{outputs info}
46 | }
47 | \value{
48 | a data frame with the most important features
49 | }
50 | \description{
51 | This function uses three different methods (glmnet, xgboost, ranger) in order to select important features.
52 | }
53 | \details{
54 | This function returns the important features using one of the glmnet, xgboost or ranger algorithms. The glmnet algorithm can take either a sparse matrix, a matrix or a data frame
55 | and returns a data frame with non zero coefficients. The xgboost algorithm can take either a sparse matrix, a matrix or a data frame and returns the importance of the features in form
56 | of a data frame, furthermore it is possible to sort the features using one of the "Gain", "Cover" or "Frequency" methods. The ranger algorithm can take either a matrix or a data frame
57 | and returns the important features using one of the 'impurity' or 'permutation' methods.
58 | }
59 | \examples{
60 |
61 | \dontrun{
62 |
63 | #...........
64 | # regression
65 | #...........
66 |
67 | data(iris)
68 |
69 | X = iris[, -5]
70 | y = X[, 1]
71 | X = X[, -1]
72 |
73 | params_glmnet = list(alpha = 1,
74 | family = 'gaussian',
75 | nfolds = 3,
76 | parallel = TRUE)
77 |
78 | res = feature_selection(X,
79 | y,
80 | method = 'glmnet-lasso',
81 | params_glmnet = params_glmnet,
82 | CV_folds = 5,
83 | cores_glmnet = 5)
84 |
85 | #......................
86 | # binary classification
87 | #......................
88 |
89 | y = iris[, 5]
90 | y = as.character(y)
91 | y[y == 'setosa'] = 'virginica'
92 | X = iris[, -5]
93 |
94 | params_ranger = list(write.forest = TRUE,
95 | probability = TRUE,
96 | num.threads = 6,
97 | num.trees = 50,
98 | verbose = FALSE,
99 | classification = TRUE,
100 | mtry = 2,
101 | min.node.size = 5,
102 | importance = 'impurity')
103 |
104 | res = feature_selection(X,
105 | y,
106 | method = 'ranger',
107 | params_ranger = params_ranger,
108 | CV_folds = 5)
109 |
110 | #..........................
111 | # multiclass classification
112 | #..........................
113 |
114 | y = iris[, 5]
115 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
116 | X = iris[, -5]
117 |
118 | params_xgboost = list(params = list("objective" = "multi:softprob",
119 | "bst:eta" = 0.35,
120 | "subsample" = 0.65,
121 | "num_class" = 3,
122 | "max_depth" = 6,
123 | "colsample_bytree" = 0.65,
124 | "nthread" = 2),
125 | nrounds = 50,
126 | print.every.n = 50,
127 | verbose = 0,
128 | maximize = FALSE)
129 |
130 | res = feature_selection(X,
131 | multiclass_xgboost,
132 | method = 'xgboost',
133 | params_xgboost = params_xgboost,
134 | CV_folds = 5)
135 | }
136 | }
137 | \author{
138 | Lampros Mouselimis
139 | }
140 |
--------------------------------------------------------------------------------
/man/func_correlation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/correlation_of_multiple_predictors.R
3 | \name{func_correlation}
4 | \alias{func_correlation}
5 | \title{find correlated variables}
6 | \usage{
7 | func_correlation(
8 | data,
9 | target = NULL,
10 | correlation_thresh = NULL,
11 | use_obs = NULL,
12 | correlation_method = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{data}{either a data frame or a matrix}
17 |
18 | \item{target}{either a string (name of the predictor/response in the data set) or a vector of strings (predictor/response names of the data set)}
19 |
20 | \item{correlation_thresh}{a float indicating the correlation threshold}
21 |
22 | \item{use_obs}{one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"}
23 |
24 | \item{correlation_method}{one of "pearson", "kendall", "spearman"}
25 | }
26 | \value{
27 | either a data frame or a list of data frames
28 | }
29 | \description{
30 | This function takes a data frame or a matrix and returns either a data frame or a list of data frames with correlated variables
31 | }
32 | \details{
33 | This function takes a data frame or a matrix and returns the correlated variables of the data
34 | }
35 |
--------------------------------------------------------------------------------
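
This man page ships without an \examples section; a minimal call sketch, mirroring the package tests (all columns must be numeric, otherwise the function errors):

```R
data(iris)
iris_num = iris
iris_num$Species = as.numeric(iris_num$Species)   # convert the factor column to numeric

# without a target: a list of data frames with the correlated variable pairs
func_correlation(iris_num, target = NULL, correlation_thresh = 0.75,
                 use_obs = "everything", correlation_method = "pearson")

# with a target: a data frame of variables correlated with 'Species'
func_correlation(iris_num, target = 'Species', correlation_thresh = 0.75,
                 use_obs = "everything", correlation_method = "pearson")
```
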
/man/func_replace_NAs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{func_replace_NAs}
4 | \alias{func_replace_NAs}
5 | \title{secondary function to replace NAs}
6 | \usage{
7 | func_replace_NAs(data, which_isna)
8 | }
9 | \description{
10 | secondary function to replace NAs
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/func_shuffle.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{func_shuffle}
4 | \alias{func_shuffle}
5 | \title{shuffle data}
6 | \usage{
7 | func_shuffle(vec, times = 10)
8 | }
9 | \arguments{
10 | \item{vec}{is a vector of indices}
11 |
12 | \item{times}{is an integer specifying how many times the vector should be shuffled}
13 | }
14 | \value{
15 | shuffled indices of a vector
16 | }
17 | \description{
18 | this function shuffles the items of a vector
19 | }
20 |
--------------------------------------------------------------------------------
/man/normalized.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{normalized}
4 | \alias{normalized}
5 | \title{normalize data}
6 | \usage{
7 | normalized(x)
8 | }
9 | \arguments{
10 | \item{x}{is a numeric vector}
11 | }
12 | \value{
13 | normalized data in form of a vector
14 | }
15 | \description{
16 | this function normalizes the feature importance values of the algorithms, so that they are on the same scale across algorithms
17 | }
18 |
--------------------------------------------------------------------------------
/man/regr_folds.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/create_folds_and_data_split.R
3 | \name{regr_folds}
4 | \alias{regr_folds}
5 | \title{create folds (in regression)}
6 | \usage{
7 | regr_folds(folds, RESP, stratified = FALSE)
8 | }
9 | \arguments{
10 | \item{folds}{is an integer specifying the number of folds}
11 |
12 | \item{RESP}{is the response variable}
13 |
14 | \item{stratified}{is a boolean specifying if the folds should be stratified}
15 | }
16 | \value{
17 | a list of indices
18 | }
19 | \description{
20 | this function creates both stratified and non-stratified folds in regression
21 | }
22 | \examples{
23 |
24 | \dontrun{
25 |
26 | data(iris)
27 |
28 | y = iris[, 1]
29 |
30 | folds = regr_folds(5, y, stratified = FALSE)
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/man/remove_duplic_func.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/correlation_of_multiple_predictors.R
3 | \name{remove_duplic_func}
4 | \alias{remove_duplic_func}
5 | \title{function to remove duplicated pairs of predictors (used in the func_correlation)}
6 | \usage{
7 | remove_duplic_func(sublist)
8 | }
9 | \description{
10 | this is a secondary function that is used in the func_correlation
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/second_func_cor.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/correlation_of_multiple_predictors.R
3 | \name{second_func_cor}
4 | \alias{second_func_cor}
5 | \title{secondary function (used in the func_correlation)}
6 | \usage{
7 | second_func_cor(dat_frame)
8 | }
9 | \description{
10 | this is a secondary function that is used in the func_correlation
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/man/wrapper_feat_select.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wrapper_feature_selection.R
3 | \name{wrapper_feat_select}
4 | \alias{wrapper_feat_select}
5 | \title{Wraps all three methods}
6 | \usage{
7 | wrapper_feat_select(
8 | X,
9 | y,
10 | params_glmnet = NULL,
11 | params_xgboost = NULL,
12 | params_ranger = NULL,
13 | xgb_sort = NULL,
14 | CV_folds = 5,
15 | stratified_regr = FALSE,
16 | scale_coefs_glmnet = FALSE,
17 | cores_glmnet = NULL,
18 | params_features = NULL,
19 | verbose = FALSE
20 | )
21 | }
22 | \arguments{
23 | \item{X}{a sparse Matrix, a matrix or a data frame}
24 |
25 | \item{y}{a vector representing the response variable}
26 |
27 | \item{params_glmnet}{a list of parameters for the glmnet model}
28 |
29 | \item{params_xgboost}{a list of parameters for the xgboost model}
30 |
31 | \item{params_ranger}{a list of parameters for the ranger model}
32 |
33 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" (defaults to "Frequency")}
34 |
35 | \item{CV_folds}{a number specifying the number of folds for cross validation}
36 |
37 | \item{stratified_regr}{a boolean determining if the folds in regression should be stratified}
38 |
39 | \item{scale_coefs_glmnet}{if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)}
40 |
41 | \item{cores_glmnet}{an integer determining the number of cores to register in glmnet}
42 |
43 | \item{params_features}{is a list of parameters for the wrapper function}
44 |
45 | \item{verbose}{outputs info}
46 | }
47 | \value{
48 | a list containing the important features of each method. If union in the params_features list is enabled, then it also returns the average importance of all methods.
49 | }
50 | \description{
51 | This function is a wrapper for the feature_selection function
52 | }
53 | \details{
54 | This function returns the importance of the specified methods and, if union in the params_features list is TRUE, it also returns the average importance of all methods.
55 | Furthermore, the user can limit the number of features using the keep_number_feat parameter of the params_features list.
56 | }
57 | \examples{
58 |
59 | \dontrun{
60 |
61 | #...........
62 | # regression
63 | #...........
64 |
65 | data(iris)
66 |
67 | X = iris[, -5]
68 | y = X[, 1]
69 | X = X[, -1]
70 |
71 | params_glmnet = list(alpha = 1,
72 | family = 'gaussian',
73 | nfolds = 3,
74 | parallel = TRUE)
75 |
76 |
77 | params_xgboost = list( params = list("objective" = "reg:linear",
78 | "bst:eta" = 0.01,
79 | "subsample" = 0.65,
80 | "max_depth" = 5,
81 | "colsample_bytree" = 0.65,
82 | "nthread" = 2),
83 | nrounds = 100,
84 | print.every.n = 50,
85 | verbose = 0,
86 | maximize = FALSE)
87 |
88 |
89 | params_ranger = list(probability = FALSE,
90 | num.trees = 100,
91 | verbose = TRUE,
92 | classification = FALSE,
93 | mtry = 3,
94 | min.node.size = 10,
95 | num.threads = 2,
96 | importance = 'permutation')
97 |
98 |
99 | params_features = list(keep_number_feat = NULL,
100 | union = TRUE)
101 |
102 | feat = wrapper_feat_select(X,
103 | y,
104 | params_glmnet = params_glmnet,
105 | params_xgboost = params_xgboost,
106 | params_ranger = params_ranger,
107 | xgb_sort = NULL,
108 | CV_folds = 10,
109 | stratified_regr = FALSE,
110 | cores_glmnet = 2,
111 | params_features = params_features)
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(FeatureSelection)
3 |
4 | test_check("FeatureSelection")
5 |
--------------------------------------------------------------------------------
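
To run the suite locally, one option (assuming the package sources are checked out and devtools is installed):

```R
# from the package root
devtools::test()   # runs everything under tests/testthat

# or run a single file
testthat::test_file("tests/testthat/test-feature_selection.R")
```
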
/tests/testthat/helper-function_for_tests.R:
--------------------------------------------------------------------------------
1 |
2 | #.......................
3 | # function used in tests
4 | #.......................
5 |
6 | func_nas = function(X, col_idx) { # randomly adding NA's
7 |
8 | idx = sample(1:length(X[, col_idx]), 30, replace = F)
9 |
10 | X[idx, col_idx] = NA
11 |
12 | return(X[, col_idx])
13 | }
14 |
--------------------------------------------------------------------------------
/tests/testthat/test-add_probability_matrices_OR_data_frames.R:
--------------------------------------------------------------------------------
1 | context("Add probability data frames or matrices")
2 |
3 | testthat::test_that("adding 3 matrices or data.frames results in a a matrix of all 2's", {
4 | m1 = matrix(rep(1, 25), 5, 5)
5 | m2 = matrix(rep(2, 25), 5, 5)
6 | m3 = matrix(rep(3, 25), 5, 5)
7 |
8 | lst = list(m1, m2, m3)
9 |
10 |   testthat::expect_equal(add_probs_dfs(lst), matrix(rep(2, 25), 5, 5), check.attributes = FALSE) # check.attributes = FALSE, otherwise the comparison errors due to differing dimnames
11 | })
12 |
13 |
14 | testthat::test_that("if PREDS_LST is a matrix throws an error", {
15 | m1 = matrix(rep(1, 25), 5, 5)
16 |
17 | testthat::expect_error(add_probs_dfs(m1), "PREDS_LST must be a list")
18 | })
19 |
20 |
21 | testthat::test_that("if the dimensions of each matrix or data frame in the list is different it throws an error", {
22 | m1 = matrix(rep(1, 25), 5, 5)
23 | m2 = matrix(rep(2, 100), 10, 10)
24 | m3 = matrix(rep(3, 25), 5, 5)
25 |
26 | lst = list(m1, m2, m3)
27 |
28 | testthat::expect_error(add_probs_dfs(lst), "the dimensions of the included data.frames or matrices differ")
29 | })
30 |
31 |
--------------------------------------------------------------------------------
/tests/testthat/test-barplot_feat_select.R:
--------------------------------------------------------------------------------
1 | data(iris)
2 |
3 | context('Bar plot feature selection')
4 |
5 |
6 |
7 | # first check single methods:
8 |
9 | testthat::test_that("it returns an error if the object is NULL", {
10 |
11 | X = iris[, -5]
12 | y = X[, 1]
13 | X = X[, -1]
14 |
15 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
16 |
17 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL)
18 |
19 | params_barplot = list(keep_features = NULL, horiz = TRUE, cex.names = 0.8)
20 |
21 | testthat::expect_error(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
22 | })
23 |
24 |
25 | testthat::test_that("it returns an error if sort method of xgoobst not one of Frequency, Gain, Cover", {
26 |
27 | y = iris[, 5]
28 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
29 | X = iris[, -5]
30 |
31 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
32 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
33 |
34 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL)
35 |
36 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
37 |
38 | testthat::expect_error(barplot_feat_select(res, params_barplot, xgb_sort = 'some_method'))
39 | })
40 |
41 | #===========================================
42 | # use testthat::expect_silent to test plots
43 | #===========================================
44 |
45 | # feature_selection() function
46 |
47 | testthat::test_that("glmnet for the feature_selection object, returns 3 column data frame (CV_folds > 1) in the res object, and plot", {
48 |
49 | X = iris[, -5]
50 | y = X[, 1]
51 | X = X[, -1]
52 |
53 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
54 |
55 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL)
56 |
57 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
58 |
59 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
60 | })
61 |
62 |
63 |
64 | testthat::test_that("it returns plot in case of binomial", {
65 |
66 | X = iris
67 | y = X[, 5]
68 | y = as.character(y)
69 | y[y == 'setosa'] = 'versicolor'
70 | y = as.factor(y)
71 | X = X[, -5]
72 |
73 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F)
74 |
75 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F)
76 |
77 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
78 |
79 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
80 | })
81 |
82 |
83 | testthat::test_that("glmnet for the feature_selection object, returns 2 column data frame (CV_folds = 1) in the res object, and plot", {
84 |
85 | X = iris[, -5]
86 | y = X[, 1]
87 | X = X[, -1]
88 |
89 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
90 |
91 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)
92 |
93 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
94 |
95 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
96 | })
97 |
98 |
99 |
100 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Frequency", {
101 |
102 | y = iris[, 5]
103 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
104 | X = iris[, -5]
105 |
106 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
107 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
108 |
109 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL)
110 |
111 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
112 |
113 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Frequency'))
114 | })
115 |
116 |
117 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Gain", {
118 |
119 | y = iris[, 5]
120 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
121 | X = iris[, -5]
122 |
123 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
124 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
125 |
126 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL)
127 |
128 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
129 |
130 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain'))
131 | })
132 |
133 |
134 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Cover", {
135 |
136 | y = iris[, 5]
137 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
138 | X = iris[, -5]
139 |
140 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
141 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
142 |
143 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL)
144 |
145 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
146 |
147 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Cover'))
148 | })
149 |
150 |
151 | testthat::test_that("ranger for the feature_selection object", {
152 |
153 | X = iris[, -5]
154 | y = X[, 1]
155 | X = X[, -1]
156 |
157 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity')
158 |
159 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5)
160 |
161 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8)
162 |
163 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
164 | })
165 |
166 |
167 | # wrapper_feat_select() function
168 |
169 |
170 | testthat::test_that("plot of all methods when union = T", {
171 |
172 | X = iris[, -5]
173 | y = X[, 1]
174 | X = X[, -1]
175 |
176 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
177 |
178 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
179 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
180 |
181 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
182 |
183 | params_features = list(keep_number_feat = NULL, union = T)
184 |
185 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
186 |
187 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
188 |
189 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
190 |
191 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
192 | })
193 |
194 |
195 | testthat::test_that("plot of all methods when union = T", {
196 |
197 | X = iris[, -5]
198 | y = X[, 1]
199 | X = X[, -1]
200 |
201 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
202 |
203 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
204 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
205 |
206 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
207 |
208 | params_features = list(keep_number_feat = NULL, union = T)
209 |
210 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
211 |
212 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
213 |
214 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
215 |
216 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain'))
217 | })
218 |
219 |
220 | testthat::test_that("plot of all methods when union = T", {
221 |
222 | X = iris[, -5]
223 | y = X[, 1]
224 | X = X[, -1]
225 |
226 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
227 |
228 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
229 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
230 |
231 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
232 |
233 | params_features = list(keep_number_feat = NULL, union = T)
234 |
235 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
236 |
237 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
238 |
239 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
240 |
241 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Cover'))
242 | })
243 |
244 | testthat::test_that("plot of all methods when union = F", {
245 |
246 | X = iris[, -5]
247 | y = X[, 1]
248 | X = X[, -1]
249 |
250 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
251 |
252 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
253 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
254 |
255 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
256 |
257 | params_features = list(keep_number_feat = NULL, union = F)
258 |
259 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
260 |
261 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
262 |
263 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
264 |
265 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
266 | })
267 |
268 |
269 | testthat::test_that("plot two of the methods when union = F", {
270 |
271 | X = iris[, -5]
272 | y = X[, 1]
273 | X = X[, -1]
274 |
275 |
276 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
277 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
278 |
279 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
280 |
281 | params_features = list(keep_number_feat = NULL, union = F)
282 |
283 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
284 |
285 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
286 |
287 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
288 |
289 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain'))
290 | })
291 |
292 |
293 | testthat::test_that("plot one of the methods when union = F, [ ranger ]", {
294 |
295 | X = iris[, -5]
296 | y = X[, 1]
297 | X = X[, -1]
298 |
299 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
300 |
301 | params_features = list(keep_number_feat = NULL, union = F)
302 |
303 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = "Gain",
304 |
305 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
306 |
307 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
308 |
309 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
310 | })
311 |
312 |
313 | testthat::test_that("plot one of the methods when union = F [ glmnet - lasso ]", {
314 |
315 | X = iris[, -5]
316 | y = X[, 1]
317 | X = X[, -1]
318 |
319 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
320 |
321 |
322 | params_features = list(keep_number_feat = NULL, union = F)
323 |
324 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = NULL, params_ranger = NULL, xgb_sort = "Gain",
325 |
326 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
327 |
328 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
329 |
330 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
331 | })
332 |
333 |
334 | testthat::test_that("plot one of the methods when union = F [ xgboost ]", {
335 |
336 | X = iris[, -5]
337 | y = X[, 1]
338 | X = X[, -1]
339 |
340 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
341 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
342 |
343 | params_features = list(keep_number_feat = NULL, union = F)
344 |
345 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = NULL, xgb_sort = "Gain",
346 |
347 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
348 |
349 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
350 |
351 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL))
352 | })
353 |
354 |
355 | testthat::test_that("plot two of the methods when union = T", {
356 |
357 | X = iris[, -5]
358 | y = X[, 1]
359 | X = X[, -1]
360 |
361 |
362 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
363 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
364 |
365 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
366 |
367 | params_features = list(keep_number_feat = NULL, union = T)
368 |
369 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
370 |
371 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
372 |
373 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0)
374 |
375 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain'))
376 | })
377 |
378 |
--------------------------------------------------------------------------------
/tests/testthat/test-classification_folds.R:
--------------------------------------------------------------------------------
1 | context("Split data in classification folds")
2 |
3 |
4 | testthat::test_that("throws an error if the RESP is not a factor", {
5 |
6 | y = c(1:10)
7 |
8 | testthat::expect_error(class_folds(5, y, shuffle = T), "RESP must be a factor")
9 | })
10 |
11 |
12 | testthat::test_that("returns a warning if the folds are not equally split", {
13 |
14 | y = as.factor(sample(1:5, 99, replace = T))
15 |
16 | testthat::expect_warning(class_folds(5, y, shuffle = T), 'the folds are not equally split')
17 | })
18 |
19 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", {
20 |
21 | y = as.factor(sample(1:5, 100, replace = T))
22 |
23 | testthat::expect_length(class_folds(5, y, shuffle = T), 5)
24 | })
25 |
26 |
27 | # return object with shuffle = F
28 |
29 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", {
30 |
31 | y = as.factor(sample(1:5, 100, replace = T))
32 |
33 | testthat::expect_length(class_folds(5, y, shuffle = F), 5)
34 | })
35 |
--------------------------------------------------------------------------------
/tests/testthat/test-correlation_function.R:
--------------------------------------------------------------------------------
1 | data(iris)
2 | iris1 = iris
3 | iris1$Species = as.numeric(iris1$Species)
4 |
5 | context('Correlation function')
6 |
7 |
8 | testthat::test_that("if any of the data columns is factor or character it returns an error", {
9 |
10 | testthat::expect_error(func_correlation(iris, target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson"))
11 | })
12 |
13 |
14 | testthat::test_that("if the correlation_thresh is NULL it returns an error", {
15 |
16 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = NULL, use_obs = "everything", correlation_method = "pearson"))
17 | })
18 |
19 | testthat::test_that("if the correlation_thresh is > 1.0 it returns an error", {
20 |
21 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 1.1, use_obs = "everything", correlation_method = "pearson"))
22 | })
23 |
24 | testthat::test_that("if the correlation_thresh is <= 0.0 it returns an error", {
25 |
26 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = -1.0, use_obs = "everything", correlation_method = "pearson"))
27 | })
28 |
29 | testthat::test_that("if the use_obs is NULL it returns an error", {
30 |
31 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 0.75, use_obs = NULL, correlation_method = "pearson"))
32 | })
33 |
34 |
35 | testthat::test_that("if the use_obs is NULL it returns an error", {
36 |
37 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = NULL))
38 | })
39 |
40 |
41 | testthat::test_that("if data is not a data frame or matrix it returns an error", {
42 |
43 | testthat::expect_error(func_correlation(list(iris[, -ncol(iris)]), target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson"))
44 | })
45 |
46 |
47 | testthat::test_that("it takes a data frame without a target and it returns a list", {
48 |
49 | testthat::expect_true(is.list(func_correlation(iris1, target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson")))
50 | })
51 |
52 | testthat::test_that("it takes a data frame with a target and it returns a data frame", {
53 |
54 | testthat::expect_true(is.data.frame(func_correlation(iris1, target = c('Species'), correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson")))
55 | })
56 |
57 |
58 | testthat::test_that("it takes a data frame with a vector of predictors and it returns a data frame", {
59 |
60 | testthat::expect_true(is.data.frame(func_correlation(iris1, target = c('Species', 'Petal.Width'), correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson")))
61 | })
62 |
63 |
64 |
--------------------------------------------------------------------------------
/tests/testthat/test-data_split_function.R:
--------------------------------------------------------------------------------
1 | context('Split of data')
2 |
3 |
4 | testthat::test_that("throws an error if the TrainRation is less than expected", {
5 |
6 | y = c(1:50)
7 |
8 | testthat::expect_error(DataSplit(y, TrainRatio = 1.1), 'TrainRation should be a float number greater than 0 and less than 1.0')
9 | })
10 |
11 |
12 | testthat::test_that("returns a list of indices", {
13 |
14 | y = c(1:50)
15 |
16 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75)))
17 | })
18 |
19 |
20 | # test function with regression = T, F and shuffle = T, F
21 |
22 | testthat::test_that("returns a list of indices", {
23 |
24 | y = as.factor(sample(letters[1:2], 50 , replace = T))
25 |
26 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, regression = F)))
27 | })
28 |
29 | testthat::test_that("returns a list of indices", {
30 |
31 | y = c(1:50)
32 |
33 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, shuffle = T)))
34 | })
35 |
36 | testthat::test_that("returns a list of indices", {
37 |
38 | y = c(1:50)
39 |
40 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, shuffle = F)))
41 | })
42 |
--------------------------------------------------------------------------------
/tests/testthat/test-feature_selection.R:
--------------------------------------------------------------------------------
1 | data(iris)
2 |
3 |
4 | context('Feature selection')
5 |
6 | #====================
7 | # handling of errors
8 | #====================
9 |
10 |
11 | testthat::test_that("feature selection returns a error if method is not specified", {
12 |
13 | X = iris[, -5]
14 | y = X[, 1]
15 | X = X[, -1]
16 |
17 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
18 |
19 | testthat::expect_error(feature_selection(X, y, method = NULL, params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL))
20 | })
21 |
22 |
23 | testthat::test_that("feature selection returns a error if CV less than 1", {
24 |
25 | X = iris[, -5]
26 | y = X[, 1]
27 | X = X[, -1]
28 |
29 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
30 |
31 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 0, cores_glmnet = NULL))
32 | })
33 |
34 |
35 | testthat::test_that("glmnet lasso returns a error if data is not a data frame for folds = 1", {
36 |
37 | X = iris[, -5]
38 | y = X[, 1]
39 | X = list(X[, -1])
40 |
41 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
42 |
43 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL))
44 | })
45 |
46 |
47 | testthat::test_that("glmnet lasso returns a error if data is not a data frame for folds = 5", {
48 |
49 | X = iris[, -5]
50 | y = X[, 1]
51 | X = list(X[, -1])
52 |
53 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
54 |
55 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL))
56 | })
57 |
58 |
59 | #================================================================
60 | # handling NA's in glmnet-lasso (replaced with the column median; see the hedged sketch at the end of this file)
61 | # xgboost, ranger treat missing values as an extra category
62 | #================================================================
63 |
64 |
65 | testthat::test_that("missing values in data frame will be replaced with the median, in case of CV_folds = 1", {
66 |
67 | X = iris[, -5]
68 | y = X[, 1]
69 | X = X[, -1]
70 |
71 | X[, 1] = func_nas(X, 1) # add NA's to first and last column
72 | X[, ncol(X)] = func_nas(X, ncol(X))
73 |
74 |
75 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
76 |
77 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)
78 |
79 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
80 | })
81 |
82 |
83 | testthat::test_that("missing values in data frame will be replaced with the median, in case of CV_folds = 5", {
84 |
85 | X = iris[, -5]
86 | y = X[, 1]
87 | X = X[, -1]
88 |
89 | X[, 1] = func_nas(X, 1) # add NA's to first and last column
90 | X[, ncol(X)] = func_nas(X, ncol(X))
91 |
92 |
93 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
94 |
95 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL)
96 |
97 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
98 | })
99 |
100 |
101 | testthat::test_that("missing values in sparse Matrix will be replaced with the median, in case of CV_folds = 1", {
102 |
103 | X = iris[, -5]
104 | y = X[, 1]
105 | X = X[, -1]
106 |
107 | X[, 1] = func_nas(X, 1) # add NA's to first and last column
108 | X[, ncol(X)] = func_nas(X, ncol(X))
109 |
110 | X = Matrix::Matrix(as.matrix(X), sparse = T) # convert to sparse matrix
111 |
112 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
113 |
114 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)
115 |
116 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
117 | })
118 |
119 |
120 | testthat::test_that("missing values in sparse Matrix will be replaced with the median, in case of CV_folds = 5", {
121 |
122 | X = iris[, -5]
123 | y = X[, 1]
124 | X = X[, -1]
125 |
126 | X[, 1] = func_nas(X, 1) # add NA's to first and last column
127 | X[, ncol(X)] = func_nas(X, ncol(X))
128 | X = Matrix::Matrix(as.matrix(X), sparse = T) # convert to sparse matrix
129 |
130 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
131 |
132 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL)
133 |
134 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
135 | })
136 |
137 | #======================
138 | # test - cases : glmnet
139 | #======================
140 |
141 |
142 | testthat::test_that("glmnet returns a data frame with the important predictors, in case 3 folds", {
143 |
144 | X = iris[, -5]
145 | y = X[, 1]
146 | X = X[, -1]
147 |
148 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
149 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL, verbose = T)
150 |
151 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
152 | })
153 |
154 |
155 | testthat::test_that("glmnet returns a data frame with the important predictors, in case 1 fold", {
156 |
157 | X = iris[, -5]
158 | y = X[, 1]
159 | X = X[, -1]
160 |
161 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
162 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)
163 |
164 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
165 | })
166 |
167 |
168 | testthat::test_that("glmnet returns an error if data is not a matrix, data.frame or sparse matrix , in case of fold = 1", {
169 |
170 | X = iris[, -5]
171 | y = X[, 1]
172 | X = list(X[, -1])
173 |
174 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
175 |
176 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL))
177 | })
178 |
179 |
180 | testthat::test_that("glmnet takes a matrix and returns a data frame with the important predictors, in case 1 fold", {
181 |
182 | X = iris[, -5]
183 | y = X[, 1]
184 | X = as.matrix(X[, -1])
185 |
186 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
187 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, verbose = T)
188 |
189 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
190 | })
191 |
192 |
193 | testthat::test_that("glmnet takes a matrix and returns a data frame with the important predictors, in case 3 fold", {
194 |
195 | X = iris[, -5]
196 | y = X[, 1]
197 | X = as.matrix(X[, -1])
198 |
199 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
200 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL)
201 |
202 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
203 | })
204 |
205 |
206 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case 1 fold", {
207 |
208 | X = iris[, -5]
209 | y = X[, 1]
210 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
211 |
212 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
213 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, verbose = T)
214 |
215 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
216 | })
217 |
218 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case 3 fold", {
219 |
220 | X = iris[, -5]
221 | y = X[, 1]
222 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
223 |
224 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
225 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, verbose = T)
226 |
227 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
228 | })
229 |
230 |
231 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 1 fold, here use argument scale_coefs_glmnet AND verbose", {
232 |
233 | X = iris[, -5]
234 | y = X[, 1]
235 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
236 |
237 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
238 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = T)
239 |
240 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
241 | })
242 |
243 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 3 folds, here use argument scale_coefs_glmnet AND verbose", {
244 |
245 | X = iris[, -5]
246 | y = X[, 1]
247 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
248 |
249 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
250 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = T)
251 |
252 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
253 | })
254 |
255 | testthat::test_that("glmnet takes a data frame and in case of the binomial classification it returns a data frame with the important predictors, in case of 1 fold", {
256 |
257 | X = iris
258 | y = X[, 5]
259 | y = as.character(y)
260 | y[y == 'setosa'] = 'versicolor'
261 | y = as.factor(y)
262 | X = X[, -5]
263 |
264 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F)
265 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F)
266 |
267 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
268 | })
269 |
270 |
271 | testthat::test_that("glmnet takes a data frame and in case of the binomial classification it returns a data frame with the important predictors, in case of 3 fold", {
272 |
273 | X = iris
274 | y = X[, 5]
275 | y = as.character(y)
276 | y[y == 'setosa'] = 'versicolor'
277 | y = as.factor(y)
278 | X = X[, -5]
279 |
280 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F)
281 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F)
282 |
283 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
284 | })
285 |
286 |
287 | testthat::test_that("glmnet takes a data frame and in case of the multinomial classification it returns a data frame with the important predictors, in case of 1 fold", {
288 |
289 | X = iris
290 | y = X[, 5]
291 | X = X[, -5]
292 |
293 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F)
294 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F)
295 |
296 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
297 | })
298 |
299 | testthat::test_that("glmnet takes a data frame and in case of the multinomial classification it returns a data frame with the important predictors, in case of 3 fold", {
300 |
301 | X = iris
302 | y = X[, 5]
303 | X = X[, -5]
304 |
305 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F)
306 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F)
307 |
308 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
309 | })
310 |
311 |
312 | #=======================
313 | # test - cases : xgboost
314 | #=======================
315 |
316 |
317 | testthat::test_that("xgboost returns an error if data not data.frame or matrix, if folds = 1", {
318 |
319 | y = iris[, 5]
320 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
321 | X = list(iris[, -5])
322 |
323 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
324 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
325 |
326 | testthat::expect_error(feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1))
327 | })
328 |
329 |
330 | testthat::test_that("xgboost returns an error if data not data.frame or matrix, if folds = 5", {
331 |
332 | y = iris[, 5]
333 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
334 | X = list(iris[, -5])
335 |
336 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
337 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
338 |
339 | testthat::expect_error(feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 5))
340 | })
341 |
342 |
343 | testthat::test_that("xgboost returns a data frame with the important predictors, in case 1 folds, default xgb_sort = Frequency", {
344 |
345 | y = iris[, 5]
346 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
347 | X = iris[, -5]
348 |
349 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
350 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
351 |
352 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, verbose = T)
353 |
354 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
355 | })
356 |
357 |
358 | testthat::test_that("xgboost returns a data frame with the important predictors, in case 1 folds, xgb_sort = Gain", {
359 |
360 | y = iris[, 5]
361 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
362 | X = iris[, -5]
363 |
364 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
365 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
366 |
367 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = 'Gain', verbose = T)
368 |
369 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
370 | })
371 |
372 |
373 | testthat::test_that("xgboost returns a data frame with the important predictors, in case 1 folds, xgb_sort = Cover", {
374 |
375 | y = iris[, 5]
376 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
377 | X = iris[, -5]
378 |
379 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
380 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
381 |
382 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = 'Cover')
383 |
384 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
385 | })
386 |
387 |
388 |
389 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 1 folds", {
390 |
391 | X = iris[, -5]
392 | y = X[, 1]
393 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
394 |
395 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
396 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
397 |
398 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1)
399 |
400 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
401 | })
402 |
403 |
404 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", {
405 |
406 | X = iris[, -5]
407 | y = X[, 1]
408 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
409 |
410 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
411 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
412 |
413 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3)
414 |
415 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
416 | })
417 |
418 |
419 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", {
420 |
421 | X = iris[, -5]
422 | y = X[, 1]
423 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
424 |
425 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
426 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
427 |
428 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, xgb_sort = 'Gain', verbose = T)
429 |
430 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
431 | })
432 |
433 |
434 |
435 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", {
436 |
437 | X = iris[, -5]
438 | y = X[, 1]
439 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T)
440 |
441 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
442 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
443 |
444 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, xgb_sort = 'Cover')
445 |
446 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
447 | })
448 |
449 |
450 | testthat::test_that("xgboost using data.frame in multiclass classification, returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", {
451 |
452 | y = iris[, 5]
453 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
454 | X = iris[, -5]
455 |
456 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
457 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
458 |
459 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, verbose = T)
460 |
461 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
462 | })
463 |
464 |
465 | testthat::test_that("xgboost using data.frame in binary classification, returns a data frame with the important predictors, in case 5 folds, default xgb_sort = Frequency", {
466 |
467 | y = iris[, 5]
468 | multiclass_xgboost = ifelse(y == 'setosa', 0, 1)
469 | X = iris[, -5]
470 |
471 | params_xgboost = list( params = list("objective" = "binary:logistic", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
472 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
473 |
474 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 5)
475 |
476 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
477 | })
478 |
479 |
480 |
481 | #=======================
482 | # test - cases : ranger
483 | #=======================
484 |
485 |
486 | testthat::test_that("ranger returns an error if data not data.frame or matrix, folds = 1", {
487 |
488 | y = iris[, 5]
489 | y = as.character(y)
490 | y[y == 'setosa'] = 'virginica'
491 | X = list(iris[, -5])
492 |
493 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
494 |
495 | testthat::expect_error(feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1))
496 | })
497 |
498 | testthat::test_that("ranger returns an error if data not data.frame or matrix, folds > 1", {
499 |
500 | y = iris[, 5]
501 | y = as.character(y)
502 | y[y == 'setosa'] = 'virginica'
503 | X = list(iris[, -5])
504 |
505 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
506 |
507 | testthat::expect_error(feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 3))
508 | })
509 |
510 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1", {
511 |
512 | y = iris[, 5]
513 | y = as.character(y)
514 | y[y == 'setosa'] = 'virginica'
515 | X = iris[, -5]
516 |
517 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
518 |
519 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1, verbose = T)
520 |
521 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
522 | })
523 |
524 |
525 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5", {
526 |
527 | y = iris[, 5]
528 | y = as.character(y)
529 | y[y == 'setosa'] = 'virginica'
530 | X = iris[, -5]
531 |
532 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
533 |
534 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5, verbose = T)
535 |
536 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
537 | })
538 |
539 |
540 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1, in regression", {
541 |
542 | X = iris[, -5]
543 | y = X[, 1]
544 | X = X[, -1]
545 |
546 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity')
547 |
548 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1, verbose = T)
549 |
550 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
551 | })
552 |
553 |
554 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5, in regression", {
555 |
556 | X = iris[, -5]
557 | y = X[, 1]
558 | X = X[, -1]
559 |
560 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity')
561 |
562 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5, verbose = T)
563 |
564 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
565 | })
566 |
567 |
568 |
569 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5, in regression", {
570 |
571 | X = iris[, -5]
572 | y = X[, 1]
573 | X = X[, -1]
574 |
575 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity')
576 |
577 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5)
578 |
579 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
580 | })
581 |
582 |
583 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1, in regression, when using dependent.variable.name ", {
584 |
585 | X = iris
586 | y = iris[, 'Species'] # if dependent.variable.name is given, the response variable (y) must also be specified, so that the folds can be built
587 |
588 | params_ranger = list(dependent.variable.name = "Species", write.forest = TRUE, probability = T, num.threads = 6, num.trees = 50, verbose = FALSE, classification = T, mtry = 2,
589 |
590 | min.node.size = 5, importance = 'impurity')
591 |
592 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1)
593 |
594 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
595 | })
596 |
597 |
598 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 3, in regression, when using dependent.variable.name ", {
599 |
600 | X = iris
601 | y = iris[, 'Species'] # if dependent.variable.name is given, the response variable (y) must also be specified, so that the folds can be built
602 |
603 | params_ranger = list(dependent.variable.name = "Species", write.forest = TRUE, probability = T, num.threads = 6, num.trees = 50, verbose = FALSE, classification = T, mtry = 2,
604 |
605 | min.node.size = 5, importance = 'impurity')
606 |
607 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 3, verbose = T)
608 |
609 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty
610 | })
611 |
612 |
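613 | #=====================================================================
614 | # hedged sketch (referenced from the NA-handling comments above): one
615 | # way a median replacement of missing values can be done in base R.
616 | # This only illustrates the idea; the package's own 'func_replace_NAs'
617 | # may differ in its exact signature and behavior.
618 | #=====================================================================
619 |
620 | median_impute = function(df) {
621 |
622 |   for (j in seq_len(ncol(df))) {
623 |
624 |     nas = is.na(df[, j])
625 |
626 |     if (any(nas)) df[nas, j] = stats::median(df[, j], na.rm = TRUE)   # replace NA's with the column median
627 |   }
628 |
629 |   df
630 | }
631 |
632 | tmp = iris[, -5]
633 | tmp[sample(nrow(tmp), 10), 1] = NA    # introduce NA's in the first column
634 | sum(is.na(median_impute(tmp)))        # expected to be 0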
--------------------------------------------------------------------------------
/tests/testthat/test-regression_folds.R:
--------------------------------------------------------------------------------
1 | context("Split data in regression folds")
2 |
3 |
4 | testthat::test_that("throws an error if the RESP is not a factor", {
5 |
6 | y = as.factor(c(1:50))
7 |
8 | testthat::expect_error(regr_folds(5, y, stratified = F), "this function is meant for regression for classification use the 'class_folds' function")
9 | })
10 |
11 |
12 | testthat::test_that("returns a warning if the folds are not equally split", {
13 |
14 | y = sample(1:5, 99, replace = T)
15 |
16 | testthat::expect_warning(regr_folds(5, y, stratified = F), 'the folds are not equally split')
17 | })
18 |
19 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", {
20 |
21 | y = sample(1:5, 100, replace = T)
22 |
23 | testthat::expect_length(regr_folds(5, y, stratified = F), 5)
24 | })
25 |
26 | # object with stratified = T
27 |
28 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", {
29 |
30 | y = sample(1:5, 100, replace = T)
31 |
32 | testthat::expect_length(regr_folds(5, y, stratified = T), 5)
33 | })
34 |
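35 | # A minimal usage sketch (illustration only): per the tests above, 'regr_folds'
36 | # returns a list with one vector of row indices per fold.
37 | y = sample(1:5, 100, replace = TRUE)
38 | folds = regr_folds(5, y, stratified = FALSE)
39 | lengths(folds)    # five folds of (roughly) equal size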
--------------------------------------------------------------------------------
/tests/testthat/test-secondary_functions_func_correlation.R:
--------------------------------------------------------------------------------
1 | context('Secondary functions func_correlation')
2 |
3 | # 'remove_duplic_func' function
4 |
5 | testthat::test_that("takes a data frame and returns a data frame", {
6 |
7 | df = data.frame(a = sample(1:4, 100, replace = T))
8 |
9 | testthat::expect_true(is.data.frame(remove_duplic_func(df)))
10 | })
11 |
12 |
13 | testthat::test_that("takes a single column data frame and returns a 3-column data frame", {
14 |
15 | df = data.frame(a = sample(1:4, 100, replace = T))
16 |
17 | out = ncol(remove_duplic_func(df))
18 |
19 | testthat::expect_true(out == 3)
20 | })
21 |
22 |
23 | # 'second_func_cor' function
24 |
25 | testthat::test_that("takes a data frame and returns a list", {
26 |
27 | df = data.frame(a = sample(0:4, 100, replace = T), d = sample(0:6, 100, replace = T))
28 |
29 | testthat::expect_true(is.list(second_func_cor(df)))
30 | })
31 |
32 |
33 | testthat::test_that("takes a data frame and returns a list, the length of the list equals the number of columns of the data frame", {
34 |
35 | df = data.frame(a = sample(0:4, 100, replace = T), d = sample(0:6, 100, replace = T))
36 |
37 | lst = second_func_cor(df)
38 |
39 | testthat::expect_true(ncol(df) == length(lst))
40 | })
41 |
42 |
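43 | # A minimal usage sketch (illustration only): as the tests above show, the list
44 | # returned by 'second_func_cor' has one entry per column of the input data frame.
45 | df = data.frame(a = sample(0:4, 100, replace = TRUE), d = sample(0:6, 100, replace = TRUE))
46 | length(second_func_cor(df)) == ncol(df)    # TRUE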
--------------------------------------------------------------------------------
/tests/testthat/test-shuffle_data.R:
--------------------------------------------------------------------------------
1 | context('Shuffle and normalize data functions')
2 |
3 |
4 | # shuffle data
5 |
6 | testthat::test_that("shuffle data takes a vector as input and returns a vector as output", {
7 |
8 | y = c(1:50)
9 |
10 | testthat::expect_true(is.vector(func_shuffle(y, times = 10)))
11 | })
12 |
13 | testthat::test_that("the length of the input vector equals the length of the output vector", {
14 |
15 | y = c(1:50)
16 |
17 | output = func_shuffle(y, times = 10)
18 |
19 | testthat::expect_true(length(y) == length(output))
20 | })
21 |
22 |
23 | # normalize data
24 |
25 |
26 | testthat::test_that("normalize data takes a vector as input and returns a vector as output", {
27 |
28 | y = c(1:50)
29 |
30 | testthat::expect_true(is.vector(normalized(y)))
31 | })
32 |
33 | testthat::test_that("the length of the input vector equals the length of the output vector", {
34 |
35 | y = c(1:50)
36 |
37 | output = normalized(y)
38 |
39 | testthat::expect_true(length(y) == length(output))
40 | })
41 |
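42 | # A minimal property sketch (illustration only, assuming 'func_shuffle' merely
43 | # permutes its input and 'normalized' rescales it):
44 | y = c(1:50)
45 | all(sort(func_shuffle(y, times = 10)) == sort(y))    # TRUE if the shuffle is a pure permutation
46 | range(normalized(y))                                  # inspect the bounds of the rescaled vector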
--------------------------------------------------------------------------------
/tests/testthat/test-wrapper_feature_selection.R:
--------------------------------------------------------------------------------
1 | data(iris)
2 |
3 | context('Wraps all the methods')
4 |
5 |
6 | testthat::test_that("it returns an error if all methods are NULL", {
7 |
8 | X = iris[, -5]
9 | y = X[, 1]
10 | X = X[, -1]
11 |
12 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
13 |
14 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
15 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE)
16 |
17 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation')
18 |
19 | params_features = list(keep_number_feat = NULL, union = TRUE)
20 |
21 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = NULL, xgb_sort = NULL,
22 | CV_folds = 1, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features))
23 | })
24 |
25 |
26 | testthat::test_that("it returns an error if cv folds less than 2", {
27 |
28 | X = iris[, -5]
29 | y = X[, 1]
30 | X = X[, -1]
31 |
32 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
33 |
34 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
35 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE)
36 |
37 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation')
38 |
39 | params_features = list(keep_number_feat = NULL, union = TRUE)
40 |
41 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
42 | CV_folds = 1, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features))
43 | })
44 |
45 |
46 | testthat::test_that("it returns an error if the importance method in ranger is not specified", {
47 |
48 | X = iris[, -5]
49 | y = X[, 1]
50 | X = X[, -1]
51 |
52 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
53 |
54 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
55 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE)
56 |
57 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'none')
58 |
59 | params_features = list(keep_number_feat = NULL, union = TRUE)
60 |
61 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
62 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features))
63 | })
64 |
65 |
66 |
67 | testthat::test_that("it returns an error if params_features$union == TRUE AND length(method) == 1", {
68 |
69 | X = iris[, -5]
70 | y = X[, 1]
71 | X = X[, -1]
72 |
73 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
74 |
75 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
76 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE)
77 |
78 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation')
79 |
80 | params_features = list(keep_number_feat = NULL, union = TRUE)
81 |
82 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = NULL,
83 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features))
84 | })
85 |
86 |
87 | testthat::test_that("it returns a list with non-empty data frames when union = F", {
88 |
89 | X = iris[, -5]
90 | y = X[, 1]
91 | X = X[, -1]
92 |
93 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
94 |
95 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
96 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
97 |
98 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
99 |
100 | params_features = list(keep_number_feat = NULL, union = F)
101 |
102 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
103 |
104 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
105 |
106 | testthat::expect_true(is.list(res) & all(unlist(lapply(res, function(x) sum(dim(x)))) > 0))
107 | })
108 |
109 |
110 | testthat::test_that("it returns a list with non-empty data frames when union = F, sorted by Gain in xgboost", {
111 |
112 | X = iris[, -5]
113 | y = X[, 1]
114 | X = X[, -1]
115 |
116 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
117 |
118 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
119 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
120 |
121 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
122 |
123 | params_features = list(keep_number_feat = NULL, union = F)
124 |
125 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain",
126 |
127 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
128 |
129 | testthat::expect_true(is.list(res) & all(unlist(lapply(res, function(x) sum(dim(x)))) > 0))
130 | })
131 |
132 |
133 | testthat::test_that("it returns a list of two when union = T. The data frames in the first list and in the second list should be non-empty", {
134 |
135 | X = iris[, -5]
136 | y = X[, 1]
137 | X = X[, -1]
138 |
139 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
140 |
141 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
142 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
143 |
144 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
145 |
146 | params_features = list(keep_number_feat = NULL, union = T)
147 |
148 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
149 |
150 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
151 |
152 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
153 | })
154 |
155 |
156 | testthat::test_that("it returns a list of two when union = F. The data frames in the first list and in the second list should be non-empty", {
157 |
158 | X = iris[, -5]
159 | y = X[, 1]
160 | X = X[, -1]
161 |
162 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
163 |
164 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
165 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
166 |
167 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
168 |
169 | params_features = list(keep_number_feat = NULL, union = T)
170 |
171 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
172 |
173 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
174 |
175 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
176 | })
177 |
178 |
179 | # testthat::test_that("wrapper_feat_select works if the dependent.variable.name is in the parameters of ranger and all other methods are NULL", {
180 | #
181 | # X = iris
182 | # X$Species = as.numeric(X$Species)
183 | # y = X$Species
184 | #
185 | # params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F)
186 | #
187 | # params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
188 | # nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
189 | #
190 | # params_ranger = list(dependent.variable.name = "Species", probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
191 | #
192 | # params_features = list(keep_number_feat = NULL, union = F)
193 | #
194 | # res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = NULL,
195 | #
196 | # CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
197 | #
198 | # testthat::expect_true(is.list(res) & sum(dim(res$ranger)) > 0)
199 | # })
200 |
201 |
202 | # testthat::test_that("wrapper_feat_select returns an error if the dependent.variable.name is in the parameters of ranger and one or two of the other methods are not NULL", {
203 | #
204 | # X = iris
205 | # X$Species = as.numeric(X$Species)
206 | # y = X$Species
207 | #
208 | # params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE)
209 | #
210 | # params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
211 | # nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
212 | #
213 | # params_ranger = list(dependent.variable.name = "Species", probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
214 | #
215 | # params_features = list(keep_number_feat = NULL, union = F)
216 | #
217 | # testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
218 | #
219 | # CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features))
220 | # })
221 |
222 |
223 |
224 | testthat::test_that("it works in BINOMIAL classification. The labels should be in c(0,1), so that xgboost works. It should return a
225 |
226 | list of two when union = T. The data frames in the first list and in the second list should be non-empty. Case binary classification", {
227 |
228 | y = iris[, 5]
229 | y = as.character(y)
230 | y[y == 'setosa'] = 'virginica'
231 | y = as.numeric(as.factor(y)) - 1
232 | X = iris[, -5]
233 |
234 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = TRUE)
235 |
236 | params_xgboost = list( params = list("objective" = "reg:logistic", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
237 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
238 |
239 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
240 |
241 | params_features = list(keep_number_feat = NULL, union = T)
242 |
243 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
244 |
245 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
246 |
247 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
248 | })
249 |
250 |
251 | testthat::test_that("it works in BINOMIAL classification. The labels should be in c(0,1), so that xgboost works. It should return a
252 |
253 | list of two when union = T. The data frames in the first list and in the second list should be non-empty. Case binary classification, sorted by Cover in xgboost", {
254 |
255 | y = iris[, 5]
256 | y = as.character(y)
257 | y[y == 'setosa'] = 'virginica'
258 | y = as.numeric(as.factor(y)) - 1
259 | X = iris[, -5]
260 |
261 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = TRUE)
262 |
263 | params_xgboost = list( params = list("objective" = "reg:logistic", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
264 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
265 |
266 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
267 |
268 | params_features = list(keep_number_feat = NULL, union = T)
269 |
270 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Cover",
271 |
272 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
273 |
274 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
275 | })
276 |
277 |
278 | testthat::test_that("it works in MULTICLASS classification. The labels should be in c(0 to Inf), so that xgboost works. It should return a
279 |
280 | list of two when union = T. The data frames in the first and in the second list should be non-empty. Case multiclass classification", {
281 |
282 | y = iris[, 5]
283 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
284 | X = iris[, -5]
285 |
286 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F)
287 |
288 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
289 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
290 |
291 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
292 |
293 | params_features = list(keep_number_feat = NULL, union = T)
294 |
295 | res = wrapper_feat_select(X, multiclass_xgboost, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
296 |
297 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
298 |
299 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
300 | })
301 |
302 |
303 |
304 | testthat::test_that("it works in MULTICLASS classification. The labels should be in c(0 to Inf), so that xgboost works. It should return a
305 |
306 | list of two when union = T. The data frames in the first and in the second list should be non-empty. Case multiclass classification, sorted by Frequency in xgboost", {
307 |
308 | y = iris[, 5]
309 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2))
310 | X = iris[, -5]
311 |
312 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F)
313 |
314 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2),
315 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
316 |
317 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity')
318 |
319 | params_features = list(keep_number_feat = NULL, union = T)
320 |
321 | res = wrapper_feat_select(X, multiclass_xgboost, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Frequency",
322 |
323 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)
324 |
325 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0)
326 | })
327 |
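328 | # A minimal end-to-end sketch (illustration only): with union = TRUE the tests
329 | # above show that the result carries 'all_feat' (one importance data frame per
330 | # method) and 'union_feat' (the merged ranking). The parameter values below are
331 | # illustrative, not recommendations.
332 | X = iris[, -5]; y = X[, 1]; X = X[, -1]
333 |
334 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = FALSE)
335 |
336 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2),
337 |                        nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE)
338 |
339 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation')
340 |
341 | params_features = list(keep_number_feat = NULL, union = TRUE)
342 |
343 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL,
344 |                           CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)
345 |
346 | names(res$all_feat)    # one data frame per method
347 | head(res$union_feat)   # the union of the per-method rankings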
--------------------------------------------------------------------------------
/tic.R:
--------------------------------------------------------------------------------
1 | # installs dependencies, runs R CMD check, runs covr::codecov()
2 | do_package_checks()
3 |
4 | if (ci_on_ghactions() && ci_has_env("BUILD_PKGDOWN")) {
5 | # creates pkgdown site and pushes to gh-pages branch
6 | # only for the runner with the "BUILD_PKGDOWN" env var set
7 | do_pkgdown()
8 | }
9 |
--------------------------------------------------------------------------------