├── .Rbuildignore ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── issue_template.md └── workflows │ ├── issue.yml │ ├── stale-actions.yml │ └── tic.yml ├── .gitignore ├── DESCRIPTION ├── Dockerfile ├── NAMESPACE ├── NEWS.md ├── R ├── correlation_of_multiple_predictors.R ├── create_folds_and_data_split.R ├── feature_selection.R ├── plot_feature_selection.R ├── utils.R └── wrapper_feature_selection.R ├── README.md ├── man ├── DataSplit.Rd ├── add_probs_dfs.Rd ├── barplot_feat_select.Rd ├── class_folds.Rd ├── feature_selection.Rd ├── func_correlation.Rd ├── func_replace_NAs.Rd ├── func_shuffle.Rd ├── normalized.Rd ├── regr_folds.Rd ├── remove_duplic_func.Rd ├── second_func_cor.Rd └── wrapper_feat_select.Rd ├── tests ├── testthat.R └── testthat │ ├── helper-function_for_tests.R │ ├── test-add_probability_matrices_OR_data_frames.R │ ├── test-barplot_feat_select.R │ ├── test-classification_folds.R │ ├── test-correlation_function.R │ ├── test-data_split_function.R │ ├── test-feature_selection.R │ ├── test-regression_folds.R │ ├── test-secondary_functions_func_correlation.R │ ├── test-shuffle_data.R │ └── test-wrapper_feature_selection.R └── tic.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Dockerfile$ 2 | ^\.git$ 3 | ^\.github$ 4 | ^\.ccache$ 5 | ^tic\.R$ -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 2 | 3 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report or feature request 3 | about: Describe a bug you've encountered or make a case for a new feature 4 | --- 5 | 6 | Please briefly describe your problem and what output you expect.
If you have a question, you also have the option of asking it here (but I'm flexible if it's not too complicated) 7 | 8 | Please include a minimal reproducible example 9 | 10 | Please give a brief description of the problem 11 | 12 | Please add your Operating System (e.g., Windows 10, Macintosh, Linux) and the R version that you use (e.g., 3.6.2) 13 | 14 | If my package uses Python (via 'reticulate'), then please also add the Python version (e.g., Python 3.8) and the 'reticulate' version (e.g., 1.18.0) 15 | -------------------------------------------------------------------------------- /.github/workflows/issue.yml: -------------------------------------------------------------------------------- 1 | # For more info see: https://github.com/Renato66/auto-label 2 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 3 | 4 | name: Labeling new issue 5 | on: 6 | issues: 7 | types: ['opened'] 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: Renato66/auto-label@v2 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | ignore-comments: true 16 | labels-synonyms: '{"bug":["error","need fix","not working"],"enhancement":["upgrade"],"question":["help"]}' 17 | labels-not-allowed: '["good first issue"]' 18 | default-labels: '["help wanted"]' -------------------------------------------------------------------------------- /.github/workflows/stale-actions.yml: -------------------------------------------------------------------------------- 1 | # for the 'secrets.GITHUB_TOKEN' see: https://docs.github.com/en/actions/reference/authentication-in-a-workflow#about-the-github_token-secret 2 | 3 | name: "Mark or close stale issues and PRs" 4 | 5 | on: 6 | schedule: 7 | - cron: "00 * * * *" 8 | 9 | jobs: 10 | stale: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/stale@v3 14 | with: 15 | repo-token: ${{ secrets.GITHUB_TOKEN }} 16 | days-before-stale: 12 17 | days-before-close: 7 18 | stale-issue-message: "This is Robo-lampros because the Human-lampros is lazy. This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs. Feel free to re-open a closed issue and the Human-lampros will respond." 19 | stale-pr-message: "This is Robo-lampros because the Human-lampros is lazy. This PR has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs." 20 | close-issue-message: "This issue was automatically closed because it became stale. Feel free to re-open a closed issue and the Human-lampros will respond." 21 | close-pr-message: "This PR was automatically closed because it became stale." 22 | stale-pr-label: "stale" 23 | stale-issue-label: "stale" 24 | exempt-issue-labels: "bug,enhancement,pinned,security,pending,work_in_progress" 25 | exempt-pr-labels: "bug,enhancement,pinned,security,pending,work_in_progress" -------------------------------------------------------------------------------- /.github/workflows/tic.yml: -------------------------------------------------------------------------------- 1 | ## tic GitHub Actions template: linux-macos-windows-deploy 2 | ## revision date: 2020-12-11 3 | on: 4 | workflow_dispatch: 5 | push: 6 | pull_request: 7 | # for now, CRON jobs only run on the default branch of the repo (i.e.
usually on master) 8 | schedule: 9 | # * is a special character in YAML so you have to quote this string 10 | - cron: "0 4 * * *" 11 | 12 | name: tic 13 | 14 | jobs: 15 | all: 16 | runs-on: ${{ matrix.config.os }} 17 | 18 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | config: 24 | # use a different tic template type if you do not want to build on all listed platforms 25 | - { os: macOS-latest, r: "release", pkgdown: "true", latex: "true" } 26 | - { os: ubuntu-latest, r: "devel" } 27 | - { os: ubuntu-latest, r: "release" } 28 | 29 | env: 30 | # otherwise remotes::fun() errors cause the build to fail. Example: Unavailability of binaries 31 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 32 | CRAN: ${{ matrix.config.cran }} 33 | # make sure to run `tic::use_ghactions_deploy()` to set up deployment 34 | TIC_DEPLOY_KEY: ${{ secrets.TIC_DEPLOY_KEY }} 35 | # prevent rgl issues because no X11 display is available 36 | RGL_USE_NULL: true 37 | # if you use bookdown or blogdown, replace "PKGDOWN" by the respective 38 | # capitalized term. This also might need to be done in tic.R 39 | BUILD_PKGDOWN: ${{ matrix.config.pkgdown }} 40 | # macOS >= 10.15.4 linking 41 | SDKROOT: /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk 42 | # use GITHUB_TOKEN from GitHub to workaround rate limits in {remotes} 43 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | steps: 46 | - uses: actions/checkout@v3 47 | 48 | - uses: r-lib/actions/setup-r@v2 49 | with: 50 | r-version: ${{ matrix.config.r }} 51 | Ncpus: 4 52 | 53 | # LaTeX. Installation time: 54 | # Linux: ~ 1 min 55 | # macOS: ~ 1 min 30s 56 | # Windows: never finishes 57 | - uses: r-lib/actions/setup-tinytex@v2 58 | if: matrix.config.latex == 'true' 59 | 60 | - uses: r-lib/actions/setup-pandoc@v2 61 | 62 | # set date/week for use in cache creation 63 | # https://github.community/t5/GitHub-Actions/How-to-set-and-access-a-Workflow-variable/m-p/42970 64 | # - cache R packages daily 65 | - name: "[Cache] Prepare daily timestamp for cache" 66 | if: runner.os != 'Windows' 67 | id: date 68 | run: echo "::set-output name=date::$(date '+%d-%m')" 69 | 70 | - name: "[Cache] Cache R packages" 71 | if: runner.os != 'Windows' 72 | uses: pat-s/always-upload-cache@v2.1.3 73 | with: 74 | path: ${{ env.R_LIBS_USER }} 75 | key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}} 76 | restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}} 77 | 78 | # for some strange Windows reason this step and the next one need to be decoupled 79 | - name: "[Stage] Prepare" 80 | run: | 81 | Rscript -e "if (!requireNamespace('remotes')) install.packages('remotes', type = 'source')" 82 | Rscript -e "if (getRversion() < '3.2' && !requireNamespace('curl')) install.packages('curl', type = 'source')" 83 | 84 | - name: "[Stage] [Linux] Install curl and libgit2" 85 | if: runner.os == 'Linux' 86 | run: sudo apt install libcurl4-openssl-dev libgit2-dev 87 | 88 | - name: "[Stage] [macOS] Install libgit2" 89 | if: runner.os == 'macOS' 90 | run: brew install libgit2 91 | 92 | - name: "[Stage] [macOS] Install system libs for pkgdown" 93 | if: runner.os == 'macOS' && matrix.config.pkgdown != '' 94 | run: brew install harfbuzz fribidi 95 | 96 | - name: "[Stage] [Linux] Install system libs for pkgdown" 97 | if: runner.os == 'Linux' && matrix.config.pkgdown != '' 98 | run: sudo apt install libharfbuzz-dev libfribidi-dev 99 | 100 | - name: "[Stage] Install" 101 | if: matrix.config.os != 'macOS-latest' 
|| matrix.config.r != 'devel' 102 | run: Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()" 103 | 104 | # macOS devel needs its own stage because we need to work with an option to suppress the usage of binaries 105 | - name: "[Stage] Prepare & Install (macOS-devel)" 106 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'devel' 107 | run: | 108 | echo -e 'options(Ncpus = 4, pkgType = "source", repos = structure(c(CRAN = "https://cloud.r-project.org/")))' > $HOME/.Rprofile 109 | Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()" 110 | 111 | - name: "[Stage] Script" 112 | run: Rscript -e 'tic::script()' 113 | 114 | - name: "[Stage] After Success" 115 | if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release' 116 | run: Rscript -e "tic::after_success()" 117 | 118 | - name: "[Stage] Upload R CMD check artifacts" 119 | if: failure() 120 | uses: actions/upload-artifact@v2.2.1 121 | with: 122 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 123 | path: check 124 | - name: "[Stage] Before Deploy" 125 | run: | 126 | Rscript -e "tic::before_deploy()" 127 | 128 | - name: "[Stage] Deploy" 129 | run: Rscript -e "tic::deploy()" 130 | 131 | - name: "[Stage] After Deploy" 132 | run: Rscript -e "tic::after_deploy()" 133 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/ 2 | .Rhistory -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: FeatureSelection 2 | Type: Package 3 | Title: Feature extraction and selection based on 'glmnet', 'xgboost' and 'ranger' 4 | Version: 1.0.0 5 | Date: 2024-08-09 6 | Authors@R: c( person(given = "Lampros", family = "Mouselimis", email = "mouselimislampros@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "https://orcid.org/0000-0002-8024-1546"))) 7 | Maintainer: Lampros Mouselimis <mouselimislampros@gmail.com> 8 | BugReports: https://github.com/mlampros/FeatureSelection/issues 9 | URL: https://github.com/mlampros/FeatureSelection 10 | Description: Feature extraction and selection based on 'glmnet', 'xgboost' and 'ranger' R packages. This package also allows plotting the selected features and observing the correlation of multiple predictors.
11 | Depends: 12 | R(>= 3.3.0) 13 | Imports: 14 | doParallel, 15 | data.table, 16 | glmnet, 17 | ranger, 18 | xgboost, 19 | Matrix, 20 | magrittr, 21 | utils, 22 | stats, 23 | graphics, 24 | grDevices, 25 | rlang 26 | Suggests: 27 | testthat, 28 | covr 29 | SystemRequirements: update: apt-get -y update (deb) 30 | License: GPL-3 31 | Encoding: UTF-8 32 | RoxygenNote: 7.3.0 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/rstudio:devel 2 | 3 | LABEL maintainer='Lampros Mouselimis' 4 | 5 | RUN export DEBIAN_FRONTEND=noninteractive; apt-get -y update && \ 6 | apt-get install -y make zlib1g-dev libssl-dev libcurl4-openssl-dev && \ 7 | apt-get install -y sudo && \ 8 | apt-get -y update && \ 9 | R -e "install.packages(c( 'doParallel', 'data.table', 'glmnet', 'ranger', 'xgboost', 'Matrix', 'magrittr', 'utils', 'stats', 'graphics', 'grDevices', 'rlang', 'testthat', 'covr', 'remotes' ), repos = 'https://cloud.r-project.org/' )" && \ 10 | R -e "remotes::install_github('mlampros/FeatureSelection', upgrade = 'always', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \ 11 | apt-get autoremove -y && \ 12 | apt-get clean 13 | 14 | ENV USER rstudio 15 | 16 | 17 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(DataSplit) 4 | export(add_probs_dfs) 5 | export(barplot_feat_select) 6 | export(class_folds) 7 | export(feature_selection) 8 | export(func_correlation) 9 | export(func_shuffle) 10 | export(normalized) 11 | export(regr_folds) 12 | export(wrapper_feat_select) 13 | importFrom(Matrix,Matrix) 14 | importFrom(Matrix,colSums) 15 | importFrom(data.table,as.data.table) 16 | importFrom(doParallel,registerDoParallel) 17 | importFrom(glmnet,cv.glmnet) 18 | importFrom(grDevices,dev.cur) 19 | importFrom(grDevices,dev.off) 20 | importFrom(graphics,barplot) 21 | importFrom(graphics,par) 22 | importFrom(magrittr,"%>%") 23 | importFrom(ranger,ranger) 24 | importFrom(rlang,.data) 25 | importFrom(stats,as.formula) 26 | importFrom(stats,cor) 27 | importFrom(stats,median) 28 | importFrom(stats,na.omit) 29 | importFrom(utils,combn) 30 | importFrom(xgboost,xgb.DMatrix) 31 | importFrom(xgboost,xgb.importance) 32 | importFrom(xgboost,xgb.train) 33 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## FeatureSelection 3 | 4 | * **09-08-2024**: I replaced **dplyr** with **data.table** because the **dplyr** functions 'summarise_each_' and 'funs' gave deprecation warnings 5 | * **19-05-2021**: I replaced **doMC** with **doParallel** because **doMC** does not work on both **Unix** and **Windows** OS (applies only to **'glmnet-lasso'** method if number of threads > 1) 6 | * **03-02-2020**: 7 | + Updated the R files so that *Feature Selection* works with the newest versions of the imported R packages 8 | + Adjusted the tests 9 | + Added Dockerfile and docker image 10 | + Updated the README.md and .travis.yml files 11 | * **18-05-2016**: I added tests and code-coverage 12 | -------------------------------------------------------------------------------- /R/correlation_of_multiple_predictors.R: 
-------------------------------------------------------------------------------- 1 | #' secondary function ( used in the func_correlation ) 2 | #' 3 | #' this is a secondary function that is used in the func_correlation 4 | #' 5 | #' @keywords internal 6 | #' @importFrom stats na.omit 7 | 8 | # use this function if more than one predictor output 9 | second_func_cor = function(dat_frame) { 10 | 11 | lst = list() 12 | 13 | for (i in 1:dim(dat_frame)[2]) { 14 | 15 | coln = colnames(dat_frame) 16 | temp_df = data.frame(dat_frame[, i], row.names = rownames(dat_frame)) 17 | colnames(temp_df) = coln[i] 18 | temp_df[temp_df == 0.0] = NA 19 | lst[[i]] = stats::na.omit(temp_df) 20 | } 21 | 22 | return(lst) 23 | } 24 | 25 | 26 | #' function to remove duplicated pairs of predictors ( used in the func_correlation ) 27 | #' 28 | #' this is a secondary function that is used in the func_correlation 29 | #' 30 | #' @keywords internal 31 | 32 | 33 | remove_duplic_func = function(sublist) { 34 | 35 | vec_col = vec_row = vec_prob = rep(NA, dim(sublist)[1]) 36 | 37 | for (j in 1:dim(sublist)[1]) { 38 | 39 | vec_col[j] = colnames(sublist) 40 | vec_row[j] = rownames(sublist)[j] 41 | vec_prob[j] = sublist[j, 1] 42 | } 43 | 44 | data.frame(predictor1 = vec_row, predictor2 = vec_col, prob = vec_prob) 45 | } 46 | 47 | 48 | #' find correlated variables 49 | #' 50 | #' This function takes a data frame or a matrix and returns either a data frame or a list of data frames with correlated variables 51 | #' 52 | #' @param data either a data frame or a matrix 53 | #' @param target either a string (name of the predictor/response in the data set) or a vector of strings (predictor/response names of the data set) 54 | #' @param correlation_thresh a float indicating the correlation threshold 55 | #' @param use_obs one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs" 56 | #' @param correlation_method one of "pearson", "kendall", "spearman" 57 | #' @return either a data frame or a list of data frames 58 | #' @details 59 | #' This function takes a data frame or a matrix and returns the correlated variables of the data 60 | #' @export 61 | #' @importFrom stats cor na.omit 62 | 63 | 64 | func_correlation = function(data, target = NULL, correlation_thresh = NULL, use_obs = NULL, correlation_method = NULL) { 65 | 66 | if (!inherits(data, c("data.frame", "matrix"))) stop('the data should be either a data frame or a matrix') 67 | if (sum(unlist(lapply(1:dim(data)[2], function(x) is.factor(data[, x]) || is.character(data[, x]))) > 0)) stop(simpleError('data must be numeric')) 68 | if (is.null(correlation_method)) stop(simpleError('use one of "pearson", "kendall", "spearman" as correlation method')) 69 | if (is.null(use_obs)) stop(simpleError('use one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs" as use_obs')) 70 | if (is.null(correlation_thresh) || correlation_thresh <= 0.0 || correlation_thresh >= 1.0) stop(simpleError('the correlation_thresh should be greater than 0.0 and less than 1.0')) 71 | 72 | df = data.frame(stats::cor(data, use = use_obs, method = correlation_method)) 73 | 74 | if (length(target) > 1) { # in case that target is a vector with multiple predictors, it returns a matrix with those predictors above the correlation_thresh 75 | 76 | df[upper.tri(df)] = diag(df) = 0.0 77 | df[df < correlation_thresh] = 0.0 78 | df = df[-as.vector(which(rowSums(df) == 0.0)), -as.vector(which(colSums(df) == 0.0))] # negate to remove all sparse columns and rows 79 | 
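# from the remaining highly correlated predictors keep only the columns of 'df' whose names match the 'target' predictors (a single matching column is handled separately so that the row names are preserved)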
80 | if (length(which(!is.na(match(colnames(df), target)))) == 1) { # exception if one of the column-names or row-names do not appear in the end-df 81 | 82 | filt = data.frame(row.names = rownames(df), df[, which(!is.na(match(colnames(df), target)))]) 83 | colnames(filt) = colnames(df)[which(!is.na(match(colnames(df), target)))] 84 | filt[filt == 0.0] = NA 85 | filt = stats::na.omit(filt) 86 | } 87 | 88 | else { 89 | 90 | filt = df[, which(!is.na(match(colnames(df), target)))] 91 | } 92 | } 93 | 94 | else if (is.null(target)) { 95 | 96 | df[upper.tri(df)] = diag(df) = 0.0 # return all predictors in case that is.null(target) 97 | df[df < correlation_thresh] = 0.0 98 | filt = df[-as.vector(which(rowSums(df) == 0.0)), -as.vector(which(colSums(df) == 0.0))] # negate to remove all sparse columns and rows 99 | } 100 | 101 | else { 102 | 103 | df_names = data.frame(df[, target], row.names = rownames(df)) # in case where target is a single string, I get a single column data.frame out 104 | colnames(df_names) = target 105 | df_names[rownames(df_names) == target, ] = NA # set the row of the 'target' itself to NA (the self-correlation is then removed by na.omit) 106 | df_names = stats::na.omit(df_names) 107 | filt = subset(df_names, df_names[, target] >= correlation_thresh) 108 | filt = data.frame(features = rownames(filt), filt[, 1]) 109 | filt = filt[order(filt[, 2], decreasing = TRUE), ] 110 | filt = data.frame(filt[, 2], row.names = filt[, 1]) 111 | colnames(filt) = c(target) 112 | } 113 | 114 | if (dim(filt)[2] > 1) { 115 | 116 | out = second_func_cor(filt) 117 | out = out[unlist(lapply(out, function(x) dim(x)[1] != 0))] 118 | return(list(out_list = out, out_df = do.call(rbind, lapply(out, function(x) remove_duplic_func(x)))))} 119 | 120 | else { 121 | 122 | return(filt) 123 | } 124 | } 125 | 126 | 127 | -------------------------------------------------------------------------------- /R/create_folds_and_data_split.R: -------------------------------------------------------------------------------- 1 | #' stratified folds (in classification) 2 | #' 3 | #' this function creates stratified folds in binary and multiclass classification 4 | #' 5 | #' @param folds is an integer specifying the number of folds 6 | #' @param RESP is the response variable 7 | #' @param shuffle is a boolean specifying if the vector of indices should be shuffled or not 8 | #' @return a list of indices 9 | #' @export 10 | #' @importFrom utils combn 11 | #' @examples 12 | #' 13 | #' \dontrun{ 14 | #' 15 | #' data(iris) 16 | #' 17 | #' y = as.factor(iris[, 5]) 18 | #' 19 | #' folds = class_folds(10, y, shuffle = TRUE) 20 | #' } 21 | 22 | 23 | class_folds = function(folds, RESP, shuffle = FALSE) { 24 | 25 | if (!is.factor(RESP)) { 26 | 27 | stop(simpleError("RESP must be a factor")) 28 | } 29 | 30 | clas = lapply(unique(RESP), function(x) which(RESP == x)) 31 | 32 | len = lapply(clas, function(x) length(x)) 33 | 34 | samp_vec = rep(1/folds, folds) 35 | 36 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 37 | 38 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))) 39 | 40 | spl = suppressWarnings(split(1:length(RESP), repl)) 41 | 42 | sort_names = paste0('fold_', 1:folds) 43 | 44 | spl = spl[sort_names] 45 | 46 | if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) { 47 | 48 | warning('the folds are not equally split') # the warning appears when I divide the unique labels by the number of folds and instead of an integer I get a float 49 | } 50 | 51 | if (shuffle == TRUE) {
52 | 53 | spl = lapply(spl, function(x) func_shuffle(x)) # the indices of the unique levels will be shuffled 54 | } 55 | 56 | ind = t(utils::combn(1:folds, 2)) 57 | 58 | ind1 = apply(ind, 1, function(x) length(intersect(spl[[x[1]]], spl[[x[2]]]))) # use '[[' so that the integer index vectors themselves (and not single-element lists) are intersected 59 | 60 | if (sum(ind1) > 0) { 61 | 62 | stop(simpleError("there is an intersection between the resulted indexes of the folds")) 63 | } 64 | 65 | if (length(unlist(spl)) != length(RESP)) { 66 | 67 | stop(simpleError("the number of items in the folds are not equal with the response items")) 68 | } 69 | 70 | spl 71 | } 72 | 73 | 74 | 75 | 76 | #' create folds (in regression) 77 | #' 78 | #' this function creates both stratified and non-stratified folds in regression 79 | #' 80 | #' @param folds is an integer specifying the number of folds 81 | #' @param RESP is the response variable 82 | #' @param stratified is a boolean specifying if the folds should be stratified 83 | #' @return a list of indices 84 | #' @export 85 | #' @examples 86 | #' 87 | #' \dontrun{ 88 | #' 89 | #' data(iris) 90 | #' 91 | #' y = iris[, 1] 92 | #' 93 | #' folds = regr_folds(5, y, stratified = FALSE) 94 | #' 95 | #' } 96 | 97 | 98 | regr_folds = function(folds, RESP, stratified = FALSE) { 99 | 100 | if (is.factor(RESP)) { 101 | 102 | stop(simpleError("this function is meant for regression; for classification use the 'class_folds' function")) 103 | } 104 | 105 | samp_vec = rep(1/folds, folds) 106 | 107 | sort_names = paste0('fold_', 1:folds) 108 | 109 | 110 | if (stratified == TRUE) { 111 | 112 | stratif = cut(RESP, breaks = folds) 113 | 114 | clas = lapply(unique(stratif), function(x) which(stratif == x)) 115 | 116 | len = lapply(clas, function(x) length(x)) 117 | 118 | prop = lapply(len, function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 119 | 120 | repl = unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y])))) 121 | 122 | spl = suppressWarnings(split(1:length(RESP), repl))} 123 | 124 | else { 125 | 126 | prop = lapply(length(RESP), function(y) sapply(1:length(samp_vec), function(x) round(y * samp_vec[x]))) 127 | 128 | repl = func_shuffle(unlist(lapply(prop, function(x) sapply(1:length(x), function(y) rep(paste0('fold_', y), x[y]))))) 129 | 130 | spl = suppressWarnings(split(1:length(RESP), repl)) 131 | } 132 | 133 | spl = spl[sort_names] 134 | 135 | if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) { 136 | 137 | warning('the folds are not equally split') # the warning appears when I divide the unique labels by the number of folds and instead of an integer I get a float 138 | } 139 | 140 | if (length(unlist(spl)) != length(RESP)) { 141 | 142 | stop(simpleError("the length of the splits are not equal with the length of the response")) 143 | } 144 | 145 | spl 146 | } 147 | 148 | 149 | 150 | #' partition of data (train-test-split) 151 | #' 152 | #' @param y is a numeric vector (response variable) 153 | #' @param TrainRatio is the percentage of train-data after the partition 154 | #' @param regression is a boolean (TRUE, FALSE) indicating if it's a regression or classification task 155 | #' @param shuffle is a boolean (TRUE, FALSE) indicating if the data should be shuffled or not (by default 5 times) 156 | #' @param seed an integer specifying the random seed 157 | #' @return a list of indices (train-test) 158 | #' @export 159 | #' @examples 160 | #' 161 | #' \dontrun{ 162 | #' 163 | #' data(iris) 164 | #' 165 | #' y = iris[, 1] 166 | #' 167 | #' split = DataSplit(y, TrainRatio = 0.75, regression = FALSE,
shuffle = FALSE) 168 | #' 169 | #' } 170 | 171 | 172 | DataSplit = function(y, TrainRatio = 0.75, regression = TRUE, shuffle = FALSE, seed = 1) { 173 | 174 | if (TrainRatio >= 1.0 || TrainRatio <= 0.0) stop('TrainRatio should be a float number greater than 0.0 and less than 1.0') 175 | 176 | if (regression) { 177 | set.seed(seed) 178 | idx_train = sample(1:length(y), size = round(TrainRatio * length(y))) 179 | idx_test = setdiff(1:length(y), idx_train) 180 | } 181 | 182 | if (!regression) { 183 | clas = lapply(unique(y), function(x) which(y == x)) 184 | set.seed(seed) 185 | idx_train = unlist(lapply(clas, function(x) sample(x, size = round(TrainRatio * length(x))))) 186 | idx_test = setdiff(1:length(y), idx_train) 187 | } 188 | 189 | if (shuffle) { 190 | 191 | for (i in c(1:5)){ idx_train = sample(idx_train, length(idx_train)) } 192 | 193 | for (i in c(1:5)){ idx_test = sample(idx_test, length(idx_test)) } 194 | } 195 | 196 | list(idx_train = idx_train, idx_test = idx_test) 197 | } 198 | -------------------------------------------------------------------------------- /R/feature_selection.R: -------------------------------------------------------------------------------- 1 | 2 | utils::globalVariables(c("%>%", 3 | ".", 4 | ".N", 5 | ".SD", 6 | "coefficients", 7 | "predict")) # Keep 'predict' as a global variable. It appears both in 'stats' and 'glmnet' however I can not specify 'predict.cv.glmnet' because the function does not appear in the >= 3.0.0 version of the package (I receive an error otherwise) 8 | 9 | 10 | #' Feature selection 11 | #' 12 | #' This function uses three different methods (glmnet, xgboost, ranger) in order to select important features. 13 | #' 14 | #' @param X a sparse Matrix, a matrix or a data frame 15 | #' @param y a vector representing the response variable 16 | #' @param method one of 'glmnet-lasso', 'xgboost', 'ranger' 17 | #' @param params_glmnet a list of parameters for the glmnet model 18 | #' @param params_xgboost a list of parameters for the xgboost model 19 | #' @param params_ranger a list of parameters for the ranger model 20 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency") 21 | #' @param CV_folds a number specifying the number of folds for cross validation 22 | #' @param stratified_regr a boolean determining if the folds in regression should be stratified 23 | #' @param scale_coefs_glmnet if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible) 24 | #' @param cores_glmnet an integer determining the number of cores to register in glmnet 25 | #' @param verbose outputs info 26 | #' @return a data frame with the most important features 27 | #' @author Lampros Mouselimis 28 | #' 29 | #' @details 30 | #' 31 | #' This function returns the important features using one of the glmnet, xgboost or ranger algorithms. The glmnet algorithm can take either a sparse matrix, a matrix or a data frame 32 | #' and returns a data frame with non zero coefficients. The xgboost algorithm can take either a sparse matrix, a matrix or a data frame and returns the importance of the features in the form 33 | #' of a data frame, furthermore it is possible to sort the features using one of the "Gain", "Cover" or "Frequency" methods. The ranger algorithm can take either a matrix or a data frame 34 | #' and returns the important features using one of the 'impurity' or 'permutation' methods.
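#' In the regression case with CV_folds > 1, the 'glmnet-lasso' output additionally contains a 'Frequency' column (the number of folds in which a feature received a non-zero coefficient) and the resulting data frame is sorted by it.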
35 | #' 36 | #' @export 37 | #' @importFrom glmnet cv.glmnet 38 | #' @importFrom data.table as.data.table 39 | #' @importFrom doParallel registerDoParallel 40 | #' @importFrom xgboost xgb.DMatrix xgb.train xgb.importance 41 | #' @importFrom ranger ranger 42 | #' @importFrom stats as.formula 43 | #' @importFrom Matrix colSums Matrix 44 | #' @importFrom magrittr %>% 45 | #' @importFrom rlang .data 46 | #' 47 | #' @examples 48 | #' 49 | #' \dontrun{ 50 | #' 51 | #' #........... 52 | #' # regression 53 | #' #........... 54 | #' 55 | #' data(iris) 56 | #' 57 | #' X = iris[, -5] 58 | #' y = X[, 1] 59 | #' X = X[, -1] 60 | #' 61 | #' params_glmnet = list(alpha = 1, 62 | #' family = 'gaussian', 63 | #' nfolds = 3, 64 | #' parallel = TRUE) 65 | #' 66 | #' res = feature_selection(X, 67 | #' y, 68 | #' method = 'glmnet-lasso', 69 | #' params_glmnet = params_glmnet, 70 | #' CV_folds = 5, 71 | #' cores_glmnet = 5) 72 | #' 73 | #' #...................... 74 | #' # binary classification 75 | #' #...................... 76 | #' 77 | #' y = iris[, 5] 78 | #' y = as.character(y) 79 | #' y[y == 'setosa'] = 'virginica' 80 | #' X = iris[, -5] 81 | #' 82 | #' params_ranger = list(write.forest = TRUE, 83 | #' probability = TRUE, 84 | #' num.threads = 6, 85 | #' num.trees = 50, 86 | #' verbose = FALSE, 87 | #' classification = TRUE, 88 | #' mtry = 2, 89 | #' min.node.size = 5, 90 | #' importance = 'impurity') 91 | #' 92 | #' res = feature_selection(X, 93 | #' y, 94 | #' method = 'ranger', 95 | #' params_ranger = params_ranger, 96 | #' CV_folds = 5) 97 | #' 98 | #' #.......................... 99 | #' # multiclass classification 100 | #' #.......................... 101 | #' 102 | #' y = iris[, 5] 103 | #' multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 104 | #' X = iris[, -5] 105 | #' 106 | #' params_xgboost = list(params = list("objective" = "multi:softprob", 107 | #' "bst:eta" = 0.35, 108 | #' "subsample" = 0.65, 109 | #' "num_class" = 3, 110 | #' "max_depth" = 6, 111 | #' "colsample_bytree" = 0.65, 112 | #' "nthread" = 2), 113 | #' nrounds = 50, 114 | #' print.every.n = 50, 115 | #' verbose = 0, 116 | #' maximize = FALSE) 117 | #' 118 | #' res = feature_selection(X, 119 | #' multiclass_xgboost, 120 | #' method = 'xgboost', 121 | #' params_xgboost = params_xgboost, 122 | #' CV_folds = 5) 123 | #' } 124 | 125 | 126 | feature_selection = function(X, 127 | y, 128 | method = NULL, 129 | params_glmnet = NULL, 130 | params_xgboost = NULL, 131 | params_ranger = NULL, 132 | xgb_sort = NULL, 133 | CV_folds = 5, 134 | stratified_regr = FALSE, 135 | scale_coefs_glmnet = FALSE, 136 | cores_glmnet = NULL, 137 | verbose = FALSE) { 138 | 139 | if (is.null(method)) stop("use method = .. to select one of the available methods : xgboost, glmnet-lasso, ranger") 140 | if (CV_folds < 1) stop("CV_folds should be >= 1") 141 | 142 | if (method == 'glmnet-lasso' && CV_folds == 1) { 143 | 144 | if (verbose) { 145 | cat('=====================================================================', '\n') 146 | cat('glmnet feature selection, starts...', '\n') 147 | cat('=====================================================================', '\n') 148 | } 149 | 150 | if (params_glmnet$family == 'binomial' || params_glmnet$family == 'multinomial') { 151 | 152 | y = as.factor(y) 153 | } 154 | 155 | isna = as.vector(Matrix::colSums(is.na(X))) # replace the NA-values of each column with the median 156 | 157 | if (sum(isna) > 0) { 158 | 159 | if (verbose) { 160 | cat('\n') 161 | cat('Missing values present in glmnet-lasso. 
They will be replaced with the median.', '\n') 162 | cat('\n') 163 | } 164 | 165 | X = func_replace_NAs(X, which(isna > 0)) 166 | } 167 | 168 | Feature = colnames(X) 169 | 170 | if (is.data.frame(X)) { 171 | 172 | X <- as.matrix(X) 173 | } 174 | 175 | else if (is.matrix(X) || (inherits(X, 'dgCMatrix'))) { 176 | 177 | X = X 178 | } 179 | 180 | else { 181 | 182 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix")) 183 | } 184 | 185 | # scale the explanatory variables as explained here : http://stats.stackexchange.com/questions/14853/variable-importance-from-glmnet 186 | # [ exclude from scaling those predictors that have less than 2 unique values, OTHERWISE error ] 187 | if (scale_coefs_glmnet) { idx_excl = which(as.vector(apply(X, 2, function(x) length(unique(x)))) < 2); if (length(idx_excl) > 0) { X[, -idx_excl] = scale(X[, -idx_excl]) } else { X = scale(X) } } # guard against an empty 'idx_excl' -- indexing with a zero-length '-which(...)' would silently select (and scale) no columns at all 188 | 189 | params_glmnet[['x']] = X 190 | params_glmnet[['y']] = y 191 | if (scale_coefs_glmnet) params_glmnet[['standardize']] = FALSE # after using scale() ensure that the variables won't be standardized prior to fitting the model 192 | 193 | cv = do.call(glmnet::cv.glmnet, params_glmnet) 194 | 195 | pr = predict(cv, type = 'coefficients', s = cv$lambda.min) 196 | 197 | if (is.factor(y)) { # in case of classification glmnet returns importance in form of a sparse matrix 198 | 199 | if (length(unique(y)) == 2) { # in case of binary-classification it returns a single column 200 | 201 | df1 = as.matrix(pr)[-1, ] 202 | df1 = data.frame(features = names(df1), importance = as.vector(df1)) 203 | 204 | if (scale_coefs_glmnet) { 205 | df1[, 2] = abs(df1[, 2]) 206 | df1 = df1[order(df1[, 2], decreasing = TRUE), ] 207 | } 208 | } 209 | 210 | if (length(unique(y)) > 2) { # in case of multiclass classification it returns a sparse matrix for each class separately 211 | 212 | df1 = do.call(rbind, lapply(pr, function(x) as.matrix(x)[-1, ])) 213 | df1 = colMeans(df1) 214 | 215 | if (any(df1 == 0.0)) { 216 | 217 | df1 = df1[-which(df1 == 0L)] # remove zero-coefficient predictors 218 | } 219 | 220 | df1 = data.frame(features = names(df1), importance = as.vector(df1)) 221 | if (scale_coefs_glmnet) { 222 | df1[, 2] = abs(df1[, 2]) # after scaling, I take the absolute value in order to plot the important features [ this because many of them have high negative values -- meaning high impact on the response ] 223 | df1 = df1[order(df1[, 2], decreasing = TRUE), ] 224 | } 225 | } 226 | } 227 | 228 | else { 229 | 230 | df = data.frame(Feature, coefficients = pr[2:length(pr)], stringsAsFactors = FALSE) 231 | 232 | df1 = subset(df, df[,2] != 0) 233 | if (scale_coefs_glmnet) { 234 | df1[, 2] = abs(df1[, 2]) 235 | df1 = df1[order(df1[, 2], decreasing = TRUE), ] 236 | } 237 | } 238 | 239 | return(df1) 240 | } 241 | 242 | else if (method == 'glmnet-lasso' && CV_folds > 1) { 243 | 244 | if (params_glmnet$parallel && !is.null(cores_glmnet)) { 245 | 246 | if (.Platform$OS.type == "unix") { 247 | doParallel::registerDoParallel(cores = cores_glmnet) 248 | } 249 | 250 | if (.Platform$OS.type == "windows") { 251 | cl = parallel::makePSOCKcluster(cores_glmnet) 252 | doParallel::registerDoParallel(cl = cl) # compared to unix, ".. if not specified, on Windows a three worker cluster is created and used .."
[ see also: https://stackoverflow.com/a/45122448/8302386 ] 253 | } 254 | } 255 | 256 | if (verbose) { 257 | cat('=====================================================================', '\n') 258 | cat('glmnet feature selection, starts...', '\n') 259 | cat('=====================================================================', '\n') 260 | } 261 | 262 | if (params_glmnet$family == 'binomial' || params_glmnet$family == 'multinomial') { 263 | 264 | y = as.factor(y) 265 | } 266 | 267 | if (is.factor(y)) { 268 | 269 | folds = class_folds(CV_folds, y, shuffle = TRUE) 270 | } 271 | 272 | else { 273 | 274 | folds = regr_folds(CV_folds, y, stratified = stratified_regr) 275 | } 276 | 277 | get_all_feat = list() 278 | 279 | for (i in 1:CV_folds) { 280 | 281 | if (verbose) { 282 | cat('--------------------------------------------------------------------', '\n') 283 | cat('Fold ', i, '\n') 284 | cat('--------------------------------------------------------------------', '\n') 285 | } 286 | 287 | X_folds = X[unlist(folds[-i]), ] 288 | y_folds = y[unlist(folds[-i])] 289 | 290 | isna = as.vector(Matrix::colSums(is.na(X_folds))) # replace the NA-values of each column with the median 291 | 292 | if (sum(isna) > 0) { 293 | 294 | if (verbose) { 295 | cat('\n') 296 | cat('Missing values present in glmnet-lasso. They will be replaced with the median.', '\n') 297 | cat('\n') 298 | } 299 | 300 | X_folds = func_replace_NAs(X_folds, which(isna > 0)) 301 | } 302 | 303 | Feature = colnames(X_folds) 304 | 305 | if (is.data.frame(X_folds)) { 306 | 307 | X_folds <- as.matrix(X_folds) 308 | } 309 | 310 | else if (is.matrix(X_folds) || (inherits(X_folds, 'dgCMatrix'))) { # check 'X_folds' (and not the full 'X') for the sparse-matrix class 311 | 312 | X_folds = X_folds 313 | } 314 | 315 | else { 316 | 317 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix")) 318 | } 319 | 320 | # scale the explanatory variables as explained here : http://stats.stackexchange.com/questions/14853/variable-importance-from-glmnet 321 | # [ exclude from scaling those predictors that have less than 2 unique values, OTHERWISE error ] 322 | if (scale_coefs_glmnet) { idx_excl = which(as.vector(apply(X_folds, 2, function(x) length(unique(x)))) < 2); if (length(idx_excl) > 0) { X_folds[, -idx_excl] = scale(X_folds[, -idx_excl]) } else { X_folds = scale(X_folds) } } # guard against an empty 'idx_excl' -- indexing with a zero-length '-which(...)' would silently select (and scale) no columns at all 323 | 324 | params_glmnet[['x']] = X_folds 325 | params_glmnet[['y']] = y_folds 326 | if (scale_coefs_glmnet) params_glmnet[['standardize']] = FALSE # after using scale() ensure that the variables won't be standardized prior to fitting the model 327 | 328 | cv = do.call(glmnet::cv.glmnet, params_glmnet) 329 | 330 | pr = predict(cv, type = 'coefficients', s = cv$lambda.min) 331 | 332 | if (is.factor(y)) { # in case of classification glmnet returns importance in form of a sparse matrix 333 | 334 | if (length(unique(y)) == 2) { # in case of binary-classification it returns a single column 335 | 336 | get_all_feat[[i]] = as.matrix(pr)[-1, ] 337 | } 338 | 339 | if (length(unique(y)) > 2) { # in case of multiclass classification it returns a sparse matrix for each class separately 340 | 341 | get_all_feat[[i]] = do.call(rbind, lapply(pr, function(x) as.matrix(x)[-1, ])) 342 | 343 | gc() 344 | } 345 | } 346 | 347 | else { 348 | 349 | df = data.frame(Feature, coefficients = pr[2:length(pr)], stringsAsFactors = FALSE) 350 | 351 | df1 = subset(df, df[,2] != 0) 352 | 353 | get_all_feat[[i]] = df1 354 | 355 | gc() 356 | } 357 | } 358 | 359 | if (is.factor(y)) { 360 | 361 | if (length(unique(y)) == 2) { 362 | 363 | tbl_x = colMeans(data.frame(do.call(rbind, get_all_feat)))
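# each element of 'get_all_feat' holds the coefficient vector of one fold, so 'tbl_x' now contains the coefficient of every feature averaged across the folds; next it is converted to a features/importance data frame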
tbl_x = data.frame(features = names(tbl_x), importance = as.vector(tbl_x)) 365 | if (scale_coefs_glmnet) { 366 | tbl_x[, 2] = abs(tbl_x[, 2]) 367 | tbl_x = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ] 368 | } 369 | } 370 | 371 | if (length(unique(y)) > 2) { 372 | 373 | df1 = data.frame(add_probs_dfs(get_all_feat), row.names = rownames(get_all_feat[[1]])) 374 | df1 = colMeans(df1) 375 | 376 | if (any(df1 == 0.0)) { 377 | 378 | df1 = df1[-which(df1 == 0L)] # remove zero-coefficient predictors 379 | } 380 | 381 | tbl_x = data.frame(features = names(df1), importance = as.vector(df1)) 382 | if (scale_coefs_glmnet) { 383 | tbl_x[, 2] = abs(tbl_x[, 2]) # after scaling, I take the absolute value in order to plot the important features [ this because many of them have high negative values -- meaning high impact on the response ] 384 | tbl_x = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ] 385 | } 386 | } 387 | } 388 | 389 | else { 390 | 391 | all_feat = data.frame(do.call('rbind', get_all_feat)) |> 392 | data.table::as.data.table() 393 | 394 | tbl_x = all_feat[, .(coefficients = mean(coefficients, na.rm = TRUE), 395 | Frequency = .N), 396 | by = 'Feature'] |> 397 | as.data.frame() 398 | 399 | if (scale_coefs_glmnet) tbl_x[, 2] = abs(tbl_x[, 2]) 400 | tbl_x = tbl_x[order(tbl_x$Frequency, tbl_x$coefficients, decreasing = TRUE),] # the data.frame in 'glmnet-lasso' is sorted by Frequency (default) 401 | } 402 | 403 | if (params_glmnet$parallel && !is.null(cores_glmnet)) { 404 | 405 | if (.Platform$OS.type == "windows") { 406 | parallel::stopCluster(cl = cl) 407 | } 408 | } 409 | 410 | return(tbl_x) 411 | } 412 | 413 | else if (method == 'xgboost' && CV_folds == 1) { 414 | 415 | if (verbose) { 416 | cat('=====================================================================', '\n') 417 | cat('xgboost feature selection, starts...', '\n') 418 | cat('=====================================================================', '\n') 419 | } 420 | 421 | if (is.data.frame(X)) { 422 | 423 | dtrain <- xgboost::xgb.DMatrix(data = as.matrix(X), label = y, missing = NA) 424 | } 425 | 426 | else if (is.matrix(X) || (inherits(X, 'dgCMatrix'))) { 427 | 428 | dtrain <- xgboost::xgb.DMatrix(data = X, label = y, missing = NA) 429 | } 430 | 431 | else { 432 | 433 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix")) 434 | } 435 | 436 | params_xgboost[['watchlist']] = list(train = dtrain) 437 | params_xgboost[['data']] = dtrain 438 | 439 | bst = suppressWarnings(do.call(xgboost::xgb.train, params_xgboost)) 440 | 441 | tbl1 <- data.frame(xgboost::xgb.importance(colnames(X), model = bst)) 442 | 443 | if (is.null(xgb_sort) || (xgb_sort == 'Frequency')) { 444 | 445 | tbl1 = tbl1[order(tbl1$Frequency, decreasing = TRUE),] 446 | } 447 | 448 | else if (xgb_sort == 'Gain') { 449 | 450 | tbl1 = tbl1[order(tbl1$Gain, decreasing = TRUE),] 451 | } 452 | 453 | else if (xgb_sort == 'Cover') { 454 | 455 | tbl1 = tbl1[order(tbl1$Cover, decreasing = TRUE),] 456 | } 457 | 458 | return(tbl1) 459 | } 460 | 461 | else if (method == 'xgboost' && CV_folds > 1) { 462 | 463 | if (verbose) { 464 | cat('=====================================================================', '\n') 465 | cat('xgboost feature selection, starts...', '\n') 466 | cat('=====================================================================', '\n') 467 | } 468 | 469 | if (length(unique(y)) == 2 || ("num_class" %in% names(params_xgboost$params))) { 470 | 471 | folds = class_folds(CV_folds, as.factor(y), shuffle = TRUE) 472 | } 473 | 474 | else { 475 | 476 | 
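# neither a binary response nor a 'num_class' entry in the xgboost parameters was detected, so the folds are created as in regression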
folds = regr_folds(CV_folds, y, stratified = stratified_regr) 477 | } 478 | 479 | get_all_feat = list() 480 | 481 | for (i in 1:CV_folds) { 482 | 483 | if (verbose) { 484 | cat('--------------------------------------------------------------------', '\n') 485 | cat('Fold ', i, '\n') 486 | cat('--------------------------------------------------------------------', '\n') 487 | } 488 | 489 | X_folds = X[unlist(folds[-i]), ] 490 | 491 | y_folds = y[unlist(folds[-i])] 492 | 493 | if (is.data.frame(X_folds)) { 494 | 495 | dtrain <- xgboost::xgb.DMatrix(data = as.matrix(X_folds), label = y_folds, missing = NA) 496 | } 497 | 498 | else if (is.matrix(X_folds) || (inherits(X_folds, 'dgCMatrix'))) { 499 | 500 | dtrain <- xgboost::xgb.DMatrix(data = X_folds, label = y_folds, missing = NA) 501 | } 502 | 503 | else { 504 | 505 | stop(simpleError("X must be either a data.frame or a (sparse-) matrix")) 506 | } 507 | 508 | params_xgboost[['watchlist']] = list(train = dtrain) 509 | params_xgboost[['data']] = dtrain 510 | 511 | bst = suppressWarnings(do.call(xgboost::xgb.train, params_xgboost)) 512 | 513 | get_all_feat[[i]] <- data.frame(xgboost::xgb.importance(colnames(X_folds), model = bst)) 514 | 515 | gc() 516 | } 517 | 518 | tbl_x = data.frame(do.call('rbind', get_all_feat)) |> 519 | data.table::as.data.table() 520 | 521 | tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |> 522 | as.data.frame() 523 | 524 | if (is.null(xgb_sort) || (xgb_sort == 'Frequency')) { 525 | 526 | tbl1 = tbl1[order(tbl1$Frequency, decreasing = TRUE),] 527 | } 528 | 529 | else if (xgb_sort == 'Gain') { 530 | 531 | tbl1 = tbl1[order(tbl1$Gain, decreasing = TRUE),] 532 | } 533 | 534 | else if (xgb_sort == 'Cover') { 535 | 536 | tbl1 = tbl1[order(tbl1$Cover, decreasing = TRUE),] 537 | } 538 | 539 | return(tbl1) 540 | } 541 | 542 | else if (method == 'ranger' && CV_folds == 1) { 543 | 544 | if (!(is.matrix(X) || is.data.frame(X))) { 545 | 546 | stop(simpleError("X must be either a data.frame or a matrix")) 547 | } 548 | 549 | if (params_ranger$classification) { 550 | 551 | y = as.factor(y) 552 | } 553 | 554 | if (verbose) { 555 | cat('=====================================================================', '\n') 556 | cat('ranger feature selection, starts...', '\n') 557 | cat('=====================================================================', '\n') 558 | } 559 | 560 | if (!"dependent.variable.name" %in% names(params_ranger)) { 561 | 562 | form = stats::as.formula(paste0(paste0('y ~ '), paste(colnames(X), collapse = '+'))) 563 | 564 | params_ranger[['formula']] = form 565 | 566 | #dat = data.frame(y = y, X)} 567 | } 568 | 569 | dat = cbind(y = y, X) # include y in the data so that it works with or without the 'dependent.variable.name' 570 | 571 | # else { 572 | # 573 | # dat = X 574 | # } 575 | 576 | params_ranger[['data']] = dat 577 | 578 | fit = do.call(ranger::ranger, params_ranger) 579 | 580 | tbl_x = data.frame(names(fit$variable.importance), as.vector(fit$variable.importance)) 581 | colnames(tbl_x) = c('Feature', params_ranger$importance) 582 | 583 | tbl1 = tbl_x[order(tbl_x[, 2], decreasing = TRUE), ] 584 | 585 | return(tbl1) 586 | } 587 | 588 | else if (method == 'ranger' && CV_folds > 1) { 589 | 590 | if (verbose) { 591 | cat('=====================================================================', '\n') 592 | cat('ranger feature selection, starts...', '\n') 593 | cat('=====================================================================', '\n') 594 | } 595 | 596 | if (params_ranger$classification) { 597 | 
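# ranger expects a factor response in classification, so coerce y before fitting (and before the stratified folds are created in the CV case)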
598 | y = as.factor(y) 599 | } 600 | 601 | if (is.factor(y)) { 602 | 603 | folds = class_folds(CV_folds, y, shuffle = TRUE) 604 | } 605 | 606 | else { 607 | 608 | folds = regr_folds(CV_folds, y, stratified = stratified_regr) 609 | } 610 | 611 | get_all_feat = list() 612 | 613 | for (i in 1:CV_folds) { 614 | 615 | if (!(is.matrix(X) || is.data.frame(X))) { 616 | 617 | stop(simpleError("X must be either a data.frame or a matrix")) 618 | } 619 | 620 | if (verbose) { 621 | cat('--------------------------------------------------------------------', '\n') 622 | cat('Fold ', i, '\n') 623 | cat('--------------------------------------------------------------------', '\n') 624 | } 625 | 626 | X_folds = X[unlist(folds[-i]), ] 627 | 628 | y_folds = y[unlist(folds[-i])] 629 | 630 | if (!"dependent.variable.name" %in% names(params_ranger)) { 631 | 632 | form = stats::as.formula(paste0(paste0('y ~ '), paste(colnames(X_folds), collapse = '+'))) 633 | 634 | params_ranger[['formula']] = form 635 | 636 | #dat = data.frame(y = y_folds, X_folds)} 637 | } 638 | 639 | dat = cbind(y = y_folds, X_folds) # include y in the data so that it works with or without the 'dependent.variable.name' 640 | 641 | # else { 642 | # 643 | # dat = X_folds 644 | # } 645 | 646 | params_ranger[['data']] = dat 647 | 648 | fit = do.call(ranger::ranger, params_ranger) 649 | 650 | tbl_x = data.frame(names(fit$variable.importance), as.vector(fit$variable.importance)) 651 | colnames(tbl_x) = c('Feature', params_ranger$importance) 652 | 653 | get_all_feat[[i]] <- tbl_x 654 | 655 | gc() 656 | } 657 | 658 | tbl_x = data.frame(do.call('rbind', get_all_feat)) |> 659 | data.table::as.data.table() 660 | 661 | tbl1 = tbl_x[, lapply(.SD, mean, na.rm = TRUE), by = 'Feature'] |> 662 | as.data.frame() 663 | 664 | tbl1 = tbl1[order(tbl1[, 2], decreasing = TRUE), ] 665 | 666 | return(tbl1) 667 | } 668 | } 669 | 670 | 671 | -------------------------------------------------------------------------------- /R/plot_feature_selection.R: -------------------------------------------------------------------------------- 1 | 2 | #' plot important features 3 | #' 4 | #' This function takes the result of the feature_selection function or the wrapper_feat_select function and returns a barplot with the important features 5 | #' 6 | #' @param obj either a data frame or a list from the functions : feature_selection, wrapper_feat_select 7 | #' @param params_barplot a list of parameters needed for plotting the important features 8 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency") 9 | #' @return a barplot with the important features of each method 10 | #' @details 11 | #' This function takes a data frame (from the feature_selection function) or a list (from the wrapper_feat_select function) and returns a barplot of the important features. 12 | #' If union was set to TRUE in the params_features list of the wrapper_feat_select function, it also plots the average importance of all methods 13 | #' @export 14 | #' @importFrom graphics barplot par 15 | #' @importFrom grDevices dev.cur dev.off 16 | #' @examples 17 | #' 18 | #' \dontrun{ 19 | #' 20 | #' data(iris) 21 | #' 22 | #' X = iris[, -5] 23 | #' y = X[, 1] 24 | #' X = X[, -1] 25 | #' 26 | #' #............................. 27 | #' # plot of the wrapper function 28 | #' #.............................
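#' #' # the parameter lists used below are the regression settings documented in the 'wrapper_feat_select' examples #' #' params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) #' #' params_xgboost = list(params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE) #' #' params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 3, min.node.size = 10, num.threads = 2, importance = 'permutation') #' #' params_features = list(keep_number_feat = NULL, union = TRUE)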
29 | #' 30 | #' feat = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 31 | #' CV_folds = 10, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 32 | #' 33 | #' params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 34 | #' 35 | #' barplot_feat_select(feat, params_barplot, xgb_sort = NULL) 36 | #' 37 | #' 38 | #' #....................................... 39 | #' # plot of the feature_selection function 40 | #' #....................................... 41 | #' 42 | #' res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = 5) 43 | #' 44 | #' 45 | #' params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 46 | #' 47 | #' barplot_feat_select(res, params_barplot, xgb_sort = NULL) 48 | #' 49 | #' } 50 | 51 | 52 | barplot_feat_select = function(obj, params_barplot, xgb_sort = NULL) { 53 | 54 | 55 | if (is.null(params_barplot$keep_features)) { 56 | 57 | stop(simpleError("specify a maximum number of features to be plotted using the parameter keep_features in the params_barplot list")) 58 | } 59 | 60 | if (is.data.frame(obj)) { 61 | 62 | if (grDevices::dev.cur() != 1) { 63 | 64 | grDevices::dev.off() # reset par() 65 | } 66 | 67 | graphics::par(las = 2) # make label text perpendicular to axis 68 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin. 69 | 70 | if (dim(obj)[2] == 3) { # glmnet-lasso , CV_folds > 1 71 | 72 | graphics::barplot(obj[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1], 73 | 74 | cex.names = params_barplot$cex.names)} 75 | 76 | else if (dim(obj)[2] == 4) { # xgboost 77 | 78 | if (is.null(xgb_sort) || xgb_sort == 'Frequency') { 79 | 80 | obj = obj[order(obj[, 4], decreasing = TRUE), ] 81 | obj = obj[, c(1, 4)]} 82 | 83 | else if (xgb_sort == 'Cover') { 84 | 85 | obj = obj[order(obj[, 3], decreasing = TRUE), ] 86 | obj = obj[, c(1, 3)]} 87 | 88 | else if (xgb_sort == 'Gain') { 89 | 90 | obj = obj[order(obj[, 2], decreasing = TRUE), ] 91 | obj = obj[, c(1, 2)]} 92 | 93 | else { 94 | 95 | stop(simpleError("not a valid method for xgb_sort")) 96 | } 97 | 98 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1], 99 | 100 | cex.names = params_barplot$cex.names)} 101 | 102 | else if (dim(obj)[2] == 2 && (colnames(obj)[2] %in% c('impurity', 'permutation'))) { 103 | 104 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1], 105 | 106 | cex.names = params_barplot$cex.names)} 107 | 108 | else if (dim(obj)[2] == 2) { # glmnet-lasso, CV_folds = 1 109 | 110 | graphics::barplot(obj[params_barplot$keep_features:1, 2], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj[params_barplot$keep_features:1, 1], 111 | 112 | cex.names = params_barplot$cex.names) 113 | } 114 | } 115 | 116 | else { 117 | 118 | if (length(names(obj)) < 3 && ("all_feat" %in% names(obj))) { # union = TRUE 119 | 120 | if (grDevices::dev.cur() != 1) { 121 | 122 | grDevices::dev.off() # reset par() 123 | } 124 | 125 | graphics::par(las = 2) # make label text perpendicular to axis 126 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin. 
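# one panel per method plus an extra panel for the union (averaged) importance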
127 | 128 | graphics::par(mfrow = c(1, length(names(obj$all_feat)) + 1)) 129 | 130 | if (is.null(xgb_sort)) xgb_sort = 'Frequency' 131 | 132 | for (i in names(obj$all_feat)) { 133 | 134 | if (i == 'glmnet-lasso') { 135 | 136 | obj_g = obj$all_feat[['glmnet-lasso']] 137 | 138 | graphics::barplot(obj_g[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj_g[params_barplot$keep_features:1, 1], 139 | 140 | cex.names = params_barplot$cex.names)} 141 | 142 | else if (i == 'xgboost') { 143 | 144 | obj_x = obj$all_feat[['xgboost']] 145 | obj_x = obj_x[order(obj_x[, xgb_sort], decreasing = TRUE), ] 146 | tmp_xgb = obj_x[params_barplot$keep_features:1, xgb_sort] 147 | 148 | graphics::barplot(tmp_xgb, main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj_x[params_barplot$keep_features:1, 1], 149 | 150 | cex.names = params_barplot$cex.names)} 151 | 152 | else if (i == 'ranger') { 153 | 154 | obj_r = obj$all_feat[['ranger']] 155 | 156 | graphics::barplot(obj_r[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj_r[params_barplot$keep_features:1, 1], 157 | 158 | cex.names = params_barplot$cex.names) 159 | } 160 | } 161 | 162 | graphics::barplot(obj$union_feat[params_barplot$keep_features:1, 2], main = "union-importance", 163 | 164 | horiz = params_barplot$horiz, names.arg = obj$union_feat[params_barplot$keep_features:1, 1], cex.names = params_barplot$cex.names) 165 | } 166 | 167 | else if (length(names(obj)) < 3 && (sum(c("glmnet-lasso", "xgboost", "ranger") %in% names(obj)) > 0)){ # union = FALSE 168 | 169 | if (grDevices::dev.cur() != 1) { 170 | 171 | grDevices::dev.off() # reset par() 172 | } 173 | 174 | graphics::par(las = 2) # make label text perpendicular to axis 175 | graphics::par(mar = c(5, 8, 4, 2)) # increase y-axis margin. 
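# one panel per method (union = FALSE, so there is no extra union panel)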
176 | 177 | graphics::par(mfrow = c(1, length(names(obj)))) 178 | 179 | if (is.null(xgb_sort)) xgb_sort = 'Frequency' 180 | 181 | for (i in names(obj)) { 182 | 183 | if (i == 'glmnet-lasso') { 184 | 185 | obj_g = obj[['glmnet-lasso']] 186 | 187 | graphics::barplot(obj_g[params_barplot$keep_features:1, 3], main = "glmnet-importance", horiz = params_barplot$horiz, names.arg = obj_g[params_barplot$keep_features:1, 1], 188 | 189 | cex.names = params_barplot$cex.names)} 190 | 191 | else if (i == 'xgboost') { 192 | 193 | obj_x = obj[['xgboost']] 194 | obj_x = obj_x[order(obj_x[, xgb_sort], decreasing = TRUE), ] 195 | tmp_xgb = obj_x[params_barplot$keep_features:1, xgb_sort] 196 | 197 | graphics::barplot(tmp_xgb, main = "xgboost-importance", horiz = params_barplot$horiz, names.arg = obj_x[params_barplot$keep_features:1, 1], 198 | 199 | cex.names = params_barplot$cex.names)} 200 | 201 | else if (i == 'ranger') { 202 | 203 | obj_r = obj[['ranger']] 204 | 205 | graphics::barplot(obj_r[params_barplot$keep_features:1, 2], main = "ranger-importance", horiz = params_barplot$horiz, names.arg = obj_r[params_barplot$keep_features:1, 1], 206 | 207 | cex.names = params_barplot$cex.names) 208 | } 209 | 210 | else { 211 | 212 | stop('invalid method') 213 | } 214 | } 215 | } 216 | } 217 | } 218 | 219 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' secondary function to replace NAs 4 | #' 5 | #' @keywords internal 6 | #' 7 | #' @importFrom stats median 8 | 9 | func_replace_NAs = function(data, which_isna) { 10 | 11 | for (i in which_isna) { 12 | 13 | tmp_median = stats::median(data[, i], na.rm = T) 14 | 15 | data[which(is.na(data[, i])), i] = tmp_median 16 | } 17 | 18 | return(data) 19 | } 20 | 21 | 22 | 23 | #' shuffle data 24 | #' 25 | #' this function shuffles the items of a vector 26 | #' 27 | #' @param vec is a vector of indices 28 | #' @param times is a number 29 | #' @return shuffled indices of a vector 30 | #' @export 31 | 32 | 33 | func_shuffle = function(vec, times = 10) { 34 | 35 | for (i in 1:times) { 36 | 37 | out = sample(vec, length(vec)) 38 | } 39 | out 40 | } 41 | 42 | 43 | #' addition of probability-data-frames 44 | #' 45 | #' this function takes a number of probability data frames and returns their average (mainly used in multi-class classification). 
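#' For example, with three folds the result is (df1 + df2 + df3) / 3, computed element-wise.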
46 | #' 47 | #' @param PREDS_LST is a list of data frames 48 | #' @return average of a number of data frames 49 | #' @export 50 | 51 | 52 | add_probs_dfs = function(PREDS_LST) { 53 | 54 | if (!inherits(PREDS_LST, "list")) stop("PREDS_LST must be a list") 55 | 56 | r = all(unlist(lapply(PREDS_LST, nrow)) == unlist(lapply(PREDS_LST, nrow))[1]) 57 | c = all(unlist(lapply(PREDS_LST, ncol)) == unlist(lapply(PREDS_LST, ncol))[1]) 58 | 59 | if (!all(c(r,c))) stop("the dimensions of the included data.frames or matrices differ") 60 | 61 | init_df = data.frame(matrix(rep(0, dim(PREDS_LST[[1]])[1]*dim(PREDS_LST[[1]])[2]), nrow = dim(PREDS_LST[[1]])[1], ncol = dim(PREDS_LST[[1]])[2])) 62 | 63 | for (i in 1:length(PREDS_LST)) { 64 | 65 | init_df = init_df + PREDS_LST[[i]] 66 | } 67 | 68 | init_df = init_df/length(PREDS_LST) 69 | colnames(init_df) = colnames(PREDS_LST[[1]]) 70 | 71 | return(as.matrix(init_df)) 72 | } 73 | 74 | 75 | #' normalize data 76 | #' 77 | #' this function normalizes the feature importance of the algorithms, so that the values across algorithms are on the same scale 78 | #' 79 | #' @param x is a numeric vector 80 | #' @return normalized data in the form of a vector 81 | #' @export 82 | 83 | 84 | normalized = function(x) { 85 | 86 | out = (x - min(x))/(max(x) - min(x)) 87 | 88 | out 89 | } 90 | 91 | 92 | -------------------------------------------------------------------------------- /R/wrapper_feature_selection.R: -------------------------------------------------------------------------------- 1 | 2 | #' Wraps all three methods 3 | #' 4 | #' This function is a wrapper for the feature_selection function 5 | #' 6 | #' @param X a sparse Matrix, a matrix or a data frame 7 | #' @param y a vector representing the response variable 8 | #' @param params_glmnet a list of parameters for the glmnet model 9 | #' @param params_xgboost a list of parameters for the xgboost model 10 | #' @param params_ranger a list of parameters for the ranger model 11 | #' @param xgb_sort sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency") 12 | #' @param CV_folds a number specifying the number of folds for cross validation 13 | #' @param stratified_regr a boolean determining if the folds in regression should be stratified 14 | #' @param scale_coefs_glmnet if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible) 15 | #' @param cores_glmnet an integer determining the number of cores to register in glmnet 16 | #' @param params_features is a list of parameters for the wrapper function 17 | #' @param verbose outputs info 18 | #' @return a list containing the important features of each method. If union in the params_features list is enabled, then it also returns the average importance of all methods. 19 | #' 20 | #' @details 21 | #' This function returns the importance of the methods specified and if union in the params_features list is TRUE then it also returns the average importance of all methods. 22 | #' Furthermore the user can limit the number of features using the keep_number_feat parameter of the params_features list. 23 | #' 24 | #' @export 25 | #' @importFrom data.table as.data.table 26 | #' @importFrom magrittr %>% 27 | #' @importFrom rlang .data 28 | #' 29 | #' @examples 30 | #' 31 | #' \dontrun{ 32 | #' 33 | #' #........... 34 | #' # regression 35 | #' #...........
36 | #' 37 | #' data(iris) 38 | #' 39 | #' X = iris[, -5] 40 | #' y = X[, 1] 41 | #' X = X[, -1] 42 | #' 43 | #' params_glmnet = list(alpha = 1, 44 | #' family = 'gaussian', 45 | #' nfolds = 3, 46 | #' parallel = TRUE) 47 | #' 48 | #' 49 | #' params_xgboost = list( params = list("objective" = "reg:linear", 50 | #' "bst:eta" = 0.01, 51 | #' "subsample" = 0.65, 52 | #' "max_depth" = 5, 53 | #' "colsample_bytree" = 0.65, 54 | #' "nthread" = 2), 55 | #' nrounds = 100, 56 | #' print.every.n = 50, 57 | #' verbose = 0, 58 | #' maximize = FALSE) 59 | #' 60 | #' 61 | #' params_ranger = list(probability = FALSE, 62 | #' num.trees = 100, 63 | #' verbose = TRUE, 64 | #' classification = FALSE, 65 | #' mtry = 3, 66 | #' min.node.size = 10, 67 | #' num.threads = 2, 68 | #' importance = 'permutation') 69 | #' 70 | #' 71 | #' params_features = list(keep_number_feat = NULL, 72 | #' union = TRUE) 73 | #' 74 | #' feat = wrapper_feat_select(X, 75 | #' y, 76 | #' params_glmnet = params_glmnet, 77 | #' params_xgboost = params_xgboost, 78 | #' params_ranger = params_ranger, 79 | #' xgb_sort = NULL, 80 | #' CV_folds = 10, 81 | #' stratified_regr = FALSE, 82 | #' cores_glmnet = 2, 83 | #' params_features = params_features) 84 | #' } 85 | 86 | 87 | wrapper_feat_select = function(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = NULL, xgb_sort = NULL, CV_folds = 5, 88 | 89 | stratified_regr = FALSE, scale_coefs_glmnet = FALSE, cores_glmnet = NULL, params_features = NULL, verbose = FALSE) { 90 | 91 | method = c('glmnet-lasso', 'xgboost', 'ranger') 92 | met = list(params_glmnet, params_xgboost, params_ranger) 93 | met1 = unlist(lapply(met, function(x) !is.null(x))) 94 | 95 | if (sum(met1) == 0) stop('at least one of the methods should be non-null') 96 | 97 | method = method[met1] 98 | 99 | #if (!is.null(params_ranger) && ("dependent.variable.name" %in% names(params_ranger)) && sum(met1) > 1) stop("ranger can not be used simoultaneously with the other methods if 'dependent.variable.name' is in the params_ranger") 100 | 101 | if (CV_folds < 2) { 102 | 103 | stop(simpleError("CV_folds should be > 1")) 104 | } 105 | 106 | if (('ranger' %in% method) && (params_ranger$importance == 'none')) { 107 | 108 | stop(simpleError("enable importance in ranger using one of the arguments 'impurity' or 'permutation'")) 109 | } 110 | 111 | if (params_features$union == TRUE && length(method) == 1) { 112 | 113 | stop(simpleError("run union = TRUE only in case of more than one method")) 114 | } 115 | 116 | out_meth = list() 117 | 118 | for (meth in method) { 119 | 120 | out_meth[[meth]] = feature_selection(X, y, method = meth, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = xgb_sort, 121 | 122 | CV_folds = CV_folds, stratified_regr = stratified_regr, scale_coefs_glmnet = scale_coefs_glmnet, cores_glmnet = cores_glmnet, verbose = verbose) 123 | } 124 | 125 | 126 | if (is.null(params_features$keep_number_feat)) { 127 | 128 | out = out_meth} 129 | 130 | else { 131 | 132 | out = lapply(out_meth, function(x) x[1:params_features$keep_number_feat, ]) 133 | } 134 | 135 | if (params_features$union == TRUE) { 136 | 137 | out_union = list() 138 | 139 | for (k in method) { 140 | 141 | if (k == 'glmnet-lasso' && (!params_glmnet$family %in% c('binomial', 'multinomial'))) { 142 | 143 | un_glmnet = out_meth[[k]][, -2] # exclude coefficients column 144 | un_glmnet[, 2] = normalized(un_glmnet[, 2]) # normalize the frequency so that is in same scale with the other algorithms 145 | 
colnames(un_glmnet) = c('features', 'importance') 146 | un_glmnet[, 1] = as.character(un_glmnet[, 1]) 147 | out_union[[k]] = un_glmnet} 148 | 149 | else if (k == 'glmnet-lasso' && params_glmnet$family %in% c('binomial', 'multinomial')) { 150 | 151 | un_glmnet = out_meth[[k]] 152 | un_glmnet[, 2] = normalized(un_glmnet[, 2]) 153 | colnames(un_glmnet) = c('features', 'importance') 154 | un_glmnet[, 1] = as.character(un_glmnet[, 1]) 155 | out_union[[k]] = un_glmnet} 156 | 157 | else if (k == 'xgboost') { 158 | 159 | if (is.null(xgb_sort)) { 160 | 161 | un_xgboost = out_meth[[k]][, c(1, 4)] 162 | un_xgboost[, 2] = normalized(un_xgboost[, 2]) 163 | colnames(un_xgboost) = c('features', 'importance') 164 | un_xgboost[, 1] = as.character(un_xgboost[, 1]) 165 | out_union[[k]] = un_xgboost} 166 | 167 | else { 168 | 169 | un_xgboost = data.frame(features = out_meth[[k]][, 1], importance = out_meth[[k]][, xgb_sort]) 170 | un_xgboost[, 2] = normalized(un_xgboost[, 2]) 171 | un_xgboost[, 1] = as.character(un_xgboost[, 1]) 172 | out_union[[k]] = un_xgboost} 173 | } 174 | 175 | else if (k == 'ranger') { 176 | 177 | un_ranger = out_meth[[k]] 178 | un_ranger[, 2] = normalized(un_ranger[, 2]) 179 | colnames(un_ranger) = c('features', 'importance') 180 | un_ranger[, 1] = as.character(un_ranger[, 1]) 181 | out_union[[k]] = un_ranger} 182 | } 183 | 184 | modify_lst = lapply(out_union, function(x) data.frame(feature = x$features, rank = normalized(length(x$features):1))) 185 | 186 | modify_lst1 = data.frame(do.call(rbind, modify_lst)) |> 187 | data.table::as.data.table() 188 | 189 | tbl_x = modify_lst1[, .(importance = sum(rank, na.rm = TRUE), 190 | Frequency = .N), 191 | by = 'feature'] |> 192 | as.data.frame() 193 | 194 | tbl1 = tbl_x[order(tbl_x$importance, decreasing = TRUE), ] 195 | 196 | tbl1$importance = normalized(tbl1$importance) 197 | 198 | return(list(all_feat = out, union_feat = tbl1))} 199 | 200 | else { 201 | 202 | return(all_feat = out) 203 | } 204 | } 205 | 206 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![tic](https://github.com/mlampros/FeatureSelection/workflows/tic/badge.svg?branch=master)](https://github.com/mlampros/FeatureSelection/actions) 3 | [![codecov.io](https://codecov.io/github/mlampros/FeatureSelection/coverage.svg?branch=master)](https://codecov.io/github/mlampros/FeatureSelection?branch=master) 4 | Buy Me A Coffee 5 | [![](https://img.shields.io/docker/automated/mlampros/featureselection.svg)](https://hub.docker.com/r/mlampros/featureselection) 6 | 7 |
8 | 9 | #### Feature Selection in R using glmnet-lasso, xgboost and ranger 10 | 11 |
12 | 13 | This R package wraps **glmnet-lasso**, **xgboost** and **ranger** to perform feature selection. After installation, use ? to read the documentation of each function (e.g. ?feature_selection). More details can be found in the blog-post (http://mlampros.github.io/2016/02/14/feature-selection/). To install the latest version from GitHub use, 14 | 15 | <br>
16 | 17 | ```R 18 | remotes::install_github('mlampros/FeatureSelection') 19 | 20 | ``` 21 | 22 |
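
As a quick sanity check after installation, here is a minimal usage sketch (assembled from the package's documented `feature_selection()` example, with `parallel = FALSE` so that no extra cores are needed; see ?feature_selection for the full parameter list). It ranks the predictors of `Sepal.Length` from the `iris` data with the glmnet-lasso backend,

```R
library(FeatureSelection)

data(iris)

X = iris[, -5]     # drop the factor column (Species)
y = X[, 1]         # response: Sepal.Length
X = X[, -1]        # predictors: the remaining numeric columns

params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = FALSE)

res = feature_selection(X, y, method = 'glmnet-lasso',
                        params_glmnet = params_glmnet, CV_folds = 5)

res                # a data frame with the important (non-zero) features
```

<br>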
23 | 24 | **Package Updates**: 25 | 26 | * Currently there is a new version of *glmnet* (3.0.0) with new functionality (*relax*, *trace*, *assess*, *bigGlm*), however it requires R version >= 3.6.0 (see the [new vignette](https://cran.r-project.org/web/packages/glmnet/vignettes/relax.pdf) for more information). 27 | * In the *ranger* R package the *ranger::importance_pvalues()* function was added (see the sketch after this list) 28 | * Currently, the recommended approach for feature selection is [SHAP](https://github.com/slundberg/shap) 29 | 30 | <br>
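
A rough sketch of *ranger::importance_pvalues()* (the argument names follow my reading of the ranger documentation, so check `?ranger::importance_pvalues` before relying on it). The 'altmann' method permutes the response several times to derive p-values for the feature importances,

```R
library(ranger)

data(iris)

# fit a ranger model with an importance measure enabled
rf = ranger(Species ~ ., data = iris, num.trees = 100, importance = 'permutation')

# p-values for the feature importances via the Altmann permutation approach,
# which requires the formula and the data as additional arguments
importance_pvalues(rf, method = 'altmann', formula = Species ~ ., data = iris)
```

<br>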
31 | 32 | 33 | **UPDATE 03-02-2020** 34 | 35 |
36 | 37 | **Docker images** of the *FeatureSelection* package are available to download from my [dockerhub](https://hub.docker.com/r/mlampros/featureselection) account. The images come with *Rstudio* and the *R-development* version (latest) installed. The whole process was tested on Ubuntu 18.04. To **pull** & **run** the image do the following, 38 | 39 |
40 | 41 | ```sh 42 | 43 | docker pull mlampros/featureselection:rstudiodev 44 | 45 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 mlampros/featureselection:rstudiodev 46 | 47 | ``` 48 | 49 | <br>
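
To verify that the container started (an optional check with standard docker commands, not part of the original instructions),

```sh
docker ps                  # the rstudio_dev container should be listed
docker logs rstudio_dev    # inspect the startup logs if something went wrong
```

<br>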
50 | 51 | The user can also **bind** a home directory / folder into the container to use its files by specifying the **-v** flag, 52 | 53 | <br>
54 | 55 | ```sh 56 | 57 | docker run -d --name rstudio_dev -e USER=rstudio -e PASSWORD=give_here_your_password --rm -p 8787:8787 -v /home/YOUR_DIR:/home/rstudio/YOUR_DIR mlampros/featureselection:rstudiodev 58 | 59 | 60 | ``` 61 | 62 | <br>
63 | 64 | In the latter case you might first have to grant write permissions for the **YOUR_DIR** directory (not always necessary) using, 65 | 66 | <br>
67 | 68 | ```sh 69 | 70 | chmod -R 777 /home/YOUR_DIR 71 | 72 | 73 | ``` 74 | 75 | <br>
76 | 77 | The **USER** defaults to *rstudio* but you have to set a **PASSWORD** of your preference (see [www.rocker-project.org](https://www.rocker-project.org/) for more information). 78 | 79 | <br>
80 | 81 | Open your web browser and, depending on where the docker image was *built / run*, give, 82 | 83 | <br>
84 | 85 | **1st. Option** on your personal computer, 86 | 87 |
88 | 89 | ``` 90 | http://0.0.0.0:8787 91 | 92 | ``` 93 | 94 | <br>
95 | 96 | **2nd. Option** on a cloud instance, 97 | 98 |
99 | 100 | ``` 101 | http://Public DNS:8787 102 | 103 | ``` 104 | 105 | <br>
106 | 107 | to access the Rstudio console in order to give your username and password. 108 | 109 |
110 | 111 | -------------------------------------------------------------------------------- /man/DataSplit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_folds_and_data_split.R 3 | \name{DataSplit} 4 | \alias{DataSplit} 5 | \title{partition of data (train-test-split)} 6 | \usage{ 7 | DataSplit(y, TrainRatio = 0.75, regression = TRUE, shuffle = FALSE, seed = 1) 8 | } 9 | \arguments{ 10 | \item{y}{is a numeric vector (response variable)} 11 | 12 | \item{TrainRatio}{is the percentage of train-data after the partition} 13 | 14 | \item{regression}{is a boolean (TRUE, FALSE) indicating if it's a regression or classification task} 15 | 16 | \item{shuffle}{is a boolean (TRUE, FALSE) indicating if the data should be shuffled or not (by default 5 times)} 17 | 18 | \item{seed}{an integer specifying the random seed} 19 | } 20 | \value{ 21 | a list of indices (train-test) 22 | } 23 | \description{ 24 | partition of data (train-test-split) 25 | } 26 | \examples{ 27 | 28 | \dontrun{ 29 | 30 | data(iris) 31 | 32 | y = iris[, 1] 33 | 34 | split = DataSplit(y, TrainRatio = 0.75, regression = FALSE, shuffle = FALSE) 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/add_probs_dfs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{add_probs_dfs} 4 | \alias{add_probs_dfs} 5 | \title{addition of probability-data-frames} 6 | \usage{ 7 | add_probs_dfs(PREDS_LST) 8 | } 9 | \arguments{ 10 | \item{PREDS_LST}{is a list of data frames} 11 | } 12 | \value{ 13 | average of a number of data frames 14 | } 15 | \description{ 16 | this function takes a number of probability data frames and returns their average (mainly used in multi-class classification). 17 | } 18 | -------------------------------------------------------------------------------- /man/barplot_feat_select.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_feature_selection.R 3 | \name{barplot_feat_select} 4 | \alias{barplot_feat_select} 5 | \title{plot important features} 6 | \usage{ 7 | barplot_feat_select(obj, params_barplot, xgb_sort = NULL) 8 | } 9 | \arguments{ 10 | \item{obj}{either a data frame or a list from the functions : feature_selection, wrapper_feat_select} 11 | 12 | \item{params_barplot}{a list of parameters needed for plotting the important features} 13 | 14 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency")} 15 | } 16 | \value{ 17 | a barplot with the important features of each method 18 | } 19 | \description{ 20 | This function takes the result of the feature_selection function or the wrapper_feat_select function and returns a barplot with the important features 21 | } 22 | \details{ 23 | This function takes a data frame (from the feature_selection function) or a list (from the wrapper_feat_select function) and returns a barplot of the important features. 24 | If union was TRUE in the params_features list of the wrapper_feat_select function, it also plots the average importance of all methods 25 | } 26 | \examples{ 27 | 28 | \dontrun{ 29 | 30 | data(iris) 31 | 32 | X = iris[, -5] 33 | y = X[, 1] 34 | X = X[, -1] 35 | 36 | #.............
37 | # plot of the wrapper function 38 | #............................. 39 | 40 | feat = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 41 |                            CV_folds = 10, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 42 | 43 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 44 | 45 | barplot_feat_select(feat, params_barplot, xgb_sort = NULL) 46 | 47 | 48 | #....................................... 49 | # plot of the feature_selection function 50 | #....................................... 51 | 52 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = 5) 53 | 54 | 55 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 56 | 57 | barplot_feat_select(res, params_barplot, xgb_sort = NULL) 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /man/class_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_folds_and_data_split.R 3 | \name{class_folds} 4 | \alias{class_folds} 5 | \title{stratified folds (in classification)} 6 | \usage{ 7 | class_folds(folds, RESP, shuffle = FALSE) 8 | } 9 | \arguments{ 10 | \item{folds}{is an integer specifying the number of folds} 11 | 12 | \item{RESP}{is the response variable} 13 | 14 | \item{shuffle}{is a boolean specifying if the vector of indices should be shuffled or not} 15 | } 16 | \value{ 17 | a list of indices 18 | } 19 | \description{ 20 | this function creates stratified folds in binary and multiclass classification 21 | } 22 | \examples{ 23 | 24 | \dontrun{ 25 | 26 | data(iris) 27 | 28 | y = as.factor(iris[, 5]) 29 | 30 | folds = class_folds(10, y, shuffle = TRUE) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/feature_selection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/feature_selection.R 3 | \name{feature_selection} 4 | \alias{feature_selection} 5 | \title{Feature selection} 6 | \usage{ 7 | feature_selection( 8 |   X, 9 |   y, 10 |   method = NULL, 11 |   params_glmnet = NULL, 12 |   params_xgboost = NULL, 13 |   params_ranger = NULL, 14 |   xgb_sort = NULL, 15 |   CV_folds = 5, 16 |   stratified_regr = FALSE, 17 |   scale_coefs_glmnet = FALSE, 18 |   cores_glmnet = NULL, 19 |   verbose = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{X}{a sparse Matrix, a matrix or a data frame} 24 | 25 | \item{y}{a vector representing the response variable} 26 | 27 | \item{method}{one of 'glmnet-lasso', 'xgboost', 'ranger'} 28 | 29 | \item{params_glmnet}{a list of parameters for the glmnet model} 30 | 31 | \item{params_xgboost}{a list of parameters for the xgboost model} 32 | 33 | \item{params_ranger}{a list of parameters for the ranger model} 34 | 35 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency")} 36 | 37 | \item{CV_folds}{a number specifying the number of folds for cross validation} 38 | 39 | \item{stratified_regr}{a boolean determining if the folds in regression should be stratified} 40 | 41 | \item{scale_coefs_glmnet}{if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)} 42 |
43 | \item{cores_glmnet}{an integer determining the number of cores to register in glmnet} 44 | 45 | \item{verbose}{outputs info} 46 | } 47 | \value{ 48 | a data frame with the most important features 49 | } 50 | \description{ 51 | This function uses three different methods (glmnet, xgboost, ranger) in order to select important features. 52 | } 53 | \details{ 54 | This function returns the important features using one of the glmnet, xgboost or ranger algorithms. The glmnet algorithm can take either a sparse matrix, a matrix or a data frame 55 | and returns a data frame with non zero coefficients. The xgboost algorithm can take either a sparse matrix, a matrix or a data frame and returns the importance of the features in form 56 | of a data frame, furthermore it is possible to sort the features using one of the "Gain", "Cover" or "Frequency" methods. The ranger algorithm can take either a matrix or a data frame 57 | and returns the important features using one of the 'impurity' or 'permutation' methods. 58 | } 59 | \examples{ 60 | 61 | \dontrun{ 62 | 63 | #........... 64 | # regression 65 | #........... 66 | 67 | data(iris) 68 | 69 | X = iris[, -5] 70 | y = X[, 1] 71 | X = X[, -1] 72 | 73 | params_glmnet = list(alpha = 1, 74 | family = 'gaussian', 75 | nfolds = 3, 76 | parallel = TRUE) 77 | 78 | res = feature_selection(X, 79 | y, 80 | method = 'glmnet-lasso', 81 | params_glmnet = params_glmnet, 82 | CV_folds = 5, 83 | cores_glmnet = 5) 84 | 85 | #...................... 86 | # binary classification 87 | #...................... 88 | 89 | y = iris[, 5] 90 | y = as.character(y) 91 | y[y == 'setosa'] = 'virginica' 92 | X = iris[, -5] 93 | 94 | params_ranger = list(write.forest = TRUE, 95 | probability = TRUE, 96 | num.threads = 6, 97 | num.trees = 50, 98 | verbose = FALSE, 99 | classification = TRUE, 100 | mtry = 2, 101 | min.node.size = 5, 102 | importance = 'impurity') 103 | 104 | res = feature_selection(X, 105 | y, 106 | method = 'ranger', 107 | params_ranger = params_ranger, 108 | CV_folds = 5) 109 | 110 | #.......................... 111 | # multiclass classification 112 | #.......................... 
113 | 114 | y = iris[, 5] 115 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 116 | X = iris[, -5] 117 | 118 | params_xgboost = list(params = list("objective" = "multi:softprob", 119 | "bst:eta" = 0.35, 120 | "subsample" = 0.65, 121 | "num_class" = 3, 122 | "max_depth" = 6, 123 | "colsample_bytree" = 0.65, 124 | "nthread" = 2), 125 | nrounds = 50, 126 | print.every.n = 50, 127 | verbose = 0, 128 | maximize = FALSE) 129 | 130 | res = feature_selection(X, 131 | multiclass_xgboost, 132 | method = 'xgboost', 133 | params_xgboost = params_xgboost, 134 | CV_folds = 5) 135 | } 136 | } 137 | \author{ 138 | Lampros Mouselimis 139 | } 140 | -------------------------------------------------------------------------------- /man/func_correlation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correlation_of_multiple_predictors.R 3 | \name{func_correlation} 4 | \alias{func_correlation} 5 | \title{find correlated variables} 6 | \usage{ 7 | func_correlation( 8 | data, 9 | target = NULL, 10 | correlation_thresh = NULL, 11 | use_obs = NULL, 12 | correlation_method = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{data}{either a data frame or a matrix} 17 | 18 | \item{target}{either a string (name of the predictor/response in the data set) or a vector of strings (predictor/response names of the data set)} 19 | 20 | \item{correlation_thresh}{a float indicating the correlation threshold} 21 | 22 | \item{use_obs}{one of "everything", "all.obs", "complete.obs", "na.or.complete", "pairwise.complete.obs"} 23 | 24 | \item{correlation_method}{one of "pearson", "kendall", "spearman"} 25 | } 26 | \value{ 27 | either a data frame or a list of data frames 28 | } 29 | \description{ 30 | This function takes a data frame or a matrix and returns either a data frame or a list of data frames with correlated variables 31 | } 32 | \details{ 33 | This function takes a data frame or a matrix and returns the correlated variables of the data 34 | } 35 | -------------------------------------------------------------------------------- /man/func_replace_NAs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_replace_NAs} 4 | \alias{func_replace_NAs} 5 | \title{secondary function to replace NAs} 6 | \usage{ 7 | func_replace_NAs(data, which_isna) 8 | } 9 | \description{ 10 | secondary function to replace NAs 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/func_shuffle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{func_shuffle} 4 | \alias{func_shuffle} 5 | \title{shuffle data} 6 | \usage{ 7 | func_shuffle(vec, times = 10) 8 | } 9 | \arguments{ 10 | \item{vec}{is a vector of indices} 11 | 12 | \item{times}{is a number} 13 | } 14 | \value{ 15 | shuffled indices of a vector 16 | } 17 | \description{ 18 | this function shuffles the items of a vector 19 | } 20 | -------------------------------------------------------------------------------- /man/normalized.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | 
\name{normalized} 4 | \alias{normalized} 5 | \title{normalize data} 6 | \usage{ 7 | normalized(x) 8 | } 9 | \arguments{ 10 | \item{x}{is a numeric vector} 11 | } 12 | \value{ 13 | normalized data in the form of a vector 14 | } 15 | \description{ 16 | this function normalizes the feature importance of the algorithms, so that the values across algorithms are on the same scale 17 | } 18 | -------------------------------------------------------------------------------- /man/regr_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_folds_and_data_split.R 3 | \name{regr_folds} 4 | \alias{regr_folds} 5 | \title{create folds (in regression)} 6 | \usage{ 7 | regr_folds(folds, RESP, stratified = FALSE) 8 | } 9 | \arguments{ 10 | \item{folds}{is an integer specifying the number of folds} 11 | 12 | \item{RESP}{is the response variable} 13 | 14 | \item{stratified}{is a boolean specifying if the folds should be stratified} 15 | } 16 | \value{ 17 | a list of indices 18 | } 19 | \description{ 20 | this function creates both stratified and non-stratified folds in regression 21 | } 22 | \examples{ 23 | 24 | \dontrun{ 25 | 26 | data(iris) 27 | 28 | y = iris[, 1] 29 | 30 | folds = regr_folds(5, y, stratified = FALSE) 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/remove_duplic_func.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correlation_of_multiple_predictors.R 3 | \name{remove_duplic_func} 4 | \alias{remove_duplic_func} 5 | \title{function to remove duplicated pairs of predictors ( used in the func_correlation )} 6 | \usage{ 7 | remove_duplic_func(sublist) 8 | } 9 | \description{ 10 | this is a secondary function that is used in the func_correlation 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/second_func_cor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/correlation_of_multiple_predictors.R 3 | \name{second_func_cor} 4 | \alias{second_func_cor} 5 | \title{secondary function ( used in the func_correlation )} 6 | \usage{ 7 | second_func_cor(dat_frame) 8 | } 9 | \description{ 10 | this is a secondary function that is used in the func_correlation 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/wrapper_feat_select.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wrapper_feature_selection.R 3 | \name{wrapper_feat_select} 4 | \alias{wrapper_feat_select} 5 | \title{Wraps all three methods} 6 | \usage{ 7 | wrapper_feat_select( 8 |   X, 9 |   y, 10 |   params_glmnet = NULL, 11 |   params_xgboost = NULL, 12 |   params_ranger = NULL, 13 |   xgb_sort = NULL, 14 |   CV_folds = 5, 15 |   stratified_regr = FALSE, 16 |   scale_coefs_glmnet = FALSE, 17 |   cores_glmnet = NULL, 18 |   params_features = NULL, 19 |   verbose = FALSE 20 | ) 21 | } 22 | \arguments{ 23 | \item{X}{a sparse Matrix, a matrix or a data frame} 24 | 25 | \item{y}{a vector representing the response variable} 26 | 27 | \item{params_glmnet}{a list of parameters for the glmnet
model} 28 | 29 | \item{params_xgboost}{a list of parameters for the xgboost model} 30 | 31 | \item{params_ranger}{a list of parameters for the ranger model} 32 | 33 | \item{xgb_sort}{sort the xgboost features by "Gain", "Cover" or "Frequency" ( defaults to "Frequency")} 34 | 35 | \item{CV_folds}{a number specifying the number of folds for cross validation} 36 | 37 | \item{stratified_regr}{a boolean determining if the folds in regression should be stratified} 38 | 39 | \item{scale_coefs_glmnet}{if TRUE, less important coefficients will be smaller than the more important ones (ranking/plotting by magnitude possible)} 40 | 41 | \item{cores_glmnet}{an integer determining the number of cores to register in glmnet} 42 | 43 | \item{params_features}{is a list of parameters for the wrapper function} 44 | 45 | \item{verbose}{outputs info} 46 | } 47 | \value{ 48 | a list containing the important features of each method. If union in the params_features list is enabled, then it also returns the average importance of all methods. 49 | } 50 | \description{ 51 | This function is a wrapper for the feature_selection function 52 | } 53 | \details{ 54 | This function returns the importance of the methods specified and if union in the params_features list is TRUE then it also returns the average importance of all methods. 55 | Furthermore the user can limit the number of features using the keep_number_feat parameter of the params_features list. 56 | } 57 | \examples{ 58 | 59 | \dontrun{ 60 | 61 | #........... 62 | # regression 63 | #........... 64 | 65 | data(iris) 66 | 67 | X = iris[, -5] 68 | y = X[, 1] 69 | X = X[, -1] 70 | 71 | params_glmnet = list(alpha = 1, 72 |                      family = 'gaussian', 73 |                      nfolds = 3, 74 |                      parallel = TRUE) 75 | 76 | 77 | params_xgboost = list( params = list("objective" = "reg:linear", 78 |                                      "bst:eta" = 0.01, 79 |                                      "subsample" = 0.65, 80 |                                      "max_depth" = 5, 81 |                                      "colsample_bytree" = 0.65, 82 |                                      "nthread" = 2), 83 |                        nrounds = 100, 84 |                        print.every.n = 50, 85 |                        verbose = 0, 86 |                        maximize = FALSE) 87 | 88 | 89 | params_ranger = list(probability = FALSE, 90 |                      num.trees = 100, 91 |                      verbose = TRUE, 92 |                      classification = FALSE, 93 |                      mtry = 3, 94 |                      min.node.size = 10, 95 |                      num.threads = 2, 96 |                      importance = 'permutation') 97 | 98 | 99 | params_features = list(keep_number_feat = NULL, 100 |                        union = TRUE) 101 | 102 | feat = wrapper_feat_select(X, 103 |                            y, 104 |                            params_glmnet = params_glmnet, 105 |                            params_xgboost = params_xgboost, 106 |                            params_ranger = params_ranger, 107 |                            xgb_sort = NULL, 108 |                            CV_folds = 10, 109 |                            stratified_regr = FALSE, 110 |                            cores_glmnet = 2, 111 |                            params_features = params_features) 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(FeatureSelection) 3 | 4 | test_check("FeatureSelection") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-function_for_tests.R: -------------------------------------------------------------------------------- 1 | 2 | #....................... 3 | # function used in tests 4 | #.......................
5 | 6 | func_nas = function(X, col_idx) { # randomly adding NA's 7 | 8 | idx = sample(1:length(X[, col_idx]), 30, replace = F) 9 | 10 | X[idx, col_idx] = NA 11 | 12 | return(X[, col_idx]) 13 | } 14 | -------------------------------------------------------------------------------- /tests/testthat/test-add_probability_matrices_OR_data_frames.R: -------------------------------------------------------------------------------- 1 | context("Add probability data frames or matrices") 2 | 3 | testthat::test_that("adding 3 matrices or data.frames results in a matrix of all 2's", { 4 | m1 = matrix(rep(1, 25), 5, 5) 5 | m2 = matrix(rep(2, 25), 5, 5) 6 | m3 = matrix(rep(3, 25), 5, 5) 7 | 8 | lst = list(m1, m2, m3) 9 | 10 | testthat::expect_equal(add_probs_dfs(lst), matrix(rep(2, 25), 5,5), check.attributes = FALSE) # check.attributes = F, otherwise it fails due to a dimnames mismatch 11 | }) 12 | 13 | 14 | testthat::test_that("if PREDS_LST is a matrix throws an error", { 15 | m1 = matrix(rep(1, 25), 5, 5) 16 | 17 | testthat::expect_error(add_probs_dfs(m1), "PREDS_LST must be a list") 18 | }) 19 | 20 | 21 | testthat::test_that("if the dimensions of each matrix or data frame in the list is different it throws an error", { 22 | m1 = matrix(rep(1, 25), 5, 5) 23 | m2 = matrix(rep(2, 100), 10, 10) 24 | m3 = matrix(rep(3, 25), 5, 5) 25 | 26 | lst = list(m1, m2, m3) 27 | 28 | testthat::expect_error(add_probs_dfs(lst), "the dimensions of the included data.frames or matrices differ") 29 | }) 30 | 31 | -------------------------------------------------------------------------------- /tests/testthat/test-barplot_feat_select.R: -------------------------------------------------------------------------------- 1 | data(iris) 2 | 3 | context('Bar plot feature selection') 4 | 5 | 6 | 7 | # first check single methods: 8 | 9 | testthat::test_that("it returns an error if the object is NULL", { 10 | 11 | X = iris[, -5] 12 | y = X[, 1] 13 | X = X[, -1] 14 | 15 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 16 | 17 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL) 18 | 19 | params_barplot = list(keep_features = NULL, horiz = TRUE, cex.names = 0.8) 20 | 21 | testthat::expect_error(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 22 | }) 23 | 24 | 25 | testthat::test_that("it returns an error if the sort method of xgboost is not one of Frequency, Gain, Cover", { 26 | 27 | y = iris[, 5] 28 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 29 | X = iris[, -5] 30 | 31 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 32 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 33 | 34 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL) 35 | 36 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 37 | 38 | testthat::expect_error(barplot_feat_select(res, params_barplot, xgb_sort = 'some_method')) 39 | }) 40 | 41 | #=========================================== 42 | # use testthat::expect_silent to test plots 43 | #=========================================== 44 | 45 | # feature_selection() function 46 | 47 | testthat::test_that("glmnet for the feature_selection object, returns 3 column data frame (CV_folds > 1) in the res object, and plot", { 48 | 49 | X = iris[, -5] 50
| y = X[, 1] 51 | X = X[, -1] 52 | 53 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 54 | 55 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL) 56 | 57 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 58 | 59 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 60 | }) 61 | 62 | 63 | 64 | testthat::test_that("it returns plot in case of binomial", { 65 | 66 | X = iris 67 | y = X[, 5] 68 | y = as.character(y) 69 | y[y == 'setosa'] = 'versicolor' 70 | y = as.factor(y) 71 | X = X[, -5] 72 | 73 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F) 74 | 75 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F) 76 | 77 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 78 | 79 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 80 | }) 81 | 82 | 83 | testthat::test_that("glmnet for the feature_selection object, returns 2 column data frame (CV_folds = 1) in the res object, and plot", { 84 | 85 | X = iris[, -5] 86 | y = X[, 1] 87 | X = X[, -1] 88 | 89 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 90 | 91 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL) 92 | 93 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 94 | 95 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 96 | }) 97 | 98 | 99 | 100 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Frequency", { 101 | 102 | y = iris[, 5] 103 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 104 | X = iris[, -5] 105 | 106 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 107 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 108 | 109 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL) 110 | 111 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 112 | 113 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Frequency')) 114 | }) 115 | 116 | 117 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Gain", { 118 | 119 | y = iris[, 5] 120 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 121 | X = iris[, -5] 122 | 123 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 124 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 125 | 126 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL) 127 | 128 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 129 | 130 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain')) 131 | }) 132 | 133 | 134 | testthat::test_that("xgboost for the feature_selection object, xgb_sort Cover", { 135 | 136 | y = iris[, 5] 137 | multiclass_xgboost = ifelse(y == 'setosa', 0, 
ifelse(y == 'virginica', 1, 2)) 138 | X = iris[, -5] 139 | 140 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 141 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 142 | 143 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = NULL) 144 | 145 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 146 | 147 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Cover')) 148 | }) 149 | 150 | 151 | testthat::test_that("ranger for the feature_selection object", { 152 | 153 | X = iris[, -5] 154 | y = X[, 1] 155 | X = X[, -1] 156 | 157 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity') 158 | 159 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5) 160 | 161 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 0.8) 162 | 163 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 164 | }) 165 | 166 | 167 | # wrapper_feat_select() function 168 | 169 | 170 | testthat::test_that("plot of all methods when union = T", { 171 | 172 | X = iris[, -5] 173 | y = X[, 1] 174 | X = X[, -1] 175 | 176 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 177 | 178 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 179 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 180 | 181 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 182 | 183 | params_features = list(keep_number_feat = NULL, union = T) 184 | 185 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 186 | 187 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 188 | 189 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 190 | 191 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 192 | }) 193 | 194 | 195 | testthat::test_that("plot of all methods when union = T", { 196 | 197 | X = iris[, -5] 198 | y = X[, 1] 199 | X = X[, -1] 200 | 201 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 202 | 203 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 204 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 205 | 206 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 207 | 208 | params_features = list(keep_number_feat = NULL, union = T) 209 | 210 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 211 | 212 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 213 | 214 | params_barplot = 
list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 215 | 216 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain')) 217 | }) 218 | 219 | 220 | testthat::test_that("plot of all methods when union = T", { 221 | 222 | X = iris[, -5] 223 | y = X[, 1] 224 | X = X[, -1] 225 | 226 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 227 | 228 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 229 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 230 | 231 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 232 | 233 | params_features = list(keep_number_feat = NULL, union = T) 234 | 235 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 236 | 237 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 238 | 239 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 240 | 241 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Cover')) 242 | }) 243 | 244 | testthat::test_that("plot of all methods when union = F", { 245 | 246 | X = iris[, -5] 247 | y = X[, 1] 248 | X = X[, -1] 249 | 250 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 251 | 252 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 253 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 254 | 255 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 256 | 257 | params_features = list(keep_number_feat = NULL, union = F) 258 | 259 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 260 | 261 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 262 | 263 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 264 | 265 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 266 | }) 267 | 268 | 269 | testthat::test_that("plot two of the methods when union = F", { 270 | 271 | X = iris[, -5] 272 | y = X[, 1] 273 | X = X[, -1] 274 | 275 | 276 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 277 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 278 | 279 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 280 | 281 | params_features = list(keep_number_feat = NULL, union = F) 282 | 283 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 284 | 285 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 286 | 287 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 288 | 289 | 
testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain')) 290 | }) 291 | 292 | 293 | testthat::test_that("plot one of the methods when union = F, [ ranger ]", { 294 | 295 | X = iris[, -5] 296 | y = X[, 1] 297 | X = X[, -1] 298 | 299 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 300 | 301 | params_features = list(keep_number_feat = NULL, union = F) 302 | 303 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = "Gain", 304 | 305 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 306 | 307 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 308 | 309 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 310 | }) 311 | 312 | 313 | testthat::test_that("plot one of the methods when union = F [ glmnet - lasso ]", { 314 | 315 | X = iris[, -5] 316 | y = X[, 1] 317 | X = X[, -1] 318 | 319 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 320 | 321 | 322 | params_features = list(keep_number_feat = NULL, union = F) 323 | 324 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = NULL, params_ranger = NULL, xgb_sort = "Gain", 325 | 326 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 327 | 328 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 329 | 330 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 331 | }) 332 | 333 | 334 | testthat::test_that("plot one of the methods when union = F [ xgboost ]", { 335 | 336 | X = iris[, -5] 337 | y = X[, 1] 338 | X = X[, -1] 339 | 340 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 341 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 342 | 343 | params_features = list(keep_number_feat = NULL, union = F) 344 | 345 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = NULL, xgb_sort = "Gain", 346 | 347 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 348 | 349 | params_barplot = list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 350 | 351 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = NULL)) 352 | }) 353 | 354 | 355 | testthat::test_that("plot two of the methods when union = T", { 356 | 357 | X = iris[, -5] 358 | y = X[, 1] 359 | X = X[, -1] 360 | 361 | 362 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 363 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 364 | 365 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 366 | 367 | params_features = list(keep_number_feat = NULL, union = T) 368 | 369 | res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 370 | 371 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 372 | 373 | params_barplot = 
list(keep_features = 5, horiz = TRUE, cex.names = 1.0) 374 | 375 | testthat::expect_silent(barplot_feat_select(res, params_barplot, xgb_sort = 'Gain')) 376 | }) 377 | 378 | -------------------------------------------------------------------------------- /tests/testthat/test-classification_folds.R: -------------------------------------------------------------------------------- 1 | context("Split data in classification folds") 2 | 3 | 4 | testthat::test_that("throws an error if the RESP is not a factor", { 5 | 6 | y = c(1:10) 7 | 8 | testthat::expect_error(class_folds(5, y, shuffle = T), "RESP must be a factor") 9 | }) 10 | 11 | 12 | testthat::test_that("returns a warning if the folds are not equally split", { 13 | 14 | y = as.factor(sample(1:5, 99, replace = T)) 15 | 16 | testthat::expect_warning(class_folds(5, y, shuffle = T), 'the folds are not equally split') 17 | }) 18 | 19 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 20 | 21 | y = as.factor(sample(1:5, 100, replace = T)) 22 | 23 | testthat::expect_length(class_folds(5, y, shuffle = T), 5) 24 | }) 25 | 26 | 27 | # return object with shuffle = F 28 | 29 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 30 | 31 | y = as.factor(sample(1:5, 100, replace = T)) 32 | 33 | testthat::expect_length(class_folds(5, y, shuffle = F), 5) 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test-correlation_function.R: -------------------------------------------------------------------------------- 1 | data(iris) 2 | iris1 = iris 3 | iris1$Species = as.numeric(iris1$Species) 4 | 5 | context('Correlation function') 6 | 7 | 8 | testthat::test_that("if any of the data columns is factor or character it returns an error", { 9 | 10 | testthat::expect_error(func_correlation(iris, target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson")) 11 | }) 12 | 13 | 14 | testthat::test_that("if the correlation_thresh is NULL it returns an error", { 15 | 16 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = NULL, use_obs = "everything", correlation_method = "pearson")) 17 | }) 18 | 19 | testthat::test_that("if the correlation_thresh is > 1.0 it returns an error", { 20 | 21 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 1.1, use_obs = "everything", correlation_method = "pearson")) 22 | }) 23 | 24 | testthat::test_that("if the correlation_thresh is <= 0.0 it returns an error", { 25 | 26 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = -1.0, use_obs = "everything", correlation_method = "pearson")) 27 | }) 28 | 29 | testthat::test_that("if the use_obs is NULL it returns an error", { 30 | 31 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 0.75, use_obs = NULL, correlation_method = "pearson")) 32 | }) 33 | 34 | 35 | testthat::test_that("if the use_obs is NULL it returns an error", { 36 | 37 | testthat::expect_error(func_correlation(iris[, -ncol(iris)], target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = NULL)) 38 | }) 39 | 40 | 41 | testthat::test_that("if data is not a data frame or matrix it returns an error", { 42 | 43 | testthat::expect_error(func_correlation(list(iris[, -ncol(iris)]), target = NULL, correlation_thresh = 0.75, use_obs = 
"everything", correlation_method = "pearson")) 44 | }) 45 | 46 | 47 | testthat::test_that("it takes a data frame without a target and it returns a list", { 48 | 49 | testthat::expect_true(is.list(func_correlation(iris1, target = NULL, correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson"))) 50 | }) 51 | 52 | testthat::test_that("it takes a data frame with a target and it returns a data frame", { 53 | 54 | testthat::expect_true(is.data.frame(func_correlation(iris1, target = c('Species'), correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson"))) 55 | }) 56 | 57 | 58 | testthat::test_that("it takes a data frame with a vector of predictors and it returns a data frame", { 59 | 60 | testthat::expect_true(is.data.frame(func_correlation(iris1, target = c('Species', 'Petal.Width'), correlation_thresh = 0.75, use_obs = "everything", correlation_method = "pearson"))) 61 | }) 62 | 63 | 64 | -------------------------------------------------------------------------------- /tests/testthat/test-data_split_function.R: -------------------------------------------------------------------------------- 1 | context('Split of data') 2 | 3 | 4 | testthat::test_that("throws an error if the TrainRation is less than expected", { 5 | 6 | y = c(1:50) 7 | 8 | testthat::expect_error(DataSplit(y, TrainRatio = 1.1), 'TrainRation should be a float number greater than 0 and less than 1.0') 9 | }) 10 | 11 | 12 | testthat::test_that("returns a list of indices", { 13 | 14 | y = c(1:50) 15 | 16 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75))) 17 | }) 18 | 19 | 20 | # test function with regression = T, F and shuffle = T, F 21 | 22 | testthat::test_that("returns a list of indices", { 23 | 24 | y = as.factor(sample(letters[1:2], 50 , replace = T)) 25 | 26 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, regression = F))) 27 | }) 28 | 29 | testthat::test_that("returns a list of indices", { 30 | 31 | y = c(1:50) 32 | 33 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, shuffle = T))) 34 | }) 35 | 36 | testthat::test_that("returns a list of indices", { 37 | 38 | y = c(1:50) 39 | 40 | testthat::expect_true(is.list(DataSplit(y, TrainRatio = 0.75, shuffle = F))) 41 | }) 42 | -------------------------------------------------------------------------------- /tests/testthat/test-feature_selection.R: -------------------------------------------------------------------------------- 1 | data(iris) 2 | 3 | 4 | context('Feature selection') 5 | 6 | #==================== 7 | # handling of errors 8 | #==================== 9 | 10 | 11 | testthat::test_that("feature selection returns a error if method is not specified", { 12 | 13 | X = iris[, -5] 14 | y = X[, 1] 15 | X = X[, -1] 16 | 17 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 18 | 19 | testthat::expect_error(feature_selection(X, y, method = NULL, params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)) 20 | }) 21 | 22 | 23 | testthat::test_that("feature selection returns a error if CV less than 1", { 24 | 25 | X = iris[, -5] 26 | y = X[, 1] 27 | X = X[, -1] 28 | 29 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 30 | 31 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 0, cores_glmnet = NULL)) 32 | }) 33 | 34 | 35 | testthat::test_that("glmnet lasso returns a error if data is not a data frame for folds = 1", { 36 | 37 | X = iris[, -5] 38 | y = X[, 1] 39 | 
X = list(X[, -1]) 40 | 41 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 42 | 43 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)) 44 | }) 45 | 46 | 47 | testthat::test_that("glmnet lasso returns an error if data is not a data frame for folds = 5", { 48 | 49 | X = iris[, -5] 50 | y = X[, 1] 51 | X = list(X[, -1]) 52 | 53 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 54 | 55 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL)) 56 | }) 57 | 58 | 59 | #=========================================================== 60 | # handling NA's in glmnet-lasso 61 | # xgboost and ranger treat missing values as an extra category 62 | #=========================================================== 63 | 64 | 65 | testthat::test_that("missing values in data frame will be replaced with the median, in case of CV_folds = 1", { 66 | 67 | X = iris[, -5] 68 | y = X[, 1] 69 | X = X[, -1] 70 | 71 | X[, 1] = func_nas(X, 1) # add NA's to first and last column 72 | X[, ncol(X)] = func_nas(X, ncol(X)) 73 | 74 | 75 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 76 | 77 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL) 78 | 79 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 80 | }) 81 | 82 | 83 | testthat::test_that("missing values in data frame will be replaced with the median, in case of CV_folds = 5", { 84 | 85 | X = iris[, -5] 86 | y = X[, 1] 87 | X = X[, -1] 88 | 89 | X[, 1] = func_nas(X, 1) # add NA's to first and last column 90 | X[, ncol(X)] = func_nas(X, ncol(X)) 91 | 92 | 93 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 94 | 95 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL) 96 | 97 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 98 | }) 99 | 100 | 101 | testthat::test_that("missing values in sparse Matrix will be replaced with the median, in case of CV_folds = 1", { 102 | 103 | X = iris[, -5] 104 | y = X[, 1] 105 | X = X[, -1] 106 | 107 | X[, 1] = func_nas(X, 1) # add NA's to first and last column 108 | X[, ncol(X)] = func_nas(X, ncol(X)) 109 | 110 | X = Matrix::Matrix(as.matrix(X), sparse = T) # convert to sparse matrix 111 | 112 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 113 | 114 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL) 115 | 116 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 117 | }) 118 | 119 | 120 | testthat::test_that("missing values in sparse Matrix will be replaced with the median, in case of CV_folds = 5", { 121 | 122 | X = iris[, -5] 123 | y = X[, 1] 124 | X = X[, -1] 125 | 126 | X[, 1] = func_nas(X, 1) # add NA's to first and last column 127 | X[, ncol(X)] = func_nas(X, ncol(X)) 128 | X = Matrix::Matrix(as.matrix(X), sparse = T) # convert to sparse matrix 129 | 130 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 131 | 132 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL) 133 | 134 |
testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 135 | }) 136 | 137 | #====================== 138 | # test - cases : glmnet 139 | #====================== 140 | 141 | 142 | testthat::test_that("glmnet returns a data frame with the important predictors, in case of 5 folds", { 143 | 144 | X = iris[, -5] 145 | y = X[, 1] 146 | X = X[, -1] 147 | 148 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 149 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 5, cores_glmnet = NULL, verbose = T) 150 | 151 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 152 | }) 153 | 154 | 155 | testthat::test_that("glmnet returns a data frame with the important predictors, in case of 1 fold", { 156 | 157 | X = iris[, -5] 158 | y = X[, 1] 159 | X = X[, -1] 160 | 161 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 162 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL) 163 | 164 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 165 | }) 166 | 167 | 168 | testthat::test_that("glmnet returns an error if data is not a matrix, data.frame or sparse matrix, in case of 1 fold", { 169 | 170 | X = iris[, -5] 171 | y = X[, 1] 172 | X = list(X[, -1]) 173 | 174 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 175 | 176 | testthat::expect_error(feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL)) # expect an error 177 | }) 178 | 179 | 180 | testthat::test_that("glmnet takes a matrix and returns a data frame with the important predictors, in case of 1 fold", { 181 | 182 | X = iris[, -5] 183 | y = X[, 1] 184 | X = as.matrix(X[, -1]) 185 | 186 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 187 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, verbose = T) 188 | 189 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 190 | }) 191 | 192 | 193 | testthat::test_that("glmnet takes a matrix and returns a data frame with the important predictors, in case of 3 folds", { 194 | 195 | X = iris[, -5] 196 | y = X[, 1] 197 | X = as.matrix(X[, -1]) 198 | 199 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 200 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL) 201 | 202 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 203 | }) 204 | 205 | 206 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 1 fold", { 207 | 208 | X = iris[, -5] 209 | y = X[, 1] 210 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 211 | 212 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 213 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, verbose = T) 214 | 215 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 216 | }) 217 | 218 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 3 folds", { 219 | 220 | X
= iris[, -5] 221 | y = X[, 1] 222 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 223 | 224 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 225 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, verbose = T) 226 | 227 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 228 | }) 229 | 230 | 231 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 1 fold, using the scale_coefs_glmnet and verbose arguments", { 232 | 233 | X = iris[, -5] 234 | y = X[, 1] 235 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 236 | 237 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 238 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = T) 239 | 240 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 241 | }) 242 | 243 | testthat::test_that("glmnet takes a sparse matrix and returns a data frame with the important predictors, in case of 3 folds, using the scale_coefs_glmnet and verbose arguments", { 244 | 245 | X = iris[, -5] 246 | y = X[, 1] 247 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 248 | 249 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 250 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = T) 251 | 252 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 253 | }) 254 | 255 | testthat::test_that("glmnet takes a data frame and in case of binomial classification it returns a data frame with the important predictors, in case of 1 fold", { 256 | 257 | X = iris 258 | y = X[, 5] 259 | y = as.character(y) 260 | y[y == 'setosa'] = 'versicolor' 261 | y = as.factor(y) 262 | X = X[, -5] 263 | 264 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F) 265 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F) 266 | 267 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 268 | }) 269 | 270 | 271 | testthat::test_that("glmnet takes a data frame and in case of binomial classification it returns a data frame with the important predictors, in case of 3 folds", { 272 | 273 | X = iris 274 | y = X[, 5] 275 | y = as.character(y) 276 | y[y == 'setosa'] = 'versicolor' 277 | y = as.factor(y) 278 | X = X[, -5] 279 | 280 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = F) 281 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F) 282 | 283 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 284 | }) 285 | 286 | 287 | testthat::test_that("glmnet takes a data frame and in case of multinomial classification it returns a data frame with the important predictors, in case of 1 fold", { 288 | 289 | X = iris 290 | y = X[, 5] 291 | X = X[, -5] 292 | 293 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F) 294 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet
= params_glmnet, CV_folds = 1, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F) 295 | 296 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 297 | }) 298 | 299 | testthat::test_that("glmnet takes a data frame and in case of multinomial classification it returns a data frame with the important predictors, in case of 3 folds", { 300 | 301 | X = iris 302 | y = X[, 5] 303 | X = X[, -5] 304 | 305 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F) 306 | res = feature_selection(X, y, method = 'glmnet-lasso', params_glmnet = params_glmnet, CV_folds = 3, cores_glmnet = NULL, scale_coefs_glmnet = T, verbose = F) 307 | 308 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 309 | }) 310 | 311 | 312 | #======================= 313 | # test - cases : xgboost 314 | #======================= 315 | 316 | 317 | testthat::test_that("xgboost returns an error if data is not a data.frame or matrix, if folds = 1", { 318 | 319 | y = iris[, 5] 320 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 321 | X = list(iris[, -5]) 322 | 323 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 324 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 325 | 326 | testthat::expect_error(feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1)) # expect an error 327 | }) 328 | 329 | 330 | testthat::test_that("xgboost returns an error if data is not a data.frame or matrix, if folds = 5", { 331 | 332 | y = iris[, 5] 333 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 334 | X = list(iris[, -5]) 335 | 336 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 337 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 338 | 339 | testthat::expect_error(feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 5)) # expect an error 340 | }) 341 | 342 | 343 | testthat::test_that("xgboost returns a data frame with the important predictors, in case of 1 fold, default xgb_sort = Frequency", { 344 | 345 | y = iris[, 5] 346 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 347 | X = iris[, -5] 348 | 349 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 350 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 351 | 352 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, verbose = T) 353 | 354 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 355 | }) 356 | 357 | 358 | testthat::test_that("xgboost returns a data frame with the important predictors, in case of 1 fold, xgb_sort = Gain", { 359 | 360 | y = iris[, 5] 361 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 362 | X = iris[, -5] 363 | 364 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6,
"colsample_bytree" = 0.65, "nthread" = 2), 365 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 366 | 367 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = 'Gain', verbose = T) 368 | 369 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 370 | }) 371 | 372 | 373 | testthat::test_that("xgboost returns a data frame with the important predictors, in case 1 folds, xgb_sort = Cover", { 374 | 375 | y = iris[, 5] 376 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 377 | X = iris[, -5] 378 | 379 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 380 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 381 | 382 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1, xgb_sort = 'Cover') 383 | 384 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 385 | }) 386 | 387 | 388 | 389 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 1 folds", { 390 | 391 | X = iris[, -5] 392 | y = X[, 1] 393 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 394 | 395 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 396 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 397 | 398 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 1) 399 | 400 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 401 | }) 402 | 403 | 404 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", { 405 | 406 | X = iris[, -5] 407 | y = X[, 1] 408 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 409 | 410 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 411 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 412 | 413 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3) 414 | 415 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 416 | }) 417 | 418 | 419 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default xgb_sort = Frequency", { 420 | 421 | X = iris[, -5] 422 | y = X[, 1] 423 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 424 | 425 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 426 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 427 | 428 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, xgb_sort = 'Gain', verbose = T) 429 | 430 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 431 | }) 432 | 433 | 434 | 435 | testthat::test_that("xgboost using sparse Matrix returns a data frame with the important predictors, in case 3 folds, default 
xgb_sort = Cover", { 436 | 437 | X = iris[, -5] 438 | y = X[, 1] 439 | X = Matrix::Matrix(as.matrix(X[, -1]), sparse = T) 440 | 441 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 442 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 443 | 444 | res = feature_selection(X, y, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, xgb_sort = 'Cover') 445 | 446 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 447 | }) 448 | 449 | 450 | testthat::test_that("xgboost using data.frame in multiclass classification, returns a data frame with the important predictors, in case of 3 folds, default xgb_sort = Frequency", { 451 | 452 | y = iris[, 5] 453 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 454 | X = iris[, -5] 455 | 456 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 457 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 458 | 459 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 3, verbose = T) 460 | 461 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 462 | }) 463 | 464 | 465 | testthat::test_that("xgboost using data.frame in binary classification, returns a data frame with the important predictors, in case of 5 folds, default xgb_sort = Frequency", { 466 | 467 | y = iris[, 5] 468 | multiclass_xgboost = ifelse(y == 'setosa', 0, 1) 469 | X = iris[, -5] 470 | 471 | params_xgboost = list( params = list("objective" = "binary:logistic", "bst:eta" = 0.35, "subsample" = 0.65, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 472 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 473 | 474 | res = feature_selection(X, multiclass_xgboost, method = 'xgboost', params_xgboost = params_xgboost, CV_folds = 5) 475 | 476 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 477 | }) 478 | 479 | 480 | 481 | #======================= 482 | # test - cases : ranger 483 | #======================= 484 | 485 | 486 | testthat::test_that("ranger returns an error if data is not a data.frame or matrix, folds = 1", { 487 | 488 | y = iris[, 5] 489 | y = as.character(y) 490 | y[y == 'setosa'] = 'virginica' 491 | X = list(iris[, -5]) 492 | 493 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 494 | 495 | testthat::expect_error(feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1)) # expect an error 496 | }) 497 | 498 | testthat::test_that("ranger returns an error if data is not a data.frame or matrix, folds > 1", { 499 | 500 | y = iris[, 5] 501 | y = as.character(y) 502 | y[y == 'setosa'] = 'virginica' 503 | X = list(iris[, -5]) 504 | 505 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 506 | 507 | testthat::expect_error(feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 3)) # expect an error 508 | }) 509 |
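# NOTE (editor's sketch, not part of the package): the 'func_nas()' helper used in
# the NA-handling tests above is defined in tests/testthat/helper-function_for_tests.R,
# which is not included in this excerpt. A minimal helper with the same role could
# randomly replace a share of the values of a single column with NA's; the name,
# signature and behaviour shown here are assumptions, not the package's own code:

func_nas_sketch = function(data, column, frac = 0.2) {

  x = data[, column]

  idx = sample(seq_along(x), size = max(1, floor(frac * length(x))))   # pick a random subset of rows

  x[idx] = NA                 # inject missing values into the selected column

  x                           # return the modified column (the caller assigns it back)
}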
510 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1", { 511 | 512 | y = iris[, 5] 513 | y = as.character(y) 514 | y[y == 'setosa'] = 'virginica' 515 | X = iris[, -5] 516 | 517 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 518 | 519 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1, verbose = T) 520 | 521 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 522 | }) 523 | 524 | 525 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5", { 526 | 527 | y = iris[, 5] 528 | y = as.character(y) 529 | y[y == 'setosa'] = 'virginica' 530 | X = iris[, -5] 531 | 532 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 533 | 534 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5, verbose = T) 535 | 536 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 537 | }) 538 | 539 | 540 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1, in regression", { 541 | 542 | X = iris[, -5] 543 | y = X[, 1] 544 | X = X[, -1] 545 | 546 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity') 547 | 548 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1, verbose = T) 549 | 550 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 551 | }) 552 | 553 | 554 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5, in regression", { 555 | 556 | X = iris[, -5] 557 | y = X[, 1] 558 | X = X[, -1] 559 | 560 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity') 561 | 562 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5, verbose = T) 563 | 564 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 565 | }) 566 | 567 | 568 | 569 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 5, in regression, without verbose", { 570 | 571 | X = iris[, -5] 572 | y = X[, 1] 573 | X = X[, -1] 574 | 575 | params_ranger = list(write.forest = TRUE, probability = F, num.threads = 6, num.trees = 50, verbose = FALSE, classification = F, mtry = 2, min.node.size = 5, importance = 'impurity') 576 | 577 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 5) 578 | 579 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 580 | }) 581 | 582 | 583 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 1, in classification, when using dependent.variable.name", { 584 | 585 | X = iris 586 | y = iris[, 'Species'] # if dependent.variable.name is given, the response variable (y) must also be supplied, so that the folds can be built 587 | 588 | params_ranger = list(dependent.variable.name = "Species", write.forest = TRUE,
probability = T, num.threads = 6, num.trees = 50, verbose = FALSE, classification = T, mtry = 2, 589 | 590 | min.node.size = 5, importance = 'impurity') 591 | 592 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 1) 593 | 594 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 595 | }) 596 | 597 | 598 | testthat::test_that("ranger returns a data.frame with important predictors if folds = 3, in classification, when using dependent.variable.name", { 599 | 600 | X = iris 601 | y = iris[, 'Species'] # if dependent.variable.name is given, the response variable (y) must also be supplied, so that the folds can be built 602 | 603 | params_ranger = list(dependent.variable.name = "Species", write.forest = TRUE, probability = T, num.threads = 6, num.trees = 50, verbose = FALSE, classification = T, mtry = 2, 604 | 605 | min.node.size = 5, importance = 'impurity') 606 | 607 | res = feature_selection(X, y, method = 'ranger', params_ranger = params_ranger, CV_folds = 3, verbose = T) 608 | 609 | testthat::expect_true(is.data.frame(res) & sum(dim(res)) > 0) # is.data.frame and not empty 610 | }) 611 | 612 | -------------------------------------------------------------------------------- /tests/testthat/test-regression_folds.R: -------------------------------------------------------------------------------- 1 | context("Split data in regression folds") 2 | 3 | 4 | testthat::test_that("throws an error if the response is a factor", { 5 | 6 | y = as.factor(c(1:50)) 7 | 8 | testthat::expect_error(regr_folds(5, y, stratified = F), "this function is meant for regression for classification use the 'class_folds' function") 9 | }) 10 | 11 | 12 | testthat::test_that("returns a warning if the folds are not equally split", { 13 | 14 | y = sample(1:5, 99, replace = T) 15 | 16 | testthat::expect_warning(regr_folds(5, y, stratified = F), 'the folds are not equally split') 17 | }) 18 | 19 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 20 | 21 | y = sample(1:5, 100, replace = T) 22 | 23 | testthat::expect_length(regr_folds(5, y, stratified = F), 5) 24 | }) 25 | 26 | # object with stratified = T 27 | 28 | testthat::test_that("the number of folds equals the number of the resulted sublist indices", { 29 | 30 | y = sample(1:5, 100, replace = T) 31 | 32 | testthat::expect_length(regr_folds(5, y, stratified = T), 5) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test-secondary_functions_func_correlation.R: -------------------------------------------------------------------------------- 1 | context('Secondary functions func_correlation') 2 | 3 | # 'remove_duplic_func' function 4 | 5 | testthat::test_that("takes a data frame and returns a data frame", { 6 | 7 | df = data.frame(a = sample(1:4, 100, replace = T)) 8 | 9 | testthat::expect_true(is.data.frame(remove_duplic_func(df))) 10 | }) 11 | 12 | 13 | testthat::test_that("takes a single column data frame and returns a 3-column data frame", { 14 | 15 | df = data.frame(a = sample(1:4, 100, replace = T)) 16 | 17 | out = ncol(remove_duplic_func(df)) 18 | 19 | testthat::expect_true(out == 3) 20 | }) 21 | 22 | 23 | # 'second_func_cor' function 24 | 25 | testthat::test_that("takes a data frame and returns a list", { 26 | 27 | df = data.frame(a = sample(0:4, 100, replace = T), d = sample(0:6, 100, replace = T)) 28 | 29 | testthat::expect_true(is.list(second_func_cor(df))) 30 | }) 31 | 32
| 33 | testthat::test_that("takes a data frame and returns a list, the length of the list equals the number of columns of the data frame", { 34 | 35 | df = data.frame(a = sample(0:4, 100, replace = T), d = sample(0:6, 100, replace = T)) 36 | 37 | lst = second_func_cor(df) 38 | 39 | testthat::expect_true(ncol(df) == length(lst)) 40 | }) 41 | 42 | -------------------------------------------------------------------------------- /tests/testthat/test-shuffle_data.R: -------------------------------------------------------------------------------- 1 | context('Shuffle and normalize data functions') 2 | 3 | 4 | # shuffle data 5 | 6 | testthat::test_that("shuffle data takes a vector as input and returns a vector as output", { 7 | 8 | y = c(1:50) 9 | 10 | testthat::expect_true(is.vector(func_shuffle(y, times = 10))) 11 | }) 12 | 13 | testthat::test_that("the length of the input vector equals the length of the output vector", { 14 | 15 | y = c(1:50) 16 | 17 | output = func_shuffle(y, times = 10) 18 | 19 | testthat::expect_true(length(y) == length(output)) 20 | }) 21 | 22 | 23 | # normalize data 24 | 25 | 26 | testthat::test_that("normalize data takes a vector as input and returns a vector as output", { 27 | 28 | y = c(1:50) 29 | 30 | testthat::expect_true(is.vector(normalized(y))) 31 | }) 32 | 33 | testthat::test_that("the length of the input vector equals the length of the output vector", { 34 | 35 | y = c(1:50) 36 | 37 | output = normalized(y) 38 | 39 | testthat::expect_true(length(y) == length(output)) 40 | }) 41 | -------------------------------------------------------------------------------- /tests/testthat/test-wrapper_feature_selection.R: -------------------------------------------------------------------------------- 1 | data(iris) 2 | 3 | context('Wraps all the methods') 4 | 5 | 6 | testthat::test_that("it returns an error if all methods are NULL", { 7 | 8 | X = iris[, -5] 9 | y = X[, 1] 10 | X = X[, -1] 11 | 12 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 13 | 14 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 15 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE) 16 | 17 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation') 18 | 19 | params_features = list(keep_number_feat = NULL, union = TRUE) 20 | 21 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = NULL, xgb_sort = NULL, 22 | CV_folds = 1, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features)) 23 | }) 24 | 25 | 26 | testthat::test_that("it returns an error if CV_folds is less than 2", { 27 | 28 | X = iris[, -5] 29 | y = X[, 1] 30 | X = X[, -1] 31 | 32 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 33 | 34 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 35 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE) 36 | 37 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation') 38 | 39 | params_features = list(keep_number_feat = NULL, union = TRUE) 40 | 41 |
testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 42 | CV_folds = 1, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)) 43 | }) 44 | 45 | 46 | testthat::test_that("it returns an error if the importance method in ranger is set to 'none'", { 47 | 48 | X = iris[, -5] 49 | y = X[, 1] 50 | X = X[, -1] 51 | 52 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 53 | 54 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 55 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE) 56 | 57 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'none') 58 | 59 | params_features = list(keep_number_feat = NULL, union = TRUE) 60 | 61 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 62 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)) 63 | }) 64 | 65 | 66 | 67 | testthat::test_that("it returns an error if params_features$union == TRUE AND length(method) == 1", { 68 | 69 | X = iris[, -5] 70 | y = X[, 1] 71 | X = X[, -1] 72 | 73 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 74 | 75 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 76 | nrounds = 100, print.every.n = 50, verbose = 0, maximize = FALSE) 77 | 78 | params_ranger = list(probability = FALSE, num.trees = 100, verbose = TRUE, classification = FALSE, mtry = 5, min.node.size = 10, num.threads = 2, importance = 'permutation') 79 | 80 | params_features = list(keep_number_feat = NULL, union = TRUE) 81 | 82 | testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = NULL, 83 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features)) 84 | }) 85 | 86 | 87 | testthat::test_that("it returns a list with non-empty data frames when union = F", { 88 | 89 | X = iris[, -5] 90 | y = X[, 1] 91 | X = X[, -1] 92 | 93 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 94 | 95 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 96 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 97 | 98 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 99 | 100 | params_features = list(keep_number_feat = NULL, union = F) 101 | 102 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 103 | 104 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 105 | 106 | testthat::expect_true(is.list(res) & all(unlist(lapply(res, function(x) sum(dim(x)))) > 0)) 107 | }) 108 | 109 | 110 | testthat::test_that("it returns a list with non-empty data frames when union = F, sorted
by Gain in xgboost", { 111 | 112 | X = iris[, -5] 113 | y = X[, 1] 114 | X = X[, -1] 115 | 116 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 117 | 118 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 119 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 120 | 121 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 122 | 123 | params_features = list(keep_number_feat = NULL, union = F) 124 | 125 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Gain", 126 | 127 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 128 | 129 | testthat::expect_true(is.list(res) & all(unlist(lapply(res, function(x) sum(dim(x)))) > 0)) 130 | }) 131 | 132 | 133 | testthat::test_that("it returns a list of two when union = T. The data frames in the first list and in the second list should be non-empty", { 134 | 135 | X = iris[, -5] 136 | y = X[, 1] 137 | X = X[, -1] 138 | 139 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 140 | 141 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 142 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 143 | 144 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 145 | 146 | params_features = list(keep_number_feat = NULL, union = T) 147 | 148 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 149 | 150 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 151 | 152 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 153 | }) 154 | 155 | 156 | testthat::test_that("it returns a list of two when union = T.
The data frames in the first list and in the second list should be non-empty", { 157 | 158 | X = iris[, -5] 159 | y = X[, 1] 160 | X = X[, -1] 161 | 162 | params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 163 | 164 | params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 165 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 166 | 167 | params_ranger = list(probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 168 | 169 | params_features = list(keep_number_feat = NULL, union = T) 170 | 171 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 172 | 173 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 174 | 175 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 176 | }) 177 | 178 | 179 | # testthat::test_that("wrapper_feat_select works if the dependent.variable.name is in the parameters of ranger and all other methods are NULL", { 180 | # 181 | # X = iris 182 | # X$Species = as.numeric(X$Species) 183 | # y = X$Species 184 | # 185 | # params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = F) 186 | # 187 | # params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 188 | # nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 189 | # 190 | # params_ranger = list(dependent.variable.name = "Species", probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 191 | # 192 | # params_features = list(keep_number_feat = NULL, union = F) 193 | # 194 | # res = wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = NULL, params_ranger = params_ranger, xgb_sort = NULL, 195 | # 196 | # CV_folds = 3, stratified_regr = FALSE, cores_glmnet = NULL, params_features = params_features) 197 | # 198 | # testthat::expect_true(is.list(res) & sum(dim(res$ranger)) > 0) 199 | # }) 200 | 201 | 202 | # testthat::test_that("wrapper_feat_select returns an error if the dependent.variable.name is in the parameters of ranger and one or two of the other methods are not NULL", { 203 | # 204 | # X = iris 205 | # X$Species = as.numeric(X$Species) 206 | # y = X$Species 207 | # 208 | # params_glmnet = list(alpha = 1, family = 'gaussian', nfolds = 3, parallel = TRUE) 209 | # 210 | # params_xgboost = list( params = list("objective" = "reg:linear", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 211 | # nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 212 | # 213 | # params_ranger = list(dependent.variable.name = "Species", probability = FALSE, num.trees = 50, verbose = TRUE, classification = FALSE, mtry = 2, min.node.size = 10, num.threads = 2, importance = 'permutation') 214 | # 215 | # params_features = list(keep_number_feat = NULL, union = F) 216 | # 217 | # testthat::expect_error(wrapper_feat_select(X, y, params_glmnet = NULL, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 218 | # 219 | # CV_folds = 3, stratified_regr = FALSE, 
cores_glmnet = 2, params_features = params_features)) 220 | # }) 221 | 222 | 223 | 224 | testthat::test_that("it works in BINOMIAL classification. The labels should be in c(0,1), so that xgboost works. It should return a 225 | 226 | list of two when union = T. The data frames in the first list and in the second list should be non-empty. Case binary classification", { 227 | 228 | y = iris[, 5] 229 | y = as.character(y) 230 | y[y == 'setosa'] = 'virginica' 231 | y = as.numeric(as.factor(y)) - 1 232 | X = iris[, -5] 233 | 234 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = TRUE) 235 | 236 | params_xgboost = list( params = list("objective" = "reg:logistic", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 237 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 238 | 239 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 240 | 241 | params_features = list(keep_number_feat = NULL, union = T) 242 | 243 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 244 | 245 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 246 | 247 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 248 | }) 249 | 250 | 251 | testthat::test_that("it works in BINOMIAL classification. The labels should be in c(0,1), so that xgboost works. It should return a 252 | 253 | list of two when union = T. The data frames in the first list and in the second list should be non-empty. Case binary classification, sorted by Cover in xgboost", { 254 | 255 | y = iris[, 5] 256 | y = as.character(y) 257 | y[y == 'setosa'] = 'virginica' 258 | y = as.numeric(as.factor(y)) - 1 259 | X = iris[, -5] 260 | 261 | params_glmnet = list(alpha = 1, family = 'binomial', nfolds = 3, parallel = TRUE) 262 | 263 | params_xgboost = list( params = list("objective" = "reg:logistic", "bst:eta" = 0.01, "subsample" = 0.65, "max_depth" = 5, "colsample_bytree" = 0.65, "nthread" = 2), 264 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 265 | 266 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 267 | 268 | params_features = list(keep_number_feat = NULL, union = T) 269 | 270 | res = wrapper_feat_select(X, y, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Cover", 271 | 272 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 273 | 274 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 275 | }) 276 | 277 | 278 | testthat::test_that("it works in MULTICLASS classification. The labels should be in c(0 to Inf), so that xgboost works. It should return a 279 | 280 | list of two when union = T. The data frames in the first and in the second list should be non-empty. 
Case multiclass classification", { 281 | 282 | y = iris[, 5] 283 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 284 | X = iris[, -5] 285 | 286 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F) 287 | 288 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 289 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 290 | 291 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 292 | 293 | params_features = list(keep_number_feat = NULL, union = T) 294 | 295 | res = wrapper_feat_select(X, multiclass_xgboost, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = NULL, 296 | 297 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 298 | 299 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 300 | }) 301 | 302 | 303 | 304 | testthat::test_that("it works in MULTICLASS classification. The labels should be in c(0 to Inf), so that xgboost works. It should return a 305 | 306 | list of two when union = T. The data frames in the first and in the second list should be non-empty. Case multiclass classification, sorted by Frequency in xgboost", { 307 | 308 | y = iris[, 5] 309 | multiclass_xgboost = ifelse(y == 'setosa', 0, ifelse(y == 'virginica', 1, 2)) 310 | X = iris[, -5] 311 | 312 | params_glmnet = list(alpha = 1, family = 'multinomial', nfolds = 3, parallel = F) 313 | 314 | params_xgboost = list( params = list("objective" = "multi:softprob", "bst:eta" = 0.35, "subsample" = 0.65, "num_class" = 3, "max_depth" = 6, "colsample_bytree" = 0.65, "nthread" = 2), 315 | nrounds = 50, print.every.n = 50, verbose = 0, maximize = FALSE) 316 | 317 | params_ranger = list(write.forest = TRUE, probability = TRUE, num.threads = 6, num.trees = 50, verbose = FALSE, classification = TRUE, mtry = 2, min.node.size = 5, importance = 'impurity') 318 | 319 | params_features = list(keep_number_feat = NULL, union = T) 320 | 321 | res = wrapper_feat_select(X, multiclass_xgboost, params_glmnet = params_glmnet, params_xgboost = params_xgboost, params_ranger = params_ranger, xgb_sort = "Frequency", 322 | 323 | CV_folds = 3, stratified_regr = FALSE, cores_glmnet = 2, params_features = params_features) 324 | 325 | testthat::expect_true(is.list(res) & all(unlist(lapply(res$all_feat, function(x) sum(dim(x)))) > 0) & sum(dim(res$union_feat)) > 0) 326 | }) 327 | -------------------------------------------------------------------------------- /tic.R: -------------------------------------------------------------------------------- 1 | # installs dependencies, runs R CMD check, runs covr::codecov() 2 | do_package_checks() 3 | 4 | if (ci_on_ghactions() && ci_has_env("BUILD_PKGDOWN")) { 5 | # creates pkgdown site and pushes to gh-pages branch 6 | # only for the runner with the "BUILD_PKGDOWN" env var set 7 | do_pkgdown() 8 | } 9 | --------------------------------------------------------------------------------
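# NOTE (editor's sketch, not part of the repository): assuming the package sources
# are checked out in the current working directory and the 'devtools' and 'testthat'
# packages are installed, the test suite shown above can be run locally as follows:

devtools::load_all(".")                                            # load the package functions

devtools::test()                                                   # run everything under tests/testthat/

testthat::test_file("tests/testthat/test-feature_selection.R")     # or run a single test file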