├── .github ├── .gitignore ├── workflows │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── CODE_OF_CONDUCT.md └── pull_request_template.md ├── vignettes ├── .gitignore ├── Using-FLORAL-with-phyloseq.Rmd ├── Using-FLORAL-for-survival-models-with-longitudinal-microbiome-data.Rmd └── Using-FLORAL-for-Microbiome-Analysis.Rmd ├── R ├── sysdata.rda ├── coxsplit.R ├── RcppExports.R ├── utils.R ├── LogRatioLasso.R ├── LogRatioLogisticLasso.R ├── simu.R └── LogRatioCoxLasso.R ├── man ├── figures │ ├── logo.jpg │ ├── logo.png │ ├── README-plot-1.png │ └── README-plot-2.png ├── phy_to_floral_data.Rd ├── simu.Rd ├── a.FLORAL.Rd ├── mcv.FLORAL.Rd └── FLORAL.Rd ├── _pkgdown.yml ├── pkgdown └── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ └── apple-touch-icon-180x180.png ├── inst ├── extdata │ └── YachidaS_2019.Rdata └── WORDLIST ├── tests ├── spelling.R ├── testthat │ ├── test-simu.R │ ├── test-FLORAL.R │ └── test-utils.R └── testthat.R ├── .gitignore ├── .Rbuildignore ├── codecov.yml ├── cran-comments.md ├── LogRatioReg.Rproj ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp └── pgee.cpp ├── data-raw └── internal.R ├── NAMESPACE ├── DESCRIPTION ├── NEWS.md ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /man/figures/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/logo.jpg -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://vdblab.github.io/FLORAL/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /man/figures/README-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/README-plot-1.png -------------------------------------------------------------------------------- /man/figures/README-plot-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/README-plot-2.png -------------------------------------------------------------------------------- /inst/extdata/YachidaS_2019.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/inst/extdata/YachidaS_2019.Rdata -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | R/simulation.R 9 | R/main_script.R 10 | docs 11 | inst/doc 12 | /doc/ 13 | /Meta/ 14 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^.*\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^LICENSE$ 6 | ^\.github$ 7 | ^README\.Rmd$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^codecov\.yml$ 12 | ^doc$ 13 | ^Meta$ 14 | ^cran-comments\.md$ 15 | ^data-raw$ 16 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * windows-latest (release) (on github actions) 3 | * ubuntu-latest (release) (on github actions) 4 | 5 | ## R CMD check results 6 | Maintainer: 'Teng Fei ' 7 | 8 | New submission 9 | 10 | ## Additional Comments 11 | 12 | Thank you for your time. -------------------------------------------------------------------------------- /tests/testthat/test-simu.R: -------------------------------------------------------------------------------- 1 | test_that("simu() works", { 2 | expect_error( 3 | dat <- simu(n=50,p=30,model="linear"), 4 | NA 5 | ) 6 | expect_error( 7 | dat <- simu(n=50,p=30,model="binomial"), 8 | NA 9 | ) 10 | expect_error( 11 | dat <- simu(n=50,p=30,model="cox"), 12 | NA 13 | ) 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(FLORAL) 11 | 12 | test_check("FLORAL") 13 | -------------------------------------------------------------------------------- /LogRatioReg.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: 80ae1746-c1e0-4fc6-a99b-9e728485768c 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | BuildType: Package 17 | PackageUseDevtools: Yes 18 | PackageInstallArgs: --no-multiarch --with-keep.source 19 | PackageCheckArgs: --as-cran 20 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Baichoo 2 | Biostatistics 3 | CMD 4 | CRC 5 | Compositional 6 | DM 7 | Dai 8 | Epub 9 | Gönen 10 | Hendry 11 | Interpretable 12 | JU 13 | LOg 14 | Lv 15 | MRM 16 | MSKCC 17 | Microbiome 18 | Miltiadous 19 | PMC 20 | PMCID 21 | PMID 22 | Peled 23 | Perales 24 | Phyloseq 25 | RAtio 26 | Reproducibility 27 | Sadeghi 28 | Scalable 29 | Shouval 30 | al 31 | bioRxiv 32 | colorectal 33 | compositional 34 | conda 35 | crmeth 36 | doi 37 | et 38 | feit 39 | finegray 40 | glom 41 | microbiome 42 | mskcc 43 | multivariable 44 | ncv 45 | performace 46 | pfilter 47 | phy's 48 | phyloseq 49 | pre 50 | preprint 51 | reproducibility 52 | se 53 | timedep 54 | -------------------------------------------------------------------------------- /tests/testthat/test-FLORAL.R: -------------------------------------------------------------------------------- 1 | test_that("FLORAL() works", { 2 | 3 | set.seed(23420) 4 | 5 | dat <- simu(n=50,p=30,model="linear") 6 | expect_error( 7 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",progress=FALSE,step2=TRUE), 8 | NA 9 | ) 10 | 11 | dat <- simu(n=50,p=30,model="binomial") 12 | expect_error( 13 | fit <- FLORAL(dat$xcount,dat$y,family="binomial",progress=FALSE,step2=TRUE), 14 | NA 15 | ) 16 | 17 | dat <- simu(n=50,p=30,model="cox") 18 | expect_error( 19 | fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d),family="cox",progress=FALSE,step2=TRUE), 20 | NA 21 | ) 22 | 23 | }) 24 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 
8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | ## CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | CXX_STD=CXX17 -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | ## CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | CXX_STD=CXX17 16 | -------------------------------------------------------------------------------- /data-raw/internal.R: -------------------------------------------------------------------------------- 1 | if (interactive()){ 2 | cols = c("SampleID", "PatientID", "Timepoint", "Consistency", "Accession", 3 | "BioProject", "DayRelativeToNearestHCT") 4 | #this file has duplicate rows, and has multiple rows per pool 5 | samples <- read.csv("https://figshare.com/ndownloader/files/33076496")[, cols] 6 | samples <- samples[!duplicated(samples),] 7 | samples <- samples[1:100,] # Using the first 100 samples only. 8 | counts <- read.csv("https://figshare.com/ndownloader/files/26393788") 9 | counts <- counts[counts$SampleID %in% samples$SampleID, ] 10 | taxonomy <- read.csv("https://figshare.com/ndownloader/files/26770997") 11 | 12 | usethis::use_data(samples,counts,taxonomy, internal = TRUE) 13 | } -------------------------------------------------------------------------------- /man/phy_to_floral_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{phy_to_floral_data} 4 | \alias{phy_to_floral_data} 5 | \title{Create data input list from phyloseq object} 6 | \usage{ 7 | phy_to_floral_data(phy, y = NULL, covariates = NULL) 8 | } 9 | \arguments{ 10 | \item{phy}{Phyloseq object} 11 | 12 | \item{y}{Outcome column of interest from phy's sample_data} 13 | 14 | \item{covariates}{Covariate column names from phy's sample_data} 15 | } 16 | \value{ 17 | list 18 | } 19 | \description{ 20 | Create data input list from phyloseq object 21 | } 22 | \examples{ 23 | library(phyloseq) 24 | data(GlobalPatterns) 25 | # add a covariate 26 | sample_data(GlobalPatterns)$test <- rep(c(1, 0), nsamples(GlobalPatterns)/2) 27 | # GlobalPatterns <- tax_glom(GlobalPatterns, "Phylum") 28 | dat <- phy_to_floral_data(GlobalPatterns, y = "test", covariates = c("SampleType")) 29 | # res <- FLORAL(x = dat$xcount, y=dat$y, ncov=dat$ncov, family = "binomial", ncv=NULL) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(FLORAL) 4 | export(a.FLORAL) 5 | 
export(mcv.FLORAL) 6 | export(phy_to_floral_data) 7 | export(simu) 8 | import(Rcpp) 9 | import(doParallel) 10 | import(doRNG) 11 | import(dplyr) 12 | import(foreach) 13 | import(ggplot2) 14 | import(glmnet) 15 | import(parallel) 16 | import(phyloseq) 17 | import(survival) 18 | importFrom(caret,createFolds) 19 | importFrom(grDevices,rainbow) 20 | importFrom(msm,dpexp) 21 | importFrom(msm,ppexp) 22 | importFrom(msm,rpexp) 23 | importFrom(mvtnorm,rmvnorm) 24 | importFrom(reshape,melt) 25 | importFrom(stats,binomial) 26 | importFrom(stats,dist) 27 | importFrom(stats,gaussian) 28 | importFrom(stats,glm) 29 | importFrom(stats,median) 30 | importFrom(stats,na.omit) 31 | importFrom(stats,rbinom) 32 | importFrom(stats,rexp) 33 | importFrom(stats,rmultinom) 34 | importFrom(stats,rnorm) 35 | importFrom(stats,runif) 36 | importFrom(stats,sd) 37 | importFrom(stats,step) 38 | importFrom(survcomp,concordance.index) 39 | importFrom(utils,combn) 40 | useDynLib(FLORAL) 41 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | #pull_request: 7 | #branches: [main, master] 8 | 9 | name: pkgdown 10 | 11 | jobs: 12 | pkgdown: 13 | runs-on: ubuntu-latest 14 | # Only restrict concurrency for non-PR jobs 15 | concurrency: 16 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 17 | env: 18 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 19 | permissions: 20 | contents: write 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - uses: r-lib/actions/setup-pandoc@v2 25 | 26 | - uses: r-lib/actions/setup-r@v2 27 | with: 28 | use-public-rspm: true 29 | 30 | - uses: r-lib/actions/setup-r-dependencies@v2 31 | with: 32 | extra-packages: any::pkgdown, local::. 33 | needs: website 34 | 35 | - name: Build site 36 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 37 | shell: Rscript {0} 38 | 39 | - name: Deploy package 40 | run: | 41 | git config --local user.name "$GITHUB_ACTOR" 42 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 43 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 
14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: FLORAL 2 | Type: Package 3 | Title: Fit Log-Ratio Lasso Regression for Compositional Data 4 | Version: 0.6.0 5 | Date: 2025-07-13 6 | Authors@R: c( 7 | person("Teng", "Fei", , "feit1@mskcc.org", role = c("aut", "cre", "cph"), 8 | comment = c(ORCID = "0000-0001-7888-1715")), 9 | person("Tyler", "Funnell", role=c("aut"), 10 | comment = c(ORCID = "0000-0003-1612-5644")), 11 | person("Nicholas", "Waters", role=c("aut"), 12 | comment = c(ORCID = "0000-0002-9035-2143")), 13 | person("Sandeep", "Raj", role=c("aut"), 14 | comment = c(ORCID = "0000-0003-4629-0528")) 15 | ) 16 | Description: Log-ratio Lasso regression for continuous, binary, and survival outcomes with (longitudinal) compositional features. See Fei and others (2024) . 17 | License: GPL (>= 3) 18 | URL: https://vdblab.github.io/FLORAL/ 19 | BugReports: https://github.com/vdblab/FLORAL/issues 20 | Depends: 21 | R (>= 3.5.0) 22 | SystemRequirements: C++17 23 | biocViews: 24 | Imports: 25 | Rcpp (>= 1.0.9), 26 | stats, 27 | survival, 28 | ggplot2, 29 | survcomp, 30 | reshape, 31 | dplyr, 32 | glmnet, 33 | caret, 34 | grDevices, 35 | utils, 36 | mvtnorm, 37 | doParallel, 38 | doRNG, 39 | foreach, 40 | msm, 41 | phyloseq 42 | LinkingTo: Rcpp, RcppArmadillo, RcppProgress 43 | RoxygenNote: 7.3.2 44 | Encoding: UTF-8 45 | Suggests: 46 | covr, 47 | knitr, 48 | rmarkdown, 49 | spelling, 50 | testthat (>= 3.0.0), 51 | patchwork, 52 | tidyverse 53 | Language: en-US 54 | Config/testthat/edition: 3 55 | VignetteBuilder: knitr 56 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **What changes are proposed in this pull request?** 2 | 3 | 4 | **If there is an GitHub issue associated with this pull request, please provide link.** 5 | 6 | 7 | -------------------------------------------------------------------------------- 8 | 9 | Checklist for PR reviewer 10 | 11 | - [ ] PR branch has pulled the most recent updates from master branch. Ensure the pull request branch and your local version match and both have the latest updates from the master branch. 12 | - [ ] If a new function was added, function included in `_pkgdown.yml` 13 | - [ ] If a bug was fixed, a unit test was added for the bug check 14 | - [ ] Run `pkgdown::build_site()`. Check the R console for errors, and review the rendered website. 15 | - [ ] Code coverage is suitable for any new functions/features. Review coverage with `covr::report()`. Before you run, set `Sys.setenv(NOT_CRAN="true")` and begin in a fresh R session without any packages loaded. 
16 | - [ ] R CMD check runs without errors, warnings, and notes 17 | - [ ] `usethis::use_spell_check()` runs with no spelling errors in documentation 18 | 19 | When the branch is ready to be merged into master: 20 | - [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# FLORAL (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end of the update. 21 | - [ ] Increment the version number using `usethis::use_version(which = "dev")` 22 | - [ ] Run `usethis::use_spell_check()` again 23 | - [ ] Approve Pull Request 24 | - [ ] Merge the PR. Please use "Squash and merge". 25 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # FLORAL 0.6.0 2 | 3 | * Removing dependencies with R package `ast2ast`. 4 | 5 | # FLORAL 0.5.0 6 | 7 | * Adding instructions to install the package via `bioconda`. 8 | 9 | * Address new CRAN requirements regarding Internet access. 10 | 11 | # FLORAL 0.4.0 12 | 13 | * Adding a function `phy_to_floral_data` which helps convert a `phyloseq` object to be compatible with `FLORAL`. 14 | 15 | * Adding a corresponding vignette for `phy_to_floral_data`. 16 | 17 | * Introducing the new GEE model for longitudinal continuous and binary outcomes. More documentation to follow in the next development version. 18 | 19 | # FLORAL 0.3.0 20 | 21 | * Improves stability when fitting the Fine-Gray model with longitudinal covariates. 22 | 23 | * Enables parallel computation for cross-validation. 24 | 25 | * Fixes several bugs as reported in Issues. 26 | 27 | * Adding an option to use user-specified pseudo counts. 28 | 29 | * Adding an option to use a user-specified maximum number of iterations. 30 | 31 | * Adding a simulation scenario for survival regression models with longitudinal features. 32 | 33 | * (BETA version of) the new GEE method. 34 | 35 | # FLORAL 0.2.0 36 | 37 | * Including more examples in the documentation compared to the CRAN version (0.1.0.9000) 38 | 39 | * Enables elastic net models. Users can specify the weight of the lasso penalty using argument `a`. (0.1.0.9001) 40 | 41 | * Allows adding non-compositional covariates which are not constrained by the zero-sum constraint. (0.1.0.9001) 42 | 43 | * Adds a function `mcv.FLORAL()` to perform multiple runs of k-fold cross-validation to summarize selection probabilities for features. (0.1.0.9001) 44 | 45 | * Adds a function `a.FLORAL()` to compare different choices of elastic net weight `a` for a fixed cross-validation setting. (0.1.0.9001) 46 | 47 | # FLORAL 0.1.0 48 | 49 | * Initial release. 50 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-with-phyloseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL with phyloseq data" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL with phyloseq data} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r setup, include=FALSE} 17 | knitr::opts_chunk$set(echo = TRUE) 18 | library(tidyverse) 19 | library(phyloseq) 20 | ``` 21 | 22 | [Phyloseq](http://joey711.github.io/phyloseq/) is a popular package for working with microbiome data.
Here we show how to use the `phy_to_floral_data` helper function to convert phyloseq data into a format accepted by FLORAL. 23 | 24 | The following code downloads data described in [this paper](https://www.nature.com/articles/s41597-021-00860-8) and turns it into a phyloseq object. The `tax_glom` step here takes some time, and can be replaced with [`speedyseq::tax_glom`](https://github.com/mikemc/speedyseq) for better performance. 25 | ```{r} 26 | samples <- get0("samples", envir = asNamespace("FLORAL")) 27 | counts <- get0("counts", envir = asNamespace("FLORAL")) 28 | taxonomy <- get0("taxonomy", envir = asNamespace("FLORAL")) 29 | 30 | phy <- phyloseq( 31 | sample_data(samples %>% column_to_rownames("SampleID")), 32 | tax_table(taxonomy %>% select(ASV, Kingdom:Genus) %>% column_to_rownames("ASV") %>% as.matrix()), 33 | otu_table(counts %>% pivot_wider(names_from = "SampleID", values_from = "Count", values_fill = 0) %>% column_to_rownames("ASV") %>% as.matrix(), taxa_are_rows = TRUE) 34 | ) %>% subset_samples(DayRelativeToNearestHCT > -30 & DayRelativeToNearestHCT < 0) %>% 35 | tax_glom("Genus") 36 | ``` 37 | 38 | Next, we convert that phyloseq object into a list of inputs to be used by FLORAL; we have to specify the main outcome of interest as `y`, and any metadata columns (from `sample_data(phy)`) to use as covariates. Note that the analysis described here is just an example for using the function. 39 | 40 | 41 | ```{r} 42 | dat <- FLORAL::phy_to_floral_data(phy, covariates=c("Consistency"), y = "DayRelativeToNearestHCT") 43 | ``` 44 | 45 | The resulting list has named elements for the main arguments to FLORAL: 46 | 47 | ```{r} 48 | res <- FLORAL::FLORAL(y = dat$y, x = dat$xcount, ncov = dat$ncov, family = "gaussian", ncv=NULL, progress=FALSE) 49 | ``` -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | #pull_request: 7 | # branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | # - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | # - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | # - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck,any::covr 45 | needs: check,coverage 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | - name: Test coverage 51 | if: matrix.config.os == 'ubuntu-latest' 52 | run: | 53 | covr::codecov( 54 | quiet = FALSE, 55 | clean = FALSE, 56 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 57 | ) 58 | shell: Rscript {0} 59 | 60 | - name: Show testthat output 61 | if: matrix.config.os == 'ubuntu-latest' 62 | run: | 63 | ## -------------------------------------------------------------------- 64 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 65 | shell: bash 66 | 67 | - name: Upload test results 68 | if: failure() && matrix.config.os == 'ubuntu-latest' 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: coverage-test-failures 72 | path: ${{ runner.temp }}/package 73 | -------------------------------------------------------------------------------- /R/coxsplit.R: -------------------------------------------------------------------------------- 1 | coxsplity=function(y, nfolds){ 2 | N=nrow(y) 3 | tem=data.frame(y, i=seq(N), foldid=0) 4 | tem=tem[order(y[, "time"], y[, "status"]), ] 5 | n1=sum(y[, "status"]);n2=N-n1 6 | 7 | tem$foldid[tem[, "status"]==1]=sample(rep(seq(nfolds), length=n1)) 8 | tem$foldid[tem[, "status"]==0]=sample(rep(seq(nfolds), length=n2)) 9 | 10 | foldid=tem$foldid[order(tem$i)] 11 | return(foldid) 12 | } 13 | 14 | coxsplitss=function(y, id, nfolds){ 15 | full = data.frame(y, foldid=0, id=id) 16 | tem = full %>% dplyr::group_by(.data$id) %>% dplyr::filter(row_number()==n()) 17 | N=nrow(tem) 18 | tem$i = seq(N) 19 | tem=tem[order(tem$stop, tem$status), ] 20 | n1=sum(y[, "status"]);n2=N-n1 21 | # tem = as.matrix(tem) 22 | 23 | tem$foldid[tem$status==1]=sample(rep(seq(nfolds), length=n1)) 24 | tem$foldid[tem$status==0]=sample(rep(seq(nfolds), length=n2)) 25 | 26 | temif <- tem %>% select(.data$foldid,.data$id) #data.frame(tem[,c("foldid","id")]) 27 | full <- full %>% select(.data$start,.data$stop,.data$status,.data$id) %>% left_join(temif,by="id") 28 | 29 | foldid <- full$foldid 30 | 31 | return(foldid) 32 | } 33 | 34 | fgfoldid=function(id, foldid){ 35 | idfoldid <- data.frame(id=unique(id),foldid=foldid) 36 | yfoldid <- data.frame(id=id) 37 | mergedid <- yfoldid %>% left_join(idfoldid,by="id") 38 | 39 | foldid <- mergedid$foldid 40 | return(foldid) 41 | } 42 | 43 | binsimuar1 <- function(mu,gamma){ 44 | 45 | # 
simulate correlated binary variables using method by Qaqish (2003) with AR(1) corstr 46 | 47 | len <- length(mu) 48 | y <- rep(0,len) 49 | y[1] <- rbinom(1,1,mu[1]) 50 | for (i in 2:len){ 51 | lambda <- mu[i] + gamma*(y[i-1] - mu[i-1])*sqrt((mu[i]*(1-mu[i]))/(mu[i-1]*(1-mu[i-1]))) 52 | if (lambda < 0) lambda = 0.0001 53 | if (lambda > 1) lambda = 0.9999 54 | y[i] <- rbinom(1,1,lambda) 55 | } 56 | y 57 | } 58 | 59 | binsimuexch <- function(mu,gamma){ 60 | 61 | # simulate correlated binary variables using method by Qaqish (2003) with exchangeable corstr 62 | 63 | len <- length(mu) 64 | y <- rep(0,len) 65 | y[1] <- rbinom(1,1,mu[1]) 66 | 67 | for (i in 2:len){ 68 | 69 | lambda <- mu[i] 70 | 71 | for (j in 1:(i-1)){ 72 | 73 | lambda <- lambda + (gamma/(1+(i-2)*gamma)) * sqrt((mu[i]*(1-mu[i]))/(mu[j]*(1-mu[j]))) * (y[j]-mu[j]) 74 | 75 | } 76 | 77 | if (lambda < 0) lambda = 0.0001 78 | if (lambda > 1) lambda = 0.9999 79 | y[i] <- rbinom(1,1,lambda) 80 | } 81 | y 82 | } 83 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | softthreshold <- function(x, lambda) { 5 | .Call('_FLORAL_softthreshold', PACKAGE = 'FLORAL', x, lambda) 6 | } 7 | 8 | gd_cov <- function(xx, xy, n, l, beta) { 9 | .Call('_FLORAL_gd_cov', PACKAGE = 'FLORAL', xx, xy, n, l, beta) 10 | } 11 | 12 | gd_cov_al <- function(xx, xy, n, l, a, beta, mu, alpha, adjust = FALSE, ncov = 0L, wcov = 0) { 13 | .Call('_FLORAL_gd_cov_al', PACKAGE = 'FLORAL', xx, xy, n, l, a, beta, mu, alpha, adjust, ncov, wcov) 14 | } 15 | 16 | linear_enet_al <- function(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress = TRUE) { 17 | .Call('_FLORAL_linear_enet_al', PACKAGE = 'FLORAL', x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress) 18 | } 19 | 20 | logistic_enet_al <- function(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress = TRUE, loop1 = FALSE, loop2 = FALSE) { 21 | .Call('_FLORAL_logistic_enet_al', PACKAGE = 'FLORAL', x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress, loop1, loop2) 22 | } 23 | 24 | cox_enet_al <- function(x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE, loop1 = FALSE, loop2 = FALSE, notcv = TRUE) { 25 | .Call('_FLORAL_cox_enet_al', PACKAGE = 'FLORAL', x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress, loop1, loop2, notcv) 26 | } 27 | 28 | cox_timedep_enet_al <- function(x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE) { 29 | .Call('_FLORAL_cox_timedep_enet_al', PACKAGE = 'FLORAL', x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress) 30 | } 31 | 32 | fg_enet_al <- function(x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE) { 33 | .Call('_FLORAL_fg_enet_al', PACKAGE = 'FLORAL', x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress) 34 | } 35 | 36 | gee_NR <- function(N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps = 1e-6, muu = 1e6) { 37 | .Call('_FLORAL_gee_NR', PACKAGE = 'FLORAL', N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps, muu) 38 | } 39 | 40 | gee_cor <- 
function(N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue = 1) { 41 | .Call('_FLORAL_gee_cor', PACKAGE = 'FLORAL', N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue) 42 | } 43 | 44 | gee_fit <- function(y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol = 1e-3, eps = 1e-6, muu = 1e6, maxiter1 = 100L, maxiter2 = 10L, scalefix = FALSE, scalevalue = 1, display_progress = TRUE) { 45 | .Call('_FLORAL_gee_fit', PACKAGE = 'FLORAL', y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol, eps, muu, maxiter1, maxiter2, scalefix, scalevalue, display_progress) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # #' Clean covariate Columns 2 | # #'.This converts categorical data to 1-hot encoded columns, and retains numeric column 3 | # #' @param df Data.frame of covariates for this dataset 4 | # #' @param cols columns to clean; columns not mentioned here are dropped 5 | # #' @param drop_first default TRUE; drop first level of encoding for 1-hot 6 | # #' 7 | # #' @return `data.frame` of recoded covariates 8 | # #' 9 | # #' @examples 10 | # #' df = data.frame(x=c("A", "B","C", "A", "B")) 11 | # #' clean_covariate_columns(df,cols="x") 12 | # #' clean_covariate_columns(df,cols="x", drop_first=FALSE) 13 | 14 | clean_covariate_columns <- function(df, cols, drop_first = TRUE){ 15 | result <- data.frame(placeholder = character(nrow(df))) 16 | for (col in cols) { 17 | # keep numeric asis 18 | if (is.numeric(df[, col])){ 19 | result = cbind(result, df[, col]) 20 | colnames(result)[-1] <- col 21 | } else { 22 | vals <- unique(df[, col]) 23 | valnames = paste0(col, "_", vals) 24 | if (drop_first) { 25 | vals <- vals[-1] 26 | valnames = valnames[-1] 27 | } 28 | res <- data.frame(ifelse(vals[1] == df[, col], 1, 0)) 29 | if (length(vals) > 1){ 30 | for (val_i in 2:length(vals)){ 31 | res = cbind(res, data.frame(ifelse(vals[val_i] == df[, col], 1, 0))) 32 | } 33 | } 34 | colnames(res) = valnames 35 | result <- cbind(result, res) 36 | } 37 | } 38 | #drop placeholder 39 | result <- result[, !colnames(result) == "placeholder"] 40 | # deal with single covariates where results becomes a vector 41 | if (!is.data.frame(result)){ 42 | result = data.frame(result) 43 | colnames(result) <- cols 44 | } 45 | return(result) 46 | } 47 | 48 | #' Create data input list from phyloseq object 49 | #' 50 | #' @param phy Phyloseq object 51 | #' @param y Outcome column of interest from phy's sample_data 52 | #' @param covariates Covariate column names from phy's sample_data 53 | #' 54 | #' @return list 55 | #' @export 56 | #' 57 | #' @examples 58 | #' library(phyloseq) 59 | #' data(GlobalPatterns) 60 | #' # add a covariate 61 | #' sample_data(GlobalPatterns)$test <- rep(c(1, 0), nsamples(GlobalPatterns)/2) 62 | #' # GlobalPatterns <- tax_glom(GlobalPatterns, "Phylum") 63 | #' dat <- phy_to_floral_data(GlobalPatterns, y = "test", covariates = c("SampleType")) 64 | #' # res <- FLORAL(x = dat$xcount, y=dat$y, ncov=dat$ncov, family = "binomial", ncv=NULL) 65 | #' 66 | #' @import phyloseq 67 | 68 | phy_to_floral_data<- function(phy, y=NULL, covariates=NULL){ 69 | 70 | xcount = otu_table(phy) 71 | if (nrow(xcount) != nsamples(phy)){ 72 | # support both phyloseq objects with taxa as rows or columns 73 | xcount = t(xcount) 74 | } 75 | if(any(rownames(xcount) != sample_names(phy))){ 76 | stop("malformed phyloseq 
object; columns of otu_table do not match sample IDs") 77 | } 78 | ncov = 0 79 | sampdat = sample_data(phy) %>% data.frame() 80 | yres = sampdat %>% pull(all_of(y)) 81 | if (!missing(covariates)){ 82 | cov_df <- sampdat %>% select(all_of(covariates)) 83 | cov_df_clean <- clean_covariate_columns(df=cov_df, cols = covariates) 84 | ncov = ncol(cov_df_clean) 85 | # as.matrix here speeds things significantly 86 | xcount = cbind(as.matrix(cov_df_clean), xcount ) 87 | } 88 | return(list("xcount" = xcount, "ncov"=ncov, "y"=yres)) 89 | 90 | } 91 | 92 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-for-survival-models-with-longitudinal-microbiome-data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL for survival models with longitudinal microbiome data" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL for survival models with longitudinal microbiome data} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | out.width = "100%" 21 | ) 22 | ``` 23 | 24 | ```{r setup, warning=FALSE, message=FALSE} 25 | library(FLORAL) 26 | library(dplyr) 27 | library(patchwork) 28 | library(survival) 29 | set.seed(8192024) 30 | ``` 31 | 32 | In this vignette, we illustrate how to apply `FLORAL` to fit a Cox model with longitudinal microbiome data. Due to limited availability of public data sets with survival information, we use simulated data for illustrative purposes. 33 | 34 | ## Data simulation 35 | We will use the built-in simulation function `simu()` to generate longitudinal compositional features and the corresponding time-to-event. The underlying methodology used for the simulation is based on a piece-wise exponential distribution as described by [Hendry 2014](https://doi.org/10.1002/sim.5945). 36 | 37 | By default, the first 10 features out of the 500 features simulated below are associated with the time-to-event. 38 | 39 | ```{r simulation} 40 | 41 | simdat <- simu(n=200, # sample size 42 | p=500, # number of features 43 | model="timedep", 44 | pct.sparsity = 0.8, # proportion of zeros 45 | rho=0, # feature-wise correlation 46 | longitudinal_stability = TRUE # choose to simulate longitudinal features with stable trajectories 47 | ) 48 | 49 | ``` 50 | 51 | With the simulated data, the log-ratio lasso Cox model with time-dependent features can be fitted by calling `FLORAL()` as shown below. Here we provide a detailed description of each argument: 52 | 53 | * First of all, please set `longitudinal = TRUE` so that the algorithm uses the appropriate method to handle longitudinal data. 54 | * The feature matrix input `x` should be the count matrix where rows specify samples and columns specify features. 55 | * The vector of IDs of subjects/patients corresponding to the rows of `x` should be input as `id`. 56 | * The vector of sample collection times corresponding to the rows of `x` should be input as `tobs`. 57 | * The `Surv` object (`Surv(time,status)`) of **unique patients** should be input as `y`. Please note that the survival data should be sorted with respect to the IDs specified in `id`.
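Before fitting the model below, here is a minimal, hypothetical sketch of how these inputs could be assembled from a patient-level table and a sample-level table. The object and column names used here (`meta`, `samples`, `PatientID`, `SampleTime`, `time`, `status`, `TaxonA`, `TaxonB`) are placeholders for illustration only and are not part of the simulated data used in this vignette.

```{r input-prep-sketch, eval=FALSE}
# Hypothetical sketch (not run): `meta` has one row per patient with the
# survival time and status; `samples` has one row per collected sample with
# the patient ID, collection time, and taxa counts. All names are placeholders.
meta <- data.frame(PatientID = c("P1", "P2"),
                   time = c(120, 340),
                   status = c(1, 0))
samples <- data.frame(PatientID = c("P1", "P1", "P2"),
                      SampleTime = c(0, 30, 0),
                      TaxonA = c(10, 0, 25),
                      TaxonB = c(3, 8, 0))

# Sort patients by ID, and samples by ID and then collection time,
# so that the rows of `x` align with `id`, `tobs`, and the order of `y`.
meta <- meta[order(meta$PatientID), ]
samples <- samples[order(samples$PatientID, samples$SampleTime), ]

y <- survival::Surv(meta$time, meta$status)       # one entry per unique patient
id <- samples$PatientID                           # subject ID for each row of x
tobs <- samples$SampleTime                        # collection time for each row of x
x <- as.matrix(samples[, c("TaxonA", "TaxonB")])  # longitudinal count matrix

# fit <- FLORAL(x = x, y = y, family = "cox", longitudinal = TRUE,
#               id = id, tobs = tobs)
```

The simulated objects used below play the same roles: `simdat$xcount` is `x`, `simdat$data$id` and `simdat$data$t0` supply `id` and `tobs`, and `simdat$data_unique` provides one survival record per patient for `y`.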
58 | 59 | ```{r FLORAL, warning=FALSE, message=FALSE} 60 | 61 | fit <- FLORAL(x=simdat$xcount, 62 | y=Surv(simdat$data_unique$t,simdat$data_unique$d), 63 | family="cox", 64 | longitudinal = TRUE, 65 | id = simdat$data$id, 66 | tobs = simdat$data$t0, 67 | progress=FALSE, 68 | plot=TRUE) 69 | 70 | fit$selected 71 | 72 | ``` 73 | 74 | The list of selected features is saved in `fit$selected` as shown above. 75 | 76 | To appropriately prepare the data in practice, we have the following recommendations: 77 | 78 | * Start with patient metadata which includes survival data (time and status), sorting the metadata by patient IDs. Extract time and status variables for the `Surv` object for input as `y`. 79 | * Curate the microbiome feature data matrix, sorted by patient IDs and time of sample collection. Save the patient ID and time of sample collection vectors for `id` and `tobs`. Save the feature table for input as `x`. -------------------------------------------------------------------------------- /man/simu.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simu.R 3 | \name{simu} 4 | \alias{simu} 5 | \title{Simulate data following log-ratio model} 6 | \usage{ 7 | simu( 8 | n = 100, 9 | p = 200, 10 | model = "linear", 11 | weak = 4, 12 | strong = 6, 13 | weaksize = 0.125, 14 | strongsize = 0.25, 15 | pct.sparsity = 0.5, 16 | rho = 0, 17 | timedep_slope = NULL, 18 | timedep_cor = NULL, 19 | geetype = "gaussian", 20 | m = 4, 21 | corstr = "exchangeable", 22 | sdvec = NULL, 23 | rhogee = 0.8, 24 | geeslope = 2.5, 25 | longitudinal_stability = TRUE, 26 | ncov = 0, 27 | betacov = 0, 28 | intercept = FALSE 29 | ) 30 | } 31 | \arguments{ 32 | \item{n}{An integer of sample size} 33 | 34 | \item{p}{An integer of number of features (taxa).} 35 | 36 | \item{model}{Type of models associated with outcome variable, can be "linear", "binomial", "cox", "finegray", "gee" (scalar outcome with time-dependent features), or "timedep" (survival endpoint with time-dependent features).} 37 | 38 | \item{weak}{Number of features with \code{weak} effect size.} 39 | 40 | \item{strong}{Number of features with \code{strong} effect size.} 41 | 42 | \item{weaksize}{Actual effect size for \code{weak} effect size. Must be positive.} 43 | 44 | \item{strongsize}{Actual effect size for \code{strong} effect size. Must be positive.} 45 | 46 | \item{pct.sparsity}{Percentage of zero counts for each sample.} 47 | 48 | \item{rho}{Parameter controlling the correlated structure between taxa. Ranges between 0 and 1.} 49 | 50 | \item{timedep_slope}{If \code{model} is "timedep", this parameter specifies the slope for the feature trajectories. Please refer to the Simulation section of the manuscript for more details.} 51 | 52 | \item{timedep_cor}{If \code{model} is "timedep", this parameter specifies the sample-wise correlations between longitudinal features. Please refer to the Simulation section of the manuscript for more details.} 53 | 54 | \item{geetype}{If \code{model} is "gee", \code{geetype} is the type of GEE outcomes. Now support "gaussian" and "binomial".} 55 | 56 | \item{m}{If \code{model} is "gee", \code{m} is the number of repeated measurements per subject.} 57 | 58 | \item{corstr}{If \code{model} is "gee", \code{corstr} is the working correlation structure. 
Now support "independence", "exchangeable", and "AR-1".} 59 | 60 | \item{sdvec}{If \code{model} is "gee" and \code{geetype} is "gaussian", \code{sdvec} is the vector of standard deviations of each outcome variable.} 61 | 62 | \item{rhogee}{If \code{model} is "gee", \code{rhogee} is the correlation parameter between longitudinal outcomes under the selected working correlation structure.} 63 | 64 | \item{geeslope}{If \code{model} is "gee", \code{geeslope} is the linear time effect.} 65 | 66 | \item{longitudinal_stability}{If \code{model} is "timedep", this is a binary indicator which determines whether the trajectories are more stable (\code{TRUE}) or more volatile (\code{FALSE}).} 67 | 68 | \item{ncov}{Number of covariates that are not compositional features.} 69 | 70 | \item{betacov}{Coefficients corresponding to the covariates that are not compositional features.} 71 | 72 | \item{intercept}{Boolean. If TRUE, then a random intercept will be generated in the model. Only works for \code{linear} or \code{binomial} models.} 73 | } 74 | \value{ 75 | A list with simulated count matrix \code{xcount}, log1p-transformed count matrix \code{x}, outcome (continuous \code{y}, continuous centered \code{y0}, binary \code{y}, or survival \code{t}, \code{d}), true coefficient vector \code{beta}, list of non-zero features \code{idx}, value of intercept \code{intercept} (if applicable). 76 | } 77 | \description{ 78 | Simulate a dataset from log-ratio model. 79 | } 80 | \examples{ 81 | 82 | set.seed(23420) 83 | dat <- simu(n=50,p=30,model="linear") 84 | 85 | } 86 | \references{ 87 | Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 88 | } 89 | \author{ 90 | Teng Fei. Email: feit1@mskcc.org 91 | } 92 | -------------------------------------------------------------------------------- /man/a.FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{a.FLORAL} 4 | \alias{a.FLORAL} 5 | \title{Comparing prediction performances under different choices of weights for lasso/ridge penalty} 6 | \usage{ 7 | a.FLORAL( 8 | a = c(0.1, 0.5, 1), 9 | ncore = 1, 10 | seed = NULL, 11 | x, 12 | y, 13 | ncov = 0, 14 | family = "gaussian", 15 | longitudinal = FALSE, 16 | id = NULL, 17 | tobs = NULL, 18 | failcode = NULL, 19 | corstr = "exchangeable", 20 | scalefix = FALSE, 21 | scalevalue = 1, 22 | pseudo = 1, 23 | length.lambda = 100, 24 | lambda.min.ratio = NULL, 25 | ncov.lambda.weight = 0, 26 | mu = 1, 27 | pfilter = 0, 28 | maxiter = 100, 29 | ncv = 5, 30 | intercept = FALSE, 31 | step2 = FALSE, 32 | progress = TRUE 33 | ) 34 | } 35 | \arguments{ 36 | \item{a}{vector of scalars between 0 and 1 for comparison.} 37 | 38 | \item{ncore}{Number of cores used for parallel computation. Default is to use only 1 core.} 39 | 40 | \item{seed}{A random seed for reproducibility of the results. By default the seed is the numeric form of \code{Sys.Date()}.} 41 | 42 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 43 | 44 | \item{y}{Outcome. 
For a continuous or binary outcome, \code{y} is a vector. For survival outcome, \code{y} is a \code{Surv} object.} 45 | 46 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 47 | 48 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 49 | 50 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 51 | 52 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 53 | 54 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 55 | 56 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 57 | 58 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 59 | 60 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 61 | 62 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 63 | 64 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation} 65 | 66 | \item{length.lambda}{Number of penalty parameters used in the path} 67 | 68 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 69 | 70 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 71 | 72 | \item{mu}{Value of penalty for the augmented Lagrangian} 73 | 74 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 75 | 76 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 77 | 78 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 79 | 80 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 81 | 82 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. Will only be performed if cross validation is enabled.} 83 | 84 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 85 | } 86 | \value{ 87 | A \code{ggplot2} object of cross-validated prediction metric versus \code{lambda}, stratified by \code{a}. Detailed data can be retrieved from the \code{ggplot2} object itself. 
88 | } 89 | \description{ 90 | Summarizing \code{FLORAL} outputs from various choices of \code{a} 91 | } 92 | \examples{ 93 | 94 | set.seed(23420) 95 | 96 | dat <- simu(n=50,p=30,model="linear") 97 | pmetric <- a.FLORAL(a=c(0.1,1),ncore=1,x=dat$xcount,y=dat$y,family="gaussian",ncv=2,progress=FALSE) 98 | 99 | } 100 | \references{ 101 | Fei T, Funnell T, Waters N, Raj SS et al. Scalable Log-ratio Lasso Regression Enhances Microbiome Feature Selection for Predictive Models. bioRxiv 2023.05.02.538599. 102 | } 103 | \author{ 104 | Teng Fei. Email: feit1@mskcc.org 105 | } 106 | -------------------------------------------------------------------------------- /man/mcv.FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{mcv.FLORAL} 4 | \alias{mcv.FLORAL} 5 | \title{Summarizing selected compositional features over multiple cross validations} 6 | \usage{ 7 | mcv.FLORAL( 8 | mcv = 10, 9 | ncore = 1, 10 | seed = NULL, 11 | x, 12 | y, 13 | ncov = 0, 14 | family = "gaussian", 15 | longitudinal = FALSE, 16 | id = NULL, 17 | tobs = NULL, 18 | failcode = NULL, 19 | corstr = "exchangeable", 20 | scalefix = FALSE, 21 | scalevalue = 1, 22 | pseudo = 1, 23 | length.lambda = 100, 24 | lambda.min.ratio = NULL, 25 | ncov.lambda.weight = 0, 26 | a = 1, 27 | mu = 1, 28 | pfilter = 0, 29 | maxiter = 100, 30 | ncv = 5, 31 | intercept = FALSE, 32 | step2 = TRUE, 33 | progress = TRUE, 34 | plot = TRUE 35 | ) 36 | } 37 | \arguments{ 38 | \item{mcv}{Number of random `ncv`-fold cross-validation to be performed.} 39 | 40 | \item{ncore}{Number of cores used for parallel computation. Default is to use only 1 core.} 41 | 42 | \item{seed}{A random seed for reproducibility of the results. By default the seed is the numeric form of \code{Sys.Date()}.} 43 | 44 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 45 | 46 | \item{y}{Outcome. For a continuous or binary outcome, \code{y} is a vector. For survival outcome, \code{y} is a \code{Surv} object.} 47 | 48 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 49 | 50 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 51 | 52 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 53 | 54 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 55 | 56 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 57 | 58 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 59 | 60 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. 
Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 61 | 62 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 63 | 64 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 65 | 66 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation} 67 | 68 | \item{length.lambda}{Number of penalty parameters used in the path} 69 | 70 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 71 | 72 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 73 | 74 | \item{a}{A scalar between 0 and 1: \code{a} is the weight for lasso penalty while \code{1-a} is the weight for ridge penalty.} 75 | 76 | \item{mu}{Value of penalty for the augmented Lagrangian} 77 | 78 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 79 | 80 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 81 | 82 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 83 | 84 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 85 | 86 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. Will only be performed if cross validation is enabled.} 87 | 88 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 89 | 90 | \item{plot}{\code{TRUE} or \code{FALSE}, indicating whether returning summary plots of selection probability for taxa features.} 91 | } 92 | \value{ 93 | A list with relative frequencies of a certain feature being selected over \code{mcv} \code{ncv}-fold cross-validations. 94 | } 95 | \description{ 96 | Summarizing \code{FLORAL} outputs from multiple random k-fold cross validations 97 | } 98 | \examples{ 99 | 100 | set.seed(23420) 101 | 102 | dat <- simu(n=50,p=30,model="linear") 103 | fit <- mcv.FLORAL(mcv=2,ncore=1,x=dat$xcount,y=dat$y,ncv=2,progress=FALSE,step2=TRUE,plot=FALSE) 104 | 105 | } 106 | \references{ 107 | Fei T, Funnell T, Waters N, Raj SS et al. Scalable Log-ratio Lasso Regression Enhances Microbiome Feature Selection for Predictive Models. bioRxiv 2023.05.02.538599. 108 | } 109 | \author{ 110 | Teng Fei. 
Email: feit1@mskcc.org 111 | } 112 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | test_that("encoding covariates works", { 2 | dat = data.frame( 3 | colA=1:5, 4 | colB=c("a", "b", "a", "c", "b"), 5 | colC=factor(c("a", "b", "a", "c", "b"))) 6 | res = data.frame( 7 | colA=1:5, 8 | colB_a=c(1, 0, 1, 0, 0), 9 | colB_b=c(0, 1, 0, 0, 1), 10 | colB_c=c(0, 0, 0, 1, 0), 11 | colC_a=c(1, 0, 1, 0, 0), 12 | colC_b=c(0, 1, 0, 0, 1), 13 | colC_c=c(0, 0, 0, 1, 0) 14 | ) 15 | expect_equal(res, FLORAL:::clean_covariate_columns(dat, cols = colnames(dat), drop_first = FALSE)) 16 | } 17 | ) 18 | 19 | 20 | test_that("FLORAL() works from phy", { 21 | require(phyloseq) 22 | testphy <- new( 23 | "phyloseq", 24 | otu_table = new( 25 | "otu_table", 26 | .Data = structure( 27 | c(14L, 28 | 150L, 260L, 434L, 79L, 287L, 361L, 8L, 42L, 501L, 89L, 8L, 29L, 29 | 2629L, 154L, 200L, 865L, 694L, 217L, 144L, 0L, 0L, 0L, 0L, 0L, 30 | 10L, 0L, 0L, 521L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 46L, 0L, 0L, 31 | 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32 | 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 436L, 0L, 0L, 0L, 0L, 33 | 0L, 291L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 34 | 0L, 0L, 0L, 0L, 0L, 0L, 30L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 35 | 0L, 0L, 37L, 65L, 59L, 0L, 106L, 388L, 61L, 26L, 0L, 0L, 0L, 36 | 0L, 0L, 24L, 0L, 0L, 0L), dim = c(20L, 6L), 37 | dimnames = list(c("ASV_1216", 38 | "ASV_12580", "ASV_12691", "ASV_135", "ASV_147", "ASV_15", "ASV_16", 39 | "ASV_184", "ASV_19", "ASV_20", "ASV_21", "ASV_2202", "ASV_239", 40 | "ASV_25", "ASV_253", "ASV_260", "ASV_29", "ASV_3005", "ASV_302", 41 | "ASV_3048"), c("1000A", "1000B", "1000C", "1000D", "1000E", "1001" 42 | ))), taxa_are_rows = TRUE), 43 | tax_table = new("taxonomyTable", 44 | .Data = structure( 45 | c("Bacteria", "Bacteria", "Bacteria", "Bacteria", 46 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 47 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 48 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 49 | "Bacteria", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 50 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 51 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 52 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 53 | "Firmicutes", "Clostridia", "Clostridia", "Clostridia", "Clostridia", 54 | "Clostridia", "Clostridia", "Bacilli", "Clostridia", "Erysipelotrichia", 55 | "Clostridia", "Clostridia", "Clostridia", "Bacilli", "Clostridia", 56 | "Erysipelotrichia", "Clostridia", "Clostridia", "Erysipelotrichia", 57 | "Clostridia", "Clostridia", "Clostridiales", "Clostridiales", 58 | "Clostridiales", "Clostridiales", "Clostridiales", "Clostridiales", 59 | "Lactobacillales", "Clostridiales", "Erysipelotrichales", 60 | "Clostridiales", "Clostridiales", "Clostridiales", "Lactobacillales", 61 | "Clostridiales", "Erysipelotrichales", "Clostridiales", "Clostridiales", 62 | "Erysipelotrichales", "Clostridiales", "Clostridiales", "Lachnospiraceae", 63 | "Lachnospiraceae", "Lachnospiraceae", "Lachnospiraceae", 64 | "Lachnospiraceae", "Lachnospiraceae", "Streptococcaceae", 65 | "Lachnospiraceae", "Erysipelotrichaceae", "Lachnospiraceae", 66 | "Lachnospiraceae", "Ruminococcaceae", "Streptococcaceae", 67 | "Ruminococcaceae", "Erysipelotrichaceae", "Lachnospiraceae", 68 | "Lachnospiraceae", 
"Erysipelotrichaceae", "Lachnospiraceae", 69 | "Lachnospiraceae", "", "Blautia", "", 70 | "Blautia", "Anaerostipes", "Sellimonas", "Streptococcus", 71 | "Eisenbergiella", "[Clostridium] innocuum group", "", 72 | "Lachnoclostridium", "", "Lactococcus", "Subdoligranulum", 73 | "Candidatus Stoquefichus", "Blautia", "[Ruminococcus] gnavus group", 74 | "Faecalitalea", "Blautia", ""), dim = c(20L, 6L), 75 | dimnames = list( 76 | c("ASV_1216", "ASV_12580", "ASV_12691", 77 | "ASV_135", "ASV_147", "ASV_15", "ASV_16", "ASV_184", "ASV_19", 78 | "ASV_20", "ASV_21", "ASV_2202", "ASV_239", "ASV_25", "ASV_253", 79 | "ASV_260", "ASV_29", "ASV_3005", "ASV_302", "ASV_3048"), 80 | c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus" 81 | )))), 82 | sam_data = new( 83 | "sample_data", 84 | .Data = list( 85 | c("1000", 86 | "1000", "1000", "1000", "1000", "pt_with_samples_1001_1002_1003_1004_1005_1006_1007_1008_1048_1121_152" 87 | ), 88 | c(0L, 5L, 15L, 18L, 22L, 15L), 89 | c("formed", "liquid", "liquid", "semi-formed", "formed", "formed"), 90 | c("SRR11414397", "SRR11414992", "SRR11414991", "SRR11414990", "SRR11414989", "SRR11414988"), 91 | c("PRJNA545312", "PRJNA545312", "PRJNA545312", "PRJNA545312", 92 | "PRJNA545312", "PRJNA545312"), c(-9, -4, 6, 9, 13, -6), 93 | c("", "", "", "", "", "")), 94 | names = c("PatientID", "Timepoint", "Consistency", "Accession", "BioProject", "DayRelativeToNearestHCT", 95 | "AccessionShotgun"), 96 | row.names = c("1000A", "1000B", "1000C", "1000D", "1000E", "1001"), .S3Class = "data.frame"), phy_tree = NULL, 97 | refseq = NULL) 98 | 99 | 100 | dat <- phy_to_floral_data( 101 | testphy, covariates=c("Consistency"), y = "DayRelativeToNearestHCT") 102 | 103 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",progress=FALSE,step2=TRUE, ncv = NULL) 104 | expect_true(!is.null(fit)) 105 | 106 | 107 | }) 108 | -------------------------------------------------------------------------------- /man/FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{FLORAL} 4 | \alias{FLORAL} 5 | \title{Fit Log-ratio lasso regression for compositional covariates} 6 | \usage{ 7 | FLORAL( 8 | x, 9 | y, 10 | ncov = 0, 11 | family = "gaussian", 12 | longitudinal = FALSE, 13 | id = NULL, 14 | tobs = NULL, 15 | failcode = NULL, 16 | corstr = "exchangeable", 17 | scalefix = FALSE, 18 | scalevalue = 1, 19 | pseudo = 1, 20 | length.lambda = 100, 21 | lambda.min.ratio = NULL, 22 | ncov.lambda.weight = 0, 23 | a = 1, 24 | mu = 1, 25 | pfilter = 0, 26 | maxiter = 100, 27 | ncv = 5, 28 | ncore = 1, 29 | intercept = FALSE, 30 | foldid = NULL, 31 | step2 = TRUE, 32 | progress = TRUE, 33 | plot = TRUE 34 | ) 35 | } 36 | \arguments{ 37 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 38 | 39 | \item{y}{Outcome. For a continuous or binary outcome, \code{y} is a vector. 
For survival outcome, \code{y} is a \code{Surv} object.} 40 | 41 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 42 | 43 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 44 | 45 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 46 | 47 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 48 | 49 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 50 | 51 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 52 | 53 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 54 | 55 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 56 | 57 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 58 | 59 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation. If unspecified, then the log-transformation will not be performed.} 60 | 61 | \item{length.lambda}{Number of penalty parameters used in the path} 62 | 63 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 64 | 65 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 66 | 67 | \item{a}{A scalar between 0 and 1: \code{a} is the weight for lasso penalty while \code{1-a} is the weight for ridge penalty.} 68 | 69 | \item{mu}{Value of penalty for the augmented Lagrangian} 70 | 71 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 72 | 73 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 74 | 75 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 76 | 77 | \item{ncore}{Number of cores for parallel computing for cross-validation. Default is 1.} 78 | 79 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 80 | 81 | \item{foldid}{A vector of fold indicator. Default is \code{NULL}.} 82 | 83 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. 
Will only be performed if cross validation is enabled.} 84 | 85 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 86 | 87 | \item{plot}{\code{TRUE} or \code{FALSE}, indicating whether returning plots of model fitting.} 88 | } 89 | \value{ 90 | A list with path-specific estimates (beta), path (lambda), and others. Details can be found in \code{README.md}. 91 | } 92 | \description{ 93 | Conduct log-ratio lasso regression for continuous, binary and survival outcomes. 94 | } 95 | \examples{ 96 | 97 | set.seed(23420) 98 | 99 | # Continuous outcome 100 | dat <- simu(n=50,p=30,model="linear") 101 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=2,progress=FALSE,step2=TRUE) 102 | 103 | # Binary outcome 104 | # dat <- simu(n=50,p=30,model="binomial") 105 | # fit <- FLORAL(dat$xcount,dat$y,family="binomial",progress=FALSE,step2=TRUE) 106 | 107 | # Survival outcome 108 | # dat <- simu(n=50,p=30,model="cox") 109 | # fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d),family="cox",progress=FALSE,step2=TRUE) 110 | 111 | # Competing risks outcome 112 | # dat <- simu(n=50,p=30,model="finegray") 113 | # fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d,type="mstate"),failcode=1, 114 | # family="finegray",progress=FALSE,step2=FALSE) 115 | 116 | # Longitudinal continuous outcome 117 | # dat <- simu(n=50,p=30,model="gee",geetype="gaussian",m=3,corstr="exchangeable",sdvec=rep(1,3)) 118 | # fit <- FLORAL(x=cbind(dat$tvec, dat$xcount),y=dat$y,id=dat$id,family="gaussian", 119 | # ncov=1,longitudinal = TRUE,corstr = "exchangeable",lambda.min.ratio=1e-3, 120 | # progress=FALSE,step2=FALSE) 121 | 122 | } 123 | \references{ 124 | Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 125 | } 126 | \author{ 127 | Teng Fei. Email: feit1@mskcc.org 128 | } 129 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # FLORAL: Fit LOg-RAtio Lasso regression for compositional covariates 17 | 18 | 19 | [![R-CMD-check](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml) 20 | [![CRAN status](https://www.r-pkg.org/badges/version/FLORAL)](https://cran.r-project.org/package=FLORAL) 21 | [![](https://cranlogs.r-pkg.org/badges/FLORAL)](https://cran.r-project.org/package=FLORAL) 22 | [![](https://cranlogs.r-pkg.org/badges/grand-total/FLORAL)](https://cran.r-project.org/package=FLORAL) 23 | 24 | 25 | The `FLORAL` package is an open-source computational tool to perform log-ratio lasso regression modeling and compositional feature selection for continuous, binary, time-to-event, and competing risk outcomes. The proposed method adapts the augmented Lagrangian algorithm for a zero-sum constraint optimization problem while enabling a two-stage screening process for extended false-positive control. 26 | 27 | The associated article for `FLORAL` is available on [Cell Reports Methods](https://doi.org/10.1016/j.crmeth.2024.100899). 
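(Method sketch) At a high level, and using our own notation rather than anything exported by the package, the Gaussian-family model can be thought of as an elastic-net log-contrast regression: with log-transformed counts $z_{ij} = \log(x_{ij} + \text{pseudo count})$, it approximately solves $$\hat{\beta} = \arg\min_{\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\Big(y_i-\sum_{j=1}^{p} z_{ij}\beta_j\Big)^2 + \lambda\Big(a\lVert\beta\rVert_1+\tfrac{1-a}{2}\lVert\beta\rVert_2^2\Big)\quad\text{subject to}\quad\sum_{j=1}^{p}\beta_j=0,$$ where the zero-sum constraint (handled by the augmented Lagrangian algorithm) makes the fit depend only on log-ratios between features. This display is an informal sketch for intuition; see the function documentation and the article above for the exact formulation, including unpenalized covariates and the other outcome families.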
28 | 29 | ## System requirements and installation 30 | 31 | The current version of `FLORAL` (0.5.0) was built in R version 4.4.2. R package dependencies can be found in the `DESCRIPTION` file. 32 | 33 | You can install `FLORAL` with the following code. The installation is typically complete within minutes. 34 | 35 | ``` r 36 | install.packages("FLORAL") 37 | ``` 38 | 39 | You can also install `FLORAL` using conda: 40 | 41 | ``` bash 42 | conda install -c bioconda r-floral 43 | ``` 44 | 45 | You can install the development version of `FLORAL` from [GitHub](https://github.com/) with: 46 | 47 | ``` r 48 | # install.packages("devtools") 49 | devtools::install_github("vdblab/FLORAL") 50 | ``` 51 | 52 | ## Example 53 | 54 | Here is a toy example for linear regression with 10-fold cross-validation for a simulated dataset with 50 samples and 100 compositional features. Option `progress=TRUE` can be used to show the progress bar of the running algorithm. 55 | 56 | The data simulation procedure is described in the preprint. The expected run time for the following demo is about a minute. 57 | 58 | ```{r example} 59 | set.seed(23420) 60 | library(FLORAL) 61 | 62 | dat <- simu(n=50,p=100,model="linear") 63 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=10,progress=FALSE) 64 | ``` 65 | 66 | To view plots of cross-validated prediction error and parameter coefficients, use `fit$pmse` or `fit$pcoef`: 67 | 68 | ```{r plot, echo = FALSE,out.width = "50%",fig.height=4,fig.width=6,dpi=300} 69 | fit$pmse 70 | fit$pcoef 71 | ``` 72 | 73 | To view selected compositional features, use `fit$selected.feature`, where features are sorted by their names. Features under `min` and `1se` correspond to penalty parameters $\lambda_{\min}$ and $\lambda_{\text{1se}}$, respectively. Features under `min.2stage` and `1se.2stage` are obtained after applying 2-stage filtering based on features under `min` and `1se`, respectively. 74 | 75 | We recommend interpreting the selected compositional features as potential predictive markers of the outcome in the regression model, in the sense that the cross-validated prediction error is improved by considering these selected features. 76 | 77 | ```{r feature} 78 | fit$selected.feature 79 | ``` 80 | 81 | To get specific log-ratios selected by the 2-stage procedure, use `fit$step2.ratios`, where `min` and `1se` display the log-ratios between features. For each identified ratio, `min.idx` and `1se.idx` return the column indices in the original input matrix for the two corresponding features forming the ratio. 82 | 83 | ```{r ratio} 84 | fit$step2.ratios 85 | ``` 86 | 87 | More detailed interpretations can be obtained for the selected log-ratios. First, the selected log-ratios also improve the cross-validated prediction errors because these log-ratios are derived from the constrained lasso estimate. Moreover, as guided by the association table between log-ratios and the outcome, it is possible to interpret the directions of the covariate effects associated with certain log-ratios on the outcome. To view detailed associations between selected log-ratios and the outcome, use `fit$step2.tables` to print summary tables for the multivariable stepwise regression models obtained by the 2-stage procedure. 88 | 89 | ```{r gtsummary1, eval=FALSE} 90 | fit$step2.tables$min 91 | ``` 92 | 93 | ```{r gtsummary2, eval=FALSE} 94 | fit$step2.tables$`1se` 95 | ``` 96 | 97 | For binary and survival outcomes, please specify `family="binomial"`, `family="cox"`, or `family="finegray"` accordingly. 
98 | 99 | ```{r example2, warning=FALSE} 100 | dat.bin <- simu(n=50,p=100,model="binomial") 101 | fit.bin <- FLORAL(dat.bin$xcount,dat.bin$y,family="binomial",ncv=10,progress=FALSE) 102 | 103 | dat.cox <- simu(n=50,p=100,model="cox") 104 | fit.cox <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d),family="cox",ncv=10,progress=FALSE) 105 | 106 | dat.fg <- simu(n=50,p=100,model="finegray") 107 | fit.fg <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d,type="mstate"),family="finegray",ncv=10,progress=FALSE,step2=FALSE) 108 | ``` 109 | 110 | ## Repository for Reproducibility 111 | 112 | Reproducible code for the analyses results reported in the manuscript can be found at [this repository](https://github.com/vdblab/FLORAL-analysis). 113 | 114 | ## Contributing 115 | 116 | The `FLORAL` package is jointly managed by [MSKCC Biostatistics service](https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics) and [the Marcel van den Brink Lab](https://vandenbrinklab.org/). Please note that the `FLORAL` project is released with a [Contributor Code of Conduct](https://github.com/vdblab/FLORAL/blob/master/.github/CODE_OF_CONDUCT.md). By contributing to this project, you agree to abide by its terms. Thank you to all contributors! 117 | 118 | ## Reference 119 | 120 | Fei T, Funnell T, Waters NR, Raj SS, Sadeghi K, Dai A, Miltiadous O, Shouval R, Lv M, Peled JU, Ponce DM, Perales M-A, Gönen M, van den Brink MRM, Scalable log-ratio lasso regression for enhanced microbial feature selection with FLORAL, Cell Reports Methods (2024), 100899; doi: . 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # FLORAL: Fit LOg-RAtio Lasso regression for compositional covariates 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/FLORAL)](https://cran.r-project.org/package=FLORAL) 11 | [![](https://cranlogs.r-pkg.org/badges/FLORAL)](https://cran.r-project.org/package=FLORAL) 12 | [![](https://cranlogs.r-pkg.org/badges/grand-total/FLORAL)](https://cran.r-project.org/package=FLORAL) 13 | 14 | 15 | The `FLORAL` package is an open-source computational tool to perform 16 | log-ratio lasso regression modeling and compositional feature selection 17 | for continuous, binary, time-to-event, and competing risk outcomes. The 18 | proposed method adapts the augmented Lagrangian algorithm for a zero-sum 19 | constraint optimization problem while enabling a two-stage screening 20 | process for extended false-positive control. 21 | 22 | The associated article for `FLORAL` is available on [Cell Reports 23 | Methods](https://doi.org/10.1016/j.crmeth.2024.100899). 24 | 25 | ## System requirements and installation 26 | 27 | The current version of `FLORAL` (0.5.0) was built in R version 4.4.2. R 28 | package dependencies can be found in the `DESCRIPTION` file. 29 | 30 | You can install `FLORAL` with the following code. The installation is 31 | typically complete within minutes. 
32 | 33 | ``` r 34 | install.packages("FLORAL") 35 | ``` 36 | 37 | You can also install `FLORAL` using conda: 38 | 39 | ``` bash 40 | conda install -c bioconda r-floral 41 | ``` 42 | 43 | You can install the development version of `FLORAL` from 44 | [GitHub](https://github.com/) with: 45 | 46 | ``` r 47 | # install.packages("devtools") 48 | devtools::install_github("vdblab/FLORAL") 49 | ``` 50 | 51 | ## Example 52 | 53 | Here is a toy example for linear regression with 10-fold 54 | cross-validation for a simulated data with 50 samples and 100 55 | compositional features. Option `progress=TRUE` can be used to show the 56 | progress bar of the running algorithm. 57 | 58 | The data simulation procedure is described in the preprint. The expected 59 | run time for the following demo is about a minute. 60 | 61 | ``` r 62 | set.seed(23420) 63 | library(FLORAL) 64 | 65 | dat <- simu(n=50,p=100,model="linear") 66 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=10,progress=FALSE) 67 | ``` 68 | 69 | To view plots of cross-validated prediction error and parameter 70 | coefficients, use `fit$pmse` or `fit$pcoef`: 71 | 72 | 73 | 74 | To view selected compositional features, use `fit$selected.feature`, 75 | where features are sorted by their names. Features under `min` and `1se` 76 | correspond to penalty parameter $\lambda_{\min}$ and 77 | $\lambda_{\text{1se}}$, respectively. Features under `min.2stage` and 78 | `1se.2stage` are obtained after applying 2-stage filtering based on 79 | features under `min` and `1se`, respectively. 80 | 81 | We recommend interpreting the selected compositional features as 82 | potential predictive markers to the outcome in the regression model in 83 | the sense that the cross-validated prediction error is improved by 84 | considering these selected features. 85 | 86 | ``` r 87 | fit$selected.feature 88 | #> $min 89 | #> [1] "taxa1" "taxa10" "taxa15" "taxa2" "taxa29" "taxa3" "taxa39" "taxa43" 90 | #> [9] "taxa5" "taxa6" "taxa7" "taxa8" "taxa9" "taxa92" 91 | #> 92 | #> $`1se` 93 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa39" "taxa5" "taxa6" "taxa7" 94 | #> [9] "taxa8" "taxa9" 95 | #> 96 | #> $min.2stage 97 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa43" "taxa5" "taxa6" "taxa7" 98 | #> [9] "taxa8" "taxa9" "taxa92" 99 | #> 100 | #> $`1se.2stage` 101 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa5" "taxa6" "taxa7" "taxa8" 102 | #> [9] "taxa9" 103 | ``` 104 | 105 | To get specific log-ratios selected by the 2-stage procedure, use 106 | `fit$step2.log-ratios`, where `min` and `1se` display the log-ratios 107 | between features. For each identified ratio, `min.idx` and `1se.idx` 108 | return the column indices in the original input matrix for the two 109 | corresponding features forming the ratio. 110 | 111 | ``` r 112 | fit$step2.ratios 113 | #> $min 114 | #> [1] "taxa1/taxa10" "taxa2/taxa5" "taxa3/taxa8" "taxa5/taxa8" 115 | #> [5] "taxa6/taxa9" "taxa7/taxa92" "taxa10/taxa43" 116 | #> 117 | #> $`1se` 118 | #> [1] "taxa1/taxa10" "taxa2/taxa5" "taxa3/taxa8" "taxa5/taxa8" "taxa6/taxa9" 119 | #> [6] "taxa7/taxa10" 120 | #> 121 | #> $min.idx 122 | #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] 123 | #> [1,] NA 1 2 3 5 6 7 10 124 | #> [2,] NA 10 5 8 8 9 92 43 125 | #> 126 | #> $`1se.idx` 127 | #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] 128 | #> [1,] NA 1 2 3 5 6 7 129 | #> [2,] NA 10 5 8 8 9 10 130 | ``` 131 | 132 | More detailed interpretations can be obtained for the selected 133 | log-ratios. 
First, the selected log-ratios also improve the 134 | cross-validated prediction errors because these log-ratios are derived 135 | from the constrained lasso estimate. Moreover, as guided by the 136 | association table between log-ratios and the outcome, it is possible to 137 | interpret the directions of the covariate effects associated with 138 | certain log-ratios on the outcome. To view detailed associations between 139 | selected log-ratios and the outcome, use `fit$step2.tables` to print 140 | summary tables for the multivariable stepwise regression models obtained 141 | by the 2-stage procedure. 142 | 143 | ``` r 144 | fit$step2.tables$min 145 | ``` 146 | 147 | ``` r 148 | fit$step2.tables$`1se` 149 | ``` 150 | 151 | For binary and survival outcomes, please specify `family="binomial"`, 152 | `family="cox"`, or `family="finegray"` accordingly. 153 | 154 | ``` r 155 | dat.bin <- simu(n=50,p=100,model="binomial") 156 | fit.bin <- FLORAL(dat.bin$xcount,dat.bin$y,family="binomial",ncv=10,progress=FALSE) 157 | 158 | dat.cox <- simu(n=50,p=100,model="cox") 159 | fit.cox <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d),family="cox",ncv=10,progress=FALSE) 160 | 161 | dat.fg <- simu(n=50,p=100,model="finegray") 162 | fit.fg <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d,type="mstate"),family="finegray",ncv=10,progress=FALSE,step2=FALSE) 163 | ``` 164 | 165 | ## Repository for Reproducibility 166 | 167 | Reproducible code for the analyses results reported in the manuscript 168 | can be found at [this 169 | repository](https://github.com/vdblab/FLORAL-analysis). 170 | 171 | ## Contributing 172 | 173 | The `FLORAL` package is jointly managed by [MSKCC Biostatistics 174 | service](https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics) 175 | and the Marcel van den Brink Lab. Please 176 | note that the `FLORAL` project is released with a [Contributor Code of 177 | Conduct](https://github.com/vdblab/FLORAL/blob/master/.github/CODE_OF_CONDUCT.md). 178 | By contributing to this project, you agree to abide by its terms. Thank 179 | you to all contributors! 180 | 181 | ## Reference 182 | 183 | Fei T, Funnell T, Waters NR, Raj SS, Baichoo M, 184 | Sadeghi K, Dai A, Miltiadous O, Shouval R, Lv M, 185 | Peled JU, Ponce DM, Perales MA, Gönen M, van den Brink MRM. 186 | Scalable log-ratio lasso regression for enhanced microbial feature 187 | selection with FLORAL, Cell Reports Methods (2024), 100899; doi: 188 | . Epub 2024 Nov 7. 189 | PMID: 39515336; PMCID: PMC11705925. 190 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-for-Microbiome-Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL for Microbiome Analysis" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL for Microbiome Analysis} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | out.width = "100%" 21 | ) 22 | ``` 23 | 24 | ```{r setup, warning=FALSE, message=FALSE} 25 | library(FLORAL) 26 | library(dplyr) 27 | library(patchwork) 28 | ``` 29 | 30 | ## Data 31 | We will be using data from the `curatedMetagenomicData` package. 
For easier installation, we saved a flat copy of the data, but the steps below show how that copy was created. 32 | 33 | ```r 34 | if (! "BiocManager" %in% installed.packages()) install.packages("BiocManager") 35 | if (! "curatedMetagenomicData" %in% installed.packages()) BiocManager::install("curatedMetagenomicData") 36 | if (! "patchwork" %in% installed.packages()) install.packages("patchwork") 37 | library(curatedMetagenomicData) 38 | 39 | # Take a look at the summary of the studies available: 40 | curatedMetagenomicData::sampleMetadata |> group_by(.data$study_name, .data$study_condition) |> count() |> arrange(.data$study_name) 41 | # As an example, let us look at the `YachidaS_2019` study between healthy controls and colorectal cancer (CRC) patients. 42 | curatedMetagenomicData::curatedMetagenomicData("YachidaS_2019") 43 | # note -- if you are behind a firewall, see the solutions to 500 errors here: 44 | # https://support.bioconductor.org/p/132864/ 45 | rawdata <- curatedMetagenomicData::curatedMetagenomicData("2021-03-31.YachidaS_2019.relative_abundance", dryrun = FALSE, counts = TRUE) |> mergeData() 46 | x <- SummarizedExperiment::assays(rawdata)$relative_abundance %>% t() 47 | y <- rawdata@colData$disease 48 | 49 | save(list = c("x", "y"), file = file.path("inst", "extdata", "YachidaS_2019.Rdata")) 50 | ``` 51 | 52 | ```{r getData} 53 | load(system.file("extdata", "YachidaS_2019.Rdata", package="FLORAL")) 54 | ``` 55 | 56 | ## Running FLORAL 57 | 58 | We extracted the data from the `TreeSummarizedExperiment` object into two objects: the taxa matrix `x` and the "outcomes" vector `y` indicating whether a patient is healthy or has colorectal cancer (CRC). Note that for binary outcomes, the input vector `y` needs to be formatted with entries equal to either `0` or `1`. In addition, we need to specify `family = "binomial"` in `FLORAL` to fit the logistic regression model. To print the progress bar as the algorithm runs, please use `progress = TRUE`. 59 | 60 | ```{r floral} 61 | 62 | x <- x[y %in% c("CRC","healthy"),] 63 | x <- x[,colSums(x >= 100) >= nrow(x)*0.2] # filter low abundance taxa 64 | 65 | colnames(x) <- sapply(colnames(x), function(x) strsplit(x,split="[|]")[[1]][length(strsplit(x,split="[|]")[[1]])]) 66 | 67 | y <- as.numeric(as.factor(y[y %in% c("CRC","healthy")]))-1 68 | fit <- FLORAL(x = x, y = y, family="binomial", ncv=10, progress=TRUE) 69 | ``` 70 | 71 | ## Interpreting the Model 72 | 73 | FLORAL, like other methods that have an optimization step, has two "best" solutions for $\lambda$ available: one minimizing the mean squared error ($\lambda_\min$), and one taking the largest value of $\lambda$ whose error is within 1 standard error of the minimum mean squared error ($\lambda_{\text{1se}}$). These are referred to as the `min` and `1se` solutions, respectively. 74 | 75 | We can see the mean squared error (MSE) and the coefficients vs log($\lambda$) as follows: 76 | 77 | ```{r plots,fig.height=4,fig.width=10,dpi=300} 78 | fit$pmse + fit$pcoef 79 | ``` 80 | 81 | In both plots, the vertical dashed line and dotted line represent $\lambda_\min$ and $\lambda_{\text{1se}}$, respectively. In the MSE plot, the bands represent plus or minus one standard error of the MSE. In the coefficient plot, the colored lines represent individual taxa, where taxa with non-zero values at $\lambda_\min$ and $\lambda_{\text{1se}}$ are selected as predictive of the outcome. 82 | 83 | To view specific names of the selected taxa, please see the `fit$selected.feature$min` or `fit$selected.feature$1se` vectors. 
To view all coefficient estimates, please see `fit$best.beta$min` or `fit$best.beta$1se`. Without looking into ratios, one can crudely interpret a positive or negative association between a taxon and the outcome from the sign of its coefficient estimate. However, we recommend referring to the two-step procedure discussed below for a more rigorous interpretation based on ratios, which is derived from the log-ratio model assumption. 84 | 85 | ```{r viewTaxa} 86 | 87 | head(fit$selected.feature$min) 88 | 89 | head(sort(fit$best.beta$min)) 90 | 91 | ``` 92 | 93 | ## The Two-step Procedure 94 | 95 | In the previous section, we checked the lasso estimates without identifying specific ratios that are predictive of the outcome (CRC in this case). By default, `FLORAL` performs a two-step selection procedure, using `glmnet` and `step` regression to further identify taxa pairs that form predictive log-ratios. To view those pairs, use `fit$step2.ratios$min` or `fit$step2.ratios$1se` for the names of the ratios and `fit$step2.ratios$min.idx` or `fit$step2.ratios$1se.idx` for the pairs of indices in the original input count matrix `x`. Note that one taxon can occur in multiple ratios. 96 | 97 | ```{r view2step} 98 | 99 | head(fit$step2.ratios$`1se`) 100 | 101 | fit$step2.ratios$`1se.idx` 102 | 103 | ``` 104 | 105 | To further interpret the positive or negative associations between the selected log-ratios and the outcome, please refer to the output `step` regression tables, where the effect sizes of the ratios can be found. 106 | 107 | While the corresponding p-values are also available, we recommend only using the p-values as a criterion to rank the strength of the association. We do not recommend directly reporting the p-values for inference, because these p-values were obtained after running the first-step lasso model without rigorous post-selective inference. However, it is still valid to claim that these selected log-ratios are predictive of the outcome, as demonstrated by the improved 10-fold cross-validated prediction errors. 108 | 109 | ```{r viewTable} 110 | 111 | fit$step2.tables$`1se` 112 | 113 | ``` 114 | 115 | ## Generating taxa selection probabilities 116 | 117 | It is encouraged to run k-fold cross-validation several times to account for the random fold splits. `FLORAL` provides the `mcv.FLORAL` function to repeat the cross-validation `mcv` times using `ncore` cores. The output summarizes taxa selection probabilities and average coefficients based on $\lambda_\min$ and $\lambda_{\text{1se}}$. Interpretable plots can be created if `plot = TRUE` is specified. 118 | 119 | ```{r mcv} 120 | 121 | mcv.fit <- mcv.FLORAL(mcv=2, 122 | ncore=1, 123 | x = x, 124 | y = y, 125 | family = "binomial", 126 | ncv = 3, 127 | progress=TRUE) 128 | 129 | ``` 130 | 131 | ```{r mcvplots,fig.height=6,fig.width=10,dpi=300} 132 | 133 | mcv.fit$p_min 134 | 135 | # Other options are also available 136 | #mcv.fit$p_min_ratio 137 | #mcv.fit$p_1se 138 | #mcv.fit$p_1se_ratio 139 | 140 | ``` 141 | 142 | ## Elastic net 143 | 144 | Beyond the lasso model, `FLORAL` also supports elastic net models by specifying the tuning parameter `a` between 0 and 1. The lasso penalty is used when `a=1`, while the ridge penalty is used when `a=0`. 145 | 146 | The `a.FLORAL` function can help investigate the prediction performance for different choices of `a` and returns a plot of the corresponding prediction metric trajectories against the choice of $\lambda$. 
147 | 148 | ```{r a.floral,out.width = '50%',fig.height=4,fig.width=4,dpi=300} 149 | 150 | a.fit <- a.FLORAL(a = c(0.1,1), 151 | ncore = 1, 152 | x = x, 153 | y = y, 154 | family = "binomial", 155 | ncv = 3, 156 | progress=TRUE) 157 | 158 | a.fit 159 | 160 | ``` 161 | 162 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // softthreshold 15 | double softthreshold(double x, double lambda); 16 | RcppExport SEXP _FLORAL_softthreshold(SEXP xSEXP, SEXP lambdaSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< double >::type x(xSEXP); 21 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 22 | rcpp_result_gen = Rcpp::wrap(softthreshold(x, lambda)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // gd_cov 27 | arma::vec gd_cov(arma::mat xx, arma::vec xy, int n, double l, arma::vec beta); 28 | RcppExport SEXP _FLORAL_gd_cov(SEXP xxSEXP, SEXP xySEXP, SEXP nSEXP, SEXP lSEXP, SEXP betaSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< arma::mat >::type xx(xxSEXP); 33 | Rcpp::traits::input_parameter< arma::vec >::type xy(xySEXP); 34 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 35 | Rcpp::traits::input_parameter< double >::type l(lSEXP); 36 | Rcpp::traits::input_parameter< arma::vec >::type beta(betaSEXP); 37 | rcpp_result_gen = Rcpp::wrap(gd_cov(xx, xy, n, l, beta)); 38 | return rcpp_result_gen; 39 | END_RCPP 40 | } 41 | // gd_cov_al 42 | arma::vec gd_cov_al(arma::mat xx, arma::vec xy, int n, double l, double a, arma::vec beta, double mu, double alpha, bool adjust, unsigned int ncov, double wcov); 43 | RcppExport SEXP _FLORAL_gd_cov_al(SEXP xxSEXP, SEXP xySEXP, SEXP nSEXP, SEXP lSEXP, SEXP aSEXP, SEXP betaSEXP, SEXP muSEXP, SEXP alphaSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP wcovSEXP) { 44 | BEGIN_RCPP 45 | Rcpp::RObject rcpp_result_gen; 46 | Rcpp::RNGScope rcpp_rngScope_gen; 47 | Rcpp::traits::input_parameter< arma::mat >::type xx(xxSEXP); 48 | Rcpp::traits::input_parameter< arma::vec >::type xy(xySEXP); 49 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 50 | Rcpp::traits::input_parameter< double >::type l(lSEXP); 51 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 52 | Rcpp::traits::input_parameter< arma::vec >::type beta(betaSEXP); 53 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 54 | Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); 55 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 56 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 57 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 58 | rcpp_result_gen = Rcpp::wrap(gd_cov_al(xx, xy, n, l, a, beta, mu, alpha, adjust, ncov, wcov)); 59 | return rcpp_result_gen; 60 | END_RCPP 61 | } 62 | // linear_enet_al 63 | Rcpp::List linear_enet_al(arma::mat x, arma::vec y, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int 
ncov, bool display_progress); 64 | RcppExport SEXP _FLORAL_linear_enet_al(SEXP xSEXP, SEXP ySEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP display_progressSEXP) { 65 | BEGIN_RCPP 66 | Rcpp::RObject rcpp_result_gen; 67 | Rcpp::RNGScope rcpp_rngScope_gen; 68 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 69 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 70 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 71 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 72 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 73 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 74 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 75 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 76 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 77 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 78 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 79 | rcpp_result_gen = Rcpp::wrap(linear_enet_al(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress)); 80 | return rcpp_result_gen; 81 | END_RCPP 82 | } 83 | // logistic_enet_al 84 | Rcpp::List logistic_enet_al(arma::mat x, arma::vec y, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, bool display_progress, bool loop1, bool loop2); 85 | RcppExport SEXP _FLORAL_logistic_enet_al(SEXP xSEXP, SEXP ySEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP display_progressSEXP, SEXP loop1SEXP, SEXP loop2SEXP) { 86 | BEGIN_RCPP 87 | Rcpp::RObject rcpp_result_gen; 88 | Rcpp::RNGScope rcpp_rngScope_gen; 89 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 90 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 91 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 92 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 93 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 94 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 95 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 96 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 97 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 98 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 99 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 100 | Rcpp::traits::input_parameter< bool >::type loop1(loop1SEXP); 101 | Rcpp::traits::input_parameter< bool >::type loop2(loop2SEXP); 102 | rcpp_result_gen = Rcpp::wrap(logistic_enet_al(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress, loop1, loop2)); 103 | return rcpp_result_gen; 104 | END_RCPP 105 | } 106 | // cox_enet_al 107 | Rcpp::List cox_enet_al(arma::mat x, arma::vec t, arma::vec d, arma::vec tj, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress, bool loop1, bool loop2, bool notcv); 108 | RcppExport SEXP _FLORAL_cox_enet_al(SEXP xSEXP, SEXP tSEXP, SEXP dSEXP, SEXP tjSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP, SEXP loop1SEXP, SEXP loop2SEXP, SEXP notcvSEXP) { 109 | BEGIN_RCPP 110 | Rcpp::RObject rcpp_result_gen; 111 | Rcpp::RNGScope 
rcpp_rngScope_gen; 112 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 113 | Rcpp::traits::input_parameter< arma::vec >::type t(tSEXP); 114 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 115 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 116 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 117 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 118 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 119 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 120 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 121 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 122 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 123 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 124 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 125 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 126 | Rcpp::traits::input_parameter< bool >::type loop1(loop1SEXP); 127 | Rcpp::traits::input_parameter< bool >::type loop2(loop2SEXP); 128 | Rcpp::traits::input_parameter< bool >::type notcv(notcvSEXP); 129 | rcpp_result_gen = Rcpp::wrap(cox_enet_al(x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress, loop1, loop2, notcv)); 130 | return rcpp_result_gen; 131 | END_RCPP 132 | } 133 | // cox_timedep_enet_al 134 | Rcpp::List cox_timedep_enet_al(arma::mat x, arma::vec t0, arma::vec t1, arma::vec d, arma::vec tj, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress); 135 | RcppExport SEXP _FLORAL_cox_timedep_enet_al(SEXP xSEXP, SEXP t0SEXP, SEXP t1SEXP, SEXP dSEXP, SEXP tjSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP) { 136 | BEGIN_RCPP 137 | Rcpp::RObject rcpp_result_gen; 138 | Rcpp::RNGScope rcpp_rngScope_gen; 139 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 140 | Rcpp::traits::input_parameter< arma::vec >::type t0(t0SEXP); 141 | Rcpp::traits::input_parameter< arma::vec >::type t1(t1SEXP); 142 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 143 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 144 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 145 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 146 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 147 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 148 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 149 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 150 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 151 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 152 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 153 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 154 | rcpp_result_gen = Rcpp::wrap(cox_timedep_enet_al(x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress)); 155 | return rcpp_result_gen; 156 | END_RCPP 157 | } 158 | // fg_enet_al 159 | Rcpp::List fg_enet_al(arma::mat x, arma::vec t0, arma::vec t1, arma::vec d, arma::vec tj, arma::vec w, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress); 160 | 
RcppExport SEXP _FLORAL_fg_enet_al(SEXP xSEXP, SEXP t0SEXP, SEXP t1SEXP, SEXP dSEXP, SEXP tjSEXP, SEXP wSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP) { 161 | BEGIN_RCPP 162 | Rcpp::RObject rcpp_result_gen; 163 | Rcpp::RNGScope rcpp_rngScope_gen; 164 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 165 | Rcpp::traits::input_parameter< arma::vec >::type t0(t0SEXP); 166 | Rcpp::traits::input_parameter< arma::vec >::type t1(t1SEXP); 167 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 168 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 169 | Rcpp::traits::input_parameter< arma::vec >::type w(wSEXP); 170 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 171 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 172 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 173 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 174 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 175 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 176 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 177 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 178 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 179 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 180 | rcpp_result_gen = Rcpp::wrap(fg_enet_al(x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress)); 181 | return rcpp_result_gen; 182 | END_RCPP 183 | } 184 | // gee_NR 185 | Rcpp::List gee_NR(double N, arma::vec nt, arma::vec y, arma::mat X, double nx, Rcpp::Function linkinv, Rcpp::Function mueta, Rcpp::Function variance, arma::vec beta_new, arma::cube Rhat, double fihat, double lambda, double a, double alpha, double ncov, double wcov, double eps, double muu); 186 | RcppExport SEXP _FLORAL_gee_NR(SEXP NSEXP, SEXP ntSEXP, SEXP ySEXP, SEXP XSEXP, SEXP nxSEXP, SEXP linkinvSEXP, SEXP muetaSEXP, SEXP varianceSEXP, SEXP beta_newSEXP, SEXP RhatSEXP, SEXP fihatSEXP, SEXP lambdaSEXP, SEXP aSEXP, SEXP alphaSEXP, SEXP ncovSEXP, SEXP wcovSEXP, SEXP epsSEXP, SEXP muuSEXP) { 187 | BEGIN_RCPP 188 | Rcpp::RObject rcpp_result_gen; 189 | Rcpp::RNGScope rcpp_rngScope_gen; 190 | Rcpp::traits::input_parameter< double >::type N(NSEXP); 191 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 192 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 193 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 194 | Rcpp::traits::input_parameter< double >::type nx(nxSEXP); 195 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 196 | Rcpp::traits::input_parameter< Rcpp::Function >::type mueta(muetaSEXP); 197 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 198 | Rcpp::traits::input_parameter< arma::vec >::type beta_new(beta_newSEXP); 199 | Rcpp::traits::input_parameter< arma::cube >::type Rhat(RhatSEXP); 200 | Rcpp::traits::input_parameter< double >::type fihat(fihatSEXP); 201 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 202 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 203 | Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); 204 | Rcpp::traits::input_parameter< double >::type ncov(ncovSEXP); 205 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 206 | Rcpp::traits::input_parameter< double >::type 
eps(epsSEXP); 207 | Rcpp::traits::input_parameter< double >::type muu(muuSEXP); 208 | rcpp_result_gen = Rcpp::wrap(gee_NR(N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps, muu)); 209 | return rcpp_result_gen; 210 | END_RCPP 211 | } 212 | // gee_cor 213 | Rcpp::List gee_cor(double N, arma::vec nt, arma::vec y, arma::mat X, Rcpp::Function linkinv, Rcpp::Function variance, arma::vec beta_new, std::string corstr, double maxclsz, bool scalefix, double scalevalue); 214 | RcppExport SEXP _FLORAL_gee_cor(SEXP NSEXP, SEXP ntSEXP, SEXP ySEXP, SEXP XSEXP, SEXP linkinvSEXP, SEXP varianceSEXP, SEXP beta_newSEXP, SEXP corstrSEXP, SEXP maxclszSEXP, SEXP scalefixSEXP, SEXP scalevalueSEXP) { 215 | BEGIN_RCPP 216 | Rcpp::RObject rcpp_result_gen; 217 | Rcpp::RNGScope rcpp_rngScope_gen; 218 | Rcpp::traits::input_parameter< double >::type N(NSEXP); 219 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 220 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 221 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 222 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 223 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 224 | Rcpp::traits::input_parameter< arma::vec >::type beta_new(beta_newSEXP); 225 | Rcpp::traits::input_parameter< std::string >::type corstr(corstrSEXP); 226 | Rcpp::traits::input_parameter< double >::type maxclsz(maxclszSEXP); 227 | Rcpp::traits::input_parameter< bool >::type scalefix(scalefixSEXP); 228 | Rcpp::traits::input_parameter< double >::type scalevalue(scalevalueSEXP); 229 | rcpp_result_gen = Rcpp::wrap(gee_cor(N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue)); 230 | return rcpp_result_gen; 231 | END_RCPP 232 | } 233 | // gee_fit 234 | Rcpp::List gee_fit(arma::vec y, arma::mat X, arma::vec nt, Rcpp::Function linkinv, Rcpp::Function mueta, Rcpp::Function variance, std::string corstr, arma::vec lambda, double a, double ncov, double wcov, double tol, double eps, double muu, int maxiter1, int maxiter2, bool scalefix, double scalevalue, bool display_progress); 235 | RcppExport SEXP _FLORAL_gee_fit(SEXP ySEXP, SEXP XSEXP, SEXP ntSEXP, SEXP linkinvSEXP, SEXP muetaSEXP, SEXP varianceSEXP, SEXP corstrSEXP, SEXP lambdaSEXP, SEXP aSEXP, SEXP ncovSEXP, SEXP wcovSEXP, SEXP tolSEXP, SEXP epsSEXP, SEXP muuSEXP, SEXP maxiter1SEXP, SEXP maxiter2SEXP, SEXP scalefixSEXP, SEXP scalevalueSEXP, SEXP display_progressSEXP) { 236 | BEGIN_RCPP 237 | Rcpp::RObject rcpp_result_gen; 238 | Rcpp::RNGScope rcpp_rngScope_gen; 239 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 240 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 241 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 242 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 243 | Rcpp::traits::input_parameter< Rcpp::Function >::type mueta(muetaSEXP); 244 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 245 | Rcpp::traits::input_parameter< std::string >::type corstr(corstrSEXP); 246 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 247 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 248 | Rcpp::traits::input_parameter< double >::type ncov(ncovSEXP); 249 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 250 | Rcpp::traits::input_parameter< double >::type tol(tolSEXP); 251 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 252 | 
Rcpp::traits::input_parameter< double >::type muu(muuSEXP); 253 | Rcpp::traits::input_parameter< int >::type maxiter1(maxiter1SEXP); 254 | Rcpp::traits::input_parameter< int >::type maxiter2(maxiter2SEXP); 255 | Rcpp::traits::input_parameter< bool >::type scalefix(scalefixSEXP); 256 | Rcpp::traits::input_parameter< double >::type scalevalue(scalevalueSEXP); 257 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 258 | rcpp_result_gen = Rcpp::wrap(gee_fit(y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol, eps, muu, maxiter1, maxiter2, scalefix, scalevalue, display_progress)); 259 | return rcpp_result_gen; 260 | END_RCPP 261 | } 262 | 263 | static const R_CallMethodDef CallEntries[] = { 264 | {"_FLORAL_softthreshold", (DL_FUNC) &_FLORAL_softthreshold, 2}, 265 | {"_FLORAL_gd_cov", (DL_FUNC) &_FLORAL_gd_cov, 5}, 266 | {"_FLORAL_gd_cov_al", (DL_FUNC) &_FLORAL_gd_cov_al, 11}, 267 | {"_FLORAL_linear_enet_al", (DL_FUNC) &_FLORAL_linear_enet_al, 11}, 268 | {"_FLORAL_logistic_enet_al", (DL_FUNC) &_FLORAL_logistic_enet_al, 13}, 269 | {"_FLORAL_cox_enet_al", (DL_FUNC) &_FLORAL_cox_enet_al, 17}, 270 | {"_FLORAL_cox_timedep_enet_al", (DL_FUNC) &_FLORAL_cox_timedep_enet_al, 15}, 271 | {"_FLORAL_fg_enet_al", (DL_FUNC) &_FLORAL_fg_enet_al, 16}, 272 | {"_FLORAL_gee_NR", (DL_FUNC) &_FLORAL_gee_NR, 18}, 273 | {"_FLORAL_gee_cor", (DL_FUNC) &_FLORAL_gee_cor, 11}, 274 | {"_FLORAL_gee_fit", (DL_FUNC) &_FLORAL_gee_fit, 19}, 275 | {NULL, NULL, 0} 276 | }; 277 | 278 | RcppExport void R_init_FLORAL(DllInfo *dll) { 279 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 280 | R_useDynamicSymbols(dll, FALSE); 281 | } 282 | -------------------------------------------------------------------------------- /R/LogRatioLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | intercept=FALSE, 12 | foldid=NULL, 13 | step2=FALSE, 14 | progress=TRUE, 15 | plot=TRUE, 16 | ncore=1){ 17 | 18 | ptm <- proc.time() 19 | 20 | y0 <- y 21 | y <- y - mean(y) 22 | n <- length(y) 23 | p <- ncol(x) 24 | 25 | if (a > 0){ 26 | lambda0 <- max(abs(t(y) %*% x))/(a*n) 27 | }else if (a == 0){ 28 | lambda0 <- max(abs(t(y) %*% x))/(1e-3*n) 29 | } 30 | 31 | adjust = FALSE 32 | if (ncov > 0) adjust = TRUE 33 | 34 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 35 | 36 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 37 | 38 | if (progress) cat("Algorithm running for full dataset: \n") 39 | 40 | fullfit <- linear_enet_al(x,y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress) 41 | 42 | if (!is.null(colnames(x))){ 43 | rownames(fullfit$beta) = colnames(x) 44 | }else{ 45 | colnames(x) = 1:ncol(x) 46 | rownames(fullfit$beta) = 1:ncol(x) 47 | } 48 | 49 | if (intercept){ 50 | beta0 <- mean(y0) - colMeans(x %*% fullfit$beta) 51 | } 52 | 53 | if (!is.null(ncv)){ 54 | 55 | cvmse <- matrix(NA,nrow=length.lambda,ncol=ncv) 56 | 57 | if (is.null(foldid)){ 58 | labels <- caret::createFolds(1:nrow(x),k=ncv,list=FALSE) 59 | }else{ 60 | labels <- foldid 61 | } 62 | 63 | if (ncore == 1){ 64 | 65 | for (cv in 1:ncv){ 66 | 67 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 68 | 69 | train.x <- x[labels!=cv,] 70 | train.y <- y[labels!=cv] 71 | test.x <- x[labels==cv,] 72 | test.y <- 
y[labels==cv] 73 | 74 | cvfit <- linear_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress) 75 | 76 | cvmse[,cv] <- apply(test.x %*% cvfit$beta,2,function(x) sum((test.y-x)^2)/length(test.y)) 77 | 78 | } 79 | 80 | }else if(ncore > 1){ 81 | 82 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 83 | 84 | cl <- makeCluster(ncore) 85 | registerDoParallel(cl) 86 | 87 | cvmse <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 88 | 89 | train.x <- x[labels!=cv,] 90 | train.y <- y[labels!=cv] 91 | test.x <- x[labels==cv,] 92 | test.y <- y[labels==cv] 93 | 94 | cvfit <- linear_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,FALSE) 95 | 96 | apply(test.x %*% cvfit$beta,2,function(x) sum((test.y-x)^2)/length(test.y)) 97 | 98 | } 99 | 100 | stopCluster(cl) 101 | 102 | } 103 | 104 | mean.cvmse <- rowMeans(cvmse) 105 | se.cvmse <- apply(cvmse,1,function(x) sd(x)/sqrt(ncv)) 106 | 107 | idx.min <- which.min(mean.cvmse) 108 | se.min <- se.cvmse[idx.min] 109 | idx.1se <- suppressWarnings(min(which(mean.cvmse < mean.cvmse[idx.min] + se.min & 1:length.lambda < idx.min))) 110 | if (idx.1se == -Inf) idx.1se = 1 111 | 112 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 113 | add.1se = fullfit$beta[,idx.1se]) 114 | 115 | best.idx <- list(idx.min = idx.min, 116 | idx.1se = idx.1se) 117 | 118 | ret <- list(beta=fullfit$beta, 119 | lambda=fullfit$lambda, 120 | a=a, 121 | loss=fullfit$loss, 122 | mse=fullfit$mse, 123 | tol=fullfit$tol, 124 | iters=fullfit$iters, 125 | cvmse.mean=mean.cvmse, 126 | cvmse.se=se.cvmse, 127 | best.beta=best.beta, 128 | best.idx=best.idx, 129 | foldid=labels 130 | ) 131 | 132 | if (plot){ 133 | 134 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 135 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 136 | beta_nzero$loglambda <- log(beta_nzero$lambda) 137 | 138 | lambda_count <- data.frame(loglambda = log(ret$lambda), 139 | count = colSums(ret$beta != 0)) 140 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 141 | 142 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 143 | top10name <- names(top10feat) 144 | 145 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 146 | geom_line() + 147 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 148 | theme_bw() + 149 | theme(legend.position = "none") + 150 | xlab(expression(paste("log(",lambda,")"))) + 151 | ylab("Coefficient") + 152 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 153 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 154 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 155 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 156 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 157 | 158 | mseplot <- data.frame(loglambda=log(ret$lambda), 159 | mse=ret$cvmse.mean, 160 | se=ret$cvmse.se, 161 | mseaddse=ret$cvmse.mean+ret$cvmse.se, 162 | mseminse=ret$cvmse.mean-ret$cvmse.se) 163 | 164 | pmse <- ggplot(mseplot, aes(x=.data$loglambda, y=.data$mse)) + 165 | geom_errorbar(aes(ymin=.data$mseminse,ymax=.data$mseaddse),color="grey")+ 166 | geom_point(color="red")+ 167 | theme_bw() + 168 | xlab(expression(paste("log(",lambda,")"))) + 169 | ylab("Mean-Squared 
Error")+ 170 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 171 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 172 | annotate("text",x=lambda_count$loglambda,y=max(mseplot$mseaddse)+0.05,label=as.character(lambda_count$count))+ 173 | ggtitle(expression(paste("Cross-validated MSE versus log(",lambda,")"))) 174 | 175 | ret$pcoef <- pcoef 176 | ret$pmse <- pmse 177 | 178 | } 179 | 180 | if (step2){ 181 | 182 | if (!adjust){ 183 | 184 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 185 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 186 | 187 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 188 | for (k in 1:ncol(idxs)){ 189 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 190 | } 191 | 192 | if (ncol(x.select.min) > 1){ 193 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 194 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 195 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 196 | }else{ 197 | idxs <- as.vector(idxs) 198 | } 199 | 200 | df_step2 <- data.frame(y=y,x=x.select.min) 201 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 202 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 203 | 204 | if (is.null(ncol(idxs))){ 205 | if (length(vars) == 2){ 206 | selected <- idxs 207 | }else{ 208 | selected <- NULL 209 | } 210 | }else{ 211 | selected <- idxs[,vars] 212 | } 213 | # for (k1 in 1:nrow(selected)){ 214 | # for (k2 in 1:ncol(selected)){ 215 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 216 | # } 217 | # } 218 | 219 | ret$step2.feature.min = selected 220 | ret$step2fit.min <- step2fit 221 | } 222 | 223 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 224 | 225 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 226 | 227 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 228 | for (k in 1:ncol(idxs)){ 229 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 230 | } 231 | if (ncol(x.select.min) > 1){ 232 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 233 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 234 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 235 | }else{ 236 | idxs <- as.vector(idxs) 237 | } 238 | df_step2 <- data.frame(y=y,x=x.select.min) 239 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 240 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 241 | 242 | if (is.null(ncol(idxs))){ 243 | if (length(vars) == 2){ 244 | selected <- idxs 245 | }else{ 246 | selected <- NULL 247 | } 248 | }else{ 249 | selected <- idxs[,vars] 250 | } 251 | # for (k1 in 1:nrow(selected)){ 252 | # for (k2 in 1:ncol(selected)){ 253 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 254 | # } 255 | # } 256 | 257 | ret$step2.feature.1se = selected 258 | ret$step2fit.1se <- step2fit 259 | } 260 | 261 | }else{ 262 | 263 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 264 | 265 | allidx <- which(ret$best.beta$min.mse!=0) 266 | 267 | covidx <- allidx[allidx <= ncov] 268 | taxidx <- allidx[allidx > ncov] 269 | 270 | idxs <- NULL 271 | x.select.min <- NULL 272 | 273 | if (length(taxidx) > 1){ 274 | idxs <- combn(taxidx,2) 275 | for (k in 1:ncol(idxs)){ 276 | x.select.min 
<- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 277 | } 278 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 279 | } 280 | 281 | if (length(covidx) > 0){ 282 | x.select.min <- cbind(x.select.min, x[,covidx]) 283 | if (!is.null(idxs)){ 284 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 285 | }else{ 286 | colnames(x.select.min) <- colnames(x)[covidx] 287 | } 288 | } 289 | 290 | 291 | # if(is.null(x.select.min)) break 292 | 293 | if (ncol(x.select.min) > 1){ 294 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 295 | 296 | if (length(taxidx) < 2){ 297 | 298 | idxs <- NULL 299 | 300 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 301 | 302 | }else{ 303 | 304 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 305 | 306 | if (length(covidx) > 0){ 307 | 308 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 309 | # 1:ncol(idxs))] 310 | 311 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 312 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 313 | 314 | } 315 | 316 | } 317 | 318 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 319 | } 320 | 321 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 322 | df_step2 <- data.frame(y=y,x.select.min) 323 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 324 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 325 | vars <- vars[!is.na(vars)] 326 | 327 | selected <- NULL 328 | if (is.null(ncol(idxs)) & length(vars) > 0){ 329 | selected <- idxs 330 | }else if (length(vars) > 0){ 331 | selected <- idxs[,vars] 332 | } 333 | 334 | ret$step2.feature.min = selected 335 | ret$step2fit.min <- step2fit 336 | } 337 | 338 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 339 | 340 | allidx <- which(ret$best.beta$add.1se!=0) 341 | 342 | covidx <- allidx[allidx <= ncov] 343 | taxidx <- allidx[allidx > ncov] 344 | 345 | idxs <- NULL 346 | x.select.min <- NULL 347 | 348 | if (length(taxidx) > 1){ 349 | idxs <- combn(taxidx,2) 350 | for (k in 1:ncol(idxs)){ 351 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 352 | } 353 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 354 | } 355 | 356 | if (length(covidx) > 0){ 357 | x.select.min <- cbind(x.select.min, x[,covidx]) 358 | if (!is.null(idxs)){ 359 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 360 | }else{ 361 | colnames(x.select.min) <- colnames(x)[covidx] 362 | } 363 | } 364 | 365 | # if(is.null(x.select.min)) break 366 | 367 | if (ncol(x.select.min) > 1){ 368 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 369 | 370 | if (length(taxidx) < 2){ 371 | 372 | idxs <- NULL 373 | 374 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 375 | 376 | }else{ 377 | 378 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 379 | 380 | if (length(covidx) > 0){ 381 | 382 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 383 | 384 | idxs <- 
idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 385 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 386 | 387 | } 388 | 389 | } 390 | 391 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 392 | } 393 | 394 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 395 | df_step2 <- data.frame(y=y,x.select.min) 396 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 397 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 398 | vars <- vars[!is.na(vars)] 399 | 400 | selected <- NULL 401 | if (is.null(ncol(idxs)) & length(vars) > 0){ 402 | selected <- idxs 403 | }else if (length(vars) > 0){ 404 | selected <- idxs[,vars] 405 | } 406 | 407 | ret$step2.feature.1se = selected 408 | ret$step2fit.1se <- step2fit 409 | 410 | } 411 | 412 | } 413 | 414 | } 415 | 416 | }else{ 417 | 418 | ret <- list(beta=fullfit$beta, 419 | lambda=fullfit$lambda, 420 | a=a, 421 | loss=fullfit$loss, 422 | mse=fullfit$mse 423 | ) 424 | 425 | if (plot){ 426 | 427 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 428 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 429 | beta_nzero$loglambda <- log(beta_nzero$lambda) 430 | 431 | lambda_count <- data.frame(loglambda = log(ret$lambda), 432 | count = colSums(ret$beta != 0)) 433 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 434 | 435 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 436 | top10name <- names(top10feat) 437 | 438 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 439 | geom_line() + 440 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 441 | theme_bw() + 442 | theme(legend.position = "none") + 443 | xlab(expression(paste("log(",lambda,")"))) + 444 | ylab("Coefficient") + 445 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 446 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 447 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 448 | 449 | ret$pcoef <- pcoef 450 | 451 | } 452 | 453 | } 454 | 455 | if(intercept) ret$intercept = beta0 456 | 457 | ret$runtime = proc.time() - ptm 458 | 459 | return(ret) 460 | 461 | } -------------------------------------------------------------------------------- /R/LogRatioLogisticLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioLogisticLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | foldid=NULL, 12 | step2=FALSE, 13 | progress=TRUE, 14 | plot=TRUE, 15 | loop1=FALSE, 16 | loop2=FALSE, 17 | ncore=1){ 18 | 19 | ptm <- proc.time() 20 | 21 | n <- length(y) 22 | p <- ncol(x) 23 | sfun = y-0.5 24 | 25 | if (a > 0){ 26 | lambda0 <- max(abs(t(sfun) %*% x))/(a*n) 27 | }else if (a == 0){ 28 | lambda0 <- max(abs(t(sfun) %*% x))/(1e-3*n) 29 | } 30 | 31 | adjust = FALSE 32 | if (ncov > 0) adjust = TRUE 33 | 34 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 35 | 36 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 37 | 38 | if (progress) cat("Algorithm running for full 
dataset: \n") 39 | 40 | fullfit <- logistic_enet_al(x,y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress,loop1,loop2) 41 | 42 | if (!is.null(colnames(x))){ 43 | rownames(fullfit$beta) = colnames(x) 44 | }else{ 45 | colnames(x) = 1:ncol(x) 46 | rownames(fullfit$beta) = 1:ncol(x) 47 | } 48 | 49 | if (!is.null(ncv)){ 50 | 51 | cvmse <- matrix(NA,nrow=length.lambda,ncol=ncv) 52 | 53 | if (is.null(foldid)){ 54 | labels <- caret::createFolds(factor(y),k=ncv,list=FALSE) 55 | }else{ 56 | labels <- foldid 57 | } 58 | 59 | if (ncore == 1){ 60 | 61 | for (cv in 1:ncv){ 62 | 63 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 64 | 65 | train.x <- x[labels!=cv,] 66 | train.y <- y[labels!=cv] 67 | test.x <- x[labels==cv,] 68 | test.y <- y[labels==cv] 69 | 70 | cvfit <- logistic_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress,loop1,loop2) 71 | 72 | cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y- exp(x)/(1+exp(x)))^2)/length(test.y)) 73 | 74 | } 75 | 76 | }else if (ncore > 1){ 77 | 78 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 79 | 80 | cl <- makeCluster(ncore) 81 | registerDoParallel(cl) 82 | 83 | cvmse <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 84 | 85 | train.x <- x[labels!=cv,] 86 | train.y <- y[labels!=cv] 87 | test.x <- x[labels==cv,] 88 | test.y <- y[labels==cv] 89 | 90 | cvfit <- logistic_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,FALSE,loop1,loop2) 91 | 92 | apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y- exp(x)/(1+exp(x)))^2)/length(test.y)) 93 | 94 | } 95 | 96 | stopCluster(cl) 97 | 98 | } 99 | 100 | mean.cvmse <- rowMeans(cvmse) 101 | se.cvmse <- apply(cvmse,1,function(x) sd(x)/sqrt(ncv)) 102 | 103 | idx.min <- which.min(mean.cvmse) 104 | se.min <- se.cvmse[idx.min] 105 | idx.1se <- suppressWarnings(max(which(mean.cvmse > mean.cvmse[idx.min] + se.min & 1:length.lambda < idx.min))) 106 | if (idx.1se == -Inf) idx.1se = 1 107 | 108 | best.beta0 <- list(min.mse = fullfit$beta0[idx.min], 109 | add.1se = fullfit$beta0[idx.1se]) 110 | 111 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 112 | add.1se = fullfit$beta[,idx.1se]) 113 | 114 | best.idx <- list(idx.min = idx.min, 115 | idx.1se = idx.1se) 116 | 117 | ret <- list(beta=fullfit$beta, 118 | beta0=fullfit$beta0, 119 | lambda=fullfit$lambda, 120 | a=a, 121 | loss=fullfit$loss, 122 | mse=fullfit$mse, 123 | tol=fullfit$tol, 124 | iters=fullfit$iters, 125 | cvmse.mean=mean.cvmse, 126 | cvmse.se=se.cvmse, 127 | best.beta=best.beta, 128 | best.beta0=best.beta0, 129 | best.idx=best.idx, 130 | foldid=labels 131 | ) 132 | 133 | if (plot){ 134 | 135 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 136 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 137 | beta_nzero$loglambda <- log(beta_nzero$lambda) 138 | 139 | lambda_count <- data.frame(loglambda = log(ret$lambda), 140 | count = colSums(ret$beta != 0)) 141 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 142 | 143 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 144 | top10name <- names(top10feat) 145 | 146 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 147 | geom_line() + 148 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 149 | theme_bw() + 150 | 
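# Annotation (explanatory, not part of the package source): for the logistic
# model the null-model score (all coefficients zero, fitted probability 1/2)
# is y - 0.5, so the path starts at lambda0 = max|x'(y - 0.5)| / (a * n).
# Cross-validation above scores each lambda by the Brier-type error
# mean((y_test - p_hat)^2), with p_hat = exp(eta)/(1 + exp(eta)) and eta built
# from cbind(1, test.x) %*% rbind(t(beta0), beta).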
theme(legend.position = "none") + 151 | xlab(expression(paste("log(",lambda,")"))) + 152 | ylab("Coefficient") + 153 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 154 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 155 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 156 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 157 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 158 | 159 | mseplot <- data.frame(loglambda=log(ret$lambda), 160 | mse=ret$cvmse.mean, 161 | se=ret$cvmse.se, 162 | mseaddse=ret$cvmse.mean+ret$cvmse.se, 163 | mseminse=ret$cvmse.mean-ret$cvmse.se) 164 | 165 | pmse <- ggplot(mseplot, aes(x=.data$loglambda, y=.data$mse)) + 166 | geom_errorbar(aes(ymin=.data$mseminse,ymax=.data$mseaddse),color="grey")+ 167 | geom_point(color="red")+ 168 | theme_bw() + 169 | xlab(expression(paste("log(",lambda,")"))) + 170 | ylab("Mean-Squared Error")+ 171 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 172 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 173 | annotate("text",x=lambda_count$loglambda,y=max(mseplot$mseaddse)+0.05,label=as.character(lambda_count$count))+ 174 | ggtitle(expression(paste("Cross-validated MSE versus log(",lambda,")"))) 175 | 176 | ret$pcoef <- pcoef 177 | ret$pmse <- pmse 178 | 179 | } 180 | 181 | if (step2){ 182 | 183 | if (!adjust){ 184 | 185 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 186 | 187 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 188 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 189 | for (k in 1:ncol(idxs)){ 190 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 191 | } 192 | 193 | if (ncol(x.select.min) > 1){ 194 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 195 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 196 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 197 | }else{ 198 | idxs <- as.vector(idxs) 199 | } 200 | 201 | df_step2 <- data.frame(y=y,x=x.select.min) 202 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 203 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 204 | 205 | if (is.null(ncol(idxs))){ 206 | if (length(vars) == 2){ 207 | selected <- idxs 208 | }else{ 209 | selected <- NULL 210 | } 211 | }else{ 212 | selected <- idxs[,vars] 213 | } 214 | 215 | # for (k1 in 1:nrow(selected)){ 216 | # for (k2 in 1:ncol(selected)){ 217 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 218 | # } 219 | # } 220 | ret$step2.feature.min = selected 221 | ret$step2fit.min <- step2fit 222 | } 223 | 224 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 225 | 226 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 227 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 228 | for (k in 1:ncol(idxs)){ 229 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 230 | } 231 | 232 | if (ncol(x.select.min) > 1){ 233 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 234 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 235 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 236 | }else{ 237 | idxs <- as.vector(idxs) 
238 | } 239 | 240 | df_step2 <- data.frame(y=y,x=x.select.min) 241 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 242 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 243 | 244 | if (is.null(ncol(idxs))){ 245 | if (length(vars) == 2){ 246 | selected <- idxs 247 | }else{ 248 | selected <- NULL 249 | } 250 | }else{ 251 | selected <- idxs[,vars] 252 | } 253 | # for (k1 in 1:nrow(selected)){ 254 | # for (k2 in 1:ncol(selected)){ 255 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 256 | # } 257 | # } 258 | ret$step2.feature.1se = selected 259 | ret$step2fit.1se <- step2fit 260 | } 261 | 262 | }else{ 263 | 264 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 265 | 266 | allidx <- which(ret$best.beta$min.mse!=0) 267 | 268 | covidx <- allidx[allidx <= ncov] 269 | taxidx <- allidx[allidx > ncov] 270 | 271 | idxs <- NULL 272 | x.select.min <- NULL 273 | 274 | if (length(taxidx) > 1){ 275 | idxs <- combn(taxidx,2) 276 | for (k in 1:ncol(idxs)){ 277 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 278 | } 279 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 280 | } 281 | 282 | if (length(covidx) > 0){ 283 | x.select.min <- cbind(x.select.min, x[,covidx]) 284 | if (!is.null(idxs)){ 285 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 286 | }else{ 287 | colnames(x.select.min) <- colnames(x)[covidx] 288 | } 289 | } 290 | 291 | 292 | # if(is.null(x.select.min)) break 293 | 294 | if (ncol(x.select.min) > 1){ 295 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 296 | 297 | if (length(taxidx) < 2){ 298 | 299 | idxs <- NULL 300 | 301 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 302 | 303 | }else{ 304 | 305 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 306 | 307 | if (length(covidx) > 0){ 308 | 309 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 310 | # 1:ncol(idxs))] 311 | 312 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 313 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 314 | 315 | } 316 | 317 | } 318 | 319 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 320 | } 321 | 322 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 323 | 324 | df_step2 <- data.frame(y=y,x.select.min) 325 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 326 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 327 | vars <- vars[!is.na(vars)] 328 | 329 | selected <- NULL 330 | if (is.null(ncol(idxs)) & length(vars) > 0){ 331 | selected <- idxs 332 | }else if (length(vars) > 0){ 333 | selected <- idxs[,vars] 334 | } 335 | 336 | ret$step2.feature.min = selected 337 | ret$step2fit.min <- step2fit 338 | } 339 | 340 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 341 | 342 | allidx <- which(ret$best.beta$add.1se!=0) 343 | 344 | covidx <- allidx[allidx <= ncov] 345 | taxidx <- allidx[allidx > ncov] 346 | 347 | idxs <- NULL 348 | x.select.min <- NULL 349 | 350 | if (length(taxidx) > 1){ 351 | idxs <- combn(taxidx,2) 352 | for (k in 1:ncol(idxs)){ 353 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] 
- x[,idxs[2,k]]) 354 | } 355 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 356 | } 357 | 358 | if (length(covidx) > 0){ 359 | x.select.min <- cbind(x.select.min, x[,covidx]) 360 | if (!is.null(idxs)){ 361 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 362 | }else{ 363 | colnames(x.select.min) <- colnames(x)[covidx] 364 | } 365 | } 366 | 367 | # if(is.null(x.select.min)) break 368 | 369 | if (ncol(x.select.min) > 1){ 370 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 371 | 372 | if (length(taxidx) < 2){ 373 | 374 | idxs <- NULL 375 | 376 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 377 | 378 | }else{ 379 | 380 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 381 | 382 | if (length(covidx) > 0){ 383 | 384 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 385 | 386 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 387 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 388 | 389 | } 390 | 391 | } 392 | 393 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 394 | } 395 | 396 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 397 | 398 | df_step2 <- data.frame(y=y,x.select.min) 399 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 400 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 401 | vars <- vars[!is.na(vars)] 402 | 403 | selected <- NULL 404 | if (is.null(ncol(idxs)) & length(vars) > 0){ 405 | selected <- idxs 406 | }else if (length(vars) > 0){ 407 | selected <- idxs[,vars] 408 | } 409 | 410 | ret$step2.feature.1se = selected 411 | ret$step2fit.1se <- step2fit 412 | 413 | } 414 | 415 | } 416 | 417 | } 418 | 419 | }else{ 420 | 421 | ret <- list(beta=fullfit$beta, 422 | beta0=fullfit$beta0, 423 | lambda=fullfit$lambda, 424 | a=a, 425 | loss=fullfit$loss, 426 | mse=fullfit$mse 427 | ) 428 | 429 | if (plot){ 430 | 431 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 432 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 433 | beta_nzero$loglambda <- log(beta_nzero$lambda) 434 | 435 | lambda_count <- data.frame(loglambda = log(ret$lambda), 436 | count = colSums(ret$beta != 0)) 437 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 438 | 439 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 440 | top10name <- names(top10feat) 441 | 442 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 443 | geom_line() + 444 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 445 | theme_bw() + 446 | theme(legend.position = "none") + 447 | xlab(expression(paste("log(",lambda,")"))) + 448 | ylab("Coefficient") + 449 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 450 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 451 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 452 | 453 | ret$pcoef <- pcoef 454 | 455 | } 456 | 457 | } 458 | 459 | ret$runtime <- proc.time() - ptm 460 | 461 | ret$intercept <- ret$beta0 462 | 
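# Annotation (illustrative sketch, not part of the package source): when
# cross-validation has been run, best.beta0 / best.beta are available and
# predicted probabilities for new samples follow the same inverse-logit used
# in the CV loop above.  `newx` is a hypothetical matrix on the same
# log1p-count scale as x; the block is guarded so it never executes here.
if (FALSE) {
  eta  <- ret$best.beta0$min.mse + newx %*% ret$best.beta$min.mse
  phat <- exp(eta) / (1 + exp(eta))
}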
ret$beta0 <- NULL 463 | 464 | return(ret) 465 | 466 | } -------------------------------------------------------------------------------- /src/pgee.cpp: -------------------------------------------------------------------------------- 1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*- 2 | 3 | // [[Rcpp::plugins(cpp17)]] 4 | // [[Rcpp::depends(RcppArmadillo)]] 5 | #include 6 | #include 7 | using namespace arma; 8 | using namespace std; 9 | using namespace Rcpp; 10 | // [[Rcpp::depends(RcppProgress)]] 11 | #include 12 | #include 13 | 14 | // [[Rcpp::export]] 15 | Rcpp::List gee_NR(double N, // Number of subjects 16 | arma::vec nt, // number of obs per subject 17 | arma::vec y, 18 | arma::mat X, 19 | double nx, // Number of covariates (ncol(X)) 20 | Rcpp::Function linkinv, 21 | Rcpp::Function mueta, 22 | Rcpp::Function variance, 23 | arma::vec beta_new, 24 | arma::cube Rhat, // estimated working correlation matrix (Ehat from cor_gee.R) 25 | double fihat, // estimated scale parameter (fi from cor_gee.R) 26 | double lambda, 27 | double a, 28 | double alpha, 29 | double ncov, 30 | double wcov, 31 | double eps=1e-6, 32 | double muu=1e6){ 33 | 34 | // Rcpp::Rcout << "Flag1" << endl; 35 | 36 | arma::vec aindex=cumsum(nt); 37 | arma::vec index; 38 | index.zeros(N); 39 | index.subvec(1,N-1) = aindex.subvec(0,N-2); 40 | 41 | arma::vec eta = X * beta_new; 42 | // sexp linkinveta = linkinv(eta); 43 | // arma::vec mu = linkinveta; 44 | arma::vec mu = Rcpp::as(linkinv(eta)); 45 | 46 | // Rcpp::Rcout << "mu: " << mu << endl; 47 | 48 | arma::mat E1; 49 | arma::mat E2; 50 | 51 | if (a <= 1){ 52 | 53 | if (ncov == 0){ 54 | 55 | E1 = diagmat(lambda*(a + (1-a)*abs(beta_new))/(abs(beta_new)+eps)); 56 | E2.ones(nx); 57 | E2 = E2 * (muu * (accu(beta_new) + alpha)); 58 | 59 | }else if (ncov > 0){ 60 | 61 | // This is E on Wang et al.(2012) 62 | 63 | arma::vec lw; 64 | lw.ones(size(beta_new)); 65 | lw.subvec(0,ncov-1) = lw.subvec(0,ncov-1)*wcov; 66 | 67 | E1 = diagmat((lambda*lw % (a + (1-a)*abs(beta_new)))/(abs(beta_new)+eps)); 68 | E2.ones(nx); 69 | arma::vec muuvec(size(beta_new)); 70 | muuvec.fill(muu); 71 | muuvec.subvec(0,ncov-1).zeros(); 72 | arma::vec beta_new_sub = beta_new.subvec(ncov,beta_new.n_elem-1); 73 | E2 = E2 % (muuvec * (accu(beta_new_sub) + alpha)); 74 | 75 | } 76 | 77 | }else if (a > 0){ 78 | 79 | if (ncov == 0){ 80 | 81 | arma::vec beta_abs = abs(beta_new); 82 | arma::vec b1 = zeros(size(beta_new)); 83 | arma::uvec pos1 = find(beta_abs > lambda); 84 | b1(pos1).ones(); 85 | 86 | // Rcpp::Rcout << "b1: " << 1-b1 << endl; 87 | 88 | arma::vec b2 = zeros(size(beta_new)); 89 | arma::uvec pos2 = find(beta_abs < (lambda*a)); 90 | b2(pos2).ones(); 91 | 92 | // Rcpp::Rcout << "b2: " << 1-b2 << endl; 93 | 94 | E1 = diagmat((lambda*(1-b1)+b1%((lambda*a)-beta_new)%b2/(a-1))/(abs(beta_new)+eps)); 95 | E2.ones(nx); 96 | E2 = E2 * (muu * (accu(beta_new) + alpha)); 97 | 98 | }else if (ncov > 0){ 99 | 100 | // This is E on Wang et al.(2012) 101 | 102 | arma::vec beta_abs = abs(beta_new); 103 | arma::vec b1 = zeros(size(beta_new)); 104 | arma::uvec pos1 = find(beta_abs > lambda); 105 | b1(pos1).ones(); 106 | 107 | // Rcpp::Rcout << "b1: " << b1 << endl; 108 | 109 | arma::vec b2 = zeros(size(beta_new)); 110 | arma::uvec pos2 = find(beta_abs < (lambda*a)); 111 | b2(pos2).ones(); 112 | 113 | arma::vec lw; 114 | lw.ones(size(beta_new)); 115 | lw.subvec(0,ncov-1) = lw.subvec(0,ncov-1)*wcov; 116 | 117 | E1 = diagmat((lw % 
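// Annotation (explanatory, not part of the package source): E1 is the
// diagonal local approximation of the penalty; in the elastic-net case
// (a <= 1) its entries are lambda * (a + (1 - a)*|beta_j|) / (|beta_j| + eps),
// with the first ncov covariate rows weighted by wcov.  E2 carries the
// augmented-Lagrangian term muu * (sum of taxa coefficients + alpha), which
// pushes the compositional part of beta toward the zero-sum constraint;
// covariate entries are excluded from that sum (muuvec is zeroed for them).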
(lambda*(1-b1)+b1%((lambda*a)-beta_new)%b2/(a-1)))/(abs(beta_new)+eps)); 118 | E2.ones(nx); 119 | arma::vec muuvec(size(beta_new)); 120 | muuvec.fill(muu); 121 | muuvec.subvec(0,ncov-1).zeros(); 122 | arma::vec beta_new_sub = beta_new.subvec(ncov,beta_new.n_elem-1); 123 | E2 = E2 % (muuvec * (accu(beta_new_sub) + alpha)); 124 | 125 | } 126 | 127 | } 128 | 129 | 130 | // if(is.null(pindex)==TRUE) { 131 | // E<-E1 132 | // } else 133 | // if(is.null(pindex)!=TRUE) { 134 | // E1[,pindex]<-0 135 | // E<-E1 136 | // } 137 | // 138 | arma::vec sum201; //<-matrix(0,nx,1) //gradient:S 139 | sum201.zeros(nx); 140 | 141 | arma::mat sum301; //<-matrix(0,nx,nx) //naive variance:H 142 | sum301.zeros(nx,nx); 143 | 144 | arma::mat sum401; //<-matrix(0,nx,nx) //naive variance:H 145 | sum401.zeros(nx,nx); 146 | 147 | // sexp variancemu = variance(mu); 148 | // arma::vec varimu = variancemu; 149 | arma::vec varimu = Rcpp::as(variance(mu)); 150 | 151 | // sexp muetaeta = mueta(eta); 152 | // arma::vec mee = muetaeta; 153 | arma::vec mee = Rcpp::as(mueta(eta)); 154 | 155 | // Rcpp::Rcout << "Flag3" << endl; 156 | 157 | for (int i=0; i(linkinv(eta)); 249 | 250 | // sexp variancemu = variance(mu); 251 | // arma::vec varimu = variancemu; 252 | arma::vec varimu = Rcpp::as(variance(mu)); 253 | 254 | arma::vec sd = pow(varimu,0.5); 255 | arma::vec res = (y-mu)/sd; 256 | 257 | double fi; 258 | 259 | if(scalefix) { 260 | fi = accu(pow(res,2))/(accu(nt)); 261 | }else{ 262 | fi = scalevalue; 263 | } 264 | 265 | arma::vec aindex=cumsum(nt); 266 | arma::vec index; 267 | index.zeros(N); 268 | index.subvec(1,N-1) = aindex.subvec(0,N-2); 269 | 270 | double alfa_hat; 271 | arma::mat alfa_mat; 272 | 273 | if (corstr=="independence") { 274 | 275 | alfa_hat = 0; 276 | 277 | }else if (corstr == "exchangeable"){ 278 | 279 | double sum1 = 0; 280 | double sum3 = 0; 281 | 282 | for (int i=0; i jj && abs(j-jj)==1){ 304 | double sum7 = res(j+index(i)) * res(jj+index(i)); 305 | sum5 = sum5 + sum7; 306 | } 307 | } 308 | } 309 | double sum8 = (nt(i)-1); 310 | sum6 = sum6 + sum8; 311 | } 312 | alfa_hat = sum5 / (sum6*fi); 313 | 314 | }else if (corstr=="unstructured"){ 315 | 316 | alfa_mat.zeros(nt(1),nt(1)); //not allowed for unequal number of cluster sizes. 
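// Annotation (explanatory, not part of the package source): gee_cor() works
// with Pearson-type residuals res = (y - mu) / sqrt(V(mu)) and estimates the
// working correlation by moments: the exchangeable estimate averages
// within-subject cross-products of residuals, the AR-1 estimate uses lag-1
// products, alfa_hat = sum_i sum_{|j-k|=1} res_ij * res_ik / (sum_i (nt_i - 1) * fi),
// and the unstructured estimate fills a full matrix (equal cluster sizes
// only, as noted above), with fi the scale (dispersion) parameter.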
317 | 318 | for (int j=0; j= 1) betai = beta_mat.col(i-1); 430 | arma::vec beta0; 431 | // double loss0; 432 | 433 | diff = 1; // Initialize diff > tol for each lambda to start the while loop 434 | 435 | // arma::vec eta = X * betai; 436 | // sexp linkinveta = linkinv(eta); 437 | // arma::vec mu = linkinveta; 438 | // arma::vec wt; 439 | // wt.ones(size(mu)); 440 | // sexp resi = devresids(y,mu,wt); 441 | // arma::vec resid = resi; 442 | // double loss = accu(resid); 443 | 444 | double alpha = 0; 445 | 446 | while (abs(diff) > tol and k < maxiter1){ 447 | 448 | k = k + 1; 449 | beta0 = betai; 450 | // loss0 = loss; 451 | int k1 = 0; 452 | double diff1 = 1; 453 | arma::vec beta00 = beta0; 454 | 455 | if (muu > 0){ 456 | 457 | while (abs(diff1) > tol and k1 < maxiter2){ 458 | 459 | k1 = k1 + 1; 460 | beta00 = betai; 461 | 462 | Rcpp::List cor_obj = gee_cor(N, 463 | nt, 464 | y, 465 | X, 466 | linkinv, 467 | variance, 468 | betai, 469 | corstr, 470 | maxclsz, 471 | scalefix, 472 | scalevalue 473 | ); 474 | 475 | arma::cube Rhat = cor_obj["Rhat"]; 476 | double fihat = cor_obj["fihat"]; 477 | 478 | Rcpp::List NR_obj=gee_NR(N, 479 | nt, 480 | y, 481 | X, 482 | nx, 483 | linkinv, 484 | mueta, 485 | variance, 486 | betai, 487 | Rhat, 488 | fihat, 489 | l, 490 | a, 491 | alpha, 492 | ncov, 493 | wcov, 494 | eps, 495 | muu); 496 | 497 | arma::vec S = NR_obj["S"]; 498 | arma::mat H = NR_obj["H"]; 499 | arma::mat E = NR_obj["E"]; 500 | arma::vec C = NR_obj["C"]; 501 | 502 | arma::mat Nmat = mat(size(E),fill::value(N)); 503 | // arma::mat muumat = mat(size(E),fill::value(muu)); 504 | arma::mat muumat = mat(size(E),fill::zeros); // this should be changed to assign muu only to the constrained part 505 | muumat.submat(ncov,ncov,nx-1,nx-1).fill(muu); 506 | arma::mat nugmat = mat(size(E)); 507 | nugmat.diag() += 1e-10; 508 | 509 | betai = betai + pinv(H+Nmat%E+muumat+nugmat) * (S-((Nmat%E)*betai)-C); 510 | 511 | // Rcpp::Rcout << "Flag440" << endl; 512 | 513 | // arma::uvec zidx = find(abs(betai) < 1e-6); 514 | // betai(zidx).zeros(); 515 | 516 | arma::vec diffvec1 = beta00-betai; 517 | diff1 = max(abs(diffvec1)); 518 | 519 | // arma::vec eta1 = X * betai; 520 | // sexp linkinveta1 = linkinv(eta1); 521 | // arma::vec mu1 = linkinveta1; 522 | // arma::vec wt1; 523 | // wt1.ones(size(mu1)); 524 | // sexp resi1 = devresids(y,mu1,wt1); 525 | // arma::vec resid1 = resi1; 526 | // loss = accu(resid1); 527 | 528 | // if (display_progress){ 529 | // 530 | // Rcpp::Rcout << "Lambda:" << l << ",Iter:" << k1 << ",Diff:" << diff << endl; 531 | // // Rcpp::Rcout << "Lambda:" << l << ",Iter:" << k1 << ",Diff loss:" << loss-loss0 << endl; 532 | // 533 | // } 534 | } 535 | 536 | // Rcpp::Rcout << "k1:" << k1 << "diffo" << diff1 << endl; 537 | // Rcpp::Rcout << "k1:" << k1 << "diff" << max(abs(beta0-betai)) << endl; 538 | 539 | alpha = alpha + accu(betai.subvec(ncov,betai.n_elem-1)); 540 | 541 | }else if (muu == 0){ 542 | 543 | Rcpp::List cor_obj = gee_cor(N, 544 | nt, 545 | y, 546 | X, 547 | linkinv, 548 | variance, 549 | betai, 550 | corstr, 551 | maxclsz, 552 | scalefix, 553 | scalevalue 554 | ); 555 | 556 | arma::cube Rhat = cor_obj["Rhat"]; 557 | double fihat = cor_obj["fihat"]; 558 | 559 | Rcpp::List NR_obj=gee_NR(N, 560 | nt, 561 | y, 562 | X, 563 | nx, 564 | linkinv, 565 | mueta, 566 | variance, 567 | betai, 568 | Rhat, 569 | fihat, 570 | l, 571 | a, 572 | alpha, 573 | ncov, 574 | wcov, 575 | eps, 576 | muu); 577 | 578 | arma::vec S = NR_obj["S"]; 579 | arma::mat H = NR_obj["H"]; 580 | arma::mat E = NR_obj["E"]; 581 
| arma::vec C = NR_obj["C"]; 582 | 583 | arma::mat Nmat = mat(size(E),fill::value(N)); 584 | arma::mat muumat = mat(size(E),fill::zeros); // this should be changed to assign muu only to the constrained part 585 | // muumat.submat(ncov,ncov,nx-1,nx-1).fill(muu); 586 | arma::mat nugmat = mat(size(E)); 587 | nugmat.diag() += 1e-6; 588 | 589 | betai = betai + pinv(H+Nmat%E+muumat+nugmat) * (S-((Nmat%E)*betai)-C); 590 | 591 | } 592 | 593 | arma::vec diffvec = beta0-betai; 594 | diff = max(abs(diffvec)); 595 | 596 | // Rcpp::Rcout << "k:" << k << "diff:" << diff << endl; 597 | 598 | } 599 | 600 | beta_mat.col(i) = betai; 601 | iters(i) = k; 602 | diffs(i) = diff; 603 | 604 | } 605 | 606 | Rcpp::List ret; 607 | 608 | ret["beta"] = beta_mat; 609 | ret["lambda"] = lambda; 610 | ret["iters"] = iters; 611 | ret["tol"] = diffs; 612 | 613 | return ret; 614 | 615 | } -------------------------------------------------------------------------------- /R/simu.R: -------------------------------------------------------------------------------- 1 | #' Simulate data following log-ratio model 2 | #' 3 | #' @description Simulate a dataset from log-ratio model. 4 | #' @param n An integer of sample size 5 | #' @param p An integer of number of features (taxa). 6 | #' @param model Type of models associated with outcome variable, can be "linear", "binomial", "cox", "finegray", "gee" (scalar outcome with time-dependent features), or "timedep" (survival endpoint with time-dependent features). 7 | #' @param weak Number of features with \code{weak} effect size. 8 | #' @param strong Number of features with \code{strong} effect size. 9 | #' @param weaksize Actual effect size for \code{weak} effect size. Must be positive. 10 | #' @param strongsize Actual effect size for \code{strong} effect size. Must be positive. 11 | #' @param pct.sparsity Percentage of zero counts for each sample. 12 | #' @param rho Parameter controlling the correlated structure between taxa. Ranges between 0 and 1. 13 | #' @param timedep_slope If \code{model} is "timedep", this parameter specifies the slope for the feature trajectories. Please refer to the Simulation section of the manuscript for more details. 14 | #' @param timedep_cor If \code{model} is "timedep", this parameter specifies the sample-wise correlations between longitudinal features. Please refer to the Simulation section of the manuscript for more details. 15 | #' @param geetype If \code{model} is "gee", \code{geetype} is the type of GEE outcomes. Now support "gaussian" and "binomial". 16 | #' @param m If \code{model} is "gee", \code{m} is the number of repeated measurements per subject. 17 | #' @param corstr If \code{model} is "gee", \code{corstr} is the working correlation structure. Now support "independence", "exchangeable", and "AR-1". 18 | #' @param sdvec If \code{model} is "gee" and \code{geetype} is "gaussian", \code{sdvec} is the vector of standard deviations of each outcome variable. 19 | #' @param rhogee If \code{model} is "gee", \code{rhogee} is the correlation parameter between longitudinal outcomes under the selected working correlation structure. 20 | #' @param geeslope If \code{model} is "gee", \code{geeslope} is the linear time effect. 21 | #' @param longitudinal_stability If \code{model} is "timedep", this is a binary indicator which determines whether the trajectories are more stable (\code{TRUE}) or more volatile (\code{FALSE}). 22 | #' @param ncov Number of covariates that are not compositional features. 
23 | #' @param betacov Coefficients corresponding to the covariates that are not compositional features. 24 | #' @param intercept Boolean. If TRUE, then a random intercept will be generated in the model. Only works for \code{linear} or \code{binomial} models. 25 | #' @return A list with simulated count matrix \code{xcount}, log1p-transformed count matrix \code{x}, outcome (continuous \code{y}, continuous centered \code{y0}, binary \code{y}, or survival \code{t}, \code{d}), true coefficient vector \code{beta}, list of non-zero features \code{idx}, value of intercept \code{intercept} (if applicable). 26 | #' @author Teng Fei. Email: feit1@mskcc.org 27 | #' @references Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 28 | #' 29 | #' @examples 30 | #' 31 | #' set.seed(23420) 32 | #' dat <- simu(n=50,p=30,model="linear") 33 | #' 34 | #' @import ggplot2 survival glmnet dplyr 35 | #' @importFrom survcomp concordance.index 36 | #' @importFrom reshape melt 37 | #' @importFrom utils combn 38 | #' @importFrom grDevices rainbow 39 | #' @importFrom caret createFolds 40 | #' @importFrom stats dist rbinom rexp rmultinom rnorm runif sd step glm binomial gaussian na.omit 41 | #' @importFrom msm dpexp ppexp rpexp 42 | #' @importFrom mvtnorm rmvnorm 43 | #' @useDynLib FLORAL 44 | #' @export 45 | 46 | simu <- function(n = 100, 47 | p = 200, 48 | model = "linear", 49 | weak = 4, 50 | strong = 6, 51 | weaksize = 0.125, 52 | strongsize = 0.25, 53 | pct.sparsity = 0.5, 54 | rho=0, 55 | timedep_slope=NULL, 56 | timedep_cor=NULL, 57 | geetype="gaussian", 58 | m=4, 59 | corstr="exchangeable", 60 | sdvec=NULL, 61 | rhogee=0.8, 62 | geeslope=2.5, 63 | longitudinal_stability=TRUE, 64 | ncov=0, 65 | betacov=0, 66 | intercept=FALSE){ 67 | 68 | true_set <- 1:(weak+strong) 69 | weak_idx <- 1:weak 70 | strong_idx <- (weak+1):(weak+strong) 71 | 72 | beta <- rep(NA,weak+strong) 73 | beta[weak_idx] <- rep(c(weaksize,-weaksize),weak/2) 74 | beta[strong_idx] <- rep(c(strongsize,-strongsize),strong/2) 75 | 76 | if (model %in% c("linear","binomial")){ 77 | y <- rep(NA,length=n) 78 | }else if (model == "cox"){ 79 | t <- rep(NA,length=n) 80 | d <- rep(NA,length=n) 81 | }else if (model == "finegray"){ 82 | t <- t0 <- rep(NA,length=n) 83 | d <- rep(NA,length=n) 84 | }else if (model == "timedep"){ 85 | m <- 10 86 | id.vect <- rep(1:n, each = m) 87 | n0 <- n 88 | n <- length(id.vect) 89 | 90 | if (is.null(timedep_cor)){ 91 | timedep_cor <- 0.4 92 | } 93 | if (is.null(timedep_slope)){ 94 | timedep_slope <- 0.5 95 | }else{ 96 | timedep_slope <- timedep_slope 97 | } 98 | }else if (model == "gee"){ 99 | 100 | id.vect <- rep(1:n, each = m) 101 | n0 <- n 102 | n <- length(id.vect) 103 | 104 | if (is.null(sdvec) & geetype=="gaussian"){ 105 | sdvec <- rep(1,m) 106 | } 107 | 108 | if (is.null(timedep_cor)){ 109 | timedep_cor <- 0.4 110 | } 111 | if (is.null(timedep_slope)){ 112 | timedep_slope <- 0.5 113 | }else{ 114 | timedep_slope <- timedep_slope 115 | } 116 | 117 | } 118 | 119 | x <- xobs <- matrix(NA,nrow=n,ncol=p) 120 | 121 | seqdep <- floor(runif(n,min=5000,max=50000)) 122 | # highidx <- true_set 123 | 124 | ############################################### 125 | 126 | # if (method == "manual"){ 127 | 128 | if (!(model %in% c("timedep","gee"))){ 129 | 130 | sigma <- rho^(as.matrix(dist(1:p))) 131 | diag(sigma) <- c(rep(log(p)/2,3),1,rep(log(p)/2,2),1,log(p)/2,rep(1,p-8)) 132 | mu <- 
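# Annotation (explanatory, not part of the package source): for the
# cross-sectional models the latent log-abundances are multivariate normal
# with correlation decaying as rho^|j - j'|; a few of the leading taxa get
# mean log(p) and variance log(p)/2 so they are systematically more abundant,
# the rest are centered at 0 with unit variance, and per-sample sparsity is
# induced by setting a random pct.sparsity fraction of entries to -Inf
# (zero relative abundance after the softmax below).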
c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,p-8)) 133 | 134 | x <- mvtnorm::rmvnorm(n=n,mean=mu,sigma=sigma) 135 | 136 | if (pct.sparsity > 0){ 137 | for (i in 1:n){ 138 | zeroidx <- sample(1:p,size=floor(p*pct.sparsity)) 139 | x[i,zeroidx] <- -Inf 140 | } 141 | } 142 | 143 | }else if(model %in% c("timedep","gee")){ 144 | 145 | sigma <- rho^(as.matrix(dist(1:(p-(weak+strong))))) 146 | diag(sigma) <- 1 147 | mu <- rep(0,p-(weak+strong)) 148 | 149 | x0 <- mvtnorm::rmvnorm(n=n,mean=mu,sigma=sigma) 150 | x[,setdiff(1:p,true_set)] <- x0 151 | 152 | for (i in 1:n0){ 153 | 154 | # Mu <- rep(c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)),m) 155 | # mu <- c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)) 156 | # slopes <- outer(0:9,timedep_slope) 157 | # Mu <- t(mu + t(slopes)) 158 | 159 | slopes <- rep(0:1,(weak+strong)/2)*timedep_slope 160 | Mu <- t(c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)) + t(outer(0:(m-1),slopes))) 161 | 162 | sigma1 <- rho^(as.matrix(dist(1:(weak+strong)))) 163 | Sigma <- (diag(m) %x% sigma1) + ((matrix(1,nrow=m,ncol=m) - diag(m)) %x% (diag(strong+weak)*timedep_cor)) 164 | # Sigma <- Matrix::bdiag(replicate(m,sigma,simplify=FALSE)) 165 | # sigma_offdiag <- diag(strong+weak)*0.8 166 | # mat_template <- matrix(1,nrow=m,ncol=m) - diag(m) 167 | 168 | x1 <- matrix(rmvnorm(n=1,mean=as.vector(Mu),sigma=Sigma),nrow=m,ncol=weak+strong,byrow=FALSE) 169 | x[id.vect==i,true_set] <- x1 170 | 171 | if (pct.sparsity > 0){ 172 | # for (i in 1:n0){ 173 | 174 | if (longitudinal_stability){ 175 | zeroidx <- sample(true_set,size=floor(length(true_set)*pct.sparsity)) 176 | x[id.vect==i,zeroidx] <- -Inf 177 | }else{ 178 | ids <- which(id.vect==i) 179 | for (i in 1:m){ 180 | zeroidx <- sample(true_set,size=floor(length(true_set)*pct.sparsity)) 181 | x[ids[i],zeroidx] <- -Inf 182 | } 183 | } 184 | 185 | # zeroidx <- sample(1:(length(true_set)/2),size=floor(length(true_set)/2*pct.sparsity)) 186 | # x[id.vect==i,c(zeroidx*2,zeroidx*2-1)] <- -Inf 187 | 188 | # } 189 | } 190 | 191 | } 192 | 193 | if (pct.sparsity > 0){ 194 | for (j in 1:n){ 195 | zeroidx <- sample(setdiff(1:p,true_set),size=floor(p*pct.sparsity)) 196 | x[j,zeroidx] <- -Inf 197 | } 198 | } 199 | 200 | } 201 | 202 | x = apply(x,2,function(y) exp(y)/rowSums(exp(x))) 203 | 204 | for (k in 1:n){ 205 | xobs[k,] <- rmultinom(1,size=seqdep[k],prob=x[k,]) 206 | } 207 | xcount = xobs 208 | colnames(xcount) <- paste0("taxa",1:p) 209 | 210 | for (k in 1:n){ 211 | x[k,] <- rmultinom(1,size=1000000,prob=x[k,]) 212 | } 213 | x = log(x+1) 214 | 215 | # }else if (method == "SparseDOSSA2"){ 216 | 217 | # sim <- SparseDOSSA2::SparseDOSSA2(template = "Stool", 218 | # n_sample=n, 219 | # median_read_depth = 25000, 220 | # new_features=FALSE, 221 | # verbose = FALSE) 222 | # xcount <- t(sim$simulated_data) 223 | # taxa <- colnames(xcount) 224 | # 225 | # true_set <- which(taxa %in% c("k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Ruminococcus_torques", 226 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Clostridium|s__Clostridium_leptum", 227 | # "k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Veillonella|s__Veillonella_unclassified", 228 | # "k__Bacteria|p__Verrucomicrobia|c__Verrucomicrobiae|o__Verrucomicrobiales|f__Verrucomicrobiaceae|g__Akkermansia|s__Akkermansia_muciniphila", 229 | # "k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_uniformis", 230 | # 
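# Annotation (explanatory, not part of the package source): the latent values
# are turned into relative abundances with a softmax, observed counts xcount
# are drawn by rmultinom() at a library size sampled uniformly between 5,000
# and 50,000 reads, and outcomes are generated from a deeper 1e6-read
# re-sample on the log(count + 1) scale, while the returned covariate matrix
# x is log(xcount + 1).  The commented block here sketches an alternative
# SparseDOSSA2-based generator and is not used.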
"k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Porphyromonadaceae|g__Parabacteroides|s__Parabacteroides_merdae", 231 | # "k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Veillonella|s__Veillonella_parvula", 232 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_bromii", 233 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_inulinivorans", 234 | # "k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_aerofaciens" 235 | # )) 236 | # 237 | # colnames(xcount) <- paste0("taxa",1:ncol(xcount)) 238 | # rownames(xcount) <- NULL 239 | # x = t(sim$simulated_matrices$rel) 240 | # for (k in 1:nrow(xcount)){ 241 | # x[k,] <- rmultinom(1,size=1000000,prob=x[k,]) 242 | # } 243 | # x = log(x+1) 244 | 245 | # } 246 | 247 | betavec <- rep(0,ncol(xcount)) 248 | betavec[true_set] <- beta 249 | xobs <- log(xcount+1) 250 | colnames(xobs) <- NULL 251 | 252 | if (ncov > 0){ 253 | xcov <- mvtnorm::rmvnorm(n=n,mean=rep(0,ncov)) 254 | colnames(xcov) <- paste0("cov",1:ncov) 255 | } 256 | 257 | if (model == "linear"){ 258 | 259 | y <- x[,true_set] %*% beta + rnorm(n,mean=0,sd=1) 260 | 261 | if(ncov > 0) y <- y + xcov %*% betacov 262 | 263 | if(intercept) { 264 | 265 | intcpt <- rnorm(1,mean=1,sd=1) 266 | y <- y + intcpt 267 | 268 | } 269 | y0 = y - mean(y) 270 | 271 | ret <- list(xcount=xcount,x=xobs,y=y,y0=y0,beta=betavec,idx=true_set) 272 | 273 | if (intercept) ret$intercept=intcpt 274 | 275 | if (ncov > 0) ret$xcov=xcov 276 | 277 | }else if(model == "binomial"){ 278 | 279 | eta <- x[,true_set] %*% beta 280 | 281 | if (ncov > 0) eta <- eta + xcov %*% betacov 282 | 283 | if(intercept) { 284 | 285 | intcpt <- rnorm(1,mean=1,sd=1) 286 | eta <- eta + intcpt 287 | 288 | } 289 | 290 | prob <- exp(eta)/(1+exp(eta)) 291 | 292 | for (i in 1:n){ 293 | y[i] <- rbinom(1,1,prob=prob[i]) 294 | } 295 | 296 | ret <- list(xcount=xcount,x=xobs,y=y,beta=betavec,idx=true_set) 297 | 298 | if (intercept) ret$intercept=intcpt 299 | 300 | if (ncov > 0) ret$xcov=xcov 301 | 302 | }else if(model == "cox"){ 303 | 304 | eta <- x[,true_set] %*% beta 305 | 306 | if (ncov > 0) eta <- eta + xcov %*% betacov 307 | 308 | lambda <- exp(eta) 309 | 310 | for(i in 1:n){ 311 | U <- runif(1,min=0,max=1) 312 | t0 <- log(1-(log(1-U))/(0.1*lambda[i])) 313 | c0 <- min(rexp(1,rate=0.1),runif(1,min=5,max=6)) 314 | t[i] <- min(t0,c0) 315 | d[i] <- as.numeric(I(t0 <= c0)) 316 | } 317 | 318 | ret <- list(xcount=xcount,x=xobs,t=t,d=d,beta=betavec,idx=true_set) 319 | 320 | if (ncov > 0) ret$xcov=xcov 321 | 322 | }else if(model == "finegray"){ 323 | 324 | eta <- x[,true_set] %*% beta 325 | 326 | if (ncov > 0) eta <- eta + xcov %*% betacov 327 | 328 | p.cif = 0.66 329 | lambda <- exp(eta) 330 | cl=0.19 331 | cu=10 332 | 333 | P1 <- 1-(1-p.cif)^(lambda) 334 | epsilon <- rep(0,n) 335 | for (i in 1:n){ 336 | epsilon[i] <- 2 - rbinom(1,1,P1[i]) 337 | } 338 | 339 | #generate the event time based on the type of outcome 340 | t0 <- rep(0,n) 341 | u <- runif(n) 342 | for (i in 1:n){ 343 | if (epsilon[i] == 1){ 344 | t0[i] <- -log(1 - (1 - (1-u[i]*(1-(1-p.cif)^lambda[i]))^(1/(lambda[i]+0.001)))/p.cif) 345 | } 346 | if (epsilon[i] == 2){ 347 | t0[i] <- -log((1-u[i])^(1/(lambda[i]+0.001))) 348 | } 349 | } 350 | 351 | #generate censoring time 352 | c <- runif(n,cl,cu) 353 | #observed time 354 | t <- ifelse(t0 == Inf, c ,t0*I(t0<=c) + c*I(t0>c)) 
355 | # outcome 356 | d <- ifelse(t0 == Inf, 0, 0*I(t == c) + epsilon*I(t < c)) 357 | 358 | ret <- list(xcount=xcount,x=xobs,t=t,d=d,beta=betavec,idx=true_set) 359 | 360 | if (ncov > 0) ret$xcov=xcov 361 | 362 | }else if (model=="timedep"){ 363 | 364 | g.inv <- sqrt 365 | g <- function(x) { 366 | x^2 367 | } 368 | 369 | t.max <- 9 370 | t.min <- 0 371 | 372 | z.list <- list() 373 | for (i in 1:n0) { 374 | z <- x[id.vect == i,true_set] 375 | z.list[[i]] <- cbind(z, exp(z %*% beta)) 376 | } 377 | 378 | k <- function(x, m, M, rates, t){ 379 | ifelse(x <= m | x >= M, 0, dpexp(x, rates, t)) 380 | } 381 | 382 | gen.y <- function(z,time_base,t.max,t.min){ 383 | 384 | exp.etay <- z[,ncol(z)] 385 | 386 | t <- time_base 387 | t.diff <- (t[-1] - t[1:(length(t) - 1)])[-(length(t) - 1)] 388 | g.inv.t <- g.inv(t) 389 | g.inv.t.diff <- (g.inv(t[-1]) - g.inv(t[1:(length(t) - 1)]))[-(length(t) - 1)] 390 | 391 | g.inv.t.max <- g.inv(t.max) 392 | g.inv.t.min <- g.inv(t.min) 393 | 394 | x1 <- exp.etay 395 | d <- ppexp(g.inv.t.max, x1, g.inv.t) - ppexp(g.inv.t.min, x1,g.inv.t) 396 | M <- 1 / d 397 | r <- 60 398 | repeat{ 399 | y <- rpexp(r, x1, g.inv.t) 400 | u <- runif(r) 401 | t <- M * ((k(y, g.inv.t.min, g.inv.t.max, x1, g.inv.t) / d / 402 | dpexp(y, x1, g.inv.t))) 403 | y <- y[u <= t][1] 404 | if (!is.na(y)) break 405 | } 406 | y 407 | } 408 | 409 | y <- sapply(z.list, gen.y, time_base = 0:9, t.max=t.max, t.min=t.min) 410 | g.y <- g(y) 411 | 412 | # print(summary(g.y)) 413 | 414 | # ct <- runif(n, min=5,max=6) 415 | # d <- ifelse(g.y < ct, 1, 0) 416 | # g.y <- ifelse(g.y < ct, g.y, ct) 417 | 418 | prop.cen <- 0.5 419 | d <- sample(0:1, n, replace = TRUE, prob = c(prop.cen, 1 - prop.cen)) 420 | 421 | data <- NULL 422 | for (i in 1:n0) { 423 | id.temp <- rep(i, ceiling(g.y[i])) 424 | time.temp <- c(1:ceiling(g.y[i])) 425 | time0.temp <- 0:(ceiling(g.y[i]) - 1) 426 | d.temp <- c(rep(0, length(time.temp) - 1), d[i]) 427 | 428 | if (ceiling(g.y[i]) > 9){ 429 | z.temp <- xcount[id.vect==i,] 430 | 431 | if (length(id.temp) > nrow(z.temp)){ 432 | 433 | z.temp <- rbind(z.temp, 434 | # do.call("rbind", replicate(length(id.temp) - nrow(z.temp), 435 | # z.temp[nrow(z.temp),], 436 | # simplify = FALSE)) 437 | z.temp[nrow(z.temp),] 438 | ) 439 | 440 | id.temp <- id.temp[c(1:10,length(id.temp))] 441 | time.temp <- time.temp[c(1:10,length(id.temp))] 442 | time0.temp <- time0.temp[c(1:11)] 443 | d.temp <- d.temp[c(1:10,length(id.temp))] 444 | 445 | } 446 | 447 | }else{ 448 | z.temp <- xcount[id.vect==i,][1:(ceiling(g.y[i])),] 449 | } 450 | 451 | if (is.null(nrow(z.temp))){ 452 | 453 | data.temp <- c(id.temp, time.temp, time0.temp, d.temp, z.temp) 454 | 455 | }else{ 456 | 457 | data.temp <- cbind(id.temp, time.temp, time0.temp, d.temp, z.temp) 458 | } 459 | 460 | data <- rbind(data, data.temp) 461 | } 462 | 463 | colnames(data) <- c('id', 't', 't0', 'd', paste0("z",1:p)) 464 | data <- data.frame(data) 465 | data_unique <- data %>% 466 | group_by(id) %>% 467 | filter(row_number() == n()) %>% 468 | ungroup() 469 | 470 | xcount <- data[,-c(1:4)] 471 | colnames(xcount) <- rownames(xcount) <- NULL 472 | colnames(xcount) <- paste0("taxa",1:p) 473 | 474 | xcount_baseline <- data_unique[,-c(1:4)] 475 | colnames(xcount_baseline) <- rownames(xcount_baseline) <- NULL 476 | colnames(xcount_baseline) <- paste0("taxa",1:p) 477 | 478 | ret <- list(xcount=xcount, 479 | xcount_baseline=xcount_baseline, 480 | data=data, 481 | data_unique=data_unique, 482 | beta=betavec, 483 | idx=true_set) 484 | 485 | }else if (model == "gee"){ 486 | 487 | 
tvec <- rep(0:(m-1),n0) 488 | 489 | if (geetype == "gaussian"){ 490 | 491 | if (corstr == "independence"){ 492 | 493 | SIGMA <- diag(sdvec) %*% diag(m) %*% diag(sdvec) 494 | 495 | }else if (corstr == "exchangeable"){ 496 | 497 | R <- matrix(rhogee,m,m)+diag(rep(1-rhogee,m)) # Working correlation matrix 498 | SIGMA <- diag(sdvec) %*% R %*% diag(sdvec) 499 | 500 | }else if (corstr == "AR-1"){ 501 | 502 | R <- matrix(NA,nrow=m,ncol=m) 503 | for (t1 in 1:m) { 504 | for (t2 in 1:m) { 505 | R[t1,t2]<-rhogee^abs(t1-t2) 506 | } 507 | } 508 | SIGMA <- diag(sdvec) %*% R %*% diag(sdvec) 509 | 510 | } 511 | 512 | # covariance matrix of error 513 | error <- rmvnorm(n0, mean = rep(0,m),SIGMA) 514 | 515 | # form continuous longitudinal outcomes 516 | y <- tvec*geeslope + x[,true_set] %*% beta + as.vector(t(error)) 517 | 518 | if(intercept) { 519 | intcpt <- rnorm(1,mean=1,sd=1) 520 | y <- y + intcpt 521 | } 522 | 523 | }else if (geetype == "binomial"){ 524 | 525 | eta <- tvec*geeslope + x[,true_set] %*% beta 526 | if(intercept) { 527 | intcpt <- rnorm(1,mean=1,sd=1) 528 | eta <- eta + intcpt 529 | } 530 | prob <- exp(eta)/(1+exp(eta)) 531 | y <- rep(NA,n) 532 | 533 | if (corstr == "independence"){ 534 | 535 | for (i in 1:n){ 536 | y[i] <- rbinom(1,1,prob=prob[i]) 537 | } 538 | 539 | }else if (corstr == "exchangeable"){ 540 | 541 | for (i in 1:n0){ 542 | 543 | mu <- prob[id.vect==i] 544 | y[id.vect==i] <- binsimuexch(mu,rhogee) 545 | 546 | } 547 | 548 | }else if (corstr == "AR-1"){ 549 | 550 | for (i in 1:n0){ 551 | 552 | mu <- prob[id.vect==i] 553 | y[id.vect==i] <- binsimuar1(mu,rhogee) 554 | 555 | } 556 | 557 | } 558 | 559 | } 560 | 561 | ret <- list(xcount=xcount, 562 | x=xobs, 563 | y=y, 564 | id=id.vect, 565 | tvec=tvec, 566 | beta=betavec, 567 | idx=true_set) 568 | 569 | if (intercept) ret$intercept=intcpt 570 | 571 | } 572 | 573 | return(ret) 574 | 575 | } 576 | -------------------------------------------------------------------------------- /R/LogRatioCoxLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioCoxLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | foldid=NULL, 12 | step2=FALSE, 13 | progress=TRUE, 14 | plot=TRUE, 15 | mcv="Deviance", 16 | loop1=FALSE, 17 | loop2=FALSE, 18 | ncore=1){ 19 | 20 | ptm <- proc.time() 21 | 22 | t <- y[,1] 23 | d <- y[,2] 24 | n <- nrow(y) 25 | p <- ncol(x) 26 | if (length(unique(t)) < n) t <- t+runif(n,min=0,max=1e-4) # break ties 27 | 28 | tj <- sort(t[d==1]) 29 | devnull <- log(sum(t >= tj[1])) 30 | denomj <- rep(NA,length(tj)) 31 | denomj[1] <- sum(t >= tj[1]) 32 | for (j in 2:length(tj)){ 33 | denomj[j] <- denomj[j-1] - sum(t >= tj[j-1] & t < tj[j]) 34 | devnull <- devnull + log(sum(t >= tj[j])) 35 | } 36 | devnull <- 2*devnull 37 | 38 | expect <- sapply(t,function(x) sum(1/denomj[which(tj <= x)])) 39 | sfun = d - expect 40 | # hfun <- expect - sapply(t,function(x) sum(1/denomj[which(tj <= x)]^2)) + 1e-8 # hessian 41 | # z <- sfun/hfun 42 | if (a > 0){ 43 | lambda0 <- max(abs(t(sfun) %*% x))/(a*n) 44 | }else if (a == 0){ 45 | lambda0 <- max(abs(t(sfun) %*% x))/(1e-3*n) 46 | } 47 | 48 | adjust = FALSE 49 | if (ncov > 0) adjust = TRUE 50 | 51 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 52 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 53 | 54 | if (progress) cat("Algorithm running for full dataset: \n") 55 | 56 | fullfit <- 
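# Annotation (explanatory, not part of the package source): tj holds the
# ordered event times and denomj the matching risk-set sizes.  For subject i,
# expect_i = sum over events with tj <= t_i of 1/denomj is the expected event
# count under the null model, so sfun = d - expect is the null-model score (a
# martingale-type residual), lambda0 = max|x'sfun| / (a * n) starts the path,
# and devnull = 2 * sum_j log(risk-set size at tj) is the null partial
# likelihood deviance passed to cox_enet_al().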
cox_enet_al(x,t,d,tj,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,devnull,progress,loop1,loop2,notcv=TRUE) 57 | lidx <- which(fullfit$loglik != 0 | !is.nan(fullfit$loglik)) 58 | 59 | dev <- -2*fullfit$loglik[lidx] 60 | 61 | if (!is.null(colnames(x))){ 62 | rownames(fullfit$beta) = colnames(x) 63 | }else{ 64 | colnames(x) = 1:ncol(x) 65 | rownames(fullfit$beta) = 1:ncol(x) 66 | } 67 | 68 | if (!is.null(ncv)){ 69 | 70 | cvdev <- matrix(0,nrow=length(lidx),ncol=ncv) 71 | # cvdevnull <- rep(0,ncol=ncv) 72 | 73 | if (is.null(foldid)){ 74 | labels <- coxsplity(as.matrix(y),ncv) 75 | }else{ 76 | labels <- foldid 77 | } 78 | 79 | # labels <- caret::createFolds(factor(d),k=ncv) 80 | 81 | if (ncore == 1){ 82 | 83 | for (cv in 1:ncv){ 84 | 85 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 86 | 87 | # train.x <- x[-labels[[cv]],] 88 | # train.d <- d[-labels[[cv]]] 89 | # train.t <- t[-labels[[cv]]] 90 | # test.x <- x[labels[[cv]],] 91 | # test.d <- d[labels[[cv]]] 92 | # test.t <- t[labels[[cv]]] 93 | 94 | train.x <- x[labels!=cv,] 95 | train.d <- d[labels!=cv] 96 | train.t <- t[labels!=cv] 97 | test.x <- x[labels==cv,] 98 | test.d <- d[labels==cv] 99 | test.t <- t[labels==cv] 100 | 101 | cv.devnull <- 0 102 | train.tj <- sort(train.t[train.d==1]) 103 | for (j in 1:length(train.tj)){ 104 | cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j])) 105 | } 106 | cv.devnull <- 2*cv.devnull 107 | 108 | cvfit <- cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,maxiter,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,progress,loop1,loop2,notcv=FALSE) 109 | 110 | cv.devnull <- 0 111 | loglik <- rep(0,length(lidx)) 112 | linprod <- test.x %*% cvfit$beta 113 | 114 | if (mcv == "Deviance"){ 115 | 116 | test.tj <- sort(test.t[test.d==1]) 117 | for (j in 1:length(test.tj)){ 118 | cv.devnull <- cv.devnull + log(sum(test.t >= test.tj[j])) 119 | if (sum(test.t >= test.tj[j]) > 1){ 120 | cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] - 121 | log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8) 122 | }else if (sum(test.t >= test.tj[j]) == 1){ 123 | cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] - 124 | log(exp(linprod[test.t >= test.tj[j],]) + 1e-8) 125 | } 126 | #- log(accu(link(widx))+1e-8) 127 | } 128 | 129 | # cvdevnull[cv] <- 2*cv.devnull 130 | cvdev[,cv] <- -2*cvdev[,cv] 131 | 132 | }else if (mcv == "Cindex"){ 133 | 134 | for (kk in 1:length(lidx)){ 135 | cvdev[kk,cv] <- 1-concordance.index(x=linprod[,kk],surv.time=test.t,surv.event=test.d)$c.index 136 | } 137 | 138 | } 139 | # cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y - exp(x)/(1+exp(x)))^2)/length(test.y)) 140 | 141 | } 142 | 143 | }else if (ncore > 1){ 144 | 145 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 146 | 147 | cl <- makeCluster(ncore) 148 | registerDoParallel(cl) 149 | 150 | cvdev <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 151 | 152 | train.x <- x[labels!=cv,] 153 | train.d <- d[labels!=cv] 154 | train.t <- t[labels!=cv] 155 | test.x <- x[labels==cv,] 156 | test.d <- d[labels==cv] 157 | test.t <- t[labels==cv] 158 | 159 | cv.devnull <- 0 160 | train.tj <- sort(train.t[train.d==1]) 161 | for (j in 1:length(train.tj)){ 162 | cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j])) 163 | } 164 | cv.devnull <- 2*cv.devnull 165 | 166 | cvfit <- 
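# Annotation (explanatory, not part of the package source): each fold refits
# the lambda path on the training rows and is scored on the held-out rows.
# For mcv = "Deviance" the cross-validated partial-likelihood deviance sums,
# over the test-set event times, the event's linear predictor minus log(sum of
# exp(linear predictors) over the test risk set), multiplied by -2; for
# mcv = "Cindex" the criterion is 1 - C-index from survcomp::concordance.index(),
# so smaller is better for both measures.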
cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,maxiter,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,FALSE,loop1,loop2,notcv=FALSE) 167 | 168 | linprod <- test.x %*% cvfit$beta 169 | cv.dev <- rep(0,length(lidx)) 170 | 171 | test.tj <- sort(test.t[test.d==1]) 172 | for (j in 1:length(test.tj)){ 173 | if (sum(test.t >= test.tj[j]) > 1){ 174 | cv.dev <- cv.dev + linprod[test.t == test.tj[j],] - 175 | log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8) 176 | }else if (sum(test.t >= test.tj[j]) == 1){ 177 | cv.dev <- cv.dev + linprod[test.t == test.tj[j],] - 178 | log(exp(linprod[test.t >= test.tj[j],]) + 1e-8) 179 | } 180 | } 181 | 182 | cv.dev <- -2*cv.dev 183 | cv.dev 184 | } 185 | 186 | stopCluster(cl) 187 | 188 | } 189 | 190 | mean.cvdev <- rowMeans(cvdev) 191 | se.cvdev <- apply(cvdev,1,function(x) sd(x)/sqrt(ncv)) 192 | 193 | idx.min <- which.min(mean.cvdev) 194 | se.min <- se.cvdev[idx.min] 195 | idx.1se <- suppressWarnings(max(which(mean.cvdev > mean.cvdev[idx.min] + se.min & 1:length(lidx) < idx.min))) 196 | if (idx.1se == -Inf) idx.1se = 1 197 | 198 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 199 | add.1se = fullfit$beta[,idx.1se]) 200 | 201 | best.idx <- list(idx.min = idx.min, 202 | idx.1se = idx.1se) 203 | 204 | ret <- list(beta=fullfit$beta[,lidx], 205 | lambda=fullfit$lambda[lidx], 206 | a=a, 207 | loss=fullfit$loss[lidx], 208 | mse=fullfit$mse[lidx], 209 | tol=fullfit$tol[lidx], 210 | iters=fullfit$iters[lidx], 211 | cvdev.mean=mean.cvdev, 212 | cvdev.se=se.cvdev, 213 | best.beta=best.beta, 214 | best.idx=best.idx, 215 | foldid=labels 216 | ) 217 | 218 | if (plot){ 219 | 220 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 221 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 222 | beta_nzero$loglambda <- log(beta_nzero$lambda) 223 | 224 | lambda_count <- data.frame(loglambda = log(ret$lambda), 225 | count = colSums(ret$beta != 0)) 226 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 227 | 228 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 229 | top10name <- names(top10feat) 230 | 231 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 232 | geom_line() + 233 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 234 | theme_bw() + 235 | theme(legend.position = "none") + 236 | xlab(expression(paste("log(",lambda,")"))) + 237 | ylab("Coefficient") + 238 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 239 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 240 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 241 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 242 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 243 | 244 | devplot <- data.frame(loglambda=log(ret$lambda), 245 | dev=ret$cvdev.mean, 246 | se=ret$cvdev.se, 247 | devaddse=ret$cvdev.mean+ret$cvdev.se, 248 | devminse=ret$cvdev.mean-ret$cvdev.se) 249 | 250 | pdev <- ggplot(devplot, aes(x=.data$loglambda, y=.data$dev)) + 251 | geom_errorbar(aes(ymin=.data$devminse,ymax=.data$devaddse),color="grey")+ 252 | geom_point(color="red")+ 253 | theme_bw() + 254 | xlab(expression(paste("log(",lambda,")"))) + 255 | ylab("Partial Likelihood Deviance")+ 256 | 
geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 257 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 258 | annotate("text",x=lambda_count$loglambda,y=max(devplot$devaddse)+0.05,label=as.character(lambda_count$count))+ 259 | ggtitle(expression(paste("Cross-validated deviance versus log(",lambda,")"))) 260 | 261 | ret$pcoef <- pcoef 262 | ret$pdev <- pdev 263 | 264 | } 265 | 266 | if (step2){ # need to develop a equivalent lasso procedure for this. Stepwise selection is too slow for a big number of selected variables. 267 | 268 | if (!adjust){ 269 | 270 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 271 | 272 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 273 | 274 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 275 | for (k in 1:ncol(idxs)){ 276 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 277 | } 278 | 279 | if (ncol(x.select.min) > 1){ 280 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 281 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 282 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 283 | }else{ 284 | idxs <- as.vector(idxs) 285 | } 286 | 287 | df_step2 <- data.frame(t=t,d=d,x=x.select.min) 288 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 289 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 290 | 291 | if (is.null(ncol(idxs))){ 292 | selected <- idxs 293 | }else{ 294 | selected <- idxs[,vars] 295 | } 296 | 297 | ret$step2.feature.min = selected 298 | ret$step2fit.min <- step2fit 299 | } 300 | 301 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 302 | 303 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 304 | 305 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 306 | for (k in 1:ncol(idxs)){ 307 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 308 | } 309 | 310 | if (ncol(x.select.min) > 1){ 311 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 312 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 313 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 314 | }else{ 315 | idxs <- as.vector(idxs) 316 | } 317 | 318 | df_step2 <- data.frame(t=t,d=d,x=x.select.min) 319 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 320 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 321 | 322 | if (is.null(ncol(idxs))){ 323 | selected <- idxs 324 | }else{ 325 | selected <- idxs[,vars] 326 | } 327 | 328 | # for (k1 in 1:nrow(selected)){ 329 | # for (k2 in 1:ncol(selected)){ 330 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 331 | # } 332 | # } 333 | 334 | ret$step2.feature.1se = selected 335 | ret$step2fit.1se <- step2fit 336 | } 337 | 338 | }else{ 339 | 340 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 341 | 342 | allidx <- which(ret$best.beta$min.mse!=0) 343 | 344 | covidx <- allidx[allidx <= ncov] 345 | taxidx <- allidx[allidx > ncov] 346 | 347 | idxs <- NULL 348 | x.select.min <- NULL 349 | 350 | if (length(taxidx) > 1){ 351 | idxs <- combn(taxidx,2) 352 | for (k in 1:ncol(idxs)){ 353 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 354 | } 355 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 
356 | } 357 | 358 | if (length(covidx) > 0){ 359 | x.select.min <- cbind(x.select.min, x[,covidx]) 360 | if (!is.null(idxs)){ 361 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 362 | }else{ 363 | colnames(x.select.min) <- colnames(x)[covidx] 364 | } 365 | } 366 | 367 | 368 | # if(is.null(x.select.min)) break 369 | 370 | if (ncol(x.select.min) > 1){ 371 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 372 | 373 | if (length(taxidx) < 2){ 374 | 375 | idxs <- NULL 376 | 377 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 378 | 379 | }else{ 380 | 381 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 382 | 383 | if (length(covidx) > 0){ 384 | 385 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 386 | # 1:ncol(idxs))] 387 | 388 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 389 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 390 | 391 | } 392 | 393 | } 394 | 395 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 396 | } 397 | 398 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 399 | df_step2 <- data.frame(t=t,d=d,x.select.min) 400 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 401 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 402 | vars <- vars[!is.na(vars)] 403 | 404 | selected <- NULL 405 | if (is.null(ncol(idxs)) & length(vars) > 0){ 406 | selected <- idxs 407 | }else if (length(vars) > 0){ 408 | selected <- idxs[,vars] 409 | } 410 | 411 | ret$step2.feature.min = selected 412 | ret$step2fit.min <- step2fit 413 | } 414 | 415 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 416 | 417 | allidx <- which(ret$best.beta$add.1se!=0) 418 | 419 | covidx <- allidx[allidx <= ncov] 420 | taxidx <- allidx[allidx > ncov] 421 | 422 | idxs <- NULL 423 | x.select.min <- NULL 424 | 425 | if (length(taxidx) > 1){ 426 | idxs <- combn(taxidx,2) 427 | for (k in 1:ncol(idxs)){ 428 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 429 | } 430 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 431 | } 432 | 433 | if (length(covidx) > 0){ 434 | x.select.min <- cbind(x.select.min, x[,covidx]) 435 | if (!is.null(idxs)){ 436 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 437 | }else{ 438 | colnames(x.select.min) <- colnames(x)[covidx] 439 | } 440 | } 441 | 442 | # if(is.null(x.select.min)) break 443 | 444 | if (ncol(x.select.min) > 1){ 445 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 446 | 447 | if (length(taxidx) < 2){ 448 | 449 | idxs <- NULL 450 | 451 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 452 | 453 | }else{ 454 | 455 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 456 | 457 | if (length(covidx) > 0){ 458 | 459 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 460 | 461 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 462 | 
(ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 463 | 464 | } 465 | 466 | } 467 | 468 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 469 | } 470 | 471 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 472 | df_step2 <- data.frame(t=t,d=d,x.select.min) 473 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 474 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 475 | vars <- vars[!is.na(vars)] 476 | 477 | selected <- NULL 478 | if (is.null(ncol(idxs)) & length(vars) > 0){ 479 | selected <- idxs 480 | }else if (length(vars) > 0){ 481 | selected <- idxs[,vars] 482 | } 483 | 484 | ret$step2.feature.1se = selected 485 | ret$step2fit.1se <- step2fit 486 | 487 | } 488 | 489 | } 490 | 491 | } 492 | 493 | }else{ 494 | 495 | ret <- list(beta=fullfit$beta[,lidx], 496 | lambda=fullfit$lambda[lidx], 497 | a=a, 498 | loss=fullfit$loss[lidx], 499 | mse=fullfit$mse[lidx] 500 | ) 501 | 502 | if (plot){ 503 | 504 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 505 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 506 | beta_nzero$loglambda <- log(beta_nzero$lambda) 507 | 508 | lambda_count <- data.frame(loglambda = log(ret$lambda), 509 | count = colSums(ret$beta != 0)) 510 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 511 | 512 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 513 | top10name <- names(top10feat) 514 | 515 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 516 | geom_line() + 517 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 518 | theme_bw() + 519 | theme(legend.position = "none") + 520 | xlab(expression(paste("log(",lambda,")"))) + 521 | ylab("Coefficient") + 522 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 523 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 524 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 525 | 526 | ret$pcoef <- pcoef 527 | 528 | } 529 | 530 | } 531 | 532 | ret$runtime <- proc.time() - ptm 533 | 534 | return(ret) 535 | 536 | } --------------------------------------------------------------------------------
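Editor's note on the lambda-path initialization in R/LogRatioCoxLasso.R above: the function starts the penalty path at the smallest lambda that zeroes out every penalized coefficient, computed from the score residuals of the null Cox model. The stand-alone sketch below reproduces that calculation on toy data so the steps can be run in isolation. It is not the package's exported interface (model fitting in the package is done by the Rcpp routine cox_enet_al, which is not shown here); the simulated x, t, and d are purely illustrative assumptions, while the variable names riskset, sfun, lambda0, and lambda mirror the source.

# Minimal sketch of the lambda-path initialization, assuming simulated inputs.
set.seed(1)
n <- 50; p <- 10
x <- matrix(rnorm(n * p), n, p)           # stand-in for log-transformed count matrix
t <- rexp(n)                              # survival times
d <- rbinom(n, 1, 0.7)                    # event indicators
t <- t + runif(n, 0, 1e-4)                # break ties, as in the source

tj      <- sort(t[d == 1])                                   # ordered event times
riskset <- sapply(tj, function(s) sum(t >= s))                # risk-set size |R(t_j)| at each event time
expect  <- sapply(t, function(u) sum(1 / riskset[tj <= u]))   # expected event count under the null model
sfun    <- d - expect                                         # null-model score (martingale-type) residuals

a       <- 1                                                  # elastic-net mixing weight (a = 1 is the lasso)
lambda0 <- max(abs(t(sfun) %*% x)) / (a * n)                  # smallest lambda with an all-zero solution
lambda  <- 10^seq(log10(lambda0), log10(lambda0 * 1e-2),      # decreasing path, default ratio 1e-2 as above
                  length.out = 100)

This sketch stops at the penalty grid; it deliberately omits covariate adjustment (ncov, wcov), the augmented-Lagrangian solver, cross-validation, and the step-2 pairwise log-ratio refit that the full function performs.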