├── .github ├── .gitignore ├── workflows │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── CODE_OF_CONDUCT.md └── pull_request_template.md ├── vignettes ├── .gitignore ├── Using-FLORAL-with-phyloseq.Rmd ├── Using-FLORAL-for-survival-models-with-longitudinal-microbiome-data.Rmd └── Using-FLORAL-for-Microbiome-Analysis.Rmd ├── R ├── sysdata.rda ├── coxsplit.R ├── RcppExports.R ├── utils.R ├── LogRatioLasso.R ├── LogRatioLogisticLasso.R ├── simu.R └── LogRatioCoxLasso.R ├── man ├── figures │ ├── logo.jpg │ ├── logo.png │ ├── README-plot-1.png │ └── README-plot-2.png ├── phy_to_floral_data.Rd ├── simu.Rd ├── a.FLORAL.Rd ├── mcv.FLORAL.Rd └── FLORAL.Rd ├── _pkgdown.yml ├── pkgdown └── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ └── apple-touch-icon-180x180.png ├── inst ├── extdata │ └── YachidaS_2019.Rdata └── WORDLIST ├── tests ├── spelling.R ├── testthat │ ├── test-simu.R │ ├── test-FLORAL.R │ └── test-utils.R └── testthat.R ├── .gitignore ├── .Rbuildignore ├── codecov.yml ├── cran-comments.md ├── LogRatioReg.Rproj ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp └── pgee.cpp ├── data-raw └── internal.R ├── NAMESPACE ├── DESCRIPTION ├── NEWS.md ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /man/figures/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/logo.jpg -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://vdblab.github.io/FLORAL/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /man/figures/README-plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/README-plot-1.png -------------------------------------------------------------------------------- /man/figures/README-plot-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/man/figures/README-plot-2.png -------------------------------------------------------------------------------- /inst/extdata/YachidaS_2019.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/inst/extdata/YachidaS_2019.Rdata -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vdblab/FLORAL/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | R/simulation.R 9 | R/main_script.R 10 | docs 11 | inst/doc 12 | /doc/ 13 | /Meta/ 14 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^.*\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^LICENSE$ 6 | ^\.github$ 7 | ^README\.Rmd$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^codecov\.yml$ 12 | ^doc$ 13 | ^Meta$ 14 | ^cran-comments\.md$ 15 | ^data-raw$ 16 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * windows-latest (release) (on github actions) 3 | * ubuntu-latest (release) (on github actions) 4 | 5 | ## R CMD check results 6 | Maintainer: 'Teng Fei ' 7 | 8 | New submission 9 | 10 | ## Additional Comments 11 | 12 | Thank you for your time. -------------------------------------------------------------------------------- /tests/testthat/test-simu.R: -------------------------------------------------------------------------------- 1 | test_that("simu() works", { 2 | expect_error( 3 | dat <- simu(n=50,p=30,model="linear"), 4 | NA 5 | ) 6 | expect_error( 7 | dat <- simu(n=50,p=30,model="binomial"), 8 | NA 9 | ) 10 | expect_error( 11 | dat <- simu(n=50,p=30,model="cox"), 12 | NA 13 | ) 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(FLORAL) 11 | 12 | test_check("FLORAL") 13 | -------------------------------------------------------------------------------- /LogRatioReg.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: 80ae1746-c1e0-4fc6-a99b-9e728485768c 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | BuildType: Package 17 | PackageUseDevtools: Yes 18 | PackageInstallArgs: --no-multiarch --with-keep.source 19 | PackageCheckArgs: --as-cran 20 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Baichoo 2 | Biostatistics 3 | CMD 4 | CRC 5 | Compositional 6 | DM 7 | Dai 8 | Epub 9 | Gönen 10 | Hendry 11 | Interpretable 12 | JU 13 | LOg 14 | Lv 15 | MRM 16 | MSKCC 17 | Microbiome 18 | Miltiadous 19 | PMC 20 | PMCID 21 | PMID 22 | Peled 23 | Perales 24 | Phyloseq 25 | RAtio 26 | Reproducibility 27 | Sadeghi 28 | Scalable 29 | Shouval 30 | al 31 | bioRxiv 32 | colorectal 33 | compositional 34 | conda 35 | crmeth 36 | doi 37 | et 38 | feit 39 | finegray 40 | glom 41 | microbiome 42 | mskcc 43 | multivariable 44 | ncv 45 | performace 46 | pfilter 47 | phy's 48 | phyloseq 49 | pre 50 | preprint 51 | reproducibility 52 | se 53 | timedep 54 | -------------------------------------------------------------------------------- /tests/testthat/test-FLORAL.R: -------------------------------------------------------------------------------- 1 | test_that("FLORAL() works", { 2 | 3 | set.seed(23420) 4 | 5 | dat <- simu(n=50,p=30,model="linear") 6 | expect_error( 7 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",progress=FALSE,step2=TRUE), 8 | NA 9 | ) 10 | 11 | dat <- simu(n=50,p=30,model="binomial") 12 | expect_error( 13 | fit <- FLORAL(dat$xcount,dat$y,family="binomial",progress=FALSE,step2=TRUE), 14 | NA 15 | ) 16 | 17 | dat <- simu(n=50,p=30,model="cox") 18 | expect_error( 19 | fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d),family="cox",progress=FALSE,step2=TRUE), 20 | NA 21 | ) 22 | 23 | }) 24 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 
8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | ## CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | CXX_STD=CXX17 -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | ## CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | CXX_STD=CXX17 16 | -------------------------------------------------------------------------------- /data-raw/internal.R: -------------------------------------------------------------------------------- 1 | if (interactive()){ 2 | cols = c("SampleID", "PatientID", "Timepoint", "Consistency", "Accession", 3 | "BioProject", "DayRelativeToNearestHCT") 4 | #this file has duplicate rows, and has multiple rows per pool 5 | samples <- read.csv("https://figshare.com/ndownloader/files/33076496")[, cols] 6 | samples <- samples[!duplicated(samples),] 7 | samples <- samples[1:100,] # Using the first 100 samples only. 8 | counts <- read.csv("https://figshare.com/ndownloader/files/26393788") 9 | counts <- counts[counts$SampleID %in% samples$SampleID, ] 10 | taxonomy <- read.csv("https://figshare.com/ndownloader/files/26770997") 11 | 12 | usethis::use_data(samples,counts,taxonomy, internal = TRUE) 13 | } -------------------------------------------------------------------------------- /man/phy_to_floral_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{phy_to_floral_data} 4 | \alias{phy_to_floral_data} 5 | \title{Create data input list from phyloseq object} 6 | \usage{ 7 | phy_to_floral_data(phy, y = NULL, covariates = NULL) 8 | } 9 | \arguments{ 10 | \item{phy}{Phyloseq object} 11 | 12 | \item{y}{Outcome column of interest from phy's sample_data} 13 | 14 | \item{covariates}{Covariate column names from phy's sample_data} 15 | } 16 | \value{ 17 | list 18 | } 19 | \description{ 20 | Create data input list from phyloseq object 21 | } 22 | \examples{ 23 | library(phyloseq) 24 | data(GlobalPatterns) 25 | # add a covariate 26 | sample_data(GlobalPatterns)$test <- rep(c(1, 0), nsamples(GlobalPatterns)/2) 27 | # GlobalPatterns <- tax_glom(GlobalPatterns, "Phylum") 28 | dat <- phy_to_floral_data(GlobalPatterns, y = "test", covariates = c("SampleType")) 29 | # res <- FLORAL(x = dat$xcount, y=dat$y, ncov=dat$ncov, family = "binomial", ncv=NULL) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(FLORAL) 4 | export(a.FLORAL) 5 | 
export(mcv.FLORAL) 6 | export(phy_to_floral_data) 7 | export(simu) 8 | import(Rcpp) 9 | import(doParallel) 10 | import(doRNG) 11 | import(dplyr) 12 | import(foreach) 13 | import(ggplot2) 14 | import(glmnet) 15 | import(parallel) 16 | import(phyloseq) 17 | import(survival) 18 | importFrom(caret,createFolds) 19 | importFrom(grDevices,rainbow) 20 | importFrom(msm,dpexp) 21 | importFrom(msm,ppexp) 22 | importFrom(msm,rpexp) 23 | importFrom(mvtnorm,rmvnorm) 24 | importFrom(reshape,melt) 25 | importFrom(stats,binomial) 26 | importFrom(stats,dist) 27 | importFrom(stats,gaussian) 28 | importFrom(stats,glm) 29 | importFrom(stats,median) 30 | importFrom(stats,na.omit) 31 | importFrom(stats,rbinom) 32 | importFrom(stats,rexp) 33 | importFrom(stats,rmultinom) 34 | importFrom(stats,rnorm) 35 | importFrom(stats,runif) 36 | importFrom(stats,sd) 37 | importFrom(stats,step) 38 | importFrom(survcomp,concordance.index) 39 | importFrom(utils,combn) 40 | useDynLib(FLORAL) 41 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | #pull_request: 7 | #branches: [main, master] 8 | 9 | name: pkgdown 10 | 11 | jobs: 12 | pkgdown: 13 | runs-on: ubuntu-latest 14 | # Only restrict concurrency for non-PR jobs 15 | concurrency: 16 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 17 | env: 18 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 19 | permissions: 20 | contents: write 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - uses: r-lib/actions/setup-pandoc@v2 25 | 26 | - uses: r-lib/actions/setup-r@v2 27 | with: 28 | use-public-rspm: true 29 | 30 | - uses: r-lib/actions/setup-r-dependencies@v2 31 | with: 32 | extra-packages: any::pkgdown, local::. 33 | needs: website 34 | 35 | - name: Build site 36 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 37 | shell: Rscript {0} 38 | 39 | - name: Deploy package 40 | run: | 41 | git config --local user.name "$GITHUB_ACTOR" 42 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 43 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 
14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (https://www.contributor-covenant.org), version 1.0.0, available at 25 | https://contributor-covenant.org/version/1/0/0/. 26 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: FLORAL 2 | Type: Package 3 | Title: Fit Log-Ratio Lasso Regression for Compositional Data 4 | Version: 0.6.0 5 | Date: 2025-07-13 6 | Authors@R: c( 7 | person("Teng", "Fei", , "feit1@mskcc.org", role = c("aut", "cre", "cph"), 8 | comment = c(ORCID = "0000-0001-7888-1715")), 9 | person("Tyler", "Funnell", role=c("aut"), 10 | comment = c(ORCID = "0000-0003-1612-5644")), 11 | person("Nicholas", "Waters", role=c("aut"), 12 | comment = c(ORCID = "0000-0002-9035-2143")), 13 | person("Sandeep", "Raj", role=c("aut"), 14 | comment = c(ORCID = "0000-0003-4629-0528")) 15 | ) 16 | Description: Log-ratio Lasso regression for continuous, binary, and survival outcomes with (longitudinal) compositional features. See Fei and others (2024) . 17 | License: GPL (>= 3) 18 | URL: https://vdblab.github.io/FLORAL/ 19 | BugReports: https://github.com/vdblab/FLORAL/issues 20 | Depends: 21 | R (>= 3.5.0) 22 | SystemRequirements: C++17 23 | biocViews: 24 | Imports: 25 | Rcpp (>= 1.0.9), 26 | stats, 27 | survival, 28 | ggplot2, 29 | survcomp, 30 | reshape, 31 | dplyr, 32 | glmnet, 33 | caret, 34 | grDevices, 35 | utils, 36 | mvtnorm, 37 | doParallel, 38 | doRNG, 39 | foreach, 40 | msm, 41 | phyloseq 42 | LinkingTo: Rcpp, RcppArmadillo, RcppProgress 43 | RoxygenNote: 7.3.2 44 | Encoding: UTF-8 45 | Suggests: 46 | covr, 47 | knitr, 48 | rmarkdown, 49 | spelling, 50 | testthat (>= 3.0.0), 51 | patchwork, 52 | tidyverse 53 | Language: en-US 54 | Config/testthat/edition: 3 55 | VignetteBuilder: knitr 56 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **What changes are proposed in this pull request?** 2 | 3 | 4 | **If there is an GitHub issue associated with this pull request, please provide link.** 5 | 6 | 7 | -------------------------------------------------------------------------------- 8 | 9 | Checklist for PR reviewer 10 | 11 | - [ ] PR branch has pulled the most recent updates from master branch. Ensure the pull request branch and your local version match and both have the latest updates from the master branch. 12 | - [ ] If a new function was added, function included in `_pkgdown.yml` 13 | - [ ] If a bug was fixed, a unit test was added for the bug check 14 | - [ ] Run `pkgdown::build_site()`. Check the R console for errors, and review the rendered website. 15 | - [ ] Code coverage is suitable for any new functions/features. Review coverage with `covr::report()`. Before you run, set `Sys.setenv(NOT_CRAN="true")` and begin in a fresh R session without any packages loaded. 
16 | - [ ] R CMD check runs without errors, warnings, and notes 17 | - [ ] `usethis::use_spell_check()` runs with no spelling errors in documentation 18 | 19 | When the branch is ready to be merged into master: 20 | - [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# FLORAL (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end of the update. 21 | - [ ] Increment the version number using `usethis::use_version(which = "dev")` 22 | - [ ] Run `usethis::use_spell_check()` again 23 | - [ ] Approve Pull Request 24 | - [ ] Merge the PR. Please use "Squash and merge". 25 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # FLORAL 0.6.0 2 | 3 | * Removing dependencies with R package `ast2ast`. 4 | 5 | # FLORAL 0.5.0 6 | 7 | * Adding instructions to install the package via `bioconda`. 8 | 9 | * Address new CRAN requirements regarding Internet access. 10 | 11 | # FLORAL 0.4.0 12 | 13 | * Adding a function `phy_to_floral_data` which helps convert a `phyloseq` object to be compatible with `FLORAL`. 14 | 15 | * Adding a corresponding vignette for `phy_to_floral_data`. 16 | 17 | * Introducing the new GEE model for longitudinal continuous and binary outcomes. More documentation to follow in the next development version. 18 | 19 | # FLORAL 0.3.0 20 | 21 | * Improves stability when fitting the Fine-Gray model with longitudinal covariates. 22 | 23 | * Enables parallel computation for cross-validation. 24 | 25 | * Fixes several bugs as reported in Issues. 26 | 27 | * Adding an option to use user-specified pseudo counts. 28 | 29 | * Adding an option to use a user-specified maximum number of iterations. 30 | 31 | * Adding a simulation scenario for survival regression models with longitudinal features. 32 | 33 | * (BETA version of) the new GEE method. 34 | 35 | # FLORAL 0.2.0 36 | 37 | * Including more examples in the documentation compared to the CRAN version (0.1.0.9000) 38 | 39 | * Enables elastic net models. Users can specify the weight of the lasso penalty using argument `a`. (0.1.0.9001) 40 | 41 | * Allows adding non-compositional covariates which are not constrained by the zero-sum constraint. (0.1.0.9001) 42 | 43 | * Adds a function `mcv.FLORAL()` to perform multiple runs of k-fold cross-validation to summarize selection probabilities for features. (0.1.0.9001) 44 | 45 | * Adds a function `a.FLORAL()` to compare different choices of elastic net weight `a` for a fixed cross-validation setting. (0.1.0.9001) 46 | 47 | # FLORAL 0.1.0 48 | 49 | * Initial release. 50 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-with-phyloseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL with phyloseq data" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL with phyloseq data} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r setup, include=FALSE} 17 | knitr::opts_chunk$set(echo = TRUE) 18 | library(tidyverse) 19 | library(phyloseq) 20 | ``` 21 | 22 | [Phyloseq](http://joey711.github.io/phyloseq/) is a popular package for working with microbiome data.
Here we show how to use the `phy_to_floral_data` helper function to convert phyloseq data into a format accepted by FLORAL. 23 | 24 | The following code downloads data described in [this paper](https://www.nature.com/articles/s41597-021-00860-8) and turns it into a phyloseq object. The `tax_glom` step here takes some time, and can be replaced with [`speedyseq::tax_glom`](https://github.com/mikemc/speedyseq) for better performance. 25 | ```{r} 26 | samples <- get0("samples", envir = asNamespace("FLORAL")) 27 | counts <- get0("counts", envir = asNamespace("FLORAL")) 28 | taxonomy <- get0("taxonomy", envir = asNamespace("FLORAL")) 29 | 30 | phy <- phyloseq( 31 | sample_data(samples %>% column_to_rownames("SampleID")), 32 | tax_table(taxonomy %>% select(ASV, Kingdom:Genus) %>% column_to_rownames("ASV") %>% as.matrix()), 33 | otu_table(counts %>% pivot_wider(names_from = "SampleID", values_from = "Count", values_fill = 0) %>% column_to_rownames("ASV") %>% as.matrix(), taxa_are_rows = TRUE) 34 | ) %>% subset_samples(DayRelativeToNearestHCT > -30 & DayRelativeToNearestHCT < 0) %>% 35 | tax_glom("Genus") 36 | ``` 37 | 38 | Next, we convert that phyloseq object into a list of inputs to be used by FLORAL; we have to specify the main outcome of interest as `y`, and any metadata columns (from `sample_data(phy)`) to use as covariates. Note that the analysis described here is just an example for using the function. 39 | 40 | 41 | ```{r} 42 | dat <- FLORAL::phy_to_floral_data(phy, covariates=c("Consistency"), y = "DayRelativeToNearestHCT") 43 | ``` 44 | 45 | The resulting list has named elements for the main arguments to FLORAL: 46 | 47 | ```{r} 48 | res <- FLORAL::FLORAL(y = dat$y, x = dat$xcount, ncov = dat$ncov, family = "gaussian", ncv=NULL, progress=FALSE) 49 | ``` -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | #pull_request: 7 | # branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | # - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | # - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | # - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck,any::covr 45 | needs: check,coverage 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | - name: Test coverage 51 | if: matrix.config.os == 'ubuntu-latest' 52 | run: | 53 | covr::codecov( 54 | quiet = FALSE, 55 | clean = FALSE, 56 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 57 | ) 58 | shell: Rscript {0} 59 | 60 | - name: Show testthat output 61 | if: matrix.config.os == 'ubuntu-latest' 62 | run: | 63 | ## -------------------------------------------------------------------- 64 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 65 | shell: bash 66 | 67 | - name: Upload test results 68 | if: failure() && matrix.config.os == 'ubuntu-latest' 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: coverage-test-failures 72 | path: ${{ runner.temp }}/package 73 | -------------------------------------------------------------------------------- /R/coxsplit.R: -------------------------------------------------------------------------------- 1 | coxsplity=function(y, nfolds){ 2 | N=nrow(y) 3 | tem=data.frame(y, i=seq(N), foldid=0) 4 | tem=tem[order(y[, "time"], y[, "status"]), ] 5 | n1=sum(y[, "status"]);n2=N-n1 6 | 7 | tem$foldid[tem[, "status"]==1]=sample(rep(seq(nfolds), length=n1)) 8 | tem$foldid[tem[, "status"]==0]=sample(rep(seq(nfolds), length=n2)) 9 | 10 | foldid=tem$foldid[order(tem$i)] 11 | return(foldid) 12 | } 13 | 14 | coxsplitss=function(y, id, nfolds){ 15 | full = data.frame(y, foldid=0, id=id) 16 | tem = full %>% dplyr::group_by(.data$id) %>% dplyr::filter(row_number()==n()) 17 | N=nrow(tem) 18 | tem$i = seq(N) 19 | tem=tem[order(tem$stop, tem$status), ] 20 | n1=sum(y[, "status"]);n2=N-n1 21 | # tem = as.matrix(tem) 22 | 23 | tem$foldid[tem$status==1]=sample(rep(seq(nfolds), length=n1)) 24 | tem$foldid[tem$status==0]=sample(rep(seq(nfolds), length=n2)) 25 | 26 | temif <- tem %>% select(.data$foldid,.data$id) #data.frame(tem[,c("foldid","id")]) 27 | full <- full %>% select(.data$start,.data$stop,.data$status,.data$id) %>% left_join(temif,by="id") 28 | 29 | foldid <- full$foldid 30 | 31 | return(foldid) 32 | } 33 | 34 | fgfoldid=function(id, foldid){ 35 | idfoldid <- data.frame(id=unique(id),foldid=foldid) 36 | yfoldid <- data.frame(id=id) 37 | mergedid <- yfoldid %>% left_join(idfoldid,by="id") 38 | 39 | foldid <- mergedid$foldid 40 | return(foldid) 41 | } 42 | 43 | binsimuar1 <- function(mu,gamma){ 44 | 45 | # 
simulate correlated binary variables using method by Qaqish (2003) with AR(1) corstr 46 | 47 | len <- length(mu) 48 | y <- rep(0,len) 49 | y[1] <- rbinom(1,1,mu[1]) 50 | for (i in 2:len){ 51 | lambda <- mu[i] + gamma*(y[i-1] - mu[i-1])*sqrt((mu[i]*(1-mu[i]))/(mu[i-1]*(1-mu[i-1]))) 52 | if (lambda < 0) lambda = 0.0001 53 | if (lambda > 1) lambda = 0.9999 54 | y[i] <- rbinom(1,1,lambda) 55 | } 56 | y 57 | } 58 | 59 | binsimuexch <- function(mu,gamma){ 60 | 61 | # simulate correlated binary variables using method by Qaqish (2003) with exchangeable corstr 62 | 63 | len <- length(mu) 64 | y <- rep(0,len) 65 | y[1] <- rbinom(1,1,mu[1]) 66 | 67 | for (i in 2:len){ 68 | 69 | lambda <- mu[i] 70 | 71 | for (j in 1:(i-1)){ 72 | 73 | lambda <- lambda + (gamma/(1+(i-2)*gamma)) * sqrt((mu[i]*(1-mu[i]))/(mu[j]*(1-mu[j]))) * (y[j]-mu[j]) 74 | 75 | } 76 | 77 | if (lambda < 0) lambda = 0.0001 78 | if (lambda > 1) lambda = 0.9999 79 | y[i] <- rbinom(1,1,lambda) 80 | } 81 | y 82 | } 83 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | softthreshold <- function(x, lambda) { 5 | .Call('_FLORAL_softthreshold', PACKAGE = 'FLORAL', x, lambda) 6 | } 7 | 8 | gd_cov <- function(xx, xy, n, l, beta) { 9 | .Call('_FLORAL_gd_cov', PACKAGE = 'FLORAL', xx, xy, n, l, beta) 10 | } 11 | 12 | gd_cov_al <- function(xx, xy, n, l, a, beta, mu, alpha, adjust = FALSE, ncov = 0L, wcov = 0) { 13 | .Call('_FLORAL_gd_cov_al', PACKAGE = 'FLORAL', xx, xy, n, l, a, beta, mu, alpha, adjust, ncov, wcov) 14 | } 15 | 16 | linear_enet_al <- function(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress = TRUE) { 17 | .Call('_FLORAL_linear_enet_al', PACKAGE = 'FLORAL', x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress) 18 | } 19 | 20 | logistic_enet_al <- function(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress = TRUE, loop1 = FALSE, loop2 = FALSE) { 21 | .Call('_FLORAL_logistic_enet_al', PACKAGE = 'FLORAL', x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress, loop1, loop2) 22 | } 23 | 24 | cox_enet_al <- function(x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE, loop1 = FALSE, loop2 = FALSE, notcv = TRUE) { 25 | .Call('_FLORAL_cox_enet_al', PACKAGE = 'FLORAL', x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress, loop1, loop2, notcv) 26 | } 27 | 28 | cox_timedep_enet_al <- function(x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE) { 29 | .Call('_FLORAL_cox_timedep_enet_al', PACKAGE = 'FLORAL', x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress) 30 | } 31 | 32 | fg_enet_al <- function(x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress = TRUE) { 33 | .Call('_FLORAL_fg_enet_al', PACKAGE = 'FLORAL', x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress) 34 | } 35 | 36 | gee_NR <- function(N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps = 1e-6, muu = 1e6) { 37 | .Call('_FLORAL_gee_NR', PACKAGE = 'FLORAL', N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps, muu) 38 | } 39 | 40 | gee_cor <- 
function(N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue = 1) { 41 | .Call('_FLORAL_gee_cor', PACKAGE = 'FLORAL', N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue) 42 | } 43 | 44 | gee_fit <- function(y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol = 1e-3, eps = 1e-6, muu = 1e6, maxiter1 = 100L, maxiter2 = 10L, scalefix = FALSE, scalevalue = 1, display_progress = TRUE) { 45 | .Call('_FLORAL_gee_fit', PACKAGE = 'FLORAL', y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol, eps, muu, maxiter1, maxiter2, scalefix, scalevalue, display_progress) 46 | } 47 | 48 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # #' Clean covariate Columns 2 | # #'.This converts categorical data to 1-hot encoded columns, and retains numeric column 3 | # #' @param df Data.frame of covariates for this dataset 4 | # #' @param cols columns to clean; columns not mentioned here are dropped 5 | # #' @param drop_first default TRUE; drop first level of encoding for 1-hot 6 | # #' 7 | # #' @return `data.frame` of recoded covariates 8 | # #' 9 | # #' @examples 10 | # #' df = data.frame(x=c("A", "B","C", "A", "B")) 11 | # #' clean_covariate_columns(df,cols="x") 12 | # #' clean_covariate_columns(df,cols="x", drop_first=FALSE) 13 | 14 | clean_covariate_columns <- function(df, cols, drop_first = TRUE){ 15 | result <- data.frame(placeholder = character(nrow(df))) 16 | for (col in cols) { 17 | # keep numeric asis 18 | if (is.numeric(df[, col])){ 19 | result = cbind(result, df[, col]) 20 | colnames(result)[-1] <- col 21 | } else { 22 | vals <- unique(df[, col]) 23 | valnames = paste0(col, "_", vals) 24 | if (drop_first) { 25 | vals <- vals[-1] 26 | valnames = valnames[-1] 27 | } 28 | res <- data.frame(ifelse(vals[1] == df[, col], 1, 0)) 29 | if (length(vals) > 1){ 30 | for (val_i in 2:length(vals)){ 31 | res = cbind(res, data.frame(ifelse(vals[val_i] == df[, col], 1, 0))) 32 | } 33 | } 34 | colnames(res) = valnames 35 | result <- cbind(result, res) 36 | } 37 | } 38 | #drop placeholder 39 | result <- result[, !colnames(result) == "placeholder"] 40 | # deal with single covariates where results becomes a vector 41 | if (!is.data.frame(result)){ 42 | result = data.frame(result) 43 | colnames(result) <- cols 44 | } 45 | return(result) 46 | } 47 | 48 | #' Create data input list from phyloseq object 49 | #' 50 | #' @param phy Phyloseq object 51 | #' @param y Outcome column of interest from phy's sample_data 52 | #' @param covariates Covariate column names from phy's sample_data 53 | #' 54 | #' @return list 55 | #' @export 56 | #' 57 | #' @examples 58 | #' library(phyloseq) 59 | #' data(GlobalPatterns) 60 | #' # add a covariate 61 | #' sample_data(GlobalPatterns)$test <- rep(c(1, 0), nsamples(GlobalPatterns)/2) 62 | #' # GlobalPatterns <- tax_glom(GlobalPatterns, "Phylum") 63 | #' dat <- phy_to_floral_data(GlobalPatterns, y = "test", covariates = c("SampleType")) 64 | #' # res <- FLORAL(x = dat$xcount, y=dat$y, ncov=dat$ncov, family = "binomial", ncv=NULL) 65 | #' 66 | #' @import phyloseq 67 | 68 | phy_to_floral_data<- function(phy, y=NULL, covariates=NULL){ 69 | 70 | xcount = otu_table(phy) 71 | if (nrow(xcount) != nsamples(phy)){ 72 | # support both phyloseq objects with taxa as rows or columns 73 | xcount = t(xcount) 74 | } 75 | if(any(rownames(xcount) != sample_names(phy))){ 76 | stop("malformed phyloseq 
object; columns of otu_table do not match sample IDs") 77 | } 78 | ncov = 0 79 | sampdat = sample_data(phy) %>% data.frame() 80 | yres = sampdat %>% pull(all_of(y)) 81 | if (!missing(covariates)){ 82 | cov_df <- sampdat %>% select(all_of(covariates)) 83 | cov_df_clean <- clean_covariate_columns(df=cov_df, cols = covariates) 84 | ncov = ncol(cov_df_clean) 85 | # as.matrix here speeds things significantly 86 | xcount = cbind(as.matrix(cov_df_clean), xcount ) 87 | } 88 | return(list("xcount" = xcount, "ncov"=ncov, "y"=yres)) 89 | 90 | } 91 | 92 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-for-survival-models-with-longitudinal-microbiome-data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL for survival models with longitudinal microbiome data" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL for survival models with longitudinal microbiome data} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | out.width = "100%" 21 | ) 22 | ``` 23 | 24 | ```{r setup, warning=FALSE, message=FALSE} 25 | library(FLORAL) 26 | library(dplyr) 27 | library(patchwork) 28 | library(survival) 29 | set.seed(8192024) 30 | ``` 31 | 32 | In this vignette, we illustrate how to apply `FLORAL` to fit a Cox model with longitudinal microbiome data. Due to limited availability of public data sets with survival information, we use simulated data for illustrative purposes. 33 | 34 | ## Data simulation 35 | We will use the built-in simulation function `simu()` to generate longitudinal compositional features and the corresponding time-to-event. The underlying methodology used for the simulation is based on a piece-wise exponential distribution as described by [Hendry 2014](https://doi.org/10.1002/sim.5945). 36 | 37 | By default, the first 10 features out of the 500 features simulated below are associated with the time-to-event. 38 | 39 | ```{r simulation} 40 | 41 | simdat <- simu(n=200, # sample size 42 | p=500, # number of features 43 | model="timedep", 44 | pct.sparsity = 0.8, # proportion of zeros 45 | rho=0, # feature-wise correlation 46 | longitudinal_stability = TRUE # choose to simulate longitudinal features with stable trajectories 47 | ) 48 | 49 | ``` 50 | 51 | With the simulated data, the log-ratio lasso Cox model with time-dependent features can be fitted by calling `FLORAL()` as shown below. Here we provide a detailed description of each argument: 52 | 53 | * First of all, please set `longitudinal = TRUE` so that the algorithm uses the appropriate method to handle longitudinal data. 54 | * The feature matrix input `x` should be the count matrix where rows specify samples and columns specify features. 55 | * The vector of IDs of subjects/patients corresponding to the rows of `x` should be input as `id`. 56 | * The vector of sample collection times corresponding to the rows of `x` should be input as `tobs`. 57 | * The `Surv` object (`Surv(time,status)`) of **unique patients** should be input as `y`. Please note that the survival data should be sorted with respect to the IDs specified in `id`.
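Before fitting the model below, here is a minimal, hypothetical sketch of how these inputs could be assembled from a patient-level table and a sample-level table. The object and column names used here (`meta`, `samples`, `PatientID`, `SampleTime`, `time`, `status`, `TaxonA`, `TaxonB`) are placeholders for illustration only and are not part of the simulated data used in this vignette.

```{r input-prep-sketch, eval=FALSE}
# Hypothetical sketch (not run): `meta` has one row per patient with the
# survival time and status; `samples` has one row per collected sample with
# the patient ID, collection time, and taxa counts. All names are placeholders.
meta <- data.frame(PatientID = c("P1", "P2"),
                   time = c(120, 340),
                   status = c(1, 0))
samples <- data.frame(PatientID = c("P1", "P1", "P2"),
                      SampleTime = c(0, 30, 0),
                      TaxonA = c(10, 0, 25),
                      TaxonB = c(3, 8, 0))

# Sort patients by ID, and samples by ID and then collection time,
# so that the rows of `x` align with `id`, `tobs`, and the order of `y`.
meta <- meta[order(meta$PatientID), ]
samples <- samples[order(samples$PatientID, samples$SampleTime), ]

y <- survival::Surv(meta$time, meta$status)       # one entry per unique patient
id <- samples$PatientID                           # subject ID for each row of x
tobs <- samples$SampleTime                        # collection time for each row of x
x <- as.matrix(samples[, c("TaxonA", "TaxonB")])  # longitudinal count matrix

# fit <- FLORAL(x = x, y = y, family = "cox", longitudinal = TRUE,
#               id = id, tobs = tobs)
```

The simulated objects used below play the same roles: `simdat$xcount` is `x`, `simdat$data$id` and `simdat$data$t0` supply `id` and `tobs`, and `simdat$data_unique` provides one survival record per patient for `y`.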
58 | 59 | ```{r FLORAL, warning=FALSE, message=FALSE} 60 | 61 | fit <- FLORAL(x=simdat$xcount, 62 | y=Surv(simdat$data_unique$t,simdat$data_unique$d), 63 | family="cox", 64 | longitudinal = TRUE, 65 | id = simdat$data$id, 66 | tobs = simdat$data$t0, 67 | progress=FALSE, 68 | plot=TRUE) 69 | 70 | fit$selected 71 | 72 | ``` 73 | 74 | The list of selected features is saved in `fit$selected` as shown above. 75 | 76 | To appropriately prepare the data in practice, we have the following recommendations: 77 | 78 | * Start with patient metadata which includes survival data (time and status), sorting the metadata by patient IDs. Extract time and status variables for the `Surv` object for input as `y`. 79 | * Curate the microbiome feature data matrix, sorted by patient IDs and time of sample collection. Save the patient ID and time of sample collection vectors for `id` and `tobs`. Save the feature table for input as `x`. -------------------------------------------------------------------------------- /man/simu.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simu.R 3 | \name{simu} 4 | \alias{simu} 5 | \title{Simulate data following log-ratio model} 6 | \usage{ 7 | simu( 8 | n = 100, 9 | p = 200, 10 | model = "linear", 11 | weak = 4, 12 | strong = 6, 13 | weaksize = 0.125, 14 | strongsize = 0.25, 15 | pct.sparsity = 0.5, 16 | rho = 0, 17 | timedep_slope = NULL, 18 | timedep_cor = NULL, 19 | geetype = "gaussian", 20 | m = 4, 21 | corstr = "exchangeable", 22 | sdvec = NULL, 23 | rhogee = 0.8, 24 | geeslope = 2.5, 25 | longitudinal_stability = TRUE, 26 | ncov = 0, 27 | betacov = 0, 28 | intercept = FALSE 29 | ) 30 | } 31 | \arguments{ 32 | \item{n}{An integer of sample size} 33 | 34 | \item{p}{An integer of number of features (taxa).} 35 | 36 | \item{model}{Type of models associated with outcome variable, can be "linear", "binomial", "cox", "finegray", "gee" (scalar outcome with time-dependent features), or "timedep" (survival endpoint with time-dependent features).} 37 | 38 | \item{weak}{Number of features with \code{weak} effect size.} 39 | 40 | \item{strong}{Number of features with \code{strong} effect size.} 41 | 42 | \item{weaksize}{Actual effect size for \code{weak} effect size. Must be positive.} 43 | 44 | \item{strongsize}{Actual effect size for \code{strong} effect size. Must be positive.} 45 | 46 | \item{pct.sparsity}{Percentage of zero counts for each sample.} 47 | 48 | \item{rho}{Parameter controlling the correlated structure between taxa. Ranges between 0 and 1.} 49 | 50 | \item{timedep_slope}{If \code{model} is "timedep", this parameter specifies the slope for the feature trajectories. Please refer to the Simulation section of the manuscript for more details.} 51 | 52 | \item{timedep_cor}{If \code{model} is "timedep", this parameter specifies the sample-wise correlations between longitudinal features. Please refer to the Simulation section of the manuscript for more details.} 53 | 54 | \item{geetype}{If \code{model} is "gee", \code{geetype} is the type of GEE outcomes. Now support "gaussian" and "binomial".} 55 | 56 | \item{m}{If \code{model} is "gee", \code{m} is the number of repeated measurements per subject.} 57 | 58 | \item{corstr}{If \code{model} is "gee", \code{corstr} is the working correlation structure. 
Now support "independence", "exchangeable", and "AR-1".} 59 | 60 | \item{sdvec}{If \code{model} is "gee" and \code{geetype} is "gaussian", \code{sdvec} is the vector of standard deviations of each outcome variable.} 61 | 62 | \item{rhogee}{If \code{model} is "gee", \code{rhogee} is the correlation parameter between longitudinal outcomes under the selected working correlation structure.} 63 | 64 | \item{geeslope}{If \code{model} is "gee", \code{geeslope} is the linear time effect.} 65 | 66 | \item{longitudinal_stability}{If \code{model} is "timedep", this is a binary indicator which determines whether the trajectories are more stable (\code{TRUE}) or more volatile (\code{FALSE}).} 67 | 68 | \item{ncov}{Number of covariates that are not compositional features.} 69 | 70 | \item{betacov}{Coefficients corresponding to the covariates that are not compositional features.} 71 | 72 | \item{intercept}{Boolean. If TRUE, then a random intercept will be generated in the model. Only works for \code{linear} or \code{binomial} models.} 73 | } 74 | \value{ 75 | A list with simulated count matrix \code{xcount}, log1p-transformed count matrix \code{x}, outcome (continuous \code{y}, continuous centered \code{y0}, binary \code{y}, or survival \code{t}, \code{d}), true coefficient vector \code{beta}, list of non-zero features \code{idx}, value of intercept \code{intercept} (if applicable). 76 | } 77 | \description{ 78 | Simulate a dataset from log-ratio model. 79 | } 80 | \examples{ 81 | 82 | set.seed(23420) 83 | dat <- simu(n=50,p=30,model="linear") 84 | 85 | } 86 | \references{ 87 | Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 88 | } 89 | \author{ 90 | Teng Fei. Email: feit1@mskcc.org 91 | } 92 | -------------------------------------------------------------------------------- /man/a.FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{a.FLORAL} 4 | \alias{a.FLORAL} 5 | \title{Comparing prediction performances under different choices of weights for lasso/ridge penalty} 6 | \usage{ 7 | a.FLORAL( 8 | a = c(0.1, 0.5, 1), 9 | ncore = 1, 10 | seed = NULL, 11 | x, 12 | y, 13 | ncov = 0, 14 | family = "gaussian", 15 | longitudinal = FALSE, 16 | id = NULL, 17 | tobs = NULL, 18 | failcode = NULL, 19 | corstr = "exchangeable", 20 | scalefix = FALSE, 21 | scalevalue = 1, 22 | pseudo = 1, 23 | length.lambda = 100, 24 | lambda.min.ratio = NULL, 25 | ncov.lambda.weight = 0, 26 | mu = 1, 27 | pfilter = 0, 28 | maxiter = 100, 29 | ncv = 5, 30 | intercept = FALSE, 31 | step2 = FALSE, 32 | progress = TRUE 33 | ) 34 | } 35 | \arguments{ 36 | \item{a}{vector of scalars between 0 and 1 for comparison.} 37 | 38 | \item{ncore}{Number of cores used for parallel computation. Default is to use only 1 core.} 39 | 40 | \item{seed}{A random seed for reproducibility of the results. By default the seed is the numeric form of \code{Sys.Date()}.} 41 | 42 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 43 | 44 | \item{y}{Outcome. 
For a continuous or binary outcome, \code{y} is a vector. For survival outcome, \code{y} is a \code{Surv} object.} 45 | 46 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 47 | 48 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 49 | 50 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 51 | 52 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 53 | 54 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 55 | 56 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 57 | 58 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 59 | 60 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 61 | 62 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 63 | 64 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation} 65 | 66 | \item{length.lambda}{Number of penalty parameters used in the path} 67 | 68 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 69 | 70 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 71 | 72 | \item{mu}{Value of penalty for the augmented Lagrangian} 73 | 74 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 75 | 76 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 77 | 78 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 79 | 80 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 81 | 82 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. Will only be performed if cross validation is enabled.} 83 | 84 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 85 | } 86 | \value{ 87 | A \code{ggplot2} object of cross-validated prediction metric versus \code{lambda}, stratified by \code{a}. Detailed data can be retrieved from the \code{ggplot2} object itself. 
88 | } 89 | \description{ 90 | Summarizing \code{FLORAL} outputs from various choices of \code{a} 91 | } 92 | \examples{ 93 | 94 | set.seed(23420) 95 | 96 | dat <- simu(n=50,p=30,model="linear") 97 | pmetric <- a.FLORAL(a=c(0.1,1),ncore=1,x=dat$xcount,y=dat$y,family="gaussian",ncv=2,progress=FALSE) 98 | 99 | } 100 | \references{ 101 | Fei T, Funnell T, Waters N, Raj SS et al. Scalable Log-ratio Lasso Regression Enhances Microbiome Feature Selection for Predictive Models. bioRxiv 2023.05.02.538599. 102 | } 103 | \author{ 104 | Teng Fei. Email: feit1@mskcc.org 105 | } 106 | -------------------------------------------------------------------------------- /man/mcv.FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{mcv.FLORAL} 4 | \alias{mcv.FLORAL} 5 | \title{Summarizing selected compositional features over multiple cross validations} 6 | \usage{ 7 | mcv.FLORAL( 8 | mcv = 10, 9 | ncore = 1, 10 | seed = NULL, 11 | x, 12 | y, 13 | ncov = 0, 14 | family = "gaussian", 15 | longitudinal = FALSE, 16 | id = NULL, 17 | tobs = NULL, 18 | failcode = NULL, 19 | corstr = "exchangeable", 20 | scalefix = FALSE, 21 | scalevalue = 1, 22 | pseudo = 1, 23 | length.lambda = 100, 24 | lambda.min.ratio = NULL, 25 | ncov.lambda.weight = 0, 26 | a = 1, 27 | mu = 1, 28 | pfilter = 0, 29 | maxiter = 100, 30 | ncv = 5, 31 | intercept = FALSE, 32 | step2 = TRUE, 33 | progress = TRUE, 34 | plot = TRUE 35 | ) 36 | } 37 | \arguments{ 38 | \item{mcv}{Number of random `ncv`-fold cross-validation to be performed.} 39 | 40 | \item{ncore}{Number of cores used for parallel computation. Default is to use only 1 core.} 41 | 42 | \item{seed}{A random seed for reproducibility of the results. By default the seed is the numeric form of \code{Sys.Date()}.} 43 | 44 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 45 | 46 | \item{y}{Outcome. For a continuous or binary outcome, \code{y} is a vector. For survival outcome, \code{y} is a \code{Surv} object.} 47 | 48 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 49 | 50 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 51 | 52 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 53 | 54 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 55 | 56 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 57 | 58 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 59 | 60 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. 
Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 61 | 62 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 63 | 64 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 65 | 66 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation} 67 | 68 | \item{length.lambda}{Number of penalty parameters used in the path} 69 | 70 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 71 | 72 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 73 | 74 | \item{a}{A scalar between 0 and 1: \code{a} is the weight for lasso penalty while \code{1-a} is the weight for ridge penalty.} 75 | 76 | \item{mu}{Value of penalty for the augmented Lagrangian} 77 | 78 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 79 | 80 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 81 | 82 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 83 | 84 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 85 | 86 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. Will only be performed if cross validation is enabled.} 87 | 88 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 89 | 90 | \item{plot}{\code{TRUE} or \code{FALSE}, indicating whether returning summary plots of selection probability for taxa features.} 91 | } 92 | \value{ 93 | A list with relative frequencies of a certain feature being selected over \code{mcv} \code{ncv}-fold cross-validations. 94 | } 95 | \description{ 96 | Summarizing \code{FLORAL} outputs from multiple random k-fold cross validations 97 | } 98 | \examples{ 99 | 100 | set.seed(23420) 101 | 102 | dat <- simu(n=50,p=30,model="linear") 103 | fit <- mcv.FLORAL(mcv=2,ncore=1,x=dat$xcount,y=dat$y,ncv=2,progress=FALSE,step2=TRUE,plot=FALSE) 104 | 105 | } 106 | \references{ 107 | Fei T, Funnell T, Waters N, Raj SS et al. Scalable Log-ratio Lasso Regression Enhances Microbiome Feature Selection for Predictive Models. bioRxiv 2023.05.02.538599. 108 | } 109 | \author{ 110 | Teng Fei. 
Email: feit1@mskcc.org 111 | } 112 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | test_that("encoding covariates works", { 2 | dat = data.frame( 3 | colA=1:5, 4 | colB=c("a", "b", "a", "c", "b"), 5 | colC=factor(c("a", "b", "a", "c", "b"))) 6 | res = data.frame( 7 | colA=1:5, 8 | colB_a=c(1, 0, 1, 0, 0), 9 | colB_b=c(0, 1, 0, 0, 1), 10 | colB_c=c(0, 0, 0, 1, 0), 11 | colC_a=c(1, 0, 1, 0, 0), 12 | colC_b=c(0, 1, 0, 0, 1), 13 | colC_c=c(0, 0, 0, 1, 0) 14 | ) 15 | expect_equal(res, FLORAL:::clean_covariate_columns(dat, cols = colnames(dat), drop_first = FALSE)) 16 | } 17 | ) 18 | 19 | 20 | test_that("FLORAL() works from phy", { 21 | require(phyloseq) 22 | testphy <- new( 23 | "phyloseq", 24 | otu_table = new( 25 | "otu_table", 26 | .Data = structure( 27 | c(14L, 28 | 150L, 260L, 434L, 79L, 287L, 361L, 8L, 42L, 501L, 89L, 8L, 29L, 29 | 2629L, 154L, 200L, 865L, 694L, 217L, 144L, 0L, 0L, 0L, 0L, 0L, 30 | 10L, 0L, 0L, 521L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 46L, 0L, 0L, 31 | 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32 | 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 436L, 0L, 0L, 0L, 0L, 33 | 0L, 291L, 4L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 34 | 0L, 0L, 0L, 0L, 0L, 0L, 30L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 35 | 0L, 0L, 37L, 65L, 59L, 0L, 106L, 388L, 61L, 26L, 0L, 0L, 0L, 36 | 0L, 0L, 24L, 0L, 0L, 0L), dim = c(20L, 6L), 37 | dimnames = list(c("ASV_1216", 38 | "ASV_12580", "ASV_12691", "ASV_135", "ASV_147", "ASV_15", "ASV_16", 39 | "ASV_184", "ASV_19", "ASV_20", "ASV_21", "ASV_2202", "ASV_239", 40 | "ASV_25", "ASV_253", "ASV_260", "ASV_29", "ASV_3005", "ASV_302", 41 | "ASV_3048"), c("1000A", "1000B", "1000C", "1000D", "1000E", "1001" 42 | ))), taxa_are_rows = TRUE), 43 | tax_table = new("taxonomyTable", 44 | .Data = structure( 45 | c("Bacteria", "Bacteria", "Bacteria", "Bacteria", 46 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 47 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 48 | "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", 49 | "Bacteria", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 50 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 51 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 52 | "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", "Firmicutes", 53 | "Firmicutes", "Clostridia", "Clostridia", "Clostridia", "Clostridia", 54 | "Clostridia", "Clostridia", "Bacilli", "Clostridia", "Erysipelotrichia", 55 | "Clostridia", "Clostridia", "Clostridia", "Bacilli", "Clostridia", 56 | "Erysipelotrichia", "Clostridia", "Clostridia", "Erysipelotrichia", 57 | "Clostridia", "Clostridia", "Clostridiales", "Clostridiales", 58 | "Clostridiales", "Clostridiales", "Clostridiales", "Clostridiales", 59 | "Lactobacillales", "Clostridiales", "Erysipelotrichales", 60 | "Clostridiales", "Clostridiales", "Clostridiales", "Lactobacillales", 61 | "Clostridiales", "Erysipelotrichales", "Clostridiales", "Clostridiales", 62 | "Erysipelotrichales", "Clostridiales", "Clostridiales", "Lachnospiraceae", 63 | "Lachnospiraceae", "Lachnospiraceae", "Lachnospiraceae", 64 | "Lachnospiraceae", "Lachnospiraceae", "Streptococcaceae", 65 | "Lachnospiraceae", "Erysipelotrichaceae", "Lachnospiraceae", 66 | "Lachnospiraceae", "Ruminococcaceae", "Streptococcaceae", 67 | "Ruminococcaceae", "Erysipelotrichaceae", "Lachnospiraceae", 68 | "Lachnospiraceae", 
"Erysipelotrichaceae", "Lachnospiraceae", 69 | "Lachnospiraceae", "", "Blautia", "", 70 | "Blautia", "Anaerostipes", "Sellimonas", "Streptococcus", 71 | "Eisenbergiella", "[Clostridium] innocuum group", "", 72 | "Lachnoclostridium", "", "Lactococcus", "Subdoligranulum", 73 | "Candidatus Stoquefichus", "Blautia", "[Ruminococcus] gnavus group", 74 | "Faecalitalea", "Blautia", ""), dim = c(20L, 6L), 75 | dimnames = list( 76 | c("ASV_1216", "ASV_12580", "ASV_12691", 77 | "ASV_135", "ASV_147", "ASV_15", "ASV_16", "ASV_184", "ASV_19", 78 | "ASV_20", "ASV_21", "ASV_2202", "ASV_239", "ASV_25", "ASV_253", 79 | "ASV_260", "ASV_29", "ASV_3005", "ASV_302", "ASV_3048"), 80 | c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus" 81 | )))), 82 | sam_data = new( 83 | "sample_data", 84 | .Data = list( 85 | c("1000", 86 | "1000", "1000", "1000", "1000", "pt_with_samples_1001_1002_1003_1004_1005_1006_1007_1008_1048_1121_152" 87 | ), 88 | c(0L, 5L, 15L, 18L, 22L, 15L), 89 | c("formed", "liquid", "liquid", "semi-formed", "formed", "formed"), 90 | c("SRR11414397", "SRR11414992", "SRR11414991", "SRR11414990", "SRR11414989", "SRR11414988"), 91 | c("PRJNA545312", "PRJNA545312", "PRJNA545312", "PRJNA545312", 92 | "PRJNA545312", "PRJNA545312"), c(-9, -4, 6, 9, 13, -6), 93 | c("", "", "", "", "", "")), 94 | names = c("PatientID", "Timepoint", "Consistency", "Accession", "BioProject", "DayRelativeToNearestHCT", 95 | "AccessionShotgun"), 96 | row.names = c("1000A", "1000B", "1000C", "1000D", "1000E", "1001"), .S3Class = "data.frame"), phy_tree = NULL, 97 | refseq = NULL) 98 | 99 | 100 | dat <- phy_to_floral_data( 101 | testphy, covariates=c("Consistency"), y = "DayRelativeToNearestHCT") 102 | 103 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",progress=FALSE,step2=TRUE, ncv = NULL) 104 | expect_true(!is.null(fit)) 105 | 106 | 107 | }) 108 | -------------------------------------------------------------------------------- /man/FLORAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/FLORAL.R 3 | \name{FLORAL} 4 | \alias{FLORAL} 5 | \title{Fit Log-ratio lasso regression for compositional covariates} 6 | \usage{ 7 | FLORAL( 8 | x, 9 | y, 10 | ncov = 0, 11 | family = "gaussian", 12 | longitudinal = FALSE, 13 | id = NULL, 14 | tobs = NULL, 15 | failcode = NULL, 16 | corstr = "exchangeable", 17 | scalefix = FALSE, 18 | scalevalue = 1, 19 | pseudo = 1, 20 | length.lambda = 100, 21 | lambda.min.ratio = NULL, 22 | ncov.lambda.weight = 0, 23 | a = 1, 24 | mu = 1, 25 | pfilter = 0, 26 | maxiter = 100, 27 | ncv = 5, 28 | ncore = 1, 29 | intercept = FALSE, 30 | foldid = NULL, 31 | step2 = TRUE, 32 | progress = TRUE, 33 | plot = TRUE 34 | ) 35 | } 36 | \arguments{ 37 | \item{x}{Feature matrix, where rows specify subjects and columns specify features. The first \code{ncov} columns should be patient characteristics and the rest columns are microbiome absolute counts corresponding to various taxa. If \code{x} contains longitudinal data, the rows must be sorted in the same order of the subject IDs used in \code{y}.} 38 | 39 | \item{y}{Outcome. For a continuous or binary outcome, \code{y} is a vector. 
For survival outcome, \code{y} is a \code{Surv} object.} 40 | 41 | \item{ncov}{An integer indicating the number of first \code{ncov} columns in \code{x} that will not be subject to the zero-sum constraint.} 42 | 43 | \item{family}{Available options are \code{gaussian}, \code{binomial}, \code{cox}, \code{finegray}.} 44 | 45 | \item{longitudinal}{\code{TRUE} or \code{FALSE}, indicating whether longitudinal data matrix is specified for input \code{x}. (\code{Longitudinal=TRUE} and \code{family="cox"} or \code{"finegray"} will fit a time-dependent covariate model. \code{Longitudinal=TRUE} and \code{family="gaussian"} or \code{"binomial"} will fit a GEE model.)} 46 | 47 | \item{id}{If \code{longitudinal} is \code{TRUE}, \code{id} specifies subject IDs corresponding to the rows of input \code{x}.} 48 | 49 | \item{tobs}{If \code{longitudinal} is \code{TRUE}, \code{tobs} specifies time points corresponding to the rows of input \code{x}.} 50 | 51 | \item{failcode}{If \code{family = finegray}, \code{failcode} specifies the failure type of interest. This must be a positive integer.} 52 | 53 | \item{corstr}{If a GEE model is specified, then \code{corstr} is the corresponding working correlation structure. Options are \code{independence}, \code{exchangeable}, \code{AR-1} and \code{unstructured}.} 54 | 55 | \item{scalefix}{\code{TRUE} or \code{FALSE}, indicating whether the scale parameter is estimated or fixed if a GEE model is specified.} 56 | 57 | \item{scalevalue}{Specify the scale parameter if \code{scalefix=TRUE}.} 58 | 59 | \item{pseudo}{Pseudo count to be added to \code{x} before taking log-transformation. If unspecified, then the log-transformation will not be performed.} 60 | 61 | \item{length.lambda}{Number of penalty parameters used in the path} 62 | 63 | \item{lambda.min.ratio}{Ratio between the minimum and maximum choice of lambda. Default is \code{NULL}, where the ratio is chosen as 1e-2.} 64 | 65 | \item{ncov.lambda.weight}{Weight of the penalty lambda applied to the first \code{ncov} covariates. Default is 0 such that the first \code{ncov} covariates are not penalized.} 66 | 67 | \item{a}{A scalar between 0 and 1: \code{a} is the weight for lasso penalty while \code{1-a} is the weight for ridge penalty.} 68 | 69 | \item{mu}{Value of penalty for the augmented Lagrangian} 70 | 71 | \item{pfilter}{A pre-specified threshold to force coefficients with absolute values less than pfilter times the maximum value of absolute coefficient as zeros in the GEE model. Default is zero, such that all coefficients will be reported.} 72 | 73 | \item{maxiter}{Number of iterations needed for the outer loop of the augmented Lagrangian algorithm.} 74 | 75 | \item{ncv}{Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.} 76 | 77 | \item{ncore}{Number of cores for parallel computing for cross-validation. Default is 1.} 78 | 79 | \item{intercept}{\code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.} 80 | 81 | \item{foldid}{A vector of fold indicator. Default is \code{NULL}.} 82 | 83 | \item{step2}{\code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. 
Will only be performed if cross validation is enabled.} 84 | 85 | \item{progress}{\code{TRUE} or \code{FALSE}, indicating whether printing progress bar as the algorithm runs.} 86 | 87 | \item{plot}{\code{TRUE} or \code{FALSE}, indicating whether returning plots of model fitting.} 88 | } 89 | \value{ 90 | A list with path-specific estimates (beta), path (lambda), and others. Details can be found in \code{README.md}. 91 | } 92 | \description{ 93 | Conduct log-ratio lasso regression for continuous, binary and survival outcomes. 94 | } 95 | \examples{ 96 | 97 | set.seed(23420) 98 | 99 | # Continuous outcome 100 | dat <- simu(n=50,p=30,model="linear") 101 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=2,progress=FALSE,step2=TRUE) 102 | 103 | # Binary outcome 104 | # dat <- simu(n=50,p=30,model="binomial") 105 | # fit <- FLORAL(dat$xcount,dat$y,family="binomial",progress=FALSE,step2=TRUE) 106 | 107 | # Survival outcome 108 | # dat <- simu(n=50,p=30,model="cox") 109 | # fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d),family="cox",progress=FALSE,step2=TRUE) 110 | 111 | # Competing risks outcome 112 | # dat <- simu(n=50,p=30,model="finegray") 113 | # fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d,type="mstate"),failcode=1, 114 | # family="finegray",progress=FALSE,step2=FALSE) 115 | 116 | # Longitudinal continuous outcome 117 | # dat <- simu(n=50,p=30,model="gee",geetype="gaussian",m=3,corstr="exchangeable",sdvec=rep(1,3)) 118 | # fit <- FLORAL(x=cbind(dat$tvec, dat$xcount),y=dat$y,id=dat$id,family="gaussian", 119 | # ncov=1,longitudinal = TRUE,corstr = "exchangeable",lambda.min.ratio=1e-3, 120 | # progress=FALSE,step2=FALSE) 121 | 122 | } 123 | \references{ 124 | Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 125 | } 126 | \author{ 127 | Teng Fei. Email: feit1@mskcc.org 128 | } 129 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # FLORAL: Fit LOg-RAtio Lasso regression for compositional covariates 17 | 18 | 19 | [![R-CMD-check](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml) 20 | [![CRAN status](https://www.r-pkg.org/badges/version/FLORAL)](https://cran.r-project.org/package=FLORAL) 21 | [![](https://cranlogs.r-pkg.org/badges/FLORAL)](https://cran.r-project.org/package=FLORAL) 22 | [![](https://cranlogs.r-pkg.org/badges/grand-total/FLORAL)](https://cran.r-project.org/package=FLORAL) 23 | 24 | 25 | The `FLORAL` package is an open-source computational tool to perform log-ratio lasso regression modeling and compositional feature selection for continuous, binary, time-to-event, and competing risk outcomes. The proposed method adapts the augmented Lagrangian algorithm for a zero-sum constraint optimization problem while enabling a two-stage screening process for extended false-positive control. 26 | 27 | The associated article for `FLORAL` is available on [Cell Reports Methods](https://doi.org/10.1016/j.crmeth.2024.100899). 
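(Method sketch) At a high level, and using our own notation rather than anything exported by the package, the Gaussian-family model can be thought of as an elastic-net log-contrast regression: with log-transformed counts $z_{ij} = \log(x_{ij} + \text{pseudo count})$, it approximately solves $$\hat{\beta} = \arg\min_{\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\Big(y_i-\sum_{j=1}^{p} z_{ij}\beta_j\Big)^2 + \lambda\Big(a\lVert\beta\rVert_1+\tfrac{1-a}{2}\lVert\beta\rVert_2^2\Big)\quad\text{subject to}\quad\sum_{j=1}^{p}\beta_j=0,$$ where the zero-sum constraint (handled by the augmented Lagrangian algorithm) makes the fit depend only on log-ratios between features. This display is an informal sketch for intuition; see the function documentation and the article above for the exact formulation, including unpenalized covariates and the other outcome families.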
28 | 29 | ## System requirements and installation 30 | 31 | The current version of `FLORAL` (0.5.0) was built in R version 4.4.2. R package dependencies can be found in the `DESCRIPTION` file. 32 | 33 | You can install `FLORAL` with the following code. The installation is typically complete within minutes. 34 | 35 | ``` r 36 | install.packages("FLORAL") 37 | ``` 38 | 39 | You can also install `FLORAL` using conda: 40 | 41 | ``` bash 42 | conda install -c bioconda r-floral 43 | ``` 44 | 45 | You can install the development version of `FLORAL` from [GitHub](https://github.com/) with: 46 | 47 | ``` r 48 | # install.packages("devtools") 49 | devtools::install_github("vdblab/FLORAL") 50 | ``` 51 | 52 | ## Example 53 | 54 | Here is a toy example for linear regression with 10-fold cross-validation for a simulated dataset with 50 samples and 100 compositional features. Option `progress=TRUE` can be used to show the progress bar of the running algorithm. 55 | 56 | The data simulation procedure is described in the preprint. The expected run time for the following demo is about a minute. 57 | 58 | ```{r example} 59 | set.seed(23420) 60 | library(FLORAL) 61 | 62 | dat <- simu(n=50,p=100,model="linear") 63 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=10,progress=FALSE) 64 | ``` 65 | 66 | To view plots of cross-validated prediction error and parameter coefficients, use `fit$pmse` or `fit$pcoef`: 67 | 68 | ```{r plot, echo = FALSE,out.width = "50%",fig.height=4,fig.width=6,dpi=300} 69 | fit$pmse 70 | fit$pcoef 71 | ``` 72 | 73 | To view selected compositional features, use `fit$selected.feature`, where features are sorted by their names. Features under `min` and `1se` correspond to penalty parameters $\lambda_{\min}$ and $\lambda_{\text{1se}}$, respectively. Features under `min.2stage` and `1se.2stage` are obtained after applying 2-stage filtering based on features under `min` and `1se`, respectively. 74 | 75 | We recommend interpreting the selected compositional features as potential predictive markers of the outcome in the regression model, in the sense that the cross-validated prediction error is improved by considering these selected features. 76 | 77 | ```{r feature} 78 | fit$selected.feature 79 | ``` 80 | 81 | To get specific log-ratios selected by the 2-stage procedure, use `fit$step2.ratios`, where `min` and `1se` display the log-ratios between features. For each identified ratio, `min.idx` and `1se.idx` return the column indices in the original input matrix for the two corresponding features forming the ratio. 82 | 83 | ```{r ratio} 84 | fit$step2.ratios 85 | ``` 86 | 87 | More detailed interpretations can be obtained for the selected log-ratios. First, the selected log-ratios also improve the cross-validated prediction errors because these log-ratios are derived from the constrained lasso estimate. Moreover, as guided by the association table between log-ratios and the outcome, it is possible to interpret the directions of the covariate effects associated with certain log-ratios on the outcome. To view detailed associations between selected log-ratios and the outcome, use `fit$step2.tables` to print summary tables for the multivariable stepwise regression models obtained by the 2-stage procedure. 88 | 89 | ```{r gtsummary1, eval=FALSE} 90 | fit$step2.tables$min 91 | ``` 92 | 93 | ```{r gtsummary2, eval=FALSE} 94 | fit$step2.tables$`1se` 95 | ``` 96 | 97 | For binary and survival outcomes, please specify `family="binomial"`, `family="cox"`, or `family="finegray"` accordingly. 
98 | 99 | ```{r example2, warning=FALSE} 100 | dat.bin <- simu(n=50,p=100,model="binomial") 101 | fit.bin <- FLORAL(dat.bin$xcount,dat.bin$y,family="binomial",ncv=10,progress=FALSE) 102 | 103 | dat.cox <- simu(n=50,p=100,model="cox") 104 | fit.cox <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d),family="cox",ncv=10,progress=FALSE) 105 | 106 | dat.fg <- simu(n=50,p=100,model="finegray") 107 | fit.fg <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d,type="mstate"),family="finegray",ncv=10,progress=FALSE,step2=FALSE) 108 | ``` 109 | 110 | ## Repository for Reproducibility 111 | 112 | Reproducible code for the analyses results reported in the manuscript can be found at [this repository](https://github.com/vdblab/FLORAL-analysis). 113 | 114 | ## Contributing 115 | 116 | The `FLORAL` package is jointly managed by [MSKCC Biostatistics service](https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics) and [the Marcel van den Brink Lab](https://vandenbrinklab.org/). Please note that the `FLORAL` project is released with a [Contributor Code of Conduct](https://github.com/vdblab/FLORAL/blob/master/.github/CODE_OF_CONDUCT.md). By contributing to this project, you agree to abide by its terms. Thank you to all contributors! 117 | 118 | ## Reference 119 | 120 | Fei T, Funnell T, Waters NR, Raj SS, Sadeghi K, Dai A, Miltiadous O, Shouval R, Lv M, Peled JU, Ponce DM, Perales M-A, Gönen M, van den Brink MRM, Scalable log-ratio lasso regression for enhanced microbial feature selection with FLORAL, Cell Reports Methods (2024), 100899; doi: . 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # FLORAL: Fit LOg-RAtio Lasso regression for compositional covariates 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/vdblab/FLORAL/actions/workflows/R-CMD-check.yaml) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/FLORAL)](https://cran.r-project.org/package=FLORAL) 11 | [![](https://cranlogs.r-pkg.org/badges/FLORAL)](https://cran.r-project.org/package=FLORAL) 12 | [![](https://cranlogs.r-pkg.org/badges/grand-total/FLORAL)](https://cran.r-project.org/package=FLORAL) 13 | 14 | 15 | The `FLORAL` package is an open-source computational tool to perform 16 | log-ratio lasso regression modeling and compositional feature selection 17 | for continuous, binary, time-to-event, and competing risk outcomes. The 18 | proposed method adapts the augmented Lagrangian algorithm for a zero-sum 19 | constraint optimization problem while enabling a two-stage screening 20 | process for extended false-positive control. 21 | 22 | The associated article for `FLORAL` is available on [Cell Reports 23 | Methods](https://doi.org/10.1016/j.crmeth.2024.100899). 24 | 25 | ## System requirements and installation 26 | 27 | The current version of `FLORAL` (0.5.0) was built in R version 4.4.2. R 28 | package dependencies can be found in the `DESCRIPTION` file. 29 | 30 | You can install `FLORAL` with the following code. The installation is 31 | typically complete within minutes. 
32 | 33 | ``` r 34 | install.packages("FLORAL") 35 | ``` 36 | 37 | You can also install `FLORAL` using conda: 38 | 39 | ``` bash 40 | conda install -c bioconda r-floral 41 | ``` 42 | 43 | You can install the development version of `FLORAL` from 44 | [GitHub](https://github.com/) with: 45 | 46 | ``` r 47 | # install.packages("devtools") 48 | devtools::install_github("vdblab/FLORAL") 49 | ``` 50 | 51 | ## Example 52 | 53 | Here is a toy example for linear regression with 10-fold 54 | cross-validation for a simulated data with 50 samples and 100 55 | compositional features. Option `progress=TRUE` can be used to show the 56 | progress bar of the running algorithm. 57 | 58 | The data simulation procedure is described in the preprint. The expected 59 | run time for the following demo is about a minute. 60 | 61 | ``` r 62 | set.seed(23420) 63 | library(FLORAL) 64 | 65 | dat <- simu(n=50,p=100,model="linear") 66 | fit <- FLORAL(dat$xcount,dat$y,family="gaussian",ncv=10,progress=FALSE) 67 | ``` 68 | 69 | To view plots of cross-validated prediction error and parameter 70 | coefficients, use `fit$pmse` or `fit$pcoef`: 71 | 72 | 73 | 74 | To view selected compositional features, use `fit$selected.feature`, 75 | where features are sorted by their names. Features under `min` and `1se` 76 | correspond to penalty parameter $\lambda_{\min}$ and 77 | $\lambda_{\text{1se}}$, respectively. Features under `min.2stage` and 78 | `1se.2stage` are obtained after applying 2-stage filtering based on 79 | features under `min` and `1se`, respectively. 80 | 81 | We recommend interpreting the selected compositional features as 82 | potential predictive markers to the outcome in the regression model in 83 | the sense that the cross-validated prediction error is improved by 84 | considering these selected features. 85 | 86 | ``` r 87 | fit$selected.feature 88 | #> $min 89 | #> [1] "taxa1" "taxa10" "taxa15" "taxa2" "taxa29" "taxa3" "taxa39" "taxa43" 90 | #> [9] "taxa5" "taxa6" "taxa7" "taxa8" "taxa9" "taxa92" 91 | #> 92 | #> $`1se` 93 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa39" "taxa5" "taxa6" "taxa7" 94 | #> [9] "taxa8" "taxa9" 95 | #> 96 | #> $min.2stage 97 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa43" "taxa5" "taxa6" "taxa7" 98 | #> [9] "taxa8" "taxa9" "taxa92" 99 | #> 100 | #> $`1se.2stage` 101 | #> [1] "taxa1" "taxa10" "taxa2" "taxa3" "taxa5" "taxa6" "taxa7" "taxa8" 102 | #> [9] "taxa9" 103 | ``` 104 | 105 | To get specific log-ratios selected by the 2-stage procedure, use 106 | `fit$step2.log-ratios`, where `min` and `1se` display the log-ratios 107 | between features. For each identified ratio, `min.idx` and `1se.idx` 108 | return the column indices in the original input matrix for the two 109 | corresponding features forming the ratio. 110 | 111 | ``` r 112 | fit$step2.ratios 113 | #> $min 114 | #> [1] "taxa1/taxa10" "taxa2/taxa5" "taxa3/taxa8" "taxa5/taxa8" 115 | #> [5] "taxa6/taxa9" "taxa7/taxa92" "taxa10/taxa43" 116 | #> 117 | #> $`1se` 118 | #> [1] "taxa1/taxa10" "taxa2/taxa5" "taxa3/taxa8" "taxa5/taxa8" "taxa6/taxa9" 119 | #> [6] "taxa7/taxa10" 120 | #> 121 | #> $min.idx 122 | #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] 123 | #> [1,] NA 1 2 3 5 6 7 10 124 | #> [2,] NA 10 5 8 8 9 92 43 125 | #> 126 | #> $`1se.idx` 127 | #> [,1] [,2] [,3] [,4] [,5] [,6] [,7] 128 | #> [1,] NA 1 2 3 5 6 7 129 | #> [2,] NA 10 5 8 8 9 10 130 | ``` 131 | 132 | More detailed interpretations can be obtained for the selected 133 | log-ratios. 
First, the selected log-ratios also improve the 134 | cross-validated prediction errors because these log-ratios are derived 135 | from the constrained lasso estimate. Moreover, as guided by the 136 | association table between log-ratios and the outcome, it is possible to 137 | interpret the directions of the covariate effects associated with 138 | certain log-ratios on the outcome. To view detailed associations between 139 | selected log-ratios and the outcome, use `fit$step2.tables` to print 140 | summary tables for the multivariable stepwise regression models obtained 141 | by the 2-stage procedure. 142 | 143 | ``` r 144 | fit$step2.tables$min 145 | ``` 146 | 147 | ``` r 148 | fit$step2.tables$`1se` 149 | ``` 150 | 151 | For binary and survival outcomes, please specify `family="binomial"`, 152 | `family="cox"`, or `family="finegray"` accordingly. 153 | 154 | ``` r 155 | dat.bin <- simu(n=50,p=100,model="binomial") 156 | fit.bin <- FLORAL(dat.bin$xcount,dat.bin$y,family="binomial",ncv=10,progress=FALSE) 157 | 158 | dat.cox <- simu(n=50,p=100,model="cox") 159 | fit.cox <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d),family="cox",ncv=10,progress=FALSE) 160 | 161 | dat.fg <- simu(n=50,p=100,model="finegray") 162 | fit.fg <- FLORAL(dat.cox$xcount,survival::Surv(dat.cox$t,dat.cox$d,type="mstate"),family="finegray",ncv=10,progress=FALSE,step2=FALSE) 163 | ``` 164 | 165 | ## Repository for Reproducibility 166 | 167 | Reproducible code for the analyses results reported in the manuscript 168 | can be found at [this 169 | repository](https://github.com/vdblab/FLORAL-analysis). 170 | 171 | ## Contributing 172 | 173 | The `FLORAL` package is jointly managed by [MSKCC Biostatistics 174 | service](https://www.mskcc.org/departments/epidemiology-biostatistics/biostatistics) 175 | and the Marcel van den Brink Lab. Please 176 | note that the `FLORAL` project is released with a [Contributor Code of 177 | Conduct](https://github.com/vdblab/FLORAL/blob/master/.github/CODE_OF_CONDUCT.md). 178 | By contributing to this project, you agree to abide by its terms. Thank 179 | you to all contributors! 180 | 181 | ## Reference 182 | 183 | Fei T, Funnell T, Waters NR, Raj SS, Baichoo M, 184 | Sadeghi K, Dai A, Miltiadous O, Shouval R, Lv M, 185 | Peled JU, Ponce DM, Perales MA, Gönen M, van den Brink MRM. 186 | Scalable log-ratio lasso regression for enhanced microbial feature 187 | selection with FLORAL, Cell Reports Methods (2024), 100899; doi: 188 | . Epub 2024 Nov 7. 189 | PMID: 39515336; PMCID: PMC11705925. 190 | -------------------------------------------------------------------------------- /vignettes/Using-FLORAL-for-Microbiome-Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Using FLORAL for Microbiome Analysis" 3 | output: 4 | rmarkdown::html_vignette: 5 | md_extensions: [ 6 | "-autolink_bare_uris" 7 | ] 8 | vignette: > 9 | %\VignetteIndexEntry{Using FLORAL for Microbiome Analysis} 10 | %\VignetteEncoding{UTF-8} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | editor_options: 13 | chunk_output_type: console 14 | --- 15 | 16 | ```{r, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | out.width = "100%" 21 | ) 22 | ``` 23 | 24 | ```{r setup, warning=FALSE, message=FALSE} 25 | library(FLORAL) 26 | library(dplyr) 27 | library(patchwork) 28 | ``` 29 | 30 | ## Data 31 | We will be using data from the `curatedMetagenomicData` package. 
For easier installation, we saved a flat copy of the data, but the steps below show how that copy was created. 32 | 33 | ```r 34 | if (! "BiocManager" %in% installed.packages()) install.packages("BiocManager") 35 | if (! "curatedMetagenomicData" %in% installed.packages()) BiocManager::install("curatedMetagenomicData") 36 | if (! "patchwork" %in% installed.packages()) install.packages("patchwork") 37 | library(curatedMetagenomicData) 38 | 39 | # Take a look at the summary of the studies available: 40 | curatedMetagenomicData::sampleMetadata |> group_by(.data$study_name, .data$study_condition) |> count() |> arrange(.data$study_name) 41 | # As an example, let us look at the `YachidaS_2019` study between healthy controls and colorectal cancer (CRC) patients. 42 | curatedMetagenomicData::curatedMetagenomicData("YachidaS_2019") 43 | # note -- if you are behind a firewall, see the solutions to 500 errors here: 44 | # https://support.bioconductor.org/p/132864/ 45 | rawdata <- curatedMetagenomicData::curatedMetagenomicData("2021-03-31.YachidaS_2019.relative_abundance", dryrun = FALSE, counts = TRUE) |> mergeData() 46 | x <- SummarizedExperiment::assays(rawdata)$relative_abundance %>% t() 47 | y <- rawdata@colData$disease 48 | 49 | save(list = c("x", "y"), file = file.path("inst", "extdata", "YachidaS_2019.Rdata")) 50 | ``` 51 | 52 | ```{r getData} 53 | load(system.file("extdata", "YachidaS_2019.Rdata", package="FLORAL")) 54 | ``` 55 | 56 | ## Running FLORAL 57 | 58 | We extracted the data from the `TreeSummarizedExperiment` object into two objects: the taxa matrix `x` and the "outcomes" vector `y` indicating whether a patient is healthy or has colorectal cancer (CRC). Note that for binary outcomes, the input vector `y` needs to be formatted with entries equal to either `0` or `1`. In addition, we need to specify `family = "binomial"` in `FLORAL` to fit the logistic regression model. To print the progress bar as the algorithm runs, please use `progress = TRUE`. 59 | 60 | ```{r floral} 61 | 62 | x <- x[y %in% c("CRC","healthy"),] 63 | x <- x[,colSums(x >= 100) >= nrow(x)*0.2] # filter low abundance taxa 64 | 65 | colnames(x) <- sapply(colnames(x), function(x) strsplit(x,split="[|]")[[1]][length(strsplit(x,split="[|]")[[1]])]) 66 | 67 | y <- as.numeric(as.factor(y[y %in% c("CRC","healthy")]))-1 68 | fit <- FLORAL(x = x, y = y, family="binomial", ncv=10, progress=TRUE) 69 | ``` 70 | 71 | ## Interpreting the Model 72 | 73 | FLORAL, like other methods that have an optimization step, has two "best" solutions for $\lambda$ available: one minimizing the mean squared error ($\lambda_\min$), and one taking the largest value of $\lambda$ whose error is within 1 standard error of the minimum mean squared error ($\lambda_{\text{1se}}$). These are referred to as the `min` and `1se` solutions, respectively. 74 | 75 | We can see the mean squared error (MSE) and the coefficients vs log($\lambda$) as follows: 76 | 77 | ```{r plots,fig.height=4,fig.width=10,dpi=300} 78 | fit$pmse + fit$pcoef 79 | ``` 80 | 81 | In both plots, the vertical dashed line and dotted line represent $\lambda_\min$ and $\lambda_{\text{1se}}$, respectively. In the MSE plot, the bands represent plus or minus one standard error of the MSE. In the coefficient plot, the colored lines represent individual taxa, where taxa with non-zero values at $\lambda_\min$ and $\lambda_{\text{1se}}$ are selected as predictive of the outcome. 82 | 83 | To view specific names of the selected taxa, please see the `fit$selected.feature$min` or `fit$selected.feature$1se` vectors. 
To view all coefficient estimates, please see `fit$best.beta$min` or `fit$best.beta$1se`. Without looking into ratios, one can crudely interpret a positive or negative association between a taxon and the outcome from the sign of its coefficient estimate. However, we recommend referring to the two-step procedure discussed below for a more rigorous interpretation based on ratios, which is derived from the log-ratio model assumption. 84 | 85 | ```{r viewTaxa} 86 | 87 | head(fit$selected.feature$min) 88 | 89 | head(sort(fit$best.beta$min)) 90 | 91 | ``` 92 | 93 | ## The Two-step Procedure 94 | 95 | In the previous section, we checked the lasso estimates without identifying specific ratios that are predictive of the outcome (CRC in this case). By default, `FLORAL` performs a two-step selection procedure, using `glmnet` and `step` regression to further identify taxa pairs that form predictive log-ratios. To view those pairs, use `fit$step2.ratios$min` or `fit$step2.ratios$1se` for the names of the ratios and `fit$step2.ratios$min.idx` or `fit$step2.ratios$1se.idx` for the pairs of indices in the original input count matrix `x`. Note that one taxon can occur in multiple ratios. 96 | 97 | ```{r view2step} 98 | 99 | head(fit$step2.ratios$`1se`) 100 | 101 | fit$step2.ratios$`1se.idx` 102 | 103 | ``` 104 | 105 | To further interpret the positive or negative associations between the selected log-ratios and the outcome, please refer to the output `step` regression tables, where the effect sizes of the ratios can be found. 106 | 107 | While the corresponding p-values are also available, we recommend only using the p-values as a criterion to rank the strength of the association. We do not recommend directly reporting the p-values for inference, because these p-values were obtained after running the first-step lasso model without rigorous post-selective inference. However, it is still valid to claim that these selected log-ratios are predictive of the outcome, as demonstrated by the improved 10-fold cross-validated prediction errors. 108 | 109 | ```{r viewTable} 110 | 111 | fit$step2.tables$`1se` 112 | 113 | ``` 114 | 115 | ## Generating taxa selection probabilities 116 | 117 | It is encouraged to run k-fold cross-validation several times to account for the random fold splits. `FLORAL` provides the `mcv.FLORAL` function to repeat the cross-validation `mcv` times using `ncore` cores. The output summarizes taxa selection probabilities and average coefficients based on $\lambda_\min$ and $\lambda_{\text{1se}}$. Interpretable plots can be created if `plot = TRUE` is specified. 118 | 119 | ```{r mcv} 120 | 121 | mcv.fit <- mcv.FLORAL(mcv=2, 122 | ncore=1, 123 | x = x, 124 | y = y, 125 | family = "binomial", 126 | ncv = 3, 127 | progress=TRUE) 128 | 129 | ``` 130 | 131 | ```{r mcvplots,fig.height=6,fig.width=10,dpi=300} 132 | 133 | mcv.fit$p_min 134 | 135 | # Other options are also available 136 | #mcv.fit$p_min_ratio 137 | #mcv.fit$p_1se 138 | #mcv.fit$p_1se_ratio 139 | 140 | ``` 141 | 142 | ## Elastic net 143 | 144 | Beyond the lasso model, `FLORAL` also supports elastic net models by specifying the tuning parameter `a` between 0 and 1. The lasso penalty is used when `a=1`, while the ridge penalty is used when `a=0`. 145 | 146 | The `a.FLORAL` function can help investigate the prediction performance for different choices of `a` and returns a plot of the corresponding prediction metric trajectories against the choice of $\lambda$. 
147 | 148 | ```{r a.floral,out.width = '50%',fig.height=4,fig.width=4,dpi=300} 149 | 150 | a.fit <- a.FLORAL(a = c(0.1,1), 151 | ncore = 1, 152 | x = x, 153 | y = y, 154 | family = "binomial", 155 | ncv = 3, 156 | progress=TRUE) 157 | 158 | a.fit 159 | 160 | ``` 161 | 162 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // softthreshold 15 | double softthreshold(double x, double lambda); 16 | RcppExport SEXP _FLORAL_softthreshold(SEXP xSEXP, SEXP lambdaSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< double >::type x(xSEXP); 21 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 22 | rcpp_result_gen = Rcpp::wrap(softthreshold(x, lambda)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // gd_cov 27 | arma::vec gd_cov(arma::mat xx, arma::vec xy, int n, double l, arma::vec beta); 28 | RcppExport SEXP _FLORAL_gd_cov(SEXP xxSEXP, SEXP xySEXP, SEXP nSEXP, SEXP lSEXP, SEXP betaSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< arma::mat >::type xx(xxSEXP); 33 | Rcpp::traits::input_parameter< arma::vec >::type xy(xySEXP); 34 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 35 | Rcpp::traits::input_parameter< double >::type l(lSEXP); 36 | Rcpp::traits::input_parameter< arma::vec >::type beta(betaSEXP); 37 | rcpp_result_gen = Rcpp::wrap(gd_cov(xx, xy, n, l, beta)); 38 | return rcpp_result_gen; 39 | END_RCPP 40 | } 41 | // gd_cov_al 42 | arma::vec gd_cov_al(arma::mat xx, arma::vec xy, int n, double l, double a, arma::vec beta, double mu, double alpha, bool adjust, unsigned int ncov, double wcov); 43 | RcppExport SEXP _FLORAL_gd_cov_al(SEXP xxSEXP, SEXP xySEXP, SEXP nSEXP, SEXP lSEXP, SEXP aSEXP, SEXP betaSEXP, SEXP muSEXP, SEXP alphaSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP wcovSEXP) { 44 | BEGIN_RCPP 45 | Rcpp::RObject rcpp_result_gen; 46 | Rcpp::RNGScope rcpp_rngScope_gen; 47 | Rcpp::traits::input_parameter< arma::mat >::type xx(xxSEXP); 48 | Rcpp::traits::input_parameter< arma::vec >::type xy(xySEXP); 49 | Rcpp::traits::input_parameter< int >::type n(nSEXP); 50 | Rcpp::traits::input_parameter< double >::type l(lSEXP); 51 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 52 | Rcpp::traits::input_parameter< arma::vec >::type beta(betaSEXP); 53 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 54 | Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); 55 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 56 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 57 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 58 | rcpp_result_gen = Rcpp::wrap(gd_cov_al(xx, xy, n, l, a, beta, mu, alpha, adjust, ncov, wcov)); 59 | return rcpp_result_gen; 60 | END_RCPP 61 | } 62 | // linear_enet_al 63 | Rcpp::List linear_enet_al(arma::mat x, arma::vec y, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int 
ncov, bool display_progress); 64 | RcppExport SEXP _FLORAL_linear_enet_al(SEXP xSEXP, SEXP ySEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP display_progressSEXP) { 65 | BEGIN_RCPP 66 | Rcpp::RObject rcpp_result_gen; 67 | Rcpp::RNGScope rcpp_rngScope_gen; 68 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 69 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 70 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 71 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 72 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 73 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 74 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 75 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 76 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 77 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 78 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 79 | rcpp_result_gen = Rcpp::wrap(linear_enet_al(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress)); 80 | return rcpp_result_gen; 81 | END_RCPP 82 | } 83 | // logistic_enet_al 84 | Rcpp::List logistic_enet_al(arma::mat x, arma::vec y, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, bool display_progress, bool loop1, bool loop2); 85 | RcppExport SEXP _FLORAL_logistic_enet_al(SEXP xSEXP, SEXP ySEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP display_progressSEXP, SEXP loop1SEXP, SEXP loop2SEXP) { 86 | BEGIN_RCPP 87 | Rcpp::RObject rcpp_result_gen; 88 | Rcpp::RNGScope rcpp_rngScope_gen; 89 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 90 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 91 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 92 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 93 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 94 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 95 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 96 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 97 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 98 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 99 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 100 | Rcpp::traits::input_parameter< bool >::type loop1(loop1SEXP); 101 | Rcpp::traits::input_parameter< bool >::type loop2(loop2SEXP); 102 | rcpp_result_gen = Rcpp::wrap(logistic_enet_al(x, y, len, mu, ub, lambda, wcov, a, adjust, ncov, display_progress, loop1, loop2)); 103 | return rcpp_result_gen; 104 | END_RCPP 105 | } 106 | // cox_enet_al 107 | Rcpp::List cox_enet_al(arma::mat x, arma::vec t, arma::vec d, arma::vec tj, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress, bool loop1, bool loop2, bool notcv); 108 | RcppExport SEXP _FLORAL_cox_enet_al(SEXP xSEXP, SEXP tSEXP, SEXP dSEXP, SEXP tjSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP, SEXP loop1SEXP, SEXP loop2SEXP, SEXP notcvSEXP) { 109 | BEGIN_RCPP 110 | Rcpp::RObject rcpp_result_gen; 111 | Rcpp::RNGScope 
rcpp_rngScope_gen; 112 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 113 | Rcpp::traits::input_parameter< arma::vec >::type t(tSEXP); 114 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 115 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 116 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 117 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 118 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 119 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 120 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 121 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 122 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 123 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 124 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 125 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 126 | Rcpp::traits::input_parameter< bool >::type loop1(loop1SEXP); 127 | Rcpp::traits::input_parameter< bool >::type loop2(loop2SEXP); 128 | Rcpp::traits::input_parameter< bool >::type notcv(notcvSEXP); 129 | rcpp_result_gen = Rcpp::wrap(cox_enet_al(x, t, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress, loop1, loop2, notcv)); 130 | return rcpp_result_gen; 131 | END_RCPP 132 | } 133 | // cox_timedep_enet_al 134 | Rcpp::List cox_timedep_enet_al(arma::mat x, arma::vec t0, arma::vec t1, arma::vec d, arma::vec tj, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress); 135 | RcppExport SEXP _FLORAL_cox_timedep_enet_al(SEXP xSEXP, SEXP t0SEXP, SEXP t1SEXP, SEXP dSEXP, SEXP tjSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP) { 136 | BEGIN_RCPP 137 | Rcpp::RObject rcpp_result_gen; 138 | Rcpp::RNGScope rcpp_rngScope_gen; 139 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 140 | Rcpp::traits::input_parameter< arma::vec >::type t0(t0SEXP); 141 | Rcpp::traits::input_parameter< arma::vec >::type t1(t1SEXP); 142 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 143 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 144 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 145 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 146 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 147 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 148 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 149 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 150 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 151 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 152 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 153 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 154 | rcpp_result_gen = Rcpp::wrap(cox_timedep_enet_al(x, t0, t1, d, tj, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress)); 155 | return rcpp_result_gen; 156 | END_RCPP 157 | } 158 | // fg_enet_al 159 | Rcpp::List fg_enet_al(arma::mat x, arma::vec t0, arma::vec t1, arma::vec d, arma::vec tj, arma::vec w, int len, double mu, int ub, arma::vec lambda, double wcov, double a, bool adjust, unsigned int ncov, double devnull, bool display_progress); 160 | 
RcppExport SEXP _FLORAL_fg_enet_al(SEXP xSEXP, SEXP t0SEXP, SEXP t1SEXP, SEXP dSEXP, SEXP tjSEXP, SEXP wSEXP, SEXP lenSEXP, SEXP muSEXP, SEXP ubSEXP, SEXP lambdaSEXP, SEXP wcovSEXP, SEXP aSEXP, SEXP adjustSEXP, SEXP ncovSEXP, SEXP devnullSEXP, SEXP display_progressSEXP) { 161 | BEGIN_RCPP 162 | Rcpp::RObject rcpp_result_gen; 163 | Rcpp::RNGScope rcpp_rngScope_gen; 164 | Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); 165 | Rcpp::traits::input_parameter< arma::vec >::type t0(t0SEXP); 166 | Rcpp::traits::input_parameter< arma::vec >::type t1(t1SEXP); 167 | Rcpp::traits::input_parameter< arma::vec >::type d(dSEXP); 168 | Rcpp::traits::input_parameter< arma::vec >::type tj(tjSEXP); 169 | Rcpp::traits::input_parameter< arma::vec >::type w(wSEXP); 170 | Rcpp::traits::input_parameter< int >::type len(lenSEXP); 171 | Rcpp::traits::input_parameter< double >::type mu(muSEXP); 172 | Rcpp::traits::input_parameter< int >::type ub(ubSEXP); 173 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 174 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 175 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 176 | Rcpp::traits::input_parameter< bool >::type adjust(adjustSEXP); 177 | Rcpp::traits::input_parameter< unsigned int >::type ncov(ncovSEXP); 178 | Rcpp::traits::input_parameter< double >::type devnull(devnullSEXP); 179 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 180 | rcpp_result_gen = Rcpp::wrap(fg_enet_al(x, t0, t1, d, tj, w, len, mu, ub, lambda, wcov, a, adjust, ncov, devnull, display_progress)); 181 | return rcpp_result_gen; 182 | END_RCPP 183 | } 184 | // gee_NR 185 | Rcpp::List gee_NR(double N, arma::vec nt, arma::vec y, arma::mat X, double nx, Rcpp::Function linkinv, Rcpp::Function mueta, Rcpp::Function variance, arma::vec beta_new, arma::cube Rhat, double fihat, double lambda, double a, double alpha, double ncov, double wcov, double eps, double muu); 186 | RcppExport SEXP _FLORAL_gee_NR(SEXP NSEXP, SEXP ntSEXP, SEXP ySEXP, SEXP XSEXP, SEXP nxSEXP, SEXP linkinvSEXP, SEXP muetaSEXP, SEXP varianceSEXP, SEXP beta_newSEXP, SEXP RhatSEXP, SEXP fihatSEXP, SEXP lambdaSEXP, SEXP aSEXP, SEXP alphaSEXP, SEXP ncovSEXP, SEXP wcovSEXP, SEXP epsSEXP, SEXP muuSEXP) { 187 | BEGIN_RCPP 188 | Rcpp::RObject rcpp_result_gen; 189 | Rcpp::RNGScope rcpp_rngScope_gen; 190 | Rcpp::traits::input_parameter< double >::type N(NSEXP); 191 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 192 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 193 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 194 | Rcpp::traits::input_parameter< double >::type nx(nxSEXP); 195 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 196 | Rcpp::traits::input_parameter< Rcpp::Function >::type mueta(muetaSEXP); 197 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 198 | Rcpp::traits::input_parameter< arma::vec >::type beta_new(beta_newSEXP); 199 | Rcpp::traits::input_parameter< arma::cube >::type Rhat(RhatSEXP); 200 | Rcpp::traits::input_parameter< double >::type fihat(fihatSEXP); 201 | Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); 202 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 203 | Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); 204 | Rcpp::traits::input_parameter< double >::type ncov(ncovSEXP); 205 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 206 | Rcpp::traits::input_parameter< double >::type 
eps(epsSEXP); 207 | Rcpp::traits::input_parameter< double >::type muu(muuSEXP); 208 | rcpp_result_gen = Rcpp::wrap(gee_NR(N, nt, y, X, nx, linkinv, mueta, variance, beta_new, Rhat, fihat, lambda, a, alpha, ncov, wcov, eps, muu)); 209 | return rcpp_result_gen; 210 | END_RCPP 211 | } 212 | // gee_cor 213 | Rcpp::List gee_cor(double N, arma::vec nt, arma::vec y, arma::mat X, Rcpp::Function linkinv, Rcpp::Function variance, arma::vec beta_new, std::string corstr, double maxclsz, bool scalefix, double scalevalue); 214 | RcppExport SEXP _FLORAL_gee_cor(SEXP NSEXP, SEXP ntSEXP, SEXP ySEXP, SEXP XSEXP, SEXP linkinvSEXP, SEXP varianceSEXP, SEXP beta_newSEXP, SEXP corstrSEXP, SEXP maxclszSEXP, SEXP scalefixSEXP, SEXP scalevalueSEXP) { 215 | BEGIN_RCPP 216 | Rcpp::RObject rcpp_result_gen; 217 | Rcpp::RNGScope rcpp_rngScope_gen; 218 | Rcpp::traits::input_parameter< double >::type N(NSEXP); 219 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 220 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 221 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 222 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 223 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 224 | Rcpp::traits::input_parameter< arma::vec >::type beta_new(beta_newSEXP); 225 | Rcpp::traits::input_parameter< std::string >::type corstr(corstrSEXP); 226 | Rcpp::traits::input_parameter< double >::type maxclsz(maxclszSEXP); 227 | Rcpp::traits::input_parameter< bool >::type scalefix(scalefixSEXP); 228 | Rcpp::traits::input_parameter< double >::type scalevalue(scalevalueSEXP); 229 | rcpp_result_gen = Rcpp::wrap(gee_cor(N, nt, y, X, linkinv, variance, beta_new, corstr, maxclsz, scalefix, scalevalue)); 230 | return rcpp_result_gen; 231 | END_RCPP 232 | } 233 | // gee_fit 234 | Rcpp::List gee_fit(arma::vec y, arma::mat X, arma::vec nt, Rcpp::Function linkinv, Rcpp::Function mueta, Rcpp::Function variance, std::string corstr, arma::vec lambda, double a, double ncov, double wcov, double tol, double eps, double muu, int maxiter1, int maxiter2, bool scalefix, double scalevalue, bool display_progress); 235 | RcppExport SEXP _FLORAL_gee_fit(SEXP ySEXP, SEXP XSEXP, SEXP ntSEXP, SEXP linkinvSEXP, SEXP muetaSEXP, SEXP varianceSEXP, SEXP corstrSEXP, SEXP lambdaSEXP, SEXP aSEXP, SEXP ncovSEXP, SEXP wcovSEXP, SEXP tolSEXP, SEXP epsSEXP, SEXP muuSEXP, SEXP maxiter1SEXP, SEXP maxiter2SEXP, SEXP scalefixSEXP, SEXP scalevalueSEXP, SEXP display_progressSEXP) { 236 | BEGIN_RCPP 237 | Rcpp::RObject rcpp_result_gen; 238 | Rcpp::RNGScope rcpp_rngScope_gen; 239 | Rcpp::traits::input_parameter< arma::vec >::type y(ySEXP); 240 | Rcpp::traits::input_parameter< arma::mat >::type X(XSEXP); 241 | Rcpp::traits::input_parameter< arma::vec >::type nt(ntSEXP); 242 | Rcpp::traits::input_parameter< Rcpp::Function >::type linkinv(linkinvSEXP); 243 | Rcpp::traits::input_parameter< Rcpp::Function >::type mueta(muetaSEXP); 244 | Rcpp::traits::input_parameter< Rcpp::Function >::type variance(varianceSEXP); 245 | Rcpp::traits::input_parameter< std::string >::type corstr(corstrSEXP); 246 | Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); 247 | Rcpp::traits::input_parameter< double >::type a(aSEXP); 248 | Rcpp::traits::input_parameter< double >::type ncov(ncovSEXP); 249 | Rcpp::traits::input_parameter< double >::type wcov(wcovSEXP); 250 | Rcpp::traits::input_parameter< double >::type tol(tolSEXP); 251 | Rcpp::traits::input_parameter< double >::type eps(epsSEXP); 252 | 
Rcpp::traits::input_parameter< double >::type muu(muuSEXP); 253 | Rcpp::traits::input_parameter< int >::type maxiter1(maxiter1SEXP); 254 | Rcpp::traits::input_parameter< int >::type maxiter2(maxiter2SEXP); 255 | Rcpp::traits::input_parameter< bool >::type scalefix(scalefixSEXP); 256 | Rcpp::traits::input_parameter< double >::type scalevalue(scalevalueSEXP); 257 | Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); 258 | rcpp_result_gen = Rcpp::wrap(gee_fit(y, X, nt, linkinv, mueta, variance, corstr, lambda, a, ncov, wcov, tol, eps, muu, maxiter1, maxiter2, scalefix, scalevalue, display_progress)); 259 | return rcpp_result_gen; 260 | END_RCPP 261 | } 262 | 263 | static const R_CallMethodDef CallEntries[] = { 264 | {"_FLORAL_softthreshold", (DL_FUNC) &_FLORAL_softthreshold, 2}, 265 | {"_FLORAL_gd_cov", (DL_FUNC) &_FLORAL_gd_cov, 5}, 266 | {"_FLORAL_gd_cov_al", (DL_FUNC) &_FLORAL_gd_cov_al, 11}, 267 | {"_FLORAL_linear_enet_al", (DL_FUNC) &_FLORAL_linear_enet_al, 11}, 268 | {"_FLORAL_logistic_enet_al", (DL_FUNC) &_FLORAL_logistic_enet_al, 13}, 269 | {"_FLORAL_cox_enet_al", (DL_FUNC) &_FLORAL_cox_enet_al, 17}, 270 | {"_FLORAL_cox_timedep_enet_al", (DL_FUNC) &_FLORAL_cox_timedep_enet_al, 15}, 271 | {"_FLORAL_fg_enet_al", (DL_FUNC) &_FLORAL_fg_enet_al, 16}, 272 | {"_FLORAL_gee_NR", (DL_FUNC) &_FLORAL_gee_NR, 18}, 273 | {"_FLORAL_gee_cor", (DL_FUNC) &_FLORAL_gee_cor, 11}, 274 | {"_FLORAL_gee_fit", (DL_FUNC) &_FLORAL_gee_fit, 19}, 275 | {NULL, NULL, 0} 276 | }; 277 | 278 | RcppExport void R_init_FLORAL(DllInfo *dll) { 279 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 280 | R_useDynamicSymbols(dll, FALSE); 281 | } 282 | -------------------------------------------------------------------------------- /R/LogRatioLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | intercept=FALSE, 12 | foldid=NULL, 13 | step2=FALSE, 14 | progress=TRUE, 15 | plot=TRUE, 16 | ncore=1){ 17 | 18 | ptm <- proc.time() 19 | 20 | y0 <- y 21 | y <- y - mean(y) 22 | n <- length(y) 23 | p <- ncol(x) 24 | 25 | if (a > 0){ 26 | lambda0 <- max(abs(t(y) %*% x))/(a*n) 27 | }else if (a == 0){ 28 | lambda0 <- max(abs(t(y) %*% x))/(1e-3*n) 29 | } 30 | 31 | adjust = FALSE 32 | if (ncov > 0) adjust = TRUE 33 | 34 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 35 | 36 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 37 | 38 | if (progress) cat("Algorithm running for full dataset: \n") 39 | 40 | fullfit <- linear_enet_al(x,y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress) 41 | 42 | if (!is.null(colnames(x))){ 43 | rownames(fullfit$beta) = colnames(x) 44 | }else{ 45 | colnames(x) = 1:ncol(x) 46 | rownames(fullfit$beta) = 1:ncol(x) 47 | } 48 | 49 | if (intercept){ 50 | beta0 <- mean(y0) - colMeans(x %*% fullfit$beta) 51 | } 52 | 53 | if (!is.null(ncv)){ 54 | 55 | cvmse <- matrix(NA,nrow=length.lambda,ncol=ncv) 56 | 57 | if (is.null(foldid)){ 58 | labels <- caret::createFolds(1:nrow(x),k=ncv,list=FALSE) 59 | }else{ 60 | labels <- foldid 61 | } 62 | 63 | if (ncore == 1){ 64 | 65 | for (cv in 1:ncv){ 66 | 67 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 68 | 69 | train.x <- x[labels!=cv,] 70 | train.y <- y[labels!=cv] 71 | test.x <- x[labels==cv,] 72 | test.y <- 
y[labels==cv] 73 | 74 | cvfit <- linear_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress) 75 | 76 | cvmse[,cv] <- apply(test.x %*% cvfit$beta,2,function(x) sum((test.y-x)^2)/length(test.y)) 77 | 78 | } 79 | 80 | }else if(ncore > 1){ 81 | 82 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 83 | 84 | cl <- makeCluster(ncore) 85 | registerDoParallel(cl) 86 | 87 | cvmse <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 88 | 89 | train.x <- x[labels!=cv,] 90 | train.y <- y[labels!=cv] 91 | test.x <- x[labels==cv,] 92 | test.y <- y[labels==cv] 93 | 94 | cvfit <- linear_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,FALSE) 95 | 96 | apply(test.x %*% cvfit$beta,2,function(x) sum((test.y-x)^2)/length(test.y)) 97 | 98 | } 99 | 100 | stopCluster(cl) 101 | 102 | } 103 | 104 | mean.cvmse <- rowMeans(cvmse) 105 | se.cvmse <- apply(cvmse,1,function(x) sd(x)/sqrt(ncv)) 106 | 107 | idx.min <- which.min(mean.cvmse) 108 | se.min <- se.cvmse[idx.min] 109 | idx.1se <- suppressWarnings(min(which(mean.cvmse < mean.cvmse[idx.min] + se.min & 1:length.lambda < idx.min))) 110 | if (idx.1se == -Inf) idx.1se = 1 111 | 112 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 113 | add.1se = fullfit$beta[,idx.1se]) 114 | 115 | best.idx <- list(idx.min = idx.min, 116 | idx.1se = idx.1se) 117 | 118 | ret <- list(beta=fullfit$beta, 119 | lambda=fullfit$lambda, 120 | a=a, 121 | loss=fullfit$loss, 122 | mse=fullfit$mse, 123 | tol=fullfit$tol, 124 | iters=fullfit$iters, 125 | cvmse.mean=mean.cvmse, 126 | cvmse.se=se.cvmse, 127 | best.beta=best.beta, 128 | best.idx=best.idx, 129 | foldid=labels 130 | ) 131 | 132 | if (plot){ 133 | 134 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 135 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 136 | beta_nzero$loglambda <- log(beta_nzero$lambda) 137 | 138 | lambda_count <- data.frame(loglambda = log(ret$lambda), 139 | count = colSums(ret$beta != 0)) 140 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 141 | 142 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 143 | top10name <- names(top10feat) 144 | 145 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 146 | geom_line() + 147 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 148 | theme_bw() + 149 | theme(legend.position = "none") + 150 | xlab(expression(paste("log(",lambda,")"))) + 151 | ylab("Coefficient") + 152 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 153 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 154 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 155 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 156 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 157 | 158 | mseplot <- data.frame(loglambda=log(ret$lambda), 159 | mse=ret$cvmse.mean, 160 | se=ret$cvmse.se, 161 | mseaddse=ret$cvmse.mean+ret$cvmse.se, 162 | mseminse=ret$cvmse.mean-ret$cvmse.se) 163 | 164 | pmse <- ggplot(mseplot, aes(x=.data$loglambda, y=.data$mse)) + 165 | geom_errorbar(aes(ymin=.data$mseminse,ymax=.data$mseaddse),color="grey")+ 166 | geom_point(color="red")+ 167 | theme_bw() + 168 | xlab(expression(paste("log(",lambda,")"))) + 169 | ylab("Mean-Squared 
Error")+ 170 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 171 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 172 | annotate("text",x=lambda_count$loglambda,y=max(mseplot$mseaddse)+0.05,label=as.character(lambda_count$count))+ 173 | ggtitle(expression(paste("Cross-validated MSE versus log(",lambda,")"))) 174 | 175 | ret$pcoef <- pcoef 176 | ret$pmse <- pmse 177 | 178 | } 179 | 180 | if (step2){ 181 | 182 | if (!adjust){ 183 | 184 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 185 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 186 | 187 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 188 | for (k in 1:ncol(idxs)){ 189 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 190 | } 191 | 192 | if (ncol(x.select.min) > 1){ 193 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 194 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 195 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 196 | }else{ 197 | idxs <- as.vector(idxs) 198 | } 199 | 200 | df_step2 <- data.frame(y=y,x=x.select.min) 201 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 202 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 203 | 204 | if (is.null(ncol(idxs))){ 205 | if (length(vars) == 2){ 206 | selected <- idxs 207 | }else{ 208 | selected <- NULL 209 | } 210 | }else{ 211 | selected <- idxs[,vars] 212 | } 213 | # for (k1 in 1:nrow(selected)){ 214 | # for (k2 in 1:ncol(selected)){ 215 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 216 | # } 217 | # } 218 | 219 | ret$step2.feature.min = selected 220 | ret$step2fit.min <- step2fit 221 | } 222 | 223 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 224 | 225 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 226 | 227 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 228 | for (k in 1:ncol(idxs)){ 229 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 230 | } 231 | if (ncol(x.select.min) > 1){ 232 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 233 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 234 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 235 | }else{ 236 | idxs <- as.vector(idxs) 237 | } 238 | df_step2 <- data.frame(y=y,x=x.select.min) 239 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 240 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 241 | 242 | if (is.null(ncol(idxs))){ 243 | if (length(vars) == 2){ 244 | selected <- idxs 245 | }else{ 246 | selected <- NULL 247 | } 248 | }else{ 249 | selected <- idxs[,vars] 250 | } 251 | # for (k1 in 1:nrow(selected)){ 252 | # for (k2 in 1:ncol(selected)){ 253 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 254 | # } 255 | # } 256 | 257 | ret$step2.feature.1se = selected 258 | ret$step2fit.1se <- step2fit 259 | } 260 | 261 | }else{ 262 | 263 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 264 | 265 | allidx <- which(ret$best.beta$min.mse!=0) 266 | 267 | covidx <- allidx[allidx <= ncov] 268 | taxidx <- allidx[allidx > ncov] 269 | 270 | idxs <- NULL 271 | x.select.min <- NULL 272 | 273 | if (length(taxidx) > 1){ 274 | idxs <- combn(taxidx,2) 275 | for (k in 1:ncol(idxs)){ 276 | x.select.min 
<- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 277 | } 278 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 279 | } 280 | 281 | if (length(covidx) > 0){ 282 | x.select.min <- cbind(x.select.min, x[,covidx]) 283 | if (!is.null(idxs)){ 284 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 285 | }else{ 286 | colnames(x.select.min) <- colnames(x)[covidx] 287 | } 288 | } 289 | 290 | 291 | # if(is.null(x.select.min)) break 292 | 293 | if (ncol(x.select.min) > 1){ 294 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 295 | 296 | if (length(taxidx) < 2){ 297 | 298 | idxs <- NULL 299 | 300 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 301 | 302 | }else{ 303 | 304 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 305 | 306 | if (length(covidx) > 0){ 307 | 308 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 309 | # 1:ncol(idxs))] 310 | 311 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 312 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 313 | 314 | } 315 | 316 | } 317 | 318 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 319 | } 320 | 321 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 322 | df_step2 <- data.frame(y=y,x.select.min) 323 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 324 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 325 | vars <- vars[!is.na(vars)] 326 | 327 | selected <- NULL 328 | if (is.null(ncol(idxs)) & length(vars) > 0){ 329 | selected <- idxs 330 | }else if (length(vars) > 0){ 331 | selected <- idxs[,vars] 332 | } 333 | 334 | ret$step2.feature.min = selected 335 | ret$step2fit.min <- step2fit 336 | } 337 | 338 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 339 | 340 | allidx <- which(ret$best.beta$add.1se!=0) 341 | 342 | covidx <- allidx[allidx <= ncov] 343 | taxidx <- allidx[allidx > ncov] 344 | 345 | idxs <- NULL 346 | x.select.min <- NULL 347 | 348 | if (length(taxidx) > 1){ 349 | idxs <- combn(taxidx,2) 350 | for (k in 1:ncol(idxs)){ 351 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 352 | } 353 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 354 | } 355 | 356 | if (length(covidx) > 0){ 357 | x.select.min <- cbind(x.select.min, x[,covidx]) 358 | if (!is.null(idxs)){ 359 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 360 | }else{ 361 | colnames(x.select.min) <- colnames(x)[covidx] 362 | } 363 | } 364 | 365 | # if(is.null(x.select.min)) break 366 | 367 | if (ncol(x.select.min) > 1){ 368 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="gaussian") 369 | 370 | if (length(taxidx) < 2){ 371 | 372 | idxs <- NULL 373 | 374 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 375 | 376 | }else{ 377 | 378 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 379 | 380 | if (length(covidx) > 0){ 381 | 382 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 383 | 384 | idxs <- 
idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 385 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 386 | 387 | } 388 | 389 | } 390 | 391 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 392 | } 393 | 394 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 395 | df_step2 <- data.frame(y=y,x.select.min) 396 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=gaussian),trace=0)) 397 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 398 | vars <- vars[!is.na(vars)] 399 | 400 | selected <- NULL 401 | if (is.null(ncol(idxs)) & length(vars) > 0){ 402 | selected <- idxs 403 | }else if (length(vars) > 0){ 404 | selected <- idxs[,vars] 405 | } 406 | 407 | ret$step2.feature.1se = selected 408 | ret$step2fit.1se <- step2fit 409 | 410 | } 411 | 412 | } 413 | 414 | } 415 | 416 | }else{ 417 | 418 | ret <- list(beta=fullfit$beta, 419 | lambda=fullfit$lambda, 420 | a=a, 421 | loss=fullfit$loss, 422 | mse=fullfit$mse 423 | ) 424 | 425 | if (plot){ 426 | 427 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 428 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 429 | beta_nzero$loglambda <- log(beta_nzero$lambda) 430 | 431 | lambda_count <- data.frame(loglambda = log(ret$lambda), 432 | count = colSums(ret$beta != 0)) 433 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 434 | 435 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 436 | top10name <- names(top10feat) 437 | 438 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 439 | geom_line() + 440 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 441 | theme_bw() + 442 | theme(legend.position = "none") + 443 | xlab(expression(paste("log(",lambda,")"))) + 444 | ylab("Coefficient") + 445 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 446 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 447 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 448 | 449 | ret$pcoef <- pcoef 450 | 451 | } 452 | 453 | } 454 | 455 | if(intercept) ret$intercept = beta0 456 | 457 | ret$runtime = proc.time() - ptm 458 | 459 | return(ret) 460 | 461 | } -------------------------------------------------------------------------------- /R/LogRatioLogisticLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioLogisticLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | foldid=NULL, 12 | step2=FALSE, 13 | progress=TRUE, 14 | plot=TRUE, 15 | loop1=FALSE, 16 | loop2=FALSE, 17 | ncore=1){ 18 | 19 | ptm <- proc.time() 20 | 21 | n <- length(y) 22 | p <- ncol(x) 23 | sfun = y-0.5 24 | 25 | if (a > 0){ 26 | lambda0 <- max(abs(t(sfun) %*% x))/(a*n) 27 | }else if (a == 0){ 28 | lambda0 <- max(abs(t(sfun) %*% x))/(1e-3*n) 29 | } 30 | 31 | adjust = FALSE 32 | if (ncov > 0) adjust = TRUE 33 | 34 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 35 | 36 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 37 | 38 | if (progress) cat("Algorithm running for full 
dataset: \n") 39 | 40 | fullfit <- logistic_enet_al(x,y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress,loop1,loop2) 41 | 42 | if (!is.null(colnames(x))){ 43 | rownames(fullfit$beta) = colnames(x) 44 | }else{ 45 | colnames(x) = 1:ncol(x) 46 | rownames(fullfit$beta) = 1:ncol(x) 47 | } 48 | 49 | if (!is.null(ncv)){ 50 | 51 | cvmse <- matrix(NA,nrow=length.lambda,ncol=ncv) 52 | 53 | if (is.null(foldid)){ 54 | labels <- caret::createFolds(factor(y),k=ncv,list=FALSE) 55 | }else{ 56 | labels <- foldid 57 | } 58 | 59 | if (ncore == 1){ 60 | 61 | for (cv in 1:ncv){ 62 | 63 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 64 | 65 | train.x <- x[labels!=cv,] 66 | train.y <- y[labels!=cv] 67 | test.x <- x[labels==cv,] 68 | test.y <- y[labels==cv] 69 | 70 | cvfit <- logistic_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,progress,loop1,loop2) 71 | 72 | cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y- exp(x)/(1+exp(x)))^2)/length(test.y)) 73 | 74 | } 75 | 76 | }else if (ncore > 1){ 77 | 78 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 79 | 80 | cl <- makeCluster(ncore) 81 | registerDoParallel(cl) 82 | 83 | cvmse <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 84 | 85 | train.x <- x[labels!=cv,] 86 | train.y <- y[labels!=cv] 87 | test.x <- x[labels==cv,] 88 | test.y <- y[labels==cv] 89 | 90 | cvfit <- logistic_enet_al(train.x,train.y,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,FALSE,loop1,loop2) 91 | 92 | apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y- exp(x)/(1+exp(x)))^2)/length(test.y)) 93 | 94 | } 95 | 96 | stopCluster(cl) 97 | 98 | } 99 | 100 | mean.cvmse <- rowMeans(cvmse) 101 | se.cvmse <- apply(cvmse,1,function(x) sd(x)/sqrt(ncv)) 102 | 103 | idx.min <- which.min(mean.cvmse) 104 | se.min <- se.cvmse[idx.min] 105 | idx.1se <- suppressWarnings(max(which(mean.cvmse > mean.cvmse[idx.min] + se.min & 1:length.lambda < idx.min))) 106 | if (idx.1se == -Inf) idx.1se = 1 107 | 108 | best.beta0 <- list(min.mse = fullfit$beta0[idx.min], 109 | add.1se = fullfit$beta0[idx.1se]) 110 | 111 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 112 | add.1se = fullfit$beta[,idx.1se]) 113 | 114 | best.idx <- list(idx.min = idx.min, 115 | idx.1se = idx.1se) 116 | 117 | ret <- list(beta=fullfit$beta, 118 | beta0=fullfit$beta0, 119 | lambda=fullfit$lambda, 120 | a=a, 121 | loss=fullfit$loss, 122 | mse=fullfit$mse, 123 | tol=fullfit$tol, 124 | iters=fullfit$iters, 125 | cvmse.mean=mean.cvmse, 126 | cvmse.se=se.cvmse, 127 | best.beta=best.beta, 128 | best.beta0=best.beta0, 129 | best.idx=best.idx, 130 | foldid=labels 131 | ) 132 | 133 | if (plot){ 134 | 135 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 136 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 137 | beta_nzero$loglambda <- log(beta_nzero$lambda) 138 | 139 | lambda_count <- data.frame(loglambda = log(ret$lambda), 140 | count = colSums(ret$beta != 0)) 141 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 142 | 143 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 144 | top10name <- names(top10feat) 145 | 146 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 147 | geom_line() + 148 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 149 | theme_bw() + 150 | 
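# Annotation (explanatory, not part of the package source): for the logistic
# model the null-model score (all coefficients zero, fitted probability 1/2)
# is y - 0.5, so the path starts at lambda0 = max|x'(y - 0.5)| / (a * n).
# Cross-validation above scores each lambda by the Brier-type error
# mean((y_test - p_hat)^2), with p_hat = exp(eta)/(1 + exp(eta)) and eta built
# from cbind(1, test.x) %*% rbind(t(beta0), beta).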
theme(legend.position = "none") + 151 | xlab(expression(paste("log(",lambda,")"))) + 152 | ylab("Coefficient") + 153 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 154 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 155 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 156 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 157 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 158 | 159 | mseplot <- data.frame(loglambda=log(ret$lambda), 160 | mse=ret$cvmse.mean, 161 | se=ret$cvmse.se, 162 | mseaddse=ret$cvmse.mean+ret$cvmse.se, 163 | mseminse=ret$cvmse.mean-ret$cvmse.se) 164 | 165 | pmse <- ggplot(mseplot, aes(x=.data$loglambda, y=.data$mse)) + 166 | geom_errorbar(aes(ymin=.data$mseminse,ymax=.data$mseaddse),color="grey")+ 167 | geom_point(color="red")+ 168 | theme_bw() + 169 | xlab(expression(paste("log(",lambda,")"))) + 170 | ylab("Mean-Squared Error")+ 171 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 172 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 173 | annotate("text",x=lambda_count$loglambda,y=max(mseplot$mseaddse)+0.05,label=as.character(lambda_count$count))+ 174 | ggtitle(expression(paste("Cross-validated MSE versus log(",lambda,")"))) 175 | 176 | ret$pcoef <- pcoef 177 | ret$pmse <- pmse 178 | 179 | } 180 | 181 | if (step2){ 182 | 183 | if (!adjust){ 184 | 185 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 186 | 187 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 188 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 189 | for (k in 1:ncol(idxs)){ 190 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 191 | } 192 | 193 | if (ncol(x.select.min) > 1){ 194 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 195 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 196 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 197 | }else{ 198 | idxs <- as.vector(idxs) 199 | } 200 | 201 | df_step2 <- data.frame(y=y,x=x.select.min) 202 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 203 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 204 | 205 | if (is.null(ncol(idxs))){ 206 | if (length(vars) == 2){ 207 | selected <- idxs 208 | }else{ 209 | selected <- NULL 210 | } 211 | }else{ 212 | selected <- idxs[,vars] 213 | } 214 | 215 | # for (k1 in 1:nrow(selected)){ 216 | # for (k2 in 1:ncol(selected)){ 217 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 218 | # } 219 | # } 220 | ret$step2.feature.min = selected 221 | ret$step2fit.min <- step2fit 222 | } 223 | 224 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 225 | 226 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 227 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 228 | for (k in 1:ncol(idxs)){ 229 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 230 | } 231 | 232 | if (ncol(x.select.min) > 1){ 233 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 234 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 235 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 236 | }else{ 237 | idxs <- as.vector(idxs) 
238 | } 239 | 240 | df_step2 <- data.frame(y=y,x=x.select.min) 241 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 242 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 243 | 244 | if (is.null(ncol(idxs))){ 245 | if (length(vars) == 2){ 246 | selected <- idxs 247 | }else{ 248 | selected <- NULL 249 | } 250 | }else{ 251 | selected <- idxs[,vars] 252 | } 253 | # for (k1 in 1:nrow(selected)){ 254 | # for (k2 in 1:ncol(selected)){ 255 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 256 | # } 257 | # } 258 | ret$step2.feature.1se = selected 259 | ret$step2fit.1se <- step2fit 260 | } 261 | 262 | }else{ 263 | 264 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 265 | 266 | allidx <- which(ret$best.beta$min.mse!=0) 267 | 268 | covidx <- allidx[allidx <= ncov] 269 | taxidx <- allidx[allidx > ncov] 270 | 271 | idxs <- NULL 272 | x.select.min <- NULL 273 | 274 | if (length(taxidx) > 1){ 275 | idxs <- combn(taxidx,2) 276 | for (k in 1:ncol(idxs)){ 277 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 278 | } 279 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 280 | } 281 | 282 | if (length(covidx) > 0){ 283 | x.select.min <- cbind(x.select.min, x[,covidx]) 284 | if (!is.null(idxs)){ 285 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 286 | }else{ 287 | colnames(x.select.min) <- colnames(x)[covidx] 288 | } 289 | } 290 | 291 | 292 | # if(is.null(x.select.min)) break 293 | 294 | if (ncol(x.select.min) > 1){ 295 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 296 | 297 | if (length(taxidx) < 2){ 298 | 299 | idxs <- NULL 300 | 301 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 302 | 303 | }else{ 304 | 305 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 306 | 307 | if (length(covidx) > 0){ 308 | 309 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 310 | # 1:ncol(idxs))] 311 | 312 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 313 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 314 | 315 | } 316 | 317 | } 318 | 319 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 320 | } 321 | 322 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 323 | 324 | df_step2 <- data.frame(y=y,x.select.min) 325 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 326 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 327 | vars <- vars[!is.na(vars)] 328 | 329 | selected <- NULL 330 | if (is.null(ncol(idxs)) & length(vars) > 0){ 331 | selected <- idxs 332 | }else if (length(vars) > 0){ 333 | selected <- idxs[,vars] 334 | } 335 | 336 | ret$step2.feature.min = selected 337 | ret$step2fit.min <- step2fit 338 | } 339 | 340 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 341 | 342 | allidx <- which(ret$best.beta$add.1se!=0) 343 | 344 | covidx <- allidx[allidx <= ncov] 345 | taxidx <- allidx[allidx > ncov] 346 | 347 | idxs <- NULL 348 | x.select.min <- NULL 349 | 350 | if (length(taxidx) > 1){ 351 | idxs <- combn(taxidx,2) 352 | for (k in 1:ncol(idxs)){ 353 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] 
- x[,idxs[2,k]]) 354 | } 355 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 356 | } 357 | 358 | if (length(covidx) > 0){ 359 | x.select.min <- cbind(x.select.min, x[,covidx]) 360 | if (!is.null(idxs)){ 361 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 362 | }else{ 363 | colnames(x.select.min) <- colnames(x)[covidx] 364 | } 365 | } 366 | 367 | # if(is.null(x.select.min)) break 368 | 369 | if (ncol(x.select.min) > 1){ 370 | stepglmnet <- cv.glmnet(x=x.select.min,y=y,type.measure = "mse",family="binomial") 371 | 372 | if (length(taxidx) < 2){ 373 | 374 | idxs <- NULL 375 | 376 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 377 | 378 | }else{ 379 | 380 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 381 | 382 | if (length(covidx) > 0){ 383 | 384 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 385 | 386 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 387 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 388 | 389 | } 390 | 391 | } 392 | 393 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 394 | } 395 | 396 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 397 | 398 | df_step2 <- data.frame(y=y,x.select.min) 399 | step2fit <- suppressWarnings(step(glm(y~.,data=df_step2,family=binomial),trace=0)) 400 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 401 | vars <- vars[!is.na(vars)] 402 | 403 | selected <- NULL 404 | if (is.null(ncol(idxs)) & length(vars) > 0){ 405 | selected <- idxs 406 | }else if (length(vars) > 0){ 407 | selected <- idxs[,vars] 408 | } 409 | 410 | ret$step2.feature.1se = selected 411 | ret$step2fit.1se <- step2fit 412 | 413 | } 414 | 415 | } 416 | 417 | } 418 | 419 | }else{ 420 | 421 | ret <- list(beta=fullfit$beta, 422 | beta0=fullfit$beta0, 423 | lambda=fullfit$lambda, 424 | a=a, 425 | loss=fullfit$loss, 426 | mse=fullfit$mse 427 | ) 428 | 429 | if (plot){ 430 | 431 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 432 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 433 | beta_nzero$loglambda <- log(beta_nzero$lambda) 434 | 435 | lambda_count <- data.frame(loglambda = log(ret$lambda), 436 | count = colSums(ret$beta != 0)) 437 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 438 | 439 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 440 | top10name <- names(top10feat) 441 | 442 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 443 | geom_line() + 444 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 445 | theme_bw() + 446 | theme(legend.position = "none") + 447 | xlab(expression(paste("log(",lambda,")"))) + 448 | ylab("Coefficient") + 449 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 450 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 451 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 452 | 453 | ret$pcoef <- pcoef 454 | 455 | } 456 | 457 | } 458 | 459 | ret$runtime <- proc.time() - ptm 460 | 461 | ret$intercept <- ret$beta0 462 | 
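# Annotation (illustrative sketch, not part of the package source): when
# cross-validation has been run, best.beta0 / best.beta are available and
# predicted probabilities for new samples follow the same inverse-logit used
# in the CV loop above.  `newx` is a hypothetical matrix on the same
# log1p-count scale as x; the block is guarded so it never executes here.
if (FALSE) {
  eta  <- ret$best.beta0$min.mse + newx %*% ret$best.beta$min.mse
  phat <- exp(eta) / (1 + exp(eta))
}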
ret$beta0 <- NULL 463 | 464 | return(ret) 465 | 466 | } -------------------------------------------------------------------------------- /src/pgee.cpp: -------------------------------------------------------------------------------- 1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*- 2 | 3 | // [[Rcpp::plugins(cpp17)]] 4 | // [[Rcpp::depends(RcppArmadillo)]] 5 | #include 6 | #include 7 | using namespace arma; 8 | using namespace std; 9 | using namespace Rcpp; 10 | // [[Rcpp::depends(RcppProgress)]] 11 | #include 12 | #include 13 | 14 | // [[Rcpp::export]] 15 | Rcpp::List gee_NR(double N, // Number of subjects 16 | arma::vec nt, // number of obs per subject 17 | arma::vec y, 18 | arma::mat X, 19 | double nx, // Number of covariates (ncol(X)) 20 | Rcpp::Function linkinv, 21 | Rcpp::Function mueta, 22 | Rcpp::Function variance, 23 | arma::vec beta_new, 24 | arma::cube Rhat, // estimated working correlation matrix (Ehat from cor_gee.R) 25 | double fihat, // estimated scale parameter (fi from cor_gee.R) 26 | double lambda, 27 | double a, 28 | double alpha, 29 | double ncov, 30 | double wcov, 31 | double eps=1e-6, 32 | double muu=1e6){ 33 | 34 | // Rcpp::Rcout << "Flag1" << endl; 35 | 36 | arma::vec aindex=cumsum(nt); 37 | arma::vec index; 38 | index.zeros(N); 39 | index.subvec(1,N-1) = aindex.subvec(0,N-2); 40 | 41 | arma::vec eta = X * beta_new; 42 | // sexp linkinveta = linkinv(eta); 43 | // arma::vec mu = linkinveta; 44 | arma::vec mu = Rcpp::as(linkinv(eta)); 45 | 46 | // Rcpp::Rcout << "mu: " << mu << endl; 47 | 48 | arma::mat E1; 49 | arma::mat E2; 50 | 51 | if (a <= 1){ 52 | 53 | if (ncov == 0){ 54 | 55 | E1 = diagmat(lambda*(a + (1-a)*abs(beta_new))/(abs(beta_new)+eps)); 56 | E2.ones(nx); 57 | E2 = E2 * (muu * (accu(beta_new) + alpha)); 58 | 59 | }else if (ncov > 0){ 60 | 61 | // This is E on Wang et al.(2012) 62 | 63 | arma::vec lw; 64 | lw.ones(size(beta_new)); 65 | lw.subvec(0,ncov-1) = lw.subvec(0,ncov-1)*wcov; 66 | 67 | E1 = diagmat((lambda*lw % (a + (1-a)*abs(beta_new)))/(abs(beta_new)+eps)); 68 | E2.ones(nx); 69 | arma::vec muuvec(size(beta_new)); 70 | muuvec.fill(muu); 71 | muuvec.subvec(0,ncov-1).zeros(); 72 | arma::vec beta_new_sub = beta_new.subvec(ncov,beta_new.n_elem-1); 73 | E2 = E2 % (muuvec * (accu(beta_new_sub) + alpha)); 74 | 75 | } 76 | 77 | }else if (a > 0){ 78 | 79 | if (ncov == 0){ 80 | 81 | arma::vec beta_abs = abs(beta_new); 82 | arma::vec b1 = zeros(size(beta_new)); 83 | arma::uvec pos1 = find(beta_abs > lambda); 84 | b1(pos1).ones(); 85 | 86 | // Rcpp::Rcout << "b1: " << 1-b1 << endl; 87 | 88 | arma::vec b2 = zeros(size(beta_new)); 89 | arma::uvec pos2 = find(beta_abs < (lambda*a)); 90 | b2(pos2).ones(); 91 | 92 | // Rcpp::Rcout << "b2: " << 1-b2 << endl; 93 | 94 | E1 = diagmat((lambda*(1-b1)+b1%((lambda*a)-beta_new)%b2/(a-1))/(abs(beta_new)+eps)); 95 | E2.ones(nx); 96 | E2 = E2 * (muu * (accu(beta_new) + alpha)); 97 | 98 | }else if (ncov > 0){ 99 | 100 | // This is E on Wang et al.(2012) 101 | 102 | arma::vec beta_abs = abs(beta_new); 103 | arma::vec b1 = zeros(size(beta_new)); 104 | arma::uvec pos1 = find(beta_abs > lambda); 105 | b1(pos1).ones(); 106 | 107 | // Rcpp::Rcout << "b1: " << b1 << endl; 108 | 109 | arma::vec b2 = zeros(size(beta_new)); 110 | arma::uvec pos2 = find(beta_abs < (lambda*a)); 111 | b2(pos2).ones(); 112 | 113 | arma::vec lw; 114 | lw.ones(size(beta_new)); 115 | lw.subvec(0,ncov-1) = lw.subvec(0,ncov-1)*wcov; 116 | 117 | E1 = diagmat((lw % 
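// Annotation (explanatory, not part of the package source): E1 is the
// diagonal local approximation of the penalty; in the elastic-net case
// (a <= 1) its entries are lambda * (a + (1 - a)*|beta_j|) / (|beta_j| + eps),
// with the first ncov covariate rows weighted by wcov.  E2 carries the
// augmented-Lagrangian term muu * (sum of taxa coefficients + alpha), which
// pushes the compositional part of beta toward the zero-sum constraint;
// covariate entries are excluded from that sum (muuvec is zeroed for them).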
(lambda*(1-b1)+b1%((lambda*a)-beta_new)%b2/(a-1)))/(abs(beta_new)+eps)); 118 | E2.ones(nx); 119 | arma::vec muuvec(size(beta_new)); 120 | muuvec.fill(muu); 121 | muuvec.subvec(0,ncov-1).zeros(); 122 | arma::vec beta_new_sub = beta_new.subvec(ncov,beta_new.n_elem-1); 123 | E2 = E2 % (muuvec * (accu(beta_new_sub) + alpha)); 124 | 125 | } 126 | 127 | } 128 | 129 | 130 | // if(is.null(pindex)==TRUE) { 131 | // E<-E1 132 | // } else 133 | // if(is.null(pindex)!=TRUE) { 134 | // E1[,pindex]<-0 135 | // E<-E1 136 | // } 137 | // 138 | arma::vec sum201; //<-matrix(0,nx,1) //gradient:S 139 | sum201.zeros(nx); 140 | 141 | arma::mat sum301; //<-matrix(0,nx,nx) //naive variance:H 142 | sum301.zeros(nx,nx); 143 | 144 | arma::mat sum401; //<-matrix(0,nx,nx) //naive variance:H 145 | sum401.zeros(nx,nx); 146 | 147 | // sexp variancemu = variance(mu); 148 | // arma::vec varimu = variancemu; 149 | arma::vec varimu = Rcpp::as(variance(mu)); 150 | 151 | // sexp muetaeta = mueta(eta); 152 | // arma::vec mee = muetaeta; 153 | arma::vec mee = Rcpp::as(mueta(eta)); 154 | 155 | // Rcpp::Rcout << "Flag3" << endl; 156 | 157 | for (int i=0; i(linkinv(eta)); 249 | 250 | // sexp variancemu = variance(mu); 251 | // arma::vec varimu = variancemu; 252 | arma::vec varimu = Rcpp::as(variance(mu)); 253 | 254 | arma::vec sd = pow(varimu,0.5); 255 | arma::vec res = (y-mu)/sd; 256 | 257 | double fi; 258 | 259 | if(scalefix) { 260 | fi = accu(pow(res,2))/(accu(nt)); 261 | }else{ 262 | fi = scalevalue; 263 | } 264 | 265 | arma::vec aindex=cumsum(nt); 266 | arma::vec index; 267 | index.zeros(N); 268 | index.subvec(1,N-1) = aindex.subvec(0,N-2); 269 | 270 | double alfa_hat; 271 | arma::mat alfa_mat; 272 | 273 | if (corstr=="independence") { 274 | 275 | alfa_hat = 0; 276 | 277 | }else if (corstr == "exchangeable"){ 278 | 279 | double sum1 = 0; 280 | double sum3 = 0; 281 | 282 | for (int i=0; i jj && abs(j-jj)==1){ 304 | double sum7 = res(j+index(i)) * res(jj+index(i)); 305 | sum5 = sum5 + sum7; 306 | } 307 | } 308 | } 309 | double sum8 = (nt(i)-1); 310 | sum6 = sum6 + sum8; 311 | } 312 | alfa_hat = sum5 / (sum6*fi); 313 | 314 | }else if (corstr=="unstructured"){ 315 | 316 | alfa_mat.zeros(nt(1),nt(1)); //not allowed for unequal number of cluster sizes. 
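// Annotation (explanatory, not part of the package source): gee_cor() works
// with Pearson-type residuals res = (y - mu) / sqrt(V(mu)) and estimates the
// working correlation by moments: the exchangeable estimate averages
// within-subject cross-products of residuals, the AR-1 estimate uses lag-1
// products, alfa_hat = sum_i sum_{|j-k|=1} res_ij * res_ik / (sum_i (nt_i - 1) * fi),
// and the unstructured estimate fills a full matrix (equal cluster sizes
// only, as noted above), with fi the scale (dispersion) parameter.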
317 | 318 | for (int j=0; j= 1) betai = beta_mat.col(i-1); 430 | arma::vec beta0; 431 | // double loss0; 432 | 433 | diff = 1; // Initialize diff > tol for each lambda to start the while loop 434 | 435 | // arma::vec eta = X * betai; 436 | // sexp linkinveta = linkinv(eta); 437 | // arma::vec mu = linkinveta; 438 | // arma::vec wt; 439 | // wt.ones(size(mu)); 440 | // sexp resi = devresids(y,mu,wt); 441 | // arma::vec resid = resi; 442 | // double loss = accu(resid); 443 | 444 | double alpha = 0; 445 | 446 | while (abs(diff) > tol and k < maxiter1){ 447 | 448 | k = k + 1; 449 | beta0 = betai; 450 | // loss0 = loss; 451 | int k1 = 0; 452 | double diff1 = 1; 453 | arma::vec beta00 = beta0; 454 | 455 | if (muu > 0){ 456 | 457 | while (abs(diff1) > tol and k1 < maxiter2){ 458 | 459 | k1 = k1 + 1; 460 | beta00 = betai; 461 | 462 | Rcpp::List cor_obj = gee_cor(N, 463 | nt, 464 | y, 465 | X, 466 | linkinv, 467 | variance, 468 | betai, 469 | corstr, 470 | maxclsz, 471 | scalefix, 472 | scalevalue 473 | ); 474 | 475 | arma::cube Rhat = cor_obj["Rhat"]; 476 | double fihat = cor_obj["fihat"]; 477 | 478 | Rcpp::List NR_obj=gee_NR(N, 479 | nt, 480 | y, 481 | X, 482 | nx, 483 | linkinv, 484 | mueta, 485 | variance, 486 | betai, 487 | Rhat, 488 | fihat, 489 | l, 490 | a, 491 | alpha, 492 | ncov, 493 | wcov, 494 | eps, 495 | muu); 496 | 497 | arma::vec S = NR_obj["S"]; 498 | arma::mat H = NR_obj["H"]; 499 | arma::mat E = NR_obj["E"]; 500 | arma::vec C = NR_obj["C"]; 501 | 502 | arma::mat Nmat = mat(size(E),fill::value(N)); 503 | // arma::mat muumat = mat(size(E),fill::value(muu)); 504 | arma::mat muumat = mat(size(E),fill::zeros); // this should be changed to assign muu only to the constrained part 505 | muumat.submat(ncov,ncov,nx-1,nx-1).fill(muu); 506 | arma::mat nugmat = mat(size(E)); 507 | nugmat.diag() += 1e-10; 508 | 509 | betai = betai + pinv(H+Nmat%E+muumat+nugmat) * (S-((Nmat%E)*betai)-C); 510 | 511 | // Rcpp::Rcout << "Flag440" << endl; 512 | 513 | // arma::uvec zidx = find(abs(betai) < 1e-6); 514 | // betai(zidx).zeros(); 515 | 516 | arma::vec diffvec1 = beta00-betai; 517 | diff1 = max(abs(diffvec1)); 518 | 519 | // arma::vec eta1 = X * betai; 520 | // sexp linkinveta1 = linkinv(eta1); 521 | // arma::vec mu1 = linkinveta1; 522 | // arma::vec wt1; 523 | // wt1.ones(size(mu1)); 524 | // sexp resi1 = devresids(y,mu1,wt1); 525 | // arma::vec resid1 = resi1; 526 | // loss = accu(resid1); 527 | 528 | // if (display_progress){ 529 | // 530 | // Rcpp::Rcout << "Lambda:" << l << ",Iter:" << k1 << ",Diff:" << diff << endl; 531 | // // Rcpp::Rcout << "Lambda:" << l << ",Iter:" << k1 << ",Diff loss:" << loss-loss0 << endl; 532 | // 533 | // } 534 | } 535 | 536 | // Rcpp::Rcout << "k1:" << k1 << "diffo" << diff1 << endl; 537 | // Rcpp::Rcout << "k1:" << k1 << "diff" << max(abs(beta0-betai)) << endl; 538 | 539 | alpha = alpha + accu(betai.subvec(ncov,betai.n_elem-1)); 540 | 541 | }else if (muu == 0){ 542 | 543 | Rcpp::List cor_obj = gee_cor(N, 544 | nt, 545 | y, 546 | X, 547 | linkinv, 548 | variance, 549 | betai, 550 | corstr, 551 | maxclsz, 552 | scalefix, 553 | scalevalue 554 | ); 555 | 556 | arma::cube Rhat = cor_obj["Rhat"]; 557 | double fihat = cor_obj["fihat"]; 558 | 559 | Rcpp::List NR_obj=gee_NR(N, 560 | nt, 561 | y, 562 | X, 563 | nx, 564 | linkinv, 565 | mueta, 566 | variance, 567 | betai, 568 | Rhat, 569 | fihat, 570 | l, 571 | a, 572 | alpha, 573 | ncov, 574 | wcov, 575 | eps, 576 | muu); 577 | 578 | arma::vec S = NR_obj["S"]; 579 | arma::mat H = NR_obj["H"]; 580 | arma::mat E = NR_obj["E"]; 581 
| arma::vec C = NR_obj["C"]; 582 | 583 | arma::mat Nmat = mat(size(E),fill::value(N)); 584 | arma::mat muumat = mat(size(E),fill::zeros); // this should be changed to assign muu only to the constrained part 585 | // muumat.submat(ncov,ncov,nx-1,nx-1).fill(muu); 586 | arma::mat nugmat = mat(size(E)); 587 | nugmat.diag() += 1e-6; 588 | 589 | betai = betai + pinv(H+Nmat%E+muumat+nugmat) * (S-((Nmat%E)*betai)-C); 590 | 591 | } 592 | 593 | arma::vec diffvec = beta0-betai; 594 | diff = max(abs(diffvec)); 595 | 596 | // Rcpp::Rcout << "k:" << k << "diff:" << diff << endl; 597 | 598 | } 599 | 600 | beta_mat.col(i) = betai; 601 | iters(i) = k; 602 | diffs(i) = diff; 603 | 604 | } 605 | 606 | Rcpp::List ret; 607 | 608 | ret["beta"] = beta_mat; 609 | ret["lambda"] = lambda; 610 | ret["iters"] = iters; 611 | ret["tol"] = diffs; 612 | 613 | return ret; 614 | 615 | } -------------------------------------------------------------------------------- /R/simu.R: -------------------------------------------------------------------------------- 1 | #' Simulate data following log-ratio model 2 | #' 3 | #' @description Simulate a dataset from log-ratio model. 4 | #' @param n An integer of sample size 5 | #' @param p An integer of number of features (taxa). 6 | #' @param model Type of models associated with outcome variable, can be "linear", "binomial", "cox", "finegray", "gee" (scalar outcome with time-dependent features), or "timedep" (survival endpoint with time-dependent features). 7 | #' @param weak Number of features with \code{weak} effect size. 8 | #' @param strong Number of features with \code{strong} effect size. 9 | #' @param weaksize Actual effect size for \code{weak} effect size. Must be positive. 10 | #' @param strongsize Actual effect size for \code{strong} effect size. Must be positive. 11 | #' @param pct.sparsity Percentage of zero counts for each sample. 12 | #' @param rho Parameter controlling the correlated structure between taxa. Ranges between 0 and 1. 13 | #' @param timedep_slope If \code{model} is "timedep", this parameter specifies the slope for the feature trajectories. Please refer to the Simulation section of the manuscript for more details. 14 | #' @param timedep_cor If \code{model} is "timedep", this parameter specifies the sample-wise correlations between longitudinal features. Please refer to the Simulation section of the manuscript for more details. 15 | #' @param geetype If \code{model} is "gee", \code{geetype} is the type of GEE outcomes. Now support "gaussian" and "binomial". 16 | #' @param m If \code{model} is "gee", \code{m} is the number of repeated measurements per subject. 17 | #' @param corstr If \code{model} is "gee", \code{corstr} is the working correlation structure. Now support "independence", "exchangeable", and "AR-1". 18 | #' @param sdvec If \code{model} is "gee" and \code{geetype} is "gaussian", \code{sdvec} is the vector of standard deviations of each outcome variable. 19 | #' @param rhogee If \code{model} is "gee", \code{rhogee} is the correlation parameter between longitudinal outcomes under the selected working correlation structure. 20 | #' @param geeslope If \code{model} is "gee", \code{geeslope} is the linear time effect. 21 | #' @param longitudinal_stability If \code{model} is "timedep", this is a binary indicator which determines whether the trajectories are more stable (\code{TRUE}) or more volatile (\code{FALSE}). 22 | #' @param ncov Number of covariates that are not compositional features. 
23 | #' @param betacov Coefficients corresponding to the covariates that are not compositional features. 24 | #' @param intercept Boolean. If TRUE, then a random intercept will be generated in the model. Only works for \code{linear} or \code{binomial} models. 25 | #' @return A list with simulated count matrix \code{xcount}, log1p-transformed count matrix \code{x}, outcome (continuous \code{y}, continuous centered \code{y0}, binary \code{y}, or survival \code{t}, \code{d}), true coefficient vector \code{beta}, list of non-zero features \code{idx}, value of intercept \code{intercept} (if applicable). 26 | #' @author Teng Fei. Email: feit1@mskcc.org 27 | #' @references Fei T, Funnell T, Waters N, Raj SS et al. Enhanced Feature Selection for Microbiome Data using FLORAL: Scalable Log-ratio Lasso Regression bioRxiv 2023.05.02.538599. 28 | #' 29 | #' @examples 30 | #' 31 | #' set.seed(23420) 32 | #' dat <- simu(n=50,p=30,model="linear") 33 | #' 34 | #' @import ggplot2 survival glmnet dplyr 35 | #' @importFrom survcomp concordance.index 36 | #' @importFrom reshape melt 37 | #' @importFrom utils combn 38 | #' @importFrom grDevices rainbow 39 | #' @importFrom caret createFolds 40 | #' @importFrom stats dist rbinom rexp rmultinom rnorm runif sd step glm binomial gaussian na.omit 41 | #' @importFrom msm dpexp ppexp rpexp 42 | #' @importFrom mvtnorm rmvnorm 43 | #' @useDynLib FLORAL 44 | #' @export 45 | 46 | simu <- function(n = 100, 47 | p = 200, 48 | model = "linear", 49 | weak = 4, 50 | strong = 6, 51 | weaksize = 0.125, 52 | strongsize = 0.25, 53 | pct.sparsity = 0.5, 54 | rho=0, 55 | timedep_slope=NULL, 56 | timedep_cor=NULL, 57 | geetype="gaussian", 58 | m=4, 59 | corstr="exchangeable", 60 | sdvec=NULL, 61 | rhogee=0.8, 62 | geeslope=2.5, 63 | longitudinal_stability=TRUE, 64 | ncov=0, 65 | betacov=0, 66 | intercept=FALSE){ 67 | 68 | true_set <- 1:(weak+strong) 69 | weak_idx <- 1:weak 70 | strong_idx <- (weak+1):(weak+strong) 71 | 72 | beta <- rep(NA,weak+strong) 73 | beta[weak_idx] <- rep(c(weaksize,-weaksize),weak/2) 74 | beta[strong_idx] <- rep(c(strongsize,-strongsize),strong/2) 75 | 76 | if (model %in% c("linear","binomial")){ 77 | y <- rep(NA,length=n) 78 | }else if (model == "cox"){ 79 | t <- rep(NA,length=n) 80 | d <- rep(NA,length=n) 81 | }else if (model == "finegray"){ 82 | t <- t0 <- rep(NA,length=n) 83 | d <- rep(NA,length=n) 84 | }else if (model == "timedep"){ 85 | m <- 10 86 | id.vect <- rep(1:n, each = m) 87 | n0 <- n 88 | n <- length(id.vect) 89 | 90 | if (is.null(timedep_cor)){ 91 | timedep_cor <- 0.4 92 | } 93 | if (is.null(timedep_slope)){ 94 | timedep_slope <- 0.5 95 | }else{ 96 | timedep_slope <- timedep_slope 97 | } 98 | }else if (model == "gee"){ 99 | 100 | id.vect <- rep(1:n, each = m) 101 | n0 <- n 102 | n <- length(id.vect) 103 | 104 | if (is.null(sdvec) & geetype=="gaussian"){ 105 | sdvec <- rep(1,m) 106 | } 107 | 108 | if (is.null(timedep_cor)){ 109 | timedep_cor <- 0.4 110 | } 111 | if (is.null(timedep_slope)){ 112 | timedep_slope <- 0.5 113 | }else{ 114 | timedep_slope <- timedep_slope 115 | } 116 | 117 | } 118 | 119 | x <- xobs <- matrix(NA,nrow=n,ncol=p) 120 | 121 | seqdep <- floor(runif(n,min=5000,max=50000)) 122 | # highidx <- true_set 123 | 124 | ############################################### 125 | 126 | # if (method == "manual"){ 127 | 128 | if (!(model %in% c("timedep","gee"))){ 129 | 130 | sigma <- rho^(as.matrix(dist(1:p))) 131 | diag(sigma) <- c(rep(log(p)/2,3),1,rep(log(p)/2,2),1,log(p)/2,rep(1,p-8)) 132 | mu <- 
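# Annotation (explanatory, not part of the package source): for the
# cross-sectional models the latent log-abundances are multivariate normal
# with correlation decaying as rho^|j - j'|; a few of the leading taxa get
# mean log(p) and variance log(p)/2 so they are systematically more abundant,
# the rest are centered at 0 with unit variance, and per-sample sparsity is
# induced by setting a random pct.sparsity fraction of entries to -Inf
# (zero relative abundance after the softmax below).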
c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,p-8)) 133 | 134 | x <- mvtnorm::rmvnorm(n=n,mean=mu,sigma=sigma) 135 | 136 | if (pct.sparsity > 0){ 137 | for (i in 1:n){ 138 | zeroidx <- sample(1:p,size=floor(p*pct.sparsity)) 139 | x[i,zeroidx] <- -Inf 140 | } 141 | } 142 | 143 | }else if(model %in% c("timedep","gee")){ 144 | 145 | sigma <- rho^(as.matrix(dist(1:(p-(weak+strong))))) 146 | diag(sigma) <- 1 147 | mu <- rep(0,p-(weak+strong)) 148 | 149 | x0 <- mvtnorm::rmvnorm(n=n,mean=mu,sigma=sigma) 150 | x[,setdiff(1:p,true_set)] <- x0 151 | 152 | for (i in 1:n0){ 153 | 154 | # Mu <- rep(c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)),m) 155 | # mu <- c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)) 156 | # slopes <- outer(0:9,timedep_slope) 157 | # Mu <- t(mu + t(slopes)) 158 | 159 | slopes <- rep(0:1,(weak+strong)/2)*timedep_slope 160 | Mu <- t(c(rep(log(p),3),0,rep(log(p),2),0,log(p),rep(0,2)) + t(outer(0:(m-1),slopes))) 161 | 162 | sigma1 <- rho^(as.matrix(dist(1:(weak+strong)))) 163 | Sigma <- (diag(m) %x% sigma1) + ((matrix(1,nrow=m,ncol=m) - diag(m)) %x% (diag(strong+weak)*timedep_cor)) 164 | # Sigma <- Matrix::bdiag(replicate(m,sigma,simplify=FALSE)) 165 | # sigma_offdiag <- diag(strong+weak)*0.8 166 | # mat_template <- matrix(1,nrow=m,ncol=m) - diag(m) 167 | 168 | x1 <- matrix(rmvnorm(n=1,mean=as.vector(Mu),sigma=Sigma),nrow=m,ncol=weak+strong,byrow=FALSE) 169 | x[id.vect==i,true_set] <- x1 170 | 171 | if (pct.sparsity > 0){ 172 | # for (i in 1:n0){ 173 | 174 | if (longitudinal_stability){ 175 | zeroidx <- sample(true_set,size=floor(length(true_set)*pct.sparsity)) 176 | x[id.vect==i,zeroidx] <- -Inf 177 | }else{ 178 | ids <- which(id.vect==i) 179 | for (i in 1:m){ 180 | zeroidx <- sample(true_set,size=floor(length(true_set)*pct.sparsity)) 181 | x[ids[i],zeroidx] <- -Inf 182 | } 183 | } 184 | 185 | # zeroidx <- sample(1:(length(true_set)/2),size=floor(length(true_set)/2*pct.sparsity)) 186 | # x[id.vect==i,c(zeroidx*2,zeroidx*2-1)] <- -Inf 187 | 188 | # } 189 | } 190 | 191 | } 192 | 193 | if (pct.sparsity > 0){ 194 | for (j in 1:n){ 195 | zeroidx <- sample(setdiff(1:p,true_set),size=floor(p*pct.sparsity)) 196 | x[j,zeroidx] <- -Inf 197 | } 198 | } 199 | 200 | } 201 | 202 | x = apply(x,2,function(y) exp(y)/rowSums(exp(x))) 203 | 204 | for (k in 1:n){ 205 | xobs[k,] <- rmultinom(1,size=seqdep[k],prob=x[k,]) 206 | } 207 | xcount = xobs 208 | colnames(xcount) <- paste0("taxa",1:p) 209 | 210 | for (k in 1:n){ 211 | x[k,] <- rmultinom(1,size=1000000,prob=x[k,]) 212 | } 213 | x = log(x+1) 214 | 215 | # }else if (method == "SparseDOSSA2"){ 216 | 217 | # sim <- SparseDOSSA2::SparseDOSSA2(template = "Stool", 218 | # n_sample=n, 219 | # median_read_depth = 25000, 220 | # new_features=FALSE, 221 | # verbose = FALSE) 222 | # xcount <- t(sim$simulated_data) 223 | # taxa <- colnames(xcount) 224 | # 225 | # true_set <- which(taxa %in% c("k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Ruminococcus_torques", 226 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Clostridium|s__Clostridium_leptum", 227 | # "k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Veillonella|s__Veillonella_unclassified", 228 | # "k__Bacteria|p__Verrucomicrobia|c__Verrucomicrobiae|o__Verrucomicrobiales|f__Verrucomicrobiaceae|g__Akkermansia|s__Akkermansia_muciniphila", 229 | # "k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_uniformis", 230 | # 
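# Annotation (explanatory, not part of the package source): the latent values
# are turned into relative abundances with a softmax, observed counts xcount
# are drawn by rmultinom() at a library size sampled uniformly between 5,000
# and 50,000 reads, and outcomes are generated from a deeper 1e6-read
# re-sample on the log(count + 1) scale, while the returned covariate matrix
# x is log(xcount + 1).  The commented block here sketches an alternative
# SparseDOSSA2-based generator and is not used.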
"k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Porphyromonadaceae|g__Parabacteroides|s__Parabacteroides_merdae", 231 | # "k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Veillonella|s__Veillonella_parvula", 232 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_bromii", 233 | # "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Roseburia|s__Roseburia_inulinivorans", 234 | # "k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_aerofaciens" 235 | # )) 236 | # 237 | # colnames(xcount) <- paste0("taxa",1:ncol(xcount)) 238 | # rownames(xcount) <- NULL 239 | # x = t(sim$simulated_matrices$rel) 240 | # for (k in 1:nrow(xcount)){ 241 | # x[k,] <- rmultinom(1,size=1000000,prob=x[k,]) 242 | # } 243 | # x = log(x+1) 244 | 245 | # } 246 | 247 | betavec <- rep(0,ncol(xcount)) 248 | betavec[true_set] <- beta 249 | xobs <- log(xcount+1) 250 | colnames(xobs) <- NULL 251 | 252 | if (ncov > 0){ 253 | xcov <- mvtnorm::rmvnorm(n=n,mean=rep(0,ncov)) 254 | colnames(xcov) <- paste0("cov",1:ncov) 255 | } 256 | 257 | if (model == "linear"){ 258 | 259 | y <- x[,true_set] %*% beta + rnorm(n,mean=0,sd=1) 260 | 261 | if(ncov > 0) y <- y + xcov %*% betacov 262 | 263 | if(intercept) { 264 | 265 | intcpt <- rnorm(1,mean=1,sd=1) 266 | y <- y + intcpt 267 | 268 | } 269 | y0 = y - mean(y) 270 | 271 | ret <- list(xcount=xcount,x=xobs,y=y,y0=y0,beta=betavec,idx=true_set) 272 | 273 | if (intercept) ret$intercept=intcpt 274 | 275 | if (ncov > 0) ret$xcov=xcov 276 | 277 | }else if(model == "binomial"){ 278 | 279 | eta <- x[,true_set] %*% beta 280 | 281 | if (ncov > 0) eta <- eta + xcov %*% betacov 282 | 283 | if(intercept) { 284 | 285 | intcpt <- rnorm(1,mean=1,sd=1) 286 | eta <- eta + intcpt 287 | 288 | } 289 | 290 | prob <- exp(eta)/(1+exp(eta)) 291 | 292 | for (i in 1:n){ 293 | y[i] <- rbinom(1,1,prob=prob[i]) 294 | } 295 | 296 | ret <- list(xcount=xcount,x=xobs,y=y,beta=betavec,idx=true_set) 297 | 298 | if (intercept) ret$intercept=intcpt 299 | 300 | if (ncov > 0) ret$xcov=xcov 301 | 302 | }else if(model == "cox"){ 303 | 304 | eta <- x[,true_set] %*% beta 305 | 306 | if (ncov > 0) eta <- eta + xcov %*% betacov 307 | 308 | lambda <- exp(eta) 309 | 310 | for(i in 1:n){ 311 | U <- runif(1,min=0,max=1) 312 | t0 <- log(1-(log(1-U))/(0.1*lambda[i])) 313 | c0 <- min(rexp(1,rate=0.1),runif(1,min=5,max=6)) 314 | t[i] <- min(t0,c0) 315 | d[i] <- as.numeric(I(t0 <= c0)) 316 | } 317 | 318 | ret <- list(xcount=xcount,x=xobs,t=t,d=d,beta=betavec,idx=true_set) 319 | 320 | if (ncov > 0) ret$xcov=xcov 321 | 322 | }else if(model == "finegray"){ 323 | 324 | eta <- x[,true_set] %*% beta 325 | 326 | if (ncov > 0) eta <- eta + xcov %*% betacov 327 | 328 | p.cif = 0.66 329 | lambda <- exp(eta) 330 | cl=0.19 331 | cu=10 332 | 333 | P1 <- 1-(1-p.cif)^(lambda) 334 | epsilon <- rep(0,n) 335 | for (i in 1:n){ 336 | epsilon[i] <- 2 - rbinom(1,1,P1[i]) 337 | } 338 | 339 | #generate the event time based on the type of outcome 340 | t0 <- rep(0,n) 341 | u <- runif(n) 342 | for (i in 1:n){ 343 | if (epsilon[i] == 1){ 344 | t0[i] <- -log(1 - (1 - (1-u[i]*(1-(1-p.cif)^lambda[i]))^(1/(lambda[i]+0.001)))/p.cif) 345 | } 346 | if (epsilon[i] == 2){ 347 | t0[i] <- -log((1-u[i])^(1/(lambda[i]+0.001))) 348 | } 349 | } 350 | 351 | #generate censoring time 352 | c <- runif(n,cl,cu) 353 | #observed time 354 | t <- ifelse(t0 == Inf, c ,t0*I(t0<=c) + c*I(t0>c)) 
355 | # outcome 356 | d <- ifelse(t0 == Inf, 0, 0*I(t == c) + epsilon*I(t < c)) 357 | 358 | ret <- list(xcount=xcount,x=xobs,t=t,d=d,beta=betavec,idx=true_set) 359 | 360 | if (ncov > 0) ret$xcov=xcov 361 | 362 | }else if (model=="timedep"){ 363 | 364 | g.inv <- sqrt 365 | g <- function(x) { 366 | x^2 367 | } 368 | 369 | t.max <- 9 370 | t.min <- 0 371 | 372 | z.list <- list() 373 | for (i in 1:n0) { 374 | z <- x[id.vect == i,true_set] 375 | z.list[[i]] <- cbind(z, exp(z %*% beta)) 376 | } 377 | 378 | k <- function(x, m, M, rates, t){ 379 | ifelse(x <= m | x >= M, 0, dpexp(x, rates, t)) 380 | } 381 | 382 | gen.y <- function(z,time_base,t.max,t.min){ 383 | 384 | exp.etay <- z[,ncol(z)] 385 | 386 | t <- time_base 387 | t.diff <- (t[-1] - t[1:(length(t) - 1)])[-(length(t) - 1)] 388 | g.inv.t <- g.inv(t) 389 | g.inv.t.diff <- (g.inv(t[-1]) - g.inv(t[1:(length(t) - 1)]))[-(length(t) - 1)] 390 | 391 | g.inv.t.max <- g.inv(t.max) 392 | g.inv.t.min <- g.inv(t.min) 393 | 394 | x1 <- exp.etay 395 | d <- ppexp(g.inv.t.max, x1, g.inv.t) - ppexp(g.inv.t.min, x1,g.inv.t) 396 | M <- 1 / d 397 | r <- 60 398 | repeat{ 399 | y <- rpexp(r, x1, g.inv.t) 400 | u <- runif(r) 401 | t <- M * ((k(y, g.inv.t.min, g.inv.t.max, x1, g.inv.t) / d / 402 | dpexp(y, x1, g.inv.t))) 403 | y <- y[u <= t][1] 404 | if (!is.na(y)) break 405 | } 406 | y 407 | } 408 | 409 | y <- sapply(z.list, gen.y, time_base = 0:9, t.max=t.max, t.min=t.min) 410 | g.y <- g(y) 411 | 412 | # print(summary(g.y)) 413 | 414 | # ct <- runif(n, min=5,max=6) 415 | # d <- ifelse(g.y < ct, 1, 0) 416 | # g.y <- ifelse(g.y < ct, g.y, ct) 417 | 418 | prop.cen <- 0.5 419 | d <- sample(0:1, n, replace = TRUE, prob = c(prop.cen, 1 - prop.cen)) 420 | 421 | data <- NULL 422 | for (i in 1:n0) { 423 | id.temp <- rep(i, ceiling(g.y[i])) 424 | time.temp <- c(1:ceiling(g.y[i])) 425 | time0.temp <- 0:(ceiling(g.y[i]) - 1) 426 | d.temp <- c(rep(0, length(time.temp) - 1), d[i]) 427 | 428 | if (ceiling(g.y[i]) > 9){ 429 | z.temp <- xcount[id.vect==i,] 430 | 431 | if (length(id.temp) > nrow(z.temp)){ 432 | 433 | z.temp <- rbind(z.temp, 434 | # do.call("rbind", replicate(length(id.temp) - nrow(z.temp), 435 | # z.temp[nrow(z.temp),], 436 | # simplify = FALSE)) 437 | z.temp[nrow(z.temp),] 438 | ) 439 | 440 | id.temp <- id.temp[c(1:10,length(id.temp))] 441 | time.temp <- time.temp[c(1:10,length(id.temp))] 442 | time0.temp <- time0.temp[c(1:11)] 443 | d.temp <- d.temp[c(1:10,length(id.temp))] 444 | 445 | } 446 | 447 | }else{ 448 | z.temp <- xcount[id.vect==i,][1:(ceiling(g.y[i])),] 449 | } 450 | 451 | if (is.null(nrow(z.temp))){ 452 | 453 | data.temp <- c(id.temp, time.temp, time0.temp, d.temp, z.temp) 454 | 455 | }else{ 456 | 457 | data.temp <- cbind(id.temp, time.temp, time0.temp, d.temp, z.temp) 458 | } 459 | 460 | data <- rbind(data, data.temp) 461 | } 462 | 463 | colnames(data) <- c('id', 't', 't0', 'd', paste0("z",1:p)) 464 | data <- data.frame(data) 465 | data_unique <- data %>% 466 | group_by(id) %>% 467 | filter(row_number() == n()) %>% 468 | ungroup() 469 | 470 | xcount <- data[,-c(1:4)] 471 | colnames(xcount) <- rownames(xcount) <- NULL 472 | colnames(xcount) <- paste0("taxa",1:p) 473 | 474 | xcount_baseline <- data_unique[,-c(1:4)] 475 | colnames(xcount_baseline) <- rownames(xcount_baseline) <- NULL 476 | colnames(xcount_baseline) <- paste0("taxa",1:p) 477 | 478 | ret <- list(xcount=xcount, 479 | xcount_baseline=xcount_baseline, 480 | data=data, 481 | data_unique=data_unique, 482 | beta=betavec, 483 | idx=true_set) 484 | 485 | }else if (model == "gee"){ 486 | 487 | 
tvec <- rep(0:(m-1),n0) 488 | 489 | if (geetype == "gaussian"){ 490 | 491 | if (corstr == "independence"){ 492 | 493 | SIGMA <- diag(sdvec) %*% diag(m) %*% diag(sdvec) 494 | 495 | }else if (corstr == "exchangeable"){ 496 | 497 | R <- matrix(rhogee,m,m)+diag(rep(1-rhogee,m)) # Working correlation matrix 498 | SIGMA <- diag(sdvec) %*% R %*% diag(sdvec) 499 | 500 | }else if (corstr == "AR-1"){ 501 | 502 | R <- matrix(NA,nrow=m,ncol=m) 503 | for (t1 in 1:m) { 504 | for (t2 in 1:m) { 505 | R[t1,t2]<-rhogee^abs(t1-t2) 506 | } 507 | } 508 | SIGMA <- diag(sdvec) %*% R %*% diag(sdvec) 509 | 510 | } 511 | 512 | # covariance matrix of error 513 | error <- rmvnorm(n0, mean = rep(0,m),SIGMA) 514 | 515 | # form continuous longitudinal outcomes 516 | y <- tvec*geeslope + x[,true_set] %*% beta + as.vector(t(error)) 517 | 518 | if(intercept) { 519 | intcpt <- rnorm(1,mean=1,sd=1) 520 | y <- y + intcpt 521 | } 522 | 523 | }else if (geetype == "binomial"){ 524 | 525 | eta <- tvec*geeslope + x[,true_set] %*% beta 526 | if(intercept) { 527 | intcpt <- rnorm(1,mean=1,sd=1) 528 | eta <- eta + intcpt 529 | } 530 | prob <- exp(eta)/(1+exp(eta)) 531 | y <- rep(NA,n) 532 | 533 | if (corstr == "independence"){ 534 | 535 | for (i in 1:n){ 536 | y[i] <- rbinom(1,1,prob=prob[i]) 537 | } 538 | 539 | }else if (corstr == "exchangeable"){ 540 | 541 | for (i in 1:n0){ 542 | 543 | mu <- prob[id.vect==i] 544 | y[id.vect==i] <- binsimuexch(mu,rhogee) 545 | 546 | } 547 | 548 | }else if (corstr == "AR-1"){ 549 | 550 | for (i in 1:n0){ 551 | 552 | mu <- prob[id.vect==i] 553 | y[id.vect==i] <- binsimuar1(mu,rhogee) 554 | 555 | } 556 | 557 | } 558 | 559 | } 560 | 561 | ret <- list(xcount=xcount, 562 | x=xobs, 563 | y=y, 564 | id=id.vect, 565 | tvec=tvec, 566 | beta=betavec, 567 | idx=true_set) 568 | 569 | if (intercept) ret$intercept=intcpt 570 | 571 | } 572 | 573 | return(ret) 574 | 575 | } 576 | -------------------------------------------------------------------------------- /R/LogRatioCoxLasso.R: -------------------------------------------------------------------------------- 1 | LogRatioCoxLasso <- function(x, 2 | y, 3 | ncov, 4 | length.lambda=100, 5 | lambda.min.ratio=NULL, 6 | wcov, 7 | a=1, 8 | mu=1, 9 | maxiter=100, 10 | ncv=5, 11 | foldid=NULL, 12 | step2=FALSE, 13 | progress=TRUE, 14 | plot=TRUE, 15 | mcv="Deviance", 16 | loop1=FALSE, 17 | loop2=FALSE, 18 | ncore=1){ 19 | 20 | ptm <- proc.time() 21 | 22 | t <- y[,1] 23 | d <- y[,2] 24 | n <- nrow(y) 25 | p <- ncol(x) 26 | if (length(unique(t)) < n) t <- t+runif(n,min=0,max=1e-4) # break ties 27 | 28 | tj <- sort(t[d==1]) 29 | devnull <- log(sum(t >= tj[1])) 30 | denomj <- rep(NA,length(tj)) 31 | denomj[1] <- sum(t >= tj[1]) 32 | for (j in 2:length(tj)){ 33 | denomj[j] <- denomj[j-1] - sum(t >= tj[j-1] & t < tj[j]) 34 | devnull <- devnull + log(sum(t >= tj[j])) 35 | } 36 | devnull <- 2*devnull 37 | 38 | expect <- sapply(t,function(x) sum(1/denomj[which(tj <= x)])) 39 | sfun = d - expect 40 | # hfun <- expect - sapply(t,function(x) sum(1/denomj[which(tj <= x)]^2)) + 1e-8 # hessian 41 | # z <- sfun/hfun 42 | if (a > 0){ 43 | lambda0 <- max(abs(t(sfun) %*% x))/(a*n) 44 | }else if (a == 0){ 45 | lambda0 <- max(abs(t(sfun) %*% x))/(1e-3*n) 46 | } 47 | 48 | adjust = FALSE 49 | if (ncov > 0) adjust = TRUE 50 | 51 | if (is.null(lambda.min.ratio)) lambda.min.ratio = ifelse(n < p, 1e-02, 1e-02) 52 | lambda <- 10^(seq(log10(lambda0),log10(lambda0*lambda.min.ratio),length.out=length.lambda)) 53 | 54 | if (progress) cat("Algorithm running for full dataset: \n") 55 | 56 | fullfit <- 
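# Annotation (explanatory, not part of the package source): tj holds the
# ordered event times and denomj the matching risk-set sizes.  For subject i,
# expect_i = sum over events with tj <= t_i of 1/denomj is the expected event
# count under the null model, so sfun = d - expect is the null-model score (a
# martingale-type residual), lambda0 = max|x'sfun| / (a * n) starts the path,
# and devnull = 2 * sum_j log(risk-set size at tj) is the null partial
# likelihood deviance passed to cox_enet_al().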
cox_enet_al(x,t,d,tj,length.lambda,mu,maxiter,lambda,wcov,a,adjust,ncov,devnull,progress,loop1,loop2,notcv=TRUE) 57 | lidx <- which(fullfit$loglik != 0 | !is.nan(fullfit$loglik)) 58 | 59 | dev <- -2*fullfit$loglik[lidx] 60 | 61 | if (!is.null(colnames(x))){ 62 | rownames(fullfit$beta) = colnames(x) 63 | }else{ 64 | colnames(x) = 1:ncol(x) 65 | rownames(fullfit$beta) = 1:ncol(x) 66 | } 67 | 68 | if (!is.null(ncv)){ 69 | 70 | cvdev <- matrix(0,nrow=length(lidx),ncol=ncv) 71 | # cvdevnull <- rep(0,ncol=ncv) 72 | 73 | if (is.null(foldid)){ 74 | labels <- coxsplity(as.matrix(y),ncv) 75 | }else{ 76 | labels <- foldid 77 | } 78 | 79 | # labels <- caret::createFolds(factor(d),k=ncv) 80 | 81 | if (ncore == 1){ 82 | 83 | for (cv in 1:ncv){ 84 | 85 | if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n")) 86 | 87 | # train.x <- x[-labels[[cv]],] 88 | # train.d <- d[-labels[[cv]]] 89 | # train.t <- t[-labels[[cv]]] 90 | # test.x <- x[labels[[cv]],] 91 | # test.d <- d[labels[[cv]]] 92 | # test.t <- t[labels[[cv]]] 93 | 94 | train.x <- x[labels!=cv,] 95 | train.d <- d[labels!=cv] 96 | train.t <- t[labels!=cv] 97 | test.x <- x[labels==cv,] 98 | test.d <- d[labels==cv] 99 | test.t <- t[labels==cv] 100 | 101 | cv.devnull <- 0 102 | train.tj <- sort(train.t[train.d==1]) 103 | for (j in 1:length(train.tj)){ 104 | cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j])) 105 | } 106 | cv.devnull <- 2*cv.devnull 107 | 108 | cvfit <- cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,maxiter,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,progress,loop1,loop2,notcv=FALSE) 109 | 110 | cv.devnull <- 0 111 | loglik <- rep(0,length(lidx)) 112 | linprod <- test.x %*% cvfit$beta 113 | 114 | if (mcv == "Deviance"){ 115 | 116 | test.tj <- sort(test.t[test.d==1]) 117 | for (j in 1:length(test.tj)){ 118 | cv.devnull <- cv.devnull + log(sum(test.t >= test.tj[j])) 119 | if (sum(test.t >= test.tj[j]) > 1){ 120 | cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] - 121 | log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8) 122 | }else if (sum(test.t >= test.tj[j]) == 1){ 123 | cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] - 124 | log(exp(linprod[test.t >= test.tj[j],]) + 1e-8) 125 | } 126 | #- log(accu(link(widx))+1e-8) 127 | } 128 | 129 | # cvdevnull[cv] <- 2*cv.devnull 130 | cvdev[,cv] <- -2*cvdev[,cv] 131 | 132 | }else if (mcv == "Cindex"){ 133 | 134 | for (kk in 1:length(lidx)){ 135 | cvdev[kk,cv] <- 1-concordance.index(x=linprod[,kk],surv.time=test.t,surv.event=test.d)$c.index 136 | } 137 | 138 | } 139 | # cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y - exp(x)/(1+exp(x)))^2)/length(test.y)) 140 | 141 | } 142 | 143 | }else if (ncore > 1){ 144 | 145 | if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation.")) 146 | 147 | cl <- makeCluster(ncore) 148 | registerDoParallel(cl) 149 | 150 | cvdev <- foreach(cv=1:ncv,.combine=cbind) %dopar% { 151 | 152 | train.x <- x[labels!=cv,] 153 | train.d <- d[labels!=cv] 154 | train.t <- t[labels!=cv] 155 | test.x <- x[labels==cv,] 156 | test.d <- d[labels==cv] 157 | test.t <- t[labels==cv] 158 | 159 | cv.devnull <- 0 160 | train.tj <- sort(train.t[train.d==1]) 161 | for (j in 1:length(train.tj)){ 162 | cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j])) 163 | } 164 | cv.devnull <- 2*cv.devnull 165 | 166 | cvfit <- 
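# Annotation (explanatory, not part of the package source): each fold refits
# the lambda path on the training rows and is scored on the held-out rows.
# For mcv = "Deviance" the cross-validated partial-likelihood deviance sums,
# over the test-set event times, the event's linear predictor minus log(sum of
# exp(linear predictors) over the test risk set), multiplied by -2; for
# mcv = "Cindex" the criterion is 1 - C-index from survcomp::concordance.index(),
# so smaller is better for both measures.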
cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,maxiter,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,FALSE,loop1,loop2,notcv=FALSE) 167 | 168 | linprod <- test.x %*% cvfit$beta 169 | cv.dev <- rep(0,length(lidx)) 170 | 171 | test.tj <- sort(test.t[test.d==1]) 172 | for (j in 1:length(test.tj)){ 173 | if (sum(test.t >= test.tj[j]) > 1){ 174 | cv.dev <- cv.dev + linprod[test.t == test.tj[j],] - 175 | log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8) 176 | }else if (sum(test.t >= test.tj[j]) == 1){ 177 | cv.dev <- cv.dev + linprod[test.t == test.tj[j],] - 178 | log(exp(linprod[test.t >= test.tj[j],]) + 1e-8) 179 | } 180 | } 181 | 182 | cv.dev <- -2*cv.dev 183 | cv.dev 184 | } 185 | 186 | stopCluster(cl) 187 | 188 | } 189 | 190 | mean.cvdev <- rowMeans(cvdev) 191 | se.cvdev <- apply(cvdev,1,function(x) sd(x)/sqrt(ncv)) 192 | 193 | idx.min <- which.min(mean.cvdev) 194 | se.min <- se.cvdev[idx.min] 195 | idx.1se <- suppressWarnings(max(which(mean.cvdev > mean.cvdev[idx.min] + se.min & 1:length(lidx) < idx.min))) 196 | if (idx.1se == -Inf) idx.1se = 1 197 | 198 | best.beta <- list(min.mse = fullfit$beta[,idx.min], 199 | add.1se = fullfit$beta[,idx.1se]) 200 | 201 | best.idx <- list(idx.min = idx.min, 202 | idx.1se = idx.1se) 203 | 204 | ret <- list(beta=fullfit$beta[,lidx], 205 | lambda=fullfit$lambda[lidx], 206 | a=a, 207 | loss=fullfit$loss[lidx], 208 | mse=fullfit$mse[lidx], 209 | tol=fullfit$tol[lidx], 210 | iters=fullfit$iters[lidx], 211 | cvdev.mean=mean.cvdev, 212 | cvdev.se=se.cvdev, 213 | best.beta=best.beta, 214 | best.idx=best.idx, 215 | foldid=labels 216 | ) 217 | 218 | if (plot){ 219 | 220 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 221 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 222 | beta_nzero$loglambda <- log(beta_nzero$lambda) 223 | 224 | lambda_count <- data.frame(loglambda = log(ret$lambda), 225 | count = colSums(ret$beta != 0)) 226 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 227 | 228 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 229 | top10name <- names(top10feat) 230 | 231 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 232 | geom_line() + 233 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 234 | theme_bw() + 235 | theme(legend.position = "none") + 236 | xlab(expression(paste("log(",lambda,")"))) + 237 | ylab("Coefficient") + 238 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 239 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 240 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 241 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 242 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 243 | 244 | devplot <- data.frame(loglambda=log(ret$lambda), 245 | dev=ret$cvdev.mean, 246 | se=ret$cvdev.se, 247 | devaddse=ret$cvdev.mean+ret$cvdev.se, 248 | devminse=ret$cvdev.mean-ret$cvdev.se) 249 | 250 | pdev <- ggplot(devplot, aes(x=.data$loglambda, y=.data$dev)) + 251 | geom_errorbar(aes(ymin=.data$devminse,ymax=.data$devaddse),color="grey")+ 252 | geom_point(color="red")+ 253 | theme_bw() + 254 | xlab(expression(paste("log(",lambda,")"))) + 255 | ylab("Partial Likelihood Deviance")+ 256 | 
geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.min]),linetype="dashed",color="darkgrey")+ 257 | geom_vline(xintercept=log(ret$lambda[ret$best.idx$idx.1se]),linetype="dotted",color="darkgrey")+ 258 | annotate("text",x=lambda_count$loglambda,y=max(devplot$devaddse)+0.05,label=as.character(lambda_count$count))+ 259 | ggtitle(expression(paste("Cross-validated deviance versus log(",lambda,")"))) 260 | 261 | ret$pcoef <- pcoef 262 | ret$pdev <- pdev 263 | 264 | } 265 | 266 | if (step2){ # need to develop a equivalent lasso procedure for this. Stepwise selection is too slow for a big number of selected variables. 267 | 268 | if (!adjust){ 269 | 270 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 271 | 272 | idxs <- combn(which(ret$best.beta$min.mse!=0),2) 273 | 274 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 275 | for (k in 1:ncol(idxs)){ 276 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 277 | } 278 | 279 | if (ncol(x.select.min) > 1){ 280 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 281 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 282 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 283 | }else{ 284 | idxs <- as.vector(idxs) 285 | } 286 | 287 | df_step2 <- data.frame(t=t,d=d,x=x.select.min) 288 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 289 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 290 | 291 | if (is.null(ncol(idxs))){ 292 | selected <- idxs 293 | }else{ 294 | selected <- idxs[,vars] 295 | } 296 | 297 | ret$step2.feature.min = selected 298 | ret$step2fit.min <- step2fit 299 | } 300 | 301 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 302 | 303 | idxs <- combn(which(ret$best.beta$add.1se!=0),2) 304 | 305 | x.select.min <- matrix(NA,nrow=n,ncol=ncol(idxs)) 306 | for (k in 1:ncol(idxs)){ 307 | x.select.min[,k] <- x[,idxs[1,k]] - x[,idxs[2,k]] 308 | } 309 | 310 | if (ncol(x.select.min) > 1){ 311 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 312 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 313 | idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 314 | }else{ 315 | idxs <- as.vector(idxs) 316 | } 317 | 318 | df_step2 <- data.frame(t=t,d=d,x=x.select.min) 319 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 320 | vars <- as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2])) 321 | 322 | if (is.null(ncol(idxs))){ 323 | selected <- idxs 324 | }else{ 325 | selected <- idxs[,vars] 326 | } 327 | 328 | # for (k1 in 1:nrow(selected)){ 329 | # for (k2 in 1:ncol(selected)){ 330 | # selected[k1,k2] <- colnames(x)[as.numeric(selected[k1,k2])] 331 | # } 332 | # } 333 | 334 | ret$step2.feature.1se = selected 335 | ret$step2fit.1se <- step2fit 336 | } 337 | 338 | }else{ 339 | 340 | if (length(which(ret$best.beta$min.mse!=0)) > 1){ 341 | 342 | allidx <- which(ret$best.beta$min.mse!=0) 343 | 344 | covidx <- allidx[allidx <= ncov] 345 | taxidx <- allidx[allidx > ncov] 346 | 347 | idxs <- NULL 348 | x.select.min <- NULL 349 | 350 | if (length(taxidx) > 1){ 351 | idxs <- combn(taxidx,2) 352 | for (k in 1:ncol(idxs)){ 353 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 354 | } 355 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 
356 | } 357 | 358 | if (length(covidx) > 0){ 359 | x.select.min <- cbind(x.select.min, x[,covidx]) 360 | if (!is.null(idxs)){ 361 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 362 | }else{ 363 | colnames(x.select.min) <- colnames(x)[covidx] 364 | } 365 | } 366 | 367 | 368 | # if(is.null(x.select.min)) break 369 | 370 | if (ncol(x.select.min) > 1){ 371 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 372 | 373 | if (length(taxidx) < 2){ 374 | 375 | idxs <- NULL 376 | 377 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 378 | 379 | }else{ 380 | 381 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 382 | 383 | if (length(covidx) > 0){ 384 | 385 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 386 | # 1:ncol(idxs))] 387 | 388 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 389 | (ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 390 | 391 | } 392 | 393 | } 394 | 395 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 396 | } 397 | 398 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 399 | df_step2 <- data.frame(t=t,d=d,x.select.min) 400 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 401 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 402 | vars <- vars[!is.na(vars)] 403 | 404 | selected <- NULL 405 | if (is.null(ncol(idxs)) & length(vars) > 0){ 406 | selected <- idxs 407 | }else if (length(vars) > 0){ 408 | selected <- idxs[,vars] 409 | } 410 | 411 | ret$step2.feature.min = selected 412 | ret$step2fit.min <- step2fit 413 | } 414 | 415 | if (length(which(ret$best.beta$add.1se!=0)) > 1){ 416 | 417 | allidx <- which(ret$best.beta$add.1se!=0) 418 | 419 | covidx <- allidx[allidx <= ncov] 420 | taxidx <- allidx[allidx > ncov] 421 | 422 | idxs <- NULL 423 | x.select.min <- NULL 424 | 425 | if (length(taxidx) > 1){ 426 | idxs <- combn(taxidx,2) 427 | for (k in 1:ncol(idxs)){ 428 | x.select.min <- cbind(x.select.min, x[,idxs[1,k]] - x[,idxs[2,k]]) 429 | } 430 | colnames(x.select.min) <- rep("",ncol(x.select.min)) 431 | } 432 | 433 | if (length(covidx) > 0){ 434 | x.select.min <- cbind(x.select.min, x[,covidx]) 435 | if (!is.null(idxs)){ 436 | colnames(x.select.min)[(ncol(idxs)+1):ncol(x.select.min)] = colnames(x)[covidx] 437 | }else{ 438 | colnames(x.select.min) <- colnames(x)[covidx] 439 | } 440 | } 441 | 442 | # if(is.null(x.select.min)) break 443 | 444 | if (ncol(x.select.min) > 1){ 445 | stepglmnet <- suppressWarnings(cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")) 446 | 447 | if (length(taxidx) < 2){ 448 | 449 | idxs <- NULL 450 | 451 | # covs <- colnames(x.select.min)[which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 452 | 453 | }else{ 454 | 455 | if (length(covidx) == 0) idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 456 | 457 | if (length(covidx) > 0){ 458 | 459 | # covs <- colnames(x.select.min)[setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0),1:ncol(idxs))] 460 | 461 | idxs <- idxs[,setdiff(which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0), 462 | 
(ncol(idxs)+1):(ncol(idxs)+length(covidx)))] 463 | 464 | } 465 | 466 | } 467 | 468 | x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)] 469 | } 470 | 471 | if (sum(colnames(x.select.min)=="") > 0) colnames(x.select.min)[colnames(x.select.min)==""] <- paste0("x.",1:sum(colnames(x.select.min)=="")) 472 | df_step2 <- data.frame(t=t,d=d,x.select.min) 473 | step2fit <- suppressWarnings(step(coxph(Surv(t,d)~.,data=df_step2),trace=0)) 474 | vars <- suppressWarnings(as.numeric(sapply(names(step2fit$coefficients),function(x) strsplit(x,split = "[.]")[[1]][2]))) 475 | vars <- vars[!is.na(vars)] 476 | 477 | selected <- NULL 478 | if (is.null(ncol(idxs)) & length(vars) > 0){ 479 | selected <- idxs 480 | }else if (length(vars) > 0){ 481 | selected <- idxs[,vars] 482 | } 483 | 484 | ret$step2.feature.1se = selected 485 | ret$step2fit.1se <- step2fit 486 | 487 | } 488 | 489 | } 490 | 491 | } 492 | 493 | }else{ 494 | 495 | ret <- list(beta=fullfit$beta[,lidx], 496 | lambda=fullfit$lambda[lidx], 497 | a=a, 498 | loss=fullfit$loss[lidx], 499 | mse=fullfit$mse[lidx] 500 | ) 501 | 502 | if (plot){ 503 | 504 | beta_nzero <- suppressWarnings(data.frame(reshape::melt(ret$beta[rowSums(ret$beta != 0) > 0,]))) 505 | beta_nzero$lambda <- ret$lambda[beta_nzero$X2] 506 | beta_nzero$loglambda <- log(beta_nzero$lambda) 507 | 508 | lambda_count <- data.frame(loglambda = log(ret$lambda), 509 | count = colSums(ret$beta != 0)) 510 | lambda_count <- lambda_count[seq(5,nrow(lambda_count),length.out=10),] 511 | 512 | top10feat <- sort(ret$beta[,length(ret$lambda)])[c(1:5,(p-4):p)] 513 | top10name <- names(top10feat) 514 | 515 | pcoef <- ggplot(beta_nzero, aes(x=.data$loglambda,y=.data$value,group=.data$X1,color=as.factor(.data$X1))) + 516 | geom_line() + 517 | scale_color_manual(values=rainbow(sum(rowSums(ret$beta != 0) > 0))) + 518 | theme_bw() + 519 | theme(legend.position = "none") + 520 | xlab(expression(paste("log(",lambda,")"))) + 521 | ylab("Coefficient") + 522 | # annotate("text",x=min(beta_nzero$loglambda)-2,y=top10feat,label=top10name,hjust=0)+ 523 | annotate("text",x=lambda_count$loglambda,y=max(beta_nzero$value)+0.2,label=as.character(lambda_count$count))+ 524 | ggtitle(expression(paste("Coefficients versus log(",lambda,")"))) 525 | 526 | ret$pcoef <- pcoef 527 | 528 | } 529 | 530 | } 531 | 532 | ret$runtime <- proc.time() - ptm 533 | 534 | return(ret) 535 | 536 | } --------------------------------------------------------------------------------
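Editor's note on the lambda-path initialization in R/LogRatioCoxLasso.R above: the function starts the penalty path at the smallest lambda that zeroes out every penalized coefficient, computed from the score residuals of the null Cox model. The stand-alone sketch below reproduces that calculation on toy data so the steps can be run in isolation. It is not the package's exported interface (model fitting in the package is done by the Rcpp routine cox_enet_al, which is not shown here); the simulated x, t, and d are purely illustrative assumptions, while the variable names riskset, sfun, lambda0, and lambda mirror the source.

# Minimal sketch of the lambda-path initialization, assuming simulated inputs.
set.seed(1)
n <- 50; p <- 10
x <- matrix(rnorm(n * p), n, p)           # stand-in for log-transformed count matrix
t <- rexp(n)                              # survival times
d <- rbinom(n, 1, 0.7)                    # event indicators
t <- t + runif(n, 0, 1e-4)                # break ties, as in the source

tj      <- sort(t[d == 1])                                   # ordered event times
riskset <- sapply(tj, function(s) sum(t >= s))                # risk-set size |R(t_j)| at each event time
expect  <- sapply(t, function(u) sum(1 / riskset[tj <= u]))   # expected event count under the null model
sfun    <- d - expect                                         # null-model score (martingale-type) residuals

a       <- 1                                                  # elastic-net mixing weight (a = 1 is the lasso)
lambda0 <- max(abs(t(sfun) %*% x)) / (a * n)                  # smallest lambda with an all-zero solution
lambda  <- 10^seq(log10(lambda0), log10(lambda0 * 1e-2),      # decreasing path, default ratio 1e-2 as above
                  length.out = 100)

This sketch stops at the penalty grid; it deliberately omits covariate adjustment (ncov, wcov), the augmented-Lagrangian solver, cross-validation, and the step-2 pairwise log-ratio refit that the full function performs.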