├── data ├── QMP.rda ├── SynthData.rda └── SynthData2.rda ├── tests ├── testthat.R └── testthat │ ├── test-SPRING.R │ └── test-helpers.R ├── .Rbuildignore ├── .gitignore ├── NAMESPACE ├── SPRING.Rproj ├── man ├── examples │ ├── ex.R │ └── synthData_ex.R ├── qstepcdf.Rd ├── QMP.Rd ├── SynthData.Rd ├── mclr.Rd ├── hugeKmb.Rd ├── synthData_from_ecdf.Rd └── SPRING.Rd ├── DESCRIPTION ├── R ├── data.R ├── synthData.R ├── helpers.R └── SPRING.R ├── data-raw └── DATASET.R ├── README.md └── README.Rmd /data/QMP.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GraceYoon/SPRING/HEAD/data/QMP.rda -------------------------------------------------------------------------------- /data/SynthData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GraceYoon/SPRING/HEAD/data/SynthData.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(SPRING) 3 | 4 | test_check("SPRING") 5 | -------------------------------------------------------------------------------- /data/SynthData2.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GraceYoon/SPRING/HEAD/data/SynthData2.rda -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^SPRING\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^README-.*\.png$ 5 | ^data-raw$ 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .Rdata 6 | .httr-oauth 7 | .DS_Store 8 | 
-------------------------------------------------------------------------------- /tests/testthat/test-SPRING.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_equal(2 * 2, 4) 3 | }) 4 | -------------------------------------------------------------------------------- /tests/testthat/test-helpers.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { 2 | expect_equal(2 * 2, 4) 3 | }) 4 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(SPRING) 4 | export(hugeKmb) 5 | export(mclr) 6 | export(synthData_from_ecdf) 7 | importFrom(SpiecEasi,graph2prec) 8 | importFrom(SpiecEasi,make_graph) 9 | importFrom(SpiecEasi,prec2cov) 10 | importFrom(huge,huge.mb) 11 | importFrom(mixedCCA,estimateR) 12 | importFrom(mvtnorm,rmvnorm) 13 | importFrom(pulsar,pulsar) 14 | importFrom(rootSolve,uniroot.all) 15 | importFrom(stats,ecdf) 16 | importFrom(stats,pnorm) 17 | -------------------------------------------------------------------------------- /SPRING.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/examples/ex.R: 
-------------------------------------------------------------------------------- 1 | rm(list = ls()) 2 | library(SPRING) 3 | 4 | # Load the synthetic count data 5 | data("QMP") # n = 1000 and p = 100 synthetic dataset 6 | 7 | # SPRING on Synthetic Data, when assuming the data as quantitative counts. 8 | # The same setting used in Yoon et al. (2019) Frontiers in Genetics. 9 | \dontrun{ 10 | # This takes around 23 minutes. 11 | fit.spring <- SPRING(QMP, quantitative = TRUE, lambdaseq = "data-specific", 12 | nlambda = 50, seed = 10010, ncores = 2, rep.num = 50) 13 | } 14 | 15 | # SPRING on Compositional data. Row sums are scaled to 1. Then, mclr-transformation will be applied. 16 | \dontrun{ 17 | compoData <- QMP/rowSums(QMP) 18 | fit.spring <- SPRING(compoData, quantitative = FALSE, lambdaseq = "data-specific", 19 | nlambda = 10, rep.num = 10) 20 | } 21 | -------------------------------------------------------------------------------- /man/qstepcdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/synthData.R 3 | \name{qstepcdf} 4 | \alias{qstepcdf} 5 | \title{Inverse function of empirical cumulative distribution function (ecdf)} 6 | \usage{ 7 | qstepcdf(p, empf, interval, tol = 0.001, maxiter = 100) 8 | } 9 | \arguments{ 10 | \item{p}{probability (between 0 and 1)} 11 | 12 | \item{empf}{empirical cdf or any quantile function.} 13 | 14 | \item{interval}{find a solution only within this interval. a vector containing two end points of the interval.} 15 | 16 | \item{tol}{the desired accuracy (convergence tolerance).} 17 | 18 | \item{maxiter}{the maximum number of iterations for \code{uniroot.all}.} 19 | } 20 | \description{ 21 | Inverse function of empirical cumulative distribution function (ecdf) 22 | } 23 | \examples{ 24 | ### This is an internal function used in synthData_from_ecdf. 
25 | } 26 | -------------------------------------------------------------------------------- /man/examples/synthData_ex.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | require(SpiecEasi) 4 | 5 | # goal is to generate synthetic data with a prescribed graph structure. 6 | # load real data "QMP" in SPRING package. 7 | data(QMP) 8 | set.seed(12345) # set the seed number for make_graph part. 9 | p1 = ncol(QMP) # the number of nodes. 10 | e1 = 2*p1 # the number of edges is set as twice the number of nodes. 11 | gtype = "cluster" 12 | # available types in SpiecEasi: "band", "cluster", "scale_free", "erdos_renyi", "hub", "block". 13 | graph_p1 <- SpiecEasi::make_graph(gtype, p1, e1) # adjacency matrix. 1: edge, 0: no edge. 14 | Prec1 <- SpiecEasi::graph2prec(graph_p1) # precision matrix. inverse of covariance. 15 | Cor1 <- cov2cor(SpiecEasi::prec2cov(Prec1)) # correlation matrix. 16 | 17 | X1_count <- synthData_from_ecdf(QMP, Sigma = Cor1, n = 100) 18 | # generate data of size n by p. 19 | # p = ncol(Cor1) = ncol(QMP) should hold. 20 | # need to specify sample size n. 
21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: SPRING 2 | Title: Semi-Parametric Rank-based approach for INference in Graphical model (SPRING) 3 | Version: 1.0.4 4 | Authors@R: c( 5 | person(given = "Grace", 6 | family = "Yoon", 7 | role = c("aut", "cre"), 8 | email = "gyoon6067@gmail.com", 9 | comment = c(ORCID = "0000-0003-3263-1352")), 10 | person(given = "Irina", 11 | family = "Gaynanova", 12 | role = c("aut"), 13 | email = "irinag@stat.tamu.edu"), 14 | person(given = "Christian", 15 | family = "Müller", 16 | role = c("aut"), 17 | email = "cmueller@flatironinstitute.org")) 18 | Description: SPRING is to estimate sparse microbial association networks using rank-based correlation with sparse graphical modeling techniques. 19 | License: GPL-3 20 | Encoding: UTF-8 21 | LazyData: true 22 | Depends: R (>= 2.10) 23 | Imports: mixedCCA, SpiecEasi, huge, pulsar, rootSolve, mvtnorm, stats 24 | Remotes: irinagain/mixedCCA, zdk123/SpiecEasi 25 | Suggests: 26 | testthat (>= 2.1.0) 27 | RoxygenNote: 7.1.1 28 | -------------------------------------------------------------------------------- /man/QMP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{QMP} 5 | \alias{QMP} 6 | \title{Quantitative Microbiome Project data} 7 | \format{ 8 | An object of class \code{matrix} (inherits from \code{array}) with 106 rows and 91 columns. 9 | } 10 | \source{ 11 | Yoon, Gaynanova and Müller (2019) Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data. \emph{Frontiers in Genetics}. 10:516. \url{doi:10.3389/fgene.2019.00516} 12 | 13 | Vanderputte et al. 
(2017) Quantitative microbiome profiling links gut community variation to microbial load. \emph{Nature}. 551: 507-511. \url{doi:10.1038/nature24460} 14 | } 15 | \usage{ 16 | QMP 17 | } 18 | \description{ 19 | The data containing quantitative microbiome count data of dimenstion 106 samples/subjects (in rows) and 91 OTUs (in columns). The raw dataset is pruned the taxa present less than 30\% of samples and final dataset contains only healthy subjects from two cohorts: Study cohort and Disease cohort. 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/SynthData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{SynthData} 5 | \alias{SynthData} 6 | \alias{SynthData2} 7 | \title{Synthetic count data} 8 | \format{ 9 | \code{SynthData} is an object of class \code{matrix} with 500 rows and 200 columns. \code{SynthData2} is an object of class \code{matrix} with 1000 rows and 100 columns. 10 | 11 | An object of class \code{matrix} (inherits from \code{array}) with 1000 rows and 100 columns. 12 | } 13 | \source{ 14 | Yoon, Gaynanova and Müller (2019) Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data. \emph{Frontiers in Genetics.} 10:516. \url{doi:10.3389/fgene.2019.00516} 15 | } 16 | \usage{ 17 | SynthData 18 | 19 | SynthData2 20 | } 21 | \description{ 22 | SynthData and SynthData2 were generated using empirical cdf of American Gut Project Data. SynthData has scale_free-type-graph structure of size 500 rows and 200 columns, and SynthData2 has cluster-type-graph structure of size 1000 rows and 100 columns. 
23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/mclr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helpers.R 3 | \name{mclr} 4 | \alias{mclr} 5 | \title{Modified central log ratio (mclr) transformation} 6 | \usage{ 7 | mclr(dat, base = exp(1), tol = 1e-16, eps = NULL, atleast = 1) 8 | } 9 | \arguments{ 10 | \item{dat}{raw count data or compositional data (n by p) does not matter.} 11 | 12 | \item{base}{exp(1) for natural log} 13 | 14 | \item{tol}{tolerance for checking zeros} 15 | 16 | \item{eps}{epsilon in eq (2) of the paper "Yoon, Gaynanova, M\"{u}ller (2019), Frontiers in Genetics". positive shifts to all non-zero compositions. Refer to the paper for more details. eps = absolute value of minimum of log ratio counts plus c.} 17 | 18 | \item{atleast}{default value is 1. Constant c which ensures all nonzero values to be strictly positive. default is 1.} 19 | } 20 | \value{ 21 | \code{mclr} returns a data matrix of the same dimension with input data matrix. 22 | } 23 | \description{ 24 | Modified central log ratio (mclr) transformation 25 | } 26 | \examples{ 27 | data(QMP) 28 | RMP <- QMP/rowSums(QMP) 29 | mclr_RMP <- mclr(RMP) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Synthetic count data 2 | #' @name SynthData 3 | #' @description SynthData and SynthData2 were generated using empirical cdf of American Gut Project Data. SynthData has scale_free-type-graph structure of size 500 rows and 200 columns, and SynthData2 has cluster-type-graph structure of size 1000 rows and 100 columns. 
4 | #' 5 | #' @source 6 | #' 7 | #' Yoon, Gaynanova and Müller (2019) Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data. \emph{Frontiers in Genetics.} 10:516. \url{doi:10.3389/fgene.2019.00516} 8 | #' @format \code{SynthData} is an object of class \code{matrix} with 500 rows and 200 columns. \code{SynthData2} is an object of class \code{matrix} with 1000 rows and 100 columns. 9 | "SynthData" 10 | 11 | 12 | 13 | #' @name SynthData 14 | #' @aliases SynthData2 15 | "SynthData2" 16 | 17 | #' Quantitative Microbiome Project data 18 | #' 19 | #' @description The data containing quantitative microbiome count data of dimenstion 106 samples/subjects (in rows) and 91 OTUs (in columns). The raw dataset is pruned the taxa present less than 30\% of samples and final dataset contains only healthy subjects from two cohorts: Study cohort and Disease cohort. 20 | #' 21 | #' @source 22 | #' 23 | #' Yoon, Gaynanova and Müller (2019) Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data. \emph{Frontiers in Genetics}. 10:516. \url{doi:10.3389/fgene.2019.00516} 24 | #' 25 | #' Vanderputte et al. (2017) Quantitative microbiome profiling links gut community variation to microbial load. \emph{Nature}. 551: 507-511. \url{doi:10.1038/nature24460} 26 | #' 27 | "QMP" 28 | -------------------------------------------------------------------------------- /data-raw/DATASET.R: --------------------------------------------------------------------------------
## code to prepare `DATASET` dataset goes here

# To generate synthetic data using extended american gut data.
# Run this script in a fresh R session. The relative paths below point to a
# sibling checkout of the "copulaMicrobiome" project.

library(SpiecEasi) # make_graph, graph2prec and prec2cov functions.
library(mixedCCA) # calculate Kendall Correlation. function "estimateR"
source("../copulaMicrobiome/Rfunctions/synthData.R") # this has the function "synthData_from_ecdf"

seed <- 10010


## SynthData: "scale_free" graph, n = 500 samples, p1 = 200 nodes.
n <- 500; p1 <- 200
# load the amgut real data (subset of real data: size is n=2000 and p=p1 with minimum depth 1e4)
# p1 is part of the file name, so it has to be defined before this call.
load(paste0("../copulaMicrobiome/Data/amgutsim_p", p1, ".rdata"))
gtype <- "scale_free"
e1 <- 2 * p1 # number of edges

set.seed(seed) # set the seed number for make_graph part.
graph_p1 <- SpiecEasi::make_graph(gtype, p1, e1)
Prec1 <- SpiecEasi::graph2prec(graph_p1)
Cor1 <- cov2cor(SpiecEasi::prec2cov(Prec1))

# True counts
SynthData <- synthData_from_ecdf(get(paste0("amgutsim_p", p1)), Sigma = Cor1, n = n, seed = seed)

usethis::use_data(SynthData)


## SynthData2: "cluster" graph, n = 1000 samples, p1 = 100 nodes.
n <- 1000; p1 <- 100
# NOTE(review): assumes "amgutsim_p100.rdata" exists next to the p200 file and
# contains an object named `amgutsim_p100` (that object is fetched with get()
# below) -- confirm against the copulaMicrobiome Data folder.
load(paste0("../copulaMicrobiome/Data/amgutsim_p", p1, ".rdata"))
gtype <- "cluster"
e1 <- 2 * p1 # number of edges

set.seed(seed) # set the seed number for make_graph part.
graph_p1 <- SpiecEasi::make_graph(gtype, p1, e1)
Prec1 <- SpiecEasi::graph2prec(graph_p1)
Cor1 <- cov2cor(SpiecEasi::prec2cov(Prec1))

# True counts
SynthData2 <- synthData_from_ecdf(get(paste0("amgutsim_p", p1)), Sigma = Cor1, n = n, seed = seed)

usethis::use_data(SynthData2)


load("../copulaMicrobiome/Analysis/SimpleEx/qmphealthyrank6pruned.rdata")
# this containing three data variable:
# X: copyadjusted count data
# QMP: quantitative count data (X -> RMP by dividing by total abundance -> QMP by multiplying by cell counts)
# qmphealthy6_only1filt: phyloseq class data.

### But only save QMP for this package.
usethis::use_data(QMP)


-------------------------------------------------------------------------------- /man/hugeKmb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/helpers.R 3 | \name{hugeKmb} 4 | \alias{hugeKmb} 5 | \title{Internal wrapper function to implement rank-based correlation to huge.mb function in "huge" package.} 6 | \usage{ 7 | hugeKmb( 8 | data, 9 | lambda, 10 | type = "trunc", 11 | sym = "or", 12 | verbose = TRUE, 13 | verboseR = TRUE, 14 | Rmethod = "approx", 15 | tol = 1e-06 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{n by p matrix data. usually through pulsar, data will receive subsamples.} 20 | 21 | \item{lambda}{a vector of lambda values} 22 | 23 | \item{type}{a type of variables. "trunc" is default.} 24 | 25 | \item{sym}{"or" is the symmetrizing rule of the output graphs. If sym = "and", the edge between node i and node j is selected ONLY when both node i and node j are selected as neighbors for each other. If sym = "or", the edge is selected when either node i or node j is selected as the neighbor for each other. The default value is "or". (refer to huge manual)} 26 | 27 | \item{verbose}{If \code{verbose = FALSE}, tracing information printing for HUGE (High-dimensional Undirected Graph Estimation) with a specified method (currently "mb" is only available) is disabled. The default value is TRUE.} 28 | 29 | \item{verboseR}{If \code{verboseR = FALSE}, printing information whetehr nearPD is used or not is disabled. The defalut value is TRUE.} 30 | 31 | \item{Rmethod}{The calculation method of latent correlation. Either "original" method or "approx". If \code{Rmethod = "approx"}, multilinear approximation method is used, which is much faster than the original method. If \code{Rmethod = "original"}, optimization of the bridge inverse function is used. 
The default is "approx".} 32 | 33 | \item{tol}{Desired accuracy when calculating the solution of bridge function in estimateR function.} 34 | } 35 | \value{ 36 | \code{hugeKmb} returns a data.frame containing 37 | \itemize{ 38 | \item{beta: } 39 | \item{path: }{a list of} 40 | \item{df: } 41 | } 42 | } 43 | \description{ 44 | Internal wrapper function to implement rank-based correlation to huge.mb function in "huge" package. 45 | } 46 | -------------------------------------------------------------------------------- /man/synthData_from_ecdf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/synthData.R 3 | \name{synthData_from_ecdf} 4 | \alias{synthData_from_ecdf} 5 | \title{Synthetic data generator from real counts} 6 | \usage{ 7 | synthData_from_ecdf(comm, mar = 2, Sigma, n, seed = 10010, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{comm}{community; a matrix of real count data that we want to simulate/sythesize. Samples are in rows and OTUs are in columns.} 11 | 12 | \item{mar}{MARGIN for apply function to calculate zero proportion for each row (mar = 1) or column (mar = 2).} 13 | 14 | \item{Sigma}{covariance structure of size p by p. p should match with the number of OTUs in \code{comm}, in other words, the number of columns of \code{comm}.} 15 | 16 | \item{n}{number of samples} 17 | 18 | \item{seed}{seed number for data generation (rmvnorm)} 19 | 20 | \item{verbose}{logical value. If it is TRUE, it will print out which iteration is going on and how long it took for calculation for each step. The defulat is FALSE.} 21 | } 22 | \value{ 23 | \code{synthData_from_ecdf} returns a data matrix of size n by p. 
24 | } 25 | \description{ 26 | This function generates synthetic count data based on empirical cumulative distribution (ecdf) of real count data 27 | } 28 | \examples{ 29 | 30 | 31 | require(SpiecEasi) 32 | 33 | # goal is to generate synthetic data with a prescribed graph structure. 34 | # load real data "QMP" in SPRING package. 35 | data(QMP) 36 | set.seed(12345) # set the seed number for make_graph part. 37 | p1 = ncol(QMP) # the number of nodes. 38 | e1 = 2*p1 # the number of edges is set as twice the number of nodes. 39 | gtype = "cluster" 40 | # available types in SpiecEasi: "band", "cluster", "scale_free", "erdos_renyi", "hub", "block". 41 | graph_p1 <- SpiecEasi::make_graph(gtype, p1, e1) # adjacency matrix. 1: edge, 0: no edge. 42 | Prec1 <- SpiecEasi::graph2prec(graph_p1) # precision matrix. inverse of covariance. 43 | Cor1 <- cov2cor(SpiecEasi::prec2cov(Prec1)) # correlation matrix. 44 | 45 | X1_count <- synthData_from_ecdf(QMP, Sigma = Cor1, n = 100) 46 | # generate data of size n by p. 47 | # p = ncol(Cor1) = ncol(QMP) should hold. 48 | # need to specify sample size n. 49 | 50 | 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # SPRING 5 | 6 | 7 | 8 | 9 | 10 | The R package `SPRING` (Semi-Parametric Rank-based approach for 11 | INference in Graphical model) estimates sparse microbial association 12 | networks using rank-based correlation with sparse graphical modeling 13 | techniques. The corresponding reference is 14 | 15 | Yoon G., Gaynanova I. and Müller C.L. (2019) [Microbial Networks in 16 | SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation 17 | Estimation for Quantitative Microbiome 18 | Data](https://www.frontiersin.org/articles/10.3389/fgene.2019.00516/full). 19 | *Frontiers in Genetics*, 10:516. 
20 | 21 | The faster version of latent correlation computation part is now fully 22 | available and implemented to the R package `SPRING`. The corresponding 23 | reference is available on arXiv: 24 | 25 | Yoon G., Müller C.L. and Gaynanova I. [Fast computation of latent 26 | correlations](https://arxiv.org/abs/2006.13875). *arXiv*. 27 | 28 | ## Installation 29 | 30 | ``` r 31 | # install.packages("devtools") 32 | devtools::install_github("GraceYoon/SPRING") 33 | ``` 34 | 35 | ## Example 36 | 37 | ``` r 38 | library(SPRING) 39 | data("QMP") # load the data available from this package, containing 106 samples and 91 OTUs. 40 | 41 | # Apply SPRING on QMP data. 42 | fit.spring <- SPRING(QMP, Rmethod = "approx", quantitative = TRUE, 43 | lambdaseq = "data-specific", nlambda = 50, rep.num = 50) 44 | # With Rmethod = "original", this takes around 23 minutes. 45 | # With Rmethod = "approx", this takes around 2.23 minutes. 46 | # More details on the comparison of accuracy and speed ("original" vs. "approx") 47 | # are available on the above arXiv reference. 48 | 49 | # StARS-selected lambda index based on the threshold (default = 0.01) 50 | opt.K <- fit.spring$output$stars$opt.index 51 | # Estimated adjacency matrix from sparse graphical modeling technique ("mb" method) (1 = edge, 0 = no edge) 52 | adj.K <- as.matrix(fit.spring$fit$est$path[[opt.K]]) 53 | # Estimated partial correlation coefficient, same as negative precision matrix. 
54 | pcor.K <- as.matrix(SpiecEasi::symBeta(fit.spring$output$est$beta[[opt.K]], mode = 'maxabs')) 55 | ``` 56 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | # SPRING 16 | 17 | 18 | 19 | 20 | The R package `SPRING` (Semi-Parametric Rank-based approach for INference in Graphical model) estimates sparse microbial association networks using rank-based correlation with sparse graphical modeling techniques. The corresponding reference is 21 | 22 | Yoon G., Gaynanova I. and Müller C.L. (2019) [Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data](https://www.frontiersin.org/articles/10.3389/fgene.2019.00516/full). *Frontiers in Genetics*, 10:516. 23 | 24 | The faster version of latent correlation computation part is now fully available and implemented to the R package `SPRING`. The corresponding reference is available on arXiv: 25 | 26 | Yoon G., Müller C.L. and Gaynanova I. [Fast computation of latent correlations](https://arxiv.org/abs/2006.13875). *arXiv*. 27 | 28 | 29 | ## Installation 30 | 31 | ``` r 32 | # install.packages("devtools") 33 | devtools::install_github("GraceYoon/SPRING") 34 | ``` 35 | ## Example 36 | 37 | ```{r example, eval=FALSE} 38 | library(SPRING) 39 | data("QMP") # load the data available from this package, containing 106 samples and 91 OTUs. 40 | 41 | # Apply SPRING on QMP data. 42 | fit.spring <- SPRING(QMP, Rmethod = "approx", quantitative = TRUE, 43 | lambdaseq = "data-specific", nlambda = 50, rep.num = 50) 44 | # With Rmethod = "original", this takes around 23 minutes. 
45 | # With Rmethod = "approx", this takes around 2.23 minutes. 46 | # More details on the comparison of accuracy and speed ("original" vs. "approx") 47 | # are available on the above arXiv reference. 48 | 49 | # StARS-selected lambda index based on the threshold (default = 0.01) 50 | opt.K <- fit.spring$output$stars$opt.index 51 | # Estimated adjacency matrix from sparse graphical modeling technique ("mb" method) (1 = edge, 0 = no edge) 52 | adj.K <- as.matrix(fit.spring$fit$est$path[[opt.K]]) 53 | # Estimated partial correlation coefficient, same as negative precision matrix. 54 | pcor.K <- as.matrix(SpiecEasi::symBeta(fit.spring$output$est$beta[[opt.K]], mode = 'maxabs')) 55 | ``` 56 | 57 | -------------------------------------------------------------------------------- /R/synthData.R: -------------------------------------------------------------------------------- 1 | #' Synthetic data generator from real counts 2 | #' 3 | #' This function generates synthetic count data based on empirical cumulative distribution (ecdf) of real count data 4 | #' 5 | #' @param comm community; a matrix of real count data that we want to simulate/synthesize. Samples are in rows and OTUs are in columns. 6 | #' @param mar MARGIN for apply function to calculate zero proportion for each row (mar = 1) or column (mar = 2). 7 | #' @param Sigma covariance structure of size p by p. p should match with the number of OTUs in \code{comm}, in other words, the number of columns of \code{comm}. 8 | #' @param n number of samples 9 | #' @param seed seed number for data generation (rmvnorm) 10 | #' @param verbose logical value. If it is TRUE, it will print out which iteration is going on and how long it took for calculation for each step. The default is FALSE. 11 | #' 12 | #' @return \code{synthData_from_ecdf} returns a data matrix of size n by p. 
#' @importFrom mvtnorm rmvnorm
#' @importFrom stats ecdf
#' @importFrom stats pnorm
#' @importFrom SpiecEasi make_graph graph2prec prec2cov
#' @export
#'
#' @example man/examples/synthData_ex.R
synthData_from_ecdf <- function(comm, mar = 2, Sigma, n, seed = 10010, verbose = FALSE){

  d <- ncol(comm)
  # Proportion of zeros and largest observed value, computed along `mar`.
  # NOTE(review): both vectors are indexed by the column index j in the loop
  # below, so the default mar = 2 is what the loop expects -- confirm before
  # calling with mar = 1.
  zratio <- apply(comm, MARGIN = mar, function(x) (sum(x==0)/length(x)))
  maxabund <- apply(comm, MARGIN = mar, max) # to restrict the search range of the solution.

  # seed = NULL leaves the current state of the random number generator untouched.
  if(!is.null(seed)) {
    set.seed(seed)
  }
  normd <- mvtnorm::rmvnorm(n, mean=rep(0, d), sigma = Sigma) # mvtnorm package is the fastest one to generate multivariate normal.
  unif <- pnorm(normd) # latent normal draws mapped to probabilities in [0, 1]
  dat <- matrix(0, n, d)

  # seq_len()/seq_along() instead of 1:d / 1:length(nzind): when a column has
  # no draw above its zero ratio, nzind is empty and 1:0 would iterate over
  # c(1, 0), passing NA into qstepcdf.
  for ( j in seq_len(d) ){
    nzind <- which(unif[, j] > zratio[j]) # to keep the zero ratio as the true data.
    empf <- ecdf(comm[, j]) # empf is a cdf function. empf(c) = Pr(X <= c)

    ptm <- proc.time()
    for ( k in seq_along(nzind) ){
      # This is called "inverse transform sampling". https://en.wikipedia.org/wiki/Inverse_transform_sampling
      # Since the range of the cdf/quantile function is in [0, 1]
      # we want to know the data value corresponding to a probability (value from unif variable) between 0 and 1.
      # Basically, we numerically calculate the inverse of empirical cdf. find a solution "empf^{-1}(prob)=?"
      dat[nzind[k], j] <- qstepcdf(unif[nzind[k], j], empf, interval = c(0, maxabund[j]))
    }
    if(verbose == TRUE) {
      cat("iteration = ", j , ": time = ", proc.time() - ptm, "\n")
    }

  }
  # Entries that were never filled in stay 0.
  dat

}

#' Inverse function of empirical cumulative distribution function (ecdf)
#'
#' @param p probability (between 0 and 1)
#' @param empf empirical cdf or any quantile function.
#' @param interval find a solution only within this interval. a vector containing two end points of the interval.
#' @param tol the desired accuracy (convergence tolerance).
#' @param maxiter the maximum number of iterations for \code{uniroot.all}.
#'
#' @return a single numeric value. If the root search finds nothing and \code{p} is already reached at the lower end of \code{interval}, the lower end point is returned; if it finds nothing otherwise, an error is raised.
#'
#' @importFrom rootSolve uniroot.all
#' @examples
#' ### This is an internal function used in synthData_from_ecdf.
qstepcdf <- function(p, empf, interval, tol = 1e-3, maxiter = 100){
  # uniroot.all from rootSolve package was the fastest one.
  sol <- as.numeric(uniroot.all(function(x){empf(x)-p}, interval = interval, tol = tol, maxiter = maxiter))

  if (length(sol) == 0) {
    # No root was reported. If p is already reached at the lower end point,
    # that end point is the smallest admissible value; otherwise there is no
    # solution inside the interval and we fail with a clear message instead of
    # the cryptic "argument is of length zero".
    if (p <= empf(interval[1])) {
      return(interval[1])
    }
    stop("qstepcdf: no solution of empf(x) = p was found within the given interval.", call. = FALSE)
  }

  # Several roots can be reported when empf(x) - p is exactly zero on a flat
  # part of the step function. Use the smallest one so the condition below is
  # always a single TRUE/FALSE.
  sol <- min(sol)

  # in case quantile function is a step function, added the following step.
  lower <- floor(sol)
  if (p <= empf(lower)) {
    lower
  } else {
    ceiling(sol)
  }
}


-------------------------------------------------------------------------------- /R/helpers.R: -------------------------------------------------------------------------------- 1 | # Rfunctions for simple example for SPRING method 2 | # Yoon, Gaynanova and M\"{u}eller (2019) Frontiers in Genetics, Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data. 3 | # doi:10.3389/fgene.2019.00516 4 | 5 | # To implement Kendall correlation estimates on huge. 6 | # original huge function can only take covariance or data matrix. 7 | # huge.mb funciton returns beta values (MB coefficient estimates) 8 | # huge function with method="mb" does not return beta values. 9 | # to do network visualization (for edge color, I need beta) 10 | 11 | 12 | #' Internal wrapper function to implement rank-based correlation to huge.mb function in "huge" package. 13 | #' 14 | #' @param data n by p matrix data. usually through pulsar, data will receive subsamples. 15 | #' @param lambda a vector of lambda values 16 | #' @param type a type of variables. "trunc" is default. 
#' @param sym "or" is the symmetrizing rule of the output graphs. If sym = "and", the edge between node i and node j is selected ONLY when both node i and node j are selected as neighbors for each other. If sym = "or", the edge is selected when either node i or node j is selected as the neighbor for each other. The default value is "or". (refer to huge manual)
#' @param verbose If \code{verbose = FALSE}, tracing information printing for HUGE (High-dimensional Undirected Graph Estimation) with a specified method (currently "mb" is only available) is disabled. The default value is TRUE.
#' @param verboseR If \code{verboseR = FALSE}, printing information whether nearPD is used or not is disabled. The default value is TRUE.
#' @param Rmethod The calculation method of latent correlation. Either "original" method or "approx". If \code{Rmethod = "approx"}, multilinear approximation method is used, which is much faster than the original method. If \code{Rmethod = "original"}, optimization of the bridge inverse function is used. The default is "approx".
#' @param tol Desired accuracy when calculating the solution of bridge function in estimateR function.
#'
#' @return \code{hugeKmb} returns, unchanged, the object produced by \code{huge::huge.mb} when it is applied to the estimated latent correlation matrix. See the huge manual for its components (e.g. beta, path, df).
#'
#' @importFrom huge huge.mb
#' @export
#'
hugeKmb <- function(data, lambda, type = "trunc", sym = "or", verbose = TRUE, verboseR = TRUE, Rmethod = "approx", tol = 1e-6) {
  # Step 1: rank-based latent correlation estimate; only its R component is used.
  latent_fit <- mixedCCA::estimateR(data, type = type, method = Rmethod, tol = tol, verbose = verboseR)
  # Step 2: neighborhood selection on that correlation matrix along the lambda path.
  huge::huge.mb(latent_fit$R, lambda, sym = sym, verbose = verbose)
}




#' Modified central log ratio (mclr) transformation
#'
#' @param dat raw count data or compositional data (n by p) does not matter.
#' @param base base of the logarithm. exp(1) for natural log (default).
#' @param tol tolerance for checking zeros. Entries smaller than \code{tol} are treated as zeros and stay zero in the output.
#' @param eps epsilon in eq (2) of the paper "Yoon, Gaynanova, M\"{u}ller (2019), Frontiers in Genetics". positive shifts to all non-zero compositions. Refer to the paper for more details. If \code{NULL} (default), eps = absolute value of minimum of log ratio counts plus c, and no shift is applied when no log ratio is negative. \code{eps = 0} applies no shift, a positive \code{eps} is used as the shift itself, and a negative \code{eps} is an error. Users do not have to specify any value; the default should be enough.
#' @param atleast default value is 1. Constant c which ensures all nonzero values to be strictly positive. Only used when \code{eps = NULL}; a non-positive value triggers a warning and is replaced by 1. Users do not have to specify any value; the default should be enough.
#'
#'
#' @return \code{mclr} returns a data matrix of the same dimension with input data matrix.
#' @export
#'
#' @examples
#' data(QMP)
#' RMP <- QMP/rowSums(QMP)
#' mclr_RMP <- mclr(RMP)
#'
mclr <- function(dat, base = exp(1), tol = 1e-16, eps = NULL, atleast = 1) {
  dat <- as.matrix(dat)
  nzero <- (dat >= tol) # index for nonzero part
  LOG <- ifelse(nzero, log(dat, base), 0.0) # take log for only nonzero values. zeros stay as zeros.

  # centralize by the log of "geometric mean of only nonzero part", row by row.
  # rowMeans(LOG) / rowMeans(nzero) is (sum of logs) / (number of nonzero
  # entries) for each row; this length-nrow vector recycles down the columns,
  # so every row is centered by its own value. The same expression covers the
  # single-row case.
  clrdat <- ifelse(nzero, LOG - rowMeans(LOG) / rowMeans(nzero), 0.0)

  if (is.null(eps)) {
    # "atleast" must be strictly positive: with 0 the smallest nonzero entry
    # would be shifted to exactly 0 and look like a true zero.
    if (atleast <= 0) {
      warning("atleast should be positive. The functions uses default value 1 instead.")
      atleast <- 1
    }
    # find the smallest negative value and shift all nonzero data above zero.
    # The length guard keeps an empty input from reaching min().
    if (length(clrdat) > 0 && min(clrdat) < 0) {
      positivecst <- abs(min(clrdat)) + atleast # "atleast" has default 1.
    } else {
      positivecst <- 0
    }
    # positive shift: make all non-zero values strictly positive.
    ADDpos <- ifelse(nzero, clrdat + positivecst, 0.0)
    return(ADDpos)
  } else if (eps == 0) {
    ## no shift. clr transform applied to non-zero proportions only. without pseudo count.
    return(clrdat)
  } else if (eps > 0) {
    ## use user-defined eps for additional positive shift.
    ADDpos <- ifelse(nzero, clrdat + eps, 0.0)
    return(ADDpos)
  } else {
    stop("check your eps value for additional positive shift. Otherwise, leave it as NULL.")
  }
}

--------------------------------------------------------------------------------
/man/SPRING.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/SPRING.R
\name{SPRING}
\alias{SPRING}
\title{Semi-Parametric Rank-based approach for INference in Graphical model (SPRING)}
\usage{
SPRING(
  data,
  quantitative = FALSE,
  method = "mb",
  lambda.min.ratio = 0.01,
  nlambda = 20,
  lambdaseq = exp(seq(log(0.6), log(0.6 * lambda.min.ratio), length.out = nlambda)),
  seed = 10010,
  ncores = 1,
  thresh = 0.1,
  subsample.ratio = 0.8,
  rep.num = 20,
  Rtol = 1e-06,
  verbose = TRUE,
  verboseR = FALSE,
  Rmethod = "original"
)
}
\arguments{
\item{data}{n by p matrix of microbiome count data, either quantitative or compositional counts. Each row represents each subject/sample and each column represents each OTU (operational taxonomic unit).}

\item{quantitative}{default is FALSE, which means input "data" is compositional data, which will be normalized using mclr transformation within a function. If TRUE, it means "quantitative" counts are input and no normalization will be applied.}

\item{method}{graph estimation methods. Currently, only "mb" method is available.}

\item{lambda.min.ratio}{default is 0.01}

\item{nlambda}{default is 20.}

\item{lambdaseq}{a sequence of decreasing positive numbers to control the regularization.
The default sequence has 20 values generated to be equally spaced on a logarithmic scale from 0.6 to 0.006. Users can specify a sequence to override the default sequence. If the user specifies "data-specific", then the lambda sequence will be generated using estimated rank-based correlation matrix from data.} 37 | 38 | \item{seed}{the seed for subsampling.} 39 | 40 | \item{ncores}{number of cores to use for subsampling. The default is 1.} 41 | 42 | \item{thresh}{threshold for StARS selection criterion. 0.1 is recommended (default). The smaller threshold returns sparser graph.} 43 | 44 | \item{subsample.ratio}{0.8 is default. The recommended values are 10*sqrt(n)/n for n > 144 or 0.8 otherwise.} 45 | 46 | \item{rep.num}{the repetition number of subsampling for StARS edge stability selection. The default value is 20.} 47 | 48 | \item{Rtol}{Desired accuracy when calculating the solution of bridge function in estimateR function.} 49 | 50 | \item{verbose}{If \code{verbose = FALSE}, tracing information printing for HUGE (High-dimensional Undirected Graph Estimation) with a specified method (currently "mb" is only available) is disabled. The default value is TRUE.} 51 | 52 | \item{verboseR}{If \code{verboseR = FALSE}, printing information on whether nearPD is used or not when calculating rank-based correlation matrices is disabled. The default value is FALSE.} 53 | 54 | \item{Rmethod}{The calculation method of latent correlation. Either "original" method or "approx". If \code{Rmethod = "approx"}, multilinear approximation method is used, which is much faster than the original method. If \code{Rmethod = "original"}, optimization of the bridge inverse function is used. The default is "original".} 55 | } 56 | \value{ 57 | \code{SPRING} returns a list containing 58 | \itemize{ 59 | \item{output: }{Output results of \code{pulsar::pulsar} based on StARS criterion. 
It contains:} 60 | \itemize{ 61 | \item{merge: } a list of length \code{nlambda} and each element of list contains a matrix of edge selection probability. Each lambda value, this edge selection probability is calculated across \code{rep.num}. 62 | \item{summary: } the summary statistic over \code{rep.num} graphs at each value of lambda 63 | \item{opt.index: } index (along the path) of optimal lambda selected by the criterion at the desired threshold. Will return \eqn{0} if no optimum is found or \code{NULL} if selection for the criterion is not implemented. 64 | \item{criterion: } we use StARS for our stability criterion. 65 | } 66 | \item{fit: }{Output results of \code{pulsar::refit} function. It contains:} 67 | \itemize{ 68 | \item{est: } a data frame containing 69 | \itemize{ 70 | \item{beta: } Estimates of beta coefficient matrices (of size p by p) by "mb" method on the whole data at each of whole lambda sequence value. 71 | \item{path: } Estimates of precision matrix (of size p by p) on the whole data at each of whole lambda sequence value. 72 | } 73 | \item{refit: } final estimates of precision matrix (of size p by p). 74 | } 75 | \item{lambdaseq: }{lambda sequence used in the analysis} 76 | } 77 | } 78 | \description{ 79 | SPRING follows the neighborhood selection methodology outlined in "mb" method (Meinshausen and Buhlmann (2006)). 80 | } 81 | \examples{ 82 | rm(list = ls()) 83 | library(SPRING) 84 | 85 | # Load the synthetic count data 86 | data("QMP") # n = 1000 and p = 100 synthetic dataset 87 | 88 | # SPRING on Synthetic Data, when assuming the data as quantitative counts. 89 | # The same setting used in Yoon et al. (2019) Frontiers in Genetics. 90 | \dontrun{ 91 | # This takes around 23 minutes. 92 | fit.spring <- SPRING(QMP, quantitative = TRUE, lambdaseq = "data-specific", 93 | nlambda = 50, seed = 10010, ncores = 2, rep.num = 50) 94 | } 95 | 96 | # SPRING on Compositional data. Row sums are scaled to 1. Then, mclr-transformation will be applied. 
97 | \dontrun{ 98 | compoData <- QMP/rowSums(QMP) 99 | fit.spring <- SPRING(compoData, quantitative = FALSE, lambdaseq = "data-specific", 100 | nlambda = 10, rep.num = 10) 101 | } 102 | } 103 | \references{ 104 | Meinshausen N. and Buhlmann P. (2006) \href{https://projecteuclid.org/download/pdfview_1/euclid.aos/1152540754}{"High-dimensional graphs and variable selection with the lasso"}, \emph{The Annals of Statistics}, Vol 34, No. 3, 1436 - 1462. 105 | 106 | Yoon G., Gaynanova I. and Müller C. (2019) \href{https://www.frontiersin.org/articles/10.3389/fgene.2019.00516/full}{"Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data"}, \emph{Frontiers in Genetics}, 10:516. 107 | } 108 | -------------------------------------------------------------------------------- /R/SPRING.R: -------------------------------------------------------------------------------- 1 | #' Semi-Parametric Rank-based approach for INference in Graphical model (SPRING) 2 | #' 3 | #' @description SPRING follows the neighborhood selection methodology outlined in "mb" method (Meinshausen and Buhlmann (2006)). 4 | #' 5 | #' @param data n by p matrix of microbiome count data, either quantitative or compositional counts. Each row represents each subject/sample and each column represents each OTU (operational taxonomic unit). 6 | #' @param quantitative default is FALSE, which means input "data" is compositional data, which will be normalized using mclr transformation within a function. If TRUE, it means "quantitative" counts are input and no normalization will be applied. 7 | #' @param method graph estimation methods. Currently, only "mb" method is available. 8 | #' @param lambda.min.ratio default is 0.01 9 | #' @param nlambda default is 20. 10 | #' @param lambdaseq a sequence of decreasing positive numbers to control the regularization. 
The default sequence has 20 values generated to be equally spaced on a logarithmic scale from 0.6 to 0.006. Users can specify a sequence to override the default sequence. If the user specifies "data-specific", then the lambda sequence will be generated using estimated rank-based correlation matrix from data. 11 | #' @param seed the seed for subsampling. 12 | #' @param ncores number of cores to use for subsampling. The default is 1. 13 | #' @param thresh threshold for StARS selection criterion. 0.1 is recommended (default). The smaller threshold returns sparser graph. 14 | #' @param subsample.ratio 0.8 is default. The recommended values are 10*sqrt(n)/n for n > 144 or 0.8 otherwise. 15 | #' @param rep.num the repetition number of subsampling for StARS edge stability selection. The default value is 20. 16 | #' @param Rtol Desired accuracy when calculating the solution of bridge function in estimateR function. 17 | #' @param verbose If \code{verbose = FALSE}, tracing information printing for HUGE (High-dimensional Undirected Graph Estimation) with a specified method (currently "mb" is only available) is disabled. The default value is TRUE. 18 | #' @param verboseR If \code{verboseR = FALSE}, printing information on whether nearPD is used or not when calculating rank-based correlation matrices is disabled. The default value is FALSE. 19 | #' @param Rmethod The calculation method of latent correlation. Either "original" method or "approx". If \code{Rmethod = "approx"}, multilinear approximation method is used, which is much faster than the original method. If \code{Rmethod = "original"}, optimization of the bridge inverse function is used. The default is "original". 20 | #' 21 | #' @return \code{SPRING} returns a list containing 22 | #' \itemize{ 23 | #' \item{output: }{Output results of \code{pulsar::pulsar} based on StARS criterion. 
It contains:} 24 | #' \itemize{ 25 | #' \item{merge: } a list of length \code{nlambda} and each element of list contains a matrix of edge selection probability. Each lambda value, this edge selection probability is calculated across \code{rep.num}. 26 | #' \item{summary: } the summary statistic over \code{rep.num} graphs at each value of lambda 27 | #' \item{opt.index: } index (along the path) of optimal lambda selected by the criterion at the desired threshold. Will return \eqn{0} if no optimum is found or \code{NULL} if selection for the criterion is not implemented. 28 | #' \item{criterion: } we use StARS for our stability criterion. 29 | #' } 30 | #' \item{fit: }{Output results of \code{pulsar::refit} function. It contains:} 31 | #' \itemize{ 32 | #' \item{est: } a data frame containing 33 | #' \itemize{ 34 | #' \item{beta: } Estimates of beta coefficient matrices (of size p by p) by "mb" method on the whole data at each of whole lambda sequence value. 35 | #' \item{path: } Estimates of precision matrix (of size p by p) on the whole data at each of whole lambda sequence value. 36 | #' } 37 | #' \item{refit: } final estimates of precision matrix (of size p by p). 38 | #' } 39 | #' \item{lambdaseq: }{lambda sequence used in the analysis} 40 | #' } 41 | #' @importFrom huge huge.mb 42 | #' @importFrom pulsar pulsar 43 | #' @importFrom mixedCCA estimateR 44 | #' 45 | #' @export 46 | #' 47 | #' @references 48 | #' 49 | #' Meinshausen N. and Buhlmann P. (2006) \href{https://projecteuclid.org/download/pdfview_1/euclid.aos/1152540754}{"High-dimensional graphs and variable selection with the lasso"}, \emph{The Annals of Statistics}, Vol 34, No. 3, 1436 - 1462. 50 | #' 51 | #' Yoon G., Gaynanova I. and Müller C. (2019) \href{https://www.frontiersin.org/articles/10.3389/fgene.2019.00516/full}{"Microbial Networks in SPRING - Semi-parametric Rank-Based Correlation and Partial Correlation Estimation for Quantitative Microbiome Data"}, \emph{Frontiers in Genetics}, 10:516. 
#'
#' @example man/examples/ex.R
#'
SPRING <- function(data, quantitative = FALSE, method = "mb",
                   lambda.min.ratio = 1e-2, nlambda = 20,
                   lambdaseq = exp(seq(log(0.6), log(0.6*lambda.min.ratio), length.out = nlambda)),
                   seed = 10010, ncores = 1, thresh = 0.1,
                   subsample.ratio = 0.8, rep.num = 20, Rtol = 1e-6,
                   verbose = TRUE, verboseR = FALSE, Rmethod = "original") {

  if (any(data < 0)) {
    stop("Negative values are detected, but either quantitative or compositional counts are expected.\n")
  }

  # Pick the graph estimator before any heavy work, so an unsupported method
  # stops here with a clear message instead of leaving `fun` undefined.
  if (isTRUE(method == "mb")) {
    fun <- hugeKmb
  } else {
    stop("Only method = \"mb\" is currently available.\n")
  }

  p <- ncol(data)
  if (quantitative) {
    # Row totals at or (numerically) equal to 1 suggest the input was
    # normalized. `||` is used because this is a single scalar condition.
    largest_total <- max(rowSums(data))
    if (largest_total <= 1 || isTRUE(all.equal(largest_total, 1))) {
      warning("The input data is normalized, but quantitative count data is expected.\n")
    }
    qdat <- data
  } else {
    # compositional input: apply the mclr transformation first.
    qdat <- mclr(data)
  }
  # the local reference to the input is no longer needed; only qdat is used below.
  rm(data)
  gc()

  if (is.character(lambdaseq)) {
    # isTRUE() also routes character vectors of length != 1 (or NA) to the
    # intended error below instead of an invalid `if` condition.
    if (isTRUE(lambdaseq == "data-specific")) {
      Kcor <- mixedCCA::estimateR(qdat, type = "trunc", method = Rmethod, tol = Rtol, verbose = verboseR)$R
      # generate lambda sequence: start from the largest absolute entry of
      # Kcor - diag(p) and decrease on a log scale down to
      # lambda.min.ratio times that value.
      shifted <- Kcor - diag(p)
      lambda.max <- max(max(shifted), -min(shifted))
      lambda.min <- lambda.min.ratio * lambda.max
      lambdaseq <- exp(seq(log(lambda.max), log(lambda.min), length.out = nlambda))
    } else {
      stop("The input for lambdaseq is not correct.\n")
    }
  }

  # StARS stability selection over subsamples; every subsample is fitted by `fun`.
  out1.K_count <- pulsar::pulsar(qdat, fun = fun,
                                 fargs = list(lambda = lambdaseq, Rmethod = Rmethod, tol = Rtol,
                                              verbose = verbose, verboseR = verboseR),
                                 rep.num = rep.num, criterion = 'stars', seed = seed,
                                 ncores = ncores, thresh = thresh,
                                 subsample.ratio = subsample.ratio)

  # refit on the full data via pulsar::refit
  fit1.K_count <- pulsar::refit(out1.K_count)

  return(list(output = out1.K_count, fit = fit1.K_count, lambdaseq = lambdaseq))
}

--------------------------------------------------------------------------------