├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
    ├── construct_data.R
    ├── data.R
    ├── extract_para.R
    ├── fit_copula.R
    ├── fit_marginal.R
    ├── gamlss_fix.R
    ├── perform_lrt.R
    ├── plot_reduceddim.R
    ├── scdesign3.R
    ├── simu_new.R
    └── sparse_cov.R
├── README.md
├── data
    └── example_sce.rda
├── inst
    └── CITATION
├── man
    ├── ba.Rd
    ├── construct_data.Rd
    ├── example_sce.Rd
    ├── extract_para.Rd
    ├── figures
    │   └── scDesign3_illustration.png
    ├── fit_copula.Rd
    ├── fit_marginal.Rd
    ├── ga.Rd
    ├── gamlss.ba.Rd
    ├── gamlss.ga.Rd
    ├── perform_lrt.Rd
    ├── plot_reduceddim.Rd
    ├── scdesign3.Rd
    ├── simu_new.Rd
    └── sparse_cov.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   └── test-scDesign3.R
└── vignettes
    ├── .gitignore
    └── scDesign3.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^LICENSE\.md$
 4 | 
 5 | # Retrieved 2017-Oct-12 from https://github.com/github/gitignore/blob/master/R.gitignore
 6 | # Licensed under CC0-1.0 https://github.com/github/gitignore/blob/master/LICENSE
 7 | 
 8 | # History files
 9 | .Rhistory
10 | .Rapp.history
11 | 
12 | # Session Data files
13 | .RData
14 | 
15 | # Example code in package build process
16 | /*-Ex.R
17 | 
18 | # Output files from R CMD build
19 | /*.tar.gz
20 | 
21 | # Output files from R CMD check
22 | /*.Rcheck/
23 | 
24 | # RStudio files
25 | .Rproj.user/
26 | 
27 | # produced vignettes
28 | #vignettes/*.html
29 | vignettes/*.pdf
30 | 
31 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
32 | .httr-oauth
33 | 
34 | # knitr and R markdown default cache directories
35 | /*_cache/
36 | /cache/
37 | vignettes/*_cache/
38 | 
39 | # Temporary files created by R markdown
40 | ^utf8.md$
41 | ^knit.md$
42 | .Rproj.user
43 | 
44 | # Temporary files created by cpp
45 | src/*.o
46 | src/*.so
47 | src/*.dll
48 | #inst/doc
49 | ^doc$
50 | ^Meta$
51 | ^_pkgdown\.yml$
52 | ^docs$
53 | ^pkgdown$
54 | ^~/Package/scDesign3_vignette$
55 | ^/home/qingyang/Package/scDesign3_vignette$
56 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | scDesign3.Rproj
 2 | .git
 3 | .Rproj
 4 | .Rproj.user
 5 | .Rhistory
 6 | .RData
 7 | .Ruserdata
 8 | inst/doc
 9 | /doc/
10 | /Meta/
11 | docs
12 | # pkgdown
13 | .yml 
14 | # History files
15 | .Rhistory
16 | .Rapp.history
17 | 
18 | # Session Data files
19 | .RData
20 | .RDataTmp
21 | 
22 | # User-specific files
23 | .Ruserdata
24 | 
25 | # Example code in package build process
26 | *-Ex.R
27 | 
28 | # Output files from R CMD build
29 | /*.tar.gz
30 | 
31 | # Output files from R CMD check
32 | /*.Rcheck/
33 | 
34 | # RStudio files
35 | .Rproj.user/
36 | 
37 | # produced vignettes
38 | vignettes/*.html
39 | vignettes/*.pdf
40 | 
41 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
42 | .httr-oauth
43 | 
44 | # knitr and R markdown default cache directories
45 | *_cache/
46 | /cache/
47 | 
48 | # Temporary files created by R markdown
49 | *.utf8.md
50 | *.knit.md
51 | 
52 | # R Environment Variables
53 | .Renviron
54 | 
55 | # pkgdown site
56 | docs/
57 | 
58 | # translation temp files
59 | po/*~
60 | 
61 | # RStudio Connect folder
62 | rsconnect/
63 | 
64 | .git/
65 | .Rproj.user/


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: scDesign3
 2 | Type: Package
 3 | Title: A unified framework of realistic in silico data generation and statistical model inference for single-cell and spatial omics
 4 | Version: 1.5.0
 5 | Authors@R: 
 6 |     c(person("Dongyuan", "Song", , "dongyuansong@ucla.edu", role = c("aut", "cre"),
 7 |            comment = c(ORCID = "0000-0003-1114-1215")),
 8 |     person("Qingyang", "Wang", , "qw802@g.ucla.edu",  role = c("aut"),
 9 |            comment = c(ORCID = "0000-0002-1051-609X")),
10 |     person("Chenxin", "Jiang", , "cflorajiang@g.ucla.edu",  role = c("aut"),
11 |            comment = c(ORCID = "0009-0005-7369-4116")))
12 | Description: We present a statistical simulator, scDesign3, to generate realistic single-cell and spatial omics data, including various cell states, experimental designs, and feature modalities, by learning interpretable parameters from real data. Using a unified probabilistic model for single-cell and spatial omics data, scDesign3 infers biologically meaningful parameters; assesses the goodness-of-fit of inferred cell clusters, trajectories, and spatial locations; and generates in silico negative and positive controls for benchmarking computational tools.
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | LazyData: false
16 | Depends: R (>= 4.3.0)
17 | Imports:
18 |     dplyr,
19 |     tibble,
20 |     stats,
21 |     methods,
22 |     mgcv,
23 |     gamlss,
24 |     gamlss.dist,
25 |     SummarizedExperiment,
26 |     SingleCellExperiment,
27 |     mclust,
28 |     mvtnorm,
29 |     parallel,
30 |     pbmcapply,
31 |     rvinecopulib,
32 |     umap,
33 |     ggplot2,
34 |     irlba,
35 |     viridis,
36 |     BiocParallel,
37 |     matrixStats,
38 |     Matrix,
39 |     sparseMVN,
40 |     coop
41 | Suggests:
42 |     mvnfast,
43 |     igraph,
44 |     knitr,
45 |     rmarkdown,
46 |     testthat (>= 3.0.0),
47 |     RefManageR,
48 |     sessioninfo,
49 |     BiocStyle
50 | biocViews:
51 |     Software,
52 |     SingleCell,
53 |     Sequencing,
54 |     GeneExpression,
55 |     Spatial
56 | URL: https://github.com/SONGDONGYUAN1994/scDesign3
57 | BugReports: https://github.com/SONGDONGYUAN1994/scDesign3/issues
58 | RoxygenNote: 7.3.2
59 | Config/testthat/edition: 3
60 | VignetteBuilder: knitr
61 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2024 Dongyuan Song
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(predict,gamlss)
 4 | export(ba)
 5 | export(construct_data)
 6 | export(extract_para)
 7 | export(fit_copula)
 8 | export(fit_marginal)
 9 | export(ga)
10 | export(gamlss.ba)
11 | export(gamlss.ga)
12 | export(perform_lrt)
13 | export(plot_reduceddim)
14 | export(scdesign3)
15 | export(simu_new)
16 | export(sparse_cov)
17 | import(gamlss)
18 | import(ggplot2)
19 | import(mclust)
20 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # 1.3.0 (2025-03-01)
 2 | + Add multiple random samples
 3 | 
 4 | # 1.1.4 (2024-11-07)
 5 | * Add options for correlation matrix calculation
 6 | 
 7 | # 1.1.3 (2024-06-22)
 8 | * Add sparse Gaussian copula for `fit_copula`
 9 | * Add automatic k selection for `fit_marginal`
10 | 
11 | # 1.0.1 (2023-01-20)
12 | * Submitted to Bioconductor
13 | 
14 | # 0.99.5 (2023-07-15)
15 | * Update the MVN sampling by R package `mvnfast`


--------------------------------------------------------------------------------
/R/construct_data.R:
--------------------------------------------------------------------------------
  1 | #' Construct the input data (covariate matrix and expression matrix)
  2 | #'
  3 | #' This function constructs the input data for \code{\link{fit_marginal}}.
  4 | #'
  5 | #' This function takes a \code{SingleCellExperiment} object as the input.
  6 | #' Based on users' choice, it constructs the matrix of covariates
  7 | #' (explanatory variables) and the expression matrix (e.g., count matrix for scRNA-seq).
  8 | #'
  9 | #' @param sce A \code{SingleCellExperiment} object.
 10 | #' @param assay_use A string which indicates the assay you will use in the sce. Default is 'counts'.
 11 | #' @param celltype A string of the name of cell type variable in the \code{colData} of the sce. Default is 'cell_type'.
 12 | #' @param pseudotime A string or a string vector of the name of pseudotime and (if exist)
 13 | #' multiple lineages. Default is NULL.
 14 | #' @param spatial A length two string vector of the names of spatial coordinates. Default is NULL.
 15 | #' @param other_covariates A string or a string vector of the other covariates you want to include in the data.
 16 | #' @param ncell The number of cell you want to simulate. Default is \code{dim(sce)[2]} (the same number as the input data).
 17 | #' If an arbitrary number is provided, the function will use Vine Copula to simulate a new covariate matrix.
 18 | #' @param corr_by A string or a string vector which indicates the groups for correlation structure. If '1', all cells have one estimated corr. If 'ind', no corr (features are independent). If others, this variable decides the corr structures.
 19 | #' @param parallelization A string indicating the specific parallelization function to use.
 20 | #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 21 | #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
 22 | #' @param BPPARAM A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 23 | #' this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 24 | #' \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.
 25 | #'
 26 | #' @return A list with the components:
 27 | #' \describe{
 28 | #'   \item{\code{count_mat}}{The expression matrix}
 29 | #'   \item{\code{dat}}{The original covariate matrix}
 30 | #'   \item{\code{newCovariate}}{The simulated new covariate matrix, is NULL if the parameter ncell is default}
 31 | #'   \item{\code{filtered_gene}}{The genes that are excluded in the marginal and copula fitting 
 32 | #' steps because these genes only express in less than two cells.}
 33 | #' }
 34 | #' @examples
 35 | #'   data(example_sce)
 36 | #'   my_data <- construct_data(
 37 | #'   sce = example_sce,
 38 | #'   assay_use = "counts",
 39 | #'   celltype = "cell_type",
 40 | #'   pseudotime = "pseudotime",
 41 | #'   spatial = NULL,
 42 | #'   other_covariates = NULL,
 43 | #'   corr_by = "1"
 44 | #'   )
 45 | #'
 46 | #'
 47 | #' @export construct_data
 48 | construct_data <- function(sce,
 49 |                            assay_use = "counts",
 50 |                            celltype,
 51 |                            pseudotime,
 52 |                            spatial,
 53 |                            other_covariates,
 54 |                            ncell = dim(sce)[2],
 55 |                            corr_by,
 56 |                            parallelization = "mcmapply",
 57 |                            BPPARAM = NULL) {
 58 |   
 59 |   ## check unique cell names and gene names
 60 |   if(length(unique(colnames(sce))) != dim(sce)[2]){
 61 |     stop("Please make sure your inputted SingleCellExperiment object does not have duplicate cell names")
 62 |   }
 63 |   if(length(unique(rownames(sce))) != dim(sce)[1]){
 64 |     stop("Please make sure your inputted SingleCellExperiment object does not have duplicate gene names")
 65 |   }
 66 |   ## Extract expression matrix
 67 |   count_mat <-
 68 |     t(as.matrix(SummarizedExperiment::assay(sce, assay_use)))
 69 |   ## Extract col data
 70 |   coldata_mat <- data.frame(SummarizedExperiment::colData(sce))
 71 |   
 72 |   ##
 73 |   if (is.null(celltype) & is.null(pseudotime) & is.null(spatial)) {
 74 |     stop("One of celltype, pseudotime and spatial must be provided!")
 75 |   } else {
 76 |     primary_covariate <- c(celltype, pseudotime, spatial)
 77 |     dat <- as.data.frame(coldata_mat[, primary_covariate, drop = FALSE])
 78 |   }
 79 |   
 80 |   if(!is.null(celltype)) {
 81 |     dat[, celltype] <- as.factor(dat[, celltype])}
 82 |   
 83 |   # ## Extract pseudotime / cell type / spatial
 84 |   # if (!is.null(celltype)) {
 85 |   #   celltype <- as.matrix(coldata_mat[, celltype, drop = FALSE])
 86 |   # }
 87 |   #
 88 |   # if (!is.null(pseudotime)) {
 89 |   #   pseudotime <- as.matrix(coldata_mat[, pseudotime, drop = FALSE])
 90 |   # }
 91 |   #
 92 |   # if (!is.null(spatial)) {
 93 |   #   spatial <- as.matrix(coldata_mat[, spatial, drop = FALSE])
 94 |   # }
 95 |   #
 96 |   #
 97 |   # if (covariate_use == "celltype") {
 98 |   #   dat <- data.frame(celltype)
 99 |   #   dat$cell_type <- as.factor(dat$cell_type)
100 |   # } else if (covariate_use == "pseudotime") {
101 |   #   n_l <- dim(pseudotime)[2]
102 |   #   dat <- data.frame(pseudotime)
103 |   # } else if (covariate_use == "spatial") {
104 |   #   dat <- data.frame(spatial)
105 |   # } else {
106 |   #   stop("Covairate_use must be one of 'celltype', 'pseudotime' or 'spatial'!")
107 |   # }
108 |   
109 |   ## Convert NA to -1
110 |   #pseudotime[is.na(pseudotime)] <- -1
111 |   
112 |   ## dat is the input covariate matrix
113 |   if (!is.null(other_covariates)) {
114 |     other_covariates <- (coldata_mat[, other_covariates, drop = FALSE])
115 |     dat <- dplyr::bind_cols(dat, other_covariates)
116 |     if("condition" %in% colnames(other_covariates)){
117 |       dat$condition <- as.factor(dat$condition)
118 |     }
119 |     if("batch" %in% colnames(other_covariates)){
120 |       dat$batch <- as.factor(dat$batch)
121 |     }
122 |   }
123 |   
124 |   ## check if user wants to simulate new number of cells
125 |   if(ncell != dim(dat)[1]){
126 |     newCovariate <- as.data.frame(simuCovariateMat(dat,ncell, parallelization, BPPARAM))
127 |   }else{
128 |     newCovariate <- dat
129 |   }
130 |   
131 |   # identify groups
132 |   n_gene <- dim(sce)[1]
133 |   n_cell <- dim(sce)[2]
134 |   group <- unlist(corr_by)
135 |   if(ncell != dim(dat)[1]){
136 |     if (group[1] == "1") {
137 |       corr_group <- rep(1, n_cell)
138 |       corr_group2 <- rep(1, dim(newCovariate)[1])
139 |     } else if (group[1] == "ind"){
140 |       corr_group <- rep("ind", n_cell)
141 |       corr_group2 <- rep("ind", dim(newCovariate)[1])
142 |     } else if (group[1] == "pseudotime" | length(group) > 1) {
143 |       ## For continuous pseudotime, discretize it
144 |       corr_group <- SummarizedExperiment::colData(sce)[, group]
145 |       mclust_mod <- mclust::Mclust(corr_group, G = seq_len(5))
146 |       corr_group <- mclust_mod$classification
147 |       
148 |       corr_group2 <- newCovariate[, group]
149 |       corr_group2 <- mclust::predict.Mclust(mclust_mod, newdata = corr_group2)$classification
150 |       
151 |     } else {
152 |       corr_group <- SummarizedExperiment::colData(sce)[, group]
153 |       corr_group2 <- newCovariate[, group]
154 |     }
155 |     newCovariate$corr_group <- corr_group2
156 |   }else{
157 |     if (group[1] == "1") {
158 |       corr_group <- rep(1, n_cell)
159 |     } else if (group[1] == "ind"){
160 |       corr_group <- rep("ind", n_cell)
161 |     }else if (group[1] == "pseudotime" | length(group) > 1) {
162 |       ## For continuous pseudotime, discretize it
163 |       corr_group <- SummarizedExperiment::colData(sce)[, group]
164 |       mclust_mod <- mclust::Mclust(corr_group, G = seq_len(5))
165 |       
166 |       corr_group <- mclust_mod$classification
167 |       
168 |     } else {
169 |       corr_group <- SummarizedExperiment::colData(sce)[, group]
170 |     }
171 |   }
172 |   dat$corr_group <- corr_group
173 |   
174 |   qc <- apply(count_mat, 2, function(x){
175 |     return(length(which(x < 1e-5)) > length(x) - 2)
176 |   })
177 |   if(length(which(qc)) == 0){
178 |     filtered_gene <- NULL
179 |   }else{
180 |     filtered_gene <- names(which(qc)) 
181 |   }
182 |   return(list(count_mat = count_mat, dat = dat, newCovariate = newCovariate, filtered_gene = filtered_gene))
183 | }
184 | 
185 | 
186 | ## Simulate covariate matrix
187 | simuCovariateMat <- function(covariate_mat,
188 |                              n_cell_new = 50000,
189 |                              parallelization = "mcmapply",
190 |                              BPPARAM = NULL) {
191 |   
192 |   n_cell_ori <- dim(covariate_mat)[1]
193 |   n_covraite_ori <- dim(covariate_mat)[2]
194 |   
195 |   if_factor_exist <- sum(sapply(covariate_mat, is.factor))
196 |   if_numeric_exist <- sum(sapply(covariate_mat, is.numeric))
197 |   
198 |   
199 |   
200 |   df <- covariate_mat
201 |   
202 |   if(if_factor_exist) {
203 |     df_all <-  dplyr::mutate(df, discrete_group = interaction(dplyr::select_if(df, is.factor), sep = "-"))
204 |     
205 |     df_list <- split(df_all, df_all$discrete_group)
206 |     
207 |     group_prop <-  table(df_all$discrete_group)/dim(df_all)[1]
208 |     group_name <- names(group_prop)
209 |     group_n_new <- stats::rmultinom(1, size = n_cell_new, prob = group_prop)
210 |     paraFunc <- parallel::mcmapply
211 |     if(.Platform$OS.type == "windows"){
212 |       BPPARAM <- BiocParallel::SnowParam()
213 |       parallelization <- "bpmapply"
214 |     }
215 |     if(parallelization == "bpmapply"){
216 |       paraFunc <- BiocParallel::bpmapply
217 |     }
218 |     if(parallelization == "pbmcmapply"){
219 |       paraFunc <- pbmcapply::pbmcmapply
220 |     }
221 |     
222 |     if(if_numeric_exist) {
223 |       dat_function <- {function(df, n) {
224 |         df <- dplyr::select(df, -"discrete_group")
225 |         df_numeric <- dplyr::select_if(df, is.numeric)
226 |         df_factor <- dplyr::select_if(df, is.factor)
227 |         if(n == 0){
228 |           return(NA)
229 |         }
230 |         fit_kde <- rvinecopulib::vine(df_numeric, cores = 1)
231 |         new_dat <- rvinecopulib::rvine(n, fit_kde)
232 |         
233 |         new_dat <- as.data.frame(new_dat)
234 |         new_dat[colnames(df_factor)] <- df_factor[1, ]
235 |         new_dat
236 |         
237 |       }}
238 |       
239 |       if(parallelization == "bpmapply"){
240 |         new_dat_list <- paraFunc(FUN = dat_function , df = df_list, n = group_n_new, BPPARAM = BPPARAM,SIMPLIFY = FALSE)
241 |       }else{
242 |         new_dat_list <- paraFunc(FUN = dat_function , df = df_list, n = group_n_new,SIMPLIFY = FALSE)
243 |       }
244 |       covariate_new <- do.call("rbind", new_dat_list[!is.na(new_dat_list)])
245 |     }
246 |     else {
247 |       dat_function <- function(df, n) {
248 |         df <- dplyr::select(df, -"discrete_group")
249 |         df_factor <- dplyr::select_if(df, is.factor)
250 |         df_onerow <- as.data.frame(df_factor[1, ])
251 |         colnames(df_onerow) <- colnames(df_factor)
252 |         new_dat <- as.data.frame(df_onerow[rep(1, n), ])
253 |         colnames(new_dat) <- colnames(df_onerow)
254 |         new_dat
255 |       }
256 |       if(parallelization == "bpmapply"){
257 |         new_dat_list <- paraFunc(FUN = dat_function, df = df_list, n = group_n_new, BPPARAM = BPPARAM, SIMPLIFY = FALSE)
258 |       }else{
259 |         new_dat_list <- paraFunc(FUN = dat_function, df = df_list, n = group_n_new,SIMPLIFY = FALSE)
260 |       }
261 |       covariate_new <- do.call("rbind", new_dat_list)
262 |       
263 |     }
264 |   }
265 |   else {
266 |     fit_kde <- rvinecopulib::vine(df, cores = 1)
267 |     covariate_new <- rvinecopulib::rvine(n_cell_new, fit_kde)
268 |   }
269 |   
270 |   rownames(covariate_new) <- paste0("Cell", seq_len(n_cell_new))
271 |   return(covariate_new)
272 | }


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 | #' A SingleCellExperiment object containing both cell type and pseudotime
2 | #' @format A dataset with 10 rows (genes) and 1289 cols (cells)
3 | #' @return The corresponding SingleCellExperiment object
4 | #' @usage data("example_sce")
5 | "example_sce"
6 | 


--------------------------------------------------------------------------------
/R/extract_para.R:
--------------------------------------------------------------------------------
  1 | #' Extract the parameters of each cell's distribution
  2 | #'
  3 | #' \code{extract_para} generates parameter matrices which determine each cell's distribution
  4 | #'
  5 | #' The function takes the new covariate (if use) from \code{\link{construct_data}} and
  6 | #' marginal models from \code{\link{fit_marginal}}.
  7 | #'
  8 | #' @param sce A \code{SingleCellExperiment} object.
  9 | #' @param assay_use A string which indicates the assay you will use in the sce. Default is 'counts'.
 10 | #' @param marginal_list A list of fitted regression models from \code{\link{fit_marginal}} for each gene in sce.
 11 | #' @param n_cores An integer. The number of cores to use.
 12 | #' @param family_use A string of the marginal distribution.
 13 | #' Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian', which represent 'poisson distribution',
 14 | #' 'negative binomial distribution', 'zero-inflated poisson distribution', 'zero-inflated negative binomial distribution',
 15 | #' and 'gaussian distribution' respectively.
 16 | #' @param new_covariate A data.frame which contains covariates of targeted simulated data from  \code{\link{construct_data}} and the
 17 | #' correlation group assignment for each cell in the column 'corr_group'.
 18 | #' @param parallelization A string indicating the specific parallelization function to use.
 19 | #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 20 | #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
 21 | #' @param BPPARAM A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 22 | #' this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 23 | #' \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.
 24 | #' @param data A dataframe which is used when fitting the gamlss model
 25 | #' @return A list with the components:
 26 | #' \describe{
 27 | #'   \item{\code{mean_mat}}{A cell by feature matrix of the mean parameter.}
 28 | #'   \item{\code{sigma_mat}}{A cell by feature matrix of the sigma parameter (for Gaussian, the variance; for NB, the dispersion.).}
 29 | #'   \item{\code{zero_mat}}{A cell by feature matrix of the zero-inflation parameter (only non-zero for ZIP and ZINB).}
 30 | #' }
 31 | #' @examples
 32 | #'   data(example_sce)
 33 | #'   my_data <- construct_data(
 34 | #'   sce = example_sce,
 35 | #'   assay_use = "counts",
 36 | #'   celltype = "cell_type",
 37 | #'   pseudotime = "pseudotime",
 38 | #'   spatial = NULL,
 39 | #'   other_covariates = NULL,
 40 | #'   corr_by = "1"
 41 | #'   )
 42 | #'   my_marginal <- fit_marginal(
 43 | #'   data = my_data,
 44 | #'   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 45 | #'   sigma_formula = "1",
 46 | #'   family_use = "nb",
 47 | #'   n_cores = 1,
 48 | #'   usebam = FALSE
 49 | #'   )
 50 | #'   my_copula <- fit_copula(
 51 | #'   sce = example_sce,
 52 | #'   assay_use = "counts",
 53 | #'   marginal_list = my_marginal,
 54 | #'   family_use = c(rep("nb", 5), rep("zip", 5)),
 55 | #'   copula = "vine",
 56 | #'   n_cores = 1,
 57 | #'   input_data = my_data$dat
 58 | #'   )
 59 | #'   my_para <- extract_para(
 60 | #'     sce = example_sce,
 61 | #'     marginal_list = my_marginal,
 62 | #'     n_cores = 1,
 63 | #'     family_use = c(rep("nb", 5), rep("zip", 5)),
 64 | #'     new_covariate = my_data$new_covariate,
 65 | #'     data = my_data$dat
 66 | #'   )
 67 | #'
 68 | #' @export extract_para
 69 | 
 70 | extract_para <-  function(sce,
 71 |                           assay_use = "counts",
 72 |                           marginal_list,
 73 |                           n_cores,
 74 |                           family_use,
 75 |                           new_covariate,
 76 |                           parallelization = "mcmapply",
 77 |                           BPPARAM = NULL,
 78 |                           data) {
 79 |   removed_cell_list <- lapply(marginal_list, function(x){x$removed_cell})
 80 |   marginal_list <- lapply(marginal_list, function(x){x$fit})
 81 | 
 82 |   # find gene whose marginal is fitted
 83 |   qc_gene_idx <- which(!is.na(marginal_list))
 84 |   if(length(family_use) != 1){
 85 |     family_use <- family_use[qc_gene_idx]
 86 |   }
 87 |   # check if user inputted new covariates
 88 |   data_temp <- data[,colnames(new_covariate), drop = FALSE]
 89 |   if(identical(data_temp, new_covariate)){
 90 |     new_covariate <- NULL
 91 |   }
 92 |   
 93 |   count_mat <- SummarizedExperiment::assay(sce, assay_use)
 94 |   
 95 |   mat_function <-function(x, y) {
 96 |     fit <- marginal_list[[x]]
 97 |     removed_cell <- removed_cell_list[[x]]
 98 |     #count_mat <-
 99 |     #  t(as.matrix(SummarizedExperiment::assay(sce, assay_use)))
100 |     #data$gene <- count_mat[,x]
101 |     data$gene <- count_mat[x, ]
102 |     # if(!"gamlss" %in% class(fit)){
103 |     #   modelframe <- model.frame(fit)
104 |     # }else{
105 |     #   modelframe <- fit$mu.x
106 |     # }
107 |     if(is.null(new_covariate)){
108 |       total_cells <- dim(sce)[2]
109 |       cell_names <- colnames(sce)
110 |     }else{
111 |       total_cells <- dim(new_covariate)[1]
112 |       cell_names <- rownames(new_covariate)
113 |     }
114 | 
115 |     if(length(removed_cell) > 0 && !any(is.na(removed_cell))){
116 |       if(is.null(new_covariate)){
117 |         data <- data[-removed_cell,]
118 |       }else{
119 |         if (methods::is(fit, "gamlss")){
120 |           all_covariates <- all.vars(fit$mu.formula)[-1]
121 |         }else{
122 |           all_covariates <- all.vars(fit$formula)[-1]
123 |         }
124 |         remove_idx <- lapply(all_covariates, function(x){
125 |           curr_x <- tapply(data$gene, data[,x], sum)
126 |           zero_group <- which(curr_x==0)
127 |           if(length(zero_group) == 0){
128 |             return(NA)
129 |           }else{
130 |             type <- names(curr_x)[zero_group]
131 |             return(which(new_covariate[,x] %in% type))
132 |           }
133 | 
134 |         })
135 |         remove_cell_idx <- unlist(remove_idx)
136 |         remove_cell_idx <- unique(stats::na.omit(remove_cell_idx))
137 |         if(length(remove_cell_idx) > 0){
138 |           new_covariate <- new_covariate[-remove_cell_idx,]
139 |         }
140 |       }
141 |     }
142 | 
143 |     if (methods::is(fit, "gamlss")) {
144 |       mean_vec <-
145 |         stats::predict(fit,
146 |                        type = "response",
147 |                        what = "mu",
148 |                        newdata = new_covariate, data = data)
149 |       if (y == "poisson" | y == "binomial") {
150 |         theta_vec <- rep(NA, total_cells)
151 |       } else if (y == "gaussian") {
152 |         theta_vec = stats::predict(fit,
153 |                                    type = "response",
154 |                                    what = "sigma",
155 |                                    newdata = new_covariate, data = data) # this thete_vec is used for sigma_vec
156 |       } else if (y == "nb") {
157 |         theta_vec <-
158 |           stats::predict(fit,
159 |                          type = "response",
160 |                          what = "sigma",
161 |                          newdata = new_covariate, data = data)
162 |         #theta_vec[theta_vec < 1e-3] <- 1e-3
163 |       } else if (y == "zip") {
164 |         theta_vec <- rep(NA,  total_cells)
165 |         zero_vec <-
166 |           stats::predict(fit,
167 |                          type = "response",
168 |                          what = "sigma",
169 |                          newdata = new_covariate, data = data)
170 |       } else if (y == "zinb") {
171 |         theta_vec <-
172 |           stats::predict(fit,
173 |                          type = "response",
174 |                          what = "sigma",
175 |                          newdata = new_covariate, data = data)
176 |         zero_vec <-
177 |           stats::predict(fit,
178 |                          type = "response",
179 |                          what = "nu",
180 |                          newdata = new_covariate, data = data)
181 |       } else {
182 |         stop("Distribution of gamlss must be one of gaussian, binomial, poisson, nb, zip or zinb!")
183 |       }
184 |     } else {
185 |       ## Fit is mgcv::gam
186 |       if (is.null(new_covariate)) {
187 |         y <- stats::family(fit)$family[1]
188 |         if (grepl("Negative Binomial", y)) {
189 |           y <- "nb"
190 |         }
191 | 
192 |         mean_vec <- stats::predict(fit, type = "response")
193 |         if (y == "poisson" | y == "binomial") {
194 |           theta_vec <- rep(NA,  total_cells)
195 |         } else if (y == "gaussian") {
196 |           theta_vec <- rep(sqrt(fit$sig2), total_cells) # this thete_vec is used for sigma_vec
197 |         } else if (y == "nb") {
198 |           theta <- fit$family$getTheta(TRUE)
199 |           theta_vec <- 1/rep(theta, total_cells)
200 |         } else {
201 |           stop("Distribution of gamlss must be one of gaussian, binomial, poisson, nb!")
202 |         }
203 |       } else{
204 |         y <- stats::family(fit)$family[1]
205 |         if (grepl("Negative Binomial", y)) {
206 |           y <- "nb"
207 |         }
208 | 
209 | 
210 |         mean_vec <-
211 |           stats::predict(fit, type = "response", newdata = new_covariate)
212 |         if (y == "poisson" | y == "binomial") {
213 |           theta_vec <- rep(NA, total_cells)
214 |         } else if (y == "gaussian") {
215 |           theta_vec = stats::predict(fit,
216 |                                      type = "response",
217 |                                      what = "sigma",
218 |                                      newdata = new_covariate) # this thete_vec is used for sigma_vec
219 |         } else if (y == "nb") {
220 |           theta <- fit$family$getTheta(TRUE)
221 |           theta_vec <- 1/rep(theta, total_cells)
222 |         } else {
223 |           stop("Distribution of gam must be one of gaussian, binomial, poisson, nb!")
224 |         }
225 |       }
226 |     }
227 | 
228 |     if (!exists("zero_vec")) {
229 |       zero_vec <- rep(0, length(mean_vec))
230 |       names(zero_vec) <- names(mean_vec)
231 |     }
232 | 
233 |     if(length(mean_vec) < total_cells){
234 |       full_means <- rep(0, total_cells)
235 |       names(full_means) <- cell_names
236 |       full_means[names(mean_vec)] <- mean_vec
237 |       full_theta <- rep(NA, total_cells)
238 |       names(full_theta) <- cell_names
239 |       full_zero <- rep(NA, total_cells)
240 |       names(full_zero) <- cell_names
241 |       if(is.null(names(theta_vec))){
242 |         if(length(theta_vec) == length(mean_vec)){
243 |           names(theta_vec) <- names(mean_vec)  ## for gamlss case
244 |         }else{
245 |           names(theta_vec) <- cell_names ## for gam case
246 |         }
247 | 
248 |       }
249 |       full_theta[names(theta_vec)] <- theta_vec
250 |       full_zero[names(zero_vec)] <- zero_vec
251 |       mean_vec <- full_means
252 |       theta_vec <- full_theta
253 |       zero_vec <- full_zero
254 |     }
255 | 
256 |     #q_vec <- quantile_mat[, x]
257 | 
258 | 
259 |     para_mat <- cbind(mean_vec, theta_vec, zero_vec)
260 |     rownames(para_mat) <- cell_names
261 |     para_mat
262 |   }
263 |   paraFunc <- parallel::mcmapply
264 |   if(.Platform$OS.type == "windows"){
265 |     BPPARAM <- BiocParallel::SnowParam()
266 |     parallelization <- "bpmapply"
267 |   }
268 |   if(parallelization == "bpmapply"){
269 |     paraFunc <- BiocParallel::bpmapply
270 |   }
271 |   if(parallelization == "pbmcmapply"){
272 |     paraFunc <- pbmcapply::pbmcmapply
273 |   }
274 |   if(parallelization == "bpmapply"){
275 |     if(class(BPPARAM)[1] != "SerialParam"){
276 |       BPPARAM$workers <- n_cores
277 |     }
278 |     mat <- suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], y = family_use,BPPARAM = BPPARAM,SIMPLIFY = FALSE))
279 |   }else{
280 |     mat <- suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], y = family_use,SIMPLIFY = FALSE
281 |                                   ,mc.cores = n_cores
282 |                                    ))
283 |   }
284 |   mean_mat <- sapply(mat, function(x)
285 |     x[, 1])
286 |   sigma_mat <- sapply(mat, function(x)
287 |     x[, 2])
288 |   zero_mat <- sapply(mat, function(x)
289 |     x[, 3])
290 | 
291 |   if(length(qc_gene_idx) < dim(sce)[1]){
292 |     colnames(mean_mat) <-
293 |       colnames(sigma_mat) <- colnames(zero_mat) <- rownames(sce)[qc_gene_idx]
294 |     zeros <- matrix(0, nrow = dim(mean_mat)[1], ncol = dim(sce)[1] - length(qc_gene_idx))
295 |     na_mat <- matrix(NA, nrow = dim(mean_mat)[1], ncol = dim(sce)[1] - length(qc_gene_idx))
296 |     rownames(zeros) <- rownames(mean_mat)
297 |     colnames(zeros) <- rownames(sce)[-qc_gene_idx]
298 |     rownames(na_mat) <- rownames(mean_mat)
299 |     colnames(na_mat) <- rownames(sce)[-qc_gene_idx]
300 |     mean_mat <- cbind(mean_mat,zeros)
301 |     sigma_mat <- cbind(sigma_mat,na_mat)
302 |     zero_mat <- cbind(zero_mat,na_mat)
303 |     mean_mat <- mean_mat[,rownames(sce)]
304 |     sigma_mat <- sigma_mat[,rownames(sce)]
305 |     zero_mat <- zero_mat[,rownames(sce)]
306 |   }else{
307 |     colnames(mean_mat) <-
308 |       colnames(sigma_mat) <- colnames(zero_mat) <- rownames(sce)
309 |   }
310 | 
311 |   zero_mat <- Matrix::Matrix(zero_mat, sparse = TRUE)
312 |   return(list(
313 |     mean_mat = mean_mat,
314 |     sigma_mat = sigma_mat,
315 |     zero_mat = zero_mat
316 |   ))
317 | }
318 | 


--------------------------------------------------------------------------------
/R/fit_marginal.R:
--------------------------------------------------------------------------------
  1 | #' Fit the marginal models
  2 | #'
  3 | #' \code{fit_marginal} fits the per-feature regression models.
  4 | #'
  5 | #' The function takes the result from \code{\link{construct_data}} as the input,
  6 | #' and fit the regression models for each feature based on users' specification.
  7 | #'
  8 | #' @param data An object from \code{\link{construct_data}}.
  9 | #' @param predictor A string of the predictor for the gam/gamlss model. Default is "gene". This is just a name.
 10 | #' @param mu_formula A string of the mu parameter formula. It follows the format of formula in \code{\link[mgcv]{bam}}. Note: if the formula has multiple smoothers (\code{s()}) (we do not recommend this), please put the one with largest k (most complex one) as the first one. 
 11 | #' @param sigma_formula A string of the sigma parameter formula
 12 | #' @param family_use A string or a vector of strings of the marginal distribution.
 13 | #' Must be one of 'binomial', 'poisson', 'nb', 'zip', 'zinb' or 'gaussian', which represent 'poisson distribution',
 14 | #' 'negative binomial distribution', 'zero-inflated poisson distribution', 'zero-inflated negative binomial distribution',
 15 | #' and 'gaussian distribution' respectively.
 16 | #' @param n_cores An integer. The number of cores to use.
 17 | #' @param usebam A logic variable. If use \code{\link[mgcv]{bam}} for acceleration.
 18 | #' @param edf_flexible A logic variable. It uses simpler model to accelerate the marginal fitting with a mild loss of accuracy. If TRUE, the fitted regression model will use the fitted relationship between Gini coefficient and the effective degrees of freedom on a random selected gene sets. Default is FALSE.
 19 | #' @param parallelization A string indicating the specific parallelization function to use.
 20 | #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 21 | #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
 22 | #' @param BPPARAM A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 23 | #' this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 24 | #' \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.
 25 | #' @param trace A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss will be returned.
 26 | #' will be returned, FALSE otherwise. Default is FALSE.
 27 | #' @param simplify A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE.
 28 | #' @param filter_cells A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.
 29 | #' @return A list of fitted regression models. The length is equal to the total feature number.
 30 | #' @examples
 31 | #'   data(example_sce)
 32 | #'   my_data <- construct_data(
 33 | #'   sce = example_sce,
 34 | #'   assay_use = "counts",
 35 | #'   celltype = "cell_type",
 36 | #'   pseudotime = "pseudotime",
 37 | #'   spatial = NULL,
 38 | #'   other_covariates = NULL,
 39 | #'   corr_by = "1"
 40 | #'   )
 41 | #'   my_marginal <- fit_marginal(
 42 | #'   data = my_data,
 43 | #'   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 44 | #'   sigma_formula = "1",
 45 | #'   family_use = "nb",
 46 | #'   n_cores = 1,
 47 | #'   usebam = FALSE
 48 | #'   )
 49 | #'
 50 | #' @export fit_marginal
 51 | #'
 52 | fit_marginal <- function(data,
 53 |                          predictor = "gene", ## Fix this later.
 54 |                          mu_formula,
 55 |                          sigma_formula,
 56 |                          family_use,
 57 |                          n_cores,
 58 |                          usebam = FALSE,
 59 |                          edf_flexible = FALSE,
 60 |                          parallelization = "mcmapply",
 61 |                          BPPARAM = NULL,
 62 |                          trace = FALSE, 
 63 |                          simplify = FALSE,
 64 |                          filter_cells = FALSE) {
 65 |   count_mat <-  data$count_mat
 66 |   dat_cov <- data$dat
 67 |   filtered_gene <- data$filtered_gene
 68 |   feature_names <- colnames(count_mat)
 69 |   
 70 |   
 71 |   # Extract K from mu formula
 72 |   matches <- regexpr("k\\s*=\\s*([0-9]+)", mu_formula, perl = TRUE)
 73 |   extracted_value <- regmatches(mu_formula, matches)
 74 |   extracted_K <- as.numeric(sub("k\\s*=\\s*", "", extracted_value))
 75 |   if(identical(extracted_K, numeric(0))) {
 76 |     extracted_K <- 0
 77 |   }
 78 |   
 79 |   # Randomly select genes for edf fitting
 80 |   num <- 100
 81 |   if(dim(count_mat)[2] > num & extracted_K >= 200 & edf_flexible == TRUE){
 82 |     edf_fitting <- TRUE
 83 |     
 84 |     # genes for fitting edf-gini relationship
 85 |     edf_gini_genes <- sample(seq_len(dim(count_mat)[2]), num)
 86 |     edf_gini_count_mat <-  count_mat[,edf_gini_genes]
 87 |     edf_gini_feature_names <- feature_names[edf_gini_genes]
 88 |     
 89 |     # genes for flexible edf
 90 |     edf_flexible_genes <- seq_len(dim(count_mat)[2])[-edf_gini_genes]
 91 |     edf_flexible_count_mat <- count_mat[,-edf_gini_genes]
 92 |     edf_flexible_feature_names <- feature_names[-edf_gini_genes]
 93 |     
 94 |   }else{
 95 |     edf_fitting <- FALSE
 96 |   }
 97 |   
 98 |   
 99 |   ## Check family_use
100 |   if(length(family_use) == 1) {
101 |     if(edf_fitting == TRUE) {
102 |       edf_gini_family_use <- rep(family_use, length(edf_gini_feature_names))
103 |       edf_flexible_family_use <- rep(family_use, length(edf_flexible_feature_names))
104 |     }
105 |     family_use <- rep(family_use, length(feature_names))
106 |   }
107 |   if(length(family_use) != length(feature_names)) {
108 |     stop("The family_use must be either a single string or a vector with the same length as all features!")
109 |   }
110 |   
111 |   
112 |   fit_model_func <- function(gene,
113 |                              family_gene,
114 |                              dat_use,
115 |                              #mgcv_formula,
116 |                              mu_formula,
117 |                              sigma_formula,
118 |                              predictor,
119 |                              count_mat,
120 |                              edf=NULL
121 |   ) {
122 |     
123 |     if(!is.null(edf)){
124 |       mu_formula_ex <- sub("(k\\s*=).*", "\\1", mu_formula)
125 |       mu_formula = paste0(mu_formula_ex, round(edf[[gene]]), ")")
126 |     }
127 |     
128 |     mgcv_formula <-
129 |       stats::formula(paste0(predictor, "~", mu_formula))
130 |     
131 |     ## If use the mgcv s() smoother
132 |     mu_mgcvform <- grepl("s\\(", mu_formula) | grepl("te\\(", mu_formula)
133 |     
134 |     ## If use bam to fit marginal distribution
135 |     usebam <- usebam & mu_mgcvform ## If no smoothing terms, no need to to use bam.
136 |     if(usebam){
137 |       fitfunc = mgcv::bam
138 |     }else{
139 |       fitfunc = mgcv::gam
140 |     }
141 |     
142 |     if (mu_mgcvform) {
143 |       terms <- attr(stats::terms(mgcv_formula), "term.labels")
144 |       terms_smooth <- terms[which(grepl("s\\(", terms))] 
145 |       
146 |       if(usebam){
147 |         terms_smooth_update <- sapply(terms_smooth, function(x){paste0("ba(~", x, ", method = 'fREML', gc.level = 0, discrete = TRUE)")})
148 |         if(length(terms_smooth) == length(terms)){## only contain smooth terms
149 |           mu_formula <-
150 |             stats::formula(paste0(predictor, "~", paste0(terms_smooth_update, collapse = "+")))
151 |         }else{
152 |           terms_linear <- terms[which(!grepl("s\\(", terms))] 
153 |           terms_update <- c(terms_linear, terms_smooth_update)
154 |           mu_formula <-
155 |             stats::formula(paste0(predictor, "~", paste0(terms_update, collapse = "+")))
156 |         }
157 |       }else{
158 |         terms_smooth_update <- sapply(terms_smooth, function(x){paste0("ga(~", x, ", method = 'REML')")})
159 |         if(length(terms_smooth) == length(terms)){## only contain smooth terms
160 |           mu_formula <-
161 |             stats::formula(paste0(predictor, "~", paste0(terms_smooth_update, collapse = "+")))
162 |         }else{
163 |           terms_linear <- terms[which(!grepl("s\\(", terms))] 
164 |           terms_update <- c(terms_linear, terms_smooth_update)
165 |           mu_formula <-
166 |             stats::formula(paste0(predictor, "~", paste0(terms_update, collapse = "+")))
167 |         }
168 |       }
169 |     }
170 |     else {
171 |       mu_formula <- stats::formula(paste0(predictor, "~", mu_formula))
172 |     }
173 |     
174 |     sigma_mgcvform <- grepl("s\\(", sigma_formula) | grepl("te\\(", sigma_formula)
175 |     if (sigma_mgcvform) {
176 |       temp_sigma_formula <- stats::formula(paste0(predictor, "~", sigma_formula))
177 |       terms <- attr(stats::terms(temp_sigma_formula), "term.labels")
178 |       terms_smooth <- terms[which(grepl("s\\(", terms))]
179 |       if(usebam){
180 |         terms_smooth_update <- sapply(terms_smooth, function(x){paste0("ba(~", x, ", method = 'fREML', gc.level = 0, discrete = TRUE)")})
181 |         if(length(terms_smooth) == length(terms)){## only contain smooth terms
182 |           sigma_formula <-
183 |             stats::formula(paste0("~", paste0(terms_smooth_update, collapse = "+")))
184 |         }else{
185 |           terms_linear <- terms[which(!grepl("s\\(", terms))] 
186 |           terms_update <- c(terms_linear, terms_smooth_update)
187 |           sigma_formula <-
188 |             stats::formula(paste0("~", paste0(terms_update, collapse = "+")))
189 |         }
190 |       }else{
191 |         terms_smooth_update <- sapply(terms_smooth, function(x){paste0("ga(~", x, ", method = 'REML')")})
192 |         if(length(terms_smooth) == length(terms)){## only contain smooth terms
193 |           sigma_formula <-
194 |             stats::formula(paste0("~", paste0(terms_smooth_update, collapse = "+")))
195 |         }else{
196 |           terms_linear <- terms[which(!grepl("s\\(", terms))] 
197 |           terms_update <- c(terms_linear, terms_smooth_update)
198 |           sigma_formula <-
199 |             stats::formula(paste0("~", paste0(terms_update, collapse = "+")))
200 |         }
201 |       }
202 |       
203 |     } else {
204 |       sigma_formula <- stats::formula(paste0("~", sigma_formula))
205 |     }
206 |     
207 |     
208 |     ## Add gene expr
209 |     dat_use$gene <- count_mat[, gene]
210 |     
211 |     ## For error/warning logging
212 |     add_log <- function(function_name, type, message) {
213 |       new_l <- logs
214 |       new_log <- list(function_name = function_name,
215 |                       type = type,
216 |                       message =  message)
217 |       new_l[[length(new_l) + 1]]  <- new_log
218 |       logs <<- new_l
219 |     }
220 |     
221 |     logs <- list()
222 |     ## Don't fit marginal if gene only have two or less non-zero expression
223 |     if(!is.null(filtered_gene) & gene %in% filtered_gene){
224 |       add_log("fit_marginal","warning", paste0(gene, "is expressed in too few cells."))
225 |       return(list(fit = NA, warning = logs, time = c(NA,NA)))
226 |     }
227 |     
228 |     if(filter_cells){
229 |       all_covariates <- all.vars(mgcv_formula)[-1]
230 |       dat_cova <- dat_use[, all_covariates]
231 |       check_factor <- all(sapply(dat_cova,is.factor))
232 |       if (length(all_covariates) > 0 & check_factor){
233 |         remove_idx_list <- lapply(all_covariates, function(x){
234 |           curr_x <- tapply(dat_use$gene, dat_use[,x], sum)
235 |           zero_group <- which(curr_x==0)
236 |           if(length(zero_group) == 0){
237 |             return(list(idx = NA, changeFormula = FALSE))
238 |           }else{
239 |             type <- names(curr_x)[zero_group]
240 |             if(length(type) == length(unique(dat_use[,x])) - 1){
241 |               return(list(idx = NA, changeFormula = TRUE))
242 |             }
243 |             return(list(idx = which(dat_use[,x] %in% type), changeFormula = FALSE))
244 |           }
245 |           
246 |         })
247 |         names(remove_idx_list) <- all_covariates
248 |         remove_idx <- lapply(remove_idx_list, function(x)x$idx)
249 |         remove_cell <- unlist(remove_idx)
250 |         if(all(is.na(remove_cell))){
251 |           remove_cell <- NA
252 |         }else{
253 |           remove_cell <- unique(stats::na.omit(remove_cell))
254 |         }
255 |         if(length(remove_cell) > 0 && !any(is.na(remove_cell))){
256 |           dat_use <- dat_use[-remove_cell,]
257 |         }
258 |         
259 |         changeFormula <-  sapply(remove_idx_list, function(x)x$changeFormula)
260 |         if(length(which(changeFormula)) > 0){
261 |           changeVars <- names(which(changeFormula))
262 |           formulaUpdate <- paste0(changeVars, collapse = "-")
263 |           mgcv_formula <- stats::update.formula(mgcv_formula, stats::as.formula(paste0("~.-",formulaUpdate)))
264 |           mu_formula <- stats::update.formula(mu_formula, stats::as.formula(paste0("~.-",formulaUpdate)))
265 |           sigmaVars <- which(changeVars %in% as.character(sigma_formula))
266 |           if(length(sigmaVars) > 0){
267 |             formulaUpdate <- paste0(changeVars[sigmaVars], collapse = "-")
268 |           }
269 |           sigma_formula = stats::update.formula(sigma_formula, stats::as.formula(paste0("~.-",formulaUpdate)))
270 |         }
271 |         
272 |       }else{
273 |         remove_cell <- NA
274 |       }
275 |     }else{
276 |       remove_cell <- NA
277 |     }
278 |     
279 |     
280 |     time_list <- c(NA,NA)
281 |     
282 |     if (family_gene == "binomial") {
283 |       mgcv.fit <- withCallingHandlers(
284 |         tryCatch({
285 |           start.time <- Sys.time()
286 |           res <-fitfunc(formula = mgcv_formula, data = dat_use, family = "binomial", discrete = usebam)
287 |           end.time <- Sys.time()
288 |           time <- as.numeric(end.time - start.time)
289 |           time_list[1] <- time
290 |           res
291 |         }, error=function(e) {
292 |           add_log("gam","error", toString(e))
293 |           NULL
294 |         }), warning=function(w) {
295 |           add_log("gam","warning", toString(w))
296 |           
297 |         })
298 |       
299 |       if (sigma_formula != "~1") {
300 |         gamlss.fit <- withCallingHandlers(
301 |           tryCatch({
302 |             start.time = Sys.time()
303 |             res <- gamlss::gamlss(
304 |               formula = mu_formula,
305 |               #sigma.formula = sigma_formula, ## Binomial is one para dist.
306 |               data = dat_use,
307 |               family = gamlss.dist::BI,
308 |               control = gamlss::gamlss.control(trace = FALSE, c.crit = 0.1)
309 |             )
310 |             end.time = Sys.time()
311 |             time = as.numeric(end.time - start.time)
312 |             time_list[2] <- time
313 |             res
314 |           }, error=function(e) {
315 |             add_log("gamlss","error", toString(e))
316 |             NULL
317 |           }), warning=function(w) {
318 |             add_log("gamlss","warning", toString(w))
319 |           })
320 |         
321 |       } else {
322 |         gamlss.fit <- NULL
323 |       }
324 |     } else if (family_gene == "poisson") {
325 |       mgcv.fit <- withCallingHandlers(
326 |         tryCatch({
327 |           start.time <- Sys.time()
328 |           res <-fitfunc(formula = mgcv_formula, data = dat_use, family = "poisson", discrete = usebam)
329 |           end.time <- Sys.time()
330 |           time <- as.numeric(end.time - start.time)
331 |           time_list[1] <- time
332 |           res
333 |         }, error=function(e) {
334 |           add_log("gam","error", toString(e))
335 |           NULL
336 |         }), warning=function(w) {
337 |           add_log("gam","warning", toString(w))
338 |         })
339 |       
340 |       if (sigma_formula != "~1") {
341 |         gamlss.fit <- withCallingHandlers(
342 |           tryCatch({
343 |             start.time = Sys.time()
344 |             res <- gamlss::gamlss(
345 |               formula = mu_formula,
346 |               #sigma.formula = sigma_formula, ## Poisson has constant mean
347 |               data = dat_use,
348 |               family = gamlss.dist::PO,
349 |               control = gamlss::gamlss.control(trace = FALSE, c.crit = 0.1))
350 |             end.time = Sys.time()
351 |             time = as.numeric(end.time - start.time)
352 |             time_list[2] <- time
353 |             res
354 |           }, error=function(e) {
355 |             add_log("gamlss","error", toString(e))
356 |             NULL
357 |           }), warning=function(w) {
358 |             add_log("gamlss","warning", toString(w))
359 |           })
360 |       } else {
361 |         gamlss.fit <- NULL
362 |       }
363 |     } else if (family_gene == "gaussian") {
364 |       mgcv.fit <- withCallingHandlers(
365 |         tryCatch({
366 |           start.time <- Sys.time()
367 |           res <- fitfunc(formula = mgcv_formula, data = dat_use, family = "gaussian", discrete = usebam)
368 |           end.time <- Sys.time()
369 |           time <- as.numeric(end.time - start.time)
370 |           time_list[1] <- time
371 |           res
372 |         }, error=function(e) {
373 |           add_log("gam","error", toString(e))
374 |           NULL
375 |         }), warning=function(w) {
376 |           add_log("gam","warning", toString(w))
377 |         })
378 |       
379 |       if (sigma_formula != "~1") {
380 |         gamlss.fit<- withCallingHandlers(
381 |           tryCatch({
382 |             start.time = Sys.time()
383 |             res <- gamlss::gamlss(
384 |               formula = mu_formula,
385 |               sigma.formula = sigma_formula,
386 |               data = dat_use,
387 |               family = gamlss.dist::NO,
388 |               control = gamlss::gamlss.control(trace = FALSE, c.crit = 0.1)
389 |             )
390 |             end.time = Sys.time()
391 |             time = as.numeric(end.time - start.time)
392 |             time_list[2] <- time
393 |             res
394 |           }, error=function(e) {
395 |             add_log("gamlss","error", toString(e))
396 |             NULL
397 |           }), warning=function(w) {
398 |             add_log("gamlss","warning", toString(w))
399 |           })
400 |         
401 |       } else {
402 |         gamlss.fit <- NULL
403 |       }
404 |     } else if (family_gene == "nb"){
405 |       mgcv.fit <- withCallingHandlers(
406 |         tryCatch({
407 |           start.time <- Sys.time()
408 |           res <-fitfunc(formula = mgcv_formula, data = dat_use, family = "nb", discrete = usebam)
409 |           end.time <- Sys.time()
410 |           time <- as.numeric(end.time - start.time)
411 |           time_list[1] <- time
412 |           res
413 |         }, error=function(e) {
414 |           add_log("gam","error", toString(e))
415 |           NULL
416 |         }), warning=function(w) {
417 |           add_log("gam","warning", toString(w))
418 |         })
419 |       
420 |       if (sigma_formula != "~1") {
421 |         gamlss.fit <- withCallingHandlers(
422 |           tryCatch({
423 |             start.time = Sys.time()
424 |             res <- gamlss::gamlss(
425 |               formula = mu_formula,
426 |               sigma.formula = sigma_formula,
427 |               data = dat_use,
428 |               family = gamlss.dist::NBI,
429 |               control = gamlss::gamlss.control(trace = FALSE,  c.crit = 0.1)
430 |             )
431 |             end.time = Sys.time()
432 |             time = as.numeric(end.time - start.time)
433 |             time_list[2] <- time
434 |             res
435 |           }, error=function(e) {
436 |             add_log("gamlss","error", toString(e))
437 |             NULL
438 |           }), warning=function(w) {
439 |             add_log("gamlss","warning", toString(w))
440 |           })
441 |       } else {
442 |         gamlss.fit <- NULL
443 |       }
444 |     } else if (family_gene == "zip") {
445 |       mgcv.fit <- withCallingHandlers(
446 |         tryCatch({
447 |           start.time <- Sys.time()
448 |           res <-fitfunc(formula = mgcv_formula, data = dat_use, family = "poisson", discrete = usebam)
449 |           end.time <- Sys.time()
450 |           time <- as.numeric(end.time - start.time)
451 |           time_list[1] <- time
452 |           res
453 |         }, error=function(e) {
454 |           add_log("gam","error", toString(e))
455 |           NULL
456 |         }), warning=function(w) {
457 |           add_log("gam","warning", toString(w))
458 |         })
459 |       gamlss.fit <- withCallingHandlers(
460 |         tryCatch({
461 |           start.time = Sys.time()
462 |           res <- gamlss::gamlss(
463 |             formula = mu_formula,
464 |             sigma.formula = mu_formula, ## Here sigma is the dropout prob, not variance!
465 |             data = dat_use,
466 |             family = gamlss.dist::ZIP,
467 |             control = gamlss::gamlss.control(trace = FALSE, c.crit = 0.1)
468 |             
469 |           )
470 |           end.time = Sys.time()
471 |           time = as.numeric(end.time - start.time)
472 |           time_list[2] <- time
473 |           res
474 |         }, error=function(e) {
475 |           add_log("gamlss","error", toString(e))
476 |           NULL
477 |         }), warning=function(w) {
478 |           add_log("gamlss","warning", toString(w))
479 |         })
480 |       
481 |     } else if (family_gene == "zinb"){
482 |       mgcv.fit <- withCallingHandlers(
483 |         tryCatch({
484 |           start.time <- Sys.time()
485 |           res <- fitfunc(formula = mgcv_formula, data = dat_use, family = "nb", discrete = usebam)
486 |           end.time <- Sys.time()
487 |           time <- as.numeric(end.time - start.time)
488 |           time_list[1] <- time
489 |           res
490 |         }, error=function(e) {
491 |           add_log("gam","error", toString(e))
492 |           NULL
493 |         }), warning=function(w) {
494 |           add_log("gam","warning", toString(w))
495 |         })
496 |       gamlss.fit <- withCallingHandlers(
497 |         tryCatch({
498 |           start.time = Sys.time()
499 |           res <- gamlss::gamlss(
500 |             formula = mu_formula,
501 |             sigma.formula = sigma_formula,
502 |             nu.formula = mu_formula, ## Here nu is the dropout probability!
503 |             data = dat_use,
504 |             family = gamlss.dist::ZINBI,
505 |             control = gamlss::gamlss.control(trace = FALSE, c.crit = 0.1)
506 |           )
507 |           end.time = Sys.time()
508 |           time = as.numeric(end.time - start.time)
509 |           time_list[2] <- time
510 |           res
511 |         }, error=function(e) {
512 |           add_log("gamlss","error", toString(e))
513 |           NULL
514 |         }), warning=function(w) {
515 |           add_log("gamlss","warning", toString(w))
516 |         })
517 |       
518 |     } else {
519 |       stop("The regression distribution must be one of gaussian, poisson, nb, zip or zinb!")
520 |     }
521 |     
522 |     ## Check if gamlss is fitted.
523 |     if (!"gamlss" %in% class(gamlss.fit)) {
524 |       if (sigma_formula != "~1") {
525 |         message(paste0(gene, " uses mgcv::gam due to gamlss's error!"))
526 |         ## gamlss.fit contains warning message
527 |         if(!is.null(gamlss.fit)){
528 |           ## check whether gam has warning messages
529 |           if(is.null(warn)){
530 |             warn = gamlss.fit
531 |           }else{
532 |             warn = c(warn, gamlss.fit)
533 |           }
534 |         }
535 |       }
536 |       
537 |       fit <- mgcv.fit
538 |     } else {
539 |       
540 |       mean_vec <- stats::predict(gamlss.fit, type = "response", what = "mu", data = dat_use)
541 |       theta_vec <-
542 |         stats::predict(gamlss.fit, type = "response", what = "sigma", data = dat_use)
543 |       
544 |       if_infinite <- (sum(is.infinite(mean_vec + theta_vec)) > 0)
545 |       if_overmax <- (max(mean_vec, na.rm = TRUE) > 10* max(dat_use$gene, na.rm = TRUE))
546 |       if(family_gene %in% c("nb","zinb")){
547 |         #if_overdisp <- (min(theta_vec, na.rm = TRUE) < 1/ 1000)
548 |         if_overdisp <- (max(theta_vec, na.rm = TRUE) > 1000)
549 |         
550 |       }else{
551 |         if_overdisp <- FALSE
552 |       }
553 |       
554 |       
555 |       if (if_infinite | if_overmax | if_overdisp) {
556 |         add_log("fit_marginal","warning", paste0(gene, " gamlss returns abnormal fitting values!"))
557 |         #message(paste0(gene, " gamlss returns abnormal fitting values!"))
558 |         fit <- mgcv.fit
559 |       } else if (stats::AIC(mgcv.fit) - stats::AIC(gamlss.fit) < -Inf) {
560 |         message(paste0(
561 |           gene,
562 |           "'s gamlss AIC is not signifincantly smaller than gam!"
563 |         ))
564 |         fit <- mgcv.fit
565 |       }
566 |       else {
567 |         fit <- gamlss.fit
568 |       }
569 |     }
570 |     
571 |     if(simplify) {
572 |       fit <- simplify_fit(fit)
573 |     }
574 |     
575 |     if(trace){
576 |       return(list(fit = fit, warning = logs, time = time_list, removed_cell = remove_cell))
577 |     }
578 |     return(list(fit = fit,removed_cell = remove_cell))
579 |     #return(fit)
580 |   }
581 |   
582 |   paraFunc <- parallel::mcmapply
583 |   if(.Platform$OS.type == "windows"){
584 |     BPPARAM <- BiocParallel::SnowParam()
585 |     parallelization <- "bpmapply"
586 |   }
587 |   if(parallelization == "bpmapply"){
588 |     paraFunc <- BiocParallel::bpmapply
589 |   }
590 |   if(parallelization == "pbmcmapply"){
591 |     paraFunc <- pbmcapply::pbmcmapply
592 |   }
593 |   # If not using edf flexible fitting
594 |   if(edf_fitting==FALSE){
595 |     if(parallelization == "bpmapply"){
596 |       if(class(BPPARAM)[1] != "SerialParam"){
597 |         BPPARAM$workers <- n_cores
598 |       }
599 |       model_fit <- suppressMessages(paraFunc(fit_model_func, gene = feature_names,
600 |                                              family_gene = family_use,
601 |                                              MoreArgs = list(dat_use = dat_cov,
602 |                                                              #mgcv_formula = mgcv_formula,
603 |                                                              mu_formula = mu_formula,
604 |                                                              sigma_formula = sigma_formula,
605 |                                                              predictor = predictor,
606 |                                                              count_mat = count_mat),
607 |                                              SIMPLIFY = FALSE, BPPARAM = BPPARAM))
608 |     }else{
609 |       model_fit <-  suppressMessages(paraFunc(fit_model_func, gene = feature_names,
610 |                                               family_gene = family_use,
611 |                                               mc.cores = n_cores,
612 |                                               MoreArgs = list(dat_use = dat_cov,
613 |                                                               #mgcv_formula = mgcv_formula,
614 |                                                               mu_formula = mu_formula,
615 |                                                               sigma_formula = sigma_formula,
616 |                                                               predictor = predictor,
617 |                                                               count_mat = count_mat),
618 |                                               SIMPLIFY = FALSE))
619 |     }
620 |   }else{ 
621 |     # If using edf flexible fitting
622 |     
623 |     if(parallelization == "bpmapply"){
624 |       if(class(BPPARAM)[1] != "SerialParam"){
625 |         BPPARAM$workers <- n_cores
626 |       }
627 |       # Fit model to selected edf_gini_genes 
628 |       model_fit_edf_gini <- suppressMessages(paraFunc(fit_model_func, gene = edf_gini_feature_names,
629 |                                                       family_gene = edf_gini_family_use,
630 |                                                       MoreArgs = list(dat_use = dat_cov,
631 |                                                                       #mgcv_formula = mgcv_formula,
632 |                                                                       mu_formula = mu_formula,
633 |                                                                       sigma_formula = sigma_formula,
634 |                                                                       predictor = predictor,
635 |                                                                       count_mat = edf_gini_count_mat),
636 |                                                       SIMPLIFY = FALSE, BPPARAM = BPPARAM))
637 |     }else{
638 |       
639 |       # Fit model to selected edf_gini_genes
640 |       model_fit_edf_gini <-  suppressMessages(paraFunc(fit_model_func, gene = edf_gini_feature_names,
641 |                                                        family_gene = edf_gini_family_use,
642 |                                                        mc.cores = n_cores,
643 |                                                        MoreArgs = list(dat_use = dat_cov,
644 |                                                                        #mgcv_formula = mgcv_formula,
645 |                                                                        mu_formula = mu_formula,
646 |                                                                        sigma_formula = sigma_formula,
647 |                                                                        predictor = predictor,
648 |                                                                        count_mat = edf_gini_count_mat),
649 |                                                        SIMPLIFY = FALSE))
650 |     }
651 |     
652 |     
653 |     # Extract the fitted edf
654 |     edf <- rep(NA, length(model_fit_edf_gini))
655 |     for(i in 1:length(model_fit_edf_gini)){
656 |       res_ind <- model_fit_edf_gini[i]
657 |       if(lengths(res_ind)==2){
658 |         res_ind <- res_ind[[names(res_ind)]]
659 |         edf[i] <- sum(res_ind$fit$edf)
660 |       }
661 |     }
662 |     
663 |     # Fit a edf-gini relationship for edf_gini_genes
664 |     edf_gini_count_gini <- apply(log(edf_gini_count_mat+1), MARGIN=2, FUN=gini)
665 |     edf_gini_df <- data.frame(edf=edf, gini=edf_gini_count_gini)
666 |     lm_edf_gini <- stats::lm(edf~gini, data=edf_gini_df)
667 |     # Upper bound for the lm coef
668 |     #coef <- confint(lm_edf_gini)[,2]
669 |     
670 |     
671 |     # Predict edf for edf_flexible_genes
672 |     edf_flexible_count_gini <- apply(log(edf_flexible_count_mat+1), MARGIN=2, FUN=gini)
673 |     edf_flexible_df <- data.frame(gini=edf_flexible_count_gini)
674 |     edf_flexible_predicted <- stats::predict(lm_edf_gini, edf_flexible_df, se.fit = TRUE, interval = "confidence", level = 0.95)
675 |     edf_flexible_predicted_upr <- edf_flexible_predicted$fit[,3]
676 |     
677 |     
678 |     # Fit again for the rest genes
679 |     if(parallelization == "bpmapply"){
680 |       if(class(BPPARAM)[1] != "SerialParam"){
681 |         BPPARAM$workers <- n_cores
682 |       }
683 |       model_fit_edf_flexible <- suppressMessages(paraFunc(fit_model_func, gene = edf_flexible_feature_names,
684 |                                                           family_gene = edf_flexible_family_use,
685 |                                                           MoreArgs = list(dat_use = dat_cov,
686 |                                                                           #mgcv_formula = mgcv_formula,
687 |                                                                           mu_formula = mu_formula,
688 |                                                                           sigma_formula = sigma_formula,
689 |                                                                           predictor = predictor,
690 |                                                                           count_mat = edf_flexible_count_mat,
691 |                                                                           edf=edf_flexible_predicted_upr),
692 |                                                           SIMPLIFY = FALSE, BPPARAM = BPPARAM))
693 |     }else{
694 |       model_fit_edf_flexible <-  suppressMessages(paraFunc(fit_model_func, gene = edf_flexible_feature_names,
695 |                                                            family_gene = edf_flexible_family_use,
696 |                                                            mc.cores = n_cores,
697 |                                                            MoreArgs = list(dat_use = dat_cov,
698 |                                                                            #mgcv_formula = mgcv_formula,
699 |                                                                            mu_formula = mu_formula,
700 |                                                                            sigma_formula = sigma_formula,
701 |                                                                            predictor = predictor,
702 |                                                                            count_mat = edf_flexible_count_mat,
703 |                                                                            edf=edf_flexible_predicted_upr),
704 |                                                            SIMPLIFY = FALSE))
705 |     }
706 |     
707 |     
708 |     # Combine model_fit_edf_gini and model_fit_edf_flexible
709 |     model_fit <- vector(mode = "list", length = length(feature_names))
710 |     names(model_fit) <- feature_names
711 |     
712 |     # Populate the new list based on indices:
713 |     for (index in names(model_fit_edf_gini)) {
714 |       model_fit[[index]] <- model_fit_edf_gini[[index]]
715 |     }
716 |     for (index in names(model_fit_edf_flexible)) {
717 |       model_fit[[index]] <- model_fit_edf_flexible[[index]]
718 |     }
719 |   } 
720 |   
721 |   # if(!is.null(model_fit$warning)) {
722 |   #   #stop("Model has warning!")
723 |   #   model_fit <- model_fit$value
724 |   # }
725 |   return(model_fit)
726 | }
727 | 
728 | simplify_fit <- function(cm) {
729 |   ## This function is modified from https://win-vector.com/2014/05/30/trimming-the-fat-from-glm-models-in-r/
730 |   cm$y = c()
731 |   #cm$model = c()
732 |   
733 |   cm$residuals = c()
734 |   cm$fitted.values = c()
735 |   cm$effects = c()
736 |   cm$qr$qr = c()  
737 |   cm$linear.predictors = c()
738 |   cm$weights = c()
739 |   cm$prior.weights = c()
740 |   cm$data = c()
741 |   
742 |   #cm$mu.x = c()
743 |   #cm$sigma.x = c()
744 |   #cm$nu.x = c()
745 |   
746 |   #cm$family$variance = c()
747 |   #cm$family$dev.resids = c()
748 |   #cm$family$aic = c()
749 |   #cm$family$validmu = c()
750 |   #cm$family$simulate = c()
751 |   attr(cm$terms,".Environment") = c()
752 |   attr(cm$formula,".Environment") = c()
753 |   
754 |   attr(cm$mu.terms,".Environment") = c()
755 |   attr(cm$mu.formula,".Environment") = c()
756 |   
757 |   attr(cm$sigma.terms,".Environment") = c()
758 |   attr(cm$sigma.formula,".Environment") = c()
759 |   
760 |   attr(cm$nu.terms,".Environment") = c()
761 |   attr(cm$nu.formula,".Environment") = c()
762 |   cm
763 | }
764 | 
765 | ## Function from R package reldist by Dr. Mark S. Handcock      
766 | gini <- function(x, weights=rep(1,length=length(x))){
767 |   ox <- order(x)
768 |   x <- x[ox]
769 |   weights <- weights[ox]/sum(weights)
770 |   p <- cumsum(weights)
771 |   nu <- cumsum(weights*x)
772 |   n <- length(nu)
773 |   nu <- nu / nu[n]
774 |   sum(nu[-1]*p[-n]) - sum(nu[-n]*p[-1])
775 | }


--------------------------------------------------------------------------------
/R/gamlss_fix.R:
--------------------------------------------------------------------------------
  1 | #' Functions from gamlss/gamlss.add with bugs fixed
  2 | #'
  3 | #' An additive function to be used while fitting GAMLSS models. The interface for \code{gam()} in the \pkg{mgcv} package.
  4 | #' @section ga
  5 | #' @param formula A formula of the model.
  6 | #' @param envir The environment.
  7 | #' @param control The control of the model fitting.
  8 | #' @param ... Other arguments.
  9 | #' 
 10 | #' @return A xvar list.
 11 | #' 
 12 | #' @examples
 13 | #' print("No example")
 14 | #' 
 15 | #' @export
 16 | ga <- function(formula, envir, control = ga.control(...), ...)
 17 | {
 18 |   #------------------------------------------
 19 |   # function starts here
 20 |   #------------------------------------------
 21 |   scall <- deparse(sys.call(), width.cutoff = 500L)
 22 |   if (!methods::is(formula, "formula"))
 23 |     stop("formula argument in ga() needs a formula starting with ~")
 24 |   # get where "gamlss" is in system call, it can be in gamlss() or predict.gamlss()
 25 |   rexpr <- grepl("gamlss",sys.calls())
 26 |   #rexpr <- grepl("stats::model.frame.default", sys.calls()) ##
 27 | 
 28 | 
 29 | 
 30 |   for (i in length(rexpr):1) {
 31 |     position <- i # get the position
 32 |     if (rexpr[i] == TRUE)
 33 |       break
 34 |   }
 35 | 
 36 |   gamlss.env <- sys.frame(position) #gamlss or predict.gamlss
 37 |   #gamlss.env <- environment(formula)
 38 | 
 39 |   ## get the data
 40 |   ## this has been modified on the 12-12-14 to make sure that
 41 |   ##  if model.frame.gamlss() is used as for example in term.plot() the
 42 |   ## function does not fail (It need to implemented to all smoother using formula?)
 43 |   if (sys.call(position)[1] == "predict.gamlss()") {
 44 |     # if stats::predict is used
 45 |     Data <- get("data", envir = gamlss.env)
 46 |   } else if (sys.call(position)[1] == "gamlss()") {
 47 |     # if gamlss() is used
 48 |     if (is.null(get("gamlsscall", envir = gamlss.env)$data)) {
 49 |       # if no data argument but the formula can be interpreted
 50 |       Data <- stats::model.frame(formula)
 51 |     } else {
 52 |       # data argument in gamlss
 53 |       Data <- get("gamlsscall", envir = gamlss.env)$data
 54 |     }
 55 |   } else {
 56 |     Data <- get("data", envir = gamlss.env)
 57 |   }
 58 |   Data <- data.frame(base::eval(substitute(Data)))
 59 |   #-------------------------------------------------
 60 |   # new Daniil and Vlasis
 61 |   # Initialize gam
 62 |   formula <-
 63 |     stats::as.formula(paste0("Y.var", deparse(formula, width.cutoff = 500L)))
 64 | 
 65 |   Data$Y.var <- rep(0, nrow(Data))
 66 |   G <- mgcv::gam(
 67 |     formula,
 68 |     data = Data,
 69 |     offset = control$offset,
 70 |     method = control$method,
 71 |     optimizer = control$optimizer,
 72 |     control = control$control,
 73 |     scale =  control$scale,
 74 |     select = control$select,
 75 |     knots = control$knots,
 76 |     sp = control$sp,
 77 |     min.sp = control$min.sp,
 78 |     H = control$H,
 79 |     gamma = control$gamma,
 80 |     paraPen = control$paraPen,
 81 |     in.out = control$in.out,
 82 |     drop.unused.levels = control$drop.unused.levels,
 83 |     drop.intercept = control$drop.intercept,
 84 |     discrete = control$discrete,
 85 |     G = NULL,
 86 |     fit = FALSE
 87 |   )
 88 |   #--------------------------------------------------
 89 |   xvar <- rep(0,  dim(Data)[1])
 90 |   attr(xvar, "formula")     <- formula
 91 |   attr(xvar, "control")     <- control
 92 |   attr(xvar, "gamlss.env") <- gamlss.env
 93 |   attr(xvar, "data")       <- as.data.frame(Data)
 94 |   attr(xvar, "call")       <-
 95 |     substitute(gamlss.ga(data[[scall]], z, w, ...))
 96 |   attr(xvar, "class")      <- "smooth"
 97 |   attr(xvar, "G")          <- G
 98 |   xvar
 99 | }
100 | 
101 | #' Functions from gamlss/gamlss.add with bugs fixed
102 | #'
103 | #' An additive function to be used while fitting GAMLSS models. The interface for \code{bam()} in the \pkg{mgcv} package.
104 | #' @section ba
105 | #' @param formula A formula of the model.
106 | #' @param control The control of the model fitting.
107 | #' @param ... Other arguments.
108 | #' @return A xvar list.
109 | #' @examples
110 | #' print("No example")
111 | #' @export
112 | ba <-function(formula, control = ba.control(...), ...)
113 | {
114 |   #------------------------------------------
115 |   # function starts here
116 |   #------------------------------------------
117 |   scall <- Reduce(paste, deparse(sys.call(), width.cutoff = 500L))
118 |   if (!methods::is(formula, "formula"))
119 |     stop("formula argument in ba() needs a formula starting with ~")
120 |   # get where "gamlss" is in system call, it can be in gamlss() or predict.gamlss()
121 |   rexpr <- grepl("gamlss",sys.calls()) ##
122 |   #rexpr <- grepl("fitModel", sys.calls())
123 |   #rexpr <- grepl("stats::model.frame.default", sys.calls())
124 | 
125 |   for (i in length(rexpr):1) {
126 |     position <- i # get the position
127 |     if (rexpr[i]==TRUE) break
128 |   }
129 |   gamlss.env <- sys.frame(position) #gamlss or predict.gamlss
130 |   ## get the data
131 |   ## this has been modified on the 12-12-14 to make sure that
132 |   ##  if model.frame.gamlss() is used as for example in term.plot() the
133 |   ## function does not fail (It need to implemented to all smoother using formulea?)
134 |   if (sys.call(position)[1]=="predict.gamlss()") { # if stats::predict is used
135 |     Data <- get("data", envir=gamlss.env)
136 |   } else if (sys.call(position)[1]=="gamlss()") { # if gamlss() is used
137 |     if (is.null(get("gamlsscall", envir=gamlss.env)$data)) { # if no data argument but the formula can be interpreted
138 |       Data <- stats::model.frame(formula)
139 |     } else {# data argument in gamlss
140 |       Data <- get("gamlsscall", envir=gamlss.env)$data
141 |     }
142 |   } else {
143 |     Data <- get("data", envir=gamlss.env)
144 |   }
145 |   Data <- data.frame(eval(substitute(Data)))
146 |   #-------------------------------------------------
147 |   # new Daniil and Vlasis
148 |   # Initialize bam
149 |   formula <- stats::as.formula(paste0("Y.var", Reduce(paste,deparse(formula, width.cutoff = 500L)) ))
150 |   Data$Y.var = rep(0, nrow(Data))
151 |   #browser()
152 |   G = mgcv::bam(formula,
153 |                 data = Data,
154 |                 offset = control$offset,
155 |                 method = control$method,
156 |                 control = control$control,
157 |                 select = control$select,
158 |                 scale = control$scale,
159 |                 gamma = control$gamma,
160 |                 knots = control$knots,
161 |                 sp = control$sp,
162 |                 min.sp = control$min.sp,
163 |                 paraPen = control$paraPen,
164 |                 chunk.size = control$chunk.size,
165 |                 rho = control$rho,
166 |                 AR.start = control$AR.start,
167 |                 discrete = control$discrete,
168 |                 cluster = control$cluster,
169 |                 nthreads = control$nthreads,
170 |                 gc.level = control$gc.level,
171 |                 use.chol = control$use.chol,
172 |                 samfrac = control$samfrac,
173 |                 coef = control$coef,
174 |                 drop.unused.levels = control$drop.unused.levels,
175 |                 drop.intercept = control$drop.intercept,
176 |                 G = NULL,
177 |                 fit = FALSE
178 |   )
179 |   ##bam(formula, family=gaussian(),
180 |   ##      data=list()#, weights=NULL, subset=NULL,
181 |   #    na.action=na.omit, offset=NULL#, method="fREML"#,control=list()#,
182 |   #    select=FALSE#, scale=0#,gamma=1,knots=NULL,sp=NULL,min.sp=NULL,
183 |   #    paraPen=NULL,chunk.size=10000,rho=0,AR.start=NULL,discrete=FALSE,
184 |   #    cluster=NULL,nthreads=1,gc.level=1,use.chol=FALSE,samfrac=1,
185 |   #    coef=NULL,drop.unused.levels=TRUE,G=NULL,fit=TRUE,drop.intercept=NULL,...)
186 |   #
187 |   #--------------------------------------------------
188 |   xvar <- rep(0,  dim(Data)[1])
189 |   attr(xvar,"formula")     <- formula
190 |   attr(xvar,"control")     <- control
191 |   attr(xvar, "gamlss.env") <- gamlss.env
192 |   attr(xvar, "data")       <- as.data.frame(Data)
193 |   attr(xvar, "call")       <- substitute(gamlss.ba(data[[scall]], z, w, ...))
194 |   attr(xvar, "class")      <- "smooth"
195 |   attr(xvar, "G")          <- G
196 |   xvar
197 | }
198 | 
199 | #' Functions from gamlss/gamlss.add with bugs fixed
200 | #'
201 | #' The control for \code{ba()}. From \code{gamlss.add::ba.control()} and \code{gamlss::bam()}.
202 | #' @section ba.control
203 | #'
204 | #' @param offset The offset in the formula.
205 | #' @param method The method argument in \code{bam()}.
206 | #' @param control A list of fit control parameters to replace defaults returned by gam.control. Any control parameters not supplied stay at their default values.
207 | #' @param select The \code{select} argument in \code{bam()}. Determine should selection penalties be added to the smooth effects, so that they can in principle be penalized out of the model.
208 | #' @param scale For the scale parameter. If this is positive then it is taken as the known scale parameter. Negative signals that the scale parameter is unknown. 0 signals that the scale parameter is 1 for Poisson and binomial and unknown otherwise.
209 | #' @param gamma The \code{gamma} argument in \code{bam()}. Increase above 1 to force smoother fits.
210 | #' @param knots The \code{knots} argument in \code{bam()}. An optional list containing user specified knot values to be used for basis construction.
211 | #' @param sp The \code{sp} argument in \code{bam()}. A vector of smoothing parameters can be provided here.
212 | #' @param min.sp The \code{min.sp} argument in \code{bam()}. Lower bounds can be supplied for the smoothing parameters.
213 | #' @param paraPen The \code{paraPen} argument in \code{bam()}. Optional list specifying any penalties to be applied to parametric model terms.
214 | #' @param chunk.size The model matrix is created in chunks of this size, rather than ever being formed whole.
215 | #' @param rho An AR1 error model can be used for the residuals (based on dataframe order), of Gaussian-identity link models. This is the AR1 correlation parameter.
216 | #' @param AR.start Logical variable of same length as data, \code{TRUE} at first observation of an independent section of AR1 correlation.
217 | #' @param discrete With \code{method="fREML"} it is possible to discretize covariates for storage and efficiency reasons. If \code{discrete} is \code{TRUE}, a number or a vector of numbers for each smoother term, then discretization happens. If numbers are supplied they give the number of discretization bins.
218 | #' @param cluster \code{bam} can compute the computationally dominant QR decomposition in parallel using parLapply from the \code{parallel} package, if it is supplied with a cluster on which to do this (a cluster here can be some cores of a single machine).
219 | #' @param nthreads Number of threads to use for non-cluster computation (e.g. combining results from cluster nodes).
220 | #' @param gc.level To keep the memory footprint down, it can help to call the garbage collector often, but this takes a substatial amount of time. Setting this to zero means that garbage collection only happens when R decides it should. Setting to 2 gives frequent garbage collection. 1 is in between.
221 | #' @param use.chol By default \code{bam} uses a very stable QR update approach to obtaining the QR decomposition of the model matrix. For well conditioned models an alternative accumulates the crossproduct of the model matrix and then finds its Choleski decomposition, at the end. This is somewhat more efficient, computationally.
222 | #' @param samfrac For very large sample size Generalized additive models the number of iterations needed for the model fit can be reduced by first fitting a model to a random sample of the data, and using the results to supply starting values. This initial fit is run with sloppy convergence tolerances, so is typically very low cost. \code{samfrac} is the sampling fraction to use. 0.1 is often reasonable.
223 | #' @param coef Initial values for model coefficients.
224 | #' @param drop.unused.levels By default unused levels are dropped from factors before fitting. For some smooths involving factor variables you might want to turn this off.
225 | #' @param drop.intercept Set to \code{TRUE} to force the model to really not have the a constant in the parametric model part, even with factor variables present.
226 | #' @param ... Other arguments.
227 | #'
228 | #' @return A control object
229 | #' @noRd
230 | ba.control <- function(offset = NULL,
231 |                        method = "fREML",
232 |                        control = list(),
233 |                        select = FALSE,
234 |                        scale = 0,
235 |                        gamma = 1,
236 |                        knots = NULL,
237 |                        sp = NULL,
238 |                        min.sp = NULL,
239 |                        paraPen = NULL,
240 |                        chunk.size = 10000,
241 |                        rho = 0,
242 |                        AR.start = NULL,
243 |                        discrete = TRUE,
244 |                        cluster = NULL,
245 |                        nthreads = 2,
246 |                        gc.level = 1,
247 |                        use.chol = FALSE,
248 |                        samfrac = 1,
249 |                        coef = NULL,
250 |                        drop.unused.levels = TRUE,
251 |                        drop.intercept = NULL,
252 |                        ...)
253 | {
254 |   #gam()
255 |   control <- mgcv::gam.control(...)
256 |   #ga()
257 |   list( offset=offset, method=method, control=control,  select=select,
258 |         scale=scale, gamma=gamma, knots=knots, sp=sp, min.sp=min.sp,
259 |         paraPen = paraPen, chunk.size = chunk.size, rho = rho,
260 |         AR.start = AR.start,
261 |         discrete=discrete,  cluster=cluster, nthreads=nthreads,
262 |         gc.level=gc.level, use.chol=use.chol, samfrac=samfrac,
263 |         coef= coef,
264 |         drop.unused.levels = drop.unused.levels,
265 |         drop.intercept=drop.intercept, ...)
266 | }
267 | 
268 | #' Functions from gamlss/gamlss.add with bugs fixed
269 | #'
270 | #' The control for \code{ga()}. From \code{gamlss.add::ga.control()} and \code{gamlss::gam()}.
271 | #' @section ga.control
272 | #' @param offset The offset in the formula.
273 | #' @param method The smoothing parameter estimation method.
274 | #' @param optimizer An array specifying the numerical optimization method to use to optimize the smoothing parameter estimation criterion (given by \code{method})
275 | #' @param control A list of fit control parameters to replace defaults returned by \code{gam.control}.
276 | #' @param scale If this is positive then it is taken as the known scale parameter. Negative signals that the scale parameter is unknown. 0 signals that the scale parameter is 1 for Poisson and binomial and unknown otherwise.
277 | #' @param select If this is \code{TRUE} then \code{gam()} can add an extra penalty to each term so that it can be penalized to zero.
278 | #' @param knots This is an optional list containing user specified knot values to be used for basis construction.
279 | #' @param sp A vector of smoothing parameters can be provided here.
280 | #' @param min.sp Lower bounds can be supplied for the smoothing parameters.
281 | #' @param H A user supplied fixed quadratic penalty on the parameters of the GAM can be supplied, with this as its coefficient matrix.
282 | #' @param gamma Increase this beyond 1 to produce smoother models.
283 | #' @param paraPen Optional list specifying any penalties to be applied to parametric model terms.
284 | #' @param in.out Optional list for initializing outer iteration.
285 | #' @param drop.unused.levels By default unused levels are dropped from factors before fitting. For some smooths involving factor variables you might want to turn this off.
286 | #' @param drop.intercept Set to \code{TRUE} to force the model to really not have the a constant in the parametric model part, even with factor variables present. Can be vector when \code{formula} is a list.
287 | #' @param discrete Experimental option for setting up models for use with discrete methods employed in \code{bam}.
288 | #' @param ... Other arguments
289 | #'
290 | #' @return A control object
291 | #' @noRd
292 | ga.control <- function(offset = NULL,
293 |                        method = "REML",
294 |                        optimizer = c("outer","newton"),
295 |                        control = list(),
296 |                        scale = 0,
297 |                        select = FALSE,
298 |                        knots = NULL,
299 |                        sp = NULL,
300 |                        min.sp = NULL,
301 |                        H = NULL,
302 |                        gamma = 1,
303 |                        paraPen = NULL,
304 |                        in.out = NULL,
305 |                        drop.unused.levels = TRUE,
306 |                        drop.intercept = NULL,
307 |                        discrete = FALSE,
308 |                        ...)
309 | {
310 |   #gam()
311 |   control <- mgcv::gam.control(...)
312 |   #ga()
313 |   list(offset=offset, method=method, optimizer=optimizer, control=control,
314 |        scale= scale,
315 |        select=select, knots=knots, sp=sp, min.sp=min.sp, H=H, gamma=gamma,
316 |        paraPen=paraPen, in.out=in.out,   drop.unused.levels = drop.unused.levels,
317 |        drop.intercept=drop.intercept, discrete = discrete, ...)
318 | }
319 | 
320 | 
321 | 
322 | 
323 | 
324 | #' Functions from gamlss/gamlss.add with bugs fixed
325 | #'
326 | #' The gamlss versions of the generic function \code{model.frame}
327 | #' @section model.frame.gamlss
328 | #' @param formula A formula of the model.
329 | #' @param what For which parameter to extract the model.frame, terms or model.frame.
330 | #' @param parameter Equivalent to \code{what}.
331 | #' @param ... Other arguments.
332 | #'
333 | #' @return a vector or matrix of predicted values.
334 | #' @noRd
335 | model.frame.gamlss <- function(formula, what = c("mu", "sigma", "nu", "tau"), parameter = NULL, ...)
336 | {
337 |   object <- formula
338 |   dots <- list(...)
339 |   what <- if (!is.null(parameter)) {
340 |     match.arg(parameter, choices = c("mu", "sigma", "nu", "tau"))
341 |   } else match.arg(what)
342 |   Call <- object$call
343 |   parform <- stats::formula(object, what)
344 |   data <- if (!is.null(Call$data)) {
345 |     # problem here, as Call$data is .
346 |     #eval(Call$data)
347 |     # instead, this would work:
348 |     if(what == "mu") {
349 |       eval(Call$data, environment(formula$mu.terms))
350 |     }
351 |     else if (what == "sigma") {
352 |       eval(Call$data, environment(formula$sigma.terms))
353 |     } else if (what == "nu") {
354 |       eval(Call$data, environment(formula$nu.terms))
355 |     } else if (what == "tau") {
356 |       eval(Call$data, environment(formula$tau.terms))
357 |     }
358 |     # (there is no formula$terms, just mu.terms and sigma.terms)
359 |   } else {
360 |     environment(formula$terms)
361 |   }
362 |   Terms <- stats::terms(parform)
363 |   mf <- stats::model.frame(
364 |     Terms,
365 |     data,
366 |     xlev = object[[paste(what, "xlevels", sep = ".")]]
367 |   )
368 |   mf
369 | }
370 | 
371 | #' Support for Function ga()
372 | #'
373 | #'This is support for the  smoother functions \code{ga()} interfaces for Simon Wood's \code{gam()} functions from package \pkg{mgcv}. It is not intended to be called directly by users. From \code{gamlss.add::gamlss.ga}.
374 | #' @param x The explanatory variables
375 | #' @param y Iterative y variable
376 | #' @param w Iterative weights
377 | #' @param xeval If xeval=TRUE then prediction is used
378 | #' @param ... Other arguments
379 | #' @return Not used
380 | #' @examples
381 | #' print("No example")
382 | #' @export
383 | gamlss.ga <-function(x, y, w, xeval = NULL, ...) {
384 |   if (is.null(xeval))
385 |   {#fitting
386 |     #formula <- attr(x,"formula")
387 |     #control <- as.list(attr(x, "control"))
388 |     Y.var <- y
389 |     W.var <- w
390 |     G <- attr(x,"G")
391 |     G$y <- Y.var
392 |     G$w <- W.var
393 |     G$mf$Y.var <- Y.var
394 |     G$mf$`(weights)` <- W.var
395 |     fit <-  mgcv::gam(G=G, fit=TRUE)
396 |     df <- sum(ifelse(is.null(fit$edf2), yes = fit$edf, fit$edf2) + fit$edf1 - fit$edf)-1
397 |     fv <- stats::fitted(fit)
398 |     residuals <- y-fv
399 |     list(fitted.values=fv, residuals=residuals,
400 |          nl.df = df, lambda=fit$sp[1], #
401 |          coefSmo = fit, var=NA)    # var=fv has to fixed
402 |   } else { # predict
403 |     gamlss.env <- as.environment(attr(x, "gamlss.env"))
404 |     obj <- get("object", envir=gamlss.env ) # get the object from predict
405 |     TT <- get("TT", envir=gamlss.env ) # get wich position is now
406 |     SL <- get("smooth.labels", envir=gamlss.env) # all the labels of the smoother
407 |     fit <- eval(parse(text=paste("obj$", get("what", envir=gamlss.env), ".coefSmo[[",as.character(match(TT,SL)), "]]", sep="")))
408 |     OData <- attr(x,"data")
409 |     ll <- dim(OData)[1]
410 |     pred <- stats::predict(fit,newdata = OData[seq(length(y)+1,ll),])
411 |   }
412 | }
413 | 
414 | 
415 | #' Support for Function ba()
416 | #'
417 | #'This is support for the  smoother functions \code{ba()} interfaces for Simon Wood's \code{bam()} functions from package \pkg{mgcv}. It is not intended to be called directly by users. From \code{gamlss.add::gamlss.ba}.
418 | #' @param x The explanatory variables
419 | #' @param y Iterative y variable
420 | #' @param w Iterative weights
421 | #' @param xeval If xeval=TRUE then prediction is used
422 | #' @param ... Other arguments
423 | #' @return Not used
424 | #' @examples
425 | #' print("No example")
426 | #' @export
427 | gamlss.ba <-function(x, y, w, xeval = NULL, ...) {
428 |   if (is.null(xeval))
429 |   {#fitting
430 |     Y.var <- y
431 |     W.var <- w
432 |     G <- attr(x,"G")
433 |     control = attr(x,"control")
434 |     G$y <- Y.var
435 |     G$w <- W.var
436 |     G$mf$Y.var <- Y.var
437 |     G$mf$`(weights)` <- W.var
438 |     fit <-  mgcv::bam(G=G, fit=TRUE,
439 |                       offset=control$offset, method=control$method,
440 |                       control=control$control, select=control$select,
441 |                       scale=control$scale, gamma=control$gamma,
442 |                       knots=control$knots, sp=control$sp, min.sp=control$min.sp,
443 |                       paraPen=control$paraPen, chunk.size=control$chunk.size,
444 |                       rho=control$rho, AR.start=control$AR.start,
445 |                       discrete=control$discrete,
446 |                       cluster=control$cluster, nthreads=control$nthreads,
447 |                       gc.level=control$gc.level, use.chol=control$use.chol,
448 |                       samfrac=control$samfrac,
449 |                       drop.unused.levels=control$bam$drop.unused.levels)
450 |     df <- sum(ifelse(is.null(fit$edf2), yes = fit$edf, fit$edf2) + fit$edf1 - fit$edf)-1
451 |     fv <- stats::fitted(fit)
452 |     residuals <- y-fv
453 |     list( fitted.values=fv, residuals=residuals,
454 |           nl.df = df, lambda=fit$sp[1], #
455 |           coefSmo = fit, var=NA)    # var=fv has to fixed
456 |   } else { # predict
457 |     gamlss.env <- as.environment(attr(x, "gamlss.env"))
458 |     obj <- get("object", envir=gamlss.env ) # get the object from predict
459 |     TT <- get("TT", envir=gamlss.env ) # get wich position is now
460 |     SL <- get("smooth.labels", envir=gamlss.env) # all the labels of the smoother
461 |     fit <- eval(parse(text=paste("obj$", get("what", envir=gamlss.env), ".coefSmo[[",as.character(match(TT,SL)), "]]", sep="")))
462 |     OData <- attr(x,"data")
463 |     ll <- dim(OData)[1]
464 |     pred <- stats::predict(fit,newdata = OData[seq(length(y)+1,ll),])
465 |   }
466 | }
467 | 
468 | # Accessed from gamlss's github 04/10/2023
469 | #' @export
470 | predict.gamlss <- function(object,
471 |                            what = c("mu", "sigma", "nu", "tau"),
472 |                            parameter = NULL,
473 |                            newdata = NULL,
474 |                            type = c("link", "response", "terms"), # terms not working
475 |                            terms = NULL,
476 |                            se.fit = FALSE,
477 |                            data = NULL, ...)
478 | {
479 |   ## this little function put data frames together
480 |   ##  originated from an the R-help reply by B. Ripley
481 |   ##------------------------------------------------------------------------------
482 |   ##------------------------------------------------------------------------------
483 |   ##-------- concat starts here
484 |   concat <- function(..., names=NULL)
485 |   {
486 |     tmp <- list(...)
487 |     if(is.null(names)) names <- names(tmp)
488 |     if(is.null(names)) names <- sapply( as.list(match.call()), deparse)[-1]
489 |     if( any(
490 |       sapply(tmp, is.matrix)
491 |       |
492 |       sapply(tmp, is.data.frame) ) )
493 |     {
494 |       len <- sapply(tmp, function(x) c(dim(x),1)[1] )
495 |       len[is.null(len)] <- 1
496 |       data <- rbind( ... )
497 |     }
498 |     else
499 |     {
500 |       len <- sapply(tmp,length)
501 |       data <- unlist(tmp)
502 |     }
503 |     namelist <- factor(rep(names, len), levels=names)
504 |     return( data.frame( data, source=namelist) )
505 |   }
506 |   ##----------concat finish here
507 |   ##--------------------------------------------------------------------------------------------
508 |   ##--------------------------------------------------------------------------------------------
509 |   ##   main function starts here
510 |   ##----------------------------
511 |   ## If no new data just use lpred() and finish
512 |   if (is.null(newdata))  #
513 |   {
514 |     newdata = data
515 |     #predictor<- lpred(object, what = what, type = type, terms = terms, se.fit = se.fit, ... )
516 |     #return(predictor)
517 |     #newdata <- object$mu.x
518 |   }
519 |   ## at the moment se.fit is not supported for new data
520 |   if (se.fit)
521 |     warning(" se.fit = TRUE is not supported for new data values at the moment \n")
522 |   ##  stop if newdata is not data frame
523 |   ## note that atomic is not working here so better to take it out Mikis 23-10-13
524 |   ## if (!(is.atomic(newdata) | inherits(newdata, "data.frame")))
525 |   #if (!(inherits(newdata, "data.frame")))
526 |    # stop("newdata must be a data frame ") # or a frame mumber
527 |   ## getting which parameter and type
528 |   what <- if (!is.null(parameter))  {
529 |     match.arg(parameter, choices=c("mu", "sigma", "nu", "tau"))} else  match.arg(what)
530 |   type <- match.arg(type)
531 |   ## get the original call
532 |   Call <- object$call
533 |   ## we need both the old and the new data sets
534 |   ## the argument data can be provided by predict
535 |   data<-if (is.null(data))
536 |   {        ## if it is not provided then get it from the original call
537 |     if (!is.null(Call$data)) eval(Call$data)
538 |     else stop("define the original data using the option data")
539 |   }else{
540 |     data
541 |   }
542 |   # else data # if it provide get it
543 |   ## keep only the same variables
544 |   ## this assumes that all the relevant variables will be in newdata
545 |   ## what happens if not?
546 |   data <- data[,match(names(newdata),names(data)), drop=FALSE]
547 |   ## merge the two data together
548 |   data <- concat(data,newdata)
549 |   ## get the formula
550 |   parform <- stats::formula(object, what)# object[[paste(what, "formula", sep=".")]]
551 |   ## put response to NULL
552 |   if (length(parform)==3)
553 |     parform[2] <- NULL
554 |   ## define the terms
555 |   Terms <- terms(parform)
556 |   ## get the offset
557 |   offsetVar <- if (!is.null(off.num <- attr(Terms, "offset"))) # new
558 |     eval(attr(Terms, "variables")[[off.num + 1]], data)
559 |   ## model frame
560 |   if(is.null(environment(Terms))){
561 |     environment(Terms) <- globalenv()
562 |   }
563 |   m <- stats::model.frame(Terms, data, xlev = object[[paste(what,"xlevels",sep=".")]])
564 |   ## model design matrix y and w
565 |   X <- stats::model.matrix(Terms, data, contrasts = object$contrasts)
566 |   y <- object[[paste(what,"lp",sep=".")]]
567 |   w <- object[[paste(what,"wt",sep=".")]]
568 |   ## leave for future checks
569 |   #  aN <- dim(newdata)[1]
570 |   #zeros <- rep(0,aN)
571 |   #ones <- rep(1,aN)
572 |   #yaug <- as.vector(c(y,zeros))
573 |   #waug <- as.vector(c(w,zeros))
574 |   ## for keeping only the original data
575 |   onlydata <- data$source == "data" # TRUE or FALSE
576 |   ## whether additive terms are involved in the fitting
577 |   smo.mat <- object[[paste(what,"s",sep=".")]]
578 |   ## if offset take it out from fitting
579 |   if (!is.null(off.num))
580 |     y <- (y - offsetVar[onlydata])
581 |   ## if smoothing
582 |   if (!is.null(smo.mat))
583 |   {
584 |     n.smooths <- dim(smo.mat)[2]
585 |     y <- (y - smo.mat %*% rep(1, n.smooths))
586 |   }
587 |   ## refit the model
588 |   # refit <- lm.wfit(X[onlydata,  , drop = FALSE], y, w)
589 |   # ## ckeck the residuals if they are zero
590 |   # ##if (any(abs(resid(refit))>1e-005))
591 |   # if (abs(sum(resid(refit)))>1e-001||abs(sum(coef(object, what=what)-coef(refit), na.rm=TRUE))>1e-005)
592 |   #   warning(paste("There is a discrepancy  between the original and the re-fit",
593 |   #                 " \n used to achieve 'safe' predictions \n ", sep = "" ))
594 |   ## this is disturbing fit and refit have different coefficients  why?
595 |   ## fit <- lm.wfit(X, yaug, waug)
596 |   ## get the coefficients
597 |   coef <- coef(object, what=what)        ## save the coefficints
598 |   nX <- dimnames(X)        ## the names of rows and columns
599 |   rownames <- nX[[1]][!onlydata] ## only the newdata rows
600 |   nrows <- sum(!onlydata)     ## the number of rows in the new data
601 |   nac <- is.na(coef)        ## whether they are NA in coefficients
602 |   assign.coef <- attr(X, "assign")  ## X is a matrix
603 |   collapse <- type != "terms"## !collapse is for whether type is not "terms"
604 |   Xpred <- X[!onlydata,]
605 |   Xpred <- matrix(Xpred, nrow=nrows) # I think this probably is not needed sinse allready a matrix
606 |   # I will check this later
607 |   if (!collapse)       ## whether type=="terms"
608 |   {
609 |     aa <- attr(X, "assign")
610 |     ll <- attr(Terms, "term.labels")
611 |     if (attr(Terms, "intercept") > 0)  ll <- c("(Intercept)", ll)
612 |     aaa <- factor(aa, labels = ll)
613 |     asgn <- split(order(aa), aaa)
614 |     hasintercept <- attr(Terms, "intercept") > 0
615 |     fit_qr <- object[[paste0(what,".qr")]]
616 |     p <- fit_qr$rank
617 |     p1 <- seq(len = p)
618 |     piv <- fit_qr$pivot[p1]
619 |     if (hasintercept)
620 |     {
621 |       asgn$"(Intercept)" <- NULL
622 |       avx <- colMeans(X[onlydata, ])
623 |       termsconst <- sum(avx[piv] * coef[piv])
624 |     }
625 |     # TT <- sum(onlydata)
626 |     # xbar <- drop(array(1/TT, c(1, TT)) %*% X[onlydata, !nac])
627 |     nterms <- length(asgn)
628 |     #    if (nterms > 0)
629 |     # define the prediction matrix
630 |     pred <- matrix(ncol = nterms, nrow = nrows)
631 |     dimnames(pred) <- list(rownames(newdata), names(asgn))
632 |     #          if (se.fit )
633 |     #          {
634 |     #              ip <- matrix(ncol = nterms, nrow = NROW(X))
635 |     #              dimnames(ip) <- list(rownames(X), names(asgn))
636 |     #              Rinv <- qr.solve(qr.R(obj[[paste(what,"qr",sep=".")]])[p1, p1])
637 |     #          }
638 |     if (hasintercept)
639 |       Xpred <- sweep(Xpred, 2, avx)
640 |     unpiv <- rep.int(0, NCOL(Xpred))
641 |     unpiv[piv] <- p1
642 |     for (i in seq(1, nterms, length = nterms))
643 |     {
644 |       iipiv <- asgn[[i]]
645 |       ii <- unpiv[iipiv]
646 |       iipiv[ii == 0] <- 0
647 |       pred[, i] <- if (any(iipiv > 0)) # ms Thursday, May 1, 2008 at 10:12
648 |         Xpred[, iipiv, drop = FALSE] %*% coef[iipiv]
649 |       else 0
650 |       #              if (se.fit )
651 |       #                ip[, i] <- if (any(iipiv > 0))
652 |       #                  as.matrix(X[, iipiv, drop = FALSE] %*% Rinv[ii,
653 |       #                    , drop = FALSE])^2 %*% rep.int(1,p)
654 |       #                else 0
655 |     }
656 |     attr(pred, "constant") <- if (hasintercept)  termsconst
657 |     else 0
658 |     #   Xpred <- Xpred - outer(rep(1, sum(!onlydata)), avx)
659 |     if (!is.null(terms))
660 |     {
661 |       pred <- pred[, terms, drop = FALSE]
662 |       #   if (se.fit)
663 |       #     ip <- ip[, terms, drop = FALSE]
664 |     }
665 |   } ## end if for terms
666 |   else  ## if type is not terms but "link" or "response"
667 |   {
668 |     pred <- drop(Xpred[, !nac, drop = FALSE] %*% coef[!nac])
669 |     if (!is.null(off.num) && collapse)
670 |       pred <- pred + offsetVar[!onlydata]
671 |   }
672 |   ##
673 |   ## now the smoothing part
674 |   ##
675 |   if (!is.null(smo.mat))
676 |   {
677 |     #cat("new prediction", "\n")
678 |     smooth.labels <- dimnames(smo.mat)[[2]]       ## getting the labels i.e. "pb(Fl)" "pb(A)"
679 |     pred.s <- array(0, c(nrows, n.smooths), list(names(pred),
680 |                                                  dimnames(smo.mat)[[2]])) ## creating the prediction matrix
681 |     # smooth.labels[smooth.labels%in%colnames(X)]
682 |     # smooth.wanted <- smooth.labels[match(smooth.labels, colnames(X), 0) > 0]
683 |     ## getting the smoothing call
684 |     smooth.calls <- lapply(m[smooth.labels], attr, "call") # i.e $`pb(Fl)`
685 |     #     gamlss.pb(data[["pb(Fl)"]], z, w)
686 |     data <- subset(m, onlydata, drop=FALSE)        ## get the  original data
687 |     attr(data, "class") <- NULL                                   ## note that m is the data.frame with all data
688 |     new.m <- subset(m, !onlydata, drop=FALSE)       ## get the new data
689 |     attr(new.m, "class") <- NULL
690 |     residuals <-  if (!is.null(off.num)) object[[paste(what,"wv",sep=".")]] - object[[paste(what,"lp",sep=".")]]+offsetVar[onlydata]
691 |     else object[[paste(what,"wv",sep=".")]] - object[[paste(what,"lp",sep=".")]]
692 |     for(TT in smooth.labels)
693 |     {
694 |       if (is.matrix(m[[TT]])) # the problem is that for some smoother the m[[TT]] is a matrix (for example pvc())
695 |       { # MS 27-6-11         # in this case  we have to protect the dim attributes of data[[tt]]
696 |         nm <- names(attributes(m[[TT]])) # first we get the names of all attributes
697 |         attributes(data[[TT]]) <- attributes(m[[TT]])[nm[-c(1,2)]]# then we pass all but
698 |       }                                 # 1 and 2 i.e. dim and names
699 |       else   attributes(data[[TT]]) <- attributes(m[[TT]])
700 |       Call <- smooth.calls[[TT]] #
701 |       Call$xeval <- substitute(new.m[[TT]], list(TT = TT))
702 |       z <- residuals + smo.mat[, TT]
703 |       # debug(gamlss.pvc)
704 |       pred.s[, TT] <- eval(Call)
705 |     }
706 |     if(type == "terms")
707 |     {
708 |       # pred[, smooth.wanted] <- pred[, smooth.wanted] + pred.s[, smooth.wanted]
709 |       pred[, smooth.labels] <- pred[, smooth.labels] + pred.s[, smooth.labels]
710 |     }
711 |     else pred <- drop(pred + pred.s %*% rep(1, n.smooths))
712 |   }
713 |   if(type == "response")
714 |   {
715 |     FAM <- eval(object$call$family)#
716 |     if (!methods::is(FAM,"gamlss.family"))
717 |     {
718 |       FAM <- stats::family(object)[1]
719 |     }
720 |     # else
721 |     # {
722 |     FAM <- gamlss.dist::as.gamlss.family(FAM)# this should get a gamlss family but not alway
723 |     pred <- FAM[[paste0(what,".linkinv")]](pred)
724 |   }
725 |   names(pred) <- rownames(newdata)
726 |   pred
727 | }
728 | 
729 | 
730 | 
731 | 
732 | 


--------------------------------------------------------------------------------
/R/perform_lrt.R:
--------------------------------------------------------------------------------
  1 | #' Perform the likelihood ratio test
  2 | #'
  3 | #' \code{perform_lrt} performs the likelihood ratio test to compare two list of marginal models.
  4 | #'
  5 | #' The function takes two lists of marginal models (by default, the first list is the alternative and the second is the null)
  6 | #' from \code{\link{fit_marginal}}. Note that LRT only makes sense for NESTED models. This can be quite tricky if you use penalized-splines (e.g., for trajectory data).
  7 | #'
  8 | #' @param alter_marginal A list of marginal models from the alternative hypothesis.
  9 | #' @param null_marginal A list of marginal models from the null hypothesis. It must be strictly nested in the alternative model.
 10 | #'
 11 | #' @return A data.frame of the LRT result.
 12 | #' @examples
 13 | #' data(example_sce)
 14 | #' my_data <- construct_data(
 15 | #' sce = example_sce,
 16 | #' assay_use = "counts",
 17 | #' celltype = "cell_type",
 18 | #' pseudotime = "pseudotime",
 19 | #' spatial = NULL,
 20 | #' other_covariates = NULL,
 21 | #' corr_by = "cell_type"
 22 | #' )
 23 | #'
 24 | #' my_data2 <- construct_data(
 25 | #'   sce = example_sce,
 26 | #'   assay_use = "counts",
 27 | #'   celltype = "cell_type",
 28 | #'   pseudotime = "pseudotime",
 29 | #'   spatial = NULL,
 30 | #'   other_covariates = NULL,
 31 | #'   corr_by = "pseudotime",
 32 | #'   ncell = 10000
 33 | #' )
 34 | #'
 35 | #' my_marginal1 <- fit_marginal(
 36 | #'   data = my_data,
 37 | #'   mu_formula = "1",
 38 | #'   sigma_formula = "1",
 39 | #'   family_use = "nb",
 40 | #'   n_cores = 1,
 41 | #'   usebam = FALSE
 42 | #' )
 43 | #' my_marginal2 <- fit_marginal(
 44 | #'   data = my_data,
 45 | #'   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 46 | #'   sigma_formula = "1",
 47 | #'   family_use = "nb",
 48 | #'   n_cores = 1,
 49 | #'   usebam = FALSE
 50 | #' )
 51 | #' my_fit1 <- lapply(my_marginal1, function(x)x$fit)
 52 | #' my_fit2 <- lapply(my_marginal2, function(x)x$fit)
 53 | #' my_pvalue <- perform_lrt(my_fit2, my_fit1)
 54 | #'
 55 | #' @export perform_lrt
 56 | 
 57 | perform_lrt <- function(alter_marginal,
 58 |                         null_marginal) {
 59 | 
 60 |   p_tab <- mapply(function(fit1, fit2) {
 61 |     ## Check if same models for fit1/2
 62 |     sm <- identical(class(fit1), class(fit2))
 63 | 
 64 |     ## get predicted probabilities for both models
 65 |     m1y <- fit1$y
 66 |     m2y <- fit2$y
 67 |     m1n <- length(m1y)
 68 |     m2n <- length(m2y)
 69 |     if(m1n==0 | m2n==0)
 70 |       stop("Could not extract dependent variables from models.")
 71 | 
 72 |     if(m1n != m2n)
 73 |       stop(paste("Models appear to have different numbers of observations.\n",
 74 |                  "Model 1 has ",m1n," observations.\n",
 75 |                  "Model 2 has ",m2n," observations.\n",
 76 |                  sep="")
 77 |       )
 78 | 
 79 |     if(any(m1y != m2y)){
 80 |       stop(paste("Models appear to have different values on dependent variables.\n"))
 81 |     }
 82 | 
 83 |     ## gather up degrees of freedom
 84 |     if(methods::is(fit1, "gamlss")) {
 85 |       k1 <- fit1$df.fit
 86 |     } else if (methods::is(fit1, "gam"))
 87 |     {
 88 |       k1 <- sum(fit1$edf2 + fit1$edf1 - fit1$edf)
 89 |     } else {
 90 |       stop("Model must be either gamlss or mgcv::gam!")
 91 |     }
 92 | 
 93 |     if(methods::is(fit2, "gamlss")) {
 94 |       k2 <- fit2$df.fit
 95 |     } else if (methods::is(fit2, "gam"))
 96 |     {
 97 |       k2 <- sum(fit2$edf2 + fit2$edf1 - fit2$edf)
 98 |     } else {
 99 |       stop("Model must be either gamlss or mgcv::gam!")
100 |     }
101 | 
102 |     #k1 <- length(coef(m1))
103 |     #k2 <- length(coef(m2))
104 | 
105 |     ll1 <- stats::logLik(fit1)
106 |     ll2 <- stats::logLik(fit2)
107 | 
108 |     sign_k <- sign(k1 - k2)
109 |     lr <- -2*(ll2 - ll1)
110 | 
111 |     llk1 <- ll1/k1
112 |     llk2 <- ll1/k2
113 | 
114 |     lrt.p <- ifelse(lr > 0, stats::pchisq(lr, (k1-k2), lower.tail = FALSE), NA)
115 | 
116 |     res <- c(sm, ll1, ll2, k1, k2, lrt.p)
117 |     names(res) <- c("same_model", "LogLik_alter", "LogLik_null", "df_alter", "df_null", "p_value")
118 |     return(res)
119 |   }, fit1 = alter_marginal, fit2 = null_marginal)
120 |   p_tab <- as.data.frame(t(p_tab))
121 | 
122 |   return(p_tab)
123 | }
124 | 


--------------------------------------------------------------------------------
/R/plot_reduceddim.R:
--------------------------------------------------------------------------------
  1 | #' Dimensionality reduction and visualization
  2 | #' 
  3 | #' \code{plot_reduceddim} performs the dimensionality reduction
  4 | #' 
  5 | #' This function takes a reference sce and a list of new sces, performs the dimensionality reduction on the reference data, 
  6 | #' projects the synthetic datasets on the same low dimensional space,
  7 | #' then visualize the results.
  8 | #'
  9 | #' @param ref_sce The reference sce.
 10 | #' @param sce_list A list of synthetic sce.
 11 | #' @param name_vec A vector of the names of each dataset. The length should be \code{length(sce_list) + 1}, where the first name is for \code{ref_sce}.
 12 | #' @param assay_use A string which indicates the assay you will use in the sce.
 13 | #' Default is 'logcounts'.
 14 | #' @param n_pc An integer of the number of PCs.
 15 | #' @param pc_umap A logic value of whether using PCs as the input of UMAP. Default is TRUE.
 16 | #' @param center A logic value of whether centering the data before PCA. Default is TRUE.
 17 | #' @param scale. A logic value of whether scaling the data before PCA. Default is TRUE.
 18 | #' @param if_plot A logic value of whether returning the plot. If FALSE, return the reduced dimensions of each dataset.
 19 | #' @param shape_by A string which indicates the column in \code{colData} used for shape.
 20 | #' @param color_by A string which indicates the column in \code{colData} used for color.
 21 | #' @param point_size A numeric value of the point size in the final plot. Default is 1.
 22 | #'
 23 | #' @return The ggplot or the data.frame of reduced dimensions.
 24 | #'
 25 | #' @import ggplot2
 26 | #'
 27 | #' @export plot_reduceddim
 28 | 
 29 | plot_reduceddim <- function(ref_sce,
 30 |                             sce_list,
 31 |                             name_vec,
 32 |                             assay_use = "logcounts",
 33 |                             pc_umap = TRUE,
 34 |                             n_pc = 50,
 35 |                             center = TRUE,
 36 |                             scale. = TRUE,
 37 |                             if_plot = TRUE,
 38 |                             shape_by = NULL,
 39 |                             color_by,
 40 |                             point_size = 1) {
 41 | 
 42 |   Method <- NULL ## Avoid check note.
 43 |   stopifnot(length(name_vec) == (length(sce_list) + 1))
 44 | 
 45 |   mat_ref <- t(as.matrix(SummarizedExperiment::assay(ref_sce, assay_use)))
 46 | 
 47 |   if(sum(matrixStats::colVars(mat_ref) == 0) > 0) {
 48 |     stop("The ref dataset contains 0 variance features. Please remove them.")
 49 |   }
 50 | 
 51 |   if(!is.null(shape_by)){
 52 |     if(!(shape_by %in% colnames(SummarizedExperiment::colData(ref_sce)))) {
 53 |       stop("The shape_by in not in your ref_sce's colData. Please double check the variable name for shape_by.")
 54 |     }
 55 |     shape_by_check <- sapply(sce_list, function(x){shape_by %in% colnames(SummarizedExperiment::colData(x))})
 56 |     if(!all(shape_by_check)){
 57 |       stop("The shape_by in not in your sce_list's colData. Please double check the variable name for shape_by.")
 58 |     }
 59 |   }
 60 | 
 61 |   mat_list <- lapply(sce_list, function(x){
 62 |     mat <- t(as.matrix(SummarizedExperiment::assay(x, assay_use)))
 63 |     mat
 64 |   })
 65 | 
 66 |   ref_pca_fit <- irlba::prcomp_irlba(mat_ref,
 67 |                                      center = center,
 68 |                                      scale. = scale.,
 69 |                                      n = n_pc)
 70 |   ref_pca <- ref_pca_fit$x
 71 |   if(pc_umap) {
 72 |     ref_umap_fit <- umap::umap(ref_pca_fit$x)
 73 |   } else {
 74 |     ref_umap_fit <- umap::umap(mat_ref)
 75 |   }
 76 |   ref_umap <- ref_umap_fit$layout
 77 |   colnames(ref_umap) <- c("UMAP1", "UMAP2")
 78 | 
 79 |   SingleCellExperiment::reducedDim(ref_sce, "PCA") <- ref_pca
 80 |   SingleCellExperiment::reducedDim(ref_sce, "UMAP") <- ref_umap
 81 | 
 82 |   sce_list <- lapply(sce_list, function(x) {
 83 |     mat <- t(as.matrix(SummarizedExperiment::assay(x, assay_use)))
 84 |     SingleCellExperiment::reducedDim(x, "PCA") <- stats::predict(ref_pca_fit, newdata = mat)
 85 |     if(pc_umap) {
 86 |       res <- stats::predict(object = ref_umap_fit, data = SingleCellExperiment::reducedDim(x, "PCA"))
 87 |     } else {
 88 |       res <- stats::predict(object = ref_umap_fit, data = mat)
 89 |     }
 90 |      colnames(res) <- c("UMAP1", "UMAP2")
 91 |     SingleCellExperiment::reducedDim(x, "UMAP") <- res
 92 |     return(x)
 93 |   })
 94 | 
 95 |   sce_list_new <- c(list(ref_sce), sce_list)
 96 |   names(sce_list_new) <- name_vec
 97 | 
 98 |   rd_list <- lapply(sce_list_new, function(x) {
 99 |     rd <- tibble::as_tibble(SummarizedExperiment::colData(x))
100 |     if(is.null(shape_by)){
101 |       rd <- dplyr::select(rd, color_by)
102 |     }else{
103 |       rd <- dplyr::select(rd, c(color_by,shape_by))
104 |     }
105 | 
106 | 
107 |     rd_pca <- tibble::as_tibble(SingleCellExperiment::reducedDim(x, "PCA"))
108 |     rd_umap <- tibble::as_tibble(SingleCellExperiment::reducedDim(x, "UMAP"))
109 |     rd <- dplyr::bind_cols(rd, rd_pca)
110 |     rd <- dplyr::bind_cols(rd, rd_umap)
111 |     rd
112 |   })
113 | 
114 |   names(rd_list) <- names(sce_list_new)
115 | 
116 |   rd_tbl <- dplyr::bind_rows(rd_list, .id = "Method")
117 |   rd_tbl <- dplyr::mutate(rd_tbl, Method = factor(Method, levels = name_vec))
118 | 
119 |   if(if_plot) {
120 | 
121 |     p_pca <- ggplot(rd_tbl, aes_string(x = "PC1", y = "PC2", color = color_by)) +
122 |       geom_point(alpha = 0.5, size = point_size, aes_string(shape = shape_by)) +
123 |       facet_wrap(~Method, nrow = 1) +
124 |       theme(aspect.ratio = 1, legend.position = "bottom") 
125 |       
126 |     p_umap <- ggplot(rd_tbl, aes_string(x = "UMAP1", y = "UMAP2", color = color_by)) +
127 |       geom_point(alpha = 0.5, size = point_size, aes_string(shape = shape_by)) +
128 |       facet_wrap(~Method, nrow = 1) +
129 |       theme(aspect.ratio = 1, legend.position = "bottom") 
130 |       
131 | 
132 |     if(is.numeric(unlist(rd_tbl[, color_by]))) {
133 |       p_pca <- p_pca + viridis::scale_color_viridis()
134 |       p_umap <- p_umap + viridis::scale_color_viridis()
135 |     }else{
136 |       p_pca <- p_pca + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
137 |       p_umap <- p_umap + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
138 |     }
139 | 
140 |     return(list(p_pca = p_pca, p_umap = p_umap))
141 | 
142 |   } else {
143 |     return(rd_tbl)
144 |   }
145 | 
146 | }
147 | 


--------------------------------------------------------------------------------
/R/scdesign3.R:
--------------------------------------------------------------------------------
  1 | #' The wrapper for the whole scDesign3 pipeline
  2 | #'
  3 | #' \code{scdesign3} takes the input data, fits the model and
  4 | #'
  5 | #' @param sce A \code{SingleCellExperiment} object.
  6 | #' @param assay_use A string which indicates the assay you will use in the sce. Default is 'counts'.
  7 | #' @param celltype A string of the name of cell type variable in the \code{colData} of the sce. Default is 'cell_type'.
  8 | #' @param pseudotime A string or a string vector of the name of pseudotime and (if exist)
  9 | #' multiple lineages. Default is NULL.
 10 | #' @param spatial A length two string vector of the names of spatial coordinates. Default is NULL.
 11 | #' @param other_covariates A string or a string vector of the other covariates you want to include in the data.
 12 | #' @param ncell The number of cell you want to simulate. Default is \code{dim(sce)[2]} (the same number as the input data).
 13 | #' @param mu_formula A string of the mu parameter formula
 14 | #' @param sigma_formula A string of the sigma parameter formula
 15 | #' @param family_use A string of the marginal distribution.
 16 | #' Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.
 17 | #' @param n_cores An integer. The number of cores to use.
 18 | #' @param correlation_function A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.
 19 | #' @param usebam A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting.
 20 | #' @param edf_flexible A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE.
 21 | #' @param corr_formula A string of the correlation structure.
 22 | #' @param empirical_quantile Please only use it if you clearly know what will happen! A logic variable. If TRUE, DO NOT fit the copula and use the EMPIRICAL CDF values of the original data; it will make the simulated data fixed (no randomness). Default is FALSE. Only works if ncell is the same as your original data.
 23 | #' @param copula A string of the copula choice. Must be one of 'gaussian' or 'vine'. Default is 'gaussian'. Note that vine copula may have better modeling of high-dimensions, but can be very slow when features are >1000.
 24 | #' @param if_sparse A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse.
 25 | #' @param fastmvn An logical variable. If TRUE, the sampling of multivariate Gaussian is done by \code{mvnfast}, otherwise by \code{mvtnorm}. Default is FALSE. It only matters for Gaussian copula.
 26 | #' @param DT A logic variable. If TRUE, perform the distributional transformation
 27 | #' to make the discrete data 'continuous'. This is useful for discrete distributions (e.g., Poisson, NB).
 28 | #' Default is TRUE. Note that for continuous data (e.g., Gaussian), DT does not make sense and should be set as FALSE.
 29 | #' @param pseudo_obs A logic variable. If TRUE, use the empirical quantiles instead of theoretical quantiles for fitting copula.
 30 | #' Default is FALSE.
 31 | #' @param family_set A string or a string vector of the bivariate copula families. Default is c("gauss", "indep"). For more information please check package \code{rvinecoplib}.
 32 | #' @param important_feature A numeric value or vector which indicates whether a gene will be used in correlation estimation or not. If this is a numeric value, then
 33 | #' gene with zero proportion greater than this value will be excluded form gene-gene correlation estimation. If this is a vector, then this should
 34 | #' be a logical vector with length equal to the number of genes in \code{sce}. \code{TRUE} in the logical vector means the corresponding gene will be included in
 35 | #' gene-gene correlation estimation and \code{FALSE} in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation.
 36 | #' The default value is "all" (a special string which means no filtering).
 37 | #' @param nonnegative A logical variable. If TRUE, values < 0 in the synthetic data will be converted to 0. Default is TRUE (since the expression matrix is nonnegative).
 38 | #' @param nonzerovar A logical variable. If TRUE, for any gene with zero variance, a cell will be replaced with 1. This is designed for avoiding potential errors, for example, PCA. Default is FALSE.
 39 | #' @param return_model A logic variable. If TRUE, the marginal models and copula models will be returned. Default is FALSE.
 40 | #' @param simplify A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}, otherwise the fitted models can be VERY large. Default is FALSE.
 41 | #' @param parallelization A string indicating the specific parallelization function to use.
 42 | #' @param n_rep An integer number. The number of replicates of simulated new count matrix. Default is 1.
 43 | #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 44 | #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
 45 | #' @param BPPARAM A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 46 | #' this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 47 | #' \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.
 48 | #' @param trace A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss
 49 | #' will be returned, FALSE otherwise. Default is FALSE.
 50 | #' @return A list with the components:
 51 | #' \describe{
 52 | #'   \item{\code{new_count}}{A matrix of the new simulated count (expression) matrix.}
 53 | #'   \item{\code{new_covariate}}{A data.frame of the new covariate matrix.}
 54 | #'   \item{\code{model_aic}}{The model AIC.}
 55 | #'   \item{\code{marginal_list}}{A list of marginal regression models if return_model = TRUE.}
 56 | #'   \item{\code{corr_list}}{A list of correlation models (conditional copulas) if return_model = TRUE.}
 57 | #' }
 58 | #' @examples
 59 | #' data(example_sce)
 60 | #' my_simu <- scdesign3(
 61 | #' sce = example_sce,
 62 | #' assay_use = "counts",
 63 | #' celltype = "cell_type",
 64 | #' pseudotime = "pseudotime",
 65 | #' spatial = NULL,
 66 | #' other_covariates = NULL,
 67 | #' mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 68 | #' sigma_formula = "1",
 69 | #' family_use = "nb",
 70 | #' n_cores = 2,
 71 | #' usebam = FALSE,
 72 | #' edf_flexible = FALSE,
 73 | #' corr_formula = "pseudotime",
 74 | #' copula = "gaussian",
 75 | #' if_sparse = TRUE,
 76 | #' DT = TRUE,
 77 | #' pseudo_obs = FALSE,
 78 | #' ncell = 1000,
 79 | #' return_model = FALSE
 80 | #' )
 81 | #'
 82 | #' @export scdesign3
 83 | scdesign3 <- function(sce,
 84 |                       assay_use = "counts",
 85 |                       celltype,
 86 |                       pseudotime = NULL,
 87 |                       spatial = NULL,
 88 |                       other_covariates,
 89 |                       ncell = dim(sce)[2],
 90 |                       mu_formula,
 91 |                       sigma_formula = "1",
 92 |                       family_use = "nb",
 93 |                       n_cores = 2,
 94 |                       correlation_function = "default",
 95 |                       usebam = FALSE,
 96 |                       edf_flexible = FALSE,
 97 |                       corr_formula,
 98 |                       empirical_quantile = FALSE,
 99 |                       copula = "gaussian",
100 |                       if_sparse = FALSE,
101 |                       fastmvn = FALSE,
102 |                       DT = TRUE,
103 |                       pseudo_obs = FALSE,
104 |                       family_set = c("gauss", "indep"),
105 |                       important_feature = "all",
106 |                       nonnegative = TRUE,
107 |                       nonzerovar = FALSE,
108 |                       return_model = FALSE,
109 |                       simplify = FALSE,
110 |                       parallelization = "mcmapply",
111 |                       n_rep = 1,
112 |                       BPPARAM = NULL,
113 |                       trace = FALSE) {
114 |   message("Input Data Construction Start")
115 | 
116 |   input_data <- construct_data(
117 |     sce = sce,
118 |     assay_use = assay_use,
119 |     celltype = celltype,
120 |     pseudotime = pseudotime,
121 |     spatial = spatial,
122 |     other_covariates = other_covariates,
123 |     ncell = ncell,
124 |     corr_by = corr_formula,
125 |     parallelization = parallelization,
126 |     BPPARAM = BPPARAM
127 |   )
128 |   message("Input Data Construction End")
129 | 
130 |   message("Start Marginal Fitting")
131 |   marginal_res <- fit_marginal(
132 |     mu_formula = mu_formula,
133 |     sigma_formula = sigma_formula,
134 |     n_cores = n_cores,
135 |     data = input_data,
136 |     family_use = family_use,
137 |     usebam = usebam,
138 |     edf_flexible = edf_flexible,
139 |     parallelization = parallelization,
140 |     BPPARAM = BPPARAM,
141 |     trace = trace, 
142 |     simplify = simplify
143 |   )
144 |   message("Marginal Fitting End")
145 | 
146 |   if(empirical_quantile == TRUE) {
147 |     message("Extract Empirical Quantile Matrices")
148 |     copula_res <- fit_copula(
149 |       sce = sce,
150 |       assay_use = assay_use,
151 |       input_data = input_data$dat,
152 |       marginal_list = marginal_res,
153 |       family_use = family_use,
154 |       empirical_quantile = TRUE,
155 |       copula = copula,
156 |       family_set = family_set,
157 |       n_cores = n_cores,
158 |       important_feature = important_feature,
159 |       if_sparse = if_sparse,
160 |       parallelization = parallelization,
161 |       BPPARAM = BPPARAM
162 |     )
163 |   } else {
164 |     message("Start Copula Fitting")
165 |     copula_res <- fit_copula(
166 |       sce = sce,
167 |       assay_use = assay_use,
168 |       input_data = input_data$dat,
169 |       marginal_list = marginal_res,
170 |       family_use = family_use,
171 |       copula = copula,
172 |       family_set = family_set,
173 |       n_cores = n_cores,
174 |       correlation_function = correlation_function,
175 |       important_feature = important_feature,
176 |       if_sparse = if_sparse,
177 |       parallelization = parallelization,
178 |       BPPARAM = BPPARAM
179 |     )
180 |     message("Copula Fitting End")
181 |   }
182 |   
183 |   
184 | 
185 |   message("Start Parameter Extraction")
186 |   para_list <- extract_para(
187 |     sce = sce,
188 |     assay_use = assay_use,
189 |     marginal_list = marginal_res,
190 |     n_cores = n_cores,
191 |     family_use = family_use,
192 |     new_covariate = input_data$newCovariate,
193 |     parallelization = parallelization,
194 |     BPPARAM = BPPARAM,
195 |     data = input_data$dat
196 |   )
197 |   message("Parameter
198 | Extraction End")
199 | 
200 |   message("Start Generate New Data")
201 |   
202 |   if(empirical_quantile == TRUE) {
203 |     new_count <- simu_new(
204 |       sce = sce,
205 |       assay_use= assay_use,
206 |       mean_mat = para_list$mean_mat,
207 |       sigma_mat = para_list$sigma_mat,
208 |       zero_mat = para_list$zero_mat,
209 |       quantile_mat = copula_res$quantile_mat,
210 |       copula_list = NULL,
211 |       n_cores = n_cores,
212 |       family_use = family_use,
213 |       nonnegative = nonnegative,
214 |       nonzerovar = nonzerovar,
215 |       input_data = input_data$dat,
216 |       new_covariate = input_data$newCovariate,
217 |       important_feature = copula_res$important_feature,
218 |       parallelization = parallelization,
219 |       BPPARAM = BPPARAM,
220 |       filtered_gene = input_data$filtered_gene
221 |     )
222 |   } else {
223 |     if(n_rep == 1) {
224 |       new_count <- simu_new(
225 |         sce = sce,
226 |         assay_use= assay_use,
227 |         mean_mat = para_list$mean_mat,
228 |         sigma_mat = para_list$sigma_mat,
229 |         zero_mat = para_list$zero_mat,
230 |         quantile_mat = NULL,
231 |         copula_list = copula_res$copula_list,
232 |         n_cores = n_cores,
233 |         family_use = family_use,
234 |         nonnegative = nonnegative,
235 |         nonzerovar = nonzerovar,
236 |         input_data = input_data$dat,
237 |         new_covariate = input_data$newCovariate,
238 |         important_feature = copula_res$important_feature,
239 |         parallelization = parallelization,
240 |         BPPARAM = BPPARAM,
241 |         filtered_gene = input_data$filtered_gene
242 |       )
243 |     } else {
244 |       new_count <- lapply(seq_len(n_rep), function(x) {
245 |         current_count <- simu_new(
246 |           sce = sce,
247 |           assay_use= assay_use,
248 |           mean_mat = para_list$mean_mat,
249 |           sigma_mat = para_list$sigma_mat,
250 |           zero_mat = para_list$zero_mat,
251 |           quantile_mat = NULL,
252 |           copula_list = copula_res$copula_list,
253 |           n_cores = n_cores,
254 |           family_use = family_use,
255 |           nonnegative = nonnegative,
256 |           nonzerovar = nonzerovar,
257 |           input_data = input_data$dat,
258 |           new_covariate = input_data$newCovariate,
259 |           important_feature = copula_res$important_feature,
260 |           parallelization = parallelization,
261 |           BPPARAM = BPPARAM,
262 |           filtered_gene = input_data$filtered_gene
263 |         )
264 |         current_count
265 |       })
266 |     }
267 |   }
268 |   
269 |   message("New Data Generating End")
270 | 
271 |   scdesign3_res <- list(
272 |     new_count = new_count,
273 |     new_covariate = input_data$newCovariate,
274 |     model_aic = copula_res$model_aic,
275 |     model_bic = copula_res$model_bic,
276 |     marginal_list = if (return_model)
277 |       marginal_res
278 |     else
279 |       NULL,
280 |     corr_list = if (return_model)
281 |       copula_res$copula_list
282 |     else
283 |       NULL
284 |   )
285 |   return(scdesign3_res)
286 | }
287 | 
288 | 
289 | 
290 | 
291 | 
292 | 


--------------------------------------------------------------------------------
/R/simu_new.R:
--------------------------------------------------------------------------------
  1 | #' Simulate new data
  2 | #'
  3 | #' \code{simu_new} generates new simulated data based on fitted marginal and copula models.
  4 | #'
  5 | #' The function takes the new covariate (if use) from \code{\link{construct_data}},
  6 | #' parameter matrices from \code{\link{extract_para}} and multivariate Unifs from \code{\link{fit_copula}}.
  7 | #'
  8 | #' @param sce A \code{SingleCellExperiment} object.
  9 | #' @param assay_use A string which indicates the assay you will use in the sce. Default is 'counts'.
 10 | #' @param mean_mat A cell by feature matrix of the mean parameter.
 11 | #' @param sigma_mat A cell by feature matrix of the sigma parameter.
 12 | #' @param zero_mat A cell by feature matrix of the zero-inflation parameter.
 13 | #' @param quantile_mat A cell by feature matrix of the multivariate quantile.
 14 | #' @param copula_list A list of copulas for generating the multivariate quantile matrix. If provided, the \code{quantile_mat} must be NULL.
 15 | #' @param n_cores An integer. The number of cores to use.
 16 | #' @param fastmvn An logical variable. If TRUE, the sampling of multivariate Gaussian is done by \code{mvnfast}, otherwise by \code{mvtnorm}. Default is FALSE.
 17 | #' @param family_use A string of the marginal distribution.
 18 | #' Must be one of 'poisson', "binomial", 'nb', 'zip', 'zinb' or 'gaussian'.
 19 | #' @param nonnegative A logical variable. If TRUE, values < 0 in the synthetic data will be converted to 0. Default is TRUE (since the expression matrix is nonnegative).
 20 | #' @param nonzerovar A logical variable. If TRUE, for any gene with zero variance, a cell will be replaced with 1. This is designed for avoiding potential errors, for example, PCA.
 21 | #' @param input_data A input count matrix.
 22 | #' @param new_covariate A data.frame which contains covariates of targeted simulated data from  \code{\link{construct_data}}.
 23 | #' @param important_feature important_feature A string or vector which indicates whether a gene will be used in correlation estimation or not. If this is a string, then
 24 | #' this string must be either "all" (using all genes) or "auto", which indicates that the genes will be automatically selected based on the proportion of zero expression across cells
 25 | #' for each gene. Gene with zero proportion greater than 0.8 will be excluded form gene-gene correlation estimation. If this is a vector, then this should
 26 | #' be a logical vector with length equal to the number of genes in \code{sce}. \code{TRUE} in the logical vector means the corresponding gene will be included in
 27 | #' gene-gene correlation estimation and \code{FALSE} in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation.
 28 | #' The default value for is "all".
 29 | #' @param parallelization A string indicating the specific parallelization function to use.
 30 | #' Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 31 | #' \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.
 32 | #' @param BPPARAM A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 33 | #' this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 34 | #' \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.
 35 | #' @param filtered_gene A vector or NULL which contains genes that are excluded in the marginal and copula fitting 
 36 | #' steps because these genes only express in less than two cells. This can be obtain from  \code{\link{construct_data}}
 37 | #' @return A feature by cell matrix of the new simulated count (expression) matrix or sparse matrix.
 38 | #' @examples
 39 | #'   data(example_sce)
 40 | #'   my_data <- construct_data(
 41 | #'   sce = example_sce,
 42 | #'   assay_use = "counts",
 43 | #'   celltype = "cell_type",
 44 | #'   pseudotime = "pseudotime",
 45 | #'   spatial = NULL,
 46 | #'   other_covariates = NULL,
 47 | #'   corr_by = "1"
 48 | #'   )
 49 | #'   my_marginal <- fit_marginal(
 50 | #'   data = my_data,
 51 | #'   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 52 | #'   sigma_formula = "1",
 53 | #'   family_use = "nb",
 54 | #'   n_cores = 1,
 55 | #'   usebam = FALSE
 56 | #'   )
 57 | #'   my_copula <- fit_copula(
 58 | #'   sce = example_sce,
 59 | #'   assay_use = "counts",
 60 | #'   marginal_list = my_marginal,
 61 | #'   family_use = c(rep("nb", 5), rep("zip", 5)),
 62 | #'   copula = "vine",
 63 | #'   n_cores = 1,
 64 | #'   input_data = my_data$dat
 65 | #'   )
 66 | #'   my_para <- extract_para(
 67 | #'     sce = example_sce,
 68 | #'     marginal_list = my_marginal,
 69 | #'     n_cores = 1,
 70 | #'     family_use = c(rep("nb", 5), rep("zip", 5)),
 71 | #'     new_covariate = my_data$new_covariate,
 72 | #'     data = my_data$dat
 73 | #'   )
 74 | #'   my_newcount <- simu_new(
 75 | #'   sce = example_sce,
 76 | #'   mean_mat = my_para$mean_mat,
 77 | #'   sigma_mat = my_para$sigma_mat,
 78 | #'   zero_mat = my_para$zero_mat,
 79 | #'   quantile_mat = NULL,
 80 | #'   copula_list = my_copula$copula_list,
 81 | #'   n_cores = 1,
 82 | #'   family_use = c(rep("nb", 5), rep("zip", 5)),
 83 | #'   input_data = my_data$dat,
 84 | #'   new_covariate = my_data$new_covariate,
 85 | #'   important_feature = my_copula$important_feature,
 86 | #'   filtered_gene = my_data$filtered_gene
 87 | #'   )
 88 | #'
 89 | #' @export simu_new
 90 | 
 91 | simu_new <- function(sce,
 92 |                      assay_use = "counts",
 93 |                      mean_mat,
 94 |                      sigma_mat,
 95 |                      zero_mat,
 96 |                      quantile_mat = NULL,
 97 |                      copula_list,
 98 |                      n_cores,
 99 |                      fastmvn = FALSE,
100 |                      family_use,
101 |                      nonnegative = TRUE,
102 |                      nonzerovar = FALSE,
103 |                      input_data,
104 |                      new_covariate,
105 |                      important_feature = "all",
106 |                      parallelization = "mcmapply",
107 |                      BPPARAM = NULL,
108 |                      filtered_gene){
109 |   if(!is.null(quantile_mat) & !is.null(copula_list)) {
110 |     stop("You can only provide either the quantile_mat or the copula_list!")
111 |   }
112 | 
113 |   # check if user inputted new covariates
114 |   data_temp <- input_data[,colnames(new_covariate), drop = FALSE]
115 |   if(identical(data_temp, new_covariate)){
116 |     new_covariate <- NULL
117 |   }
118 |   
119 |   qc_gene_idx <- which(!rownames(sce) %in% filtered_gene)
120 |   if(length(family_use) != 1){
121 |     family_use <- family_use[qc_gene_idx]
122 |   }
123 |  
124 |     if(!is.null(quantile_mat)) {
125 |       message("Multivariate quantile matrix is provided")
126 |       
127 |       newmvn_full <- matrix(NA, nrow = dim(quantile_mat)[1], ncol = dim(sce)[1])
128 |       rownames(newmvn_full) <- rownames(quantile_mat)
129 |       colnames(newmvn_full) <- rownames(sce)
130 |       newmvn_full[rownames(quantile_mat), colnames(quantile_mat)] <- quantile_mat
131 |       quantile_mat <- as.matrix(newmvn_full)
132 |       
133 |       
134 |     } else {
135 |       message("Use Copula to sample a multivariate quantile matrix")
136 |       
137 |       group_index <- unique(input_data$corr_group)
138 |       corr_group <- as.data.frame(input_data$corr_group)
139 |       colnames(corr_group) <- "corr_group"
140 |       ngene <- length(qc_gene_idx)
141 |       if (is.null(new_covariate)) {
142 |         new_corr_group <- NULL
143 |       } else{
144 |         new_corr_group <- as.data.frame(new_covariate$corr_group)
145 |         colnames(new_corr_group) <- "corr_group"
146 |       }
147 |       ind <- group_index[1] == "ind"
148 |       newmvn.list <-
149 |         lapply(group_index, function(x,
150 |                                      sce,
151 |                                      corr_group,
152 |                                      new_corr_group,
153 |                                      ind,
154 |                                      n_cores,
155 |                                      copula_list) {
156 |           message(paste0("Sample Copula group ", x, " starts"))
157 |           curr_index <- which(corr_group[, 1] == x)
158 |           if (is.null(new_covariate)) {
159 |             curr_ncell <- length(curr_index)
160 |             curr_ncell_idx <- curr_index
161 |           } else{
162 |             curr_ncell <- length(which(new_corr_group[, 1] == x))
163 |             curr_ncell_idx <-which(new_corr_group[, 1] == x)
164 |             #paste0("Cell", which(new_corr_group[, 1] == x))
165 |           }
166 |           cor.mat <- copula_list[[x]]
167 |           
168 |           if(curr_ncell == 0) {
169 |             new_mvu <- NULL
170 |           } else {
171 |             if (methods::is(cor.mat, "matrix") | methods::is(cor.mat, "dsCMatrix")) {
172 |               #message(paste0("Group ", group_index, " Start"))
173 |               
174 |               #message("Sample MVN")
175 |               #sample from mvn for important genes only
176 |               corr_gene_idx <- apply(cor.mat, 2, function(x) length(which(x < 1e-5)) != length(x)-1)
177 |               corr_gene <- colnames(cor.mat)[which(corr_gene_idx)]
178 |               if(length(corr_gene)!=0) {
179 |                 new_mvn_important <- sampleMVN(n = curr_ncell,
180 |                                                Sigma = cor.mat[corr_gene, corr_gene],
181 |                                                n_cores = n_cores,
182 |                                                fastmvn = fastmvn)
183 |               
184 |               colnames(new_mvn_important) <- corr_gene} else {
185 |                 new_mvn_important <- NULL
186 |               }
187 |               #message("MVN Sampling End")
188 |               ind_gene <- colnames(cor.mat)[which(corr_gene_idx==FALSE)]
189 |               if(length(ind_gene) > 0){
190 |                 new_mvn_non_important <- lapply(ind_gene, function(x) return(stats::rnorm(n = curr_ncell)))
191 |                 new_mvn_non_important_mat <- do.call("cbind",new_mvn_non_important)
192 |                 colnames(new_mvn_non_important_mat) <- ind_gene
193 |                 
194 |                 mvnrvq <- apply(new_mvn_non_important_mat, 2, stats::pnorm)
195 |                 new_mvu <- cbind(new_mvn_important, mvnrvq)
196 |                 new_mvu <- new_mvu[,colnames(cor.mat)]
197 |               }else{
198 |                 new_mvu <- new_mvn_important
199 |               }
200 | 
201 |               rownames(new_mvu) <- curr_ncell_idx
202 |             } else if (methods::is(cor.mat, "vinecop")) {
203 |               new_mvu <- matrix(0, nrow = curr_ncell, ncol = ngene)
204 |               #message("Sampling Vine Copula Starts")
205 |               mvu <- rvinecopulib::rvinecop(
206 |                 curr_ncell,
207 |                 vine = cor.mat,
208 |                 cores = n_cores,
209 |                 qrng = TRUE
210 |               )
211 |               new_mvu[, which(important_feature)] <- mvu
212 |               if(length(which(important_feature)) != ngene){
213 |                 cor.mat <- diag(rep(1, length(which(!important_feature))))
214 |                 mvu2 <- sampleMVN(n = curr_ncell,
215 |                                   Sigma = cor.mat,
216 |                                   n_cores = n_cores,
217 |                                   fastmvn = fastmvn)
218 |                 new_mvu[, which(!important_feature)] <- mvu2
219 |               }
220 |               #message("Sampling Vine Copula Ends")
221 |               rownames(new_mvu) <- curr_ncell_idx
222 |             } else if (ind) {
223 |               "Use independent copula (random Unif)."
224 |               new_mvu <-
225 |                 matrix(data = stats::runif(curr_ncell * ngene),
226 |                        nrow = curr_ncell)
227 |               rownames(new_mvu) <- curr_ncell_idx
228 |             } else{
229 |               stop("Copula must be one from 'vine' or 'gaussian', or assume gene-gene is independent")
230 |             }
231 |           }
232 |           return(
233 |             list(
234 |               new_mvu = new_mvu
235 |             )
236 |           )
237 |         }, sce = sce, ind = ind, n_cores = n_cores, corr_group = corr_group, new_corr_group = new_corr_group, copula_list = copula_list)
238 |       
239 |       newmvn <-
240 |         do.call(rbind, lapply(newmvn.list, function(x)
241 |           x$new_mvu))
242 |       newmvn[as.numeric(rownames(newmvn)),] <- newmvn
243 |       rownames(newmvn) <- as.character(1:dim(newmvn)[1])
244 |       colnames(newmvn) <- rownames(sce)[qc_gene_idx]
245 |       newmvn_full <- matrix(NA, nrow = dim(newmvn)[1], ncol = dim(sce)[1])
246 |       rownames(newmvn_full) <- rownames(newmvn)
247 |       colnames(newmvn_full) <- rownames(sce)
248 |       newmvn_full[rownames(newmvn), colnames(newmvn)] <- newmvn
249 |       quantile_mat <- as.matrix(newmvn_full)
250 |     }
251 |     
252 |     mat_function <- function(x, y) {
253 |       
254 |       idx <- which(mean_mat[,x] !=0)
255 |       para_mat <- cbind(mean_mat[idx, x], sigma_mat[idx, x], quantile_mat[idx, x], zero_mat[idx, x])
256 |       
257 |       if (y == "binomial") {
258 |         qfvec <- stats::qbinom(p = para_mat[, 3], prob = para_mat[, 1], size = 1)
259 |       } else if (y == "poisson") {
260 |         
261 |         qfvec <- stats::qpois(p = para_mat[, 3], lambda = para_mat[, 1])
262 |       } else if (y == "gaussian") {
263 |         qfvec <-
264 |           gamlss.dist::qNO(p = para_mat[, 3],
265 |                            mu = para_mat[, 1],
266 |                            sigma = abs(para_mat[, 2]))
267 |       } else if (y == "nb") {
268 |         qfvec <-
269 |           gamlss.dist::qNBI(p = para_mat[, 3],
270 |                             mu = para_mat[, 1],
271 |                             sigma = para_mat[, 2])
272 |       } else if (y == "zip") {
273 |         qfvec <-
274 |           gamlss.dist::qZIP(p = para_mat[, 3],
275 |                             mu = para_mat[, 1],
276 |                             sigma = ifelse(para_mat[, 4] != 0, para_mat[, 4],  2.2e-16))## Avoid zero zero-inflated prob
277 |       } else if (y == "zinb") {
278 |         
279 |         qfvec <-
280 |           gamlss.dist::qZINBI(p = para_mat[, 3],
281 |                               mu = para_mat[, 1],
282 |                               sigma = para_mat[, 2],
283 |                               nu = ifelse(para_mat[, 4] != 0, para_mat[, 4],  2.2e-16))
284 |       } else {
285 |         stop("Distribution of gamlss must be one of gaussian, poisson, nb, zip or zinb!")
286 |       }
287 |       
288 |       #message(paste0("Gene ", x , " End!"))
289 |       
290 |       r <- as.vector(qfvec)
291 |       if(length(r) < total_cells){
292 |         new_r <- rep(0, total_cells)
293 |         new_r[idx] <- r
294 |         names(new_r) <- cell_names
295 |         r <- new_r
296 |       }
297 |       r
298 |     }
299 | 
300 |   
301 | 
302 | 
303 |   ## New count
304 |   paraFunc <- parallel::mcmapply
305 |   if(.Platform$OS.type == "windows"){
306 |     BPPARAM <- BiocParallel::SnowParam()
307 |     parallelization <- "bpmapply"
308 |   }
309 |   if(parallelization == "bpmapply"){
310 |     paraFunc <- BiocParallel::bpmapply
311 |   }
312 |   if(parallelization == "pbmcmapply"){
313 |     paraFunc <- pbmcapply::pbmcmapply
314 |   }
315 | 
316 |   if(is.null(new_covariate)){
317 |     total_cells <- dim(sce)[2]
318 |     cell_names <- colnames(sce)
319 |   }else{
320 |     total_cells <- dim(new_covariate)[1]
321 |     cell_names <- rownames(new_covariate)
322 |   }
323 |   if(parallelization == "bpmapply"){
324 |     if(class(BPPARAM)[1] != "SerialParam"){
325 |       BPPARAM$workers <- n_cores
326 |     }
327 |     mat <-  paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], y = family_use, SIMPLIFY = TRUE, BPPARAM = BPPARAM)
328 |   }else{
329 |     mat <- paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], y = family_use, SIMPLIFY = TRUE
330 |                   , mc.cores = n_cores
331 |                    )
332 |  }
333 |   new_count <- mat #simplify2array(mat)
334 |   rownames(new_count) <- cell_names
335 |   colnames(new_count) <- rownames(sce)[qc_gene_idx]
336 | 
337 |   if(length(qc_gene_idx) < dim(sce)[1]){
338 |     temp_count <- matrix(0, total_cells, dim(sce)[1])
339 |     rownames(temp_count) <- cell_names
340 |     colnames(temp_count) <- rownames(sce)
341 |     temp_count[rownames(new_count),colnames(new_count)] <- new_count
342 |     new_count <- temp_count
343 |   }
344 |   new_count <- as.matrix(t(new_count))
345 | 
346 |   if(nonnegative) new_count[new_count < 0] <- 0
347 | 
348 |   if(nonzerovar) {
349 |     row_vars <- matrixStats::rowVars(new_count[qc_gene_idx,])
350 |     if(sum(row_vars == 0) > 0) {
351 |       message("Some genes have zero variance. Replace a random one with 1.")
352 |       row_vars_index <- which(row_vars == 0)
353 |       col_index <- seq_len(dim(new_count)[2])
354 |       for(i in row_vars_index) {
355 |         new_count[i, sample(col_index, 1)] <- 1
356 |       }
357 |     }
358 |   }
359 |   new_count <- as.matrix(new_count)
360 |   if(methods::is(SummarizedExperiment::assay(sce, assay_use), "sparseMatrix")){
361 |     new_count<- Matrix::Matrix(new_count, sparse = TRUE)
362 |   }
363 | 
364 |   return(new_count)
365 | }
366 | 


--------------------------------------------------------------------------------
/R/sparse_cov.R:
--------------------------------------------------------------------------------
  1 | #' This function computes the thresholding sparse covariance/correlation estimator 
  2 | #' with the optimal threshold level.
  3 | #'
  4 | #' Part from Chenxin Jiang
  5 | #'
  6 | #' @param data The data matrix.
  7 | #' @param method The choice of method to select the optimal threshold level.
  8 | #' @param operator The choice of the thresholding operator.
  9 | #' @param corr The indicator of computing correlation or covariance matrix.
 10 | #'
 11 | #' @return The thresholding sparse covariance/correlation estimator.
 12 | #' @export sparse_cov
 13 | #' 
 14 | #' @examples
 15 | #' print("No example")
 16 | 
 17 | sparse_cov <- function(data, 
 18 |                        method=c('cv', 'qiu'),
 19 |                        operator=c('hard', 'soft', 'scad', 'al'),
 20 |                        corr=TRUE){
 21 |   p <- dim(data)[2]
 22 |   n <- dim(data)[1]
 23 |   
 24 |   # sample covariance
 25 |   z <- covariance(data)
 26 |   
 27 |   # select the optimal thresholding level
 28 |   delta <- est_delta(data, method=method, operator=operator)
 29 |   s <- thresh_op(z, operator=operator, delta=delta, n=n)
 30 |   
 31 |   # Modify s to make it psd
 32 |   tol <- 1e-6
 33 |   ev <- eigen(s, symmetric=TRUE, only.values = TRUE)$values
 34 |   s1 <- s + (tol-min(ev))*diag(dim(s)[1]) 
 35 |   
 36 |   if(corr){
 37 |     # make corr
 38 |     s1_corr <- Matrix::cov2cor(s1)
 39 |     output <- s1_corr
 40 |   }else{
 41 |     output <- s1
 42 |   }
 43 |   
 44 |   return(output)
 45 | }
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | ## This function computes the thresholding sparse covariance estimator for a given threshold level.
 52 | thresh_op <- function(z, operator, delta, n){
 53 |   if(operator == 'hard'){
 54 |     s_method <- s_hard
 55 |   }else if(operator == 'soft'){
 56 |     s_method <- s_soft
 57 |   }else if(operator == 'scad'){
 58 |     s_method <- s_scad
 59 |   }else if(operator == 'al'){
 60 |     s_method <- s_al
 61 |   }else{
 62 |     stop('Please specify a valid thresholding operator.')
 63 |   }
 64 |   s_method(z, delta, n)
 65 | }
 66 | 
 67 | # Operator 1: Hard Thresholding
 68 | s_hard <- function(z, delta, n){
 69 |   p<-dim(z)[1]
 70 |   lambda <- sqrt(log(p)/n)*delta
 71 |   output <- (z>lambda)*z
 72 |   diag(output) <- diag(z)
 73 |   return(output)
 74 | }
 75 | 
 76 | # Operator 2: Soft Thresholding
 77 | s_soft <- function(z, delta, n){
 78 |   p <- dim(z)[1]
 79 |   lambda <- sqrt(log(p)/n)*delta
 80 |   z0 <- abs(z)-lambda
 81 |   output <- sign(z)*(z0>0)*z0
 82 |   diag(output) <- diag(z)
 83 |   return(output)
 84 | }
 85 | 
 86 | 
 87 | # Operator 3: SCAD (smoothly clipped absolute deviation)
 88 | s_scad <- function(z, delta, n, a=3.7){
 89 |   p <- dim(z)[1]
 90 |   lambda <- sqrt(log(p)/n)*delta
 91 |   
 92 |   output <- matrix(NA, dim(z)[1], dim(z)[2])
 93 |   
 94 |   index1 <- which(abs(z)<=2*lambda)
 95 |   z0 <- abs(z)-lambda
 96 |   output[index1] <- sign(z[index1])*(z0[index1]>0)*z0[index1]
 97 |   
 98 |   index2 <- which(abs(z)>2*lambda & abs(z)<=a*lambda)
 99 |   output[index2] <- ((a-1)*z[index2]-sign(z[index2])*a*lambda)/(a-2)
100 |   
101 |   index3 <- which(abs(z)>a*lambda)
102 |   output[index3] <- z[index3]
103 |   
104 |   diag(output) <- diag(z)
105 |   return(output)
106 | }
107 | 
108 | 
109 | # Operator 4: Adaptive lasso
110 | s_al <- function(z, delta, n){
111 |   p <- dim(z)[1]
112 |   lambda <- sqrt(log(p)/n)*delta
113 |   
114 |   eta <- 3
115 |   z0 <- abs(z) - lambda^(eta+1)*abs(z)^(-eta)
116 |   output <- sign(z)*(z0>0)*z0
117 |   diag(output) <- diag(z)
118 |   return(output)
119 | }
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | ## This function select the optimal thresholding level delta
127 | est_delta <- function(data, 
128 |                       method=c('cv', 'qiu'),
129 |                       operator=c('hard', 'soft', 'scad', 'al')){
130 |   n <- dim(data)[1]
131 |   if((method=='qiu') ){
132 |     s <- covariance(data)
133 |     delta <- qiu.select(data, s)
134 |   }else if(method=='cv'){
135 |     delta <- cv.min(data, operator)
136 |   }else{
137 |     stop('Please specify a valid thresholding method and an operator function.')
138 |   }
139 |   return(delta)
140 | }
141 | 
142 | 
143 | ### Qiu function to tune delta
144 | qiu.select = function(data, s=NULL){
145 |   n <- dim(data)[1]
146 |   p <- dim(data)[2]
147 |   
148 |   if(is.null(s)){
149 |     s <- covariance(data)
150 |   }
151 |   
152 |   # standardized covariance of Sigma
153 |   eta <- sqrt(n/log(p))*s 
154 |   # select lower triangular
155 |   ltrig <- abs(eta[lower.tri(eta, diag = FALSE)])
156 |   
157 |   # parameter to select the optimal thr
158 |   a <- min(sqrt(2+log(n)/log(p)), 2)
159 |   a1 <- 2 - a
160 |   a0 <- (log(log(p)))^(-1/2)
161 |   M <- sum(((a1+a0)<ltrig) & (ltrig<=2))
162 |   q <- (p-1)*p/2
163 |   plog <- (log(p))^(1/2)
164 |   V <- 2*q*(stats::pnorm(2*plog)-stats::pnorm((a1+a0)*plog))
165 |   N2 <- max(M-V, plog)
166 |   
167 |   delta <- sqrt(2*(2-log(N2*1/plog)/log(p)))
168 |   
169 |   return(delta)
170 | }
171 | 
172 | #### CV to tune delta
173 | # The tuning parameter lambda was selected by minimizing the Frobenius norm of the 
174 | # difference between s(sample_cov(train)) and sample_cov(test) by cross validation
175 | 
176 | # cv.select: For implementation, we choose the optimal delta from a delta.list 
177 | # instead of really solving the optimization problem.
178 | # lambda.list ranges in [0,lambda.max], with a user specified length.
179 | 
180 | # cv.min: solve the optimization problem directly (currently use this)
181 | 
182 | cv.select <- function(data, 
183 |                       operator, 
184 |                       fold=5, 
185 |                       delta.length=10, 
186 |                       delta.max=2){
187 |   
188 |   n <- dim(data)[1]
189 |   p <- dim(data)[2]
190 |   
191 |   
192 |   # Create delta list
193 |   delta.list <- seq(0, delta.max, length.out=delta.length)
194 |   
195 |   # Create equal size folds
196 |   folds <- cut(seq(1, nrow(data)), breaks=fold, labels=FALSE)
197 |   
198 |   fold_losses <- rep(0, fold) # Record fold loss 
199 |   cv_errors <- rep(0, delta.length) # Record average estimated loss
200 |   
201 |   
202 |   ## Perform k fold cross validation
203 |   
204 |   # iterate over the values of lambda
205 |   for(j in 1:delta.length){
206 |     
207 |     # iterate over CV folds
208 |     for(i in 1:fold){
209 |       # Segment the data by fold 
210 |       testIndexes <- which(folds==i, arr.ind=TRUE) # i-th fold as the testing data
211 |       #trainIndexes <- which(folds!=i, arr.ind=TRUE) # i-th fold as the testing data
212 |       testData <- data[testIndexes, ]
213 |       trainData <- data[-testIndexes, ]
214 |       # Get the covariance matrix estimator s based on the training data
215 |       sample.cov.train <- covariance(trainData)
216 |       s <- thresh_op(sample.cov.train, operator=operator, delta = delta.list[j], n=n)
217 |       # Compute Frobenius risk = norm(distance of s and covariance of testing data)
218 |       sample.cov.test <- covariance(testData)
219 |       fold_losses[i] <- norm(s - sample.cov.test, type='F')
220 |     }
221 |     # Get the average estimated loss of CV
222 |     cv_errors[j] <-  mean(fold_losses)
223 |   }
224 |   
225 |   # Find the lambda that minimize cv_errors
226 |   delta <- delta.list[which.min(cv_errors)]
227 |   
228 |   delta
229 | }
230 | 
231 | 
232 | 
233 | cv.min <- function(data, 
234 |                    operator, 
235 |                    fold=5,
236 |                    delta.max=2){
237 |   n <- dim(data)[1]
238 |   p <- dim(data)[2]
239 |   
240 |   
241 |   ## Perform k fold cross validation
242 |   cv.loss <- function(delta){
243 |     # Create equal size folds
244 |     folds <- cut(seq(1, nrow(data)), breaks=fold, labels=FALSE)
245 |     fold_losses <- rep(0, fold) # Record fold loss 
246 |     
247 |     # iterate over CV folds
248 |     for(i in seq_len(fold)){
249 |       # Segment the data by fold 
250 |       testIndexes <- which(folds==i, arr.ind=TRUE) # i-th fold as the testing data
251 |       testData <- data[testIndexes, ]
252 |       trainData <- data[-testIndexes, ]
253 |       # Get the covariance matrix estimator s based on the training data
254 |       sample.cov.train <- covariance(trainData)
255 |       s <- thresh_op(sample.cov.train, operator=operator, delta = delta, n=n)
256 |       # Compute Frobenius risk = norm(distance of s and covariance of testing data)
257 |       sample.cov.test <- covariance(testData)
258 |       fold_losses[i] <- norm(s - sample.cov.test, type='F')
259 |     }
260 |     # Get the average estimated loss of CV
261 |     cv_errors <- mean(fold_losses)
262 |     cv_errors
263 |   }
264 |   
265 |   # Solve the optimization problem
266 |   res <- stats::optimize(cv.loss, c(0, delta.max))
267 |   delta <- res$minimum
268 |   
269 |   delta
270 | }
271 | 
272 | 
273 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## scDesign3
  2 | 
  3 | ------------------------------------------------------------------------
  4 | 
  5 | The R package **scDesign3** is an all-in-one single-cell data simulation tool by using reference datasets with different cell states (cell types, trajectories or and spatial coordinates), different modalities (gene expression, chromatin accessibility, protein abundance, DNA methylation, etc), and complex experimental designs. The transparent parameters enable users to alter models as needed; the model evaluation metrics (AIC, BIC) and convenient visualization function help users select models. <span style="color:blue"> **Detailed tutorials that illustrate various functionalities of scDesign3 are available at this [website](https://songdongyuan1994.github.io/scDesign3/docs/index.html)**</span>. The following illustration figure summarizes the usage of scDesign3:
  6 | 
  7 | <img src="man/figures/scDesign3_illustration.png" width="600"/>
  8 | 
  9 | 
 10 | To find out more details about **scDesign3**, you can check out our manuscript on Nature Biotechnology:
 11 | 
 12 | [Song, D., Wang, Q., Yan, G. *et al.* scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics. *Nat Biotechnol* **42**, 247–252 (2024).](https://www.nature.com/articles/s41587-023-01772-1)
 13 | 
 14 | The computational time is quadratic to the number of features used in copula modeling. Reducing this number will greatly speed up the calculation.
 15 | 
 16 | Please note that the parallel computing of scDesign3 is mainly designed for **UNIX OS**; be careful when you set `n_cores`. Please note that you should consider **the balance** between `n_cores` and your ROM (memory). Simply increasing the number of cores without the increase of memory will slow down or froze your program. We recommend that you should allocate at least 1 GB for 1 core.
 17 | 
 18 | # Table of contents
 19 | 1. [Installation](#installation-)
 20 | 2. [Quick Start](#quick-start)
 21 | 3. [Tutorials](#tutorials)
 22 | 4. [Contact](#contact)
 23 | 5. [Related Manuscripts](#related-manuscripts)
 24 | 
 25 | 
 26 | ## Installation<a name="installation-"></a>
 27 | 
 28 | To install the development version from GitHub, please run:
 29 | 
 30 | ``` r
 31 | if (!require("devtools", quietly = TRUE))
 32 |     install.packages("devtools")
 33 | devtools::install_github("SONGDONGYUAN1994/scDesign3")
 34 | ```
 35 | 
 36 | We are now working on submitting it to Bioconductor and will provide the link once online.
 37 | 
 38 | ## Quick Start<a name="quick-start"></a>
 39 | 
 40 | The following code is a quick example of running our simulator. The function `scdesign3()` takes in a `SinglecellExperiment` object with the cell covariates(such as cell types, pseudotime, or spatial coordinates) stored in the `colData` of the `SinglecellExperiment` object. For more details on the `SinlgeCellExperiment` object, please check on its [Bioconductor link](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html).
 41 | 
 42 | ``` r
 43 | example_simu <- scdesign3(
 44 |     sce = example_sce,
 45 |     assay_use = "counts",
 46 |     celltype = "cell_type",
 47 |     pseudotime = "pseudotime",
 48 |     spatial = NULL,
 49 |     other_covariates = NULL,
 50 |     mu_formula = "s(pseudotime, k = 10, bs = 'cr')",
 51 |     sigma_formula = "s(pseudotime, k = 5, bs = 'cr')",
 52 |     family_use = "nb",
 53 |     n_cores = 2,
 54 |     correlation_function = "default",
 55 |     usebam = FALSE,
 56 |     corr_formula = "1",
 57 |     copula = "gaussian",
 58 |     fastmvn = FALSE,
 59 |     DT = TRUE,
 60 |     pseudo_obs = FALSE,
 61 |     family_set = c("gauss", "indep"),
 62 |     important_feature = "all",
 63 |     nonnegative = TRUE,
 64 |     return_model = FALSE,
 65 |     nonzerovar = FALSE,
 66 |     parallelization = "mcmapply",
 67 |     BPPARAM = NULL,
 68 |     trace = FALSE
 69 |   )
 70 | ```
 71 | 
 72 | The parameters of `scdesign3()` are:
 73 | 
 74 | - `sce`: A SingleCellExperiment object.
 75 | - `assay_use`: A string which indicates the assay you will use in the sce. Default is 'counts'.
 76 | - `celltype`: A string of the column name of the cell type variable in the colData of the sce. Default is 'cell_type'. The cell type variable in the colData of the sce should be a factor variable. Use NULL if there is no column in the colData that contains the cell-type information. 
 77 | - `pseudotime`: A string or a string vector of the name of pseudotime and (if exist) multiple lineages. Default is NULL. If the data only has one lineage, then this parameter should be the column name of the pseudotime variable in the colData of the sce. If the data has multiple lingaes, then this parameter be the column names of the pseudotime variables for each lineage and the variables indicating which lineage that a cell belongs to. The pseudotime variables should be continuous numeric variables.
 78 | - `spatial`: A length-two string vector of the column names of spatial coordinates in the colData of sce. Default is NULL. 
 79 | - `other_covariate`: A string or a string vector of the other covariates in the colData of sce you want to include in the data. For example, you can put the column names of the batch variables and/or condition variables in the colData of sce here if your sce contains these information and you want to include these variables in`mu_formula` or `sigma_formula` or `corr_formula`.
 80 | - `mu_formula`: A string of the mu parameter formula for fitting each gene's marginal distribution.
 81 | - `sigma_formula`: A string of the sigma parameter formula for fitting each gene's marginal distribution.
 82 | - `family_use`: A string of the marginal distribution you want to use when fitting each gene's marginal distribution. Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.
 83 | - `n_cores`: An integer. The number of cores to use.
 84 | - `correlation_function`: A string. If 'default', the function from Rfast; if 'coop', the function from coop, which calls BLAS.
 85 | - `usebam`: A logic variable. If TRUE, use bam (generalized additive models for very large datasets) for acceleration.
 86 | - `edf_flexible`: A logic variable. If TRUE, the degree of freedom for each gene's regression model will be automatically selected for acceleration.
 87 | - `corr_formula`: A string of the correlation structure. For example, if you want to obtain a correlation structure for each cell type, then this parameter should be the column name of the cell type variable in the colData of sce.
 88 | - `copula`: A string of the copula choice. Must be one of 'gaussian' or 'vine'. Default is 'gaussian'. Note that vine copula may have better modeling of high-dimensions, but can be very slow when features are >1000.
 89 | - `fastmvn`: An logical variable. If TRUE, the sampling of multivariate Gaussian is done by mvnfast, otherwise by mvtnorm. Default is FALSE. It only matters for Gaussian copula.
 90 | - `DT`: A logic variable. If TRUE, perform the distributional transformation to make the discrete data 'continuous'. This is useful for discrete distributions (e.g., Poisson, NB). Default is TRUE. Note that for continuous data (e.g., Gaussian), DT does not make sense and should be set as FALSE.
 91 | - `pseudo_obs`: A logic variable. If TRUE, use the empirical quantiles instead of theoretical quantiles for fitting copula. Default is FALSE.
 92 | - `family_set`: A string or a string vector of the bivariate copula families. Default is c("gauss", "indep").
 93 | - `important_feature`: A string or vector which indicates whether a gene will be used in correlation estimation or not. If this is a string, then this string must be "all" or "auto", which indicates that all genes are used or the genes will be automatically selected based on the proportion of zero expression across cells for each gene. Gene with zero proportion greater than 0.8 will be excluded form gene-gene correlation estimation. If this is a vector, then this should be a logical vector with length equal to the number of genes in sce. TRUE in the logical vector means the corresponding gene will be included in gene-gene correlation estimation and FALSE in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation. The default value is "all".
 94 | - `nonnegative`: A logical variable. If TRUE, values < 0 in the synthetic data will be converted to 0. Default is TRUE (since the expression matrix is nonnegative).
 95 | - `return_model`: A logic variable. If TRUE, the marginal models and copula models will be returned. Default is FALSE.
 96 | - `nonzerovar`: A logical variable. If TRUE, for any gene with zero variance, a cell will be replaced with 1. This is designed for avoiding potential errors, for example, PCA.
 97 | - `parallelization`: A string indicating the specific parallelization function to use. Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package parallel,BiocParallel, and pbmcapply respectively. The default value is 'mcmapply'.
 98 | - `BPPARAM`: A MulticoreParam object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply', this parameter must be NULL. When the parameter parallelization = 'bpmapply', this parameter must be one of the MulticoreParam object offered by the package 'BiocParallel. The default value is NULL.
 99 | - `TRACE`: A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss will be returned, FALSE otherwise. Default is FALSE.
100 | 
101 | The output of `scdesign3()` is a list which includes:
102 | 
103 | -   `new_count`: This is the synthetic count matrix generated by `scdesign3()`.
104 | -   `new_covariate`:
105 |     -   If the parameter `ncell` is set to a number that is different from the number of cells in the input data, this will be a matrix that has the new cell covariates that are used for generating new data.
106 |     -   If the parameter `ncell` is the default value, this will be `NULL`.
107 | -   `model_aic`: This is a vector include the genes' marginal models' AIC, fitted copula's AIC, and total AIC, which is the sum of the previous two.
108 | -   `model_bic`: This is a vector include the genes' marginal models' BIC, fitted copula's BIC, and total BIC, which is the sum of the previous two.
109 | -   `marginal_list`:
110 |     -   If the parameter `return_model` is set to `TRUE`, this will be a list which contains the fitted gam or gamlss model for all genes in the input data. This may greatly increase the object size.
111 |     -   If the parameter `return_model` is set to the default value `FALSE`, this will be `NULL`.
112 | -   `corr_list`:
113 |     -   If the parameter `return_model` is set to `TRUE`, this will be a list which contains either a correlation matrix (when `copula = "gaussian"`) or the fitted Vine copula (when `copula = "vine"`) for each user specified correlation groups (based on the parameter `corr_by`).
114 |     -   If the parameter `return_model` is set to the default value `FALSE`, this will be `NULL`.
115 | 
116 | For more details about the `mu_formula` and `sigma_formula` formula specification, please check online materials about the package [mgcv](https://cran.r-project.org/web/packages/mgcv/index.html). Technically speaking, you can try any formulas as long as they are available for **mgcv**.
117 | 
118 | ## Tutorials<a name="tutorials"></a>
119 | 
120 | For all detailed tutorials, please check the [website](https://songdongyuan1994.github.io/scDesign3/docs/index.html). The tutorials will demonstrate the applications of **scDesign3** from the following four perspectives: data simulation, model parameters, model selection, and model alteration.
121 | 
122 | -   Data simulation
123 |     -   [Simulate datasets with cell type and modified cell-type proportions](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-cellType-vignette.html)
124 |     -   [Simulate datasets with cell library size](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-librarySize-vignette.html)
125 |     -   [Simulate datasets with multiple lineages](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-multipleLineages-vignette.html)
126 |     -   [Simulate spatial transcriptomic data](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-spatial-vignette.html)
127 |     -   [Simulate spot-resolution spatial data for cell-type deconvolution](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-spatial-deconvolution.html)
128 |     -   [Simulate single-cell ATAC-seq data](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-scATACseq-vignette.html)
129 |     -   [Simulate CITE-seq data](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-CITEseq-vignette.html)
130 |     -   [Simulate multi-omics data from multiple single-omic datasets](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-multiomics-vignette.html)
131 | -   Model parameter
132 |     -   [scDesign3 introduction](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-introduction-vignette.html)
133 |     -   [scDesign3 marginal distribution for genes](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-marginal-vignette.html)
134 |     -   [Compare Gaussian copula and Vine copula](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-copulaCompare-vignette.html)
135 |     -   [Parallelization in scDesign3](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-parallelization-vignette.html)
136 | -   Model selection
137 |     -   [Evaluate clustering goodness-of-fit by scDesign3](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-clusterGOF-vignette.html)
138 |     -   [Evaluate pseudotime goodness-of-fit by scDesign3](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-pseudotimeGOF-vignette.html)
139 | -   Model alteration
140 |     -   [Simulate datasets with/without batch effect](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-batchEffect-vignette.html)
141 |     -   [Simulate datasets with/without condition effect](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-conditionEffect-vignette.html)
142 |     -   [Simulate datasets for DE test](https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-DEanalysis-vignette.html)
143 | 
144 | ## Contact<a name="contact"></a>
145 | 
146 | Any questions or suggestions on `scDesign3` are welcomed! Please report it on [issues](https://github.com/SONGDONGYUAN1994/scDesign3/issues), or contact Dongyuan Song ([dongyuansong\@ucla.edu](mailto:dongyuansong@ucla.edu){.email}) or Qingyang Wang ([qw802\@g.ucla.edu](mailto:qw802@g.ucla.edu){.email}).
147 | 
148 | ## Changelog
149 | -   2024-11-07
150 |     - Add options for correlation estimation. The alternative is from R package [coop](https://cran.rstudio.com/web/packages/coop/index.html), which requires BLAS
151 |     
152 | -   2024-06-22 Important changes
153 |     - Add sparse Gaussian copula for `fit_copula`
154 |     - Add automatic k selection for `fit_marginal`
155 |     
156 | ## Related Manuscripts<a name="related-manuscripts"></a>
157 | -   The original **scDesign3** paper
158 |     -   **scDesign3**: [Song, D., Wang, Q., Yan, G. *et al.* scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics. *Nat Biotechnol* **42**, 247–252 (2024).](https://www.nature.com/articles/s41587-023-01772-1)
159 | -   The predecessors of **scDesign3**
160 |     -   **scDesign**: [Li, W. V., & Li, J. J. (2019). A statistical simulator scDesign for rational scRNA-seq experimental design. *Bioinformatics*, **35**(14), i41-i50.](https://academic.oup.com/bioinformatics/article/35/14/i41/5529133)
161 |     -   **scDesign2**: [Sun, T., Song, D., Li, W. V., & Li, J. J. (2021). scDesign2: a transparent simulator that generates high-fidelity single-cell gene expression count data with gene correlations captured. *Genome biology*, **22**(1), 1-37.](https://link.springer.com/article/10.1186/s13059-021-02367-2)
162 | -   The simulator for single-cell multi-omics reads developed by our lab member Guanao Yan
163 |     -   **scReadSim**: [Yan, G., Song, D. & Li, J.J. scReadSim: a single-cell RNA-seq and ATAC-seq read simulator. *Nat Commun* **14**, 7482 (2023)](https://doi.org/10.1038/s41467-023-43162-w)
164 | 


--------------------------------------------------------------------------------
/data/example_sce.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SONGDONGYUAN1994/scDesign3/4370074cc5392ddd7821e66e1e1c1d1181f21d3d/data/example_sce.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | bibentry(
 2 |   bibtype  = "Article",
 3 |   title = "scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics",
 4 |   author = "Song, Dongyuan and Wang, Qingyang and Yan, Guanao and Liu, Tianyang and Sun, Tianyi and Li, Jingyi Jessica",
 5 |   journal = "Nature Biotechnology",
 6 |   year = 2023,
 7 |   publisher = "Nature Publishing Group US New York",
 8 |   doi = "10.1038/s41587-023-01772-1",
 9 | )
10 | 


--------------------------------------------------------------------------------
/man/ba.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gamlss_fix.R
 3 | \name{ba}
 4 | \alias{ba}
 5 | \title{Functions from gamlss/gamlss.add with bugs fixed}
 6 | \usage{
 7 | ba(formula, control = ba.control(...), ...)
 8 | }
 9 | \arguments{
10 | \item{formula}{A formula of the model.}
11 | 
12 | \item{control}{The control of the model fitting.}
13 | 
14 | \item{...}{Other arguments.}
15 | }
16 | \value{
17 | A xvar list.
18 | }
19 | \description{
20 | An additive function to be used while fitting GAMLSS models. The interface for \code{bam()} in the \pkg{mgcv} package.
21 | }
22 | \section{ba}{
23 | NA
24 | }
25 | 
26 | \examples{
27 | print("No example")
28 | }
29 | 


--------------------------------------------------------------------------------
/man/construct_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/construct_data.R
 3 | \name{construct_data}
 4 | \alias{construct_data}
 5 | \title{Construct the input data (covariate matrix and expression matrix)}
 6 | \usage{
 7 | construct_data(
 8 |   sce,
 9 |   assay_use = "counts",
10 |   celltype,
11 |   pseudotime,
12 |   spatial,
13 |   other_covariates,
14 |   ncell = dim(sce)[2],
15 |   corr_by,
16 |   parallelization = "mcmapply",
17 |   BPPARAM = NULL
18 | )
19 | }
20 | \arguments{
21 | \item{sce}{A \code{SingleCellExperiment} object.}
22 | 
23 | \item{assay_use}{A string which indicates the assay you will use in the sce. Default is 'counts'.}
24 | 
25 | \item{celltype}{A string of the name of cell type variable in the \code{colData} of the sce. Default is 'cell_type'.}
26 | 
27 | \item{pseudotime}{A string or a string vector of the name of pseudotime and (if exist)
28 | multiple lineages. Default is NULL.}
29 | 
30 | \item{spatial}{A length two string vector of the names of spatial coordinates. Default is NULL.}
31 | 
32 | \item{other_covariates}{A string or a string vector of the other covariates you want to include in the data.}
33 | 
34 | \item{ncell}{The number of cell you want to simulate. Default is \code{dim(sce)[2]} (the same number as the input data).
35 | If an arbitrary number is provided, the function will use Vine Copula to simulate a new covariate matrix.}
36 | 
37 | \item{corr_by}{A string or a string vector which indicates the groups for correlation structure. If '1', all cells have one estimated corr. If 'ind', no corr (features are independent). If others, this variable decides the corr structures.}
38 | 
39 | \item{parallelization}{A string indicating the specific parallelization function to use.
40 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
41 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
42 | 
43 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
44 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
45 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
46 | }
47 | \value{
48 | A list with the components:
49 | \describe{
50 |   \item{\code{count_mat}}{The expression matrix}
51 |   \item{\code{dat}}{The original covariate matrix}
52 |   \item{\code{newCovariate}}{The simulated new covariate matrix, is NULL if the parameter ncell is default}
53 |   \item{\code{filtered_gene}}{The genes that are excluded in the marginal and copula fitting 
54 | steps because these genes only express in less than two cells.}
55 | }
56 | }
57 | \description{
58 | This function constructs the input data for \code{\link{fit_marginal}}.
59 | }
60 | \details{
61 | This function takes a \code{SingleCellExperiment} object as the input.
62 | Based on users' choice, it constructs the matrix of covariates
63 | (explanatory variables) and the expression matrix (e.g., count matrix for scRNA-seq).
64 | }
65 | \examples{
66 |   data(example_sce)
67 |   my_data <- construct_data(
68 |   sce = example_sce,
69 |   assay_use = "counts",
70 |   celltype = "cell_type",
71 |   pseudotime = "pseudotime",
72 |   spatial = NULL,
73 |   other_covariates = NULL,
74 |   corr_by = "1"
75 |   )
76 | 
77 | 
78 | }
79 | 


--------------------------------------------------------------------------------
/man/example_sce.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{example_sce}
 5 | \alias{example_sce}
 6 | \title{A SingleCellExperiment object containing both cell type and pseudotime}
 7 | \format{
 8 | A dataset with 10 rows (genes) and 1289 cols (cells)
 9 | }
10 | \usage{
11 | data("example_sce")
12 | }
13 | \value{
14 | The corresponding SingleCellExperiment object
15 | }
16 | \description{
17 | A SingleCellExperiment object containing both cell type and pseudotime
18 | }
19 | \keyword{datasets}
20 | 


--------------------------------------------------------------------------------
/man/extract_para.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/extract_para.R
 3 | \name{extract_para}
 4 | \alias{extract_para}
 5 | \title{Extract the parameters of each cell's distribution}
 6 | \usage{
 7 | extract_para(
 8 |   sce,
 9 |   assay_use = "counts",
10 |   marginal_list,
11 |   n_cores,
12 |   family_use,
13 |   new_covariate,
14 |   parallelization = "mcmapply",
15 |   BPPARAM = NULL,
16 |   data
17 | )
18 | }
19 | \arguments{
20 | \item{sce}{A \code{SingleCellExperiment} object.}
21 | 
22 | \item{assay_use}{A string which indicates the assay you will use in the sce. Default is 'counts'.}
23 | 
24 | \item{marginal_list}{A list of fitted regression models from \code{\link{fit_marginal}} for each gene in sce.}
25 | 
26 | \item{n_cores}{An integer. The number of cores to use.}
27 | 
28 | \item{family_use}{A string of the marginal distribution.
29 | Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian', which represent 'poisson distribution',
30 | 'negative binomial distribution', 'zero-inflated poisson distribution', 'zero-inflated negative binomial distribution',
31 | and 'gaussian distribution' respectively.}
32 | 
33 | \item{new_covariate}{A data.frame which contains covariates of targeted simulated data from  \code{\link{construct_data}} and the
34 | correlation group assignment for each cell in the column 'corr_group'.}
35 | 
36 | \item{parallelization}{A string indicating the specific parallelization function to use.
37 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
38 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
39 | 
40 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
41 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
42 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
43 | 
44 | \item{data}{A dataframe which is used when fitting the gamlss model}
45 | }
46 | \value{
47 | A list with the components:
48 | \describe{
49 |   \item{\code{mean_mat}}{A cell by feature matrix of the mean parameter.}
50 |   \item{\code{sigma_mat}}{A cell by feature matrix of the sigma parameter (for Gaussian, the variance; for NB, the dispersion.).}
51 |   \item{\code{zero_mat}}{A cell by feature matrix of the zero-inflation parameter (only non-zero for ZIP and ZINB).}
52 | }
53 | }
54 | \description{
55 | \code{extract_para} generates parameter matrices which determine each cell's distribution
56 | }
57 | \details{
58 | The function takes the new covariate (if use) from \code{\link{construct_data}} and
59 | marginal models from \code{\link{fit_marginal}}.
60 | }
61 | \examples{
62 |   data(example_sce)
63 |   my_data <- construct_data(
64 |   sce = example_sce,
65 |   assay_use = "counts",
66 |   celltype = "cell_type",
67 |   pseudotime = "pseudotime",
68 |   spatial = NULL,
69 |   other_covariates = NULL,
70 |   corr_by = "1"
71 |   )
72 |   my_marginal <- fit_marginal(
73 |   data = my_data,
74 |   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
75 |   sigma_formula = "1",
76 |   family_use = "nb",
77 |   n_cores = 1,
78 |   usebam = FALSE
79 |   )
80 |   my_copula <- fit_copula(
81 |   sce = example_sce,
82 |   assay_use = "counts",
83 |   marginal_list = my_marginal,
84 |   family_use = c(rep("nb", 5), rep("zip", 5)),
85 |   copula = "vine",
86 |   n_cores = 1,
87 |   input_data = my_data$dat
88 |   )
89 |   my_para <- extract_para(
90 |     sce = example_sce,
91 |     marginal_list = my_marginal,
92 |     n_cores = 1,
93 |     family_use = c(rep("nb", 5), rep("zip", 5)),
94 |     new_covariate = my_data$new_covariate,
95 |     data = my_data$dat
96 |   )
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/man/figures/scDesign3_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SONGDONGYUAN1994/scDesign3/4370074cc5392ddd7821e66e1e1c1d1181f21d3d/man/figures/scDesign3_illustration.png


--------------------------------------------------------------------------------
/man/fit_copula.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/fit_copula.R
  3 | \name{fit_copula}
  4 | \alias{fit_copula}
  5 | \title{Fit the copula model}
  6 | \usage{
  7 | fit_copula(
  8 |   sce,
  9 |   assay_use,
 10 |   input_data,
 11 |   empirical_quantile = FALSE,
 12 |   marginal_list,
 13 |   family_use,
 14 |   copula = "gaussian",
 15 |   DT = TRUE,
 16 |   pseudo_obs = FALSE,
 17 |   epsilon = 1e-06,
 18 |   family_set = c("gaussian", "indep"),
 19 |   important_feature = "all",
 20 |   if_sparse = FALSE,
 21 |   correlation_function = "default",
 22 |   n_cores,
 23 |   parallelization = "mcmapply",
 24 |   BPPARAM = NULL
 25 | )
 26 | }
 27 | \arguments{
 28 | \item{sce}{A \code{SingleCellExperiment} object.}
 29 | 
 30 | \item{assay_use}{A string which indicates the assay you will use in the sce.
 31 | Default is 'counts'.}
 32 | 
 33 | \item{input_data}{The input data, which is one of the output from \code{\link{construct_data}}.}
 34 | 
 35 | \item{empirical_quantile}{Please only use it if you clearly know what will happen! A logic variable. If TRUE, DO NOT fit the copula and use the EMPIRICAL CDF values of the original data; it will make the simulated data fixed (no randomness). Default is FALSE. Only works if ncell is the same as your original data.}
 36 | 
 37 | \item{marginal_list}{A list of fitted regression models from \code{\link{fit_marginal}}.}
 38 | 
 39 | \item{family_use}{A string or a vector of strings of the marginal distribution. Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.}
 40 | 
 41 | \item{copula}{A string of the copula choice. Must be one of 'gaussian' or 'vine'. Default is 'gaussian'. Note that vine copula may have better modeling of high-dimensions, but can be very slow when features are >1000.}
 42 | 
 43 | \item{DT}{A logic variable. If TRUE, perform the distributional transformation
 44 | to make the discrete data 'continuous'. This is useful for discrete distributions (e.g., Poisson, NB).
 45 | Default is TRUE. Note that for continuous data (e.g., Gaussian), DT does not make sense and should be set as FALSE.}
 46 | 
 47 | \item{pseudo_obs}{A logic variable. If TRUE, use the empirical quantiles instead of theoretical quantiles for fitting copula.
 48 | Default is FALSE.}
 49 | 
 50 | \item{epsilon}{A numeric variable for preventing the transformed quantiles to collapse to 0 or 1.}
 51 | 
 52 | \item{family_set}{A string or a string vector of the bivariate copula families. Default is c("gaussian", "indep").}
 53 | 
 54 | \item{important_feature}{A numeric value or vector which indicates whether a gene will be used in correlation estimation or not. If this is a numeric value, then
 55 | gene with zero proportion greater than this value will be excluded form gene-gene correlation estimation. If this is a vector, then this should
 56 | be a logical vector with length equal to the number of genes in \code{sce}. \code{TRUE} in the logical vector means the corresponding gene will be included in
 57 | gene-gene correlation estimation and \code{FALSE} in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation.
 58 | The default value for is "all" (a special string which means no filtering).}
 59 | 
 60 | \item{if_sparse}{A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse.}
 61 | 
 62 | \item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.}
 63 | 
 64 | \item{n_cores}{An integer. The number of cores to use.}
 65 | 
 66 | \item{parallelization}{A string indicating the specific parallelization function to use.
 67 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 68 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
 69 | 
 70 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 71 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 72 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
 73 | }
 74 | \value{
 75 | A list with the components:
 76 | \describe{
 77 |   \item{\code{new_mvu}}{A matrix of the new multivariate uniform distribution from the copula.}
 78 |   \item{\code{copula_list}}{A list of the fitted copula model. If using Gaussian copula, a list of correlation matrices; if vine, a list of vine objects.}
 79 |   \item{\code{model_aic}}{A vector of the marginal AIC and the copula AIC.}
 80 |   \item{\code{model_bic}}{A vector of the marginal BIC and the copula BIC.}
 81 | }
 82 | }
 83 | \description{
 84 | \code{fit_copula} fits the copula model.
 85 | }
 86 | \details{
 87 | This function takes the result from \code{\link{fit_marginal}} as the input and
 88 | and fit the copula model on the residuals.
 89 | }
 90 | \examples{
 91 |   data(example_sce)
 92 |   my_data <- construct_data(
 93 |   sce = example_sce,
 94 |   assay_use = "counts",
 95 |   celltype = "cell_type",
 96 |   pseudotime = "pseudotime",
 97 |   spatial = NULL,
 98 |   other_covariates = NULL,
 99 |   corr_by = "1"
100 |   )
101 |   my_marginal <- fit_marginal(
102 |   data = my_data,
103 |   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
104 |   sigma_formula = "1",
105 |   family_use = "nb",
106 |   n_cores = 1,
107 |   usebam = FALSE
108 |   )
109 |   my_copula <- fit_copula(
110 |   sce = example_sce,
111 |   assay_use = "counts",
112 |   marginal_list = my_marginal,
113 |   family_use = c(rep("nb", 5), rep("zip", 5)),
114 |   copula = "vine",
115 |   n_cores = 1,
116 |   input_data = my_data$dat
117 |   )
118 |   
119 | }
120 | 


--------------------------------------------------------------------------------
/man/fit_marginal.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fit_marginal.R
 3 | \name{fit_marginal}
 4 | \alias{fit_marginal}
 5 | \title{Fit the marginal models}
 6 | \usage{
 7 | fit_marginal(
 8 |   data,
 9 |   predictor = "gene",
10 |   mu_formula,
11 |   sigma_formula,
12 |   family_use,
13 |   n_cores,
14 |   usebam = FALSE,
15 |   edf_flexible = FALSE,
16 |   parallelization = "mcmapply",
17 |   BPPARAM = NULL,
18 |   trace = FALSE,
19 |   simplify = FALSE,
20 |   filter_cells = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{data}{An object from \code{\link{construct_data}}.}
25 | 
26 | \item{predictor}{A string of the predictor for the gam/gamlss model. Default is "gene". This is just a name.}
27 | 
28 | \item{mu_formula}{A string of the mu parameter formula. It follows the format of formula in \code{\link[mgcv]{bam}}. Note: if the formula has multiple smoothers (\code{s()}) (we do not recommend this), please put the one with largest k (most complex one) as the first one.}
29 | 
30 | \item{sigma_formula}{A string of the sigma parameter formula}
31 | 
32 | \item{family_use}{A string or a vector of strings of the marginal distribution.
33 | Must be one of 'binomial', 'poisson', 'nb', 'zip', 'zinb' or 'gaussian', which represent 'poisson distribution',
34 | 'negative binomial distribution', 'zero-inflated poisson distribution', 'zero-inflated negative binomial distribution',
35 | and 'gaussian distribution' respectively.}
36 | 
37 | \item{n_cores}{An integer. The number of cores to use.}
38 | 
39 | \item{usebam}{A logic variable. If use \code{\link[mgcv]{bam}} for acceleration.}
40 | 
41 | \item{edf_flexible}{A logic variable. It uses simpler model to accelerate the marginal fitting with a mild loss of accuracy. If TRUE, the fitted regression model will use the fitted relationship between Gini coefficient and the effective degrees of freedom on a random selected gene sets. Default is FALSE.}
42 | 
43 | \item{parallelization}{A string indicating the specific parallelization function to use.
44 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
45 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
46 | 
47 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
48 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
49 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
50 | 
51 | \item{trace}{A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss will be returned.
52 | will be returned, FALSE otherwise. Default is FALSE.}
53 | 
54 | \item{simplify}{A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}. Default is FALSE.}
55 | 
56 | \item{filter_cells}{A logic variable. If TRUE, when all covariates used for fitting the GAM/GAMLSS model are categorical, the code will check each unique combination of categories and remove cells in that category if it has all zero gene expression for each fitted gene.}
57 | }
58 | \value{
59 | A list of fitted regression models. The length is equal to the total feature number.
60 | }
61 | \description{
62 | \code{fit_marginal} fits the per-feature regression models.
63 | }
64 | \details{
65 | The function takes the result from \code{\link{construct_data}} as the input,
66 | and fit the regression models for each feature based on users' specification.
67 | }
68 | \examples{
69 |   data(example_sce)
70 |   my_data <- construct_data(
71 |   sce = example_sce,
72 |   assay_use = "counts",
73 |   celltype = "cell_type",
74 |   pseudotime = "pseudotime",
75 |   spatial = NULL,
76 |   other_covariates = NULL,
77 |   corr_by = "1"
78 |   )
79 |   my_marginal <- fit_marginal(
80 |   data = my_data,
81 |   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
82 |   sigma_formula = "1",
83 |   family_use = "nb",
84 |   n_cores = 1,
85 |   usebam = FALSE
86 |   )
87 | 
88 | }
89 | 


--------------------------------------------------------------------------------
/man/ga.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gamlss_fix.R
 3 | \name{ga}
 4 | \alias{ga}
 5 | \title{Functions from gamlss/gamlss.add with bugs fixed}
 6 | \usage{
 7 | ga(formula, envir, control = ga.control(...), ...)
 8 | }
 9 | \arguments{
10 | \item{formula}{A formula of the model.}
11 | 
12 | \item{envir}{The environment.}
13 | 
14 | \item{control}{The control of the model fitting.}
15 | 
16 | \item{...}{Other arguments.}
17 | }
18 | \value{
19 | A xvar list.
20 | }
21 | \description{
22 | An additive function to be used while fitting GAMLSS models. The interface for \code{gam()} in the \pkg{mgcv} package.
23 | }
24 | \section{ga}{
25 | NA
26 | }
27 | 
28 | \examples{
29 | print("No example")
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/man/gamlss.ba.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gamlss_fix.R
 3 | \name{gamlss.ba}
 4 | \alias{gamlss.ba}
 5 | \title{Support for Function ba()}
 6 | \usage{
 7 | gamlss.ba(x, y, w, xeval = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The explanatory variables}
11 | 
12 | \item{y}{Iterative y variable}
13 | 
14 | \item{w}{Iterative weights}
15 | 
16 | \item{xeval}{If xeval=TRUE then prediction is used}
17 | 
18 | \item{...}{Other arguments}
19 | }
20 | \value{
21 | Not used
22 | }
23 | \description{
24 | This is support for the  smoother functions \code{ba()} interfaces for Simon Wood's \code{bam()} functions from package \pkg{mgcv}. It is not intended to be called directly by users. From \code{gamlss.add::gamlss.ba}.
25 | }
26 | \examples{
27 | print("No example")
28 | }
29 | 


--------------------------------------------------------------------------------
/man/gamlss.ga.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gamlss_fix.R
 3 | \name{gamlss.ga}
 4 | \alias{gamlss.ga}
 5 | \title{Support for Function ga()}
 6 | \usage{
 7 | gamlss.ga(x, y, w, xeval = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{The explanatory variables}
11 | 
12 | \item{y}{Iterative y variable}
13 | 
14 | \item{w}{Iterative weights}
15 | 
16 | \item{xeval}{If xeval=TRUE then prediction is used}
17 | 
18 | \item{...}{Other arguments}
19 | }
20 | \value{
21 | Not used
22 | }
23 | \description{
24 | This is support for the  smoother functions \code{ga()} interfaces for Simon Wood's \code{gam()} functions from package \pkg{mgcv}. It is not intended to be called directly by users. From \code{gamlss.add::gamlss.ga}.
25 | }
26 | \examples{
27 | print("No example")
28 | }
29 | 


--------------------------------------------------------------------------------
/man/perform_lrt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/perform_lrt.R
 3 | \name{perform_lrt}
 4 | \alias{perform_lrt}
 5 | \title{Perform the likelihood ratio test}
 6 | \usage{
 7 | perform_lrt(alter_marginal, null_marginal)
 8 | }
 9 | \arguments{
10 | \item{alter_marginal}{A list of marginal models from the alternative hypothesis.}
11 | 
12 | \item{null_marginal}{A list of marginal models from the null hypothesis. It must be strictly nested in the alternative model.}
13 | }
14 | \value{
15 | A data.frame of the LRT result.
16 | }
17 | \description{
18 | \code{perform_lrt} performs the likelihood ratio test to compare two list of marginal models.
19 | }
20 | \details{
21 | The function takes two lists of marginal models (by default, the first list is the alternative and the second is the null)
22 | from \code{\link{fit_marginal}}. Note that LRT only makes sense for NESTED models. This can be quite tricky if you use penalized-splines (e.g., for trajectory data).
23 | }
24 | \examples{
25 | data(example_sce)
26 | my_data <- construct_data(
27 | sce = example_sce,
28 | assay_use = "counts",
29 | celltype = "cell_type",
30 | pseudotime = "pseudotime",
31 | spatial = NULL,
32 | other_covariates = NULL,
33 | corr_by = "cell_type"
34 | )
35 | 
36 | my_data2 <- construct_data(
37 |   sce = example_sce,
38 |   assay_use = "counts",
39 |   celltype = "cell_type",
40 |   pseudotime = "pseudotime",
41 |   spatial = NULL,
42 |   other_covariates = NULL,
43 |   corr_by = "pseudotime",
44 |   ncell = 10000
45 | )
46 | 
47 | my_marginal1 <- fit_marginal(
48 |   data = my_data,
49 |   mu_formula = "1",
50 |   sigma_formula = "1",
51 |   family_use = "nb",
52 |   n_cores = 1,
53 |   usebam = FALSE
54 | )
55 | my_marginal2 <- fit_marginal(
56 |   data = my_data,
57 |   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
58 |   sigma_formula = "1",
59 |   family_use = "nb",
60 |   n_cores = 1,
61 |   usebam = FALSE
62 | )
63 | my_fit1 <- lapply(my_marginal1, function(x)x$fit)
64 | my_fit2 <- lapply(my_marginal2, function(x)x$fit)
65 | my_pvalue <- perform_lrt(my_fit2, my_fit1)
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/man/plot_reduceddim.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_reduceddim.R
 3 | \name{plot_reduceddim}
 4 | \alias{plot_reduceddim}
 5 | \title{Dimensionality reduction and visualization}
 6 | \usage{
 7 | plot_reduceddim(
 8 |   ref_sce,
 9 |   sce_list,
10 |   name_vec,
11 |   assay_use = "logcounts",
12 |   pc_umap = TRUE,
13 |   n_pc = 50,
14 |   center = TRUE,
15 |   scale. = TRUE,
16 |   if_plot = TRUE,
17 |   shape_by = NULL,
18 |   color_by,
19 |   point_size = 1
20 | )
21 | }
22 | \arguments{
23 | \item{ref_sce}{The reference sce.}
24 | 
25 | \item{sce_list}{A list of synthetic sce.}
26 | 
27 | \item{name_vec}{A vector of the names of each dataset. The length should be \code{length(sce_list) + 1}, where the first name is for \code{ref_sce}.}
28 | 
29 | \item{assay_use}{A string which indicates the assay you will use in the sce.
30 | Default is 'logcounts'.}
31 | 
32 | \item{pc_umap}{A logic value of whether using PCs as the input of UMAP. Default is TRUE.}
33 | 
34 | \item{n_pc}{An integer of the number of PCs.}
35 | 
36 | \item{center}{A logic value of whether centering the data before PCA. Default is TRUE.}
37 | 
38 | \item{scale.}{A logic value of whether scaling the data before PCA. Default is TRUE.}
39 | 
40 | \item{if_plot}{A logic value of whether returning the plot. If FALSE, return the reduced dimensions of each dataset.}
41 | 
42 | \item{shape_by}{A string which indicates the column in \code{colData} used for shape.}
43 | 
44 | \item{color_by}{A string which indicates the column in \code{colData} used for color.}
45 | 
46 | \item{point_size}{A numeric value of the point size in the final plot. Default is 1.}
47 | }
48 | \value{
49 | The ggplot or the data.frame of reduced dimensions.
50 | }
51 | \description{
52 | \code{plot_reduceddim} performs the dimensionality reduction
53 | }
54 | \details{
55 | This function takes a reference sce and a list of new sces, performs the dimensionality reduction on the reference data, 
56 | projects the synthetic datasets on the same low dimensional space,
57 | then visualize the results.
58 | }
59 | 


--------------------------------------------------------------------------------
/man/scdesign3.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/scdesign3.R
  3 | \name{scdesign3}
  4 | \alias{scdesign3}
  5 | \title{The wrapper for the whole scDesign3 pipeline}
  6 | \usage{
  7 | scdesign3(
  8 |   sce,
  9 |   assay_use = "counts",
 10 |   celltype,
 11 |   pseudotime = NULL,
 12 |   spatial = NULL,
 13 |   other_covariates,
 14 |   ncell = dim(sce)[2],
 15 |   mu_formula,
 16 |   sigma_formula = "1",
 17 |   family_use = "nb",
 18 |   n_cores = 2,
 19 |   correlation_function = "default",
 20 |   usebam = FALSE,
 21 |   edf_flexible = FALSE,
 22 |   corr_formula,
 23 |   empirical_quantile = FALSE,
 24 |   copula = "gaussian",
 25 |   if_sparse = FALSE,
 26 |   fastmvn = FALSE,
 27 |   DT = TRUE,
 28 |   pseudo_obs = FALSE,
 29 |   family_set = c("gauss", "indep"),
 30 |   important_feature = "all",
 31 |   nonnegative = TRUE,
 32 |   nonzerovar = FALSE,
 33 |   return_model = FALSE,
 34 |   simplify = FALSE,
 35 |   parallelization = "mcmapply",
 36 |   n_rep = 1,
 37 |   BPPARAM = NULL,
 38 |   trace = FALSE
 39 | )
 40 | }
 41 | \arguments{
 42 | \item{sce}{A \code{SingleCellExperiment} object.}
 43 | 
 44 | \item{assay_use}{A string which indicates the assay you will use in the sce. Default is 'counts'.}
 45 | 
 46 | \item{celltype}{A string of the name of cell type variable in the \code{colData} of the sce. Default is 'cell_type'.}
 47 | 
 48 | \item{pseudotime}{A string or a string vector of the name of pseudotime and (if exist)
 49 | multiple lineages. Default is NULL.}
 50 | 
 51 | \item{spatial}{A length two string vector of the names of spatial coordinates. Default is NULL.}
 52 | 
 53 | \item{other_covariates}{A string or a string vector of the other covariates you want to include in the data.}
 54 | 
 55 | \item{ncell}{The number of cell you want to simulate. Default is \code{dim(sce)[2]} (the same number as the input data).}
 56 | 
 57 | \item{mu_formula}{A string of the mu parameter formula}
 58 | 
 59 | \item{sigma_formula}{A string of the sigma parameter formula}
 60 | 
 61 | \item{family_use}{A string of the marginal distribution.
 62 | Must be one of 'poisson', 'nb', 'zip', 'zinb' or 'gaussian'.}
 63 | 
 64 | \item{n_cores}{An integer. The number of cores to use.}
 65 | 
 66 | \item{correlation_function}{A string. If 'default', the function from \code{Rfast}; if 'coop', the function from \code{coop}, which calls BLAS.}
 67 | 
 68 | \item{usebam}{A logic variable. If use \code{\link[mgcv]{bam}} for acceleration in marginal fitting.}
 69 | 
 70 | \item{edf_flexible}{A logic variable. It is used for accelerating for spatial model if k is large in 'mu_formula'. Default is FALSE.}
 71 | 
 72 | \item{corr_formula}{A string of the correlation structure.}
 73 | 
 74 | \item{empirical_quantile}{Please only use it if you clearly know what will happen! A logic variable. If TRUE, DO NOT fit the copula and use the EMPIRICAL CDF values of the original data; it will make the simulated data fixed (no randomness). Default is FALSE. Only works if ncell is the same as your original data.}
 75 | 
 76 | \item{copula}{A string of the copula choice. Must be one of 'gaussian' or 'vine'. Default is 'gaussian'. Note that vine copula may have better modeling of high-dimensions, but can be very slow when features are >1000.}
 77 | 
 78 | \item{if_sparse}{A logic variable. Only works for Gaussian copula (\code{family_set = "gaussian"}). If TRUE, a thresholding strategy will make the corr matrix sparse.}
 79 | 
 80 | \item{fastmvn}{An logical variable. If TRUE, the sampling of multivariate Gaussian is done by \code{mvnfast}, otherwise by \code{mvtnorm}. Default is FALSE. It only matters for Gaussian copula.}
 81 | 
 82 | \item{DT}{A logic variable. If TRUE, perform the distributional transformation
 83 | to make the discrete data 'continuous'. This is useful for discrete distributions (e.g., Poisson, NB).
 84 | Default is TRUE. Note that for continuous data (e.g., Gaussian), DT does not make sense and should be set as FALSE.}
 85 | 
 86 | \item{pseudo_obs}{A logic variable. If TRUE, use the empirical quantiles instead of theoretical quantiles for fitting copula.
 87 | Default is FALSE.}
 88 | 
 89 | \item{family_set}{A string or a string vector of the bivariate copula families. Default is c("gauss", "indep"). For more information please check package \code{rvinecoplib}.}
 90 | 
 91 | \item{important_feature}{A numeric value or vector which indicates whether a gene will be used in correlation estimation or not. If this is a numeric value, then
 92 | gene with zero proportion greater than this value will be excluded form gene-gene correlation estimation. If this is a vector, then this should
 93 | be a logical vector with length equal to the number of genes in \code{sce}. \code{TRUE} in the logical vector means the corresponding gene will be included in
 94 | gene-gene correlation estimation and \code{FALSE} in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation.
 95 | The default value is "all" (a special string which means no filtering).}
 96 | 
 97 | \item{nonnegative}{A logical variable. If TRUE, values < 0 in the synthetic data will be converted to 0. Default is TRUE (since the expression matrix is nonnegative).}
 98 | 
 99 | \item{nonzerovar}{A logical variable. If TRUE, for any gene with zero variance, a cell will be replaced with 1. This is designed for avoiding potential errors, for example, PCA. Default is FALSE.}
100 | 
101 | \item{return_model}{A logic variable. If TRUE, the marginal models and copula models will be returned. Default is FALSE.}
102 | 
103 | \item{simplify}{A logic variable. If TRUE, the fitted regression model will only keep the essential contains for \code{predict}, otherwise the fitted models can be VERY large. Default is FALSE.}
104 | 
105 | \item{parallelization}{A string indicating the specific parallelization function to use.}
106 | 
107 | \item{n_rep}{An integer number. The number of replicates of simulated new count matrix. Default is 1.
108 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
109 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
110 | 
111 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
112 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
113 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
114 | 
115 | \item{trace}{A logic variable. If TRUE, the warning/error log and runtime for gam/gamlss
116 | will be returned, FALSE otherwise. Default is FALSE.}
117 | }
118 | \value{
119 | A list with the components:
120 | \describe{
121 |   \item{\code{new_count}}{A matrix of the new simulated count (expression) matrix.}
122 |   \item{\code{new_covariate}}{A data.frame of the new covariate matrix.}
123 |   \item{\code{model_aic}}{The model AIC.}
124 |   \item{\code{marginal_list}}{A list of marginal regression models if return_model = TRUE.}
125 |   \item{\code{corr_list}}{A list of correlation models (conditional copulas) if return_model = TRUE.}
126 | }
127 | }
128 | \description{
129 | \code{scdesign3} takes the input data, fits the model and
130 | }
131 | \examples{
132 | data(example_sce)
133 | my_simu <- scdesign3(
134 | sce = example_sce,
135 | assay_use = "counts",
136 | celltype = "cell_type",
137 | pseudotime = "pseudotime",
138 | spatial = NULL,
139 | other_covariates = NULL,
140 | mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
141 | sigma_formula = "1",
142 | family_use = "nb",
143 | n_cores = 2,
144 | usebam = FALSE,
145 | edf_flexible = FALSE,
146 | corr_formula = "pseudotime",
147 | copula = "gaussian",
148 | if_sparse = TRUE,
149 | DT = TRUE,
150 | pseudo_obs = FALSE,
151 | ncell = 1000,
152 | return_model = FALSE
153 | )
154 | 
155 | }
156 | 


--------------------------------------------------------------------------------
/man/simu_new.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/simu_new.R
  3 | \name{simu_new}
  4 | \alias{simu_new}
  5 | \title{Simulate new data}
  6 | \usage{
  7 | simu_new(
  8 |   sce,
  9 |   assay_use = "counts",
 10 |   mean_mat,
 11 |   sigma_mat,
 12 |   zero_mat,
 13 |   quantile_mat = NULL,
 14 |   copula_list,
 15 |   n_cores,
 16 |   fastmvn = FALSE,
 17 |   family_use,
 18 |   nonnegative = TRUE,
 19 |   nonzerovar = FALSE,
 20 |   input_data,
 21 |   new_covariate,
 22 |   important_feature = "all",
 23 |   parallelization = "mcmapply",
 24 |   BPPARAM = NULL,
 25 |   filtered_gene
 26 | )
 27 | }
 28 | \arguments{
 29 | \item{sce}{A \code{SingleCellExperiment} object.}
 30 | 
 31 | \item{assay_use}{A string which indicates the assay you will use in the sce. Default is 'counts'.}
 32 | 
 33 | \item{mean_mat}{A cell by feature matrix of the mean parameter.}
 34 | 
 35 | \item{sigma_mat}{A cell by feature matrix of the sigma parameter.}
 36 | 
 37 | \item{zero_mat}{A cell by feature matrix of the zero-inflation parameter.}
 38 | 
 39 | \item{quantile_mat}{A cell by feature matrix of the multivariate quantile.}
 40 | 
 41 | \item{copula_list}{A list of copulas for generating the multivariate quantile matrix. If provided, the \code{quantile_mat} must be NULL.}
 42 | 
 43 | \item{n_cores}{An integer. The number of cores to use.}
 44 | 
 45 | \item{fastmvn}{An logical variable. If TRUE, the sampling of multivariate Gaussian is done by \code{mvnfast}, otherwise by \code{mvtnorm}. Default is FALSE.}
 46 | 
 47 | \item{family_use}{A string of the marginal distribution.
 48 | Must be one of 'poisson', "binomial", 'nb', 'zip', 'zinb' or 'gaussian'.}
 49 | 
 50 | \item{nonnegative}{A logical variable. If TRUE, values < 0 in the synthetic data will be converted to 0. Default is TRUE (since the expression matrix is nonnegative).}
 51 | 
 52 | \item{nonzerovar}{A logical variable. If TRUE, for any gene with zero variance, a cell will be replaced with 1. This is designed for avoiding potential errors, for example, PCA.}
 53 | 
 54 | \item{input_data}{A input count matrix.}
 55 | 
 56 | \item{new_covariate}{A data.frame which contains covariates of targeted simulated data from  \code{\link{construct_data}}.}
 57 | 
 58 | \item{important_feature}{important_feature A string or vector which indicates whether a gene will be used in correlation estimation or not. If this is a string, then
 59 | this string must be either "all" (using all genes) or "auto", which indicates that the genes will be automatically selected based on the proportion of zero expression across cells
 60 | for each gene. Gene with zero proportion greater than 0.8 will be excluded form gene-gene correlation estimation. If this is a vector, then this should
 61 | be a logical vector with length equal to the number of genes in \code{sce}. \code{TRUE} in the logical vector means the corresponding gene will be included in
 62 | gene-gene correlation estimation and \code{FALSE} in the logical vector means the corresponding gene will be excluded from the gene-gene correlation estimation.
 63 | The default value for is "all".}
 64 | 
 65 | \item{parallelization}{A string indicating the specific parallelization function to use.
 66 | Must be one of 'mcmapply', 'bpmapply', or 'pbmcmapply', which corresponds to the parallelization function in the package
 67 | \code{parallel},\code{BiocParallel}, and \code{pbmcapply} respectively. The default value is 'mcmapply'.}
 68 | 
 69 | \item{BPPARAM}{A \code{MulticoreParam} object or NULL. When the parameter parallelization = 'mcmapply' or 'pbmcmapply',
 70 | this parameter must be NULL. When the parameter parallelization = 'bpmapply',  this parameter must be one of the
 71 | \code{MulticoreParam} object offered by the package 'BiocParallel. The default value is NULL.}
 72 | 
 73 | \item{filtered_gene}{A vector or NULL which contains genes that are excluded in the marginal and copula fitting 
 74 | steps because these genes only express in less than two cells. This can be obtain from  \code{\link{construct_data}}}
 75 | }
 76 | \value{
 77 | A feature by cell matrix of the new simulated count (expression) matrix or sparse matrix.
 78 | }
 79 | \description{
 80 | \code{simu_new} generates new simulated data based on fitted marginal and copula models.
 81 | }
 82 | \details{
 83 | The function takes the new covariate (if use) from \code{\link{construct_data}},
 84 | parameter matrices from \code{\link{extract_para}} and multivariate Unifs from \code{\link{fit_copula}}.
 85 | }
 86 | \examples{
 87 |   data(example_sce)
 88 |   my_data <- construct_data(
 89 |   sce = example_sce,
 90 |   assay_use = "counts",
 91 |   celltype = "cell_type",
 92 |   pseudotime = "pseudotime",
 93 |   spatial = NULL,
 94 |   other_covariates = NULL,
 95 |   corr_by = "1"
 96 |   )
 97 |   my_marginal <- fit_marginal(
 98 |   data = my_data,
 99 |   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
100 |   sigma_formula = "1",
101 |   family_use = "nb",
102 |   n_cores = 1,
103 |   usebam = FALSE
104 |   )
105 |   my_copula <- fit_copula(
106 |   sce = example_sce,
107 |   assay_use = "counts",
108 |   marginal_list = my_marginal,
109 |   family_use = c(rep("nb", 5), rep("zip", 5)),
110 |   copula = "vine",
111 |   n_cores = 1,
112 |   input_data = my_data$dat
113 |   )
114 |   my_para <- extract_para(
115 |     sce = example_sce,
116 |     marginal_list = my_marginal,
117 |     n_cores = 1,
118 |     family_use = c(rep("nb", 5), rep("zip", 5)),
119 |     new_covariate = my_data$new_covariate,
120 |     data = my_data$dat
121 |   )
122 |   my_newcount <- simu_new(
123 |   sce = example_sce,
124 |   mean_mat = my_para$mean_mat,
125 |   sigma_mat = my_para$sigma_mat,
126 |   zero_mat = my_para$zero_mat,
127 |   quantile_mat = NULL,
128 |   copula_list = my_copula$copula_list,
129 |   n_cores = 1,
130 |   family_use = c(rep("nb", 5), rep("zip", 5)),
131 |   input_data = my_data$dat,
132 |   new_covariate = my_data$new_covariate,
133 |   important_feature = my_copula$important_feature,
134 |   filtered_gene = my_data$filtered_gene
135 |   )
136 | 
137 | }
138 | 


--------------------------------------------------------------------------------
/man/sparse_cov.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sparse_cov.R
 3 | \name{sparse_cov}
 4 | \alias{sparse_cov}
 5 | \title{This function computes the thresholding sparse covariance/correlation estimator 
 6 | with the optimal threshold level.}
 7 | \usage{
 8 | sparse_cov(
 9 |   data,
10 |   method = c("cv", "qiu"),
11 |   operator = c("hard", "soft", "scad", "al"),
12 |   corr = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{data}{The data matrix.}
17 | 
18 | \item{method}{The choice of method to select the optimal threshold level.}
19 | 
20 | \item{operator}{The choice of the thresholding operator.}
21 | 
22 | \item{corr}{The indicator of computing correlation or covariance matrix.}
23 | }
24 | \value{
25 | The thresholding sparse covariance/correlation estimator.
26 | }
27 | \description{
28 | Part from Chenxin Jiang
29 | }
30 | \examples{
31 | print("No example")
32 | }
33 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(scDesign3)
3 | 
4 | test_check("scDesign3")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-scDesign3.R:
--------------------------------------------------------------------------------
  1 | #context("Run scDesign3")
  2 | library(scDesign3)
  3 | 
  4 | test_that("Run scDesign3", {
  5 |   data(example_sce)
  6 |   my_data <- construct_data(
  7 |     sce = example_sce,
  8 |     assay_use = "counts",
  9 |     celltype = "cell_type",
 10 |     pseudotime = "pseudotime",
 11 |     spatial = NULL,
 12 |     other_covariates = NULL,
 13 |     corr_by = "cell_type"
 14 |   )
 15 | 
 16 |   my_data2 <- construct_data(
 17 |     sce = example_sce,
 18 |     assay_use = "counts",
 19 |     celltype = "cell_type",
 20 |     pseudotime = "pseudotime",
 21 |     spatial = NULL,
 22 |     other_covariates = NULL,
 23 |     corr_by = "pseudotime",
 24 |     ncell = 10000
 25 |   )
 26 | 
 27 |   my_marginal1 <- fit_marginal(
 28 |     data = my_data,
 29 |     mu_formula = "1",
 30 |     sigma_formula = "1",
 31 |     family_use = "nb",
 32 |     n_cores = 1,
 33 |     usebam = FALSE
 34 |   )
 35 | 
 36 |    my_marginal2 <- fit_marginal(
 37 |     data = my_data,
 38 |     mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 39 |     sigma_formula = "1",
 40 |     family_use = "nb",
 41 |     n_cores = 1,
 42 |     usebam = FALSE
 43 |   )
 44 | 
 45 |   my_fit1 <- lapply(my_marginal1, function(x)x$fit)
 46 |   my_fit2 <- lapply(my_marginal2, function(x)x$fit)
 47 | 
 48 |   my_pvalue <- perform_lrt(my_fit2, my_fit1)
 49 | 
 50 |   my_marginal3 <- fit_marginal(
 51 |     data = my_data,
 52 |     mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
 53 |     sigma_formula = "1", #s(pseudotime, bs = 'cr', k = 3)
 54 |     family_use = c(rep("nb", 9), rep("zip", 1)),
 55 |     n_cores = 2,
 56 |     usebam = TRUE, trace = TRUE, simplify = TRUE
 57 |   )
 58 | 
 59 |   my_copula <- fit_copula(
 60 |     sce = example_sce,
 61 |     assay_use = "counts",
 62 |     marginal_list = my_marginal3,
 63 |     family_use = c(rep("nb", 9), rep("zip", 1)),
 64 |     copula = "vine",
 65 |     n_cores = 1,
 66 |     input_data = my_data$dat
 67 |   )
 68 | 
 69 |   my_quantile_mat <- fit_copula(
 70 |     sce = example_sce,
 71 |     assay_use = "counts",
 72 |     empirical_quantile = TRUE,
 73 |     marginal_list = my_marginal3,
 74 |     family_use = c(rep("nb", 9), rep("zip", 1)),
 75 |     copula = "vine",
 76 |     n_cores = 2,
 77 |     input_data = my_data$dat
 78 |   )
 79 |   
 80 |   my_copula1 <- fit_copula(
 81 |     sce = example_sce,
 82 |     assay_use = "counts",
 83 |     marginal_list = my_marginal3,
 84 |     family_use = c(rep("nb", 5), rep("zip", 5)),
 85 |     copula = "gaussian",
 86 |     n_cores = 1,
 87 |     input_data = my_data$dat,
 88 |     if_sparse = TRUE
 89 |   )
 90 | 
 91 |   my_para <- extract_para(
 92 |     sce = example_sce,
 93 |     marginal_list = my_marginal3,
 94 |     n_cores = 1,
 95 |     family_use = c(rep("nb", 9), rep("zip", 1)),
 96 |     new_covariate = my_data2$new_covariate,
 97 |     data = my_data$dat
 98 |   )
 99 | 
100 |   my_newcount <- simu_new(
101 |     sce = example_sce,
102 |     mean_mat = my_para$mean_mat,
103 |     sigma_mat = my_para$sigma_mat,
104 |     zero_mat = my_para$zero_mat,
105 |     quantile_mat = NULL,
106 |     copula_list = my_copula$copula_list,
107 |     n_cores = 1,
108 |     family_use = c(rep("nb", 9), rep("zip", 1)),
109 |     input_data = my_data$dat,
110 |     new_covariate = my_data$new_covariate,
111 |     important_feature = my_copula$important_feature,
112 |     filtered_gene = my_data$filtered_gene
113 |   )
114 |   
115 |   my_newcount2 <- simu_new(
116 |     sce = example_sce,
117 |     mean_mat = my_para$mean_mat,
118 |     sigma_mat = my_para$sigma_mat,
119 |     zero_mat = my_para$zero_mat,
120 |     quantile_mat = my_quantile_mat$quantile_mat,
121 |     copula_list = NULL,
122 |     n_cores = 1,
123 |     family_use = c(rep("nb", 9), rep("zip", 1)),
124 |     input_data = my_data$dat,
125 |     new_covariate = my_data2$new_covariate,
126 |     important_feature = my_copula$important_feature,
127 |     filtered_gene = my_data$filtered_gene
128 |   )
129 | 
130 |   my_newcount3 <- simu_new(
131 |     sce = example_sce,
132 |     mean_mat = my_para$mean_mat,
133 |     sigma_mat = my_para$sigma_mat,
134 |     zero_mat = my_para$zero_mat,
135 |     quantile_mat = NULL,
136 |     copula_list = my_copula1$copula_list,
137 |     n_cores = 1,
138 |     family_use = c(rep("nb", 9), rep("zip", 1)),
139 |     input_data = my_data$dat,
140 |     new_covariate = my_data2$new_covariate,
141 |     important_feature = my_copula$important_feature,
142 |     filtered_gene = my_data$filtered_gene
143 |   )
144 |   
145 |   my_simu <- scdesign3(
146 |     sce = example_sce,
147 |     assay_use = "counts",
148 |     celltype = "cell_type",
149 |     pseudotime = "pseudotime",
150 |     spatial = NULL,
151 |     other_covariates = NULL,
152 |     mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
153 |     sigma_formula = "1",
154 |     family_use = c(rep("nb", 9), rep("zip", 1)),
155 |     n_cores = 2,
156 |     usebam = FALSE,
157 |     corr_formula = "pseudotime",
158 |     copula = "vine",
159 |     DT = TRUE,
160 |     pseudo_obs = FALSE,
161 |     ncell = 1000,
162 |     return_model = TRUE, simplify = TRUE, 
163 |     n_rep = 2
164 |   )
165 | 
166 |   # my_simu2 <- scdesign3(
167 |   #   sce = example_sce,
168 |   #   assay_use = "counts",
169 |   #   celltype = "cell_type",
170 |   #   pseudotime = "pseudotime",
171 |   #   spatial = NULL,
172 |   #   other_covariates = NULL,
173 |   #   mu_formula = "s(pseudotime, bs = 'cr', k = 10)",
174 |   #   sigma_formula = "s(pseudotime, bs = 'cr', k = 3)",
175 |   #   family_use = c(rep("nb", 5), rep("zip", 5)),
176 |   #   n_cores = 2,
177 |   #   usebam = FALSE,
178 |   #   corr_formula = "pseudotime",
179 |   #   empirical_quantile = TRUE,
180 |   #   copula = "vine",
181 |   #   DT = TRUE,
182 |   #   pseudo_obs = FALSE,
183 |   #   return_model = FALSE
184 |   # )
185 | })
186 | 
187 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/scDesign3.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "scDesign3 Quickstart"
  3 | author: 
  4 |   - name: Dongyuan Song
  5 |     affiliation:
  6 |     - Bioinformatics IDP, University of California, Los Angeles
  7 |     email: dongyuansong@ucla.edu
  8 |   - name: Qingyang Wang
  9 |     affiliation:
 10 |     - Department of Statistics, University of California, Los Angeles
 11 |     email: qw802@g.ucla.edu
 12 | output: 
 13 |   BiocStyle::html_document:
 14 |     self_contained: yes
 15 |     toc: true
 16 |     toc_float: true
 17 |     toc_depth: 2
 18 |     code_folding: show
 19 | date: "`r doc_date()`"
 20 | package: "`r pkg_ver('scDesign3')`"
 21 | vignette: >
 22 |   %\VignetteIndexEntry{scDesign3-quickstart-vignette}
 23 |   %\VignetteEngine{knitr::rmarkdown}
 24 |   %\VignetteEncoding{UTF-8}  
 25 | ---
 26 | ```{css, echo=FALSE}
 27 | pre {
 28 |   white-space: pre !important;
 29 |   overflow-x: scroll !important;
 30 | }
 31 | ```
 32 | 
 33 | ```{r setup, include = FALSE}
 34 | knitr::opts_chunk$set(
 35 |     message = FALSE,
 36 |     collapse = TRUE,
 37 |     comment = "#>",
 38 |     crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
 39 | )
 40 | tools::R_user_dir("scDesign3", which="cache")
 41 | ```
 42 | 
 43 | ```{r, message=FALSE, warning=FALSE, results='hide'}
 44 | library(scDesign3)
 45 | library(SingleCellExperiment)
 46 | library(ggplot2)
 47 | theme_set(theme_bw())
 48 | ```
 49 | 
 50 | ## Introduction
 51 | scDesign3 is a unified probabilistic framework that generates realistic in silico high-dimensional single-cell omics data of various cell states, including discrete cell types, continuous trajectories, and spatial locations by learning from real datasets. Since the functions of scDesign3 is very comprehensive, here we only introduce how scDesign3 simulates an scRNA-seq dataset with one continuous developmental trajectory. For more information, please check the Articles on our website: (https://songdongyuan1994.github.io/scDesign3/docs/index.html).
 52 | 
 53 | ## Read in the reference data
 54 | 
 55 | The raw data is from the [scvelo](https://scvelo.readthedocs.io/scvelo.datasets.pancreas/), which describes pancreatic endocrinogenesis. We pre-select the top 1000 highly variable genes and filter out some cell types to ensure a **single trajectory**.
 56 | 
 57 | ```{r}
 58 | example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581992")))
 59 | print(example_sce)
 60 | ```
 61 | To save computational time, we only use the top 100 genes.
 62 | ```{r}
 63 | example_sce <- example_sce[1:100, ]
 64 | ```
 65 | 
 66 | ## Simulation
 67 | The function `scdesign3()` takes in a `SinglecellExperiment` object with the cell covariates (such as cell types, pseudotime, or spatial coordinates) stored in the `colData` of the `SinglecellExperiment` object.
 68 | ```{r, message=FALSE, warning=FALSE, results='hide'}
 69 | set.seed(123)
 70 | example_simu <- scdesign3(
 71 |     sce = example_sce,
 72 |     assay_use = "counts",
 73 |     celltype = "cell_type",
 74 |     pseudotime = "pseudotime",
 75 |     spatial = NULL,
 76 |     other_covariates = NULL,
 77 |     mu_formula = "s(pseudotime, k = 10, bs = 'cr')",
 78 |     sigma_formula = "1", # If you want your dispersion also varies along pseudotime, use "s(pseudotime, k = 5, bs = 'cr')"
 79 |     family_use = "nb",
 80 |     n_cores = 2,
 81 |     usebam = FALSE,
 82 |     corr_formula = "1",
 83 |     copula = "gaussian",
 84 |     DT = TRUE,
 85 |     pseudo_obs = FALSE,
 86 |     return_model = FALSE,
 87 |     nonzerovar = FALSE
 88 |   )
 89 | ```
 90 | The output of `scdesign3()` is a list which includes:
 91 | 
 92 | * `new_count`: This is the synthetic count matrix generated by `scdesign3()`.
 93 | * `new_covariate`: 
 94 |   + If the parameter `ncell` is set to a number that is different from the number of cells in the input data, this will be a matrix that has the new cell covariates that are used for generating new data.
 95 |   + If the parameter `ncell` is the default value, this will be `NULL`.
 96 | * `model_aic`: This is a vector include the genes' marginal models' AIC, fitted copula's AIC, and total AIC, which is the sum of the previous two AIC. 
 97 | * `model_bic`: This is a vector include the genes' marginal models' BIC, fitted copula's BIC, and total BIC, which is the sum of the previous two BIC.
 98 | * `marginal_list`:
 99 |   + If the parameter `return_model` is set to `TRUE`, this will be a list which contains the fitted gam or gamlss model for all genes in the input data.
100 |   + If the parameter `return_model` is set to the default value `FALSE`, this will be `NULL`.
101 | * `corr_list`:
102 |   + If the parameter `return_model` is set to `TRUE`, this will be a list which contains the either a correlation matrix (when `copula = "gaussian"`) or the fitted vine copula (when `copula = "vine`) for each user specified correlation groups (based on the parameter `corr_by`).
103 |   + If the parameter `return_model` is set to the default value `FALSE`, this will be `NULL`.
104 | 
105 | In this example, since we did not change the parameter `ncell`, the synthetic count matrix will have the same dimension as the input count matrix. 
106 | ```{r}
107 | dim(example_simu$new_count)
108 | ```
109 | Then, we can create the `SinglecellExperiment` object using the synthetic count matrix and store the `logcounts` to the input and synthetic `SinglecellExperiment` objects.
110 | ```{r}
111 | logcounts(example_sce) <- log1p(counts(example_sce))
112 | simu_sce <- SingleCellExperiment(list(counts = example_simu$new_count), colData = example_simu$new_covariate)
113 | logcounts(simu_sce) <- log1p(counts(simu_sce))
114 | ```
115 | 
116 | ## Visualization
117 | ```{r}
118 | set.seed(123)
119 | compare_figure <- plot_reduceddim(ref_sce = example_sce, 
120 |                                   sce_list = list(simu_sce), 
121 |                                   name_vec = c("Reference", "scDesign3"),
122 |                                   assay_use = "logcounts", 
123 |                                   if_plot = TRUE, 
124 |                                   color_by = "pseudotime", 
125 |                                   n_pc = 20)
126 | plot(compare_figure$p_umap)
127 | ```
128 | 
129 | ## Session information
130 | ```{r}
131 | sessionInfo()
132 | ```
133 | 


--------------------------------------------------------------------------------