├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── pkgdown.yaml
├── .gitignore
├── DESCRIPTION
├── Meta
    └── vignette.rds
├── NAMESPACE
├── NEWS.md
├── R
    ├── 0_opts.R
    ├── 1.1_cif.R
    ├── 1_main.R
    ├── 2_sim.R
    ├── 3.1_spatial.R
    ├── 3.2_dyngrn.R
    ├── 6_technoise.R
    ├── 7_benchmark.R
    ├── 8_utils.R
    ├── 9.1_shiny.R
    ├── 9_meta.R
    ├── data.R
    ├── imports.R
    └── results.R
├── README.md
├── _pkgdown.yml
├── data
    ├── GRN_params_100.RData
    ├── GRN_params_1139.RData
    ├── dens_nonzero.RData
    ├── gene_len_pool.RData
    ├── len2nfrag.RData
    └── param_realdata.zeisel.imputed.RData
├── inst
    ├── extdata
    │   ├── Newick_ABCDE.txt
    │   └── Newick_animals.txt
    └── shiny-app
    │   ├── app.R
    │   └── www
    │       ├── .prettierrc
    │       ├── index.html
    │       ├── options.js
    │       ├── output.js
    │       ├── phyla1.png
    │       ├── phyla3.png
    │       ├── phyla5.png
    │       ├── scm_logo.png
    │       ├── style.css
    │       └── validate.js
├── man
    ├── GRN_params_100.Rd
    ├── GRN_params_1139.Rd
    ├── Get_1region_ATAC_correlation.Rd
    ├── Get_ATAC_correlation.Rd
    ├── OP.Rd
    ├── Phyla1.Rd
    ├── Phyla3.Rd
    ├── Phyla5.Rd
    ├── SampleDen.Rd
    ├── True2ObservedATAC.Rd
    ├── True2ObservedCounts.Rd
    ├── add_expr_noise.Rd
    ├── add_outliers.Rd
    ├── cci_cell_type_params.Rd
    ├── dens_nonzero.Rd
    ├── divide_batches.Rd
    ├── dot-amplifyOneCell.Rd
    ├── dot-calAmpBias.Rd
    ├── dot-continuousCIF.Rd
    ├── dot-divideBatchesImpl.Rd
    ├── dot-expandToBinary.Rd
    ├── dot-getCountCorrMatrix.Rd
    ├── dot-getParams.Rd
    ├── dot-normalizeGRNParams.Rd
    ├── dot-rnormTrunc.Rd
    ├── gen_1branch.Rd
    ├── gen_clutter.Rd
    ├── gene_corr_cci.Rd
    ├── gene_corr_regulator.Rd
    ├── gene_len_pool.Rd
    ├── len2nfrag.Rd
    ├── match_params.Rd
    ├── plot_cell_loc.Rd
    ├── plot_gene_module_cor_heatmap.Rd
    ├── plot_grid.Rd
    ├── plot_grn.Rd
    ├── plot_phyla.Rd
    ├── plot_rna_velocity.Rd
    ├── plot_tsne.Rd
    ├── run_shiny.Rd
    ├── scmultisim_help.Rd
    ├── sim_example.Rd
    ├── sim_example_spatial.Rd
    ├── sim_true_counts.Rd
    └── spatialGrid-class.Rd
├── pkgdown
    └── extra.css
├── tests
    ├── testthat.R
    └── testthat
    │   └── test-1_main.R
└── vignettes
    ├── .gitignore
    ├── basics.Rmd
    ├── options.Rmd
    ├── spatialCCI.Rmd
    └── workflow.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^doc$
 4 | ^bench$
 5 | ^figures$
 6 | ^Meta$
 7 | ^\.vscode$
 8 | ^\.idea$
 9 | ^tmp$
10 | ^_pkgdown\.yml$
11 | ^docs$
12 | ^pkgdown$
13 | ^\.github$
14 | ^vignettes/articles$
15 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 |   release:
 9 |     types: [published]
10 |   workflow_dispatch:
11 | 
12 | name: pkgdown.yaml
13 | 
14 | permissions: read-all
15 | 
16 | jobs:
17 |   pkgdown:
18 |     runs-on: ubuntu-latest
19 |     # Only restrict concurrency for non-PR jobs
20 |     concurrency:
21 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
22 |     env:
23 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
24 |     permissions:
25 |       contents: write
26 |     steps:
27 |       - uses: actions/checkout@v4
28 | 
29 |       - uses: r-lib/actions/setup-pandoc@v2
30 | 
31 |       - uses: r-lib/actions/setup-r@v2
32 |         with:
33 |           use-public-rspm: true
34 | 
35 |       - uses: r-lib/actions/setup-r-dependencies@v2
36 |         with:
37 |           extra-packages: any::pkgdown, local::.
38 |           needs: website
39 | 
40 |       - name: Build site
41 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
42 |         shell: Rscript {0}
43 | 
44 |       - name: Deploy to GitHub pages 🚀
45 |         if: github.event_name != 'pull_request'
46 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
47 |         with:
48 |           clean: false
49 |           branch: gh-pages
50 |           folder: docs
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.toptal.com/developers/gitignore/api/r,macos
  3 | # Edit at https://www.toptal.com/developers/gitignore?templates=r,macos
  4 | 
  5 | ### macOS ###
  6 | # General
  7 | .DS_Store
  8 | .AppleDouble
  9 | .LSOverride
 10 | 
 11 | # Icon must end with two \r
 12 | Icon
 13 | 
 14 | 
 15 | # Thumbnails
 16 | ._*
 17 | 
 18 | # Files that might appear in the root of a volume
 19 | .DocumentRevisions-V100
 20 | .fseventsd
 21 | .Spotlight-V100
 22 | .TemporaryItems
 23 | .Trashes
 24 | .VolumeIcon.icns
 25 | .com.apple.timemachine.donotpresent
 26 | 
 27 | # Directories potentially created on remote AFP share
 28 | .AppleDB
 29 | .AppleDesktop
 30 | Network Trash Folder
 31 | Temporary Items
 32 | .apdisk
 33 | 
 34 | ### R ###
 35 | # History files
 36 | .Rhistory
 37 | .Rapp.history
 38 | 
 39 | # Session Data files
 40 | .RData
 41 | 
 42 | # User-specific files
 43 | .Ruserdata
 44 | 
 45 | # Example code in package build process
 46 | *-Ex.R
 47 | 
 48 | # Output files from R CMD build
 49 | /*.tar.gz
 50 | 
 51 | # Output files from R CMD check
 52 | /*.Rcheck/
 53 | 
 54 | # RStudio files
 55 | .Rproj.user/
 56 | 
 57 | # produced vignettes
 58 | vignettes/*.html
 59 | vignettes/*.pdf
 60 | 
 61 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 62 | .httr-oauth
 63 | 
 64 | # knitr and R markdown default cache directories
 65 | *_cache/
 66 | /cache/
 67 | 
 68 | # Temporary files created by R markdown
 69 | *.utf8.md
 70 | *.knit.md
 71 | 
 72 | # R Environment Variables
 73 | .Renviron
 74 | 
 75 | # pkgdown site
 76 | docs/
 77 | 
 78 | # translation temp files
 79 | po/*~
 80 | 
 81 | ### R.Bookdown Stack ###
 82 | # R package: bookdown caching files
 83 | /*_files/
 84 | 
 85 | # End of https://www.toptal.com/developers/gitignore/api/r,macos
 86 | 
 87 | /temp
 88 | /sim
 89 | /bench
 90 | /.idea
 91 | vignettes/tmp.Rmd
 92 | vignettes/common.Rmd
 93 | /*.csv
 94 | /*.zip
 95 | /.vscode
 96 | /tmp
 97 | /doc/
 98 | /Meta/
 99 | /figures
100 | 
101 | R/.Rhistory
102 | scMultiSim.Rproj
103 | docs
104 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: scMultiSim
 2 | Title: Simulation of Multi-Modality Single Cell Data Guided By Gene Regulatory Networks and Cell-Cell Interactions
 3 | Version: 1.1.10
 4 | Authors@R:
 5 |     c(person(given = "Hechen",
 6 |              family = "Li",
 7 |              role = c("aut", "cre"),
 8 |              email = "hli691@gatech.edu",
 9 |              comment = c(ORCID = "0000-0003-4907-429X")),
10 |       person(given = "Xiuwei",
11 |              family = "Zhang",
12 |              email = "zhangxiuwei03@gmail.com",
13 |              role = "aut"),
14 |       person(given = "Ziqi",
15 |              family = "Zhang",
16 |              email = "ziqi.zhang@gatech.edu",
17 |              role = "aut"),
18 |       person(given = "Michael",
19 |              family = "Squires",
20 |              email = "squiresmf@gatech.edu",
21 |              role = "aut"))
22 | Description:
23 |     scMultiSim simulates paired single cell RNA-seq, single cell ATAC-seq and RNA velocity data,
24 |     while incorporating mechanisms of gene regulatory networks, chromatin accessibility and
25 |     cell-cell interactions. It allows users to tune various parameters controlling the
26 |     amount of each biological factor, variation of gene-expression levels,
27 |     the influence of chromatin accessibility on RNA sequence data, and so on.
28 |     It can be used to benchmark various computational methods for single cell multi-omics data,
29 |     and to assist in experimental design of wet-lab experiments.
30 | License: Artistic-2.0
31 | Encoding: UTF-8
32 | RoxygenNote: 7.3.1
33 | Depends:
34 |     R (>= 4.4.0)
35 | Imports:
36 |     foreach,
37 |     rlang,
38 |     dplyr,
39 |     ggplot2,
40 |     Rtsne,
41 |     ape,
42 |     MASS,
43 |     matrixStats,
44 |     phytools,
45 |     KernelKnn,
46 |     gplots,
47 |     zeallot,
48 |     crayon,
49 |     assertthat,
50 |     igraph,
51 |     methods,
52 |     grDevices,
53 |     graphics,
54 |     stats,
55 |     utils,
56 |     markdown,
57 |     SummarizedExperiment,
58 |     BiocParallel
59 | Suggests:
60 |     knitr,
61 |     rmarkdown,
62 |     roxygen2,
63 |     shiny,
64 |     testthat (>= 3.0.0)
65 | biocViews: SingleCell, Transcriptomics, GeneExpression, Sequencing, ExperimentalDesign
66 | VignetteBuilder: knitr
67 | Roxygen: list(markdown = TRUE)
68 | BugReports: https://github.com/ZhangLabGT/scMultiSim/issues
69 | URL: https://zhanglabgt.github.io/scMultiSim/
70 | Config/testthat/edition: 3
71 | 


--------------------------------------------------------------------------------
/Meta/vignette.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/Meta/vignette.rds


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(Get_1region_ATAC_correlation)
 4 | export(Get_ATAC_correlation)
 5 | export(Phyla1)
 6 | export(Phyla3)
 7 | export(Phyla5)
 8 | export(True2ObservedATAC)
 9 | export(True2ObservedCounts)
10 | export(add_expr_noise)
11 | export(add_outliers)
12 | export(cci_cell_type_params)
13 | export(divide_batches)
14 | export(gen_clutter)
15 | export(gene_corr_cci)
16 | export(gene_corr_regulator)
17 | export(plot_cell_loc)
18 | export(plot_gene_module_cor_heatmap)
19 | export(plot_grid)
20 | export(plot_grn)
21 | export(plot_phyla)
22 | export(plot_rna_velocity)
23 | export(plot_tsne)
24 | export(run_shiny)
25 | export(scmultisim_help)
26 | export(sim_example)
27 | export(sim_example_spatial)
28 | export(sim_true_counts)
29 | exportClasses(spatialGrid)
30 | import(ape)
31 | import(foreach)
32 | import(ggplot2)
33 | import(markdown)
34 | import(rlang)
35 | importFrom(BiocParallel,MulticoreParam)
36 | importFrom(BiocParallel,bplapply)
37 | importFrom(Rtsne,Rtsne)
38 | importFrom(SummarizedExperiment,SummarizedExperiment)
39 | importFrom(dplyr,"%>%")
40 | importFrom(stats,cor)
41 | importFrom(stats,density)
42 | importFrom(stats,dist)
43 | importFrom(stats,dnorm)
44 | importFrom(stats,hclust)
45 | importFrom(stats,median)
46 | importFrom(stats,na.omit)
47 | importFrom(stats,rbeta)
48 | importFrom(stats,rbinom)
49 | importFrom(stats,rnorm)
50 | importFrom(stats,rpois)
51 | importFrom(stats,runif)
52 | importFrom(stats,setNames)
53 | importFrom(utils,data)
54 | importFrom(utils,write.csv)
55 | importFrom(zeallot,"%->%")
56 | importFrom(zeallot,"%<-%")
57 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # scMultiSim 1.1.3
 2 | 
 3 | - Added the Shiny app to help users visualize the effect of each parameter and adjust the simulation options.
 4 | - Added the `speed.up` parameter to enable experimental speed optimization.
 5 | - Bug fixes and improvements.
 6 | 
 7 | # scMultiSim 1.1.0
 8 | 
 9 | Prepare for the Bioconductor release
10 | 
11 | - Fix build errors
12 | 
13 | # scMultiSim 0.99.8
14 | 
15 | Prepare for the Bioconductor release
16 | 
17 | - Tidy up the code, add more comments
18 | 
19 | # scMultiSim 0.99.7
20 | 
21 | Prepare for the Bioconductor release
22 | 
23 | - Fix build errors
24 | 


--------------------------------------------------------------------------------
/R/0_opts.R:
--------------------------------------------------------------------------------
  1 | 
  2 | .default <- \(...) list(FALSE, as.character(enexprs(...)), ...)
  3 | .required <- list(TRUE)
  4 | 
  5 | .should.be.logical <- list(
  6 |   is.logical,
  7 |   "The value should be a logical."
  8 | )
  9 | 
 10 | .should.be.int <- list(
 11 |   \(x) x %% 1 == 0,
 12 |   "The value should be a numeric."
 13 | )
 14 | 
 15 | .should.be.int.between <- function(a, b) list(
 16 |   \(x) x %% 1 == 0 && x >= a && x <= b,
 17 |   sprintf("The value should be an integer between %g and %g.", a, b)
 18 | )
 19 | 
 20 | .should.be.num <- list(
 21 |   is.numeric,
 22 |   "The value should be a numeric."
 23 | )
 24 | 
 25 | .should.be.num.between <- function(a, b) list(
 26 |   \(x) is.numeric(x) && x >= a && x <= b,
 27 |   sprintf("The value should be a numeric between %g and %g.", a, b)
 28 | )
 29 | 
 30 | .choose_from <- function(...) {
 31 |   opts <- list(...)
 32 |   list(
 33 |     \(x) x %in% opts,
 34 |     sprintf("The value should be one of [%s].", do.call(paste, c(opts, sep = ", ")))
 35 |   )
 36 | }
 37 | 
 38 | 
 39 | # ==============================================================================
 40 | # OPTIONS: each option should be a list(default, checker, description)
 41 | # ==============================================================================
 42 | 
 43 | .opt_list <- function() list(
 44 |   "GENERAL",
 45 |   rand.seed                                                              = list(
 46 |     .default(0),
 47 |     .should.be.int,
 48 |     "scMultiSim should produce the same result if all other parameters are the same."
 49 |   ),
 50 |   threads                                                                = list(
 51 |     .default(1),
 52 |     .should.be.int.between(1, 4096),
 53 |     "Set to larger than 1 to use multithreading for some part of the simulation."
 54 |   ),
 55 |   speed.up                                                               = list(
 56 |     .default(FALSE),
 57 |     .should.be.logical,
 58 |     "Use experimental speed and memory optimization."
 59 |   ),
 60 |   # ========================== Gene ============================================
 61 |   "GENE",
 62 |   GRN                                                                    = list(
 63 |     .default(NULL),
 64 |     list(
 65 |       \(x) (length(x) == 1 && is.na(x)) || (is.data.frame(x) && ncol(x) >= 3 && is.numeric(x[[3]])),
 66 |       "It should be a data frame with 3 columns (target, regulator, effect). Supply NA to disable the GRN effect."
 67 |     ),
 68 |     "The GRN network."
 69 |   ),
 70 |   grn.effect                                                             = list(
 71 |     .default(1),
 72 |     .should.be.num.between(0, Inf),
 73 |     "Overall strength of the GRN effect on the expression. Different from the effect column in the GRN data frame, which is the relative effect of each TF-target pair."
 74 |   ),
 75 |   num.genes                                                              = list(
 76 |     .default(NULL),
 77 |     .should.be.int.between(1, Inf),
 78 |     "Number of genes if GRN is disabled."
 79 |   ),
 80 |   unregulated.gene.ratio                                                 = list(
 81 |     .default(0.1),
 82 |     .should.be.num.between(0, 1),
 83 |     "Ratio of unreulated to regulated genes. Extra unregulated genes will be simulated in addition to the genes in GRN."
 84 |   ),
 85 |   giv.mean                                                               = list(
 86 |     .default(0),
 87 |     .should.be.num.between(-Inf, Inf),
 88 |     "Mean of the Gene Identity Vectors."
 89 |   ),
 90 |   giv.prob                                                               = list(
 91 |     .default(0.3),
 92 |     .should.be.num.between(0, 1),
 93 |     "Probability of non-zero values in the Gene Identity Vectors."
 94 |   ),
 95 |   giv.sd                                                                 = list(
 96 |     .default(1),
 97 |     .should.be.num.between(0, Inf),
 98 |     "Stddev of the Gene Identity Vectors."
 99 |   ),
100 |   hge.range                                                              = list(
101 |     .default(1),
102 |     .should.be.num.between(1, Inf),
103 |     "Only choose highly expressed genes after this range."
104 |   ),
105 |   hge.prop                                                               = list(
106 |     .default(0),
107 |     .should.be.num.between(0, 1),
108 |     "Propotion of highly expressed genes."
109 |   ),
110 |   hge.mean                                                               = list(
111 |     .default(5),
112 |     .should.be.num.between(1, Inf),
113 |     "Scale of highly expressed genes."
114 |   ),
115 |   hge.sd                                                                 = list(
116 |     .default(1),
117 |     .should.be.num.between(0, Inf),
118 |     "Variation of highly expressed genes."
119 |   ),
120 |   hge.max.var                                                            = list(
121 |     .default(500),
122 |     .should.be.num.between(0, Inf),
123 |     "Genes with higher variation will not be selected as highly expressed genes."
124 |   ),
125 |   dynamic.GRN                                                            = list(
126 |     .default(NA),
127 |     NULL,
128 |     "Specification of the dynamic GRN. See scmultisim_help(\"dynamic.GRN\") for details."
129 |   ),
130 |   # ========================== Cell ============================================
131 |   "CELL",
132 |   num.cells                                                              = list(
133 |     .default(1000),
134 |     .should.be.int.between(0, Inf),
135 |     "Total number of cells from all populations."
136 |   ),
137 |   tree                                                                   = list(
138 |     .default(Phyla5()),
139 |     NULL,
140 |     "A tree defining relationship between populations."
141 |   ),
142 |   discrete.cif                                                           = list(
143 |     .default(FALSE),
144 |     .should.be.logical,
145 |     "Whether the cell population is discrete."
146 |   ),
147 |   discrete.pop.size                                                      = list(
148 |     .default(NA),
149 |     list(
150 |       \(x) (length(x) == 1 && is.na(x)) || all(is.integer(x)),
151 |       "the value should be an integer vector"
152 |     ),
153 |     "Specify the cell numbers in each population."
154 |   ),
155 |   discrete.min.pop.size                                                  = list(
156 |     .default(70),
157 |     .should.be.int,
158 |     "Size of the smallest discrete cell population."
159 |   ),
160 |   discrete.min.pop.index                                                 = list(
161 |     .default(1),
162 |     .should.be.int.between(0, Inf),
163 |     "Index of the smallest discrete cell population."
164 |   ),
165 |   num.cifs                                                               = list(
166 |     .default(50),
167 |     .should.be.int,
168 |     "Number of Cell Identity Factors for each kinetic parameter."
169 |   ),
170 |   diff.cif.fraction                                                      = list(
171 |     .default(0.9),
172 |     .should.be.num.between(0, 1),
173 |     "Fraction of CIFs which are differential factors between cell types."
174 |   ),
175 |   cif.center                                                             = list(
176 |     .default(1),
177 |     .should.be.num,
178 |     "Mean of the CIF values."
179 |   ),
180 |   cif.sigma                                                              = list(
181 |     .default(0.1),
182 |     .should.be.num.between(0, Inf),
183 |     "Stddev of the CIF values."
184 |   ),
185 |   use.impulse                                                            = list(
186 |     .default(FALSE),
187 |     .should.be.logical,
188 |     "Use the impulse model when generating the continuous CIF."
189 |   ),
190 |   # ========================== ATAC ============================================
191 |   "SIMULATION - ATAC",
192 |   atac.effect                                                            = list(
193 |     .default(0.5),
194 |     .should.be.num.between(0, 1),
195 |     "The influence of chromatin accessability data on gene expression."
196 |   ),
197 |   region.distrib                                                         = list(
198 |     .default(c(0.1, 0.5, 0.4)),
199 |     list(
200 |       \(x) x > 0 && length(x) == 3 && sum(x) == 1,
201 |       "the value should be a vector with 3 elements sum to 1"
202 |     ),
203 |     "The probability that a gene is regulated by respectively 0, 1, 2 consecutive regions."
204 |   ),
205 |   atac.p_zero                                                            = list(
206 |     .default(0.8),
207 |     NULL,
208 |     "The proportion of 0s we see in the ATAC-seq data."
209 |   ),
210 |   atac.density                                                           = list(
211 |     .default(NA),
212 |     list(
213 |       \(x) class(x) == "density",
214 |       "the value should be a density object."
215 |     ),
216 |     "Density of the non-zero ATAC-seq values. Use atac_dens_nonzero() to generate."
217 |   ),
218 |   riv.mean                                                               = list(
219 |     .default(0),
220 |     .should.be.num.between(0, Inf),
221 |     "Mean of the Region Identity Vectors."
222 |   ),
223 |   riv.prob                                                               = list(
224 |     .default(0.3),
225 |     .should.be.num.between(0, 1),
226 |     "Probability of non-zero values in the Region Identity Vectors."
227 |   ),
228 |   riv.sd                                                                 = list(
229 |     .default(1),
230 |     .should.be.num.between(0, Inf),
231 |     "Stddev of the Region Identity Vectors."
232 |   ),
233 |   # ========================== Simulation ======================================
234 |   "SIMULATION - RNA",
235 |   vary                                                                   = list(
236 |     .default("s"),
237 |     .choose_from("all", "kon", "koff", "s", "except_kon", "except_koff", "except_s"),
238 |     "Which kinetic parameters have differential CIFs."
239 |   ),
240 |   bimod                                                                  = list(
241 |     .default(0),
242 |     .should.be.num.between(0, 1),
243 |     "Adjust the bimodality of gene expression, thus controlling intrinsic variation."
244 |   ),
245 |   scale.s                                                                = list(
246 |     .default(1),
247 |     NULL,
248 |     "Scale of the s parameter. Use smaller value for cell types known to be small (like naive cells). When discrete.cif = T, it can be a vector specifying the scale.s for each cluster."
249 |   ),
250 |   intrinsic.noise                                                        = list(
251 |     .default(1),
252 |     .should.be.num.between(0, 1),
253 |     "The weight assigned to the random sample from the Beta-Poisson distribution, where the weight of the Beta-Poisson mean value is given a weight of (1 - intrinsic.noise)."
254 |   ),
255 |   # ========================== Kinetic Model ===================================
256 |   "SIMULATION - KINETIC MODEL",
257 |   do.velocity                                                            = list(
258 |     .default(FALSE),
259 |     .should.be.logical,
260 |     "Simulate using the whole kinetic model and generate RNA velocity data."
261 |   ),
262 |   beta                                                                   = list(
263 |     .default(0.4),
264 |     .should.be.num,
265 |     "Splicing rate of each gene in the kinetic model."
266 |   ),
267 |   d                                                                      = list(
268 |     .default(1),
269 |     .should.be.num,
270 |     "Degradation rate of each gene in the kinetic model."
271 |   ),
272 |   num.cycles                                                             = list(
273 |     .default(3),
274 |     .should.be.int.between(1, Inf),
275 |     "For velocity mode, the number of cycles run before sampling the gene expression of a cell."),
276 |   cycle.len                                                              = list(
277 |     .default(1),
278 |     .should.be.num.between(0, Inf),
279 |     "For velocity mode, a factor multiplied by the expected time to transition from kon to koff and back to form the the length of a cycle."
280 |   ),
281 |   mod.cif.giv = list(
282 |     .default(NA),
283 |     list(
284 |       is.function, "should be a function"
285 |     ),
286 |     "Modify the generated CIF and GIV. The function takes four arguments: the kinetic parameter index (1=kon, 2=koff, 3=s), the current CIF matrix, the GIV matrix, and the cell metadata dataframe. It should return a list of two elements: the modified CIF matrix and the modified GIV matrix."
287 |   ),
288 |   ext.cif.giv = list(
289 |     .default(NA),
290 |     list(
291 |       is.function, "should be a function"
292 |     ),
293 |     "Add customized CIF and GIV. The function takes one argument, the kinetic parameter index (1=kon, 2=koff, 3=s). It should return a list of two elements: the extra CIF matrix (n_extra_cif x n_cells) and the GIV matrix (n_genes x n_extra_cif). Return NULL for no extra CIF and GIV."
294 |   ),
295 |   # ========================== Spatial =========================================
296 |   "SIMULATION - SPATIAL",
297 |   cci                                                                    = list(
298 |     .default(NA),
299 |     list(
300 |       \(x) is.list(x) && is.data.frame(x[["params"]]),
301 |       "Enables cell-cell interaction. See scmultisim_help(\"cci\") for details."
302 |     ),
303 |     "The regulation network for spatial cell-cell interaction."
304 |   )
305 | )
306 | 
# utils: validate a user-supplied option list against .opt_list().
#
# For every named entry of the specification:
#   * if the user did not supply the option, abort when it is required,
#     otherwise fill in the default value;
#   * if the user supplied it and the entry has a checker, run the checker and
#     abort with the checker's message when it fails.
# Returns the option list with defaults filled in.
.check_opt <- function(options) {
  # Drop the section headers (plain character strings) from the specification.
  spec <- .opt_list()
  is_header <- vapply(spec, is.character, logical(1))
  spec <- spec[!is_header]

  for (opt_name in names(spec)) {
    c(default_info, checker, description) %<-% spec[[opt_name]]
    is_required <- default_info[[1]]
    supplied <- options[[opt_name]]

    if (!is.null(supplied)) {
      # Option present: validate it when a checker is defined.
      if (!is.null(checker)) {
        c(is_valid, err_msg) %<-% checker
        if (!is_valid(supplied)) {
          abort(sprintf("ERROR: Option '%s' is invalid.\n%s", opt_name, err_msg))
        }
      }
      next
    }

    # Option absent: either it is mandatory or it falls back to its default.
    if (is_required) {
      abort(sprintf("ERROR: Option '%s' is required.\n%s", opt_name, description))
    }
    options[[opt_name]] <- default_info[[3]]
  }

  options
}
336 | 
337 | 
# Break a long string into chunks of at most 72 characters, joined by a
# newline followed by a tab. Returns NULL when the input is not a character.
.split_long_string <- function(x) {
  if (is.character(x)) {
    # The look-behind matches the empty position after each run of 72 characters.
    chunks <- strsplit(x, "(?<=.{72})", perl = TRUE)[[1]]
    joined <- do.call(paste, c(as.list(chunks), sep = "\n\t"))
    joined
  } else {
    NULL
  }
}
344 | 
345 | 
# utils: print the option list.
#
# With name = NULL every entry of .opt_list() is printed, including the
# section headers; otherwise only the entries whose names are in `name`.
# Stops with an error when nothing matches.
.print_opt <- function(name = NULL) {
  spec <- .opt_list()
  spec_names <- names(spec)

  selected <- if (!is.null(name)) {
    which(spec_names %in% name)
  } else {
    seq_along(spec_names)
  }

  if (is.null(selected) || length(selected) == 0) {
    stop(sprintf("Option %s doesn't exist.\n", name))
  }

  for (idx in selected) {
    entry_name <- spec_names[idx]
    entry <- spec[[idx]]

    if (entry_name == "") {
      # Unnamed entries are section headers.
      cat(sprintf("\n[%s]\n\n", entry))
    } else {
      c(default_info, checker, description) %<-% entry
      title <- if (default_info[[1]]) {
        sprintf("%s  (required)\n", entry_name)
      } else {
        sprintf("%s  (default: %s)\n", entry_name, default_info[[2]])
      }
      cat(title)
      cat(sprintf("\t%s\n", .split_long_string(description)))
      # checker may be NULL, in which case nothing is printed for it.
      cat(sprintf("\t%s\n", .split_long_string(checker[[2]])))
    }
  }
}
378 | 
379 | 
#' Look up an option in a list found in the caller's environment
#'
#' The option list is fetched by name from the environment of the calling
#' function; an error is raised when the requested key is not in that list.
#'
#' @param ... the parameter name
#' @param .name get option from this object
#' @keywords internal
#' @return the parameter value
OP <- function(..., .name = 'options') {
  opts <- get(.name, envir = caller_env())
  key <- as.character(dplyr::expr(...))
  if (key %in% names(opts)) {
    return(opts[[key]])
  }
  stop(sprintf("Option %s is required but not presented.", key))
}
394 | 
395 | 
# Return the default parameters for the dynamic GRN as a named list.
# When help = TRUE, a description of every parameter is printed first.
.dynamic_grn_default_params <- function(help = FALSE) {
  defaults <- list(
    seed = 0,
    num.steps = 200,
    cell.per.step = 1,
    involved.genes = NA,
    num.changing.edges = 2,
    create.tf.edges = FALSE,
    weight.mean = NA,
    weight.sd = 1
  )

  if (help) {
    help_text <- "Dynamic GRN deletes and creates some edges in the GRN in each epoch.
One epoch contains multiple steps, and the change is done gradually in steps.
The specific GRN at each step will be used by one or more cells sequentially.
When an epoch is done, another epoch will start.

Available options for dynamic.GRN:
  - seed: the random seed
  - num.steps: number of steps in each epoch.
  - cell.per.step: how many cells share the GRN in the same step.
  - involved.genes: a new edge will only be created within these specified genes.
      The default value is NA, which will use all existing genes in the GRN.
  - num.changing.edges: if < 1, it means the portion of edges added/deleted in each epoch.
      if >= 1, it means the number of edges added/deleted in each epoch.
  - create.tf.edges: whether a new edge can connect two TFs in the GRN.
  - weight.mean: the mean value of the weight for a newly created edge.
      The default value is NA, meaning that it will use the mean value of the input GRN.
  - weight.sd: the standard deviation of the weight for a newly created edge.

See the returned list for the default values.
"
    cat(help_text)
  }

  defaults
}
432 | 


--------------------------------------------------------------------------------
/R/1.1_cif.R:
--------------------------------------------------------------------------------
  1 | # Parameters:
  2 | # ncells, n_nd_cif, n_diff_cif, n_reg_cif,
  3 | # cif_center, cif_sigma,
  4 | # neutral, phyla, tree_info,
  5 | # use_impulse
  6 | 
  7 | 
  8 | # called by .continuousCIF() to generate the CIF for a continuous population
  9 | .continuousCIFParam <- function(is_spatial, ...) {
 10 |   if (is_spatial) {
 11 |     .continuousCIFParamSpatial(...)
 12 |   } else {
 13 |     .continuousCIFParamNormal(...)
 14 |   }
 15 | }
 16 | 
 17 | 
 18 | # Generate the CIF for a continuous population when spatial mode is enabled: nonDE/reg CIFs are drawn per cell, diff CIFs once (cell 1) and then sliced per path.
 19 | # @return a list of cif, diff_cif_by_path, meta_by_path, layer_idx_by_path
 20 | .continuousCIFParamSpatial <- function(
 21 |   ncells, N_nd.cif, N_diff.cif, n_reg_cif,
 22 |   cif_center, cif_sigma, step_size,
 23 |   neutral, phyla, tree_info,
 24 |   use_impulse, sp_params, ...
 25 | ) {
 26 |   # paths: list of int vector, each path
 27 |   # cell_path: int vector, the path idx of each cell
 28 |   # path_len: int vector, the length of each path
 29 |   param_names <- c("kon", "koff", "s")
 30 | 
 31 |   sp_params %->% c(
 32 |     max_layers, paths, cell_path, path_len
 33 |   )
 34 | 
 35 |   # nd and reg cif
 36 |   cif <- foreach(i_cell = seq(ncells)) %do% {
 37 |     i_path <- cell_path[i_cell]
 38 |     n_layers <- path_len[i_path]
 39 | 
 40 |     if (i_cell %% 100 == 0) cat(sprintf("%i..", i_cell))
 41 |     # for each cell, generate n_layer x n_cif
 42 |     cif_cell <- lapply(seq_len(3), function(i) {
 43 |       param_name <- param_names[i]
 44 |       n_nd_cif <- N_nd.cif[i]
 45 |       n_diff_cif <- N_diff.cif[i]
 46 | 
 47 |       # nd cif
 48 |       nd_cif <- lapply(seq(n_nd_cif), \(icif) rnorm(n_layers, cif_center, cif_sigma)) %>% do.call(cbind, .)
 49 |       colnames(nd_cif) <- paste(param_name, "nonDE", seq(n_nd_cif), sep = "_")
 50 | 
 51 |       # diff cif
 52 |       need_diff_cif <- n_diff_cif > 0
 53 |       # for cell 1, output the diff_cif itself; for other cells, only output TRUE or FALSE
 54 |       diff_cif <- need_diff_cif
 55 |       if (need_diff_cif && i_cell == 1) {
 56 |         # diff cif is shared among all cell & layers; generate them lazily
 57 |         # make sure only generated once for kon, koff and s
 58 |         # max_layers x n_diff_cif
 59 |         # =============================================== COPY
 60 |         diff_cif <- if (use_impulse) {
 61 |           c(edges, root, tips, internal) %<-% tree_info
 62 |           # impulse model; as in the Gaussian branch, only the state column (4) is kept and the
 63 |           # columns are bound into one matrix, otherwise `colnames<-` below fails on a list
 64 |           tip <- rep(tips, ceiling(n_diff_cif / length(tips)))
 65 |           lapply(seq(n_diff_cif), function(cif_i) {
 66 |             impulse <- Impulsecifpertip(phyla, edges, root, tips, internal, neutral, tip[cif_i], cif_sigma, cif_center, step_size)
 67 |             # put the impulse rows in the same (from, to, time) order as `neutral`
 68 |             re_order <- match(
 69 |               apply(neutral[, seq_len(3)], 1, \(X) paste0(X, collapse = "_")),
 70 |               apply(impulse[, seq_len(3)], 1, \(X) paste0(X, collapse = "_"))
 71 |             )
 72 |             impulse[re_order, 4]
 73 |           }) %>%
 74 |             do.call(cbind, .) %>% .[seq(max_layers), , drop = FALSE]
 75 |         } else {
 76 |           # Gaussian sample
 77 |           lapply(seq(n_diff_cif), function(icif) {
 78 |             # supply neutral to have the same t_sample values for all cells
 79 |             SampleSubtree(tree_info$root, 0, cif_center, tree_info$edges, ncells, step_size, neutral = neutral)[, 4]
 80 |           }) %>%
 81 |             do.call(cbind, .) %>%
 82 |             .[seq(max_layers), , drop = FALSE]  # drop = FALSE: a single diff CIF must stay a 1-column matrix
 83 |         }
 84 |         colnames(diff_cif) <- paste(param_name, "DE", seq(n_diff_cif), sep = "_")
 85 |         # ================================================ COPY
 86 | 
 87 |         diff_cif
 88 |       }
 89 | 
 90 |       # reg cif
 91 |       reg_cif <- NULL
 92 |       if (i <= 2 && n_reg_cif > 0) {
 93 |         reg_cif <- lapply(
 94 |           seq(n_reg_cif),
 95 |           \(.) rnorm(n_layers, cif_center, cif_sigma)
 96 |         ) %>% do.call(cbind, .)
 97 |         colnames(reg_cif) <- paste(param_name, "reg", seq(n_reg_cif), sep = "_")
 98 |       }
 99 | 
100 |       # TRUE if diff_cif is needed to be combined later
101 |       list(nd = nd_cif, diff = diff_cif, reg = reg_cif)
102 |     })
103 | 
104 |     setNames(cif_cell, param_names)
105 |   }
106 | 
107 |   cat("Done\n")
108 |   # gather diff_cif
109 |   diff_cif_all <- list(NULL, NULL, NULL)
110 |   for (i in seq_len(3)) {
111 |     d_cif <- cif[[1]][[i]]$diff
112 |     if (!is.logical(d_cif)) {
113 |       # if this param has diff cif, move it to diff_cif_all and replace it with TRUE (like the other cells)
114 |       diff_cif_all[[i]] <- d_cif
115 |       cif[[1]][[i]]$diff <- TRUE
116 |     }
117 |   }
118 | 
119 |   # get the index on each path
120 |   neutral <- neutral[seq(max_layers),]
121 |   layer_idx_by_path <- lapply(paths, function(path) {
122 |     idx <- integer()
123 |     for (i in seq(length(path) - 1)) {
124 |       a <- path[i]
125 |       b <- path[i + 1]
126 |       idx <- c(idx, which(neutral[, 1] == a & neutral[, 2] == b))
127 |     }
128 |     idx
129 |   })
130 | 
131 |   # now process diff cif
132 |   diff_cif_by_path <- lapply(diff_cif_all, function(d_cif) {
133 |     lapply(seq_along(paths), function(i_path) {
134 |       if (is.null(d_cif)) return(NULL)
135 |       d_cif[layer_idx_by_path[[i_path]], , drop = FALSE]  # keep a matrix so nrow() below is defined
136 |     })
137 |   })
138 |   names(diff_cif_by_path) <- param_names
139 | 
140 |   # cell types & meta
141 |   cell_types <- character(length = nrow(neutral))
142 |   for (i in seq(nrow(tree_info$edges))) {
143 |     c(id, from, to, len) %<-% tree_info$edges[i,]
144 |     n_steps <- len %/% step_size + ceiling(len %% step_size)
145 |     pts <- which(neutral[, 1] == from & neutral[, 2] == to)
146 |     n_pts <- length(pts)
147 |     cell_types[pts] <- if (n_steps == 1) {
148 |       paste(from, to, sep = "_")
149 |     } else {
150 |       type_id <- ceiling(seq(n_pts) * (n_steps / n_pts))
151 |       paste(from, to, type_id, sep = "_")
152 |     }
153 |   }
154 | 
155 |   meta_by_path <- lapply(seq_along(paths), function(i_path) {
156 |     idx <- layer_idx_by_path[[i_path]]
157 |     n <- neutral[idx,]
158 |     data.frame(
159 |       pop = apply(n[, seq_len(2)], 1, \(X) paste0(X, collapse = "_")),
160 |       depth = n[, 3],
161 |       cell.type = cell_types[idx]
162 |     )
163 |   })
164 | 
165 |   for (d_cif in diff_cif_by_path) {
166 |     for (i in seq_along(paths)) {
167 |       if (is.null(d_cif[[i]])) next
168 |       stopifnot(nrow(d_cif[[i]]) == path_len[i])
169 |     }
170 |   }
171 | 
172 |   list(
173 |     cif = cif, diff_cif_by_path = diff_cif_by_path,
174 |     meta_by_path = meta_by_path,
175 |     layer_idx_by_path = layer_idx_by_path
176 |   )
177 | }
178 | 
179 | 
180 | # Generate the CIF for a continuous population when spatial mode is disabled.
181 | .continuousCIFParamNormal <- function(
182 |   ncells, N_nd.cif, N_diff.cif, n_reg_cif,
183 |   cif_center, cif_sigma, step_size,
184 |   neutral, phyla, tree_info,
185 |   use_impulse, ...
186 | ) {
187 |   param_names <- c("kon", "koff", "s")
188 |   # @return list(kon, koff, s): per parameter, an ncells-row matrix of nonDE, DE and (kon/koff only) reg CIF columns
189 |   cif <- lapply(seq_len(3), function(i) {
190 |     param_name <- param_names[i]
191 |     n_nd_cif <- N_nd.cif[i]
192 |     n_diff_cif <- N_diff.cif[i]
193 | 
194 |     # ========== nd_cif ==========
195 |     nd_cif <- lapply(seq(n_nd_cif), \(icif) rnorm(ncells, cif_center, cif_sigma)) %>% do.call(cbind, .)
196 |     colnames(nd_cif) <- paste(param_name, "nonDE", seq(n_nd_cif), sep = "_")
197 |     cifs <- nd_cif
198 | 
199 |     # ========== de_cif ==========
200 |     if (n_diff_cif > 0) {
201 |       # generate de_cif if there exist de_cifs for the parameter we are looking at
202 |       diff_cif <- if (use_impulse) {
203 |         c(edges, root, tips, internal) %<-% tree_info
204 |         # impulse model; as in the Gaussian branch, only the state column (4) is kept and the
205 |         # columns are bound into one matrix, otherwise `colnames<-` below fails on a list
206 |         tip <- rep(tips, ceiling(n_diff_cif / length(tips)))
207 |         lapply(seq(n_diff_cif), function(cif_i) {
208 |           impulse <- Impulsecifpertip(phyla, edges, root, tips, internal, neutral, tip[cif_i], cif_sigma, cif_center, step_size)
209 |           # put the impulse rows in the same (from, to, time) order as `neutral`
210 |           re_order <- match(
211 |             apply(neutral[, seq_len(3)], 1, \(X) paste0(X, collapse = "_")),
212 |             apply(impulse[, seq_len(3)], 1, \(X) paste0(X, collapse = "_"))
213 |           )
214 |           impulse[re_order, 4]
215 |         }) %>%
216 |           do.call(cbind, .) %>% .[seq(ncells), , drop = FALSE]
217 |       } else {
218 |         # Gaussian sample
219 |         lapply(seq(n_diff_cif), function(icif) {
220 |           # supply neutral to have the same t_sample values for all cells
221 |           SampleSubtree(tree_info$root, 0, cif_center, tree_info$edges, ncells, step_size, neutral = neutral)[, 4]
222 |         }) %>%
223 |           do.call(cbind, .) %>%
224 |           .[seq(ncells), , drop = FALSE]  # drop = FALSE: a single diff CIF must stay a 1-column matrix
225 |       }
226 |       colnames(diff_cif) <- paste(param_name, "DE", seq(n_diff_cif), sep = "_")
227 |       cifs <- cbind(nd_cif, diff_cif)
228 |     }
229 | 
230 |     # ========== generate reg_cif for k_on, k_off ===========
231 |     if (i <= 2 && n_reg_cif > 0) {
232 |       reg_cif <- lapply(
233 |         seq_len(n_reg_cif),
234 |         \(.) rnorm(ncells, cif_center, cif_sigma)
235 |       ) %>% do.call(cbind, .)
236 |       colnames(reg_cif) <- paste(param_name, "reg", seq_len(n_reg_cif), sep = "_")
237 |       cifs <- cbind(cifs, reg_cif)
238 |     }
239 | 
240 |     return(cifs)
241 |   })
242 | 
243 |   names(cif) <- param_names
244 |   cif
245 | }
246 | 
247 | 
248 | # Generate CIFs for discrete populations in spatial mode; assigns sim$ncells_pop when it is NULL.
249 | # @return list(cif = per-cell list of nd/diff/reg entries for kon, koff, s; meta = data.frame; diff_cif = per-parameter matrix or NULL)
250 | .discreteCIFSpatial <- function(seed, N, options, sim, ...) {
251 |   # set.seed(seed)
252 |   param_names <- c("kon", "koff", "s")
253 | 
254 |   phyla <- OP("tree")
255 |   cif_center <- OP("cif.center")
256 |   cif_sigma <- OP("cif.sigma")
257 |   user_popsize <- OP("discrete.pop.size")
258 |   min_popsize <- OP("discrete.min.pop.size")
259 |   i_minpop <- OP("discrete.min.pop.index")
260 | 
261 |   npop <- length(phyla$tip.label)
262 |   if (!is.null(sim$ncells_pop)) {
263 |     ncells_pop <- sim$ncells_pop
264 |   } else if (npop == 1) {
265 |     ncells_pop <- N$cell
266 |   } else if (is.integer(user_popsize)) {
267 |     stopifnot(length(user_popsize) == npop)
268 |     stopifnot(sum(user_popsize) == N$cell)
269 |     ncells_pop <- user_popsize
270 |   } else {
271 |     ncells_pop <- rep(min_popsize, npop)
272 |     if (N$cell < min_popsize * npop) {
273 |       stop(sprintf(
274 |         "The size of the smallest population (%g * %g) is too big for the total number of cells (%g)",
275 |         min_popsize, npop, N$cell))
276 |     }
277 | 
278 |     larger_pops <- setdiff(seq(npop), i_minpop)
279 |     ncells_pop[larger_pops] <- floor((N$cell - min_popsize) / length(larger_pops))
280 |     leftover <- N$cell - sum(ncells_pop)
281 |     if (leftover > 0) {
282 |       temp <- sample(larger_pops, leftover, replace = FALSE)
283 |       ncells_pop[temp] <- ncells_pop[temp] + 1
284 |     }
285 |   }
286 | 
287 |   if (is.null(sim$ncells_pop)) {
288 |     sim$ncells_pop <- ncells_pop
289 |   }
290 | 
291 |   # tree-derived correlation between populations; used below as the covariance of the population means
292 |   vcv_evf_mean <- vcv.phylo(phyla, corr = TRUE)
293 | 
294 |   # nd and reg cif
295 |   cif <- foreach(i_cell = seq(N$cell)) %do% {
296 |     # === each cell ===
297 |     n_layers <- N$cell
298 | 
299 |     # for each cell, generate n_layer x n_cif
300 |     cif_cell <- lapply(seq_len(3), function(i) {
301 |       param_name <- param_names[i]
302 |       n_nd_cif <- N$nd.cif[i]
303 |       n_diff_cif <- N$diff.cif[i]
304 |       need_diff_cif <- n_diff_cif > 0
305 | 
306 |       # nd cif
307 |       nd_cif <- lapply(seq(n_nd_cif), \(icif) rnorm(n_layers, cif_center, cif_sigma)) %>% do.call(cbind, .)
308 |       colnames(nd_cif) <- paste(param_name, "nonDE", seq(n_nd_cif), sep = "_")
309 | 
310 |       # reg cif
311 |       reg_cif <- NULL
312 |       if (i <= 2 && N$reg_cif > 0) {
313 |         reg_cif <- lapply(
314 |           seq(N$reg_cif),
315 |           \(.) rnorm(n_layers, cif_center, cif_sigma)
316 |         ) %>% do.call(cbind, .)
317 |         colnames(reg_cif) <- paste(param_name, "reg", seq(N$reg_cif), sep = "_")
318 |       }
319 | 
320 |       list(nd = nd_cif, diff = need_diff_cif, reg = reg_cif)
321 |     })
322 | 
323 |     setNames(cif_cell, param_names)
324 |     # === end: each cell ===
325 |   }
326 | 
327 |   # diff cif
328 |   diff_cif <- lapply(seq_len(3), function(i) {
329 |     n_diff_cif <- N$diff.cif[i]
330 |     need_diff_cif <- n_diff_cif > 0
331 |     if (need_diff_cif) {
332 |       # mvrnorm() returns a plain vector when n = 1, so force an n_diff_cif x npop matrix before [ievf, ipop]
333 |       pop_diff_cif_mean <- matrix(MASS::mvrnorm(n_diff_cif, rep(cif_center, npop), vcv_evf_mean), nrow = n_diff_cif)
334 |       dcif <- lapply(seq(npop), function(ipop) {
335 |         evf <- vapply(seq(n_diff_cif), function(ievf) {
336 |           rnorm(ncells_pop[ipop], pop_diff_cif_mean[ievf, ipop], cif_sigma)
337 |         }, numeric(ncells_pop[ipop]))
338 |         return(evf)
339 |       }) %>% do.call(rbind, .)
340 |       colnames(dcif) <- rep("DE", n_diff_cif)
341 |       dcif
342 |     } else {
343 |       NULL
344 |     }
345 |   })
346 |   diff_cif <- setNames(diff_cif, param_names)
347 | 
348 |   pop <- do.call(c, lapply(seq(npop), function(i) rep(i, ncells_pop[i])))
349 | 
350 |   meta <- data.frame(
351 |     pop = pop, cell.type = pop, cell.type.idx = pop
352 |   )
353 | 
354 |   list(
355 |     cif = cif,
356 |     meta = meta,
357 |     diff_cif = diff_cif
358 |   )
359 | }
360 | 
361 | 
362 | # Sample a random walk along one tree edge, starting from `anc_state` at time `depth`.
363 | # return (node_from, node_to, t, state)
364 | SampleEdge <- function(edge, depth, anc_state, edges, ncells, step_size, t_sample = NA) {
365 |   if (!is.na(t_sample[1])) {
366 |     # caller supplied absolute sample times: shift them so that the edge starts at 0
367 |     t_sample <- sort(c(0, t_sample - depth))
368 |   } else {
369 |     # no times given: spread this edge's share of the cells evenly over its length
370 |     #t_sample <- c(0,sort( runif(round(edge[4]*ncells/sum(edges[,4])),0,edge[4]) ))
371 |     branch_len <- edge[4]
372 |     ncell_branch <- ceiling(branch_len * ncells / sum(edges[, 4])) - 1
373 |     if (ncell_branch < 0) stop("the total number of cells is too few.")
374 |     t_sample <- c(0, seq(0, branch_len, branch_len / ncell_branch), branch_len)
375 |   }
376 |   # random-walk increments: one N(0, dt) draw per interval between consecutive times
377 |   increments <- vapply(diff(t_sample), function(dt) rnorm(1, 0, sqrt(dt)), numeric(1))
378 |   col_time <- depth + t_sample[-1]
379 |   col_state <- anc_state + cumsum(increments)
380 |   # the two variable names above become column names of the result
381 |   cbind(edge[2], edge[3], col_time, col_state)
382 | }
383 | 
384 | # Recursively sample every edge of the subtree rooted at node `par`.
385 | # Returns a matrix with columns from, to, time, state.
386 | SampleSubtree <- function(par, depth, anc_state, edges, ncells, step_size, neutral = NA) {
387 |   # get the children of the current node
388 |   children <- edges[edges[, 2] == par, 3]
389 |   pieces <- lapply(seq_along(children), function(j) {
390 |     child <- children[j]
391 |     # given the parent and child, find the edge
392 |     edge <- edges[edges[, 2] == par & edges[, 3] == child,]
393 |     # a child that is nobody's parent is a leaf
394 |     is_leaf <- sum(edges[, 2] == child) == 0
395 | 
396 |     # sample this edge, reusing the time points stored in `neutral` when it is supplied
397 |     sampled <- if (is.na(neutral[1])) {
398 |       SampleEdge(edge, depth, anc_state, edges, ncells, step_size)
399 |     } else {
400 |       t_sample <- neutral[neutral[, 1] == edge[2] & neutral[, 2] == edge[3], 3]
401 |       SampleEdge(edge, depth, anc_state, edges, ncells, step_size, t_sample)
402 |     }
403 |     # note: `- 1` sits inside length(), so this selects every row (the last one is not dropped)
404 |     result <- sampled[c(seq(length(sampled[, 1] - 1))),]
405 |     if (is_leaf) return(result)
406 | 
407 |     # internal node: the last sampled state seeds the child's subtree, which starts
408 |     # one edge length deeper; its samples are appended below this edge's samples
409 |     end_state <- sampled[length(sampled[, 1]), 4]
410 |     result1 <- SampleSubtree(child, depth + edge[4], end_state, edges, ncells, step_size, neutral)
411 |     rbind(result, result1)
412 |   })
413 |   result <- do.call(rbind, pieces)
414 |   colnames(result) <- c("from", "to", "time", "state")
415 |   rownames(result) <- NULL
416 |   result
417 | }
418 | 


--------------------------------------------------------------------------------
/R/3.2_dyngrn.R:
--------------------------------------------------------------------------------
  1 | # Reference class for a GRN whose edges are rewired gradually while cells are simulated.
  2 | .dynGRN <- setRefClass("dynGRN", fields = c(
  3 |   "randseed", "randstate",                                  # RNG bookkeeping
  4 |   "involved_genes",                                         # rows of geff considered when rewiring
  5 |   "geff", "params", "regulators", "targets", "n_tgt", "n_reg", "name_map",  # the GRN itself
  6 |   "del_edges", "gen_edges", "has_tf_edges",                 # edges fading out / fading in; TF rows allowed?
  7 |   "max_steps", "remaining_steps", "remaining_cells", "cell_per_step",  # epoch and step counters
  8 |   "n_edges", "n_changing_edges", "weight_mean", "weight_sd", "history", "version"  # rewiring settings; snapshots
  9 | ))
 10 | 
 11 | # restructure(): start a new epoch by choosing which edges fade out (del_edges) and fade in (gen_edges).
 12 | .dynGRN$methods(
 13 |   restructure = function() {
 14 |     # set all deleted edges' weights to 0
 15 |     if (!is.null(del_edges)) {
 16 |       geff[del_edges[, seq_len(2), drop = FALSE]] <<- 0
 17 |     }
 18 |     grn_region <- geff[involved_genes,]  # NOTE(review): indices below are relative to this sub-matrix but are used on geff -- confirm
 19 |     edges <- which(grn_region != 0, arr.ind = TRUE)
 20 |     nonedges <- which(grn_region == 0, arr.ind = TRUE)
 21 |     if (!has_tf_edges) {
 22 |       # logical negation drops every candidate whose row is a regulator; the former unary minus
 23 |       # turned the mask into 0/-1 indices, removing at most row 1 (or every row if nothing matched)
 24 |       nonedges <- nonedges[!(nonedges[, 1] %in% regulators), , drop = FALSE]
 25 |     }
 26 |     N_changed_edges <- as.integer(if (n_changing_edges < 1) n_edges * n_changing_edges else n_changing_edges)
 27 |     stopifnot(N_changed_edges > 0)
 28 |     # get new del_edges and gen_edges; drop = FALSE keeps a single edge as a 1-row (row, col) matrix
 29 |     dedges <- edges[sample(nrow(edges), N_changed_edges), , drop = FALSE]
 30 |     del_edges <<- cbind(dedges, geff[dedges])
 31 |     gedges <- nonedges[sample(nrow(nonedges), N_changed_edges), , drop = FALSE]
 32 |     gen_edges <<- cbind(gedges, rnorm(N_changed_edges, mean = weight_mean, sd = weight_sd))
 33 |     # sanity checks: edges to delete currently exist, edges to create currently do not
 34 |     stopifnot(all(geff[del_edges[, seq_len(2), drop = FALSE]] != 0))
 35 |     stopifnot(all(geff[gen_edges[, seq_len(2), drop = FALSE]] == 0))
 36 |   }
 37 | )
 38 | 
 39 | # update(): advance the dynamic GRN by one simulated cell.
 40 | # Returns the version number under which the GRN for this cell is stored in `history`.
 41 | .dynGRN$methods(
 42 |   update = function() {
 43 |     epoch_done <- remaining_steps == 0
 44 |     if (epoch_done) {
 45 |       # change grn structure, then restart the step counter for the new epoch
 46 |       restructure()
 47 |       remaining_steps <<- max_steps
 48 |     }
 49 |     step_done <- remaining_cells == 0
 50 |     if (step_done) {
 51 |       # update gradually: each step moves every changing edge by 1/max_steps of its weight
 52 |       frac <- 1 / max_steps
 53 |       for (k in seq_len(nrow(del_edges))) {
 54 |         row_i <- del_edges[k, 1]
 55 |         col_j <- del_edges[k, 2]
 56 |         geff[row_i, col_j] <<- geff[row_i, col_j] - del_edges[k, 3] * frac
 57 |         # snap weights that are numerically zero to exactly 0
 58 |         if (abs(geff[row_i, col_j]) <= 1e-5) geff[row_i, col_j] <<- 0
 59 |       }
 60 |       for (k in seq_len(nrow(gen_edges))) {
 61 |         row_i <- gen_edges[k, 1]
 62 |         col_j <- gen_edges[k, 2]
 63 |         geff[row_i, col_j] <<- geff[row_i, col_j] + gen_edges[k, 3] * frac
 64 |       }
 65 |       remaining_steps <<- remaining_steps - 1
 66 |       remaining_cells <<- cell_per_step
 67 |     }
 68 |     remaining_cells <<- remaining_cells - 1
 69 |     # store a snapshot under the current version, then hand that version back
 70 |     current <- version
 71 |     history[[current]] <<- geff
 72 |     version <<- current + 1
 73 |     current
 74 |   }
 75 | )
 76 | 
 77 | # Build a dynGRN object from a static GRN (`grn`) and the dynamic-GRN options (`opts`),
 78 | # run restructure() once so del_edges/gen_edges are populated, and return the object.
 79 | .CreateDynGRN <- function(grn, opts) {
 80 |   # all(is.na()) rather than is.na(): involved.genes may be a vector of genes, and an
 81 |   # if() condition of length > 1 is an error since R 4.2
 82 |   if (all(is.na(opts$involved.genes))) {
 83 |     opts$involved.genes <- sort(unique(c(grn$regulators, grn$targets)))
 84 |   }
 85 |   if (all(is.na(opts$weight.mean))) {
 86 |     # default: mean of column 3 of the input GRN parameters, rounded to 2 digits
 87 |     opts$weight.mean <- round(mean(grn$params[,3]), digits = 2)
 88 |   }
 89 |   dyngrn <- .dynGRN$new(
 90 |     randseed = opts$seed,
 91 |     # opts
 92 |     involved_genes = opts$involved.genes,
 93 |     max_steps = opts$num.steps,
 94 |     n_changing_edges = opts$num.changing.edges,
 95 |     cell_per_step = opts$cell.per.step,
 96 |     weight_mean = opts$weight.mean, weight_sd = opts$weight.sd,
 97 |     has_tf_edges = opts$create.tf.edges,
 98 |     # grn
 99 |     geff = grn$geff,
100 |     params = grn$params,
101 |     regulators = grn$regulators,
102 |     targets = grn$targets,
103 |     n_tgt = grn$n_tgt, n_reg = grn$n_reg,
104 |     name_map = grn$name_map,
105 |     # other fields
106 |     del_edges = NULL, gen_edges = NULL,
107 |     remaining_steps = opts$num.steps,
108 |     remaining_cells = opts$cell.per.step,
109 |     n_edges = nrow(grn$params),
110 |     history = list(), version = 1
111 |   )
112 |   dyngrn$restructure()
113 |   return(dyngrn)
114 | }
115 | 
116 | 
117 | # Overlay user-supplied dynamic-GRN options on the defaults; entries that are NULL/absent keep the default.
118 | .getDynGRNOpts <- function(options) {
119 |   merged <- .dynamic_grn_default_params()
120 |   for (key in names(merged)) {
121 |     supplied <- options[[key]]
122 |     if (is.null(supplied)) next
123 |     merged[[key]] <- supplied
124 |   }
125 |   merged
126 | }
127 | 


--------------------------------------------------------------------------------
/R/6_technoise.R:
--------------------------------------------------------------------------------
  1 | #' Add experimental noise to true counts
  2 | #'
  3 | #' @param results The scMultisim result object
  4 | #' @param ...
  5 | #' `randseed`: The random seed
  6 | #' `protocol`: `UMI` or `nonUMI` (the default)
  7 | #' `gene_len`:  A vector with lengths of all genes
  8 | #' `alpha_mean`, `alpha_sd`: rate of subsampling of transcripts during capture step
  9 | #' `depth_mean`, `depth_sd`: The sequencing depth
 10 | #' `atac.obs.prob`, `atac.sd.frac`: passed to \link{True2ObservedATAC} as `observation_prob` and `sd_frac`
 11 | #'
 12 | #' @seealso
 13 | #' The underlying methods \link{True2ObservedCounts} and \link{True2ObservedATAC}
 14 | #'
 15 | #' @return none; `results$counts_obs` and `results$atacseq_obs` are assigned
 16 | #' @export
 17 | #'
 18 | #' @examples
 19 | #' results <- sim_example(ncells = 10)
 20 | #' add_expr_noise(results)
 21 | add_expr_noise <- function(results, ...) {
 22 |   cat("Adding experimental noise...\n")
 23 |   start_time <- Sys.time()
 24 |   data(gene_len_pool, envir = environment())
 25 |   gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE)
 26 |   args <- list(...)
 27 |   if (length(args) > 0) {
 28 |     rna_args <- args[names(args)[!startsWith(names(args), "atac.")]]  # everything not prefixed "atac." goes to the RNA step
 29 |     atac_args <- args[names(args)[startsWith(names(args), "atac.")]]
 30 |   } else {
 31 |     rna_args <- list(); atac_args <- list()
 32 |   }
 33 |   rna_args <- .defaultArgs(rna_args, randseed = 0,
 34 |                            protocol = "nonUMI", alpha_mean = 0.1, alpha_sd = 0.02,
 35 |                            gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3,
 36 |                            nPCR1 = 16, nPCR2 = 10)
 37 |   atac_args <- .defaultArgs(atac_args, atac.obs.prob = 0.3, atac.sd.frac = 0.5)
 38 |   rna_args$true_counts <- floor(results$counts)
 39 |   rna_args$meta_cell <- results$cell_meta
 40 |   results$counts_obs <- do.call(True2ObservedCounts, rna_args)
 41 | 
 42 |   atac_data <- if (!is.null(results$atac_counts)) {
 43 |     cat("Using atac_counts\n")
 44 |     results$atac_counts
 45 |   } else {
 46 |     # the fallback to results$atacseq_data sat unreachable behind a bare stop();
 47 |     # keep failing here, but say why
 48 |     stop("`results$atac_counts` is NULL; cannot add noise to the ATAC data")
 49 |   }
 50 |   results$atacseq_obs <- True2ObservedATAC(atac_data, randseed = rna_args$randseed,  # defaulted seed; args$randseed was NULL unless supplied
 51 |                                            observation_prob = atac_args$atac.obs.prob,
 52 |                                            sd_frac = atac_args$atac.sd.frac)
 53 |   message(sprintf("Time spent: %.2f mins\n",
 54 |                   as.numeric(Sys.time() - start_time, units = "mins")))
 55 | }
 56 | 
 57 | 
 58 | #' Divide batches for observed counts
 59 | #'
 60 | #' @param results The scMultisim result object, after running `add_expr_noise()`
 61 | #' @param nbatch Number of batches
 62 | #' @param effect Batch effect size, default is 3
 63 | #' @param randseed Random seed
 64 | #'
 65 | #' @return none
 66 | #' @export
 67 | #'
 68 | #' @examples
 69 | #' results <- sim_example(ncells = 10)
 70 | #' add_expr_noise(results)
 71 | #' divide_batches(results)
 72 | divide_batches <- function(results, nbatch = 2, effect = 3, randseed = 0) {
 73 |   cat("Adding batch effects...\n")
 74 |   obs <- results$counts_obs
 75 |   if (is.list(obs)) {
 76 |     obs <- obs$counts  # counts_obs may be a list; its `counts` element is the matrix
 77 |   }
 78 |   ngene <- nrow(obs)
 79 |   merged <- rbind(obs, results$atacseq_obs)  # genes first, ATAC rows after; split again below by ngene
 80 |   if ("batch" %in% names(results$cell_meta)) {
 81 |     results$cell_meta <- results$cell_meta[, !(names(results$cell_meta) %in% "batch"), drop = FALSE]  # drop a stale batch column, keep a data.frame
 82 |   }
 83 |   b <- .divideBatchesImpl(
 84 |     counts = merged, meta_cell = results$cell_meta,
 85 |     nbatch = nbatch, batch_effect_size = effect, randseed = randseed  # forward the caller's seed (was hard-coded 0)
 86 |   )
 87 |   results$counts_with_batches <- b$counts[seq(ngene),]
 88 |   results$atac_with_batches <- b$counts[-(seq(ngene)),]
 89 |   results$cell_meta <- b$cell_meta
 90 | }
 91 | 
 92 | 
 93 | #' Divide the observed counts into multiple batches by adding batch effect to each batch
 94 | #' @param counts gene cell matrix
 95 | #' @param meta_cell the meta information related to cells, will be combined with technical cell level information and returned
 96 | #' @param nbatch number of batches
 97 | #' @param batch_effect_size amount of batch effects. Larger values result in bigger differences between batches. Default is 1.
 98 | #' @param randseed random seed (currently unused: the set.seed() call is commented out)
 99 | #' @keywords internal
100 | #' @return a list with two elements: counts and cell_meta
101 | .divideBatchesImpl <- function(counts, meta_cell, nbatch, batch_effect_size = 1, randseed = 0) {
102 |   # set.seed(randseed)
103 |   ## add batch effects to observed counts
104 |   # use different mean and same sd to create the multiplicative factor for different part (gene/region) in different batch
105 |   ncells <- dim(counts)[2]; nparts <- dim(counts)[1]
106 |   batchIDs <- sample(seq(nbatch), ncells, replace = TRUE)
107 |   meta_cell2 <- data.frame(batch = batchIDs, stringsAsFactors = FALSE)
108 |   meta_cell <- cbind(meta_cell, meta_cell2)
109 | 
110 |   # per-part baseline, then one mean per (part, batch) drawn uniformly within +/- batch_effect_size of it
111 |   part_mean <- rnorm(nparts, 0, 1)
112 |   temp <- lapply(seq_len(nparts), function(ipart) {
113 |     return(runif(nbatch, min = part_mean[ipart] - batch_effect_size, max = part_mean[ipart] + batch_effect_size))
114 |   })
115 |   mean_matrix <- do.call(rbind, temp)
116 | 
117 |   # one factor per (part, cell) ~ N(mean of the cell's batch, 0.01), drawn in a single vectorised call;
118 |   # means are laid out part by part, cell by cell, so the draws match a part-outer/cell-inner loop
119 |   cell_means <- t(mean_matrix[, batchIDs, drop = FALSE])
120 |   batch_factor <- matrix(rnorm(nparts * ncells, mean = cell_means, sd = 0.01),
121 |                          nrow = nparts, ncol = ncells, byrow = TRUE)
122 |   # the factor is applied on the log2 scale, i.e. it multiplies the counts by 2^batch_factor
123 |   counts <- round(2^(log2(counts) + batch_factor))
124 |   return(list(counts = counts, cell_meta = meta_cell))
125 | }
126 | 
127 | 
128 | #' Simulate capture, pre-amplification, fragmentation, PCR and sequencing for the transcripts of one cell.
129 | #' @param true_counts_1cell the true transcript counts for one cell (one vector)
130 | #' @param protocol a string, can be "nonUMI" or "UMI"; any other value raises an error
131 | #' @param rate_2cap the capture efficiency for this cell
132 | #' @param gene_len gene lengths for the genes/transcripts, sampled from real human transcript length
133 | #' @param amp_bias amplification bias for each gene, a vector of length ngenes
134 | #' @param rate_2PCR PCR efficiency, usually very high
135 | #' @param nPCR1 the number of PCR cycles in the pre-amplification step
136 | #' @param nPCR2 the number of PCR cycles applied to the fragments
137 | #' @param LinearAmp if linear amplification is used for pre-amplification step, default is FALSE
138 | #' @param LinearAmp_coef the coefficient of linear amplification, that is, how many times each molecule is amplified by
139 | #' @param N_molecules_SEQ number of molecules sent for sequencing; sequencing depth
140 | #' @keywords internal
141 | #' @return read counts (if protocol="nonUMI"), or for protocol="UMI" a list of UMI counts, the sequenced vector and the number of non-zero fragments; a zero vector of length ngenes if nothing is captured
142 | .amplifyOneCell <- function(true_counts_1cell, protocol, rate_2cap, gene_len, amp_bias,
143 |                             rate_2PCR, nPCR1, nPCR2, LinearAmp, LinearAmp_coef, N_molecules_SEQ) {
144 |   ngenes <- length(gene_len)
145 |   if (protocol == "nonUMI") {
146 |     if (!exists("len2nfrag")) data(len2nfrag)  # load the lookup table only if no object of that name is visible
147 |   } else if (protocol == "UMI") { } else  {
148 |     stop("protocol input should be nonUMI or UMI")
149 |   }
150 |   inds <- vector("list", 2)  # the two filtering masks, kept so that positions can be restored later
151 |   # expand the original vector and apply capture efficiency
152 |   # maintain a transcript index vector: which transcript the molecule belongs to
153 |   expanded_res <- .expandToBinary(c(true_counts_1cell, 1))  # a trailing 1 is appended as a sentinel entry
154 |   expanded_vec <- expanded_res[[1]]
155 |   trans_idx <- expanded_res[[2]]
156 |   inds[[1]] <- expanded_vec > 0
157 |   expanded_vec <- expanded_vec[inds[[1]]]
158 |   trans_idx <- trans_idx[inds[[1]]]
159 | 
160 |   rate_2cap_gene <- rate_2cap[trans_idx]
161 |   captured_vec <- expanded_vec
162 |   captured_vec[runif(length(captured_vec)) > rate_2cap_gene] <- 0  # a molecule survives capture with probability rate_2cap_gene
163 |   if (sum(captured_vec[seq(length(captured_vec) - 1)]) < 1) { return(rep(0, ngenes)) }  # nothing captured besides the last entry; NOTE(review): a plain vector is returned here even for "UMI"
164 |   captured_vec[length(captured_vec)] <- 1  # the last (sentinel) entry is always kept
165 | 
166 |   inds[[2]] <- captured_vec > 0
167 |   captured_vec <- captured_vec[inds[[2]]]
168 |   trans_idx <- trans_idx[inds[[2]]]
169 |   amp_rate <- c((rate_2PCR + amp_bias[trans_idx[seq(length(trans_idx) - 1)]]), 1)  # per-molecule rate; the last entry gets rate 1
170 |   # pre-amplification:
171 |   if (LinearAmp) {
172 |     PCRed_vec <- captured_vec * LinearAmp_coef
173 |   } else {
174 |     temp <- runif(length(captured_vec)) < amp_rate  # first cycle: each molecule is duplicated with probability amp_rate
175 |     temp <- temp * 2 + captured_vec - temp
176 |     for (iPCR in 2:nPCR1) {  # NOTE(review): 2:nPCR1 counts downwards when nPCR1 < 2 -- confirm callers never pass that
177 |       eff <- runif(length(temp)) * amp_rate
178 |       v1 <- temp * (1 - eff)
179 |       round_down <- (v1 - floor(v1)) < runif(length(v1))  # stochastic rounding of v1 to an integer
180 |       v1[round_down] <- floor(v1[round_down])
181 |       v1[!round_down] <- ceiling(v1[!round_down])
182 |       temp <- v1 + 2 * (temp - v1)
183 |     }
184 |     PCRed_vec <- temp
185 |   }
186 | 
187 |   if (protocol == "nonUMI") { # add fragmentation step here
188 |     temp_vec <- PCRed_vec
189 |     for (i in seq(2, 1, -1)) {  # undo the two filters (inds[[2]], then inds[[1]]), filling removed positions with 0
190 |       temp_vec1 <- numeric(); temp_vec1[inds[[i]]] <- temp_vec;
191 |       temp_vec <- temp_vec1; temp_vec[is.na(temp_vec)] <- 0
192 |     }
193 |     recovered_vec <- temp_vec[seq(length(temp_vec) - 1)]  # drop the last (sentinel) entry
194 |     amp_mol_count <- numeric(ngenes);
195 |     GI <- c(0, cumsum(true_counts_1cell));  # cumulative counts: the slice boundaries of each gene within recovered_vec
196 |     for (i in which(true_counts_1cell > 0)) {
197 |       x <- recovered_vec[(GI[i] + 1):GI[i + 1]]
198 |       amp_mol_count[i] <- sum(x)
199 |     }
200 | 
201 |     # for every copy of each transcript, convert it into number of fragments
202 |     frag_vec <- numeric(ngenes)
203 |     for (igene in which(amp_mol_count > 0)) {
204 |       frag_vec[igene] <- sum(sample(len2nfrag[as.character(gene_len[igene]),],
205 |                                     amp_mol_count[igene], replace = TRUE)) }
206 |     # nPCR2 further rounds of amplification on the fragments: 2 binomial rounds, then rounds 3..nPCR2 deterministic
207 |     for (iPCR in seq_len(2)) {
208 |       frag_vec <- frag_vec + vapply(
209 |         frag_vec,\(x) rbinom(n = 1, x, prob = rate_2PCR), numeric(1))
210 |     }
211 |     for (iPCR in 3:nPCR2) {  # NOTE(review): 3:nPCR2 counts downwards when nPCR2 < 3 -- confirm callers never pass that
212 |       frag_vec <- frag_vec + round(frag_vec * rate_2PCR)
213 |     }
214 |     SEQ_efficiency <- N_molecules_SEQ / sum(frag_vec)
215 |     if (SEQ_efficiency >= 1) {
216 |       read_count <- frag_vec  # the requested depth covers every fragment
217 |     } else {
218 |       read_count <- vapply(
219 |         frag_vec,
220 |         \(Y) rbinom(n = 1, size = Y, prob = SEQ_efficiency), numeric(1))
221 |     }
222 |     return(read_count)
223 |   } else if (protocol == "UMI") {
224 | 
225 |     prob_vec <- vapply(
226 |       gene_len[trans_idx[seq(length(trans_idx) - 1)]], .getProb, numeric(1))
227 |     # fragmentation:
228 |     frag_vec <- vapply(
229 |       seq(length(PCRed_vec) - 1),
230 |       \(igene) rbinom(n = 1, size = PCRed_vec[igene], prob = prob_vec[igene]),
231 |       numeric(1))
232 | 
233 |     # further amplification of the fragments: 2 binomial rounds, then a deterministic factor (1 + rate_2PCR)^(nPCR2 - 1)
234 |     for (iPCR in seq_len(2)) {
235 |       frag_vec <- frag_vec + vapply(
236 |         frag_vec, \(x) rbinom(n = 1, x, prob = rate_2PCR), numeric(1))
237 |     }
238 | 
239 |     frag_vec <- round(frag_vec * (1 + rate_2PCR)^(nPCR2 - 1))
240 | 
241 |     SEQ_efficiency <- N_molecules_SEQ / sum(frag_vec)
242 |     if (SEQ_efficiency >= 1) {
243 |       sequenced_vec <- frag_vec  # the requested depth covers every fragment
244 |     } else {
245 |       sequenced_vec <- vapply(
246 |         frag_vec, \(Y) rbinom(n = 1, size = Y, prob = SEQ_efficiency),
247 |         numeric(1))
248 |     }
249 | 
250 |     temp_vec <- c(sequenced_vec, 1)  # re-append the sentinel so the lengths match the stored masks
251 |     for (i in seq(2, 1, -1)) {  # undo the two filters (inds[[2]], then inds[[1]]), filling removed positions with 0
252 |       temp_vec1 <- numeric(); temp_vec1[inds[[i]]] <- temp_vec;
253 |       temp_vec <- temp_vec1; temp_vec[is.na(temp_vec)] <- 0
254 |     }
255 |     recovered_vec <- temp_vec[seq(length(temp_vec) - 1)]  # drop the last (sentinel) entry
256 | 
257 |     UMI_counts <- numeric(ngenes);
258 |     GI <- c(0, cumsum(true_counts_1cell));  # cumulative counts: the slice boundaries of each gene within recovered_vec
259 |     for (i in which(true_counts_1cell > 0)) {
260 |       x <- recovered_vec[(GI[i] + 1):GI[i + 1]];
261 |       UMI_counts[i] <- sum(x > 0);  # a UMI is counted once, however many reads it received
262 |     }
263 | 
264 |     return(list(UMI_counts, sequenced_vec, sum(frag_vec > 0)))
265 |   }
266 | }
267 | 
268 | 
#' Simulate observed count matrix given technical biases and the true counts
#'
#' Each cell (column of `true_counts`) is processed by `.amplifyOneCell` with
#' its own capture efficiency and sequencing depth. Both are drawn through
#' `.rnormTrunc`: capture efficiency in `[0.0005, 1]`, depth in `[200, Inf)`.
#' The per-gene scale factor of the capture efficiency is drawn in `[0, 3]`.
#' @param true_counts gene cell matrix
#' @param meta_cell the meta information related to cells, will be combined with the technical cell level information (columns `alpha` and `depth`)
#' @param protocol a string, can be "nonUMI" or "UMI"
#' @param alpha_mean the mean of rate of subsampling of transcripts during capture step, default at 10 percent efficiency
#' @param alpha_sd the std of rate of subsampling of transcripts
#' @param alpha_gene_mean the per-gene scale factor of the alpha parameter, default at 1
#' @param alpha_gene_sd the standard deviation of the per-gene scale factor of the alpha parameter, default at 0
#' @param lenslope amount of length bias
#' @param nbins number of bins for gene length
#' @param gene_len a vector with lengths of all genes
#' @param amp_bias_limit range of amplification bias, forwarded to `.calAmpBias`; default is `c(-0.2, 0.2)`
#' @param rate_2PCR PCR efficiency, usually very high, default is 0.8
#' @param nPCR1 the number of PCR cycles in "pre-amplification" step, default is 16
#' @param nPCR2 the number of PCR cycles used after fragmentation.
#' @param LinearAmp if linear amplification is used for pre-amplification step, default is FALSE
#' @param LinearAmp_coef the coeficient of linear amplification, that is, how many times each molecule is amplified by
#' @param depth_mean mean of sequencing depth
#' @param depth_sd std of sequencing depth
#' @param randseed not read by the function body (the `set.seed()` call is commented out)
#' @return if protocol is "UMI", a list with the elements `counts` (observed count matrix), `cell_meta`, `nreads_perUMI` and `nUMI2seq`; otherwise the observed count matrix
#' @export
#' @examples
#' \donttest{
#' results <- sim_example(ncells = 10)
#' data(gene_len_pool)
#' gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE)
#' True2ObservedCounts(
#'   results$counts, results$cell_meta, protocol = "nonUMI", randseed = 1,
#'   alpha_mean = 0.1, alpha_sd = 0.05, gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3
#' )
#' }
True2ObservedCounts <- function(true_counts, meta_cell, protocol, randseed, alpha_mean = 0.1, alpha_sd = 0.002,
                                alpha_gene_mean = 1, alpha_gene_sd = 0,
                                gene_len, depth_mean, depth_sd, lenslope = 0.02, nbins = 20,
                                amp_bias_limit = c(-0.2, 0.2),
                                rate_2PCR = 0.8, nPCR1 = 16, nPCR2 = 10, LinearAmp = FALSE, LinearAmp_coef = 2000) {
  n_gene <- nrow(true_counts)
  n_cell <- ncol(true_counts)

  # Random draws happen in this fixed order: amplification bias, per-cell
  # capture rate, per-gene capture scale, per-cell depth.
  amp_bias <- .calAmpBias(lenslope, nbins, gene_len, amp_bias_limit)
  capture_lower <- 0.0005 # lower bound for capture efficiency
  depth_lower <- 200      # lower bound for sequencing depth
  cell_capture <- .rnormTrunc(n = n_cell, mean = alpha_mean, sd = alpha_sd, a = capture_lower, b = 1)
  gene_scale <- .rnormTrunc(n = n_gene, mean = alpha_gene_mean, sd = alpha_gene_sd, a = 0, b = 3)
  capture_mtx <- outer(gene_scale, cell_capture) # gene x cell capture rates
  cell_depth <- .rnormTrunc(n = n_cell, mean = depth_mean, sd = depth_sd, a = depth_lower, b = Inf)

  simulate_cell <- function(icell) {
    # progress indicator every 50 cells
    if (icell %% 50 == 0) cat(sprintf("%d..", icell))
    .amplifyOneCell(true_counts_1cell = true_counts[, icell], protocol = protocol,
                    rate_2cap = c(capture_mtx[, icell], cell_capture[icell]),
                    gene_len = gene_len, amp_bias = amp_bias,
                    rate_2PCR = rate_2PCR, nPCR1 = nPCR1, nPCR2 = nPCR2, LinearAmp = LinearAmp,
                    LinearAmp_coef = LinearAmp_coef, N_molecules_SEQ = cell_depth[icell])
  }
  per_cell <- lapply(seq(n_cell), simulate_cell)
  gc()

  tech_meta <- data.frame(alpha = cell_capture, depth = cell_depth, stringsAsFactors = FALSE)
  meta_cell <- cbind(meta_cell, tech_meta)

  if (protocol == "UMI") {
    # each per-cell result is a list: UMI counts, reads per UMI, number of UMIs sequenced
    umi_counts <- do.call(cbind, lapply(per_cell, "[[", 1))
    reads_per_umi <- lapply(per_cell, "[[", 2)
    n_umi_sequenced <- vapply(per_cell, "[[", numeric(1), 3)
    return(list(counts = umi_counts, cell_meta = meta_cell, nreads_perUMI = reads_per_umi,
                nUMI2seq = n_umi_sequenced))
  }

  return(do.call(cbind, per_cell))
}
340 | 
341 | 
#' Simulate observed ATAC-seq matrix given technical noise and the true counts
#'
#' The input is rounded to integers first. Every strictly positive entry is
#' then down-sampled with a binomial draw and perturbed with Gaussian noise
#' whose standard deviation is `sd_frac` times the down-sampled value; the
#' result is floored at zero. Entries that are zero after rounding are left
#' untouched.
#' @param atacseq_data true ATAC-seq data
#' @param observation_prob for each integer count of a particular region for a particular cell, the probability the count will be observed
#' @param sd_frac the fraction of ATAC-seq data value used as the standard deviation of added normally distributed noise
#' @param randseed not read by the function body (the `set.seed()` call is commented out)
#' @return a matrix of observed ATAC-seq data
#' @export
#' @examples
#' results <- sim_example(ncells = 10)
#' True2ObservedATAC(results$atac_counts, randseed = 1)
True2ObservedATAC <- function(atacseq_data, randseed, observation_prob = 0.3, sd_frac = 0.1) {
  atacseq_data <- round(atacseq_data)
  atacseq_noisy <- atacseq_data
  # seq_len() instead of seq(): seq(0) is c(1, 0), which made the loops index
  # row/column 1 and 0 of a matrix with an empty dimension.
  for (icell in seq_len(ncol(atacseq_data))) {
    for (iregion in seq_len(nrow(atacseq_data))) {
      true_count <- atacseq_data[iregion, icell]
      if (true_count > 0) {
        # binomial drop-out first, then noise proportional to what was observed
        observed <- rbinom(n = 1, size = true_count, prob = observation_prob)
        atacseq_noisy[iregion, icell] <- max(observed + rnorm(1, mean = 0, sd = observed * sd_frac), 0)
      }
    }
  }
  return(atacseq_noisy)
}
366 | 
367 | 
#' Simulate technical biases
#'
#' The amplification bias of a gene is the sum of a length-dependent term and a
#' random term. Genes are ranked by length and split into `nbins` bins of
#' `floor(ngenes / nbins)` genes each (the last bin takes the remainder);
#' bin `k` receives the bias `-k * lenslope`, centred on the median over bins.
#' The random term is normal with standard deviation `max_rand_bias` and is
#' clipped to `[-max_rand_bias, max_rand_bias]`.
#' @param lenslope amount of length bias. This value sould be less than 2*amp_bias_limit\[2\]/(nbins-1)
#' @param nbins number of bins for gene length
#' @param gene_len transcript length of each gene
#' @param amp_bias_limit range of amplification bias; only the second element (the upper limit) is read here
#' @keywords internal
#' @return a vector
.calAmpBias <- function(lenslope, nbins, gene_len, amp_bias_limit) {
  ngenes <- length(gene_len)
  len_bias_bin <- -seq_len(nbins) * lenslope
  len_bias_bin <- len_bias_bin - median(len_bias_bin)
  if (max(len_bias_bin) > amp_bias_limit[2]) {
    stop("The lenslope parameter is too large.")
  }
  # whatever head-room the length bias leaves is available for the random part
  max_rand_bias <- amp_bias_limit[2] - max(len_bias_bin)

  rand_bias <- rnorm(ngenes, mean = 0, sd = max_rand_bias)
  rand_bias <- pmax(pmin(rand_bias, max_rand_bias), -max_rand_bias)

  # Assign a bin to every length rank. The first (nbins - 1) bins hold
  # `binsize` genes each; every remaining rank falls into the last bin. Using
  # rep()/seq_len() keeps this well defined for nbins == 1, where the former
  # `seq(nbins - 1)` loop iterated over c(1, 0) and failed.
  binsize <- floor(ngenes / nbins)
  n_in_full_bins <- (nbins - 1) * binsize
  bin_of_rank <- rep(nbins, ngenes)
  if (n_in_full_bins > 0) {
    bin_of_rank[seq_len(n_in_full_bins)] <- rep(seq_len(nbins - 1), each = binsize)
  }
  bin4genes <- numeric(ngenes)
  bin4genes[order(gene_len)] <- bin_of_rank

  len_bias <- len_bias_bin[bin4genes]
  amp_bias <- rand_bias + len_bias
  return(amp_bias)
}
403 | 
404 | 
#' expand transcript counts to one entry per transcript
#'
#' @param true_counts_1cell number of transcript in one cell
#' @keywords internal
#' @return a list of two vectors: the first is a vector of 1s with one entry
#'   per transcript, the second gives, for every transcript, the index of the
#'   gene it belongs to (`NULL` when no gene has a positive count)
.expandToBinary <- function(true_counts_1cell) {
  names(true_counts_1cell) <- NULL
  n_transcripts <- sum(true_counts_1cell)
  expressed <- which(true_counts_1cell > 0)
  gene_of_transcript <- if (length(expressed) == 0) {
    NULL
  } else {
    # repeat each expressed gene's index once per transcript of that gene
    rep(expressed, times = true_counts_1cell[expressed])
  }
  list(rep(1, n_transcripts), gene_of_transcript)
}
417 | 
#' sample from truncated normal distribution
#'
#' Draws `n` normal values and then redraws, one value at a time, every entry
#' that fell outside `[a, b]` until the replacement lies inside the interval.
#' @param n number of values to create
#' @param a the minimum value allowed
#' @param b the maximum value allowed
#' @param mean mean of the normal distribution
#' @param sd standard deviation of the normal distribution
#' @keywords internal
#' @return a vector of length n
.rnormTrunc <- function(n, mean, sd, a, b) {
  vec1 <- rnorm(n, mean = mean, sd = sd)
  beyond_idx <- which(vec1 < a | vec1 > b)
  if (length(beyond_idx) == 0) {
    return(vec1)
  }

  # Rejection sampling cannot terminate in these situations, so fail loudly
  # instead of looping forever.
  if (isTRUE(a > b)) {
    stop("The lower bound `a` must not exceed the upper bound `b`.")
  }
  if (isTRUE(all(sd == 0))) {
    stop("`sd` is 0 and `mean` lies outside [a, b]; cannot sample.")
  }

  for (k in beyond_idx) {
    repeat {
      temp <- rnorm(1, mean = mean, sd = sd)
      # accept only values inside [a, b]; the former test `temp > a | temp > b`
      # also accepted values above the upper bound
      if (temp >= a && temp <= b) break
    }
    vec1[k] <- temp
  }
  return(vec1)
}
440 | 
# Map a gene length to a probability: 0.7 for lengths of at least 1000,
# 0.78 for lengths in [100, 1000), and 0 for anything shorter than 100.
.getProb <- function(glength) {
  if (glength >= 1000) {
    return(0.7)
  }
  if (glength >= 100) {
    return(0.78)
  }
  return(0)
}
448 | 
#' Add outliers to the observed counts
#'
#' Picks `floor(ngenes * prob)` candidate genes and, for each of them, multiplies
#' the observed counts of `cell.num` randomly chosen cells by a factor drawn
#' from `rnorm(1, factor, sd)`. When the result object carries `.grn`, only
#' genes with an index above `max(res$.grn$name_map)` are candidates. Genes
#' whose row variance exceeds `max.var` are never chosen.
#'
#' The function assigns into `res$counts_obs` and returns nothing.
#' NOTE(review): the change is only visible to the caller if `res` has
#' reference semantics (e.g. an environment) - confirm against the caller.
#' @param res The scMultisim result object
#' @param prob The probability of adding outliers for each gene
#' @param factor The factor of the outliers
#' @param sd The standard deviation of the outliers
#' @param cell.num For a gene, the number of cells chosen to add outliers
#' @param max.var The maximum variance allowed
#' @export
#' @return none
add_outliers <- function (
  res, prob = 0.01, factor = 2, sd = 0.5, cell.num = 1, max.var = Inf
) {
  if (is.null(res$counts_obs)) {
    stop("No counts found in the result object")
  }
  ngenes <- nrow(res$counts_obs)
  ncells <- ncol(res$counts_obs)
  gene_range <- if (is.null(res$.grn)) {
    seq_len(ngenes)
  } else {
    first_candidate <- max(res$.grn$name_map) + 1
    # `a:b` counts downwards when a > b, which would yield out-of-range genes
    if (first_candidate > ngenes) integer(0) else first_candidate:ngenes
  }
  gene_range <- setdiff(gene_range, which(rowVars(res$counts_obs) > max.var))
  # Index through sample.int(): sample(x, n) with a length-one numeric `x`
  # samples from 1:x, i.e. it could pick genes outside `gene_range`.
  chosen_genes <- gene_range[sample.int(length(gene_range), floor(ngenes * prob))]
  for (i in chosen_genes) {
    chosen_cells <- sample.int(ncells, cell.num)
    q <- rnorm(1, factor, sd)
    message(sprintf("Gene %d, cells %s, factor %.2f", i, paste(chosen_cells, collapse = ", "), q))
    res$counts_obs[i, chosen_cells] <- res$counts_obs[i, chosen_cells] * q
  }
  invisible(NULL)
}
481 | 


--------------------------------------------------------------------------------
/R/8_utils.R:
--------------------------------------------------------------------------------
  1 | # String concatenation
  2 | `%+%` <- function(a, b) paste0(a, b)
  3 | 
  4 | 
# get default arguments
#
# Collects the values given in `...` as a named list of defaults and then
# overwrites every entry whose name also appears in `args`.
#   args: named list of overriding values, or NULL
#   ...:  the default values, as name = value pairs
# Returns the merged list.
.defaultArgs <- function(args = NULL, ...) {
  defaults <- list2(...)
  if (is.null(args)) {
    # NOTE(review): substituting `list(...)` in the caller's frame appears to
    # pick up the caller's own `...` as the overrides - confirm this is the
    # intended source when `args` is not supplied.
    args <- eval(substitute(list(...), env = parent.frame()))
  }
  # entries in `args` take precedence over the defaults of the same name
  for (name in names(args)) {
    defaults[[name]] <- args[[name]]
  }
  defaults
}
 16 | 
 17 | 
 18 | .regionToTFMatrix <- function(GRN, region_to_gene, .all.genes = FALSE) {
 19 |   res <- matrix(0, nrow = nrow(region_to_gene), ncol = GRN$n_reg)
 20 |   # GRN$geff: gene x tf
 21 |   geff <- GRN$geff > 0
 22 |   # region_to_gene: region x gene
 23 |   # for each region
 24 |   for (i in seq_len(nrow(region_to_gene))) {
 25 |     # get genes in this region
 26 |     genes <- which(region_to_gene[i, ] > 0)
 27 |     if (length(genes) == 0) {
 28 |       next
 29 |     }
 30 |     if (.all.genes) {
 31 |       # if a TF also regulates all these genes
 32 |       tfs <- which(colSums(geff[genes, , drop = F]) == length(genes))
 33 |     } else {
 34 |       tfs <- which(colSums(geff[genes, , drop = F]) > 0)
 35 |     }
 36 |     res[i, tfs] <- 1
 37 |   }
 38 |   res
 39 | }
 40 | 
 41 | 
 42 | #' sample from smoothed density function
 43 | #' @param nsample number of samples needed
 44 | #' @param den_fun density function estimated from density() from R default
 45 | #' @param reduce.mem use alternative implementation to reduce memory usage
 46 | #' @keywords internal
 47 | #' @return a vector of samples
 48 | SampleDen <- function(nsample, den_fun, reduce.mem = FALSE) {
 49 |   probs <- den_fun$y / sum(den_fun$y)
 50 |   bw <- den_fun$x[2] - den_fun$x[1]
 51 |   probs_seq = seq_along(probs)
 52 |   mins <- den_fun$x[probs_seq] - 0.5 * bw
 53 |   maxs <- den_fun$x[probs_seq] + 0.5 * bw
 54 | 
 55 |   if (reduce.mem) {
 56 |     counts <- rmultinom(n = 1, size = nsample, prob = probs)
 57 |     total_samples <- sum(counts)
 58 |     samples <- runif(total_samples) *
 59 |       rep(maxs - mins, times = counts) +
 60 |       rep(mins, times = counts)
 61 |   } else {
 62 |     bin_id <- sample(size = nsample, x = probs_seq, prob = probs, replace = TRUE)
 63 |     counts <- tabulate(bin_id, nbins = length(probs))
 64 |     total_samples <- sum(counts)
 65 |     samples <- numeric(length = total_samples)
 66 |     cum_counts <- c(0, cumsum(counts))
 67 |     for (j in 1:length(counts)) {
 68 |       if (counts[j] > 0) {
 69 |         samples[(cum_counts[j] + 1):cum_counts[j + 1]] <-
 70 |           runif(counts[j], min = mins[j], max = maxs[j])
 71 |       }
 72 |     }
 73 |   }
 74 | 
 75 |   return(samples)
 76 | }
 77 | 
 78 | 
 79 | #' Creating an example tree with 5 tips
 80 | #' @param plotting True for plotting the tree on console, False for no plot
 81 | #' @return a R phylo object
 82 | #' @export
 83 | #' @examples
 84 | #' Phyla5()
 85 | Phyla5 <- function(plotting = FALSE) {
 86 |   phyla <- rtree(2)
 87 |   phyla <- compute.brlen(phyla, 1)
 88 |   tip <- compute.brlen(phyla, 1)
 89 |   phyla <- bind.tree(phyla, tip, 1)
 90 |   phyla <- bind.tree(phyla, tip, 2)
 91 |   phyla <- bind.tree(phyla, tip, 2)
 92 |   phyla <- compute.brlen(phyla, c(1, 1, 1, 1, 1, 0.2, 0.2, 3))
 93 |   edges <- cbind(phyla$edge, phyla$edge.length)
 94 |   edges <- cbind(seq_along(edges[, 1]), edges)
 95 |   connections <- table(c(edges[, 2], edges[, 3]))
 96 |   root <- as.numeric(names(connections)[connections == 2])
 97 |   tips <- as.numeric(names(connections)[connections == 1])
 98 |   phyla$tip.label <- as.character(tips)
 99 |   if (plotting == TRUE) {
100 |     plot(phyla, show.tip.label = FALSE, lwd = 2)
101 |     tiplabels(cex = 2)
102 |     nodelabels(cex = 2)
103 |   }
104 |   return(phyla)
105 | }
106 | 
#' Creating an example tree with 3 tips
#'
#' A copy of a two-tip tree is grafted onto a two-tip tree, then fixed branch
#' lengths are assigned. Tips are labelled with their node numbers.
#' @param plotting True for plotting the tree on console, False for no plot
#' @return a R phylo object
#' @export
#' @examples
#' Phyla3()
Phyla3 <- function(plotting = FALSE) {
  phyla <- compute.brlen(rtree(2), 1)
  # the piece that gets grafted on: the same two-tip tree with unit branch lengths
  tip <- compute.brlen(phyla, 1)
  phyla <- bind.tree(phyla, tip, 1)
  phyla <- compute.brlen(phyla, c(1, 1, 1, 2))

  # nodes that occur exactly once among the edge end points are the tips
  edge_table <- cbind(phyla$edge, phyla$edge.length)
  node_degree <- table(c(edge_table[, 1], edge_table[, 2]))
  tips <- as.numeric(names(node_degree)[node_degree == 1])
  phyla$tip.label <- as.character(tips)

  if (plotting == TRUE) {
    plot(phyla, show.tip.label = FALSE, lwd = 2)
    tiplabels(cex = 2)
    nodelabels(cex = 2)
  }
  return(phyla)
}
134 | 
#' Creating a linear example tree
#'
#' Reads the Newick string `(A);` and assigns `len` as its branch length.
#' @param len length of the tree
#' @return a R phylo object
#' @export
#' @examples
#' Phyla1(len = 1)
Phyla1 <- function(len = 1) {
  single_branch <- ape::read.tree(text = "(A);")
  compute.brlen(single_branch, len)
}
146 | 
147 | 
# Summarise the edge table and node sets of a tree.
# Returns a list with
#   edges:    matrix with the columns id, from, to, len (one row per edge)
#   root:     node(s) that occur as a parent but never as a child
#   tips:     node(s) that occur as a child but never as a parent
#   internal: the union of all parent and child nodes, as computed here
.tree_info <- function(tree) {
  n_edge <- nrow(tree$edge)
  edges <- cbind(seq_len(n_edge), tree$edge, tree$edge.length)
  colnames(edges) <- c("id", "from", "to", "len")

  from_nodes <- unique(edges[, 2])
  to_nodes <- unique(edges[, 3])

  list(
    edges = edges,
    root = as.numeric(setdiff(from_nodes, to_nodes)),
    tips = as.numeric(setdiff(to_nodes, from_nodes)),
    internal = as.numeric(union(from_nodes, to_nodes))
  )
}
160 | 
# print a summary of simulation parameters
#
# Writes to the console: the intrinsic noise, gene and CIF counts taken from
# `sim$N`, optional GRN / spatial regulator counts, and the dimensions of the
# CIF, GIV, kinetic parameter, ATAC CIF, region-to-gene and ATAC matrices held
# in `sim`.
.print_param_summary <- function(sim) {
  cat(sprintf("intr noise: %g\n", sim$options$intrinsic.noise))

  N <- sim$N
  cat("======== Params Summary ========\n")
  cat(sprintf("Genes: %d (%d GRN + %d Non-GRN)\n", N$gene, N$grn.gene, N$non.grn.gene))
  # One line per kinetic parameter. Each line carries its own newline: with
  # `sep = "\n"` cat() wrote nothing after the last element, so the following
  # output was appended to the "CIF_s" line.
  cat(sprintf("CIF_%s: %d (%d nd + %d diff) + %d reg\n",
              c("kon", "koff", "s"), N$cif, N$nd.cif, N$diff.cif, N$reg_cif), sep = "")
  if (!is.null(sim$GRN)) {
    cat(sprintf("GRN: %d regulators, %d targets\n", sim$GRN$n_reg, sim$GRN$n_tgt))
  }
  if (sim$do_spatial) {
    cat(sprintf("Spatial: %d regulators\n", length(sim$sp_regulators)))
  }

  cat("Params:\n")
  cat("  CIF ")
  if (sim$do_spatial) {
    cat("(NA)\n")
  } else {
    .print_matrix_dim(sim$CIF_all$cif$kon, "kon", newline = FALSE)
    .print_matrix_dim(sim$CIF_all$cif$koff, "koff", newline = FALSE)
    .print_matrix_dim(sim$CIF_all$cif$s, "s")
  }

  cat("  GIV ")
  .print_matrix_dim(sim$GIV$kon, "kon", newline = FALSE)
  .print_matrix_dim(sim$GIV$koff, "koff", newline = FALSE)
  .print_matrix_dim(sim$GIV$s, "s")

  cat("  Params ")
  if (sim$do_spatial) {
    # in spatial mode the dimensions of the first element of `params_spatial` are shown
    .print_matrix_dim(sim$params_spatial[[1]]$kon, "kon", newline = FALSE)
    .print_matrix_dim(sim$params_spatial[[1]]$koff, "koff")
  } else {
    .print_matrix_dim(sim$params$kon, "kon", newline = FALSE)
    .print_matrix_dim(sim$params$koff, "koff")
  }

  .print_matrix_dim(sim$CIF_atac, "  CIF_atac")
  .print_matrix_dim(sim$region_to_gene, "  Region2Gene")
  .print_matrix_dim(sim$atac_data, "  ATAC")

  cat("================================\n")
}
207 | 
# Print the dimensions of a matrix as "<rows>x<cols>".
#   mtx:     the matrix whose dimensions are printed
#   name:    optional label; when given the output is "<name>: <rows>x<cols>  "
#   newline: whether to finish with a line break
.print_matrix_dim <- function(mtx, name = NULL, newline = TRUE) {
  dim_text <- if (is.null(name)) {
    sprintf("%dx%d", nrow(mtx), ncol(mtx))
  } else {
    sprintf("%s: %dx%d  ", name, nrow(mtx), ncol(mtx))
  }
  cat(dim_text)
  if (newline) cat("\n")
}
218 | 
# Print the time elapsed since `sim$start_time`, in minutes.
.print_time <- function(sim) {
  elapsed <- Sys.time() - sim$start_time
  elapsed_mins <- as.numeric(elapsed, units = "mins")
  cat(sprintf("Time spent: %.2f mins\n", elapsed_mins))
}
222 | 
# Placeholder: reads the GRN regulators and targets from `sim`, but both
# branches below are empty, so nothing is printed.
# NOTE(review): looks unfinished - confirm whether it is still needed.
.print_gene_in_grn <- function(sim) {
  rg <- sim$GRN$regulators
  tg <- sim$GRN$targets

  if (sim$do_spatial) {
    # not implemented
  } else {
    # not implemented
  }
}
233 | 
234 | 
#' Simulate a small example dataset with the 100-gene GRN
#'
#' Runs `sim_true_counts()` with a fixed set of options: the `GRN_params_100`
#' network, the `Phyla3()` tree, 20 CIFs and `rand.seed = 0`.
#' @param ncells number of cells (default 10), please increase this number on your machine
#' @param velocity whether to simulate RNA velocity
#' @return the simulation result
#' @export
#' @examples
#' sim_example(ncells = 10)
sim_example <- function(ncells = 10, velocity = FALSE) {
  # load the bundled GRN into this function's environment
  data(GRN_params_100, envir = environment())
  sim_true_counts(list(
    rand.seed = 0,
    GRN = GRN_params_100,
    num.cells = ncells,
    num.cifs = 20,
    cif.sigma = 0.5,
    tree = Phyla3(),
    diff.cif.fraction = 0.8,
    do.velocity = velocity
  ))
}
256 | 
257 | 
#' Simulate a small example dataset with the 100-gene GRN, with CCI enabled
#'
#' Runs `sim_true_counts()` with a fixed set of options: the `GRN_params_100`
#' network, 110 genes, the `Phyla3()` tree, 50 CIFs, `rand.seed = 0`, and two
#' ligand-receptor interactions passed through the `cci` option.
#' @param ncells number of cells (default 10), please increase this number on your machine
#' @return the simulation result
#' @export
#' @examples
#' sim_example_spatial(ncells = 10)
sim_example_spatial <- function(ncells = 10) {
  # load the bundled GRN into this function's environment
  data(GRN_params_100, envir = environment())

  ligand_table <- data.frame(
    target    = c(101, 102),
    regulator = c(103, 104),
    effect    = c(5.2, 5.9)
  )
  cci_options <- list(
    params = ligand_table,
    max.neighbors = 4,
    cell.type.interaction = "random",
    step.size = 0.5
  )

  sim_true_counts(list2(
    rand.seed = 0,
    GRN = GRN_params_100,
    num.genes = 110,
    num.cells = ncells,
    num.cifs = 50,
    tree = Phyla3(),
    intrinsic.noise = 0.5,
    cci = cci_options
  ))
}
288 | 
# Kernel density estimate of log2(x + 1) over the strictly positive entries of
# `data`, evaluated at 999 points. Returns the object produced by density().
atac_dens_nonzero <- function(data) {
  # keep only the entries greater than zero
  x <- data[data > 0]
  density(x = log2(x + 1), adjust = 1, n = 999)
}
293 | 


--------------------------------------------------------------------------------
/R/9.1_shiny.R:
--------------------------------------------------------------------------------
  1 | #' Launch the Shiny App to configure the simulation
  2 | #' @export
  3 | run_shiny <- function() {
  4 |   appDir <- system.file("shiny-app", package = "scMultiSim")
  5 |   # appDir <- "inst/shiny-app"
  6 |   shiny::runApp(appDir, port = 8888, launch.browser = T)
  7 | }
  8 | 
# Build a spatial grid and place `opt$ncell` cells on it.
#
# `opt` is a list; the fields read here are tree, step_size, ncell,
# is_discrete, lr_num, ctype_lr, max_nbs, grid.size, same.type.prob and layout.
# Returns the grid object created by CreateSpatialGrid() after all cells have
# been allocated.
generateSpatialLoc <- function(opt) {
  phyla <- opt$tree
  step_size <- opt$step_size
  ncell <- opt$ncell
  is_discrete <- opt$is_discrete
  lr_num <- opt$lr_num
  ctype_lr <- opt$ctype_lr

  # only `ctp$type_map` is used further down
  ctp <- cci_cell_type_params(phyla, lr_num, ctype_lr, step_size,
                              rand = TRUE, discrete = is_discrete)

  c(paths, max_layers) %<-% .getPaths(
    list(cell = ncell),
    list(tree = phyla)
  )
  # assign every cell to one of the paths at random, with replacement
  cell_path <- sample(seq_along(paths), ncell, replace = TRUE)


  tree_info <- .tree_info(phyla)
  neutral <- SampleSubtree(
    tree_info$root, 0, 0,
    tree_info$edges,
    max_layers,
    step_size,
    neutral = NA
  )

  # keep the first `max_layers` rows; columns 1 and 2 are compared against
  # edge end points (from, to) below
  neutral <- neutral[1:max_layers,]
  # for every path: the rows of `neutral` lying on any of the path's edges
  layer_idx_by_path <- lapply(paths, function(path) {
    idx <- integer()
    for (i in 1:(length(path) - 1)) {
      a <- path[i]
      b <- path[i + 1]
      idx <- c(idx, which(neutral[, 1] == a & neutral[, 2] == b))
    }
    idx
  })

  # label every row of `neutral` with a cell type name: "from_to" when the
  # edge spans a single step, otherwise "from_to_<k>"
  cell_types <- character(length = nrow(neutral))
  for (i in 1:nrow(tree_info$edges)) {
    c(id, from, to, len) %<-% tree_info$edges[i,]
    # whole steps on this edge plus the ceiling of the remainder
    n_steps <- len %/% step_size + ceiling(len %% step_size)
    pts <- which(neutral[, 1] == from & neutral[, 2] == to)
    n_pts <- length(pts)
    cell_types[pts] <- if (n_steps == 1) {
      paste(from, to, sep = "_")
    } else {
      # spread the points of this edge over the n_steps sub-types
      type_id <- ceiling(1:n_pts * (n_steps / n_pts))
      paste(from, to, type_id, sep = "_")
    }
  }

  # per path: a data frame with the population ("from_to") and the cell type
  # of every layer on that path
  meta_by_path <- lapply(seq_along(paths), function(i_path) {
    idx <- layer_idx_by_path[[i_path]]
    n <- neutral[idx,]
    data.frame(
      pop = apply(n[, 1:2], 1, \(X) paste0(X, collapse = "_")),
      cell.type = cell_types[idx]
    )
  })

  # when a type map is available, add the index looked up by cell type name
  if (!is.null(ctp$type_map)) {
    for (i in seq_along(meta_by_path)) {
      meta_by_path[[i]] <- cbind(
        meta_by_path[[i]],
        data.frame(cell.type.idx = ctp$type_map[meta_by_path[[i]]$cell.type])
      )
    }
  }

  final_ctype <- integer(length = ncell)
  for (i in seq_len(ncell)) {
    final_ctype[i] <- if (is_discrete) {
      # NOTE(review): `meta` is never assigned in this function; unless a
      # variable of that name exists in an enclosing environment, the discrete
      # branch fails here - confirm the intended source.
      meta[i, "cell.type.idx"]
    } else {
      path_i <- cell_path[i]
      # row ncell - i + 1 of the cell's path, capped at the path's last row
      layer <- min(ncell - i + 1, nrow(meta_by_path[[path_i]]))
      meta_by_path[[path_i]][layer, "cell.type.idx"]
    }
  }


  grid <- CreateSpatialGrid(
    ncells = ncell,
    max_nbs = opt$max_nbs,
    .grid.size = opt$grid.size,
    .same.type.prob = opt$same.type.prob,
    .method = opt$layout,
    .method.param = NULL,
    .nb.radius = 1
  )

  grid$set_final_ctypes(final_ctype)
  # place the cells one by one; in the non-discrete case the value passed as
  # the cell type is the index of the cell's path
  for (i in 1:ncell) {
    # NOTE(review): same undefined `meta` as above in the discrete case
    new_cell_type <- if (is_discrete) meta[i, "cell.type.idx"] else cell_path[i]
    grid$allocate(i, new_cell_type)
  }

  grid
}
109 | 


--------------------------------------------------------------------------------
/R/9_meta.R:
--------------------------------------------------------------------------------
# Version string printed in the header of scmultisim_help("options").
.ver <- "1.2.0"
 2 | 
 3 | #' Show detailed documentations of scMultiSim's parameters
 4 | #'
 5 | #' @param topic Can be `options`, `dynamic.GRN`, or `cci`
 6 | #' @return none
 7 | #' @export
 8 | #'
 9 | #' @examples scmultisim_help()
10 | scmultisim_help <- function(topic = NULL) {
11 |   if (is.null(topic)) {
12 |     meta_help <- "Call scmultisim_help(topic) where topic can be \"options\" or an option name. Printing help for options by default.\n"
13 |     sprintf(.split_long_string(meta_help)) %>% cat()
14 |     topic <- "options"
15 |   }
16 |   
17 |   if (topic == "options") {
18 |     sprintf("scMultiSim  v%s\n", .ver) %>% cat()
19 |     .print_opt()
20 |     return()
21 |   }
22 |   
23 |   if (topic == "dynamic.GRN") {
24 |     .dynamic_grn_default_params(help = TRUE)
25 |     return()
26 |   }
27 |   
28 |   if (topic == "cci") {
29 |     .cci_help()
30 |     return()
31 |   }
32 |   
33 |   .print_opt(topic)
34 | }
35 | 
36 | 
# Print the help text for the `cci` option: the names accepted in the option
# list (grid.size, layout, params, step.size, cell.type.interaction,
# cell.type.lr.pairs, max.neighbors, radius, start.layer) and their meaning.
.cci_help <- function() {
  cat("
To enable simulating cell-cell interaction, the value should be a list including
the following names:

- grid.size: (integer)
    Manually specify the width and height of the grid.
- layout: (character or function)
    Supported values are \"enhanced\",  \"layers\", \"islands\", or a custom function.
    If set to \"islands\", you can specify which cell types are the islands,
        e.g. \"islands:1,2\".
    The custom function should take two arguments: (grid_size, cell_types)
        grid_size: (integer)
            The width and height of the grid.
        cell_types: (integer vector)
            Each cell's cell type.
    It should return a n_cell x 2 matrix, where each row is the x and y coordinates of a cell.
- params: (data.frame)
    The spatial effect between neighbor cells.
    It should be a data frame similar to the GRN parameter.
- step.size: (number, optional)
    If using continuous population, use this step size to further divide the
    cell types on the tree. For example, if the tree only has one branch `a -> b`
    and the branch length is 1 while the step size is 0.34,
    there will be totally three cell types: a_b_1, a_b_2, a_b_3.
- cell.type.interaction: (\"random\" or a matrix)
    The interaction level between different cell types.
    They act as factors multiplied to the ligand effect.
    Supply the string \"random\" to let scMultiSim generate these factors randomly.
    Otherwise, use cci_cell_type_params() to generate the template data structure.
    See the help of this method for more info.
- cell.type.lr.pairs: (integer vector)
    If cell.type.interaction is \"random\", how many LR pairs should be enabled
    between each cell type pair.
    Should be a range, e.g. 4:6. The actual number of LR pairs will be uniformly
    sampled from this range.
- max.neighbors: (integer)
    Constraint the maxinum number of neighbors with CCI for each cell.
    The neighbors with CCI will be randomly sampled.
- radius: (number or string)
    Which cells should be considered as neighbors.
    The interacting cells are those within these neighbors.
    When it is a number, it controls the maximum distance between two cells for
    them to interact.
    When it is a string, it should be in the format `gaussian:sigma`, for example,
    `gaussian:1.2`.
    In this case, the probability of two cells interacting is proportional to
    the distance with a Gaussian kernel applied.
- start.layer: (integer)
    From which layer (time step) the simulation should start.
    If set to 1, the simulation will start with one cell in the grid and add one
    more cell in each following layer.
    If set to `num_cells`, the simulation will start from all cells available in
    the grid and only continues for a few static layers, which will greatly speed
    up the simulation.
      ")
}
94 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' distribution of kinetic parameters learned from the Zeisel UMI cortex datasets
 2 | #' @name match_params
 3 | #' @docType data
 4 | #' @usage data(param_realdata.zeisel.imputed)
 5 | #' @format a data frame.
 6 | #' @return a data frame.
 7 | #' @keywords datasets internal
 8 | #' @examples 
 9 | #' data(param_realdata.zeisel.imputed)
10 | "match_params"
11 | 
12 | 
13 | #' a pool of gene lengths to sample from
14 | #' @name gene_len_pool
15 | #' @docType data
16 | #' @usage data(gene_len_pool)
17 | #' @format a vector.
18 | #' @return a vector of gene lengths.
19 | #' @keywords datasets internal
20 | #' @examples 
21 | #' data(gene_len_pool)
22 | "gene_len_pool"
23 | 
24 | 
25 | #' from transcript length to number of fragments (for the nonUMI protocol)
26 | #' @name len2nfrag
27 | #' @docType data
28 | #' @usage data(len2nfrag)
29 | #' @format a vector.
30 | #' @return a vector.
31 | #' @keywords datasets internal
32 | #' @examples 
33 | #' data(len2nfrag)
34 | "len2nfrag"
35 | 
36 | 
37 | #' this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data
38 | #' @name dens_nonzero
39 | #' @docType data
40 | #' @usage data(dens_nonzero)
41 | #' @format a vector.
42 | #' @return a vector.
43 | #' @keywords datasets
44 | #' @examples
45 | #' data(dens_nonzero)
46 | "dens_nonzero"
47 | 
48 | 
#' GRN_params_100 is a data frame of GRN params consisting of 100 genes, where column 1 is the target gene ID, column 2 is the ID of the gene which acts as a transcription factor for the target (regulated) gene, and column 3 is the effect of the column 2 gene on the column 1 gene
50 | #' @name GRN_params_100
51 | #' @docType data
52 | #' @usage data(GRN_params_100)
53 | #' @format a data frame.
54 | #' @return a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene.
55 | #' @keywords datasets
56 | #' @examples
57 | #' data(GRN_params_100)
58 | "GRN_params_100"
59 | 
60 | 
61 | #' GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: column 1 is the target gene ID, column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene, and column 3 is the effect of the column 2 gene ID on the column 1 gene ID
62 | #' @name GRN_params_1139
63 | #' @docType data
64 | #' @usage data(GRN_params_1139)
65 | #' @format a data frame.
66 | #' @return a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene.
67 | #' @keywords datasets
68 | #' @examples
69 | #' data(GRN_params_1139)
70 | "GRN_params_1139"
71 | 
72 | 


--------------------------------------------------------------------------------
/R/imports.R:
--------------------------------------------------------------------------------
 1 | #' @import ggplot2
 2 | #' @import ape
 3 | #' @import rlang
 4 | #' @import foreach
 5 | #' @import markdown
 6 | #' @importFrom stats cor density dist dnorm hclust median na.omit rbeta rbinom rnorm rpois runif setNames
 7 | #' @importFrom Rtsne Rtsne
 8 | #' @importFrom utils data write.csv
 9 | #' @importFrom dplyr %>%
10 | #' @importFrom zeallot %<-% %->%
11 | #' @importFrom SummarizedExperiment SummarizedExperiment
12 | #' @importFrom BiocParallel bplapply MulticoreParam
13 | NULL
14 | 


--------------------------------------------------------------------------------
/R/results.R:
--------------------------------------------------------------------------------
 1 | rna_velo_knn <- function(results, velocity, perplexity = 70, randseed = 0, raw = FALSE) {
 2 |   # set.seed(randseed)
 3 |   counts_s <- results$counts
 4 |   pop <- results$cell_meta$pop
 5 |   depth <- results$cell_meta$depth
 6 |   
 7 |   counts_s_lg <- t(log2(counts_s + 1))
 8 |   
 9 |   if (is.null(results$velocity)) {
10 |     stop("The result object is not produced in velocity mode.")
11 |   }
12 |   
13 |   process_velocity <- function(v) {
14 |     assertthat::assert_that(
15 |       nrow(counts_s) == nrow(v),
16 |       ncol(counts_s) == ncol(v)
17 |     )
18 |     
19 |     future_counts_s <- counts_s + v
20 |     future_counts_s[future_counts_s < 0] <- 0
21 |     future_counts_s_lg <- t(log2(future_counts_s + 1))
22 |     future_counts_s_lg - counts_s_lg
23 |   }
24 |   
25 |   
26 |   normalize_velocity <- function(v) {
27 |     v_normalizer <- apply(v, 2, \(vi) vi^2) %>% rowSums() %>% sqrt()
28 |     t(t(v) / v_normalizer)
29 |   }
30 |   
31 |   if (raw) {
32 |     return(
33 |       paired_simil(velocity, results$velocity, method = "cosine")
34 |     )
35 |   }
36 |   
37 |   dist_obj <- dist(counts_s_lg)
38 |   dist_mat <- as.matrix(dist_obj)
39 |   n_cells <- nrow(dist_mat)
40 |   k <- ceiling(n_cells / 50)
41 |   
42 |   v_knn <- process_velocity(velocity) %>%
43 |     apply(2, \(vi)
44 |       distMat.KernelKnn(dist_mat, TEST_indices = NULL,
45 |                         weights_function = 'gaussian',
46 |                         y = vi, k = k, regression = TRUE)
47 |     ) %>%
48 |     normalize_velocity()
49 |   
50 |   v_true_knn <- process_velocity(results$velocity) %>%
51 |     apply(2, \(vi)
52 |       distMat.KernelKnn(dist_mat, TEST_indices = NULL,
53 |                         weights_function = 'gaussian',
54 |                         y = vi, k = k, regression = TRUE)
55 |     ) %>%
56 |     normalize_velocity()
57 |   
58 |   sim <- paired_simil(v_knn, v_true_knn, method = "cosine")
59 |   
60 |   mean(sim)
61 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # scMultiSim
 2 | 
 3 | **Table of contents**
 4 | 
 5 | - [Tutorials and documentation](#tutorials-and-documentation)
 6 | - [Installation](#installation)
 7 | - [Shiny App](#shiny-app)
 8 | - [FAQ](#faq)
 9 | - [Contact and reference](#contact-and-reference)
10 | 
11 | scMultiSim is an in silico simulator that generates multi-modality data of single-cells, including gene expression, chromatin accessibility, RNA velocity, and spatial location of cells. It takes a cell differential tree and a gene regulatory network (GRN) as input, and simulates spliced and unspliced counts while accounting for the relationships between modalities. The output single cell gene expression data is determined by three factors: cell-cell interactions, within-cell GRNs and chromatin accessibility. Users can tune the effect of each factor on the output data and set various parameters for the underlying model. Furthermore, the GRN can be set in a time-varying mode where the network's structure changes temporally to reflect the dynamic nature of biological networks. We also provide options to simulate technical variations such as batch effects. scMultiSim can be used to benchmark challenging computational tasks on single-cell multi-omics data, including the inference of GRNs, estimation of RNA velocity, integration of single-cell datasets from multiple batches and modalities, and analysis of cell-cell interaction using the cell spatial location data.
12 | 
13 | ![Overview](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/scMultisim.png)
14 | 
15 | The following figure briefly shows results from the same cell differential tree:
16 | 
17 | 1. Connected scATAC-seq and scRNA-seq, in continuous or discrete mode. Visualized by t-SNE.
18 | 2. GRN correlation heatmap, where genes regulated by the same regulator have similar correlations with others.
19 | 3. Unspliced counts and RNA velocity ground truth visualized by t-SNE.
20 | 4. Spatial cell locations and cell-cell interaction ground truth.
21 | 5. Discrete cell population with added batch effects.
22 | 
23 | ![Results](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/results.png)
24 | 
25 | ## Tutorials and documentation
26 | 
27 | Please check out the [tutorials](https://zhanglabgt.github.io/scMultiSim/articles)
28 | for detailed instructions on how to use scMultiSim.
29 | 
30 | ## Installation
31 | 
32 | scMultiSim can be installed from Bioconductor using the following command:
33 | 
34 | ```R
35 | if (!require("BiocManager")) {
36 |   install.packages("BiocManager")
37 | }
38 | 
39 | BiocManager::install("scMultiSim")
40 | ```
41 | 
42 | ## Shiny App
43 | 
44 | A Shiny app is provided to help users visualize the effect of each parameter and adjust the simulation options.
45 | To run the app, simply call `run_shiny()`.
46 | 
47 | <img src="https://github.com/ZhangLabGT/scMultiSim/raw/img/img/shiny_app_sc.png" height="400">
48 | 
49 | ## FAQ
50 | 
51 | ### Running Speed
52 | 
53 | Simulations should finish in a reasonable time in most cases. On a machine with an i7-12700K CPU and 64GB RAM, using 1000 cells, 100 genes and 50 CIFs, the simulation took under 1 minute to generate both scRNA-seq and scATAC-seq data. If also generating unspliced and spliced counts, or enabling cell-cell interactions, the running time is longer (~3 minutes when RNA velocity is enabled, and 30 minutes for 500 cells with spatial cell-cell interaction enabled).
54 | 
55 | ## Contact and reference
56 | 
57 | GitHub issues are welcomed.
58 | It is also possible to send email to the main author
59 | `Hechen Li (hli691 at gatech.edu)`.
60 | 
61 | ### Please cite
62 | 
63 | Hechen Li, Ziqi Zhang, Michael Squires, Xi Chen, and Xiuwei Zhang. 2023. “scMultiSim: Simulation of Multi-Modality Single Cell Data Guided by Cell-Cell Interactions and Gene Regulatory Networks.” bioRxiv.
64 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
 1 | url: https://zhanglabgt.github.io/scMultiSim/
 2 | template:
 3 |   bootstrap: 5
 4 |   light-switch: true
 5 | articles:
 6 |   - title: Tutorials
 7 |     navbar: ~
 8 |     contents:
 9 |       - workflow
10 |       - basics
11 |       - spatialCCI
12 |       - options
13 | reference:
14 |   - title: Simulation
15 |     desc: Functions for simulating single-cell data.
16 |   - contents:
17 |       - sim_true_counts
18 |       - add_expr_noise
19 |       - divide_batches
20 |       - add_outliers
21 |   - title: Visualization
22 |     desc: Functions for visualizing the results.
23 |   - contents:
24 |     - starts_with("plot_")
25 |     - gene_corr_cci
26 |     - gene_corr_regulator
27 |   - title: Help
28 |     desc: Functions for getting help.
29 |   - contents:
30 |     - run_shiny
31 |     - scmultisim_help
32 |   - title: Utilities
33 |     desc: Utility functions that can be useful for simulating data.
34 |   - contents:
35 |     - cci_cell_type_params
36 |     - gen_clutter
37 |   - title: Data
38 |     desc: Default data provided by scMultiSim
39 |   - contents:
40 |     - starts_with("Phyla")
41 |     - has_keyword("datasets")
42 |   - title: Internal helpers
43 |     desc: Internal helper functions, but can be useful for advanced customization.
44 |   - contents:
45 |     - Get_1region_ATAC_correlation
46 |     - Get_ATAC_correlation
47 |     - True2ObservedATAC
48 |     - True2ObservedCounts
49 |     - sim_example_200_cells
50 |     - sim_example_200_cells_spatial
51 | 


--------------------------------------------------------------------------------
/data/GRN_params_100.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/GRN_params_100.RData


--------------------------------------------------------------------------------
/data/GRN_params_1139.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/GRN_params_1139.RData


--------------------------------------------------------------------------------
/data/dens_nonzero.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/dens_nonzero.RData


--------------------------------------------------------------------------------
/data/gene_len_pool.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/gene_len_pool.RData


--------------------------------------------------------------------------------
/data/len2nfrag.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/len2nfrag.RData


--------------------------------------------------------------------------------
/data/param_realdata.zeisel.imputed.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/param_realdata.zeisel.imputed.RData


--------------------------------------------------------------------------------
/inst/extdata/Newick_ABCDE.txt:
--------------------------------------------------------------------------------
1 | (((A:1,B:1):1,(C:0.5,D:0.5):1.5):1,E:3);
2 | 


--------------------------------------------------------------------------------
/inst/extdata/Newick_animals.txt:
--------------------------------------------------------------------------------
1 | ((raccoon:19.19959,bear:6.80041):0.84600,((sea_lion:11.99700, seal:12.00300):7.52973,((monkey:100.85930,cat:47.14069):20.59201, weasel:18.87953):2.09460):3.87382,dog:25.46154);


--------------------------------------------------------------------------------
/inst/shiny-app/app.R:
--------------------------------------------------------------------------------
  1 | library(shiny)
  2 | 
  3 | getGlobalObjects <- function() {
  4 |   res <- list()
  5 |   i <- 1
  6 |   obj.names <- ls(envir=.GlobalEnv)
  7 |   for (n in obj.names) {
  8 |     obj <- get(n, envir=.GlobalEnv)
  9 |     if (is.data.frame(obj) || is.matrix(obj)) {
 10 |       res[[i]] <- list(name = n, type = "data.frame")
 11 |     } else if (is.vector(obj)) {
 12 |       res[[i]] <- list(name = n, type = "vector", length = length(obj))
 13 |     } else if (is.list(obj)) {
 14 |       res[[i]] <- list(name = n, type = "list", length = length(obj))
 15 |     } else {
 16 |       res[[i]] <- list(name = n, type = "other")
 17 |     }
 18 |     i <- i + 1
 19 |   }
 20 |   res
 21 | }
 22 | 
 23 | getOptDefaults <- function() {
 24 |   opt_list <- .opt_list()
 25 |   names <- names(opt_list)
 26 |   opts <- seq_along(names)
 27 |   res <- list()
 28 |   
 29 |   for (i in opts) {
 30 |     n <- names[i]
 31 |     opt <- opt_list[[i]]
 32 |     if (n == "") next;
 33 |     val <- opt[[1]]
 34 |     if (!val[[1]]) {
 35 |       res[[n]] <- val[[2]]
 36 |     }
 37 |   }
 38 | 
 39 |   res
 40 | }
 41 | 
 42 | 
 43 | # Define server logic for random distribution app ----
 44 | server <- function(input, output, session) {
 45 |   grn_info <- reactive({
 46 |     print(input$GRN)
 47 |     error <- NULL
 48 |     grn_df <- get(input$GRN)
 49 |     if (!is.data.frame(grn_df)) {
 50 |       return(list(error = "GRN must be a data frame"))
 51 |     }
 52 |     if (ncol(grn_df) != 3) {
 53 |       return(list(error = "GRN must have 3 columns"))
 54 |     }
 55 |     rg_genes <- unique(grn_df[,2])
 56 |     tg_genes <- unique(grn_df[,1])
 57 |     all_genes <- unique(c(rg_genes, tg_genes))
 58 |     list(
 59 |       data = grn_df,
 60 |       ngenes = length(all_genes),
 61 |       nrows = nrow(grn_df),
 62 |       nregulators = length(rg_genes),
 63 |       ntargets = length(tg_genes),
 64 |       error = FALSE
 65 |     )
 66 |   })
 67 | 
 68 |   tree_info <- reactive({
 69 |     tree_name <- input$tree
 70 |     if (is.null(tree_name)) {
 71 |       return(list(error = "No tree selected"))
 72 |     }
 73 |     tree <- switch(tree_name,
 74 |       phyla1 = Phyla1(),
 75 |       phyla3 = Phyla3(),
 76 |       phyla5 = Phyla5(),
 77 |       tryCatch(
 78 |         { eval(parse(text=tree_name), envir=.GlobalEnv) }, error = function(e) NULL
 79 |       )
 80 |     )
 81 |     if (is.null(tree)) {
 82 |       return(list(error = paste0("Error loading tree: ", tree_name, ", please check syntax")))
 83 |     }
 84 |     edges <- cbind(1:nrow(tree$edge), tree$edge, tree$edge.length)
 85 |     colnames(edges) <- c("id", "from", "to", "len")
 86 |     parents <- unique(edges[, 2])
 87 |     children <- unique(edges[, 3])
 88 |     root <- setdiff(parents, children) %>% as.numeric()
 89 |     tips <- setdiff(children, parents) %>% as.numeric()
 90 |     internal <- union(parents, children) %>% as.numeric()
 91 | 
 92 |     list(error = FALSE, tree = tree, edges = edges, root = root, tips = tips, internal = internal)
 93 |   })
 94 | 
 95 |   grid <- eventReactive(input$submit_spatial, {
 96 |     opt <- input$submit_spatial
 97 |     g <- generateSpatialLoc(list(
 98 |       layout = opt$layout,
 99 |       tree = tree_info()$tree,
100 |       step_size = opt$stepSize,
101 |       ncell = opt$ncell,
102 |       is_discrete = F,
103 |       lr_num = 0,
104 |       ctype_lr = 0,
105 |       grid.size = opt$gridSize,
106 |       same.type.prob = opt$sameTypeProb,
107 |       max_nbs = 4
108 |     ))
109 | 
110 |     list(
111 |       locs = g$locs,
112 |       size = g$grid_size,
113 |       final_types = g$final_types
114 |     )
115 |   })
116 | 
117 |   # Generate a plot of the data ----
118 |   # Also uses the inputs to build the plot label. Note that the
119 |   # dependencies on the inputs and the data reactive expression are
120 |   # both tracked, and all expressions are called in the sequence
121 |   # implied by the dependency graph.
122 |   # output$plot <- renderPlot({
123 |   #   dist <- input$dist
124 |   #   n <- input$n
125 | 
126 |   #   hist(d(),
127 |   #        main = paste("r", dist, "(", n, ")", sep = ""),
128 |   #        col = "#007bc2", border = "white")
129 |   # })
130 | 
131 |   # Generate a summary of the data ----
132 |   # output$summary <- renderPrint({
133 |   #   summary(d())
134 |   # })
135 | 
136 |   # Generate an HTML table view of the head of the data ----
137 |   output$grn_head <- renderTable({
138 |     info <- grn_info()
139 |     if (is.character(info$error)) {
140 |       NULL
141 |     } else if (is.null(info$data)) {
142 |       "No data available"
143 |     } else {
144 |       head(data.frame(info$data))
145 |     }
146 |   }, html.table.attributes = 'class="table table-sm"')
147 | 
148 |   output$grn_summary <- renderText({
149 |     info <- grn_info()
150 |     if (is.character(info$error)) {
151 |       paste0("Error: ", info$error)
152 |     } else if (is.null(info$data)) {
153 |       "No data available"
154 |     } else {
155 |       paste("GRN with", info$nrows, "edges and" , info$ngenes, "genes, incl.", info$nregulators, "regulators and", info$ntargets, "targets")
156 |     }
157 |   })
158 | 
159 |   output$tree_plot <- renderPlot({
160 |     info <- tree_info()
161 |     if (is.null(info)) {
162 |       NULL
163 |     } else {
164 |       plot(info$tree, no.margin = TRUE)
165 |       nodelabels()
166 |     }
167 |   })
168 | 
169 |   observe({
170 |     session$sendCustomMessage(type = "RObjects", getGlobalObjects())
171 |   })
172 |   observe({
173 |     session$sendCustomMessage(type = "GRNInfo", grn_info())
174 |   })
175 |   observe({
176 |     session$sendCustomMessage(type = "TreeInfo", tree_info())
177 |   })
178 |   observe({
179 |     session$sendCustomMessage(type = "Grid", grid())
180 |   })
181 |   observe({
182 |     session$sendCustomMessage(type = "Defaults", getOptDefaults())
183 |   })
184 | 
185 |   observe({
186 |     print(input$generatedOptions)
187 |     if (!is.null(input$generatedOptions)) {
188 |       eval(parse(text=input$generatedOptions), envir=.GlobalEnv)
189 |     }
190 |   })
191 | 
192 |   observe({
193 |     print(input$stopApp)
194 |     if (is.character(input$stopApp) && input$stopApp == "YES") {
195 |       stopApp()
196 |     }
197 |   })
198 | }
199 | 
200 | a <- shinyApp(ui = htmlTemplate("www/index.html"), server)
201 | 


--------------------------------------------------------------------------------
/inst/shiny-app/www/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "printWidth": 140
3 | }
4 | 


--------------------------------------------------------------------------------
/inst/shiny-app/www/output.js:
--------------------------------------------------------------------------------
  1 | const defaultValues = {};
  2 | 
  3 | function outputOptions() {
  4 |   const options = {};
  5 | 
  6 |   function add(name, getter = null, inputName = null) {
  7 |     const defaultValue = defaultValues[name];
  8 |     let value;
  9 |     if (getter) {
 10 |       value = typeof getter === "function" ? getter() : getter;
 11 |     } else {
 12 |       const el = document.querySelector(`[name="${inputName || name}"]`);
 13 |       if (el === null) {
 14 |         console.error(`Element with name ${name} not found`);
 15 |         return;
 16 |       }
 17 |       value = el.type === "checkbox" ? el.checked.toString().toUpperCase() : el.value;
 18 |     }
 19 |     if (value !== defaultValue) {
 20 |       options[name] = value;
 21 |     }
 22 |   }
 23 | 
 24 |   const isDiscrete = document.querySelector("[name=_cellPop]:checked").value === "discrete";
 25 |   const grnEnabled = document.querySelector("[name=_grnEnabled]:checked").value === "on";
 26 | 
 27 |   add("seed");
 28 |   add("speed.up");
 29 | 
 30 |   add("num.cifs");
 31 |   add("diff.cif.fraction");
 32 |   add("cif.center");
 33 |   add("cif.sigma");
 34 |   add("giv.mean");
 35 |   add("giv.sd");
 36 |   add("giv.prob");
 37 |   add("num.cells");
 38 | 
 39 |   if (isDiscrete) {
 40 |     add("discrete.cif", "TRUE");
 41 |     const numCluster = document.querySelector("[name=_numCluster]").value;
 42 |     add("tree", `rtree(${numCluster})`);
 43 |     add("discrete.min.pop.size");
 44 |     const discretePopSize = document.querySelector("[name=discretePopSize]").value;
 45 |     if (discretePopSize.length > 0) {
 46 |       add("discrete.pop.size", `as.integer(c(${discretePopSize}))`);
 47 |     }
 48 |   } else {
 49 |     add("tree", () => {
 50 |       let treeVal = document.querySelector("[name=tree]:checked").value;
 51 |       if (treeVal === "custom") {
 52 |         treeVal = document.querySelector("[name='_treeCustom']").value;
 53 |       } else {
 54 |         treeVal = {
 55 |           phyla1: "Phyla1()",
 56 |           phyla3: "Phyla3()",
 57 |           phyla5: "Phyla5()",
 58 |         }[treeVal];
 59 |       }
 60 |       return treeVal;
 61 |     });
 62 |     add("use.impulse");
 63 |   }
 64 | 
 65 |   if (grnEnabled) {
 66 |     add("GRN");
 67 |     add("num.genes");
 68 |   } else {
 69 |     add("GRN", "NA");
 70 |     add("num.genes", null, "num.genes2");
 71 |   }
 72 | 
 73 |   const useCustomATACDensity = document.querySelector("[name=useCustomATACDensity]").checked;
 74 | 
 75 |   add("riv.mean");
 76 |   add("riv.sd");
 77 |   add("riv.prob");
 78 | 
 79 |   add("region.distrib", () => {
 80 |     const v = [0, 1, 2].map((i) => document.querySelector(`[name=_regionDist${i}]`).value).join(", ");
 81 |     return `c(${v})`;
 82 |   });
 83 |   add("atac.effect");
 84 |   add("atac.p_zero");
 85 |   if (useCustomATACDensity) {
 86 |     add("atac.density");
 87 |   }
 88 | 
 89 |   add("scale.s");
 90 |   add("bimod");
 91 | 
 92 |   const velocityEnabled = document.querySelector("[name=velocity]:checked").value === "on";
 93 |   if (velocityEnabled) {
 94 |     add("do.velocity", "TRUE");
 95 |     add("beta");
 96 |     add("d");
 97 |     add("num.cycles");
 98 |     add("cycle.len");
 99 |   } else {
100 |     add("intrinsic.noise");
101 |   }
102 | 
103 |   let cciOptions = null;
104 |   function add_sp(name, getter = null) {
105 |     let value;
106 |     if (getter) {
107 |       value = typeof getter === "function" ? getter() : getter;
108 |     } else {
109 |       const el = document.querySelector(`[name="sp.${name}"]`);
110 |       if (el === null) {
111 |         console.error(`Element with name ${name} not found`);
112 |         return;
113 |       }
114 |       value = el.type === "checkbox" ? el.checked.toString().toUpperCase() : el.value;
115 |     }
116 |     cciOptions[name] = value;
117 |   }
118 | 
119 |   const spatialEnabled = document.querySelector("[name=_spatialEnabled]").checked;
120 |   if (spatialEnabled) {
121 |     cciOptions = {};
122 | 
123 |     add_sp("grid.size");
124 |     add_sp("step.size");
125 |     add_sp("max.neighbors");
126 |     add_sp("params", () => document.querySelector("[name=_CCI]").value);
127 | 
128 |     const spLayout = document.querySelector("[name=_spLayout]").value;
129 |     if (spLayout === "normal") {
130 |       add_sp("layout", `"enhanced"`);
131 |       add_sp("same.type.prob");
132 |     } else if (spLayout === "islands") {
133 |       const spIslands = document.querySelector("[name=_spIslands]").value;
134 |       add_sp("layout", `"islands:${spIslands}"`);
135 |     } else if (spLayout === "layers") {
136 |       add_sp("layout", `"layers"`);
137 |     }
138 | 
139 |     if (cciOptions.params.length === 0) {
140 |       alert("Please select a CCI dataframe to simulate spatial data.");
141 |     }
142 |   }
143 | 
144 |   console.log(options, cciOptions);
145 | 
146 |   const optString = `options <- list(
147 |     ${Object.entries(options)
148 |       .map(([k, v]) => `${k} = ${v}`)
149 |       .join(", ")}
150 |     ${
151 |       cciOptions
152 |         ? `, cci = list(${Object.entries(cciOptions)
153 |             .map(([k, v]) => `${k} = ${v}`)
154 |             .join(", ")})`
155 |         : ""
156 |     }
157 |     )`;
158 |   Shiny.setInputValue("generatedOptions", optString);
159 |   const modal = new bootstrap.Modal(document.getElementById("outputModal"));
160 |   modal.show();
161 | }
162 | 
/**
 * Wire up the page controls once the DOM is ready: the output button builds
 * the option list, the reset button reloads the page, the stop button asks
 * the server to stop and closes the window, and the "Defaults" message from
 * the server is merged into `defaultValues`.
 */
function init() {
  const onReset = () => {
    location.reload();
  };

  const onStop = () => {
    Shiny.setInputValue("stopApp", "YES");
    close();
  };

  const onDefaults = (v) => {
    console.log("Def", v);
    Object.assign(defaultValues, v);
  };

  document.querySelector("#outputButton").addEventListener("click", outputOptions);
  document.querySelector("#resetButton").addEventListener("click", onReset);
  document.querySelector("#stopApp").addEventListener("click", onStop);
  Shiny.addCustomMessageHandler("Defaults", onDefaults);
}
181 | 
182 | document.addEventListener("DOMContentLoaded", init);
183 | 


--------------------------------------------------------------------------------
/inst/shiny-app/www/phyla1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/inst/shiny-app/www/phyla1.png


--------------------------------------------------------------------------------
/inst/shiny-app/www/phyla3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/inst/shiny-app/www/phyla3.png


--------------------------------------------------------------------------------
/inst/shiny-app/www/phyla5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/inst/shiny-app/www/phyla5.png


--------------------------------------------------------------------------------
/inst/shiny-app/www/scm_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/inst/shiny-app/www/scm_logo.png


--------------------------------------------------------------------------------
/inst/shiny-app/www/style.css:
--------------------------------------------------------------------------------
 1 | .text-small {
 2 |     font-size: 85%;
 3 | }
 4 | 
 5 | .bi {
 6 |     width: 1em;
 7 |     height: 1em;
 8 |     vertical-align: -.125em;
 9 |     fill: currentcolor;
10 | }
11 | 
12 | .legend-rect {
13 |     width: 16px;
14 |     display: inline-block;
15 |     height: 16px;
16 |     border: 1px solid black;
17 |     margin-right: 4px;
18 |     transform: translateY(2px);
19 | }
20 | 
21 | nav#mainNav {
22 |     border-bottom: 2px solid #555;
23 | }
24 | 


--------------------------------------------------------------------------------
/inst/shiny-app/www/validate.js:
--------------------------------------------------------------------------------
  1 | function setInvalid(el, msg) {
  2 |   el.classList.add("is-invalid");
  3 |   let sb = el.nextElementSibling;
  4 |   while (sb !== null && !sb.classList.contains("invalid-feedback")) {
  5 |     sb = sb.nextElementSibling;
  6 |   }
  7 |   if (sb === null) {
  8 |     console.error("No sibling with class 'invalid-feedback' found");
  9 |   }
 10 |   sb.innerText = msg;
 11 | }
 12 | 
 13 | function setValid(el) {
 14 |   el.classList.remove("is-invalid");
 15 | }
 16 | 
 17 | function validateMultipleElements(els, custom_fn) {
 18 |   const [valid, msg] = custom_fn(els);
 19 |   if (valid) {
 20 |     els.forEach((el) => setValid(el));
 21 |   } else {
 22 |     els.forEach((el) => setInvalid(el, msg));
 23 |   }
 24 | }
 25 | 
 26 | function validateSingleElement(el, custom_fn) {
 27 |   if (typeof custom_fn === "function") {
 28 |     const [valid, msg] = custom_fn(el);
 29 |     if (valid) {
 30 |       setValid(el);
 31 |     } else {
 32 |       setInvalid(el, msg);
 33 |     }
 34 |     return;
 35 |   }
 36 | 
 37 |   const data = el.dataset.v;
 38 |   const [type, valueRange] = data.split(":");
 39 | 
 40 |   let value = el.value;
 41 |   if (type === "i" || type === "f") {
 42 |     if (type === "i" && /[^\d]/.test(value)) {
 43 |       setInvalid(el, "Please enter an integer number");
 44 |       return;
 45 |     }
 46 |     value = type === "i" ? parseInt(value) : parseFloat(value);
 47 |     if (isNaN(value)) {
 48 |       setInvalid(el, `Please enter a number`);
 49 |       return;
 50 |     }
 51 |   } else if (type === "n") {
 52 |     return;
 53 |   }
 54 | 
 55 |   if (valueRange !== undefined) {
 56 |     const [min, max] = valueRange.split("-");
 57 |     if (value < min || value > max) {
 58 |       setInvalid(el, `Please enter a value between ${min} and ${max}`);
 59 |       return;
 60 |     }
 61 |   }
 62 | 
 63 |   setValid(el);
 64 | }
 65 | 
 66 | function validate(name, custom_fn) {
 67 |   if (Array.isArray(name)) {
 68 |     const els = name.map((n) => document.querySelector(`[name="${n}"]`));
 69 |     validateMultipleElements(els, custom_fn);
 70 |     for (const el of els) {
 71 |       el.addEventListener("input", () => {
 72 |         validateMultipleElements(els, custom_fn);
 73 |       });
 74 |     }
 75 |     return;
 76 |   }
 77 |   const el = document.querySelectorAll(`[name="${name}"]`);
 78 |   if (el.length === 0) {
 79 |     console.error(`Element with name ${name} not found`);
 80 |   } else if (el.length === 1) {
 81 |     validateSingleElement(el[0], custom_fn);
 82 |     el[0].addEventListener("input", () => {
 83 |       validateSingleElement(el[0], custom_fn);
 84 |     });
 85 |   }
 86 | }
 87 | 
 88 | function init() {
 89 |   const validatedNames = new Set();
 90 | 
 91 |   function customValidate(n, fn) {
 92 |     validate(n, fn);
 93 |     validatedNames.add(n);
 94 |   }
 95 | 
 96 |   customValidate("discretePopSize", (el) => {
 97 |     const popType = document.querySelector("input[name='_cellPop']:checked").value;
 98 |     if (popType === "continuous" || el.value.length === 0) return [true, null];
 99 |     const nClus = parseInt(document.querySelector("input[name='_numCluster']").value);
100 |     const nCell = parseInt(document.querySelector("input[name='num.cells']").value);
101 |     const values = el.value.split(",").map((v) => parseInt(v));
102 |     if (values.length !== nClus) return [false, "Number of clusters not matching"];
103 |     if (values.reduce((a, b) => a + b, 0) !== nCell) return [false, "Number of cells not matching"];
104 |     return [true, null];
105 |   });
106 | 
107 |   customValidate(["_regionDist0", "_regionDist1", "_regionDist2"], (el) => {
108 |     const distValues = [0, 1, 2].map((i) => parseFloat(document.querySelector(`input[name='_regionDist${i}']`).value));
109 |     const currValue = parseFloat(el.value);
110 |     if (currValue < 0 || currValue > 1) return [false, "Please enter a value between 0 and 1"];
111 |     if (distValues.reduce((a, b) => a + b, 0) !== 1) return [false, "Sum of all values should be 1"];
112 |     return [true, null];
113 |   });
114 | 
115 |   const allInputs = document.querySelectorAll("input");
116 |   for (const input of allInputs) {
117 |     if (typeof input.dataset.v === "string") {
118 |       if (input.dataset.v !== "c" && !validatedNames.has(input.name)) {
119 |         validate(input.name);
120 |         validatedNames.add(input.name);
121 |       }
122 |     } else {
123 |       if (!(input.type === "radio" || input.type === "checkbox")) {
124 |         console.warn("Validation: data attribute not found", input);
125 |       }
126 |     }
127 |   }
128 | }
129 | 
130 | document.addEventListener("DOMContentLoaded", init); // call init() when the DOMContentLoaded event fires
131 | 


--------------------------------------------------------------------------------
/man/GRN_params_100.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{GRN_params_100}
 5 | \alias{GRN_params_100}
 6 | \title{GRN_params_100 is a matrix of GRN params consisting of 100 genes where: column 1 is the target gene ID; column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene; column 3 is the effect of the column 2 gene ID on the column 1 gene ID}
 7 | \format{
 8 | a data frame.
 9 | }
10 | \usage{
11 | data(GRN_params_100)
12 | }
13 | \value{
14 | a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene.
15 | }
16 | \description{
17 | GRN_params_100 is a matrix of GRN params consisting of 100 genes where: column 1 is the target gene ID; column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene; column 3 is the effect of the column 2 gene ID on the column 1 gene ID
18 | }
19 | \examples{
20 | data(GRN_params_100)
21 | }
22 | \keyword{datasets}
23 | 


--------------------------------------------------------------------------------
/man/GRN_params_1139.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{GRN_params_1139}
 5 | \alias{GRN_params_1139}
 6 | \title{GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: column 1 is the target gene ID; column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene; column 3 is the effect of the column 2 gene ID on the column 1 gene ID}
 7 | \format{
 8 | a data frame.
 9 | }
10 | \usage{
11 | data(GRN_params_1139)
12 | }
13 | \value{
14 | a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene.
15 | }
16 | \description{
17 | GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: column 1 is the target gene ID; column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene; column 3 is the effect of the column 2 gene ID on the column 1 gene ID
18 | }
19 | \examples{
20 | data(GRN_params_1139)
21 | }
22 | \keyword{datasets}
23 | 


--------------------------------------------------------------------------------
/man/Get_1region_ATAC_correlation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{Get_1region_ATAC_correlation}
 4 | \alias{Get_1region_ATAC_correlation}
 5 | \title{This function gets the average correlation between RNA-seq counts and region effect on genes, for genes which are only associated with 1 chromatin region}
 6 | \usage{
 7 | Get_1region_ATAC_correlation(counts, atacseq_data, region2gene)
 8 | }
 9 | \arguments{
10 | \item{counts}{rna seq counts}
11 | 
12 | \item{atacseq_data}{atac seq data}
13 | 
14 | \item{region2gene}{a 0 1 coupling matrix between regions and genes of shape (nregions) x (num_genes), where a value of 1 indicates the gene is affected by a particular region}
15 | }
16 | \value{
17 | the correlation value
18 | }
19 | \description{
20 | This function gets the average correlation between RNA-seq counts and region effect on genes, for genes which are only associated with 1 chromatin region
21 | }
22 | \examples{
23 | \donttest{
24 | results <- sim_example(ncells = 10)
25 | Get_1region_ATAC_correlation(results$counts, results$atacseq_data, results$region_to_gene)
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/Get_ATAC_correlation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{Get_ATAC_correlation}
 4 | \alias{Get_ATAC_correlation}
 5 | \title{This function gets the average correlation between RNA-seq counts and chromatin region effect on genes}
 6 | \usage{
 7 | Get_ATAC_correlation(counts, atacseq_data, num_genes)
 8 | }
 9 | \arguments{
10 | \item{counts}{rna seq counts}
11 | 
12 | \item{atacseq_data}{atac seq data}
13 | 
14 | \item{num_genes}{number of genes}
15 | }
16 | \value{
17 | the correlation value
18 | }
19 | \description{
20 | This function gets the average correlation between RNA-seq counts and chromatin region effect on genes
21 | }
22 | \examples{
23 | results <- sim_example(ncells = 10)
24 | Get_ATAC_correlation(results$counts, results$atacseq_data, results$num_genes)
25 | }
26 | 


--------------------------------------------------------------------------------
/man/OP.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/0_opts.R
 3 | \name{OP}
 4 | \alias{OP}
 5 | \title{Get option from an object in the current environment}
 6 | \usage{
 7 | OP(..., .name = "options")
 8 | }
 9 | \arguments{
10 | \item{...}{the parameter name}
11 | 
12 | \item{.name}{get option from this object}
13 | }
14 | \value{
15 | the parameter value
16 | }
17 | \description{
18 | Get option from an object in the current environment
19 | }
20 | \keyword{internal}
21 | 


--------------------------------------------------------------------------------
/man/Phyla1.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{Phyla1}
 4 | \alias{Phyla1}
 5 | \title{Creating a linear example tree}
 6 | \usage{
 7 | Phyla1(len = 1)
 8 | }
 9 | \arguments{
10 | \item{len}{length of the tree}
11 | }
12 | \value{
13 | a R phylo object
14 | }
15 | \description{
16 | Creating a linear example tree
17 | }
18 | \examples{
19 | Phyla1(len = 1)
20 | }
21 | 


--------------------------------------------------------------------------------
/man/Phyla3.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{Phyla3}
 4 | \alias{Phyla3}
 5 | \title{Creating an example tree with 3 tips}
 6 | \usage{
 7 | Phyla3(plotting = FALSE)
 8 | }
 9 | \arguments{
10 | \item{plotting}{True for plotting the tree on console, False for no plot}
11 | }
12 | \value{
13 | a R phylo object
14 | }
15 | \description{
16 | Creating an example tree with 3 tips
17 | }
18 | \examples{
19 | Phyla3()
20 | }
21 | 


--------------------------------------------------------------------------------
/man/Phyla5.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{Phyla5}
 4 | \alias{Phyla5}
 5 | \title{Creating an example tree with 5 tips}
 6 | \usage{
 7 | Phyla5(plotting = FALSE)
 8 | }
 9 | \arguments{
10 | \item{plotting}{True for plotting the tree on console, False for no plot}
11 | }
12 | \value{
13 | a R phylo object
14 | }
15 | \description{
16 | Creating an example tree with 5 tips
17 | }
18 | \examples{
19 | Phyla5()
20 | }
21 | 


--------------------------------------------------------------------------------
/man/SampleDen.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{SampleDen}
 4 | \alias{SampleDen}
 5 | \title{sample from smoothed density function}
 6 | \usage{
 7 | SampleDen(nsample, den_fun, reduce.mem = FALSE)
 8 | }
 9 | \arguments{
10 | \item{nsample}{number of samples needed}
11 | 
12 | \item{den_fun}{density function estimated from density() from R default}
13 | 
14 | \item{reduce.mem}{use alternative implementation to reduce memory usage}
15 | }
16 | \value{
17 | a vector of samples
18 | }
19 | \description{
20 | sample from smoothed density function
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/True2ObservedATAC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{True2ObservedATAC}
 4 | \alias{True2ObservedATAC}
 5 | \title{Simulate observed ATAC-seq matrix given technical noise and the true counts}
 6 | \usage{
 7 | True2ObservedATAC(
 8 |   atacseq_data,
 9 |   randseed,
10 |   observation_prob = 0.3,
11 |   sd_frac = 0.1
12 | )
13 | }
14 | \arguments{
15 | \item{atacseq_data}{true ATAC-seq data}
16 | 
17 | \item{randseed}{(should produce same result if nregions, nevf and randseed are all the same)}
18 | 
19 | \item{observation_prob}{for each integer count of a particular region for a particular cell, the probability the count will be observed}
20 | 
21 | \item{sd_frac}{the fraction of ATAC-seq data value used as the standard deviation of added normally distributed noise}
22 | }
23 | \value{
24 | a matrix of observed ATAC-seq data
25 | }
26 | \description{
27 | Simulate observed ATAC-seq matrix given technical noise and the true counts
28 | }
29 | \examples{
30 | results <- sim_example(ncells = 10)
31 | True2ObservedATAC(results$atac_counts, randseed = 1)
32 | }
33 | 


--------------------------------------------------------------------------------
/man/True2ObservedCounts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{True2ObservedCounts}
 4 | \alias{True2ObservedCounts}
 5 | \title{Simulate observed count matrix given technical biases and the true counts}
 6 | \usage{
 7 | True2ObservedCounts(
 8 |   true_counts,
 9 |   meta_cell,
10 |   protocol,
11 |   randseed,
12 |   alpha_mean = 0.1,
13 |   alpha_sd = 0.002,
14 |   alpha_gene_mean = 1,
15 |   alpha_gene_sd = 0,
16 |   gene_len,
17 |   depth_mean,
18 |   depth_sd,
19 |   lenslope = 0.02,
20 |   nbins = 20,
21 |   amp_bias_limit = c(-0.2, 0.2),
22 |   rate_2PCR = 0.8,
23 |   nPCR1 = 16,
24 |   nPCR2 = 10,
25 |   LinearAmp = FALSE,
26 |   LinearAmp_coef = 2000
27 | )
28 | }
29 | \arguments{
30 | \item{true_counts}{gene cell matrix}
31 | 
32 | \item{meta_cell}{the meta information related to cells, will be combined with technical cell level information and returned}
33 | 
34 | \item{protocol}{a string, can be "nonUMI" or "UMI"}
35 | 
36 | \item{randseed}{(should produce same result if nregions, nevf and randseed are all the same)}
37 | 
38 | \item{alpha_mean}{the mean of rate of subsampling of transcripts during capture step, default at 10 percent efficiency}
39 | 
40 | \item{alpha_sd}{the std of rate of subsampling of transcripts}
41 | 
42 | \item{alpha_gene_mean}{the per-gene scale factor of the alpha parameter, default at 1}
43 | 
44 | \item{alpha_gene_sd}{the standard deviation of the per-gene scale factor of the alpha parameter, default at 0}
45 | 
46 | \item{gene_len}{a vector with lengths of all genes}
47 | 
48 | \item{depth_mean}{mean of sequencing depth}
49 | 
50 | \item{depth_sd}{std of sequencing depth}
51 | 
52 | \item{lenslope}{amount of length bias}
53 | 
54 | \item{nbins}{number of bins for gene length}
55 | 
56 | \item{amp_bias_limit}{range of amplification bias for each gene, a vector of length ngenes}
57 | 
58 | \item{rate_2PCR}{PCR efficiency, usually very high, default is 0.8}
59 | 
60 | \item{nPCR1}{the number of PCR cycles in "pre-amplification" step, default is 16}
61 | 
62 | \item{nPCR2}{the number of PCR cycles used after fragmentation.}
63 | 
64 | \item{LinearAmp}{if linear amplification is used for pre-amplification step, default is FALSE}
65 | 
66 | \item{LinearAmp_coef}{the coefficient of linear amplification, that is, how many times each molecule is amplified by}
67 | }
68 | \value{
69 | if UMI, a list with two elements, the first is the observed count matrix, the second is the metadata; if nonUMI, a matrix
70 | }
71 | \description{
72 | Simulate observed count matrix given technical biases and the true counts
73 | }
74 | \examples{
75 | \donttest{
76 | results <- sim_example(ncells = 10)
77 | data(gene_len_pool)
78 | gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE)
79 | True2ObservedCounts(
80 |   results$counts, results$cell_meta, protocol = "nonUMI", randseed = 1,
81 |   alpha_mean = 0.1, alpha_sd = 0.05, gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3
82 | )
83 | }
84 | }
85 | 


--------------------------------------------------------------------------------
/man/add_expr_noise.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{add_expr_noise}
 4 | \alias{add_expr_noise}
 5 | \title{Add experimental noise to true counts}
 6 | \usage{
 7 | add_expr_noise(results, ...)
 8 | }
 9 | \arguments{
10 | \item{results}{The scMultisim result object}
11 | 
12 | \item{...}{\code{randseed}: The random seed
13 | \code{protocol}: \code{UMI} or \code{nonUMI}
14 | \code{gene_len}:  A vector with lengths of all genes
15 | \code{alpha_mean}, \code{alpha_sd}: rate of subsampling of transcripts during capture step
16 | \code{depth_mean}, \code{depth_sd}: The sequencing depth}
17 | }
18 | \value{
19 | none
20 | }
21 | \description{
22 | Add experimental noise to true counts
23 | }
24 | \examples{
25 | results <- sim_example(ncells = 10)
26 | add_expr_noise(results)
27 | }
28 | \seealso{
29 | The underlying methods
30 | \link{True2ObservedCounts} and \link{True2ObservedATAC}
31 | }
32 | 


--------------------------------------------------------------------------------
/man/add_outliers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{add_outliers}
 4 | \alias{add_outliers}
 5 | \title{Add outliers to the observed counts}
 6 | \usage{
 7 | add_outliers(
 8 |   res,
 9 |   prob = 0.01,
10 |   factor = 2,
11 |   sd = 0.5,
12 |   cell.num = 1,
13 |   max.var = Inf
14 | )
15 | }
16 | \arguments{
17 | \item{res}{The scMultisim result object}
18 | 
19 | \item{prob}{The probability of adding outliers for each gene}
20 | 
21 | \item{factor}{The factor of the outliers}
22 | 
23 | \item{sd}{The standard deviation of the outliers}
24 | 
25 | \item{cell.num}{For a gene, the number of cells chosen to add outliers}
26 | 
27 | \item{max.var}{The maximum variance allowed}
28 | }
29 | \value{
30 | none
31 | }
32 | \description{
33 | Add outliers to the observed counts
34 | }
35 | 


--------------------------------------------------------------------------------
/man/cci_cell_type_params.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/3.1_spatial.R
 3 | \name{cci_cell_type_params}
 4 | \alias{cci_cell_type_params}
 5 | \title{Generate cell-type level CCI parameters}
 6 | \usage{
 7 | cci_cell_type_params(
 8 |   tree,
 9 |   total.lr,
10 |   ctype.lr = 4:6,
11 |   step.size = 1,
12 |   rand = TRUE,
13 |   discrete = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{tree}{Use the same value for \code{sim_true_counts()}.}
18 | 
19 | \item{total.lr}{Total number of LR pairs in the database. Use the same value for \code{sim_true_counts()}.}
20 | 
21 | \item{ctype.lr}{If \code{rand} is \code{TRUE}, how many LR pairs should be enabled between each cell type pair. Should be a range, e.g. 4:6.}
22 | 
23 | \item{step.size}{Use the same value for \code{sim_true_counts()}.}
24 | 
25 | \item{rand}{Whether to fill the matrix randomly}
26 | 
27 | \item{discrete}{Whether the cell population is discrete. Use the same value for \code{sim_true_counts()}.}
28 | }
29 | \value{
30 | A 3D matrix of (n_cell_type, n_cell_type, n_lr). The value at (i, j, k) is 1 if there exist CCI of LR-pair k between cell type i and cell type j.
31 | }
32 | \description{
33 | See the return value if you want to specify the cell-type level ground truth.
34 | }
35 | \examples{
36 | cci_cell_type_params(Phyla3(), 100, 4:6, 0.5, TRUE, FALSE)
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/man/dens_nonzero.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{dens_nonzero}
 5 | \alias{dens_nonzero}
 6 | \title{this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data}
 7 | \format{
 8 | a vector.
 9 | }
10 | \usage{
11 | data(dens_nonzero)
12 | }
13 | \value{
14 | a vector.
15 | }
16 | \description{
17 | this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data
18 | }
19 | \examples{
20 | data(dens_nonzero)
21 | }
22 | \keyword{datasets}
23 | 


--------------------------------------------------------------------------------
/man/divide_batches.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{divide_batches}
 4 | \alias{divide_batches}
 5 | \title{Divide batches for observed counts}
 6 | \usage{
 7 | divide_batches(results, nbatch = 2, effect = 3, randseed = 0)
 8 | }
 9 | \arguments{
10 | \item{results}{The scMultisim result object, after running \code{add_expr_noise()}}
11 | 
12 | \item{nbatch}{Number of batches}
13 | 
14 | \item{effect}{Batch effect size, default is 3}
15 | 
16 | \item{randseed}{Random seed}
17 | }
18 | \value{
19 | none
20 | }
21 | \description{
22 | Divide batches for observed counts
23 | }
24 | \examples{
25 | results <- sim_example(ncells = 10)
26 | add_expr_noise(results)
27 | divide_batches(results)
28 | }
29 | 


--------------------------------------------------------------------------------
/man/dot-amplifyOneCell.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{.amplifyOneCell}
 4 | \alias{.amplifyOneCell}
 5 | \title{This function simulates the amplification, library prep, and the sequencing processes.}
 6 | \usage{
 7 | .amplifyOneCell(
 8 |   true_counts_1cell,
 9 |   protocol,
10 |   rate_2cap,
11 |   gene_len,
12 |   amp_bias,
13 |   rate_2PCR,
14 |   nPCR1,
15 |   nPCR2,
16 |   LinearAmp,
17 |   LinearAmp_coef,
18 |   N_molecules_SEQ
19 | )
20 | }
21 | \arguments{
22 | \item{true_counts_1cell}{the true transcript counts for one cell (one vector)}
23 | 
24 | \item{protocol}{a string, can be "nonUMI" or "UMI"}
25 | 
26 | \item{rate_2cap}{the capture efficiency for this cell}
27 | 
28 | \item{gene_len}{gene lengths for the genes/transcripts, sampled from real human transcript length}
29 | 
30 | \item{amp_bias}{amplification bias for each gene, a vector of length ngenes}
31 | 
32 | \item{rate_2PCR}{PCR efficiency, usually very high}
33 | 
34 | \item{nPCR1}{the number of PCR cycles}
35 | 
36 | \item{nPCR2}{the number of PCR cycles}
37 | 
38 | \item{LinearAmp}{if linear amplification is used for pre-amplification step, default is FALSE}
39 | 
40 | \item{LinearAmp_coef}{the coefficient of linear amplification, that is, how many times each molecule is amplified by}
41 | 
42 | \item{N_molecules_SEQ}{number of molecules sent for sequencing; sequencing depth}
43 | }
44 | \value{
45 | read counts (if protocol="nonUMI") or UMI counts (if protocol="UMI")
46 | }
47 | \description{
48 | This function simulates the amplification, library prep, and the sequencing processes.
49 | }
50 | \keyword{internal}
51 | 


--------------------------------------------------------------------------------
/man/dot-calAmpBias.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{.calAmpBias}
 4 | \alias{.calAmpBias}
 5 | \title{Simulate technical biases}
 6 | \usage{
 7 | .calAmpBias(lenslope, nbins, gene_len, amp_bias_limit)
 8 | }
 9 | \arguments{
10 | \item{lenslope}{amount of length bias. This value should be less than 2*amp_bias_limit[2]/(nbins-1)}
11 | 
12 | \item{nbins}{number of bins for gene length}
13 | 
14 | \item{gene_len}{transcript length of each gene}
15 | 
16 | \item{amp_bias_limit}{range of amplification bias for each gene, a vector of length ngenes}
17 | }
18 | \value{
19 | a vector
20 | }
21 | \description{
22 | Simulate technical biases
23 | }
24 | \keyword{internal}
25 | 


--------------------------------------------------------------------------------
/man/dot-continuousCIF.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/1_main.R
 3 | \name{.continuousCIF}
 4 | \alias{.continuousCIF}
 5 | \title{Generates cifs for cells sampled along the trajectory of cell development}
 6 | \usage{
 7 | .continuousCIF(
 8 |   seed,
 9 |   N,
10 |   options,
11 |   ncell_key = "cell",
12 |   is_spatial = FALSE,
13 |   spatial_params = NULL,
14 |   .plot = FALSE,
15 |   .plot.name = "cont_cif.pdf"
16 | )
17 | }
18 | \arguments{
19 | \item{seed}{random seed}
20 | 
21 | \item{N}{the number list}
22 | 
23 | \item{options}{the option list}
24 | 
25 | \item{ncell_key}{the key for the number of cells in N}
26 | 
27 | \item{is_spatial}{return a list of cifs for spatial}
28 | 
29 | \item{spatial_params}{the spatial parameters}
30 | 
31 | \item{.plot}{save the CIF plot}
32 | 
33 | \item{.plot.name}{plot name}
34 | }
35 | \value{
36 | a list containing the cif and meta data
37 | }
38 | \description{
39 | Generates cifs for cells sampled along the trajectory of cell development
40 | }
41 | \keyword{internal}
42 | 


--------------------------------------------------------------------------------
/man/dot-divideBatchesImpl.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{.divideBatchesImpl}
 4 | \alias{.divideBatchesImpl}
 5 | \title{Divide the observed counts into multiple batches by adding batch effect to each batch}
 6 | \usage{
 7 | .divideBatchesImpl(
 8 |   counts,
 9 |   meta_cell,
10 |   nbatch,
11 |   batch_effect_size = 1,
12 |   randseed = 0
13 | )
14 | }
15 | \arguments{
16 | \item{counts}{gene cell matrix}
17 | 
18 | \item{meta_cell}{the meta information related to cells, will be combined with technical cell level information and returned}
19 | 
20 | \item{nbatch}{number of batches}
21 | 
22 | \item{batch_effect_size}{amount of batch effects. Larger values result in bigger differences between batches. Default is 1.}
23 | 
24 | \item{randseed}{random seed}
25 | }
26 | \value{
27 | a list with two elements: counts and meta_cell
28 | }
29 | \description{
30 | Divide the observed counts into multiple batches by adding batch effect to each batch
31 | }
32 | \keyword{internal}
33 | 


--------------------------------------------------------------------------------
/man/dot-expandToBinary.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{.expandToBinary}
 4 | \alias{.expandToBinary}
 5 | \title{expand transcript counts to a vector of binaries of the same length as the number of transcripts}
 6 | \usage{
 7 | .expandToBinary(true_counts_1cell)
 8 | }
 9 | \arguments{
10 | \item{true_counts_1cell}{number of transcript in one cell}
11 | }
12 | \value{
13 | a list of two vectors, the first vector is a vector of 1s, the second vector is the index of transcripts
14 | }
15 | \description{
16 | expand transcript counts to a vector of binaries of the same length as the number of transcripts
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/man/dot-getCountCorrMatrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{.getCountCorrMatrix}
 4 | \alias{.getCountCorrMatrix}
 5 | \title{This function finds the correlation between every pair of genes}
 6 | \usage{
 7 | .getCountCorrMatrix(counts)
 8 | }
 9 | \arguments{
10 | \item{counts}{rna seq counts}
11 | }
12 | \value{
13 | the correlation matrix
14 | }
15 | \description{
16 | This function finds the correlation between every pair of genes
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/man/dot-getParams.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/2_sim.R
 3 | \name{.getParams}
 4 | \alias{.getParams}
 5 | \title{Get Kinetic Parameters for all cells and genes}
 6 | \usage{
 7 | .getParams(seed, sim, sp_cell_i = NULL, sp_path_i = NULL)
 8 | }
 9 | \arguments{
10 | \item{seed}{random seed}
11 | 
12 | \item{sim}{the simulation environment}
13 | 
14 | \item{sp_cell_i}{spatial cell index}
15 | 
16 | \item{sp_path_i}{the pre-sampled path along the tree for this cell}
17 | }
18 | \value{
19 | the kinetic parameters
20 | }
21 | \description{
22 | Get Kinetic Parameters for all cells and genes
23 | }
24 | \keyword{internal}
25 | 


--------------------------------------------------------------------------------
/man/dot-normalizeGRNParams.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/1_main.R
 3 | \name{.normalizeGRNParams}
 4 | \alias{.normalizeGRNParams}
 5 | \title{Rename the original gene IDs in the GRN table to integers.}
 6 | \usage{
 7 | .normalizeGRNParams(params)
 8 | }
 9 | \arguments{
10 | \item{params}{GRN parameters.}
11 | }
12 | \value{
13 | list
14 | }
15 | \description{
16 | Rename the original gene IDs in the GRN table to integers.
17 | }
18 | \keyword{internal}
19 | 


--------------------------------------------------------------------------------
/man/dot-rnormTrunc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/6_technoise.R
 3 | \name{.rnormTrunc}
 4 | \alias{.rnormTrunc}
 5 | \title{sample from truncated normal distribution}
 6 | \usage{
 7 | .rnormTrunc(n, mean, sd, a, b)
 8 | }
 9 | \arguments{
10 | \item{n}{number of values to create}
11 | 
12 | \item{mean}{mean of the normal distribution}
13 | 
14 | \item{sd}{standard deviation of the normal distribution}
15 | 
16 | \item{a}{the minimum value allowed}
17 | 
18 | \item{b}{the maximum value allowed}
19 | }
20 | \value{
21 | a vector of length n
22 | }
23 | \description{
24 | sample from truncated normal distribution
25 | }
26 | \keyword{internal}
27 | 


--------------------------------------------------------------------------------
/man/gen_1branch.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/2_sim.R
 3 | \name{gen_1branch}
 4 | \alias{gen_1branch}
 5 | \title{Generate true transcript counts for linear structure}
 6 | \usage{
 7 | gen_1branch(
 8 |   kinet_params,
 9 |   start_state,
10 |   start_s,
11 |   start_u,
12 |   randpoints1,
13 |   ncells1,
14 |   ngenes,
15 |   beta_vec,
16 |   d_vec,
17 |   cycle_length_factor,
18 |   cell
19 | )
20 | }
21 | \arguments{
22 | \item{kinet_params}{kinetic parameters, include k_on, k_off, s and beta}
23 | 
24 | \item{start_state}{the starting state: on or off of each gene}
25 | 
26 | \item{start_s}{spliced count of the root cell in the branch}
27 | 
28 | \item{start_u}{unspliced count of the root cell in the branch}
29 | 
30 | \item{randpoints1}{the value which evf mean is generated from}
31 | 
32 | \item{ncells1}{number of cells in the branch}
33 | 
34 | \item{ngenes}{number of genes}
35 | 
36 | \item{beta_vec}{splicing rate of each gene}
37 | 
38 | \item{d_vec}{degradation rate of each gene}
39 | 
40 | \item{cycle_length_factor}{for generating velocity data, a factor which is multiplied by the expected time to transition from kon to koff and back to form the length of a cycle}
41 | 
42 | \item{cell}{the cell number currently having counts generated}
43 | }
44 | \value{
45 | a list of 4 elements, the first element is true counts, second is the gene level meta information, the third is cell level meta information, including a matrix of evf and a vector of cell identity, and the fourth is the parameters kon, koff and s used to simulate the true counts
46 | }
47 | \description{
48 | Generate true transcript counts for linear structure
49 | }
50 | \keyword{internal}
51 | 


--------------------------------------------------------------------------------
/man/gen_clutter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/3.1_spatial.R
 3 | \name{gen_clutter}
 4 | \alias{gen_clutter}
 5 | \title{generate a clutter of cells by growing from the center}
 6 | \usage{
 7 | gen_clutter(
 8 |   n_cell,
 9 |   grid_size = NA,
10 |   center = c(0, 0),
11 |   existing_loc = NULL,
12 |   existing_grid = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{n_cell}{the number of cells}
17 | 
18 | \item{grid_size}{the width and height of the grid}
19 | 
20 | \item{center}{the center of the grid}
21 | 
22 | \item{existing_loc}{only place cells on the specified existing locations}
23 | 
24 | \item{existing_grid}{manually specify what locations are in the grid}
25 | }
26 | \value{
27 | a matrix of locations
28 | }
29 | \description{
30 | generate a clutter of cells by growing from the center
31 | }
32 | \examples{
33 | gen_clutter(10, 10, c(5, 5))
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/gene_corr_cci.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{gene_corr_cci}
 4 | \alias{gene_corr_cci}
 5 | \title{Plot the ligand-receptor correlation summary}
 6 | \usage{
 7 | gene_corr_cci(
 8 |   results = .getResultsFromGlobal(),
 9 |   all.genes = FALSE,
10 |   .pair = NULL,
11 |   .exclude.same.types = TRUE
12 | )
13 | }
14 | \arguments{
15 | \item{results}{The scMultisim result object}
16 | 
17 | \item{all.genes}{Whether to use all genes or only the ligand/receptor genes}
18 | 
19 | \item{.pair}{Return the raw data for the given LR pair}
20 | 
21 | \item{.exclude.same.types}{Whether to exclude neighbor cells with same cell type}
22 | }
23 | \value{
24 | none
25 | }
26 | \description{
27 | Plot the ligand-receptor correlation summary
28 | }
29 | \examples{
30 | results <- sim_example_spatial(ncells = 10)
31 | gene_corr_cci(results)
32 | }
33 | 


--------------------------------------------------------------------------------
/man/gene_corr_regulator.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{gene_corr_regulator}
 4 | \alias{gene_corr_regulator}
 5 | \title{Print the correlations between targets of each regulator}
 6 | \usage{
 7 | gene_corr_regulator(results = .getResultsFromGlobal(), regulator)
 8 | }
 9 | \arguments{
10 | \item{results}{The scMultisim result object}
11 | 
12 | \item{regulator}{The regulator ID in the GRN params}
13 | }
14 | \value{
15 | none
16 | }
17 | \description{
18 | Print the correlations between targets of each regulator
19 | }
20 | \examples{
21 | results <- sim_example(ncells = 10)
22 | gene_corr_regulator(results, 2)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/gene_len_pool.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{gene_len_pool}
 5 | \alias{gene_len_pool}
 6 | \title{a pool of gene lengths to sample from}
 7 | \format{
 8 | a vector.
 9 | }
10 | \usage{
11 | data(gene_len_pool)
12 | }
13 | \value{
14 | a vector of gene lengths.
15 | }
16 | \description{
17 | a pool of gene lengths to sample from
18 | }
19 | \examples{
20 | data(gene_len_pool)
21 | }
22 | \keyword{datasets}
23 | \keyword{internal}
24 | 


--------------------------------------------------------------------------------
/man/len2nfrag.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{len2nfrag}
 5 | \alias{len2nfrag}
 6 | \title{from transcript length to number of fragments (for the nonUMI protocol)}
 7 | \format{
 8 | a vector.
 9 | }
10 | \usage{
11 | data(len2nfrag)
12 | }
13 | \value{
14 | a vector.
15 | }
16 | \description{
17 | from transcript length to number of fragments (for the nonUMI protocol)
18 | }
19 | \examples{
20 | data(len2nfrag)
21 | }
22 | \keyword{datasets}
23 | \keyword{internal}
24 | 


--------------------------------------------------------------------------------
/man/match_params.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{match_params}
 5 | \alias{match_params}
 6 | \title{distribution of kinetic parameters learned from the Zeisel UMI cortex datasets}
 7 | \format{
 8 | a data frame.
 9 | }
10 | \usage{
11 | data(param_realdata.zeisel.imputed)
12 | }
13 | \value{
14 | a data frame.
15 | }
16 | \description{
17 | distribution of kinetic parameters learned from the Zeisel UMI cortex datasets
18 | }
19 | \examples{
20 | data(param_realdata.zeisel.imputed)
21 | }
22 | \keyword{datasets}
23 | \keyword{internal}
24 | 


--------------------------------------------------------------------------------
/man/plot_cell_loc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_cell_loc}
 4 | \alias{plot_cell_loc}
 5 | \title{Plot cell locations}
 6 | \usage{
 7 | plot_cell_loc(
 8 |   results = .getResultsFromGlobal(),
 9 |   size = 4,
10 |   show.label = FALSE,
11 |   show.arrows = TRUE,
12 |   lr.pair = 1,
13 |   .cell.pop = NULL,
14 |   .locs = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{results}{The scMultisim result object}
19 | 
20 | \item{size}{Fig size}
21 | 
22 | \item{show.label}{Show cell numbers}
23 | 
24 | \item{show.arrows}{Show arrows representing cell-cell interactions}
25 | 
26 | \item{lr.pair}{The ligand-receptor pair used to plot CCI arrows
27 | \code{results$cci_cell_type_param[lr.pair]}}
28 | 
29 | \item{.cell.pop}{Specify the cell population metadata}
30 | 
31 | \item{.locs}{Manually specify the cell locations as a 2x\code{ncells} matrix}
32 | }
33 | \value{
34 | none
35 | }
36 | \description{
37 | Plot cell locations
38 | }
39 | \examples{
40 | results <- sim_example_spatial(ncells = 10)
41 | plot_cell_loc(results)
42 | }
43 | 


--------------------------------------------------------------------------------
/man/plot_gene_module_cor_heatmap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_gene_module_cor_heatmap}
 4 | \alias{plot_gene_module_cor_heatmap}
 5 | \title{Plot the gene module correlation heatmap}
 6 | \usage{
 7 | plot_gene_module_cor_heatmap(
 8 |   results = .getResultsFromGlobal(),
 9 |   seed = 0,
10 |   grn.genes.only = TRUE,
11 |   save = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{results}{The scMultisim result object}
16 | 
17 | \item{seed}{The random seed}
18 | 
19 | \item{grn.genes.only}{Plot the GRN genes only}
20 | 
21 | \item{save}{save the plot as pdf}
22 | }
23 | \value{
24 | none
25 | }
26 | \description{
27 | Plot the gene module correlation heatmap
28 | }
29 | \examples{
30 | results <- sim_example(ncells = 10)
31 | plot_gene_module_cor_heatmap(results)
32 | }
33 | 


--------------------------------------------------------------------------------
/man/plot_grid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_grid}
 4 | \alias{plot_grid}
 5 | \title{Plot the CCI grid}
 6 | \usage{
 7 | plot_grid(results = .getResultsFromGlobal())
 8 | }
 9 | \arguments{
10 | \item{results}{The scMultisim result object}
11 | }
12 | \value{
13 | none
14 | }
15 | \description{
16 | In normal cases, please use \code{plot_cell_loc} instead.
17 | }
18 | \examples{
19 | results <- sim_example_spatial(ncells = 10)
20 | plot_grid(results)
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plot_grn.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_grn}
 4 | \alias{plot_grn}
 5 | \title{Plot the GRN network}
 6 | \usage{
 7 | plot_grn(params)
 8 | }
 9 | \arguments{
10 | \item{params}{The GRN params data frame}
11 | }
12 | \value{
13 | none
14 | }
15 | \description{
16 | Plot the GRN network
17 | }
18 | \examples{
19 | data(GRN_params_100, envir = environment())
20 | plot_grn(GRN_params_100)
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plot_phyla.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_phyla}
 4 | \alias{plot_phyla}
 5 | \title{Plot an R phylogenetic tree}
 6 | \usage{
 7 | plot_phyla(tree)
 8 | }
 9 | \arguments{
10 | \item{tree}{The tree}
11 | }
12 | \value{
13 | none
14 | }
15 | \description{
16 | Plot an R phylogenetic tree
17 | }
18 | \examples{
19 | plot_phyla(Phyla5())
20 | }
21 | 


--------------------------------------------------------------------------------
/man/plot_rna_velocity.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_rna_velocity}
 4 | \alias{plot_rna_velocity}
 5 | \title{Plot RNA velocity as arrows on tSNE plot}
 6 | \usage{
 7 | plot_rna_velocity(
 8 |   results = .getResultsFromGlobal(),
 9 |   velocity = results$velocity,
10 |   perplexity = 70,
11 |   arrow.length = 1,
12 |   save = FALSE,
13 |   randseed = 0,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{results}{The scMultiSim result object}
19 | 
20 | \item{velocity}{The velocity matrix, by default using the velocity matrix in the result object}
21 | 
22 | \item{perplexity}{The perplexity for tSNE}
23 | 
24 | \item{arrow.length}{The length scaler of the arrow}
25 | 
26 | \item{save}{Whether to save the plot}
27 | 
28 | \item{randseed}{The random seed}
29 | 
30 | \item{...}{Other parameters passed to ggplot}
31 | }
32 | \value{
33 | The plot
34 | }
35 | \description{
36 | Plot RNA velocity as arrows on tSNE plot
37 | }
38 | \examples{
39 | results <- sim_example(ncells = 10, velocity = TRUE)
40 | plot_rna_velocity(results, perplexity = 3)
41 | }
42 | 


--------------------------------------------------------------------------------
/man/plot_tsne.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/7_benchmark.R
 3 | \name{plot_tsne}
 4 | \alias{plot_tsne}
 5 | \title{Plot t-SNE visualization of a data matrix}
 6 | \usage{
 7 | plot_tsne(
 8 |   data,
 9 |   labels,
10 |   perplexity = 60,
11 |   legend = "",
12 |   plot.name = "",
13 |   save = FALSE,
14 |   rand.seed = 0,
15 |   continuous = FALSE,
16 |   labels2 = NULL,
17 |   lim = NULL,
18 |   runPCA = FALSE,
19 |   alpha = 1
20 | )
21 | }
22 | \arguments{
23 | \item{data}{The \code{d}x\code{n} matrix}
24 | 
25 | \item{labels}{A vector of length \code{n}, usually cell clusters}
26 | 
27 | \item{perplexity}{Perplexity value used for t-SNE}
28 | 
29 | \item{legend}{A list of colors for the labels}
30 | 
31 | \item{plot.name}{The plot title}
32 | 
33 | \item{save}{If \code{TRUE}, save as \code{plot.name}.pdf}
34 | 
35 | \item{rand.seed}{The random seed}
36 | 
37 | \item{continuous}{Whether \code{labels} should be treated as continuous, e.g. pseudotime}
38 | 
39 | \item{labels2}{Additional label}
40 | 
41 | \item{lim}{Specify the x and y limits as \code{c(x_min, x_max, y_min, y_max)}}
42 | 
43 | \item{runPCA}{Whether to run PCA before t-SNE}
44 | 
45 | \item{alpha}{The alpha value for the points}
46 | }
47 | \value{
48 | the figure if not \code{save}, otherwise save the figure as \code{plot.name}.pdf
49 | }
50 | \description{
51 | Plot t-SNE visualization of a data matrix
52 | }
53 | \examples{
54 | results <- sim_example(ncells = 10)
55 | plot_tsne(log2(results$counts + 1), results$cell_meta$pop, perplexity = 3)
56 | }
57 | 


--------------------------------------------------------------------------------
/man/run_shiny.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/9.1_shiny.R
 3 | \name{run_shiny}
 4 | \alias{run_shiny}
 5 | \title{Launch the Shiny App to configure the simulation}
 6 | \usage{
 7 | run_shiny()
 8 | }
 9 | \description{
10 | Launch the Shiny App to configure the simulation
11 | }
12 | 


--------------------------------------------------------------------------------
/man/scmultisim_help.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/9_meta.R
 3 | \name{scmultisim_help}
 4 | \alias{scmultisim_help}
 5 | \title{Show detailed documentations of scMultiSim's parameters}
 6 | \usage{
 7 | scmultisim_help(topic = NULL)
 8 | }
 9 | \arguments{
10 | \item{topic}{Can be \code{options}, \code{dynamic.GRN}, or \code{cci}}
11 | }
12 | \value{
13 | none
14 | }
15 | \description{
16 | Show detailed documentations of scMultiSim's parameters
17 | }
18 | \examples{
19 | scmultisim_help()
20 | }
21 | 


--------------------------------------------------------------------------------
/man/sim_example.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{sim_example}
 4 | \alias{sim_example}
 5 | \title{Simulate a small example dataset (10 cells by default) with the 100-gene GRN}
 6 | \usage{
 7 | sim_example(ncells = 10, velocity = FALSE)
 8 | }
 9 | \arguments{
10 | \item{ncells}{number of cells, please increase this number on your machine}
11 | 
12 | \item{velocity}{whether to simulate RNA velocity}
13 | }
14 | \value{
15 | the simulation result
16 | }
17 | \description{
18 | Simulate a small example dataset (10 cells by default) with the 100-gene GRN
19 | }
20 | \examples{
21 | sim_example(ncells = 10)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/sim_example_spatial.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/8_utils.R
 3 | \name{sim_example_spatial}
 4 | \alias{sim_example_spatial}
 5 | \title{Simulate a small example dataset (10 cells by default) with the 100-gene GRN, with CCI enabled}
 6 | \usage{
 7 | sim_example_spatial(ncells = 10)
 8 | }
 9 | \arguments{
10 | \item{ncells}{number of cells, please increase this number on your machine}
11 | }
12 | \value{
13 | the simulation result
14 | }
15 | \description{
16 | Simulate a small example dataset (10 cells by default) with the 100-gene GRN, with CCI enabled
17 | }
18 | \examples{
19 | sim_example_spatial(ncells = 10)
20 | }
21 | 


--------------------------------------------------------------------------------
/man/sim_true_counts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/1_main.R
 3 | \name{sim_true_counts}
 4 | \alias{sim_true_counts}
 5 | \title{Simulate true scRNA and scATAC counts from the parameters}
 6 | \usage{
 7 | sim_true_counts(options, return_summarized_exp = FALSE)
 8 | }
 9 | \arguments{
10 | \item{options}{See \code{scmultisim_help()}.}
11 | 
12 | \item{return_summarized_exp}{Whether to return a SummarizedExperiment object.}
13 | }
14 | \value{
15 | scMultiSim returns an environment with the following fields:
16 | \itemize{
17 | \item \code{counts}: Gene-by-cell scRNA-seq counts.
18 | \item \code{atac_counts}: Region-by-cell scATAC-seq counts.
19 | \item \code{region_to_gene}: Region-by-gene 0-1 matrix indicating the corresponding relationship between chromatin regions and genes.
20 | \item \code{atacseq_data}: The "clean" scATAC-seq counts without added intrinsic noise.
21 | \item \code{cell_meta}: A dataframe containing cell type labels and pseudotime information.
22 | \item \code{cif}: The CIF used during the simulation.
23 | \item \code{giv}: The GIV used during the simulation.
24 | \item \code{kinetic_params}: The kinetic parameters used during the simulation.
25 | \item \code{.grn}: The GRN used during the simulation.
26 | \item \code{.grn$regulators}: The list of TFs used by all gene-by-TF matrices.
27 | \item \code{.grn$geff}: Gene-by-TF matrix representing the GRN used during the simulation.
28 | \item \code{.n}: Other metadata, e.g. \code{.n$cells} is the number of cells.
29 | }
30 | 
31 | If \code{do.velocity} is enabled, it has these additional fields:
32 | \itemize{
33 | \item \code{unspliced_counts}: Gene-by-cell unspliced RNA counts.
34 | \item \code{velocity}: Gene-by-cell RNA velocity ground truth.
35 | \item \code{cell_time}: The pseudotime at which the cell counts were generated.
36 | }
37 | 
38 | If dynamic GRN is enabled, it has these additional fields:
39 | \itemize{
40 | \item \code{cell_specific_grn}: A list of length \code{n_cells}. Each element is a gene-by-TF matrix, indicating the cell's GRN.
41 | }
42 | 
43 | If cell-cell interaction is enabled, it has these additional fields:
44 | \itemize{
45 | \item \code{grid}: The grid object used during the simulation.
46 | \itemize{
47 | \item \code{grid$get_neighbours(i)}: Get the neighbour cells of cell \code{i}.
48 | }
49 | \item \code{cci_locs}: A dataframe containing the X and Y coordinates of each cell.
50 | \item \code{cci_cell_type_param}: A dataframe containing the CCI network ground truth: all ligand-receptor pairs between each pair of cell types.
51 | \item \code{cci_cell_types}: For continuous cell population, the sub-divided cell types along the trajectory used when simulating CCI.
52 | }
53 | 
54 | If it is a debug session (\code{debug = TRUE}), a \code{sim} field is available,
55 | which is an environment containing all internal states and data structures.
56 | }
57 | \description{
58 | Simulate true scRNA and scATAC counts from the parameters
59 | }
60 | \examples{
61 | data(GRN_params_100, envir = environment())
62 | sim_true_counts(list(
63 |   rand.seed = 0,
64 |   GRN = GRN_params_100,
65 |   num.cells = 100,
66 |   num.cifs = 50,
67 |   tree = Phyla5()
68 | ))
69 | }
70 | 


--------------------------------------------------------------------------------
/man/spatialGrid-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/3.1_spatial.R
 3 | \docType{class}
 4 | \name{spatialGrid-class}
 5 | \alias{spatialGrid-class}
 6 | \alias{.SpatialGrid}
 7 | \title{The class for spatial grids}
 8 | \value{
 9 | a spatialGrid object
10 | }
11 | \description{
12 | The class for spatial grids
13 | }
14 | \section{Fields}{
15 | 
16 | \describe{
17 | \item{\code{method}}{the method to generate the cell layout}
18 | 
19 | \item{\code{grid_size}}{the width and height of the grid}
20 | 
21 | \item{\code{ncells}}{the number of cells}
22 | 
23 | \item{\code{grid}}{the grid matrix}
24 | 
25 | \item{\code{locs}}{a list containing the locations of all cells}
26 | 
27 | \item{\code{loc_order}}{deprecated, don't use; the order of the locations}
28 | 
29 | \item{\code{cell_types}}{a map to save the cell type of each allocated cell}
30 | 
31 | \item{\code{same_type_prob}}{the probability of a new cell placed next to a cell with the same type}
32 | 
33 | \item{\code{max_nbs}}{the maximum number of neighbors for each cell}
34 | 
35 | \item{\code{nb_map}}{a list containing the neighbors for each cell}
36 | 
37 | \item{\code{nb_adj}}{adjacency matrix for neighbors}
38 | 
39 | \item{\code{nb_radius}}{the radius of neighbors}
40 | 
41 | \item{\code{final_types}}{the final cell types after the final time step}
42 | 
43 | \item{\code{pre_allocated_pos}}{the pre-allocated positions for each cell, if any}
44 | 
45 | \item{\code{method_param}}{additional parameters for the layout method}
46 | }}
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/pkgdown/extra.css:
--------------------------------------------------------------------------------
 1 | h2 {
 2 |     margin-top: 2rem;
 3 |     margin-bottom: 1.5rem;
 4 | }
 5 | 
 6 | h3 {
 7 |     margin-top: 1.5rem;
 8 |     margin-bottom: 1rem;
 9 | }
10 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(scMultiSim)
11 | 
12 | test_check("scMultiSim")
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test-1_main.R:
--------------------------------------------------------------------------------
  1 | # test_that("simulates data using Kinetic model", {
  2 | #   data(GRN_params_100, envir = environment())
  3 | #
  4 | #   set.seed(0)
  5 | #   options_ <- list(
  6 | #     GRN = GRN_params_100,
  7 | #     num.cells = 100,
  8 | #     num.cifs = 20,
  9 | #     tree = Phyla5(),
 10 | #     diff.cif.fraction = 0.8,
 11 | #     do.velocity = T
 12 | #   )
 13 | #
 14 | #   res <- sim_true_counts(options_)
 15 | #
 16 | #   selectedIndicies <- c(1:5, 1000:1005, 10000:10005)
 17 | #   expect_equal(dim(res$counts), c(110, 100))
 18 | #   expect_equal(
 19 | #     res$counts[selectedIndicies],
 20 | #     c(34, 5, 28, 21, 49, 0, 0, 10, 4, 90, 18, 18, 0, 12, 0, 37, 171)
 21 | #   )
 22 | #
 23 | #   add_expr_noise(res)
 24 | #   divide_batches(res, nbatch = 2)
 25 | #
 26 | #   expect_equal(
 27 | #     res$counts_obs[selectedIndicies],
 28 | #     c(585, 307, 141, 187, 309, 0, 0, 326, 0, 2692, 0, 401, 0, 22, 0, 187, 2291)
 29 | #   )
 30 | #   expect_equal(
 31 | #     res$counts_with_batches[selectedIndicies],
 32 | #     c(2331, 5031, 180, 1263, 131, 0, 0, 93, 0, 21462, 0, 467, 0, 357, 0,
 33 | #       1020, 1495)
 34 | #   )
 35 | #
 36 | #   expect_equal(
 37 | #     res$atac_counts[selectedIndicies],
 38 | #     c(0.0000000, 0.0000000, 0.6887022, 0.7694152, 0.0000000, 2.5172858,
 39 | #       0.3549220, 5.6001554, 3.5924741, 0.2475742, 0.0000000, 1.0374884,
 40 | #       3.4453020, 4.0073263, 0.0000000, 0.0000000, 0.0000000)
 41 | #   )
 42 | #
 43 | #   expect_equal(
 44 | #     res$velocity[selectedIndicies],
 45 | #     c(4.3146894, 0.4199234, -2.1558627, 2.0012763, -23.2763955, 0.0000000,
 46 | #       0.0000000, -2.0025398, 1.2857669, 2.8214759, 2.5935024, 1.3315967,
 47 | #       0.0000000, -2.5724074, 0.2749523, 1.9946433, -2.0456734)
 48 | #   )
 49 | #
 50 | #   expect_no_error(plot_gene_module_cor_heatmap(res, save = FALSE))
 51 | #   expect_no_error(gene_corr_regulator(res, 2))
 52 | #   expect_no_error(plot_rna_velocity(res, perplexity = 20))
 53 | # })
 54 | #
 55 | #
 56 | # test_that("simulates data using Beta-Poisson model", {
 57 | #   data(GRN_params_100, envir = environment())
 58 | #
 59 | #   set.seed(0)
 60 | #   options_ <- list(
 61 | #     GRN = GRN_params_100,
 62 | #     num.cells = 100,
 63 | #     num.cifs = 20,
 64 | #     tree = Phyla5(),
 65 | #     diff.cif.fraction = 0.8,
 66 | #     do.velocity = F
 67 | #   )
 68 | #
 69 | #   res <- sim_true_counts(options_)
 70 | #
 71 | #   selectedIndicies <- c(1:5, 101:105, 10001:10005)
 72 | #   expect_equal(dim(res$counts), c(110, 100))
 73 | #   expect_equal(
 74 | #     res$counts[selectedIndicies],
 75 | #     c(120, 5, 18, 33, 88, 5, 4, 0, 8, 96, 0, 18, 0, 15, 146)
 76 | #   )
 77 | # })
 78 | #
 79 | #
 80 | # test_that("simulates spatial data", {
 81 | #   data(GRN_params_100, envir = environment())
 82 | #
 83 | #   lig_params <- data.frame(
 84 | #     target    = c(101, 102),
 85 | #     regulator = c(103, 104),
 86 | #     effect    = c(5.2, 5.9)
 87 | #   )
 88 | #
 89 | #   options_ <- list2(
 90 | #     GRN = GRN_params_100,
 91 | #     num.genes = 200,
 92 | #     num.cells = 100,
 93 | #     num.cifs = 20,
 94 | #     tree = Phyla3(),
 95 | #     intrinsic.noise = 0.5,
 96 | #     cci = list(
 97 | #       params = lig_params,
 98 | #       max.neighbors = 4,
 99 | #       cell.type.interaction = "random",
100 | #       step.size = 0.5
101 | #     )
102 | #   )
103 | #
104 | #   set.seed(0)
105 | #   res <- sim_true_counts(options_)
106 | #
107 | #   selectedIndicies <- c(1:5, 1000:1005, 10000:10005)
108 | #   expect_equal(
109 | #     res$counts[selectedIndicies],
110 | #     c(40.675564, 30.876988, 29.984167, 49.430348, 25.113605, 4.093944,
111 | #       45.194247, 29.063519, 47.389263, 42.516067, 43.014273, 7.110385,
112 | #       55.992341, 13.604489, 14.811897, 10.213004, 24.046141)
113 | #   )
114 | #
115 | #   expect_no_error(plot_cell_loc(res))
116 | #   expect_no_error(gene_corr_cci(res))
117 | # })
118 | #
119 | #
120 | # test_that("simulates spatial data with discrete population and HGE", {
121 | #   data(GRN_params_100, envir = environment())
122 | #
123 | #   lig_params <- data.frame(
124 | #     target    = c(101, 102),
125 | #     regulator = c(103, 104),
126 | #     effect    = c(5.2, 5.9)
127 | #   )
128 | #
129 | #   options_ <- list2(
130 | #     GRN = GRN_params_100,
131 | #     num.genes = 200,
132 | #     num.cells = 100,
133 | #     num.cifs = 20,
134 | #     tree = Phyla3(),
135 | #     discrete.cif = T,
136 | #     discrete.min.pop.size = 20,
137 | #     intrinsic.noise = 0.5,
138 | #     hge.prop = 0.05,
139 | #     cci = list(
140 | #       params = lig_params,
141 | #       max.neighbors = 4,
142 | #       cell.type.interaction = "random",
143 | #       step.size = 0.5
144 | #     )
145 | #   )
146 | #
147 | #   set.seed(0)
148 | #   res <- sim_true_counts(options_)
149 | #
150 | #   selectedIndicies <- c(1:5, 1000:1005, 10000:10005)
151 | #   expect_equal(
152 | #     res$counts[selectedIndicies],
153 | #     c(109.0693303, 60.4151790, 91.9120934, 0.5816326, 177.6741585, 197.0663584,
154 | #       102.3145704, 65.9978484, 89.1613630, 3.1734446, 156.9202179, 29.8315553,
155 | #       92.3944947, 66.1421921, 105.4677530, 0.5729707, 110.5115346)
156 | #   )
157 | # })
158 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/basics.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "2. Simulating Multimodal Single-cell Datasets"
  3 | output:
  4 |   BiocStyle::html_document:
  5 |     toc: true
  6 |     toc_depth: 2
  7 | vignette: >
  8 |   %\VignetteEngine{knitr::knitr}
  9 |   %\VignetteIndexEntry{2. Simulating Multimodal Single-cell Datasets}
 10 |   %\usepackage[UTF-8]{inputenc}
 11 | ---
 12 | ```{r "setup", include=FALSE}
 13 | require("knitr")
 14 | opts_chunk$set(fig.width=4, fig.height=3)
 15 | ```
 16 | 
 17 | ```{r install-packages, include=FALSE, message=FALSE, warning=FALSE, eval=FALSE}
 18 | # The following chunk will install all the required packages.
 19 | (function() {
 20 |   present <- installed.packages()[,"Package"]  # names of packages already installed
 21 |   install_missing <- function(wanted, installer) {
 22 |     absent <- setdiff(wanted, present)  # only the packages not yet installed
 23 |     if (length(absent)) installer(absent, dependencies=TRUE)
 24 |   }
 25 | 
 26 |   cran_pkgs <- c(
 27 |     "devtools", "dplyr", "ggplot2", "Rtsne", "rlang",
 28 |     "reshape", "ape", "phytools", "repr", "KernelKnn",
 29 |     "gridExtra", "parallel", 'foreach', 'phytools', "doParallel",
 30 |     "zeallot", "gtools", "gplots", "roxygen2", "usethis"
 31 |   )
 32 |   install_missing(cran_pkgs, install.packages)
 33 | 
 34 |   if (requireNamespace("BiocManager", quietly = TRUE)) {  # Bioconductor packages need BiocManager
 35 |     bioc_pkgs <- c('Biobase')
 36 |     install_missing(bioc_pkgs, BiocManager::install)
 37 |   }
 38 | })()
 39 | ```
 40 | 
 41 | In this tutorial, we will demonstrate how to use scMultiSim to simulate multi-omics data
 42 | with different biological effects, including:
 43 | 
 44 | - Simulating true RNA counts and ATAC-seq data
 45 | - Controlling the cell population and GRN effects
 46 | - Adding technical variation and batch effect to the true counts
 47 | - Adjusting the parameters to control different biological effects
 48 | 
 49 | We first load the package:
 50 | 
 51 | ```{r load-package, quietly=TRUE, message=FALSE, warning=FALSE}
 52 | library("scMultiSim")
 53 | ```
 54 | 
 55 | # Simulating True Counts
 56 | 
 57 | scMultiSim first generates the true RNA counts, and then adds technical variation and batch effects to the true counts.
 58 | To simulate true counts, call `sim_true_counts(options)` where `options` is a
 59 | list. You can use `scmultisim_help()` to get help on the options,
 60 | or a call like `scmultisim_help("num.cells")` to get help on a specific option.
 61 | 
 62 | ```{r scmultisim-help, echo = TRUE, results = "hide"}
 63 | scmultisim_help("options")
 64 | ```
 65 | 
 66 | ## GRN and Differentiation Tree
 67 | 
 68 | Before we start, we define a utility function to modify a list.
 69 | ```{r load-dplyr, quietly=TRUE, message=FALSE, warning=FALSE}
 70 | library(dplyr)
 71 | ```
 72 | ```{r define-list-modify}
 73 | list_modify <- function (curr_list, ...) {
 74 |   overrides <- list(...)  # replacement values, looked up by argument name
 75 |   for (key in names(overrides)) {
 76 |     curr_list[[key]] <- overrides[[key]]  # overwrite an existing entry or add a new one
 77 |   }
 78 |   curr_list  # return the modified copy of the list
 79 | }
 80 | ```
 81 | 
 82 | The minimal input to scMultiSim is a **differentiation tree**, and you can optionally provide
 83 | ground truth for GRN and cell-cell interactions.
 84 | The differentiation tree is an R phylo object, which can be created using e.g.
 85 | `ape::read.tree()` or `ape::rtree()`.
 86 | It controls the cell population structure: each node of the tree should represent a cell type,
 87 | and connected nodes indicate the differentiation relationship between cell types.
 88 | _scMultiSim provides this explicit control on the cell population structure
 89 | while preserving all other effects (such as GRN and Cell-Cell Interactions)_,
 90 | so you can generate any cell trajectory or clustering structure you want, which is especially useful
 91 | for benchmarking trajectory inference and clustering methods.
 92 | 
 93 | If generating a continuous population, this tree
 94 | specifies the cell differentiation trajectory; if generating a discrete population, the
 95 | tips of this tree will be the clusters (cell types are the terminal cell states).
 96 | 
 97 | scMultiSim also provides three differentiation trees.
 98 | `Phyla5()` and `Phyla3()` return bifurcating trees with 5 and 3 leaves respectively.
 99 | `Phyla1()` returns only a single branch, which can be useful when we don't want any specific trajectory.
100 | ```{r plot-tree, fig.width = 8, fig.height = 4}
101 | par(mfrow=c(1,2))
102 | Phyla5(plotting = TRUE)
103 | Phyla3(plotting = TRUE)
104 | 
105 | # It's not possible to plot Phyla1() because it only contains 1 branch connecting two nodes.
106 | Phyla1()
107 | ```
108 | 
109 | If you only need `n` cell clusters without any specific trajectory, you can use code like below to generate a simple tree with `n` leaves.
110 | ```{r random-tree}
111 | # tree with four leaves
112 | ape::read.tree(text = "(A:1,B:1,C:1,D:1);")
113 | ```
114 | 
115 | The GRN should be a data frame with 3 columns, each representing the `target`, `regulator`, and `effect`.
116 | The target and regulator should be gene names, which can be integers or strings.
117 | The effect should be a numeric value, indicating the effect of the regulator on the target.
118 | 
119 | scMultiSim provides two sample GRNs, `GRN_params_100` and `GRN_params_1139`,
120 | which contain 100 and 1139 genes respectively.
121 | Let's load them first.
122 | ```{r load-grn}
123 | data(GRN_params_100)
124 | GRN_params <- GRN_params_100
125 | head(GRN_params)
126 | ```
127 | 
128 | ## Simulating True Counts
129 | 
130 | Now, we create the options list for the simulation session.
131 | In the following example, we simulate 300 cells with 20 CIFs.
132 | 
133 | The number of genes is determined by the option `num.genes` or the number of genes in the GRN.
134 | If `num.genes` is not specified, the number of genes will be the number of unique genes in the GRN,
135 | plus a fraction of genes that are not regulated by any other genes.
136 | This fraction is controlled by the option `unregulated.gene.ratio` (default is 0.1).
137 | Since our `GRN_params` contains 100 gene names, 10% more genes will be added to the simulation,
138 | and the number of genes in the simulated data will be 110.
139 | If you don't need to simulate GRN effects, simply set `GRN = NA`.
140 | 
141 | The `cif.sigma` controls the variance of the CIFs. Usually, with `cif.sigma` = 0.1,
142 | the trajectory will be very clear, while with `cif.sigma` = 1, the trajectory will be more
143 | noisy. We use `cif.sigma` = 1 in this example.
144 | 
145 | We also enable the `do.velocity` option, which uses the kinetic model to simulate RNA velocity data.
146 | 
147 | ```{r define-options}
148 | set.seed(42)
149 | 
150 | options <- list(
151 |   GRN = GRN_params,
152 |   num.cells = 300,
153 |   num.cifs = 20,
154 |   cif.sigma = 1,
155 |   tree = Phyla5(),
156 |   diff.cif.fraction = 0.8,
157 |   do.velocity = TRUE
158 | )
159 | ```
160 | 
161 | ### Omitting the GRN
162 | 
163 | Note that the minimal input to scMultiSim is the cell population structure (differentiation tree) and number of cells.
164 | You can omit the GRN by using `GRN = NA`:
165 | ```
166 | options <- list(
167 |   GRN = NA,
168 |   num.cells = 1000,
169 |   num.genes = 500,
170 |   tree = Phyla5()
171 | )
172 | ```
173 | 
174 | ### Running the Simulation
175 | 
176 | Now we run the simulation and check what kind of data is in the returned result:
177 | ```{r run-simulation}
178 | results <- sim_true_counts(options)
179 | names(results)
180 | ```
181 | 
182 | ## Accessing the Results
183 | 
184 | The return value will be a `scMultiSim Environment` object,
185 | and you can access various data and parameters using the `$` operator.
186 | 
187 | - `counts`: Gene-by-cell scRNA-seq counts.
188 | - `atac_counts`: Region-by-cell scATAC-seq counts.
189 | - `region_to_gene`: Region-by-gene 0-1 matrix indicating the correspondence between chromatin regions and genes.
190 | - `atacseq_data`: The "clean" scATAC-seq counts without added intrinsic noise.
191 | - `cell_meta`: A dataframe containing cell type labels and pseudotime information.
192 | - `cif`: The CIF used during the simulation.
193 | - `giv`: The GIV used during the simulation.
194 | - `kinetic_params`: The kinetic parameters used during the simulation.
195 | - `.grn`: The GRN used during the simulation.
196 | - `.grn$regulators`: The list of TFs used by all gene-by-TF matrices.
197 | - `.grn$geff`: Gene-by-TF matrix representing the GRN used during the simulation.
198 | - `.n`: Other metadata, e.g. `.n$cells` is the number of cells.
199 | 
200 | If `do.velocity` is enabled, it has these additional fields:
201 | 
202 | - `unspliced_counts`: Gene-by-cell unspliced RNA counts.
203 | - `velocity`: Gene-by-cell RNA velocity ground truth.
204 | - `cell_time`: The pseudotime at which the cell counts were generated.
205 | 
206 | If dynamic GRN is enabled, it has these additional fields:
207 | 
208 | - `cell_specific_grn`: A list of length `n_cells`. Each element is a gene-by-TF matrix, indicating the cell's GRN.
209 | 
210 | If cell-cell interaction is enabled, it has these additional fields:
211 | 
212 | - `grid`: The grid object used during the simulation.
213 | - `grid$get_neighbours(i)`: Get the neighbour cells of cell `i`.
214 | - `cci_locs`: A dataframe containing the X and Y coordinates of each cell.
215 | - `cci_cell_type_param`: A dataframe containing the CCI network ground truth: all ligand-receptor pairs between each pair of cell types.
216 | - `cci_cell_types`: For continuous cell population, the sub-divided cell types along the trajectory used when simulating CCI.
217 | 
218 | If it is a debug session (`debug = TRUE`), a `sim` field is available,
219 | which is an environment that contains all internal states and data structures.
220 | 
221 | ## Visualizing the Results
222 | 
223 | We can visualize the true counts and ATAC-seq data using `plot_tsne()`:
224 | ```{r plot-counts, fig.width = 4, fig.height = 3.5, out.width = "60%"}
225 | plot_tsne(log2(results$counts + 1),
226 |          results$cell_meta$pop,
227 |          legend = 'pop', plot.name = 'True RNA Counts Tsne')
228 | plot_tsne(log2(results$atacseq_data + 1),
229 |          results$cell_meta$pop,
230 |          legend = 'pop', plot.name = 'True ATAC-seq Tsne')
231 | ```
232 | 
233 | 
234 | Since we also have RNA velocity enabled, the `results` also contains the following data:
235 | - `velocity`: the true RNA velocity (genes x cells)
236 | - `unspliced_counts`: the true unspliced RNA counts (genes x cells)
237 | 
238 | ```{r plot-velocity, fig.width = 4, fig.height = 3.5, out.width = "60%"}
239 | plot_rna_velocity(results, arrow.length = 2)
240 | ```
241 | 
242 | We can inspect the gene-gene correlation using `plot_gene_module_cor_heatmap(results)`:
243 | ```{r plot-gene-correlation, fig.width = 8, fig.height = 8}
244 | plot_gene_module_cor_heatmap(results)
245 | ```
246 | 
247 | # Adding Technical Variation and Batch Effect
248 | 
249 | We can also add the technical variation and batch effect to the true counts.
250 | 
251 | ## Adding technical noise
252 | 
253 | Simply use the `add_expr_noise` function to add technical noise to the dataset.
254 | 
255 | ```{r technical-noise}
256 | add_expr_noise(
257 |   results,
258 |   # options go here
259 |   alpha_mean = 1e4
260 | )
261 | ```
262 | 
263 | A `counts_obs` field will be added to the `results` object.
264 | 
265 | This function also accepts a list of options. See the documentation for more details.
266 | 
267 | - `protocol`: `"umi"` or `"nonUMI"`, whether to simulate the UMI protocol.
268 | - `alpha_mean`, `alpha_sd`: Mean and standard deviation of the subsampling rate of transcripts during the capture step.
269 | - `alpha_gene_mean`, `alpha_gene_sd`: `alpha` parameters, but gene-wise.
270 | - `depth_mean`, `depth_sd`: Mean and standard deviation of the sequencing depth.
271 | - `gene_len`: A vector with lengths of all genes.
272 | - `atac.obs.prob`: For each integer count of a particular region for a particular cell, the probability the count will be observed.
273 | - `atac.sd.frac`: The fraction of the ATAC-seq data value used as the standard deviation of the added normally distributed noise.
274 | - `randseed`: random seed.
275 | 
276 | ## Adding batch effects
277 | 
278 | Finally, use the `divide_batches` function to add batch effects.
279 | 
280 | ```{r batch-effects}
281 | divide_batches(
282 |   results,
283 |   nbatch = 2,
284 |   effect = 1
285 | )
286 | ```
287 | 
288 | A `counts_with_batches` field will be added to the `results` object.
289 | 
290 | The available options are:
291 | 
292 | - `nbatch`: Number of batches.
293 | - `effect`: The batch effect size.
294 | 
295 | We can visualize the result with technical noise and batches:
296 | 
297 | ```{r add-expr-noise, fig.width = 4, fig.height = 3.5, out.width = "60%"}
298 | plot_tsne(log2(results$counts_with_batches + 1),
299 |           results$cell_meta$pop,
300 |           legend = 'pop', plot.name = 'RNA Counts Tsne with Batches')
301 | ```
302 | 
303 | # Adjusting Parameters
304 | 
305 | scMultiSim provides various parameters to control each type of biological effect.
306 | Here, we describe the most important parameters and how they affect the simulation results:
307 | 
308 | - `num.cifs`, `diff.cif.fraction`
309 | - `cif.mean`, `cif.sigma`
310 | - `discrete.cif`
311 | - `intrinsic.noise`
312 | 
313 | For a complete list of parameters, please check out the [Parameter Guide](https://zhanglabgt.github.io/scMultiSim/articles/options)
314 | page in the documentation.
315 | 
316 | ## The Shiny App
317 | 
318 | scMultiSim provides a Shiny app to help you generate the options list and visualize the effects of different parameters.
319 | It is highly recommended to use the Shiny app to explore the available parameters.
320 | You can run the app by calling `run_shiny()`.
321 | 
322 | ```{r run-shiny, eval=FALSE}
323 | run_shiny()
324 | ```
325 | 
326 | ![Shiny App](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/shiny_app_sc.png)
327 | 
328 | ## Deciding Number of CIFs: `num.cifs`
329 | 
330 | In scMultiSim, users set `num.cifs` to control the total number of diff-CIFs and non-diff-CIFs.
331 | The number of CIFs should be large enough to represent the cell population structure and gene information.
332 | By default, `num.cifs` is set to 50, which is a good starting point for most cases.
333 | However, each gene's base expression is affected by two random diff-CIF entries,
334 | therefore if you have a large number of genes, they may have similar expression patterns, which may not be ideal.
335 | It is recommended to increase `num.cifs` to 50-100 if you have more than 2000 genes.
336 | If you have a small number of genes (less than 1000), you can also decrease `num.cifs` to 20-40.
337 | 
338 | ## Discrete Cell Population: `discrete.cif`
339 | 
340 | We can also simulate discrete cell population by setting `discrete.cif = TRUE`.
341 | In this case, each tip of the tree will be one cell type,
342 | therefore there will be 5 clusters in the following result.
343 | 
344 | ```{r simulate-discrete, fig.width = 4, fig.height = 3.5, out.width = "60%"}
345 | set.seed(42)
346 | 
347 | options <- list(
348 |   GRN = GRN_params,
349 |   num.cells = 400,
350 |   num.cifs = 20,
351 |   tree = Phyla5(),
352 |   diff.cif.fraction = 0.8,
353 |   discrete.cif = TRUE
354 | )
355 | 
356 | results <- sim_true_counts(options)
357 | 
358 | plot_tsne(log2(results$counts + 1),
359 |          results$cell_meta$pop,
360 |          legend = 'pop', plot.name = 'True RNA Counts Tsne')
361 | ```
362 | 
363 | ## Adjusting the Effect of Cell Population: `diff.cif.fraction`
364 | 
365 | In scMultiSim, the differentiation tree provides explicit control of the cell population.
366 | The effect of the tree can be adjusted by the option `diff.cif.fraction`,
367 | which controls how many CIFs are affected by the cell population.
368 | With a larger `diff.cif.fraction`, the effect of cell population will be larger
369 | and you may see a clearer trajectory or well separated clusters.
370 | With a smaller `diff.cif.fraction`, the resulting RNA counts will be more affected by
371 | other factors, such as the GRN.
372 | 
373 | Now let's visualize the trajectory with different `diff.cif.fraction` values:
374 | 
375 | ```{r adjust-diff-cif-fraction, fig.width = 4, fig.height = 3.5, out.width = "60%"}
376 | set.seed(42)
377 | 
378 | options <- list(
379 |   GRN = GRN_params,
380 |   num.cells = 300,
381 |   num.cifs = 20,
382 |   tree = Phyla5(),
383 |   diff.cif.fraction = 0.8
384 | )
385 | 
386 | results <- sim_true_counts(
387 |         options %>% list_modify(diff.cif.fraction = 0.4))
388 | plot_tsne(log2(results$counts + 1),
389 |          results$cell_meta$pop,
390 |          legend = 'pop', plot.name = 'RNA Counts (diff.cif.fraction = 0.4)')
391 | 
392 | results <- sim_true_counts(
393 |         options %>% list_modify(diff.cif.fraction = 0.9))
394 | plot_tsne(log2(results$counts + 1),
395 |          results$cell_meta$pop,
396 |          legend = 'pop', plot.name = 'RNA Counts (diff.cif.fraction = 0.9)')
397 | ```
398 | 
399 | ## Adjusting the Inherent Cell Heterogeneity: `cif.mean` and `cif.sigma`
400 | 
401 | The inherent cell heterogeneity is controlled by the non-diff-CIF,
402 | which is sampled from a normal distribution with mean `cif.mean` and standard deviation `cif.sigma`.
403 | Therefore, the larger `cif.sigma` is, the larger the inherent cell heterogeneity is.
404 | 
405 | Now, let's visualize the effect of `cif.sigma`:
406 | 
407 | ```{r adjust-cif-sigma, fig.width = 4, fig.height = 3.5, out.width = "60%"}
408 | set.seed(42)
409 | 
410 | options <- list(
411 |   GRN = GRN_params,
412 |   num.cells = 300,
413 |   num.cifs = 20,
414 |   tree = Phyla5(),
415 |   diff.cif.fraction = 0.8,
416 |   cif.sigma = 0.5
417 | )
418 | 
419 | results <- sim_true_counts(
420 |         options %>% list_modify(cif.sigma = 0.1))
421 | plot_tsne(log2(results$counts + 1),
422 |          results$cell_meta$pop,
423 |          legend = 'pop', plot.name = 'RNA Counts (cif.sigma = 0.1)')
424 | 
425 | results <- sim_true_counts(
426 |         options %>% list_modify(cif.sigma = 1.0))
427 | plot_tsne(log2(results$counts + 1),
428 |          results$cell_meta$pop,
429 |          legend = 'pop', plot.name = 'RNA Counts (cif.sigma = 1.0)')
430 | ```
431 | 
432 | ## Adjusting the Intrinsic Noise: `intrinsic.noise`
433 | 
434 | If we set `do.velocity = FALSE`, scMultiSim will simulate the RNA counts using the Beta-Poisson model,
435 | which is faster but doesn't output RNA velocity.
436 | When using the Beta-Poisson model, scMultiSim provides an `intrinsic.noise` parameter to control the
437 | intrinsic noise during the transcription process.
438 | By default, `intrinsic.noise` is set to 1, which means the true counts will be sampled from the Beta-Poisson
439 | model. If we set `intrinsic.noise` to a smaller value like 0.5,
440 | the true counts will be 0.5 * (theoretical mean) + 0.5 * (sampled from the Beta-Poisson model).
441 | _More intrinsic noise will make the encoded effects (e.g. GRN) harder to be inferred._
442 | 
443 | 
444 | ```{r adjust-intrinsic-noise, fig.width = 4, fig.height = 3.5, out.width = "60%"}
445 | set.seed(42)
446 | 
447 | options <- list(
448 |   GRN = GRN_params,
449 |   num.cells = 300,
450 |   num.cifs = 20,
451 |   tree = Phyla5(),
452 |   diff.cif.fraction = 0.8,
453 |   intrinsic.noise = 1
454 | )
455 | 
456 | results <- sim_true_counts(
457 |         options %>% list_modify(intrinsic.noise = 0.5))
458 | plot_tsne(log2(results$counts + 1),
459 |          results$cell_meta$pop,
460 |          legend = 'pop', plot.name = 'RNA Counts (intrinsic.noise = 0.5)')
461 | 
462 | results <- sim_true_counts(
463 |         options %>% list_modify(intrinsic.noise = 1))
464 | plot_tsne(log2(results$counts + 1),
465 |          results$cell_meta$pop,
466 |          legend = 'pop', plot.name = 'RNA Counts (intrinsic.noise = 1)')
467 | ```
468 | 
469 | ## Adjust the effect of chromatin accessibility: `atac.effect`
470 | 
471 | `atac.effect` controls the contribution of the chromatin accessibility.
472 | A higher `atac.effect` means the RNA counts are more affected by the ATAC-seq data,
473 | therefore the correlation between the ATAC-seq and RNA-seq data will be higher.
474 | 
475 | # Simulating Dynamic GRN
476 | 
477 | First, call the following function to check the usage of dynamic GRN.
478 | ```{r help-dynamic-grn}
479 | scmultisim_help("dynamic.GRN")
480 | ```
481 | 
482 | Here we use `Phyla1()` as the differentiation tree to remove the effect of the trajectory. Additionally, we can use `discrete.cif = TRUE` to simulate discrete cell population.
483 | ```{r define-options-dynamic-grn}
484 | set.seed(42)
485 | 
486 | options_ <- list(
487 |   GRN = GRN_params,
488 |   num.cells = 300,
489 |   num.cifs = 20,
490 |   tree = Phyla1(),
491 |   diff.cif.fraction = 0.8,
492 |   do.velocity = FALSE,
493 |   dynamic.GRN = list(
494 |     cell.per.step = 3,
495 |     num.changing.edges = 5,
496 |     weight.mean = 0,
497 |     weight.sd = 4
498 |   )
499 | )
500 | 
501 | results <- sim_true_counts(options_)
502 | ```
503 | 
504 | `results$cell_specific_grn` is a list containing the gene effects matrix for each cell. Each row is a target and each column is a regulator. The corresponding gene names are displayed as column and row names.
505 | ```{r show-cell-specific-grn}
506 | # GRN for cell 1 (first 10 rows)
507 | results$cell_specific_grn[[1]][1:10,]
508 | ```
509 | 
510 | Since we set `cell.per.step = 3`, we expect every 3 adjacent cells to share the same GRN:
511 | ```{r check-cell-specific-grn}
512 | print(all(results$cell_specific_grn[[1]] == results$cell_specific_grn[[2]]))
513 | print(all(results$cell_specific_grn[[2]] == results$cell_specific_grn[[3]]))
514 | print(all(results$cell_specific_grn[[3]] == results$cell_specific_grn[[4]]))
515 | ```
516 | 
517 | # Session Information
518 | 
519 | ```{r session-info}
520 | sessionInfo()
521 | ```


--------------------------------------------------------------------------------
/vignettes/options.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "4. Parameter Guide"
  3 | output:
  4 |     BiocStyle::html_document:
  5 |         toc: true
  6 |         toc_depth: 2
  7 | vignette: >
  8 |     %\VignetteEngine{knitr::knitr}
  9 |     %\VignetteIndexEntry{4. Parameter Guide}
 10 |     %\usepackage[UTF-8]{inputenc}
 11 | ---
 12 | 
 13 | ```{r, include = FALSE}
 14 | knitr::opts_chunk$set(
 15 |   collapse = TRUE,
 16 |   comment = "#>"
 17 | )
 18 | ```
 19 | 
 20 | This article introduces the available options in `scMultiSim`.
 21 | 
 22 | The following flow chart shows the workflow of `scMultiSim` and each parameter's role in the simulation.
 23 | 
 24 | ![scMultiSim parameters flow chart](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/params.png)
 25 | 
 26 | ## Options: General
 27 | 
 28 | ###  rand.seed
 29 | 
 30 | > integer (default: `0`)
 31 | 
 32 | The random seed. scMultiSim should produce the same result when the seed and all other parameters are the same.
 33 | 
 34 | ###  threads
 35 | 
 36 | > integer (default: `1`)
 37 | 
 38 | Use multithreading only when generating the CIF matrix.
 39 | It will not speed up the simulation a lot, thus not recommended.
 40 | 
 41 | ###  speed.up
 42 | 
 43 | > logical (default: `FALSE`)
 44 | 
 45 | Enable experimental speed-up mode.
 46 | It is recommended to **enable** this option, and it will be the default in the future.
 47 | Currently, it is disabled for reproducibility.
 48 | 
 49 | ## Options: Genes
 50 | 
 51 | ###  GRN
 52 | 
 53 | > A data frame with 3 columns as below.
 54 | > Supply `NA` to disable the GRN effect. (required)
 55 | 
 56 | | Column | Value                                      |
 57 | | ------ | ------------------------------------------ |
 58 | | 1      | target gene ID: `integer or character`;    |
 59 | | 2      | regulator gene ID: `integer or character`; |
 60 | | 3      | effect: `number`.                          |
 61 | 
 62 | If `num.genes` is present, the gene IDs should not exceed this number.
 63 | The gene IDs should start from 1 and should not skip any intermediate numbers.
 64 | 
 65 | Two sample datasets `GRN_params_100` and `GRN_params_1139` from
 66 | [Dibaeinia, P., &amp; Sinha, S. (2020)](https://doi.org/10.1016/j.cels.2020.08.003) are provided for testing and inspection.
 67 | 
 68 | ###  num.genes
 69 | 
 70 | > integer (default: `NULL`)
 71 | 
 72 | If a GRN is supplied, override the total number of genes.
 73 | It should be larger than the largest gene ID in the GRN.
 74 | Otherwise, the number of genes will be determined by `N_genes * (1 + r_u)`,
 75 | where `r_u` is `unregulated.gene.ratio`.
 76 | 
 77 | If GRN is disabled,
 78 | this option specifies the total number of genes.
 79 | 
 80 | ###  unregulated.gene.ratio
 81 | 
 82 | > number > 0 (default: `0.1`)
 83 | 
 84 | Ratio of unregulated to regulated genes.
 85 | When a GRN is supplied with `N` genes,
 86 | scMultiSim will simulate `N * r_u` extra (unregulated) genes.
 87 | 
 88 | ###  giv.mean, giv.sd, giv.prob
 89 | 
 90 | > (default: `0, 1, 0.3`)
 91 | 
 92 | The parameters used to sample the GIV matrix.
 93 | With probability `giv.prob`, the value is sampled from N(`giv.mean`, `giv.sd`).
 94 | Otherwise the value is 0.
 95 | 
 96 | ###  dynamic.GRN
 97 | 
 98 | > list (default: `NULL`)
 99 | 
100 | Enables dynamic (cell-specific) GRN.
101 | Run `scmultisim_help("dynamic.GRN")` to see a more detailed explanation.
102 | 
103 | ###  hge.prop, hge.mean, hge.sd
104 | 
105 | > (default: `0, 5, 1`)
106 | 
107 | Treat some random genes as highly-expressed (house-keeping) genes.
108 | A proportion of `hge.prop` genes will have expression scaled by a
109 | multiplier sampled from N(`hge.mean`, `hge.sd`).
110 | 
111 | ###  hge.range
112 | 
113 | > integer (default: `1`)
114 | 
115 | When selecting highly-expressed genes, only choose genes with ID > `hge.range`.
116 | 
117 | ###  hge.max.var
118 | 
119 | > number (default: `500`)
120 | 
121 | When selecting highly-expressed genes, only choose genes
122 | with variation < `hge.max.var`.
123 | 
124 | ## Options: Cells
125 | 
126 | ###  num.cells
127 | 
128 | > integer (default: `1000`)
129 | 
130 | The number of cells to be simulated.
131 | 
132 | ###  tree
133 | 
134 | > phylo (default: `Phyla5()`)
135 | 
136 | The cell differentiation tree,
137 | which will be used to generate cell trajectories (if `discrete.cif = F`)
138 | or clusters (if `discrete.cif = T`).
139 | In discrete population mode, only the tree tips will be used.
140 | Three demo trees, `Phyla5()`, `Phyla3()` and `Phyla1()`, are provided.
141 | 
142 | ###  discrete.cif
143 | 
144 | > logical (default: `FALSE`)
145 | 
146 | Whether the cell population is discrete (continuous otherwise).
147 | 
148 | ###  discrete.min.pop.size, discrete.min.pop.index
149 | 
150 | > integer, integer (default: `70, 1`)
151 | 
152 | In discrete population mode, specify one cluster to have the
153 | smallest cell population.
154 | The cluster will contain `discrete.min.pop.size` cells.
155 | `discrete.min.pop.index` should be a valid cluster index (tree tip number).
156 | 
157 | ###  discrete.pop.size
158 | 
159 | > integer vector (default: `NA`); e.g. `c(200, 250, 300)`
160 | 
161 | Manually specify the size of each cluster.
162 | 
163 | ## Options: CIF
164 | 
165 | ###  num.cifs
166 | 
167 | > integer (default: `50`)
168 | 
169 | Total number of differential and non-differential CIFs,
170 | which can be viewed as latent representation of cells.
171 | 
172 | ###  diff.cif.fraction
173 | 
174 | > number (default: `0.9`)
175 | 
176 | Fraction of differential CIFs.
177 | Differential CIFs encode the cell type information,
178 | while non-differential CIFs are randomly sampled for each cell.
179 | 
180 | ###  cif.center, cif.sigma
181 | 
182 | > (default: `1, 0.1`)
183 | 
184 | The distribution used to sample CIF values.
185 | 
186 | ###  use.impulse
187 | 
188 | > logical (default: `FALSE`)
189 | 
190 | In continuous population mode, when sampling CIFs along the tree,
191 | use the impulse model rather than the default gaussian random walk.
192 | 
193 | ## Options: Simulation - ATAC
194 | 
195 | ###  atac.effect
196 | 
197 | > number ∈ [0, 1] (default: `0.5`)
198 | 
199 | The influence of chromatin accessibility data on gene expression.
200 | 
201 | ###  region.distrib
202 | 
203 | > vector of length 3, should sum to 1 (default: `c(0.1, 0.5, 0.4)`)
204 | 
205 | The probability that a gene is regulated by 0, 1, 2
206 | consecutive regions, respectively.
207 | 
208 | ###  atac.p_zero
209 | 
210 | > number ∈ [0, 1] (default: `0.8`)
211 | 
212 | The proportion of zeros we see in the simulated scATAC-seq data.
213 | 
214 | ###  riv.mean, riv.sd, riv.prob
215 | 
216 | > (default: `0, 1, 0.3`)
217 | 
218 | The parameters used to sample the RIV (Region Identity Vectors).
219 | With probability `riv.prob`, the value is sampled from N(`riv.mean`, `riv.sd`).
220 | Otherwise the value is 0.
221 | 
222 | ## Customization
223 | 
224 | ###  mod.cif.giv
225 | 
226 | > function (default: `NULL`)
227 | 
228 | Modify the generated CIF and GIV.
229 | The function takes four arguments: the kinetic parameter index (1=kon, 2=koff, 3=s),
230 | the current CIF matrix, the GIV matrix, and the cell metadata dataframe.
231 | It should return a list of two elements: the modified CIF matrix and the modified GIV matrix.
232 | 
233 | ```R
234 | sim_true_counts(list(
235 |     # ...
236 |     mod.cif.giv = function(i, cif, giv, meta) {
237 |         # modify cif and giv
238 |         return(list(cif, giv))
239 |     }
240 | ))
241 | ```
242 | 
243 | ### ext.cif.giv
244 | 
245 | > function (default: `NULL`)
246 | 
247 | Add extra CIF and GIV.
248 | The function takes one argument, the kinetic parameter index (1=kon, 2=koff, 3=s).
249 | It should return a list of two elements: the extra CIF matrix `(n_extra_cif x n_cells)`
250 | and the GIV matrix `(n_genes x n_extra_cif)`. Return `NULL` for no extra CIF and GIV.
251 | 
252 | ```R
253 | sim_true_counts(list(
254 |     # ...
255 |     ext.cif.giv = function(i) {
256 |         # add extra cif and giv
257 |         return(list(extra_cif, extra_giv))
258 |     }
259 | ))
260 | ```
261 | 
262 | ## Options: Simulation
263 | 
264 | ### vary
265 | 
266 | > character (default: `"s"`)
267 | 
268 | Can be `"all", "kon", "koff", "s", "except_kon", "except_koff", "except_s"`.
269 | It specifies which kinetic parameters to vary across cells, i.e. which kinetic parameters have differential CIFs
270 | sampled from the tree.
271 | 
272 | ### bimod
273 | 
274 | > number (default: `0`)
275 | 
276 | A number between 0 and 1, which adjusts the bimodality of the gene expression distribution.
277 | 
278 | ### scale.s
279 | 
280 | > number (default: `1`)
281 | 
282 | Manually scale the final `s` parameter, thus the gene expression.
283 | When discrete.cif = T, it can be a vector specifying the scale.s for each cluster.
284 | In this case, you can use smaller value for cell types known to be small (like naive cells).
285 | 
286 | ### intrinsic.noise
287 | 
288 | > number (default: `1`)
289 | 
290 | A number between 0 and 1, which specifies the weight of the random sample from the Beta-Poisson distribution.
291 | 
292 | ```
293 |        0 <----------------------> 1
294 | Theoretical mean          Random sample from
295 |                       Beta-Poisson distribution
296 | ```
297 | 
298 | ## Options: Simulation - RNA Velocity
299 | 
300 | ###  do.velocity
301 | 
302 | > logical (default: `FALSE`)
303 | 
304 | When set to `TRUE`,
305 | simulate using the full kinetic model and generate RNA velocity data.
306 | Otherwise, the Beta-Poisson model will be used.
307 | 
308 | ###  beta
309 | 
310 | > number (default: `0.4`)
311 | 
312 | The splicing rate of each gene in the kinetic model.
313 | 
314 | ###  d
315 | 
316 | > number (default: `1`)
317 | 
318 | The degradation rate of each gene in the kinetic model.
319 | 
320 | ###  num.cycles
321 | 
322 | > number (default: `3`)
323 | 
324 | The number of cycles run before sampling the gene expression of a cell.
325 | 
326 | ###  cycle.len
327 | 
328 | > number (default: `1`)
329 | 
330 | In velocity mode, a multiplier for the cell cycle length.
331 | It is multiplied by the expected time to
332 | transition from k_on to k_off and back to form the length of a cycle.
333 | 
334 | ## Options: Simulation - Spatial Cell-Cell Interaction
335 | 
336 | The simulation of cell-cell interaction can be enabled by passing a `list` as the `cci` option.
337 | In this list, you can specify the following options:
338 | 
339 | ### grid.size
340 | 
341 | > integer
342 | 
343 | Manually specify the width and height of the grid.
344 | 
345 | ### layout
346 | 
347 | > "enhanced", "layers", "islands", or a function (default: `"enhanced"`)
348 | 
349 | Specify the layout of the cell types.
350 | scMultiSim provides three built-in layouts: `"enhanced"`, `"layers"`, and `"islands"`.
351 | 
352 | If set to `"islands"`, you can specify which cell types are the islands, e.g. `"islands:1,2"`.
353 | 
354 | If using a custom function, it should take two arguments: `function (grid_size, cell_types)`
355 | - grid_size: (integer) The width and height of the grid.
356 | - cell_types: (integer vector) Each cell's cell type.
357 | 
358 | It should return a `n_cell x 2` matrix, where each row is the x and y coordinates of a cell.
359 | 
360 | ### step.size
361 | 
362 | > number
363 | 
364 | If using continuous population, use this step size to further divide the
365 | cell types on the tree. For example, if the tree only has one branch `a -> b`
366 | and the branch length is 1 while the step size is 0.34, there will be three cell types in total: a_b_1, a_b_2, a_b_3.
367 | 
368 | ### params
369 | 
370 | > data.frame
371 | 
372 | The spatial effect between a ligand and a receptor gene.
373 | It should be a data frame similar to the GRN parameter, i.e. with columns `target` (the receptor gene), `regulator` (the ligand gene), and `effect`.
374 | 
375 | Example:
376 | ```R
377 | cci = list(
378 |   params = data.frame(
379 |     target    = c(2,   6,   10,   8, 20,  30),
380 |     regulator = c(101, 102, 103, 104, 105, 106),
381 |     effect    = 20
382 |   )
383 | )
384 | ```
385 | 
386 | ### cell.type.interaction
387 | 
388 | > "random" or a matrix
389 | 
390 | Specify which cell types can communicate using which ligand-receptor pair.
391 | It should be a 3d `n_cell_types x n_cell_types x n_ligand_pair` numeric matrix.
392 | The value at (i, j, k) is 1 if there exist CCI of LR-pair k between cell type i and cell type j.
393 | 
394 | This matrix can be generated using the `cci_cell_type_params()` function.
395 | It can fill the matrix randomly, or return an empty matrix for you to fill manually.
396 | If you want to fill it randomly, you can simply supply `"random"` for this option.
397 | 
398 | ### cell.type.lr.pairs
399 | 
400 | > integer vector
401 | 
402 | If `cell.type.interaction` is `"random"`, specify how many LR pairs should be enabled between each cell type pair.
403 | Should be a range, e.g. `4:6`. The actual number of LR pairs will be uniformly sampled from this range.
404 | 
405 | ### max.neighbors
406 | 
407 | > integer
408 | 
409 | The number of interacting cells for each cell.
410 | If the cell's available neighbor count is not large enough, the actual interacting cells may be smaller than this value.
411 | 
412 | ### radius
413 | 
414 | > number (default: `1`), or "gaussian:sigma"
415 | 
416 | Which cells should be considered as neighbors.
417 | The interacting cells are those within these neighbors.
418 | 
419 | When it is a number, it controls the maximum distance between two cells for them to interact.
420 | 
421 | When it is a string, it should be in the format `gaussian:sigma`, for example, `gaussian:1.2`.
422 | In this case, the probability of two cells interacting is proportional to the distance with a Gaussian kernel applied.
423 | 
424 | ### start.layer
425 | 
426 | > integer
427 | 
428 | From which layer (time step) the simulation should start.
429 | If set to 1, the simulation will start with one cell in the grid and add one more cell in each following layer.
430 | If set to `num_cells`, the simulation will start with all cells already placed in the grid
431 | and only continue for a few static layers, which will greatly speed up the simulation.
432 | 


--------------------------------------------------------------------------------
/vignettes/spatialCCI.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "3. Simulating Spatial Cell-Cell Interactions"
  3 | output:
  4 |     BiocStyle::html_document:
  5 |         toc: true
  6 |         toc_depth: 2
  7 | vignette: >
  8 |     %\VignetteEngine{knitr::knitr}
  9 |     %\VignetteIndexEntry{3. Simulating Spatial Cell-Cell Interactions}
 10 |     %\usepackage[UTF-8]{inputenc}
 11 | ---
 12 | 
 13 | ```{r "setup", include=FALSE}
 14 | require("knitr")
 15 | opts_chunk$set(fig.width=4, fig.height=3)
 16 | 
 17 | # devtools::load_all(".")
 18 | ```
 19 | 
 20 | 
 21 | ## Simulating Spatial Cell-Cell Interactions
 22 | 
 23 | scMultiSim can simulate spatial cell-cell interactions.
 24 | To do so, we need to provide the `cci` option as a list.
 25 | The following code will print more instructions on how to use the `cci` option.
 26 | 
 27 | ```{r help-cci}
 28 | library(scMultiSim)
 29 | 
 30 | scmultisim_help("cci")
 31 | ```
 32 | 
 33 | Now, we prepare a ligand-receptor interaction database.
 34 | This is pretty similar to the GRN network: it is a data frame with three columns,
 35 | specifying `target`, `regulator`, and `effect`, respectively.
 36 | The target and regulator columns should contain the IDs of the target and regulator genes.
 37 | In the following example, we have two ligand-receptor pairs interacting between two neighboring cells.
 38 | 
 39 | ```{r cci-network}
 40 | lig_params <- data.frame(
 41 |   target    = c(101, 102),
 42 |   regulator = c(103, 104),
 43 |   effect    = c(5.2, 5.9)
 44 | )
 45 | ```
 46 | 
 47 | We can now simulate the spatial cell-cell interactions.
 48 | In scMultiSim, the CCI network is cell-type based, which means that between each cell type pair,
 49 | we can have a different CCI network sampled from the database defined above.
 50 | Here, we set the `step.size` to 0.5, so the differentiation tree is divided into segments of length 0.5,
 51 | and each segment is treated as a cell type in CCI.
 52 | We set `cell.type.interaction` to `random`, so the CCI network between each cell type pair is randomly sampled from the database.
 53 | 
 54 | Here, we use only 80 cells to speed up the simulation. Feel free to try a larger number of cells when running this vignette locally.
 55 | 
 56 | ```{r}
 57 | data(GRN_params_100)
 58 | set.seed(42)
 59 | 
 60 | options_ <- list(
 61 |   GRN = GRN_params_100,
 62 |   speed.up = TRUE,
 63 |   num.genes = 120,
 64 |   num.cells = 80,
 65 |   num.cifs = 20,
 66 |   cif.sigma = 0.2,
 67 |   tree = Phyla3(),
 68 |   intrinsic.noise = 0.5,
 69 |   cci = list(
 70 |     params = lig_params,
 71 |     max.neighbors = 4,
 72 |     grid.size = 13,
 73 |     cell.type.interaction = "random",
 74 |     step.size = 0.5
 75 |   )
 76 | )
 77 | 
 78 | results <- sim_true_counts(options_)
 79 | ```
 80 | 
 81 | The `results$cell_meta` will contain the cell type information used in CCI.
 82 | We can plot the cell spatial locations using `plot_cell_loc()`.
 83 | The arrows indicate cell-cell interactions between two cells (for the first ligand-receptor pair).
 84 | 
 85 | ```{r plot-cell-loc, fig.width=6, fig.height=6}
 86 | plot_cell_loc(results)
 87 | ```
 88 | 
 89 | The cell locations are available in `results$cci_locs`.
 90 | 
 91 | ```{r print-cell-loc}
 92 | head(results$cci_locs)
 93 | ```
 94 | 
 95 | ### Speeding up the Simulation
 96 | 
 97 | Simulating spatial cell-cell interactions can be computationally expensive.
 98 | Setting these two options can speed up the simulation:
 99 | 
100 | ```
101 | options_ <- list(
102 |     # ...
103 |     speed.up = T,
104 |     cci = list(
105 |         # ...
106 |         start.layer = n_cells  # n_cells is the number of cells (`num.cells`)
107 |     )
108 | )
109 | ```
110 | 
111 | First of all, it is recommended to set the experimental `speed.up = T` option. This option will become default in later versions of scMultiSim.
112 | 
113 | Next, it is possible to set the CCI option `start.layer = n_cells`, where `n_cells` is the number of cells.
114 | scMultiSim simulates a spatial dataset by following `n_cells` steps, adding one more cell to the spatial grid in each step.
115 | Only the final step is outputted as the result.
116 | The CCI option `start.layer` can be used to start simulation from a specific time step.
117 | When set to `n_cells`, the simulation will skip all previous steps by adding all cells at once.
118 | By default, `start.layer` will be set to `n_cells` when the number of cells is greater than 800.
119 | 
120 | 
121 | ## Spatial layouts
122 | 
123 | scMultiSim provides powerful customization options for spatial cell layouts.
124 | 
125 | ### Built-in layouts
126 | 
127 | scMultiSim ships with several built-in spatial layouts.
128 | The `enhanced` layout is the default layout, where cells are added to the grid one by one.
129 | When adding a new cell, it has a higher probability of being placed near the existing cells of the same cell type.
130 | ```{r layout-enhanced, fig.width=6, fig.height=6}
131 | # Builds the option list shared by the layout examples; arguments such as `layout` are appended to the CCI settings.
132 | spatial_options <- function (...) {
133 |   shared_cci <- list(
134 |     params = lig_params, max.neighbors = 4,
135 |     start.layer = 300, grid.size = 28,
136 |     cell.type.interaction = "random"
137 |   )
138 |   sim_opts <- list(
139 |     rand.seed = 0,
140 |     GRN = GRN_params_100,
141 |     speed.up = TRUE,
142 |     num.genes = 200,
143 |     num.cells = 300,
144 |     num.cifs = 50,
145 |     tree = Phyla3(),
146 |     # caller-supplied entries go after the shared CCI settings
147 |     cci = c(shared_cci, list(...))
148 |   )
149 |   return(sim_opts)
150 | }
151 | 
152 | 
153 | results <- sim_true_counts(spatial_options(
154 |   layout = "enhanced"
155 | ))
156 | plot_cell_loc(results, show.arrows = FALSE)
157 | ```
158 | 
159 | An option `same.type.prob` decides the probability of a new cell being placed near the existing cells of the same cell type.
160 | By default, it is 0.8; and if we use a lower value, the new cell will be placed more randomly.
161 | ```{r layout-random, fig.width=6, fig.height=6}
162 | 
163 | results <- sim_true_counts(spatial_options(
164 |   layout = "enhanced",
165 |   same.type.prob = 0.1
166 | ))
167 | plot_cell_loc(results, show.arrows = FALSE)
168 | ```
169 | 
170 | The `layers` layout arranges cells in layers.
171 | 
172 | ```{r layout-layers, fig.width=6, fig.height=6}
173 | results <- sim_true_counts(spatial_options(
174 |   layout = "layers"
175 | ))
176 | plot_cell_loc(results, show.arrows = FALSE)
177 | ```
178 | 
179 | The `islands` layout will put some cell types in the center like islands, and others around them.
180 | You may specify which cell type should be islands in the format `islands:1,2,3`.
181 | The number here can be looked up in `results$cci_cell_types`.
182 | 
183 | ```{r}
184 | results$cci_cell_types
185 | ```
186 | 
187 | ```{r layout-islands, fig.width=6, fig.height=6}
188 | results <- sim_true_counts(spatial_options(
189 |   # cell type 4_1_2 should be the island
190 |   layout = "islands:5"
191 | ))
192 | plot_cell_loc(results, show.arrows = FALSE)
193 | ```
194 | 
195 | ### Custom layouts
196 | 
197 | It is also possible to lay out the cells programmatically.
198 | The `layout` option can be a function that takes the cell type information and returns the spatial locations of the cells:
199 | ```
200 | # grid_size is a number
201 | # cell_types is an integer vector, representing the cell types
202 | function(grid_size, cell_types) {
203 |   # return a matrix with one row per cell and two columns (the x and y coordinates)
204 |   return(matrix(nrow = length(cell_types), ncol = 2))
205 | }
206 | ```
207 | 
208 | For example, the following layout function will place the cells sequentially in the grid,
209 | starting from the bottom-left corner.
210 | 
211 | ```{r layout-custom, fig.width=6, fig.height=6}
212 | results <- sim_true_counts(spatial_options(
213 |   layout = function (grid_size, cell_types) {
214 |     n <- length(cell_types)
215 |     coords <- matrix(nrow = n, ncol = 2)
216 |     # x is the cell index modulo grid_size, y is the integer quotient
217 |     for (cell in seq(1, n)) {
218 |       coords[cell, 1] <- cell %% grid_size
219 |       coords[cell, 2] <- cell %/% grid_size
220 |     }
221 |     coords
222 |   }
223 | ))
224 | plot_cell_loc(results, show.arrows = FALSE)
225 | ```
226 | 
227 | ## Spatial domains
228 | 
229 | Next, we demonstrate how to use a custom layout function to create spatial domains.
230 | We want to have three spatial domains in a layered layout, and we have four cell types.
231 | Each cell type has a different probability of being in each domain.
232 | 
233 | The following layout function will do this job: First of all, it generates a set of locations that form a circular shape.
234 | Next, it assigns cells to these locations; the leftmost cell is selected as the origin.
235 | Then, we can create a layered layout by sorting the locations based on their Euclidean distance to the origin.
236 | The three domains are determined by the distance to the origin.
237 | We have a matrix `ct_matrix` that specifies the probability of each cell type being in each domain.
238 | Finally, we sample the cells based on the probabilities and assign them to the domains.
239 | 
240 | ```{r layout-domains}
241 | layout_fn <- function(grid_size, final_types) {  # custom layout: returns the location matrix `locs`; also sets `new_center`, `locs`, `zone_gt` via `<<-`
242 |   ncells <- length(final_types)
243 |   grid_center <- c(round(grid_size / 2), round(grid_size / 2))
244 |   all_locs <- gen_clutter(ncells, grid_size, grid_center)  # candidate locations; presumably a cluster around grid_center (confirm in gen_clutter docs)
245 |   # origin: among the locations with the smallest x, the one with the smallest y (assigned with `<<-` so later code can read it)
246 |   left_ones <- which(all_locs[,1] == min(all_locs[,1]))
247 |   new_center <<- all_locs[left_ones[which.min(all_locs[left_ones, 2])],]
248 |   dist_to_center <- sqrt(colSums((t(all_locs) - new_center)^2))  # Euclidean distance of every location to the origin
249 |   new_locs <- all_locs[order(dist_to_center),]  # locations sorted from nearest to farthest
250 |   # prob of a cell type being in a zone (cell_type x zone)
251 |   ct_matrix <- matrix(c(
252 |     0.9, 0.1, 0.0,
253 |     0.1, 0.8, 0.1,
254 |     0.1, 0.7, 0.2,
255 |     0.0, 0.1, 0.9
256 |   ), nrow = 4, byrow = TRUE)
257 |   # number of cells per type (sums to 480; any gap to ncells is added to pop_mtx[1, 1] below)
258 |   ct_pop <- c(160, 80, 100, 140)
259 |   pop_mtx <- round(ct_matrix * ct_pop)  # row i of ct_matrix is scaled by ct_pop[i]: cell counts per (type, zone)
260 |   if (sum(pop_mtx) != ncells) {
261 |     diffrence <- ncells - sum(pop_mtx)
262 |     pop_mtx[1, 1] <- pop_mtx[1, 1] + diffrence
263 |   }
264 |   # number of cells per zone
265 |   zone_pop <- colSums(pop_mtx)
266 |   # cumulative zone sizes (not used below)
267 |   cs <- cumsum(zone_pop)
268 |   # for each zone, shuffle the type labels of the cells assigned to it; zones are concatenated nearest-first
269 |   cell_idx <- unlist(lapply(1:3, function(izone) {
270 |     sample(rep(1:4, pop_mtx[,izone]), zone_pop[izone])
271 |   }))
272 |   locs <<- new_locs[order(cell_idx),]  # reorder the locations so they are grouped by type label (1 to 4)
273 |   zone_gt <<- rep(1:3, zone_pop)[order(cell_idx)]  # zone of each returned location, in the same order
274 |   return(locs)
275 | }
276 | ```
277 | 
278 | Inspecting the result, we can see the three spatial domains, where the middle one contains a mix of two cell types.
279 | 
280 | ```{r layout-domains-plot, fig.width=6, fig.height=6}
281 | results <- sim_true_counts(list(
282 |   num.cells = 500,
283 |   num.genes = 300,
284 |   num.cifs = 40,
285 |   GRN = NA,
286 |   speed.up = T,
287 |   cif.sigma = 0.8,
288 |   tree = ape::read.tree(text = "(A:1,B:1,C:1,D:1);"),
289 |   diff.cif.fraction = 0.8,
290 |   discrete.cif = T,
291 |   discrete.pop.size = as.integer(c(120,150,100,130)),
292 |   cci = list(
293 |     params = lig_params,
294 |     max.neighbors = 4,
295 |     start.layer = 500,
296 |     cell.type.interaction = "random",
297 |     layout = layout_fn,
298 |     step.size = 1
299 |   )
300 | ))
301 | 
302 | plot_cell_loc(results, show.arrows = FALSE)
303 | ```
304 | 
305 | ## Spatially variable genes
306 | 
307 | The `ext.cif.giv` option allows us to append custom CIF and GIV entries for each cell and gene.
308 | We can use this option to simulate spatially variable genes.
309 | This option should be a function that takes the kinetic parameter index and returns a list of extra CIF and GIV matrices.
310 | 
311 | ```{r}
312 | scmultisim_help("ext.cif.giv")
313 | ```
314 | 
315 | Using the previous layout function, we can add extra CIF with value based on the distance to the origin.
316 | 
317 | ```{r}
318 | ext_cif <- function(i) {  # i: kinetic parameter index; returns list(extra CIF, extra GIV) for i == 3, NULL otherwise
319 |   # We manually set genes 290-300 to be spatially variable
320 |   spatial_genes <- 290:300
321 |   dist_to_center <- colSums((t(locs) - new_center)^2)  # squared distance of each location in `locs` to `new_center` (both set by layout_fn)
322 |   dist_to_center <- dist_to_center / max(dist_to_center)  # rescaled so the largest value is 1
323 |   # 3 is the s parameter
324 |   if (i == 3) {
325 |     # n_cells x n_extra_cif (500 x 2): one column per extra CIF
326 |     ex_cif <- cbind(
327 |       # both CIFs have large values where the rescaled squared distance is near 0.5
328 |       rnorm(500, 0.5 * dnorm(abs(dist_to_center - 0.5), 0, 0.04), 0.02),
329 |       rnorm(500, 0.5 * dnorm(abs(dist_to_center - 0.5), 0, 0.04), 0.02)
330 |     )
331 |     # n_genes x n_extra_cif (300 x 2); rows of the non-spatial genes stay 0
332 |     ex_giv <- matrix(0, nrow = 300, ncol = 2)
333 |     for (gene in spatial_genes) {
334 |       # each spatial gene gets its own random weight (mean 1, sd 0.5) on both extra CIFs
335 |       ex_giv[gene, ] <- rnorm(2, 1, 0.5)
336 |     }
337 |     list(ex_cif, ex_giv * 2)
338 |   } else {
339 |     NULL
340 |   }
341 | }
342 | ```
343 | 
344 | ```{r}
345 | results <- sim_true_counts(list(
346 |   num.cells = 500,
347 |   num.genes = 300,
348 |   num.cifs = 40,
349 |   GRN = NA,
350 |   speed.up = T,
351 |   cif.sigma = 0.8,
352 |   tree = ape::read.tree(text = "(A:1,B:1,C:1,D:1);"),
353 |   diff.cif.fraction = 0.8,
354 |   ext.cif.giv = ext_cif,
355 |   discrete.cif = T,
356 |   discrete.pop.size = as.integer(c(120,150,100,130)),
357 |   cci = list(
358 |     params = lig_params,
359 |     max.neighbors = 4,
360 |     start.layer = 500,
361 |     cell.type.interaction = "random",
362 |     layout = layout_fn,
363 |     step.size = 1
364 |   )
365 | ))
366 | ```
367 | 
368 | Try plotting one of the spatially variable genes. We can see that the gene expression is higher in the specific spatial
369 | region.
370 | ```{r spatially-variable-gene, fig.width=6, fig.height=6}
371 | library(ggplot2)
372 | 
373 | plot_cell_loc(results, show.arrows = FALSE,
374 |               .cell.pop = log(results$counts[299,] + 1)) + scale_colour_viridis_c()
375 | ```
376 | 
377 | ## Long-distance Cell-Cell Interactions
378 | 
379 | scMultiSim also supports simulation of long-distance cell-cell interactions.
380 | 
381 | The CCI option `radius` controls the maximum distance between two cells for them to interact.
382 | It can be a number or a string.
383 | When it is a number, it specifies the maximum distance.
384 | When it is a string it should be in the format `gaussian:sigma`, for example, `gaussian:1.2`.
385 | In this case, the probability of two cells interacting is proportional to the distance with a Gaussian kernel applied.
386 | 
387 | By default, `radius = 1`, which means scMultiSim only considers the four nearest neighbors.
388 | 
389 | We can compare the result with different sigma values 1 and 3:
390 | 
391 | ```{r long-distance-cci}
392 | 
393 | options <- lapply(c(1, 3), \(sigma) {
394 |   list(
395 |     rand.seed = 1,
396 |     GRN = NA,
397 |     num.genes = 200,
398 |     num.cells = 500,
399 |     num.cifs = 50,
400 |     tree = Phyla5(),
401 |     discrete.cif = T,
402 |     discrete.min.pop.size = 20,
403 |     discrete.pop.size = as.integer(c(110, 80, 140, 40, 130)),
404 |     do.velocity = F,
405 |     scale.s = 1,
406 |     cci = list(
407 |       params = lig_params,
408 |       max.neighbors = 4,
409 |       cell.type.interaction = "random",
410 |       cell.type.lr.pairs = 3:6,
411 |       step.size = 0.3,
412 |       grid.size = 35,
413 |       start.layer = 500,
414 |       radius = paste0("gaussian:", sigma),
415 |       layout = "layers"
416 |     )
417 |   )
418 | 
419 | })
420 | 
421 | results_1 <- sim_true_counts(options[[1]])
422 | results_3 <- sim_true_counts(options[[2]])
423 | 
424 | ```
425 | 
426 | ```{r plot-long-distance-cci, fig.width=6, fig.height=6}
427 | plot_cell_loc(results_1, show.arrows = T, .cell.pop = as.character(results_1$grid$final_types))  # colour by this simulation's own cell types
428 | plot_cell_loc(results_3, show.arrows = T, .cell.pop = as.character(results_3$grid$final_types))  # colour by this simulation's own cell types
429 | ```
430 | 
431 | ## Session Information
432 | 
433 | ```{r session-info}
434 | sessionInfo()
435 | ```
436 | 


--------------------------------------------------------------------------------
/vignettes/workflow.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "1. Getting Started"
  3 | output:
  4 |     BiocStyle::html_document:
  5 |         toc: true
  6 |         toc_depth: 2
  7 | vignette: >
  8 |     %\VignetteEngine{knitr::knitr}
  9 |     %\VignetteIndexEntry{1. Getting Started}
 10 |     %\usepackage[UTF-8]{inputenc}
 11 | ---
 12 | 
 13 | scMultiSim is a simulation tool for single-cell multi-omics data.
 14 | It can simulate RNA counts, ATAC-seq data, RNA velocity,
 15 | and spatial locations of continuous or discrete cell populations.
 16 | It can model the effects of gene regulatory networks (GRN), chromatin accessibility,
 17 | and cell-cell interactions on the simulated data.
 18 | 
 19 | This article introduces the basic workflow of `scMultiSim`.
 20 | 
 21 | ```{r, include = FALSE}
 22 | knitr::opts_chunk$set(
 23 |   collapse = TRUE,
 24 |   comment = "#>"
 25 | )
 26 | ```
 27 | 
 28 | # Installation
 29 | 
 30 | It is recommended to install `scMultiSim` from Bioconductor with:
 31 | ```R
 32 | if (!requireNamespace("BiocManager", quietly = TRUE))
 33 |     install.packages("BiocManager")
 34 | BiocManager::install("scMultiSim")
 35 | ```
 36 | 
 37 | You can also install the development version of `scMultiSim` from GitHub with:
 38 | ```R
 39 | devtools::install_github("ZhangLabGT/scMultiSim@main")
 40 | ```
 41 | 
 42 | # Running Simulation
 43 | 
 44 | Once installed, you can load the package with:
 45 | 
 46 | ```{r setup}
 47 | library(scMultiSim)
 48 | ```
 49 | 
 50 | A typical workflow consists of two main steps:
 51 | 
 52 | 1. Simulate the true counts;
 53 | 2. Add technical noise and batch effects to the dataset.
 54 | 
 55 | The `sim_true_counts` function generates the true counts.
 56 | It accepts a list of options as input.
 57 | You are able to control most of the simulated effects here.
 58 | 
 59 | ```{r true-counts}
 60 | data(GRN_params_100)
 61 | 
 62 | results <- sim_true_counts(list(
 63 |   # required options
 64 |   GRN = GRN_params_100,
 65 |   tree = Phyla3(),
 66 |   num.cells = 500,
 67 |   # optional options (the CIF count option is spelled `num.cifs`)
 68 |   num.cifs = 20,
 69 |   discrete.cif = FALSE,
 70 |   cif.sigma = 0.1
 71 |   # ... other options
 72 | ))
 73 | ```
 74 | 
 75 | scMultiSim requires users to provide the following options:
 76 | 
 77 | - `GRN`: The Gene Regulatory Network.
 78 | - `tree`: The cell differentiation tree.
 79 | 
 80 | Typically, you may also want to adjust the following options to control other important factors:
 81 | 
 82 | - `num.cells`: Specify the number of cells.
 83 | - `unregulated.gene.ratio` or `num.genes`: Control the total number of genes.
 84 | - `discrete.cif`: Whether to generate a discrete or continuous cell population.
 85 | - `diff.cif.fraction`: Control the contribution of the trajectory/cluster specified by the tree.
 86 | - `cif.sigma`: Control the variation of cells along the trajectory.
 87 | 
 88 | The [Simulating Multimodal Single-Cell Data](https://zhanglabgt.github.io/scMultiSim/articles/basics.html)
 89 | tutorial will introduce these functions in more detail,
 90 | including how to simulate RNA velocity data and ATAC-seq data.
 91 | The [Simulating Spatial Cell-Cell Interactions](https://zhanglabgt.github.io/scMultiSim/articles/spatialCCI.html)
 92 | tutorial will focus on simulating spatial cell locations and cell-cell interactions.
 93 | You may also want to check the [Parameter Guide](https://zhanglabgt.github.io/scMultiSim/articles/options.html)
 94 | or run the `scmultisim_help()` function for a complete list of options.
 95 | 
 96 | ## The Shiny app
 97 | 
 98 | Don't forget that scMultiSim provides a Shiny app to help you explore the options interactively.
 99 | Simply run `run_shiny()` to start the app.
100 | 
101 | ```{r run-shiny, eval = FALSE}
102 | run_shiny()
103 | ```
104 | 
105 | 
106 | ## Add technical noise and batch effect
107 | 
108 | You can use `add_expr_noise` to add technical noise to the dataset, and `divide_batches` to add batch effects.
109 | 
110 | ```{r technical-noise}
111 | add_expr_noise(results)
112 | divide_batches(results, nbatch = 2)
113 | ```
114 | 
115 | ## Visualize the results
116 | 
117 | scMultiSim provides various visualization functions to help you understand the simulated data.
118 | 
119 | For example, `plot_tsne()` visualizes the cells using t-SNE.
120 | 
121 | ```{r visualize}
122 | plot_tsne(results$counts, results$cell_meta$pop)
123 | ```
124 | 


--------------------------------------------------------------------------------