├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── Meta └── vignette.rds ├── NAMESPACE ├── NEWS.md ├── R ├── 0_opts.R ├── 1.1_cif.R ├── 1_main.R ├── 2_sim.R ├── 3.1_spatial.R ├── 3.2_dyngrn.R ├── 6_technoise.R ├── 7_benchmark.R ├── 8_utils.R ├── 9.1_shiny.R ├── 9_meta.R ├── data.R ├── imports.R └── results.R ├── README.md ├── _pkgdown.yml ├── data ├── GRN_params_100.RData ├── GRN_params_1139.RData ├── dens_nonzero.RData ├── gene_len_pool.RData ├── len2nfrag.RData └── param_realdata.zeisel.imputed.RData ├── inst ├── extdata │ ├── Newick_ABCDE.txt │ └── Newick_animals.txt └── shiny-app │ ├── app.R │ └── www │ ├── .prettierrc │ ├── index.html │ ├── options.js │ ├── output.js │ ├── phyla1.png │ ├── phyla3.png │ ├── phyla5.png │ ├── scm_logo.png │ ├── style.css │ └── validate.js ├── man ├── GRN_params_100.Rd ├── GRN_params_1139.Rd ├── Get_1region_ATAC_correlation.Rd ├── Get_ATAC_correlation.Rd ├── OP.Rd ├── Phyla1.Rd ├── Phyla3.Rd ├── Phyla5.Rd ├── SampleDen.Rd ├── True2ObservedATAC.Rd ├── True2ObservedCounts.Rd ├── add_expr_noise.Rd ├── add_outliers.Rd ├── cci_cell_type_params.Rd ├── dens_nonzero.Rd ├── divide_batches.Rd ├── dot-amplifyOneCell.Rd ├── dot-calAmpBias.Rd ├── dot-continuousCIF.Rd ├── dot-divideBatchesImpl.Rd ├── dot-expandToBinary.Rd ├── dot-getCountCorrMatrix.Rd ├── dot-getParams.Rd ├── dot-normalizeGRNParams.Rd ├── dot-rnormTrunc.Rd ├── gen_1branch.Rd ├── gen_clutter.Rd ├── gene_corr_cci.Rd ├── gene_corr_regulator.Rd ├── gene_len_pool.Rd ├── len2nfrag.Rd ├── match_params.Rd ├── plot_cell_loc.Rd ├── plot_gene_module_cor_heatmap.Rd ├── plot_grid.Rd ├── plot_grn.Rd ├── plot_phyla.Rd ├── plot_rna_velocity.Rd ├── plot_tsne.Rd ├── run_shiny.Rd ├── scmultisim_help.Rd ├── sim_example.Rd ├── sim_example_spatial.Rd ├── sim_true_counts.Rd └── spatialGrid-class.Rd ├── pkgdown └── extra.css ├── tests ├── testthat.R └── testthat │ └── test-1_main.R └── vignettes ├── .gitignore ├── basics.Rmd ├── options.Rmd 
├── spatialCCI.Rmd └── workflow.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^doc$ 4 | ^bench$ 5 | ^figures$ 6 | ^Meta$ 7 | ^\.vscode$ 8 | ^\.idea$ 9 | ^tmp$ 10 | ^_pkgdown\.yml$ 11 | ^docs$ 12 | ^pkgdown$ 13 | ^\.github$ 14 | ^vignettes/articles$ 15 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown.yaml 13 | 14 | permissions: read-all 15 | 16 | jobs: 17 | pkgdown: 18 | runs-on: ubuntu-latest 19 | # Only restrict concurrency for non-PR jobs 20 | concurrency: 21 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 22 | env: 23 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - uses: r-lib/actions/setup-pandoc@v2 30 | 31 | - uses: r-lib/actions/setup-r@v2 32 | with: 33 | use-public-rspm: true 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::. 
38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Deploy to GitHub pages 🚀 45 | if: github.event_name != 'pull_request' 46 | uses: JamesIves/github-pages-deploy-action@v4.5.0 47 | with: 48 | clean: false 49 | branch: gh-pages 50 | folder: docs 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/r,macos 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=r,macos 4 | 5 | ### macOS ### 6 | # General 7 | .DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | 15 | # Thumbnails 16 | ._* 17 | 18 | # Files that might appear in the root of a volume 19 | .DocumentRevisions-V100 20 | .fseventsd 21 | .Spotlight-V100 22 | .TemporaryItems 23 | .Trashes 24 | .VolumeIcon.icns 25 | .com.apple.timemachine.donotpresent 26 | 27 | # Directories potentially created on remote AFP share 28 | .AppleDB 29 | .AppleDesktop 30 | Network Trash Folder 31 | Temporary Items 32 | .apdisk 33 | 34 | ### R ### 35 | # History files 36 | .Rhistory 37 | .Rapp.history 38 | 39 | # Session Data files 40 | .RData 41 | 42 | # User-specific files 43 | .Ruserdata 44 | 45 | # Example code in package build process 46 | *-Ex.R 47 | 48 | # Output files from R CMD build 49 | /*.tar.gz 50 | 51 | # Output files from R CMD check 52 | /*.Rcheck/ 53 | 54 | # RStudio files 55 | .Rproj.user/ 56 | 57 | # produced vignettes 58 | vignettes/*.html 59 | vignettes/*.pdf 60 | 61 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 62 | .httr-oauth 63 | 64 | # knitr and R markdown default cache directories 65 | *_cache/ 66 | /cache/ 67 | 68 | # Temporary files created by R markdown 69 | *.utf8.md 70 | *.knit.md 71 | 72 | # R Environment 
Variables 73 | .Renviron 74 | 75 | # pkgdown site 76 | docs/ 77 | 78 | # translation temp files 79 | po/*~ 80 | 81 | ### R.Bookdown Stack ### 82 | # R package: bookdown caching files 83 | /*_files/ 84 | 85 | # End of https://www.toptal.com/developers/gitignore/api/r,macos 86 | 87 | /temp 88 | /sim 89 | /bench 90 | /.idea 91 | vignettes/tmp.Rmd 92 | vignettes/common.Rmd 93 | /*.csv 94 | /*.zip 95 | /.vscode 96 | /tmp 97 | /doc/ 98 | /Meta/ 99 | /figures 100 | 101 | R/.Rhistory 102 | scMultiSim.Rproj 103 | docs 104 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: scMultiSim 2 | Title: Simulation of Multi-Modality Single Cell Data Guided By Gene Regulatory Networks and Cell-Cell Interactions 3 | Version: 1.1.10 4 | Authors@R: 5 | c(person(given = "Hechen", 6 | family = "Li", 7 | role = c("aut", "cre"), 8 | email = "hli691@gatech.edu", 9 | comment = c(ORCID = "0000-0003-4907-429X")), 10 | person(given = "Xiuwei", 11 | family = "Zhang", 12 | email = "zhangxiuwei03@gmail.com", 13 | role = "aut"), 14 | person(given = "Ziqi", 15 | family = "Zhang", 16 | email = "ziqi.zhang@gatech.edu", 17 | role = "aut"), 18 | person(given = "Michael", 19 | family = "Squires", 20 | email = "squiresmf@gatech.edu", 21 | role = "aut")) 22 | Description: 23 | scMultiSim simulates paired single cell RNA-seq, single cell ATAC-seq and RNA velocity data, 24 | while incorporating mechanisms of gene regulatory networks, chromatin accessibility and 25 | cell-cell interactions. It allows users to tune various parameters controlling the 26 | amount of each biological factor, variation of gene-expression levels, 27 | the influence of chromatin accessibility on RNA sequence data, and so on. 28 | It can be used to benchmark various computational methods for single cell multi-omics data, 29 | and to assist in experimental design of wet-lab experiments. 
30 | License: Artistic-2.0 31 | Encoding: UTF-8 32 | RoxygenNote: 7.3.1 33 | Depends: 34 | R (>= 4.4.0) 35 | Imports: 36 | foreach, 37 | rlang, 38 | dplyr, 39 | ggplot2, 40 | Rtsne, 41 | ape, 42 | MASS, 43 | matrixStats, 44 | phytools, 45 | KernelKnn, 46 | gplots, 47 | zeallot, 48 | crayon, 49 | assertthat, 50 | igraph, 51 | methods, 52 | grDevices, 53 | graphics, 54 | stats, 55 | utils, 56 | markdown, 57 | SummarizedExperiment, 58 | BiocParallel 59 | Suggests: 60 | knitr, 61 | rmarkdown, 62 | roxygen2, 63 | shiny, 64 | testthat (>= 3.0.0) 65 | biocViews: SingleCell, Transcriptomics, GeneExpression, Sequencing, ExperimentalDesign 66 | VignetteBuilder: knitr 67 | Roxygen: list(markdown = TRUE) 68 | BugReports: https://github.com/ZhangLabGT/scMultiSim/issues 69 | URL: https://zhanglabgt.github.io/scMultiSim/ 70 | Config/testthat/edition: 3 71 | -------------------------------------------------------------------------------- /Meta/vignette.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/Meta/vignette.rds -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Get_1region_ATAC_correlation) 4 | export(Get_ATAC_correlation) 5 | export(Phyla1) 6 | export(Phyla3) 7 | export(Phyla5) 8 | export(True2ObservedATAC) 9 | export(True2ObservedCounts) 10 | export(add_expr_noise) 11 | export(add_outliers) 12 | export(cci_cell_type_params) 13 | export(divide_batches) 14 | export(gen_clutter) 15 | export(gene_corr_cci) 16 | export(gene_corr_regulator) 17 | export(plot_cell_loc) 18 | export(plot_gene_module_cor_heatmap) 19 | export(plot_grid) 20 | export(plot_grn) 21 | export(plot_phyla) 22 | export(plot_rna_velocity) 23 | export(plot_tsne) 24 | export(run_shiny) 25 | 
export(scmultisim_help) 26 | export(sim_example) 27 | export(sim_example_spatial) 28 | export(sim_true_counts) 29 | exportClasses(spatialGrid) 30 | import(ape) 31 | import(foreach) 32 | import(ggplot2) 33 | import(markdown) 34 | import(rlang) 35 | importFrom(BiocParallel,MulticoreParam) 36 | importFrom(BiocParallel,bplapply) 37 | importFrom(Rtsne,Rtsne) 38 | importFrom(SummarizedExperiment,SummarizedExperiment) 39 | importFrom(dplyr,"%>%") 40 | importFrom(stats,cor) 41 | importFrom(stats,density) 42 | importFrom(stats,dist) 43 | importFrom(stats,dnorm) 44 | importFrom(stats,hclust) 45 | importFrom(stats,median) 46 | importFrom(stats,na.omit) 47 | importFrom(stats,rbeta) 48 | importFrom(stats,rbinom) 49 | importFrom(stats,rnorm) 50 | importFrom(stats,rpois) 51 | importFrom(stats,runif) 52 | importFrom(stats,setNames) 53 | importFrom(utils,data) 54 | importFrom(utils,write.csv) 55 | importFrom(zeallot,"%->%") 56 | importFrom(zeallot,"%<-%") 57 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # scMultiSim 1.1.3 2 | 3 | - Added the Shiny app to help users visualize the effect of each parameter and adjust the simulation options. 4 | - Added the `speed.up` parameter to enable experimental speed optimization. 5 | - Bug fixes and improvements. 6 | 7 | # scMultiSim 1.1.0 8 | 9 | Prepare for the Bioconductor release 10 | 11 | - Fix build errors 12 | 13 | # scMultiSim 0.99.8 14 | 15 | Prepare for the Bioconductor release 16 | 17 | - Tidy up the code, add more comments 18 | 19 | # scMultiSim 0.99.7 20 | 21 | Prepare for the Bioconductor release 22 | 23 | - Fix build errors 24 | -------------------------------------------------------------------------------- /R/0_opts.R: -------------------------------------------------------------------------------- 1 | 2 | .default <- \(...) list(FALSE, as.character(enexprs(...)), ...) 
# ==============================================================================
# OPTIONS: each option should be a list(default, checker, description)
# ==============================================================================

# Build the table of all simulation options.
#
# The returned list mixes two kinds of elements:
#   * unnamed character strings ("GENERAL", "GENE", ...) that act as section
#     headers; .check_opt() filters them out and .print_opt() prints them as
#     "[SECTION]" titles;
#   * named entries, one per option, each a list of three items:
#       1. the default spec built by .default(...) (or .required),
#       2. a checker, either NULL (no validation) or list(predicate, message),
#          where predicate(value) must return a single TRUE/FALSE,
#       3. a human readable description.
#
# Every predicate must return a length-one logical, because .check_opt() uses
# it directly inside `if (!check(user_val))`.
#
# @return the list described above
.opt_list <- function() list(
  "GENERAL",
  rand.seed = list(
    .default(0),
    .should.be.int,
    "scMultiSim should produce the same result if all other parameters are the same."
  ),
  threads = list(
    .default(1),
    .should.be.int.between(1, 4096),
    "Set to larger than 1 to use multithreading for some part of the simulation."
  ),
  speed.up = list(
    .default(FALSE),
    .should.be.logical,
    "Use experimental speed and memory optimization."
  ),
  # ========================== Gene ============================================
  "GENE",
  GRN = list(
    .default(NULL),
    list(
      \(x) (length(x) == 1 && is.na(x)) || (is.data.frame(x) && ncol(x) >= 3 && is.numeric(x[[3]])),
      "It should be a data frame with 3 columns (target, regulator, effect). Supply NA to disable the GRN effect."
    ),
    "The GRN network."
  ),
  grn.effect = list(
    .default(1),
    .should.be.num.between(0, Inf),
    "Overall strength of the GRN effect on the expression. Different from the effect column in the GRN data frame, which is the relative effect of each TF-target pair."
  ),
  num.genes = list(
    .default(NULL),
    .should.be.int.between(1, Inf),
    "Number of genes if GRN is disabled."
  ),
  unregulated.gene.ratio = list(
    .default(0.1),
    .should.be.num.between(0, 1),
    "Ratio of unreulated to regulated genes. Extra unregulated genes will be simulated in addition to the genes in GRN."
  ),
  giv.mean = list(
    .default(0),
    .should.be.num.between(-Inf, Inf),
    "Mean of the Gene Identity Vectors."
  ),
  giv.prob = list(
    .default(0.3),
    .should.be.num.between(0, 1),
    "Probability of non-zero values in the Gene Identity Vectors."
  ),
  giv.sd = list(
    .default(1),
    .should.be.num.between(0, Inf),
    "Stddev of the Gene Identity Vectors."
  ),
  hge.range = list(
    .default(1),
    .should.be.num.between(1, Inf),
    "Only choose highly expressed genes after this range."
  ),
  hge.prop = list(
    .default(0),
    .should.be.num.between(0, 1),
    "Propotion of highly expressed genes."
  ),
  hge.mean = list(
    .default(5),
    .should.be.num.between(1, Inf),
    "Scale of highly expressed genes."
  ),
  hge.sd = list(
    .default(1),
    .should.be.num.between(0, Inf),
    "Variation of highly expressed genes."
  ),
  hge.max.var = list(
    .default(500),
    .should.be.num.between(0, Inf),
    "Genes with higher variation will not be selected as highly expressed genes."
  ),
  dynamic.GRN = list(
    .default(NA),
    NULL,
    "Specification of the dynamic GRN. See scmultisim_help(\"dynamic.GRN\") for details."
  ),
  # ========================== Cell ============================================
  "CELL",
  num.cells = list(
    .default(1000),
    .should.be.int.between(0, Inf),
    "Total number of cells from all populations."
  ),
  tree = list(
    .default(Phyla5()),
    NULL,
    "A tree defining relationship between populations."
  ),
  discrete.cif = list(
    .default(FALSE),
    .should.be.logical,
    "Whether the cell population is discrete."
  ),
  discrete.pop.size = list(
    .default(NA),
    list(
      \(x) (length(x) == 1 && is.na(x)) || all(is.integer(x)),
      "the value should be an integer vector"
    ),
    "Specify the cell numbers in each population."
  ),
  discrete.min.pop.size = list(
    .default(70),
    .should.be.int,
    "Size of the smallest discrete cell population."
  ),
  discrete.min.pop.index = list(
    .default(1),
    .should.be.int.between(0, Inf),
    "Index of the smallest discrete cell population."
  ),
  num.cifs = list(
    .default(50),
    .should.be.int,
    "Number of Cell Identity Factors for each kinetic parameter."
  ),
  diff.cif.fraction = list(
    .default(0.9),
    .should.be.num.between(0, 1),
    "Fraction of CIFs which are differential factors between cell types."
  ),
  cif.center = list(
    .default(1),
    .should.be.num,
    "Mean of the CIF values."
  ),
  cif.sigma = list(
    .default(0.1),
    .should.be.num.between(0, Inf),
    "Stddev of the CIF values."
  ),
  use.impulse = list(
    .default(FALSE),
    .should.be.logical,
    "Use the impulse model when generating the continuous CIF."
  ),
  # ========================== ATAC ============================================
  "SIMULATION - ATAC",
  atac.effect = list(
    .default(0.5),
    .should.be.num.between(0, 1),
    "The influence of chromatin accessability data on gene expression."
  ),
  region.distrib = list(
    .default(c(0.1, 0.5, 0.4)),
    list(
      # Reduce the element-wise comparison with all(): `&&` needs a scalar on
      # each side and raises an error for a length-3 operand on R >= 4.3.
      # The sum is compared with a tolerance so that valid probability vectors
      # are not rejected because of floating-point round-off.
      \(x) is.numeric(x) && length(x) == 3 && all(x > 0) && isTRUE(all.equal(sum(x), 1)),
      "the value should be a vector with 3 elements sum to 1"
    ),
    "The probability that a gene is regulated by respectively 0, 1, 2 consecutive regions."
  ),
  atac.p_zero = list(
    .default(0.8),
    NULL,
    "The proportion of 0s we see in the ATAC-seq data."
  ),
  atac.density = list(
    .default(NA),
    list(
      # inherits() always yields one logical, whereas class(x) == "density"
      # has one element per class and breaks the `if` in .check_opt()
      \(x) inherits(x, "density"),
      "the value should be a density object."
    ),
    "Density of the non-zero ATAC-seq values. Use atac_dens_nonzero() to generate."
  ),
  riv.mean = list(
    .default(0),
    .should.be.num.between(0, Inf),
    "Mean of the Region Identity Vectors."
  ),
  riv.prob = list(
    .default(0.3),
    .should.be.num.between(0, 1),
    "Probability of non-zero values in the Region Identity Vectors."
  ),
  riv.sd = list(
    .default(1),
    .should.be.num.between(0, Inf),
    "Stddev of the Region Identity Vectors."
  ),
  # ========================== Simulation ======================================
  "SIMULATION - RNA",
  vary = list(
    .default("s"),
    .choose_from("all", "kon", "koff", "s", "except_kon", "except_koff", "except_s"),
    "Which kinetic parameters have differential CIFs."
  ),
  bimod = list(
    .default(0),
    .should.be.num.between(0, 1),
    "Adjust the bimodality of gene expression, thus controlling intrinsic variation."
  ),
  scale.s = list(
    .default(1),
    NULL,
    "Scale of the s parameter. Use smaller value for cell types known to be small (like naive cells). When discrete.cif = T, it can be a vector specifying the scale.s for each cluster."
  ),
  intrinsic.noise = list(
    .default(1),
    .should.be.num.between(0, 1),
    "The weight assigned to the random sample from the Beta-Poisson distribution, where the weight of the Beta-Poisson mean value is given a weight of (1 - intrinsic.noise)."
  ),
  # ========================== Kinetic Model ===================================
  "SIMULATION - KINETIC MODEL",
  do.velocity = list(
    .default(FALSE),
    .should.be.logical,
    "Simulate using the whole kinetic model and generate RNA velocity data."
  ),
  beta = list(
    .default(0.4),
    .should.be.num,
    "Splicing rate of each gene in the kinetic model."
  ),
  d = list(
    .default(1),
    .should.be.num,
    "Degradation rate of each gene in the kinetic model."
  ),
  num.cycles = list(
    .default(3),
    .should.be.int.between(1, Inf),
    "For velocity mode, the number of cycles run before sampling the gene expression of a cell."),
  cycle.len = list(
    .default(1),
    .should.be.num.between(0, Inf),
    "For velocity mode, a factor multiplied by the expected time to transition from kon to koff and back to form the the length of a cycle."
  ),
  mod.cif.giv = list(
    .default(NA),
    list(
      is.function, "should be a function"
    ),
    "Modify the generated CIF and GIV. The function takes four arguments: the kinetic parameter index (1=kon, 2=koff, 3=s), the current CIF matrix, the GIV matrix, and the cell metadata dataframe. It should return a list of two elements: the modified CIF matrix and the modified GIV matrix."
  ),
  ext.cif.giv = list(
    .default(NA),
    list(
      is.function, "should be a function"
    ),
    "Add customized CIF and GIV. The function takes one argument, the kinetic parameter index (1=kon, 2=koff, 3=s). It should return a list of two elements: the extra CIF matrix (n_extra_cif x n_cells) and the GIV matrix (n_genes x n_extra_cif). Return NULL for no extra CIF and GIV."
  ),
  # ========================== Spatial =========================================
  "SIMULATION - SPATIAL",
  cci = list(
    .default(NA),
    list(
      \(x) is.list(x) && is.data.frame(x[["params"]]),
      "Enables cell-cell interaction. See scmultisim_help(\"cci\") for details."
    ),
    "The regulation network for spatial cell-cell interaction."
  )
)
#' Look up an option value in the caller's environment
#'
#' Fetches the object called `.name` from the environment of the calling
#' function and returns the element whose name is given by `...`.
#' An error is raised when that object has no element with this name.
#'
#' @param ... the parameter name
#' @param .name get option from this object
#' @keywords internal
#' @return the parameter value
OP <- function(..., .name = 'options') {
  opt_store <- get(.name, envir = caller_env())
  key <- as.character(dplyr::expr(...))
  if (key %in% names(opt_store)) {
    return(opt_store[[key]])
  }
  stop(sprintf("Option %s is required but not presented.", key))
}
# Entry point used by .continuousCIF() to build the CIF of a continuous
# population: picks the spatial or the non-spatial generator according to
# `is_spatial` and forwards every remaining argument to it unchanged.
.continuousCIFParam <- function(is_spatial, ...) {
  generator <- if (is_spatial) .continuousCIFParamSpatial else .continuousCIFParamNormal
  generator(...)
}
25 | ) { 26 | # paths: list of int vector, each path 27 | # cell_path: int vector, the path idx of each cell 28 | # path_len: int vector, the length of each path 29 | param_names <- c("kon", "koff", "s") 30 | 31 | sp_params %->% c( 32 | max_layers, paths, cell_path, path_len 33 | ) 34 | 35 | # nd and reg cif 36 | cif <- foreach(i_cell = seq(ncells)) %do% { 37 | i_path <- cell_path[i_cell] 38 | n_layers <- path_len[i_path] 39 | 40 | if (i_cell %% 100 == 0) cat(sprintf("%i..", i_cell)) 41 | # for each cell, generate n_layer x n_cif 42 | cif_cell <- lapply(seq_len(3), function(i) { 43 | param_name <- param_names[i] 44 | n_nd_cif <- N_nd.cif[i] 45 | n_diff_cif <- N_diff.cif[i] 46 | 47 | # nd cif 48 | nd_cif <- lapply(seq(n_nd_cif), \(icif) rnorm(n_layers, cif_center, cif_sigma)) %>% do.call(cbind, .) 49 | colnames(nd_cif) <- paste(param_name, "nonDE", seq(n_nd_cif), sep = "_") 50 | 51 | # diff cif 52 | need_diff_cif <- n_diff_cif > 0 53 | # for cell 1, output the diff_cif itself; for other cells, only output TRUE or FALSE 54 | diff_cif <- need_diff_cif 55 | if (need_diff_cif && i_cell == 1) { 56 | # diff cif is shared among all cell & layers; generate them lazily 57 | # make sure only generated once for kon, koff and s 58 | # n_layers x n_diff_cif 59 | # =============================================== COPY 60 | diff_cif <- if (use_impulse) { 61 | c(edges, root, tips, internal) %<-% tree_info 62 | # impulse model 63 | # pdf(file = .plot.name, width = 15, height = 5) 64 | tip <- rep(tips, ceiling(n_diff_cif / length(tips))) 65 | lapply(seq(n_diff_cif), function(cif_i) { 66 | impulse <- Impulsecifpertip(phyla, edges, root, tips, internal, neutral, tip[cif_i], cif_sigma, cif_center, step_size) 67 | # if (.plot) { PlotRoot2Leave(impulse, tips, edges, root, internal) } 68 | re_order <- match( 69 | apply(neutral[, seq_len(3)], 1, \(X) paste0(X, collapse = "_")), 70 | apply(impulse[, seq_len(3)], 1, \(X) paste0(X, collapse = "_")) 71 | ) 72 | return(impulse[re_order,]) 73 | }) 
74 | # dev.off() 75 | } else { 76 | # Gaussian sample 77 | lapply(seq(n_diff_cif), function(icif) { 78 | # supply neutral to have the same t_sample values for all cells 79 | SampleSubtree(tree_info$root, 0, cif_center, tree_info$edges, ncells, step_size, neutral = neutral)[, 4] 80 | }) %>% 81 | do.call(cbind, .) %>% 82 | .[seq(max_layers),] 83 | } 84 | colnames(diff_cif) <- paste(param_name, "DE", seq(n_diff_cif), sep = "_") 85 | # ================================================ COPY 86 | 87 | diff_cif 88 | } 89 | 90 | # reg cif 91 | reg_cif <- NULL 92 | if (i <= 2 && n_reg_cif > 0) { 93 | reg_cif <- lapply( 94 | seq(n_reg_cif), 95 | \(.) rnorm(n_layers, cif_center, cif_sigma) 96 | ) %>% do.call(cbind, .) 97 | colnames(reg_cif) <- paste(param_name, "reg", seq(n_reg_cif), sep = "_") 98 | } 99 | 100 | # TRUE if diff_cif is needed to be combined later 101 | list(nd = nd_cif, diff = diff_cif, reg = reg_cif) 102 | }) 103 | 104 | setNames(cif_cell, param_names) 105 | } 106 | 107 | cat("Done\n") 108 | # gather diff_cif 109 | diff_cif_all <- list(NULL, NULL, NULL) 110 | for (i in seq_len(3)) { 111 | d_cif <- cif[[1]][[i]]$diff 112 | if (!is.logical(d_cif)) { 113 | # if this param has diff cif, move it to diff_cif_all and replace it as FALSE 114 | diff_cif_all[[i]] <- d_cif 115 | cif[[1]][[i]]$diff <- TRUE 116 | } 117 | } 118 | 119 | # get the index on each path 120 | neutral <- neutral[seq(max_layers),] 121 | layer_idx_by_path <- lapply(paths, function(path) { 122 | idx <- integer() 123 | for (i in seq(length(path) - 1)) { 124 | a <- path[i] 125 | b <- path[i + 1] 126 | idx <- c(idx, which(neutral[, 1] == a & neutral[, 2] == b)) 127 | } 128 | idx 129 | }) 130 | 131 | # now process diff cif 132 | diff_cif_by_path <- lapply(diff_cif_all, function(d_cif) { 133 | lapply(seq_along(paths), function(i_path) { 134 | if (is.null(d_cif)) return(NULL) 135 | d_cif[layer_idx_by_path[[i_path]],] 136 | }) 137 | }) 138 | names(diff_cif_by_path) <- param_names 139 | 140 | # cell types & 
# Generate the CIF for a continuous population, when spatial mode is disabled.
#
# For each kinetic parameter (kon, koff, s) a matrix with `ncells` rows is
# assembled by column-binding, in this order:
#   1. N_nd.cif[i] non-differential CIFs, each drawn from
#      rnorm(ncells, cif_center, cif_sigma)      (columns "<param>_nonDE_<k>");
#   2. N_diff.cif[i] differential CIFs, from the impulse model or from
#      SampleSubtree()                            (columns "<param>_DE_<k>");
#   3. for kon and koff only, n_reg_cif regulator CIFs drawn like the
#      non-differential ones                      (columns "<param>_reg_<k>").
# A group whose requested size is 0 contributes no column.
#
# @param ncells number of cells (rows of every returned matrix)
# @param N_nd.cif,N_diff.cif length-3 counts of non-differential/differential
#   CIFs for kon, koff and s
# @param n_reg_cif number of regulator CIFs (used for kon and koff)
# @param cif_center,cif_sigma mean and sd of the Gaussian CIF values
# @param step_size,neutral,phyla,tree_info forwarded to SampleSubtree() /
#   Impulsecifpertip()
# @param use_impulse use the impulse model for the differential CIFs
# @param ... ignored
# @return a list named kon, koff, s holding the CIF matrices
.continuousCIFParamNormal <- function(
  ncells, N_nd.cif, N_diff.cif, n_reg_cif,
  cif_center, cif_sigma, step_size,
  neutral, phyla, tree_info,
  use_impulse, ...
) {
  param_names <- c("kon", "koff", "s")

  # ncells x n matrix of Gaussian CIFs, or NULL when n is 0.
  # seq_len() is required here: seq(0) is c(1, 0) and would silently create
  # two columns when none were requested.
  gaussian_cif <- function(n, prefix) {
    if (n <= 0) return(NULL)
    m <- do.call(cbind, lapply(seq_len(n), \(icif) rnorm(ncells, cif_center, cif_sigma)))
    colnames(m) <- paste(prefix, seq_len(n), sep = "_")
    m
  }

  cif <- lapply(seq_len(3), function(i) {
    param_name <- param_names[i]
    n_nd_cif <- N_nd.cif[i]
    n_diff_cif <- N_diff.cif[i]

    # start from an empty ncells x 0 matrix so that the result is always a
    # matrix, whichever groups are present
    cifs <- matrix(numeric(0), nrow = ncells, ncol = 0)

    # ========== nd_cif ==========
    cifs <- cbind(cifs, gaussian_cif(n_nd_cif, paste(param_name, "nonDE", sep = "_")))

    # ========== de_cif ==========
    if (n_diff_cif > 0) {
      # generate de_cif if there exist de_cifs for the parameter we are looking at
      diff_cif <- if (use_impulse) {
        c(edges, root, tips, internal) %<-% tree_info
        # impulse model
        # NOTE(review): this branch returns a list of row-reordered
        # Impulsecifpertip() results that is never combined into a matrix, so
        # the colnames<- below looks like it cannot succeed - confirm the
        # intended column of `impulse` before relying on use_impulse = TRUE.
        tip <- rep(tips, ceiling(n_diff_cif / length(tips)))
        lapply(seq_len(n_diff_cif), function(cif_i) {
          impulse <- Impulsecifpertip(phyla, edges, root, tips, internal, neutral, tip[cif_i], cif_sigma, cif_center, step_size)
          re_order <- match(
            apply(neutral[, seq_len(3)], 1, \(X) paste0(X, collapse = "_")),
            apply(impulse[, seq_len(3)], 1, \(X) paste0(X, collapse = "_"))
          )
          return(impulse[re_order,])
        })
      } else {
        # Gaussian sample
        sampled <- lapply(seq_len(n_diff_cif), function(icif) {
          # supply neutral to have the same t_sample values for all cells
          SampleSubtree(tree_info$root, 0, cif_center, tree_info$edges, ncells, step_size, neutral = neutral)[, 4]
        })
        # drop = FALSE keeps a one-column matrix when n_diff_cif == 1
        do.call(cbind, sampled)[seq_len(ncells), , drop = FALSE]
      }
      colnames(diff_cif) <- paste(param_name, "DE", seq_len(n_diff_cif), sep = "_")
      cifs <- cbind(cifs, diff_cif)
    }

    # ========== generate reg_cif for k_on, k_off ===========
    if (i <= 2) {
      cifs <- cbind(cifs, gaussian_cif(n_reg_cif, paste(param_name, "reg", sep = "_")))
    }

    cifs
  })

  names(cif) <- param_names
  cif
}
250 | ) { 251 | # set.seed(seed) 252 | param_names <- c("kon", "koff", "s") 253 | 254 | phyla <- OP("tree") 255 | cif_center <- OP("cif.center") 256 | cif_sigma <- OP("cif.sigma") 257 | user_popsize <- OP("discrete.pop.size") 258 | min_popsize <- OP("discrete.min.pop.size") 259 | i_minpop <- OP("discrete.min.pop.index") 260 | 261 | npop <- length(phyla$tip.label) 262 | if (!is.null(sim$ncells_pop)) { 263 | ncells_pop <- sim$ncells_pop 264 | } else if (npop == 1) { 265 | ncells_pop <- N$cell 266 | } else if (is.integer(user_popsize)) { 267 | stopifnot(length(user_popsize) == npop) 268 | stopifnot(sum(user_popsize) == N$cell) 269 | ncells_pop <- user_popsize 270 | } else { 271 | ncells_pop <- rep(min_popsize, npop) 272 | if (N$cell < min_popsize * npop) { 273 | stop(sprintf( 274 | "The size of the smallest population (%g * %g) is too big for the total number of cells (%g)", 275 | min_popsize, npop, N$cell)) 276 | } 277 | 278 | larger_pops <- setdiff(seq(npop), i_minpop) 279 | ncells_pop[larger_pops] <- floor((N$cell - min_popsize) / length(larger_pops)) 280 | leftover <- N$cell - sum(ncells_pop) 281 | if (leftover > 0) { 282 | temp <- sample(larger_pops, leftover, replace = FALSE) 283 | ncells_pop[temp] <- ncells_pop[temp] + 1 284 | } 285 | } 286 | 287 | if (is.null(sim$ncells_pop)) { 288 | sim$ncells_pop <- ncells_pop 289 | } 290 | 291 | vcv_evf_mean <- vcv.phylo(phyla, corr = TRUE) 292 | param_name <- c("kon", "koff", "s") 293 | 294 | # nd and reg cif 295 | cif <- foreach(i_cell = seq(N$cell)) %do% { 296 | # === each cell === 297 | n_layers <- N$cell 298 | 299 | # for each cell, generate n_layer x n_cif 300 | cif_cell <- lapply(seq_len(3), function(i) { 301 | param_name <- param_names[i] 302 | n_nd_cif <- N$nd.cif[i] 303 | n_diff_cif <- N$diff.cif[i] 304 | need_diff_cif <- n_diff_cif > 0 305 | 306 | # nd cif 307 | nd_cif <- lapply(seq(n_nd_cif), \(icif) rnorm(n_layers, cif_center, cif_sigma)) %>% do.call(cbind, .) 
308 | colnames(nd_cif) <- paste(param_name, "nonDE", seq(n_nd_cif), sep = "_") 309 | 310 | # reg cif 311 | reg_cif <- NULL 312 | if (i <= 2 && N$reg_cif > 0) { 313 | reg_cif <- lapply( 314 | seq(N$reg_cif), 315 | \(.) rnorm(n_layers, cif_center, cif_sigma) 316 | ) %>% do.call(cbind, .) 317 | colnames(reg_cif) <- paste(param_name, "reg", seq(N$reg_cif), sep = "_") 318 | } 319 | 320 | list(nd = nd_cif, diff = need_diff_cif, reg = reg_cif) 321 | }) 322 | 323 | setNames(cif_cell, param_names) 324 | # === end: each cell === 325 | } 326 | 327 | 328 | # diff cif 329 | diff_cif <- lapply(seq_len(3), function(i) { 330 | n_diff_cif <- N$diff.cif[i] 331 | need_diff_cif <- n_diff_cif > 0 332 | if (need_diff_cif) { 333 | pop_diff_cif_mean <- MASS::mvrnorm(n_diff_cif, rep(cif_center, npop), vcv_evf_mean) 334 | dcif <- lapply(seq(npop), function(ipop) { 335 | evf <- vapply(seq(n_diff_cif), function(ievf) { 336 | rnorm(ncells_pop[ipop], pop_diff_cif_mean[ievf, ipop], cif_sigma) 337 | }, numeric(ncells_pop[ipop])) 338 | return(evf) 339 | }) %>% do.call(rbind, .) 
340 | colnames(dcif) <- rep("DE", n_diff_cif) 341 | dcif 342 | } else { 343 | NULL 344 | } 345 | }) 346 | diff_cif <- setNames(diff_cif, param_names) 347 | 348 | pop <- do.call(c, lapply(seq(npop), function(i) rep(i, ncells_pop[i]))) 349 | 350 | meta <- data.frame( 351 | pop = pop, cell.type = pop, cell.type.idx = pop 352 | ) 353 | 354 | list( 355 | cif = cif, 356 | meta = meta, 357 | diff_cif = diff_cif 358 | ) 359 | } 360 | 361 | 362 | # return (node_from, node_to, t, state) 363 | SampleEdge <- function(edge, depth, anc_state, edges, ncells, step_size, t_sample = NA) { 364 | if (is.na(t_sample[1])) { 365 | #t_sample <- c(0,sort( runif(round(edge[4]*ncells/sum(edges[,4])),0,edge[4]) )) 366 | branch_len <- edge[4] 367 | ncell_branch <- ceiling(branch_len * ncells / sum(edges[, 4])) - 1 368 | if (ncell_branch < 0) { stop("the total number of cells is too few.") } 369 | t_sample <- c(0, seq(0, branch_len, branch_len / ncell_branch)) 370 | t_sample <- c(t_sample, branch_len) 371 | } else { 372 | t_sample <- sort(c(0, t_sample - depth)) 373 | } 374 | t_interval <- diff(t_sample) 375 | x_change <- vapply(t_interval, function(sig) rnorm(1, 0, sqrt(sig)), 376 | numeric(1)) 377 | x_sample <- cumsum(x_change) 378 | col_time <- depth + t_sample[-1] 379 | col_state <- anc_state + x_sample 380 | # return 381 | cbind(edge[2], edge[3], col_time, col_state) 382 | } 383 | 384 | SampleSubtree <- function(par, depth, anc_state, edges, ncells, step_size, neutral = NA) { 385 | # get the children of the current node 386 | children <- edges[edges[, 2] == par, 3] 387 | result <- lapply(c(seq_along(children)), function(j) { 388 | edge <- edges[edges[, 2] == par & edges[, 3] == children[j],] # given the parent and child, find the edge 389 | if (sum(edges[, 2] == children[j]) == 0) { # this means the current node is a leaf 390 | if (is.na(neutral[1])) { 391 | result <- SampleEdge(edge, depth, anc_state, edges, ncells, step_size) 392 | } else { 393 | t_sample <- neutral[neutral[, 1] == 
edge[2] & neutral[, 2] == edge[3], 3]
        result <- SampleEdge(edge, depth, anc_state, edges, ncells, step_size, t_sample)
      }
      result <- result[c(seq(length(result[, 1] - 1))),]
    } else {
      if (is.na(neutral[1])) {
        result <- SampleEdge(edge, depth, anc_state, edges, ncells, step_size)
      } else {
        t_sample <- neutral[neutral[, 1] == edge[2] & neutral[, 2] == edge[3], 3]
        result <- SampleEdge(edge, depth, anc_state, edges, ncells, step_size, t_sample)
      }
      anc_state <- result[length(result[, 1]), 4]
      # !!! why this line
      result <- result[c(seq(length(result[, 1] - 1))),]
      depth <- depth + edge[4]
      result1 <- SampleSubtree(children[j], depth, anc_state, edges, ncells, step_size, neutral)
      result <- rbind(result, result1)
    }
    return(result)
  })
  result <- do.call(rbind, result)
  colnames(result) <- c("from", "to", "time", "state")
  rownames(result) <- NULL
  return(result)
}
--------------------------------------------------------------------------------
/R/3.2_dyngrn.R:
--------------------------------------------------------------------------------
# Reference class holding a gene regulatory network whose edges change over
# time. `geff` is the effect matrix that the methods below index by
# (row, column); `del_edges` / `gen_edges` are three-column matrices
# (row, col, weight) of the edges currently being faded out / faded in.
.dynGRN <- setRefClass("dynGRN", fields = c(
  "randseed", "randstate",
  "involved_genes",
  "geff", "params", "regulators", "targets", "n_tgt", "n_reg", "name_map",
  "del_edges", "gen_edges", "has_tf_edges",
  "max_steps", "remaining_steps", "remaining_cells",
  "cell_per_step", "n_edges", "n_changing_edges", "weight_mean", "weight_sd",
  "history", "version"
))

.dynGRN$methods(
  # Finish the previous round of edge deletions and draw a new set of edges to
  # delete (existing, non-zero) and to generate (currently zero).
  restructure = function() {
    # set all deleted edges' weights to 0
    # drop = FALSE keeps a one-row selection a 2-column matrix, so that geff is
    # indexed by (row, col) pairs and not by two linear indices.
    if (!is.null(del_edges)) {
      geff[del_edges[, seq_len(2), drop = FALSE]] <<- 0
    }
    # NOTE(review): positions found in `grn_region` are used below to index
    # `geff` directly; this is only equivalent when `involved_genes` selects
    # the leading rows of `geff` in order -- confirm against callers.
    grn_region <- geff[involved_genes, , drop = FALSE]
    edges <- which(grn_region != 0, arr.ind = TRUE)
    nonedges <- which(grn_region == 0, arr.ind = TRUE)
    if (!has_tf_edges) {
      # Keep only candidate edges whose row is not a regulator. The previous
      # `-(x %in% regulators)` negated a logical (giving 0 / -1) and therefore
      # dropped row 1 instead of the regulator rows.
      nonedges <- nonedges[!(nonedges[, 1] %in% regulators), , drop = FALSE]
    }
    # values below 1 are a fraction of all edges, otherwise an absolute count
    N_changed_edges <- if (n_changing_edges < 1) {
      as.integer(n_edges * n_changing_edges)
    } else {
      as.integer(n_changing_edges)
    }
    stopifnot(N_changed_edges > 0)
    # get new del_edges and gen_edges
    dedges <- edges[sample(nrow(edges), N_changed_edges), , drop = FALSE]
    del_edges <<- cbind(dedges, geff[dedges])
    gedges <- nonedges[sample(nrow(nonedges), N_changed_edges), , drop = FALSE]
    gen_edges <<- cbind(gedges, rnorm(N_changed_edges, mean = weight_mean, sd = weight_sd))
    stopifnot(all(geff[del_edges[, seq_len(2), drop = FALSE]] != 0))
    stopifnot(all(geff[gen_edges[, seq_len(2), drop = FALSE]] == 0))
  }
)

.dynGRN$methods(
  # Advance the network by one cell, store the resulting `geff` in `history`
  # and return the version number under which it was stored.
  update = function() {
    if (remaining_steps == 0) {
      # change grn structure
      restructure()
      remaining_steps <<- max_steps
    }
    if (remaining_cells == 0) {
      # update gradually: every step moves each changing edge by 1 / max_steps
      # of its target weight
      s <- 1 / max_steps
      for (row in seq_len(nrow(del_edges))) {
        i <- del_edges[row, 1]
        j <- del_edges[row, 2]
        w <- del_edges[row, 3]
        geff[i, j] <<- geff[i, j] - w * s
        # snap floating point residue to an exact zero
        if (abs(geff[i, j]) <= 1e-5) {
          geff[i, j] <<- 0
        }
      }
      for (row in seq_len(nrow(gen_edges))) {
        i <- gen_edges[row, 1]
        j <- gen_edges[row, 2]
        w <- gen_edges[row, 3]
        geff[i, j] <<- geff[i, j] + w * s
      }
      remaining_steps <<- remaining_steps - 1
      remaining_cells <<- cell_per_step
    }
    remaining_cells <<- remaining_cells - 1
    # update history
    history[[version]] <<- geff
    # return version
    ver_ <- version
    version <<- version + 1
    ver_
  }
)

# Build a dynGRN object from a GRN list and the dynamic-GRN options, then draw
# the first set of changing edges.
.CreateDynGRN <- function(grn, opts) {
  # all(is.na(x)) is a single TRUE/FALSE for NA, NULL and for a vector of
  # genes; a bare is.na(x) is not a valid `if` condition for a vector.
  if (all(is.na(opts$involved.genes))) {
    opts$involved.genes <- sort(unique(c(grn$regulators, grn$targets)))
  }
  if (all(is.na(opts$weight.mean))) {
    opts$weight.mean <- round(mean(grn$params[, 3]), digits = 2)
  }

  dyngrn <- .dynGRN$new(
    randseed = opts$seed,
    # opts
    involved_genes = opts$involved.genes,
    max_steps = opts$num.steps,
    n_changing_edges = opts$num.changing.edges,
    cell_per_step = opts$cell.per.step,
    weight_mean = opts$weight.mean,
    weight_sd = opts$weight.sd,
    has_tf_edges = opts$create.tf.edges,
    # grn
    geff = grn$geff,
    params = grn$params,
    regulators = grn$regulators,
    targets = grn$targets,
    n_tgt = grn$n_tgt,
    n_reg = grn$n_reg,
    name_map = grn$name_map,
    # other fields
    del_edges = NULL,
    gen_edges = NULL,
    remaining_steps = opts$num.steps,
    remaining_cells = opts$cell.per.step,
    n_edges = nrow(grn$params),
    history = list(),
    version = 1
  )
  dyngrn$restructure()
  return(dyngrn)
}


# Overlay the user supplied `options` on the defaults; only names that exist
# in the defaults are read, and NULL user values leave the default in place.
.getDynGRNOpts <- function(options) {
  opts <- .dynamic_grn_default_params()
  for (name in names(opts)) {
    val <- options[[name]]
    if (!is.null(val)) {
      opts[[name]] <- val
    }
  }
  opts
}
--------------------------------------------------------------------------------
/R/6_technoise.R:
--------------------------------------------------------------------------------
#' Add experimental noise to true counts
#'
#' @param results The scMultisim result object
#' @param ...
#' `randseed`: The random seed
#' `protocol`: `UMI` or `nonUMI`
#' `gene_len`: A vector with lengths of all genes
#' `alpha_mean`, `alpha_sd`: rate of subsampling of transcripts during capture step
#' `depth_mean`, `depth_sd`: The sequencing depth
#'
#' @seealso
#' The underlying methods
#' \link{True2ObservedCounts} and \link{True2ObservedATAC}
#'
#' @return none
#' @export
#'
#' @examples
#' results <- sim_example(ncells = 10)
#' add_expr_noise(results)
add_expr_noise <- function(results, ...)
{
  cat("Adding experimental noise...\n")
  start_time <- Sys.time()
  data(gene_len_pool, envir = environment())
  gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE)
  # arguments prefixed with "atac." configure the ATAC noise, all others are
  # forwarded to True2ObservedCounts
  args <- list(...)
  if (length(args) > 0) {
    rna_args <- args[names(args)[!startsWith(names(args), "atac.")]]
    atac_args <- args[names(args)[startsWith(names(args), "atac.")]]
  } else {
    rna_args <- list(); atac_args <- list()
  }
  rna_args <- .defaultArgs(rna_args, randseed = 0,
                           protocol = "nonUMI", alpha_mean = 0.1, alpha_sd = 0.02,
                           gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3,
                           nPCR1 = 16, nPCR2 = 10)
  atac_args <- .defaultArgs(atac_args, atac.obs.prob = 0.3, atac.sd.frac = 0.5)
  rna_args$true_counts <- floor(results$counts)
  rna_args$meta_cell <- results$cell_meta
  results$counts_obs <- do.call(True2ObservedCounts, rna_args)

  atac_data <- if (!is.null(results$atac_counts)) {
    cat("Using atac_counts\n")
    results$atac_counts
  } else {
    # This branch already aborted through a bare stop(); the statements that
    # followed it (a fallback to results$atacseq_data) were unreachable. The
    # abort is kept, now with a message that says what is missing.
    stop("results$atac_counts is missing; cannot add noise to the ATAC-seq data")
  }
  # use the defaulted seed so that an omitted `randseed` is 0 here as well,
  # not NULL
  results$atacseq_obs <- True2ObservedATAC(atac_data, randseed = rna_args$randseed,
                                           observation_prob = atac_args$atac.obs.prob,
                                           sd_frac = atac_args$atac.sd.frac)
  message(sprintf("Time spent: %.2f mins\n",
                  as.numeric(Sys.time() - start_time, units = "mins")))
}


#' Divide batches for observed counts
#'
#' Stacks the observed RNA counts on top of the observed ATAC data, applies the
#' same batch assignment to both, and stores the two parts back on `results` as
#' `counts_with_batches` and `atac_with_batches`; `cell_meta` gains a `batch`
#' column (an existing one is replaced).
#'
#' @param results The scMultisim result object, after running `add_expr_noise()`
#' @param nbatch Number of batches
#' @param effect Batch effect size, default is 3
#' @param randseed Random seed
#'
#' @return none
#' @export
#'
#' @examples
#' results <- sim_example(ncells = 10)
#' add_expr_noise(results)
#' divide_batches(results)
divide_batches <- function(results, nbatch = 2, effect = 3, randseed = 0) {
  cat("Adding batch effects...\n")
  obs <- results$counts_obs
  # the UMI protocol stores a list whose `counts` element is the matrix
  if (is.list(obs)) {
    obs <- obs$counts
  }
  ngene <- nrow(obs)
  merged <- rbind(obs, results$atacseq_obs)
  if ("batch" %in% names(results$cell_meta)) {
    # drop = FALSE: the metadata must stay a data frame even when a single
    # column is left after removing the old batch column
    results$cell_meta <- results$cell_meta[, !(names(results$cell_meta) %in% "batch"), drop = FALSE]
  }
  b <- .divideBatchesImpl(
    counts = merged, meta_cell = results$cell_meta,
    nbatch = nbatch, batch_effect_size = effect, randseed = randseed
  )
  is_gene_row <- seq_len(nrow(b$counts)) <= ngene
  results$counts_with_batches <- b$counts[is_gene_row,]
  results$atac_with_batches <- b$counts[!is_gene_row,]
  results$cell_meta <- b$cell_meta
}


#' Divide the observed counts into multiple batches by adding batch effect to each batch
#' @param counts gene cell matrix
#' @param meta_cell the meta information related to cells, will be combined with technical cell level information and returned
#' @param nbatch number of batches
#' @param batch_effect_size amount of batch effects. Larger values result in bigger differences between batches. Default is 1.
#' @param randseed random seed (currently unused: the `set.seed` call is commented out)
#' @keywords internal
#' @return a list with two elements: `counts` (the rounded counts with batch
#'   effects applied) and `cell_meta` (`meta_cell` with a `batch` column appended)
.divideBatchesImpl <- function(counts, meta_cell, nbatch, batch_effect_size = 1, randseed = 0) {
  # set.seed(randseed)
  ## add batch effects to observed counts
  # use different mean and same sd to create the multiplicative factor for
  # different part (gene/region) in different batch
  ncells <- dim(counts)[2]; nparts <- dim(counts)[1]
  batchIDs <- sample.int(nbatch, ncells, replace = TRUE)
  meta_cell <- cbind(meta_cell, data.frame(batch = batchIDs, stringsAsFactors = FALSE))

  # one row per part, one column per batch: the mean log2 shift of that part in
  # that batch, drawn around a part specific centre
  part_mean <- rnorm(nparts, 0, 1)
  shifts <- lapply(seq_len(nparts), function(ipart) {
    runif(nbatch, min = part_mean[ipart] - batch_effect_size, max = part_mean[ipart] + batch_effect_size)
  })
  mean_matrix <- matrix(as.numeric(unlist(shifts)), nrow = nparts, ncol = nbatch, byrow = TRUE)

  # per (part, cell) factor ~ N(mean of the cell's batch, 0.01). The draws are
  # made part by part and, within a part, cell by cell -- the same order as a
  # nested loop over parts and cells -- hence the transposes / byrow.
  cell_means <- mean_matrix[, batchIDs, drop = FALSE]
  batch_factor <- matrix(
    rnorm(nparts * ncells, mean = as.vector(t(cell_means)), sd = 0.01),
    nrow = nparts, ncol = ncells, byrow = TRUE
  )
  # apply the factor on the log2 scale; zero counts stay zero (2^-Inf)
  counts <- round(2^(log2(counts) + batch_factor))
  return(list(counts = counts, cell_meta = meta_cell))
}


#' This function simulates the amplification, library prep, and the sequencing processes.
#' @param true_counts_1cell the true transcript counts for one cell (one vector)
#' @param protocol a string, can be "nonUMI" or "UMI"
#' @param rate_2cap the capture efficiency for this cell; indexed per gene, with
#'   one extra trailing entry for the sentinel molecule appended internally
#' @param gene_len gene lengths for the genes/transcripts, sampled from real human transcript length
#' @param amp_bias amplification bias for each gene, a vector of length ngenes
#' @param rate_2PCR PCR efficiency, usually very high
#' @param nPCR1 the number of PCR cycles of the pre-amplification
#' @param nPCR2 the number of PCR cycles after fragmentation
#' @param LinearAmp if linear amplification is used for pre-amplification step, default is FALSE
#' @param LinearAmp_coef the coefficient of linear amplification, that is, how many times each molecule is amplified by
#' @param N_molecules_SEQ number of molecules sent for sequencing; sequencing depth
#' @keywords internal
#' @return read counts (if protocol="nonUMI"), or for protocol="UMI" a list of
#'   UMI counts per gene, the sequenced fragment counts per captured molecule
#'   and the number of molecules with at least one fragment
.amplifyOneCell <- function(true_counts_1cell, protocol, rate_2cap, gene_len, amp_bias,
                            rate_2PCR, nPCR1, nPCR2, LinearAmp, LinearAmp_coef, N_molecules_SEQ) {
  ngenes <- length(gene_len)
  if (protocol == "nonUMI") {
    if (!exists("len2nfrag")) data(len2nfrag)
  } else if (protocol != "UMI") {
    stop("protocol input should be nonUMI or UMI")
  }
  inds <- vector("list", 2)

  # Undo the two filtering steps recorded in `inds`: put the values back at the
  # positions they had in the fully expanded molecule vector, zero elsewhere.
  scatter_back <- function(vec) {
    for (i in seq(2, 1, -1)) {
      full <- numeric(); full[inds[[i]]] <- vec
      vec <- full; vec[is.na(vec)] <- 0
    }
    vec
  }

  # expand the original vector and apply capture efficiency
  # maintain a transcript index vector: which transcript the molecule belongs to
  # A sentinel molecule (transcript index ngenes + 1) is appended and always
  # kept, so the vectors below are never empty.
  expanded_res <- .expandToBinary(c(true_counts_1cell, 1))
  expanded_vec <- expanded_res[[1]]
  trans_idx <- expanded_res[[2]]
  inds[[1]] <- expanded_vec > 0
  expanded_vec <- expanded_vec[inds[[1]]]
  trans_idx <- trans_idx[inds[[1]]]

  rate_2cap_gene <- rate_2cap[trans_idx]
  captured_vec <- expanded_vec
  captured_vec[runif(length(captured_vec)) > rate_2cap_gene] <- 0
  # x[-length(x)] is "everything but the sentinel" and is empty, not reversed,
  # when the sentinel is the only molecule
  if (sum(captured_vec[-length(captured_vec)]) < 1) {
    # nothing captured: all-zero result in the shape the protocol returns
    if (protocol == "UMI") {
      return(list(rep(0, ngenes), numeric(0), 0))
    }
    return(rep(0, ngenes))
  }
  captured_vec[length(captured_vec)] <- 1

  inds[[2]] <- captured_vec > 0
  captured_vec <- captured_vec[inds[[2]]]
  trans_idx <- trans_idx[inds[[2]]]
  real_idx <- trans_idx[-length(trans_idx)] # transcript of every captured real molecule
  amp_rate <- c(rate_2PCR + amp_bias[real_idx], 1)
  # pre-amplification:
  if (LinearAmp) {
    PCRed_vec <- captured_vec * LinearAmp_coef
  } else {
    # first cycle: every molecule doubles with probability amp_rate
    temp <- runif(length(captured_vec)) < amp_rate
    temp <- temp * 2 + captured_vec - temp
    # remaining nPCR1 - 1 cycles; seq_len() runs zero times for nPCR1 = 1
    # where `2:nPCR1` would count down
    for (iPCR in seq_len(max(nPCR1 - 1, 0))) {
      eff <- runif(length(temp)) * amp_rate
      v1 <- temp * (1 - eff)
      # stochastic rounding of the number of copies that are not duplicated
      round_down <- (v1 - floor(v1)) < runif(length(v1))
      v1[round_down] <- floor(v1[round_down])
      v1[!round_down] <- ceiling(v1[!round_down])
      temp <- v1 + 2 * (temp - v1)
    }
    PCRed_vec <- temp
  }

  if (protocol == "nonUMI") { # add fragmentation step here
    temp_vec <- scatter_back(PCRed_vec)
    recovered_vec <- temp_vec[-length(temp_vec)]
    amp_mol_count <- numeric(ngenes)
    GI <- c(0, cumsum(true_counts_1cell))
    for (i in which(true_counts_1cell > 0)) {
      x <- recovered_vec[(GI[i] + 1):GI[i + 1]]
      amp_mol_count[i] <- sum(x)
    }

    # for every copy of each transcript, convert it into number of fragments
    frag_vec <- numeric(ngenes)
    for (igene in which(amp_mol_count > 0)) {
      frag_vec[igene] <- sum(sample(len2nfrag[as.character(gene_len[igene]),],
                                    amp_mol_count[igene], replace = TRUE))
    }
    # nPCR2 further rounds of amplification of the fragments (fragmentation
    # bias gets amplified): two stochastic rounds, the rest deterministic
    for (iPCR in seq_len(2)) {
      frag_vec <- frag_vec + vapply(
        frag_vec, \(x) rbinom(n = 1, x, prob = rate_2PCR), numeric(1))
    }
    for (iPCR in seq_len(max(nPCR2 - 2, 0))) {
      frag_vec <- frag_vec + round(frag_vec * rate_2PCR)
    }
    SEQ_efficiency <- N_molecules_SEQ / sum(frag_vec)
    if (SEQ_efficiency >= 1) {
      read_count <- frag_vec
    } else {
      read_count <- vapply(
        frag_vec,
        \(Y) rbinom(n = 1, size = Y, prob = SEQ_efficiency), numeric(1))
    }
    return(read_count)
  } else if (protocol == "UMI") {

    prob_vec <- vapply(gene_len[real_idx], .getProb, numeric(1))
    # fragmentation:
    frag_vec <- vapply(
      seq_len(length(PCRed_vec) - 1),
      \(igene) rbinom(n = 1, size = PCRed_vec[igene], prob = prob_vec[igene]),
      numeric(1))

    # further amplification of the fragments (fragmentation bias gets
    # amplified): two stochastic rounds, then a deterministic scale-up
    for (iPCR in seq_len(2)) {
      frag_vec <- frag_vec + vapply(
        frag_vec, \(x) rbinom(n = 1, x, prob = rate_2PCR), numeric(1))
    }

    frag_vec <- round(frag_vec * (1 + rate_2PCR)^(nPCR2 - 1))

    SEQ_efficiency <- N_molecules_SEQ / sum(frag_vec)
    if (SEQ_efficiency >= 1) {
      sequenced_vec <- frag_vec
    } else {
      sequenced_vec <- vapply(
        frag_vec, \(Y) rbinom(n = 1, size = Y, prob = SEQ_efficiency),
        numeric(1))
    }

    # re-append the sentinel so the vector lines up with `inds`
    temp_vec <- scatter_back(c(sequenced_vec, 1))
    recovered_vec <- temp_vec[-length(temp_vec)]

    # a UMI is counted once however many reads it produced
    UMI_counts <- numeric(ngenes)
    GI <- c(0, cumsum(true_counts_1cell))
    for (i in which(true_counts_1cell > 0)) {
      x <- recovered_vec[(GI[i] + 1):GI[i + 1]]
      UMI_counts[i] <- sum(x > 0)
    }

    return(list(UMI_counts, sequenced_vec, sum(frag_vec > 0)))
  }
}


#' Simulate observed count matrix given technical biases and the true counts
#' @param true_counts gene cell matrix
#' @param
meta_cell the meta information related to cells, will be combined with technical cell level information and returned
#' @param protocol a string, can be "nonUMI" or "UMI"
#' @param alpha_mean the mean of rate of subsampling of transcripts during capture step, default at 10 percent efficiency
#' @param alpha_sd the std of rate of subsampling of transcripts
#' @param alpha_gene_mean the per-gene scale factor of the alpha parameter, default at 1
#' @param alpha_gene_sd the standard deviation of the per-gene scale factor of the alpha parameter, default at 0
#' @param lenslope amount of length bias
#' @param nbins number of bins for gene length
#' @param gene_len a vector with lengths of all genes
#' @param amp_bias_limit range of the amplification bias, a vector `c(lower, upper)`
#' @param rate_2PCR PCR efficiency, usually very high, default is 0.8
#' @param nPCR1 the number of PCR cycles in "pre-amplification" step, default is 16
#' @param nPCR2 the number of PCR cycles used after fragmentation.
#' @param LinearAmp if linear amplification is used for pre-amplification step, default is FALSE
#' @param LinearAmp_coef the coefficient of linear amplification, that is, how many times each molecule is amplified by
#' @param depth_mean mean of sequencing depth
#' @param depth_sd std of sequencing depth
#' @param randseed random seed (currently unused: the `set.seed` call is commented out)
#' @return if UMI, a list with the observed count matrix (`counts`), the cell
#'   metadata (`cell_meta`), `nreads_perUMI` and `nUMI2seq`; if nonUMI, a matrix
#' @export
#' @examples
#' \donttest{
#' results <- sim_example(ncells = 10)
#' data(gene_len_pool)
#' gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE)
#' True2ObservedCounts(
#'   results$counts, results$cell_meta, protocol = "nonUMI", randseed = 1,
#'   alpha_mean = 0.1, alpha_sd = 0.05, gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3
#' )
#' }
True2ObservedCounts <- function(true_counts, meta_cell, protocol, randseed, alpha_mean = 0.1, alpha_sd = 0.002,
                                alpha_gene_mean = 1, alpha_gene_sd = 0,
                                gene_len, depth_mean, depth_sd, lenslope = 0.02, nbins = 20,
                                amp_bias_limit = c(-0.2, 0.2),
                                rate_2PCR = 0.8, nPCR1 = 16, nPCR2 = 10, LinearAmp = FALSE, LinearAmp_coef = 2000) {
  # set.seed(randseed)
  ngenes <- dim(true_counts)[1]; ncells <- dim(true_counts)[2]
  amp_bias <- .calAmpBias(lenslope, nbins, gene_len, amp_bias_limit)
  rate_2cap_lb <- 0.0005; depth_lb <- 200 # lower bound for capture efficiency and sequencing depth
  # capture rate = per-cell rate x per-gene scale factor (genes x cells)
  rate_2cap_vec <- .rnormTrunc(n = ncells, mean = alpha_mean, sd = alpha_sd, a = rate_2cap_lb, b = 1)
  rate_2cap_vec_gene <- .rnormTrunc(n = ngenes, mean = alpha_gene_mean, sd = alpha_gene_sd, a = 0, b = 3)
  rate_2cap <- rate_2cap_vec_gene %o% rate_2cap_vec
  depth_vec <- .rnormTrunc(n = ncells, mean = depth_mean, sd = depth_sd, a = depth_lb, b = Inf)
  # seq_len(): no iteration at all for zero cells
  observed_counts <- lapply(seq_len(ncells), function(icell) {
    if (icell %% 50 == 0) cat(sprintf("%d..", icell))
    # the extra trailing rate belongs to the sentinel molecule that
    # .amplifyOneCell appends to the counts
    .amplifyOneCell(true_counts_1cell = true_counts[, icell], protocol = protocol,
                    rate_2cap = c(rate_2cap[, icell], rate_2cap_vec[icell]),
                    gene_len = gene_len, amp_bias = amp_bias,
                    rate_2PCR = rate_2PCR, nPCR1 = nPCR1, nPCR2 = nPCR2, LinearAmp = LinearAmp,
                    LinearAmp_coef = LinearAmp_coef, N_molecules_SEQ = depth_vec[icell])
  })
  gc()

  meta_cell2 <- data.frame(alpha = rate_2cap_vec, depth = depth_vec, stringsAsFactors = FALSE)
  meta_cell <- cbind(meta_cell, meta_cell2)

  if (protocol != "UMI") {
    # one column of read counts per cell
    return(do.call(cbind, observed_counts))
  }
  # UMI: every per-cell result is list(UMI counts, reads per UMI, #UMIs sequenced)
  list(
    counts = do.call(cbind, lapply(observed_counts, "[[", 1)),
    cell_meta = meta_cell,
    nreads_perUMI = lapply(observed_counts, "[[", 2),
    nUMI2seq = vapply(observed_counts, "[[", numeric(1), 3)
  )
}


#' Simulate observed ATAC-seq matrix given technical noise and the true counts
#' @param atacseq_data true ATAC-seq data
#' @param observation_prob for each integer count of a particular region for a particular cell, the probability the count will be observed
#' @param sd_frac the fraction of ATAC-seq data value used as the standard deviation of added normally distributed noise
#' @param randseed (should produce same result if nregions, nevf and randseed are all the same)
#' @return a matrix of observed ATAC-seq data
#' @export
#' @examples
#' results <- sim_example(ncells = 10)
#' True2ObservedATAC(results$atac_counts, randseed = 1)
True2ObservedATAC <- function(atacseq_data, randseed, observation_prob = 0.3,
sd_frac = 0.1) {
  # set.seed(randseed)
  atacseq_data <- round(atacseq_data)
  atacseq_noisy <- atacseq_data
  for (icell in seq(ncol(atacseq_data))) {
    for (iregion in seq(nrow(atacseq_data))) {
      if (atacseq_data[iregion, icell] > 0) {
        atacseq_noisy[iregion, icell] <- rbinom(n = 1, size = atacseq_data[iregion, icell], prob = observation_prob)
        atacseq_noisy[iregion, icell] <- max(atacseq_noisy[iregion, icell] + rnorm(1, mean = 0, sd = atacseq_noisy[iregion, icell] * sd_frac), 0)
      }
    }
  }
  return(atacseq_noisy)
}


#' Simulate technical biases
#'
#' The bias of a gene is a clamped random term plus a length term: genes are
#' ranked by length, split into `nbins` equally sized bins (the last bin takes
#' the remainder) and each bin gets a bias decreasing linearly with the bin
#' number.
#' @param lenslope amount of length bias. This value should be less than 2*amp_bias_limit\[2\]/(nbins-1)
#' @param nbins number of bins for gene length
#' @param gene_len transcript length of each gene
#' @param amp_bias_limit range of the amplification bias, a vector `c(lower, upper)`; only the upper limit is read
#' @keywords internal
#' @return a vector with one amplification bias per gene
.calAmpBias <- function(lenslope, nbins, gene_len, amp_bias_limit) {
  ngenes <- length(gene_len)
  len_bias_bin <- (-(seq_len(nbins))) * lenslope
  len_bias_bin <- len_bias_bin - median(len_bias_bin)
  if (max(len_bias_bin) > amp_bias_limit[2]) {
    stop("The lenslope parameter is too large.")
  }
  # whatever the length bias leaves of the limit is available to the random part
  max_rand_bias <- amp_bias_limit[2] - max(len_bias_bin)

  rand_bias <- rnorm(ngenes, mean = 0, sd = max_rand_bias)
  rand_bias[rand_bias > max_rand_bias] <- max_rand_bias
  rand_bias[rand_bias < -max_rand_bias] <- -max_rand_bias
  #rand_bias <- runif(ngenes, -max_rand_bias, max_rand_bias)

  # bin of the gene with length rank r: ranks 1..binsize -> bin 1, and so on;
  # everything beyond (nbins - 1) * binsize goes to the last bin. With fewer
  # genes than bins (binsize 0) all genes fall into the last bin.
  binsize <- floor(ngenes / nbins)
  bin_of_rank <- if (binsize > 0) {
    pmin((seq_len(ngenes) - 1) %/% binsize + 1, nbins)
  } else {
    rep(nbins, ngenes)
  }
  bin4genes <- numeric(ngenes)
  bin4genes[order(gene_len)] <- bin_of_rank

  len_bias <- len_bias_bin[bin4genes]
  amp_bias <- rand_bias + len_bias
  return(amp_bias)
}


#' expand transcript counts to a vector of binaries of the same length of as the number of transcripts
#' @param true_counts_1cell number of transcript in one cell
#' @keywords internal
#' @return a list of two vectors, the first vector is a vector of 1s (one per
#'   molecule), the second vector is the index of the transcript each molecule
#'   belongs to
.expandToBinary <- function(true_counts_1cell) {
  names(true_counts_1cell) <- NULL
  expanded_vec <- rep(1, sum(true_counts_1cell))
  expressed <- which(true_counts_1cell > 0)
  # repeat every expressed transcript's index once per molecule
  trans_idx <- unlist(lapply(expressed, function(igene) {
    rep(igene, true_counts_1cell[igene])
  }))
  return(list(expanded_vec, trans_idx))
}

#' sample from truncated normal distribution
#'
#' Draws `n` normal values and redraws, one at a time, every value that falls
#' outside `[a, b]` until it lies inside.
#' @param n number of values to create
#' @param a the minimum value allowed
#' @param b the maximum value allowed
#' @param mean mean of the normal distribution
#' @param sd standard deviation of the normal distribution
#' @keywords internal
#' @return a vector of length n
.rnormTrunc <- function(n, mean, sd, a, b) {
  vec1 <- rnorm(n, mean = mean, sd = sd)
  beyond_idx <- which(vec1 < a | vec1 > b)
  if (length(beyond_idx) > 0) { # for each value outside [a, b]
    # a constant distribution outside the interval can never be accepted
    if (sd == 0 || a > b) {
      stop("cannot sample from the truncated normal: no value within [a, b] is reachable")
    }
    substi_vec <- vapply(seq_along(beyond_idx), function(i) {
      while (TRUE) {
        temp <- rnorm(1, mean = mean, sd = sd)
        # accept only draws inside the interval; the former `temp > a | temp > b`
        # also accepted everything above b
        if (temp >= a && temp <= b) { break }
      }
      return(temp)
    }, numeric(1))
    vec1[beyond_idx] <- substi_vec
  }
  return(vec1)
}

.getProb <- function(glength) {
  if (glength >= 1000) { prob <- 0.7 } else {
    if (glength >= 100 & glength < 1000) { prob <- 0.78 }
    else if (glength < 100) { prob
<- 0 } 445 | } 446 | return(prob) 447 | } 448 | 449 | #' Add outliers to the observed counts 450 | #' @param res The scMultisim result object 451 | #' @param prob The probability of adding outliers for each gene; floor(ngenes * prob) genes are chosen, capped at the number of eligible genes 452 | #' @param factor The factor of the outliers 453 | #' @param sd The standard deviation of the outliers 454 | #' @param cell.num For a gene, the number of cells chosen to add outliers 455 | #' @param max.var The maximum variance allowed; genes whose row variance in counts_obs exceeds it are never chosen 456 | #' @export 457 | #' @return none; `res$counts_obs` is updated through `res` itself, which presumably is an environment (a plain list would only be changed locally) -- TODO confirm 458 | add_outliers <- function ( 459 | res, prob = 0.01, factor = 2, sd = 0.5, cell.num = 1, max.var = Inf 460 | ) { 461 | if (is.null(res$counts_obs)) { 462 | stop("No counts found in the result object") 463 | } 464 | ngenes <- nrow(res$counts_obs) 465 | ncells <- ncol(res$counts_obs) 466 | gene_range <- if (is.null(res$.grn)) { 467 | seq_len(ngenes) 468 | } else { 469 | setdiff(seq_len(ngenes), seq_len(max(res$.grn$name_map))) # genes after the last GRN gene; empty (not a descending range) when the GRN covers every gene 470 | } 471 | gene_range <- setdiff(gene_range, which(rowVars(res$counts_obs) > max.var)) 472 | chosen_genes <- gene_range[sample.int(length(gene_range), min(length(gene_range), floor(ngenes * prob)))] # index-based draw: sample(x, k) would sample from 1:x when x has length 1, and would fail when k exceeds length(x) 473 | for (i in chosen_genes) { 474 | # chosen_cells <- sample(which(res$counts_obs[i,] > 0), cell.num) 475 | chosen_cells <- sample(seq(ncells), cell.num) 476 | q <- rnorm(1, factor, sd) 477 | message(sprintf("Gene %d, cells %s, factor %.2f", i, paste(chosen_cells, collapse = ", "), q)) 478 | res$counts_obs[i, chosen_cells] <- res$counts_obs[i, chosen_cells] * q 479 | } 480 | } 481 | -------------------------------------------------------------------------------- /R/8_utils.R: -------------------------------------------------------------------------------- 1 | # String concatenation 2 | `%+%` <- function(a, b) paste0(a, b) 3 | 4 | 5 | # get default arguments 6 | .defaultArgs <- function(args = NULL, ...) { 7 | defaults <- list2(...) 
8 | if (is.null(args)) { 9 | args <- eval(substitute(list(...), env = parent.frame())) 10 | } 11 | for (name in names(args)) { 12 | defaults[[name]] <- args[[name]] 13 | } 14 | defaults 15 | } 16 | 17 | 18 | .regionToTFMatrix <- function(GRN, region_to_gene, .all.genes = FALSE) { 19 | res <- matrix(0, nrow = nrow(region_to_gene), ncol = GRN$n_reg) 20 | # GRN$geff: gene x tf 21 | geff <- GRN$geff > 0 22 | # region_to_gene: region x gene 23 | # for each region 24 | for (i in seq_len(nrow(region_to_gene))) { 25 | # get genes in this region 26 | genes <- which(region_to_gene[i, ] > 0) 27 | if (length(genes) == 0) { 28 | next 29 | } 30 | if (.all.genes) { 31 | # if a TF also regulates all these genes 32 | tfs <- which(colSums(geff[genes, , drop = F]) == length(genes)) 33 | } else { 34 | tfs <- which(colSums(geff[genes, , drop = F]) > 0) 35 | } 36 | res[i, tfs] <- 1 37 | } 38 | res 39 | } 40 | 41 | 42 | #' sample from smoothed density function 43 | #' @param nsample number of samples needed 44 | #' @param den_fun density function estimated from density() from R default 45 | #' @param reduce.mem use alternative implementation to reduce memory usage 46 | #' @keywords internal 47 | #' @return a vector of samples 48 | SampleDen <- function(nsample, den_fun, reduce.mem = FALSE) { 49 | probs <- den_fun$y / sum(den_fun$y) 50 | bw <- den_fun$x[2] - den_fun$x[1] 51 | probs_seq = seq_along(probs) 52 | mins <- den_fun$x[probs_seq] - 0.5 * bw 53 | maxs <- den_fun$x[probs_seq] + 0.5 * bw 54 | 55 | if (reduce.mem) { 56 | counts <- rmultinom(n = 1, size = nsample, prob = probs) 57 | total_samples <- sum(counts) 58 | samples <- runif(total_samples) * 59 | rep(maxs - mins, times = counts) + 60 | rep(mins, times = counts) 61 | } else { 62 | bin_id <- sample(size = nsample, x = probs_seq, prob = probs, replace = TRUE) 63 | counts <- tabulate(bin_id, nbins = length(probs)) 64 | total_samples <- sum(counts) 65 | samples <- numeric(length = total_samples) 66 | cum_counts <- c(0, 
cumsum(counts)) 67 | for (j in 1:length(counts)) { 68 | if (counts[j] > 0) { 69 | samples[(cum_counts[j] + 1):cum_counts[j + 1]] <- 70 | runif(counts[j], min = mins[j], max = maxs[j]) 71 | } 72 | } 73 | } 74 | 75 | return(samples) 76 | } 77 | 78 | 79 | #' Creating an example tree with 5 tips 80 | #' @param plotting True for plotting the tree on console, False for no plot 81 | #' @return a R phylo object 82 | #' @export 83 | #' @examples 84 | #' Phyla5() 85 | Phyla5 <- function(plotting = FALSE) { 86 | phyla <- rtree(2) 87 | phyla <- compute.brlen(phyla, 1) 88 | tip <- compute.brlen(phyla, 1) 89 | phyla <- bind.tree(phyla, tip, 1) 90 | phyla <- bind.tree(phyla, tip, 2) 91 | phyla <- bind.tree(phyla, tip, 2) 92 | phyla <- compute.brlen(phyla, c(1, 1, 1, 1, 1, 0.2, 0.2, 3)) 93 | edges <- cbind(phyla$edge, phyla$edge.length) 94 | edges <- cbind(seq_along(edges[, 1]), edges) 95 | connections <- table(c(edges[, 2], edges[, 3])) 96 | root <- as.numeric(names(connections)[connections == 2]) 97 | tips <- as.numeric(names(connections)[connections == 1]) 98 | phyla$tip.label <- as.character(tips) 99 | if (plotting == TRUE) { 100 | plot(phyla, show.tip.label = FALSE, lwd = 2) 101 | tiplabels(cex = 2) 102 | nodelabels(cex = 2) 103 | } 104 | return(phyla) 105 | } 106 | 107 | #' Creating an example tree with 3 tips 108 | #' @param plotting True for plotting the tree on console, False for no plot 109 | #' @return a R phylo object 110 | #' @export 111 | #' @examples 112 | #' Phyla3() 113 | Phyla3 <- function(plotting = FALSE) { 114 | # par(mfrow=c(2,2)) 115 | phyla <- rtree(2) 116 | phyla <- compute.brlen(phyla, 1) 117 | tip <- compute.brlen(phyla, 1) 118 | phyla <- bind.tree(phyla, tip, 1) 119 | phyla <- compute.brlen(phyla, c(1, 1, 1, 2)) 120 | edges <- cbind(phyla$edge, phyla$edge.length) 121 | edges <- cbind(seq_along(edges[, 1]), edges) 122 | connections <- table(c(edges[, 2], edges[, 3])) 123 | root <- as.numeric(names(connections)[connections == 2]) 124 | tips <- 
as.numeric(names(connections)[connections == 1]) 125 | phyla$tip.label <- as.character(tips) 126 | 127 | if (plotting == TRUE) { 128 | plot(phyla, show.tip.label = FALSE, lwd = 2) 129 | tiplabels(cex = 2) 130 | nodelabels(cex = 2) 131 | } 132 | return(phyla) 133 | } 134 | 135 | #' Creating a linear example tree 136 | #' @param len length of the tree 137 | #' @return a R phylo object 138 | #' @export 139 | #' @examples 140 | #' Phyla1(len = 1) 141 | Phyla1 <- function(len = 1) { 142 | myTree <- ape::read.tree(text='(A);') 143 | myTree <- compute.brlen(myTree, len) 144 | myTree 145 | } 146 | 147 | 148 | # get root, internal nodes and tips from a tree. 149 | .tree_info <- function(tree) { 150 | edges <- cbind(seq_len(nrow(tree$edge)), tree$edge, tree$edge.length) 151 | colnames(edges) <- c("id", "from", "to", "len") 152 | parents <- unique(edges[, 2]) 153 | children <- unique(edges[, 3]) 154 | root <- setdiff(parents, children) %>% as.numeric() 155 | tips <- setdiff(children, parents) %>% as.numeric() 156 | internal <- union(parents, children) %>% as.numeric() 157 | 158 | list(edges = edges, root = root, tips = tips, internal = internal) 159 | } 160 | 161 | # print a summary of simulation parameters 162 | .print_param_summary <- function(sim) { 163 | cat(sprintf("intr noise: %g\n", sim$options$intrinsic.noise)) 164 | 165 | N <- sim$N 166 | cat("======== Params Summary ========\n") 167 | cat(sprintf("Genes: %d (%d GRN + %d Non-GRN)\n", N$gene, N$grn.gene, N$non.grn.gene)) 168 | cat(sprintf("CIF_%s: %d (%d nd + %d diff) + %d reg", 169 | c("kon", "koff", "s"), N$cif, N$nd.cif, N$diff.cif, N$reg_cif), sep = "\n") 170 | if (!is.null(sim$GRN)) { 171 | cat(sprintf("GRN: %d regulators, %d targets\n", sim$GRN$n_reg, sim$GRN$n_tgt)) 172 | } 173 | if (sim$do_spatial) { 174 | cat(sprintf("Spatial: %d regulators\n", length(sim$sp_regulators))) 175 | } 176 | 177 | cat("Params:\n") 178 | cat(" CIF ") 179 | if (sim$do_spatial) { 180 | cat("(NA)\n") 181 | } else { 182 | 
.print_matrix_dim(sim$CIF_all$cif$kon, "kon", newline = FALSE) 183 | .print_matrix_dim(sim$CIF_all$cif$koff, "koff", newline = FALSE) 184 | .print_matrix_dim(sim$CIF_all$cif$s, "s") 185 | } 186 | 187 | cat(" GIV ") 188 | .print_matrix_dim(sim$GIV$kon, "kon", newline = FALSE) 189 | .print_matrix_dim(sim$GIV$koff, "koff", newline = FALSE) 190 | .print_matrix_dim(sim$GIV$s, "s") 191 | 192 | cat(" Params ") 193 | if (sim$do_spatial) { 194 | .print_matrix_dim(sim$params_spatial[[1]]$kon, "kon", newline = FALSE) 195 | .print_matrix_dim(sim$params_spatial[[1]]$koff, "koff") 196 | } else { 197 | .print_matrix_dim(sim$params$kon, "kon", newline = FALSE) 198 | .print_matrix_dim(sim$params$koff, "koff") 199 | } 200 | 201 | .print_matrix_dim(sim$CIF_atac, " CIF_atac") 202 | .print_matrix_dim(sim$region_to_gene, " Region2Gene") 203 | .print_matrix_dim(sim$atac_data, " ATAC") 204 | 205 | cat("================================\n") 206 | } 207 | 208 | .print_matrix_dim <- function(mtx, name = NULL, newline = TRUE) { 209 | if (is.null(name)) { 210 | cat(sprintf("%dx%d", nrow(mtx), ncol(mtx))) 211 | } else { 212 | cat(sprintf("%s: %dx%d ", name, nrow(mtx), ncol(mtx))) 213 | } 214 | if (newline) { 215 | cat("\n") 216 | } 217 | } 218 | 219 | .print_time <- function(sim) { 220 | cat(sprintf("Time spent: %.2f mins\n", as.numeric(Sys.time() - sim$start_time, units = "mins"))) 221 | } 222 | 223 | .print_gene_in_grn <- function(sim) { 224 | rg <- sim$GRN$regulators 225 | tg <- sim$GRN$targets 226 | 227 | if (sim$do_spatial) { 228 | 229 | } else { 230 | 231 | } 232 | } 233 | 234 | 235 | #' Simulate a small example dataset with 200 cells and the 100-gene GRN 236 | #' @param ncells number of cells, please increase this number on your machine 237 | #' @param velocity whether to simulate RNA velocity 238 | #' @return the simulation result 239 | #' @export 240 | #' @examples 241 | #' sim_example(ncells = 10) 242 | sim_example <- function(ncells = 10, velocity = FALSE) { 243 | data(GRN_params_100, 
envir = environment()) 244 | options <- list( 245 | rand.seed = 0, 246 | GRN = GRN_params_100, 247 | num.cells = ncells, 248 | num.cifs = 20, 249 | cif.sigma = 0.5, 250 | tree = Phyla3(), 251 | diff.cif.fraction = 0.8, 252 | do.velocity = velocity 253 | ) 254 | sim_true_counts(options) 255 | } 256 | 257 | 258 | #' Simulate a small example dataset with 200 cells and the 100-gene GRN, with CCI enabled 259 | #' @param ncells number of cells, please increase this number on your machine 260 | #' @return the simulation result 261 | #' @export 262 | #' @examples 263 | #' sim_example_spatial(ncells = 10) 264 | sim_example_spatial <- function(ncells = 10) { 265 | data(GRN_params_100, envir = environment()) 266 | lig_params <- data.frame( 267 | target = c(101, 102), 268 | regulator = c(103, 104), 269 | effect = c(5.2, 5.9) 270 | ) 271 | options <- list2( 272 | rand.seed = 0, 273 | GRN = GRN_params_100, 274 | num.genes = 110, 275 | num.cells = ncells, 276 | num.cifs = 50, 277 | tree = Phyla3(), 278 | intrinsic.noise = 0.5, 279 | cci = list( 280 | params = lig_params, 281 | max.neighbors = 4, 282 | cell.type.interaction = "random", 283 | step.size = 0.5 284 | ) 285 | ) 286 | sim_true_counts(options) 287 | } 288 | 289 | atac_dens_nonzero <- function(data) { 290 | x <- data[data > 0] 291 | density(x = log2(x + 1), adjust = 1, n = 999) 292 | } 293 | -------------------------------------------------------------------------------- /R/9.1_shiny.R: -------------------------------------------------------------------------------- 1 | #' Launch the Shiny App to configure the simulation 2 | #' @export 3 | run_shiny <- function() { 4 | appDir <- system.file("shiny-app", package = "scMultiSim") 5 | # appDir <- "inst/shiny-app" 6 | shiny::runApp(appDir, port = 8888, launch.browser = T) 7 | } 8 | 9 | generateSpatialLoc <- function(opt) { 10 | phyla <- opt$tree 11 | step_size <- opt$step_size 12 | ncell <- opt$ncell 13 | is_discrete <- opt$is_discrete 14 | lr_num <- opt$lr_num 15 | ctype_lr 
<- opt$ctype_lr 16 | 17 | ctp <- cci_cell_type_params(phyla, lr_num, ctype_lr, step_size, 18 | rand = TRUE, discrete = is_discrete) 19 | 20 | c(paths, max_layers) %<-% .getPaths( 21 | list(cell = ncell), 22 | list(tree = phyla) 23 | ) 24 | cell_path <- sample(seq_along(paths), ncell, replace = TRUE) 25 | 26 | 27 | tree_info <- .tree_info(phyla) 28 | neutral <- SampleSubtree( 29 | tree_info$root, 0, 0, 30 | tree_info$edges, 31 | max_layers, 32 | step_size, 33 | neutral = NA 34 | ) 35 | 36 | neutral <- neutral[1:max_layers,] 37 | layer_idx_by_path <- lapply(paths, function(path) { 38 | idx <- integer() 39 | for (i in 1:(length(path) - 1)) { 40 | a <- path[i] 41 | b <- path[i + 1] 42 | idx <- c(idx, which(neutral[, 1] == a & neutral[, 2] == b)) 43 | } 44 | idx 45 | }) 46 | 47 | cell_types <- character(length = nrow(neutral)) 48 | for (i in 1:nrow(tree_info$edges)) { 49 | c(id, from, to, len) %<-% tree_info$edges[i,] 50 | n_steps <- len %/% step_size + ceiling(len %% step_size) 51 | pts <- which(neutral[, 1] == from & neutral[, 2] == to) 52 | n_pts <- length(pts) 53 | cell_types[pts] <- if (n_steps == 1) { 54 | paste(from, to, sep = "_") 55 | } else { 56 | type_id <- ceiling(1:n_pts * (n_steps / n_pts)) 57 | paste(from, to, type_id, sep = "_") 58 | } 59 | } 60 | 61 | meta_by_path <- lapply(seq_along(paths), function(i_path) { 62 | idx <- layer_idx_by_path[[i_path]] 63 | n <- neutral[idx,] 64 | data.frame( 65 | pop = apply(n[, 1:2], 1, \(X) paste0(X, collapse = "_")), 66 | cell.type = cell_types[idx] 67 | ) 68 | }) 69 | 70 | if (!is.null(ctp$type_map)) { 71 | for (i in seq_along(meta_by_path)) { 72 | meta_by_path[[i]] <- cbind( 73 | meta_by_path[[i]], 74 | data.frame(cell.type.idx = ctp$type_map[meta_by_path[[i]]$cell.type]) 75 | ) 76 | } 77 | } 78 | 79 | final_ctype <- integer(length = ncell) 80 | for (i in seq_len(ncell)) { 81 | final_ctype[i] <- if (is_discrete) { 82 | meta[i, "cell.type.idx"] 83 | } else { 84 | path_i <- cell_path[i] 85 | layer <- min(ncell - i + 1, 
nrow(meta_by_path[[path_i]])) 86 | meta_by_path[[path_i]][layer, "cell.type.idx"] 87 | } 88 | } 89 | 90 | 91 | grid <- CreateSpatialGrid( 92 | ncells = ncell, 93 | max_nbs = opt$max_nbs, 94 | .grid.size = opt$grid.size, 95 | .same.type.prob = opt$same.type.prob, 96 | .method = opt$layout, 97 | .method.param = NULL, 98 | .nb.radius = 1 99 | ) 100 | 101 | grid$set_final_ctypes(final_ctype) 102 | for (i in 1:ncell) { 103 | new_cell_type <- if (is_discrete) meta[i, "cell.type.idx"] else cell_path[i] 104 | grid$allocate(i, new_cell_type) 105 | } 106 | 107 | grid 108 | } 109 | -------------------------------------------------------------------------------- /R/9_meta.R: -------------------------------------------------------------------------------- 1 | .ver <- "1.2.0" 2 | 3 | #' Show detailed documentations of scMultiSim's parameters 4 | #' 5 | #' @param topic Can be `options`, `dynamic.GRN`, or `cci` 6 | #' @return none 7 | #' @export 8 | #' 9 | #' @examples scmultisim_help() 10 | scmultisim_help <- function(topic = NULL) { 11 | if (is.null(topic)) { 12 | meta_help <- "Call scmultisim_help(topic) where topic can be \"options\" or an option name. Printing help for options by default.\n" 13 | sprintf(.split_long_string(meta_help)) %>% cat() 14 | topic <- "options" 15 | } 16 | 17 | if (topic == "options") { 18 | sprintf("scMultiSim v%s\n", .ver) %>% cat() 19 | .print_opt() 20 | return() 21 | } 22 | 23 | if (topic == "dynamic.GRN") { 24 | .dynamic_grn_default_params(help = TRUE) 25 | return() 26 | } 27 | 28 | if (topic == "cci") { 29 | .cci_help() 30 | return() 31 | } 32 | 33 | .print_opt(topic) 34 | } 35 | 36 | 37 | .cci_help <- function() { 38 | cat(" 39 | To enable simulating cell-cell interaction, the value should be a list including 40 | the following names: 41 | 42 | - grid.size: (integer) 43 | Manually specify the width and height of the grid. 44 | - layout: (character or function) 45 | Supported values are \"enhanced\", \"layers\", \"islands\", or a custom function. 
46 | If set to \"islands\", you can specify which cell types are the islands, 47 | e.g. \"islands:1,2\". 48 | The custom function should take two arguments: (grid_size, cell_types) 49 | grid_size: (integer) 50 | The width and height of the grid. 51 | cell_types: (integer vector) 52 | Each cell's cell type. 53 | It should return a n_cell x 2 matrix, where each row is the x and y coordinates of a cell. 54 | - params: (data.frame) 55 | The spatial effect between neighbor cells. 56 | It should be a data frame similar to the GRN parameter. 57 | - step.size: (number, optional) 58 | If using continuous population, use this step size to further divide the 59 | cell types on the tree. For example, if the tree only has one branch `a -> b` 60 | and the branch length is 1 while the step size is 0.34, 61 | there will be totally three cell types: a_b_1, a_b_2, a_b_3. 62 | - cell.type.interaction: (\"random\" or a matrix) 63 | The interaction level between different cell types. 64 | They act as factors multiplied to the ligand effect. 65 | Supply the string \"random\" to let scMultiSim generate these factors randomly. 66 | Otherwise, use cci_cell_type_params() to generate the template data structure. 67 | See the help of this method for more info. 68 | - cell.type.lr.pairs: (integer vector) 69 | If cell.type.interaction is \"random\", how many LR pairs should be enabled 70 | between each cell type pair. 71 | Should be a range, e.g. 4:6. The actual number of LR pairs will be uniformly 72 | sampled from this range. 73 | - max.neighbors: (integer) 74 | Constrain the maximum number of neighbors with CCI for each cell. 75 | The neighbors with CCI will be randomly sampled. 76 | - radius: (number or string) 77 | Which cells should be considered as neighbors. 78 | The interacting cells are those within these neighbors. 79 | When it is a number, it controls the maximum distance between two cells for 80 | them to interact. 
81 | When it is a string, it should be in the format `gaussian:sigma`, for example, 82 | `gaussian:1.2`. 83 | In this case, the probability of two cells interacting is proportional to 84 | the distance with a Gaussian kernel applied. 85 | - start.layer: (integer) 86 | From which layer (time step) the simulation should start. 87 | If set to 1, the simulation will start with one cell in the grid and add one 88 | more cell in each following layer. 89 | If set to `num_cells`, the simulation will start from all cells available in 90 | the grid and only continues for a few static layers, which will greatly speed 91 | up the simulation. 92 | ") 93 | } 94 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' distribution of kinetic parameters learned from the Zeisel UMI cortex datasets 2 | #' @name match_params 3 | #' @docType data 4 | #' @usage data(param_realdata.zeisel.imputed) 5 | #' @format a data frame. 6 | #' @return a data frame. 7 | #' @keywords datasets internal 8 | #' @examples 9 | #' data(param_realdata.zeisel.imputed) 10 | "match_params" 11 | 12 | 13 | #' a pool of gene lengths to sample from 14 | #' @name gene_len_pool 15 | #' @docType data 16 | #' @usage data(gene_len_pool) 17 | #' @format a vector. 18 | #' @return a vector of gene lengths. 19 | #' @keywords datasets internal 20 | #' @examples 21 | #' data(gene_len_pool) 22 | "gene_len_pool" 23 | 24 | 25 | #' from transcript length to number of fragments (for the nonUMI protocol) 26 | #' @name len2nfrag 27 | #' @docType data 28 | #' @usage data(len2nfrag) 29 | #' @format a vector. 30 | #' @return a vector. 
31 | #' @keywords datasets internal 32 | #' @examples 33 | #' data(len2nfrag) 34 | "len2nfrag" 35 | 36 | 37 | #' this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data 38 | #' @name dens_nonzero 39 | #' @docType data 40 | #' @usage data(dens_nonzero) 41 | #' @format a vector. 42 | #' @return a vector. 43 | #' @keywords datasets 44 | #' @examples 45 | #' data(dens_nonzero) 46 | "dens_nonzero" 47 | 48 | 49 | #' 100_gene_GRN is a matrix of GRN params consisting of 100 genes where: # - column 1 is the target gene ID, # - column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene # - column 3 is the effect of the column 2 gene ID on the column 1 gene ID 50 | #' @name GRN_params_100 51 | #' @docType data 52 | #' @usage data(GRN_params_100) 53 | #' @format a data frame. 54 | #' @return a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene. 55 | #' @keywords datasets 56 | #' @examples 57 | #' data(GRN_params_100) 58 | "GRN_params_100" 59 | 60 | 61 | #' GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: # - column 1 is the target gene ID, # - column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene # - column 3 is the effect of the column 2 gene ID on the column 1 gene ID 62 | #' @name GRN_params_1139 63 | #' @docType data 64 | #' @usage data(GRN_params_1139) 65 | #' @format a data frame. 66 | #' @return a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene. 
67 | #' @keywords datasets 68 | #' @examples 69 | #' data(GRN_params_1139) 70 | "GRN_params_1139" 71 | 72 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | #' @import ggplot2 2 | #' @import ape 3 | #' @import rlang 4 | #' @import foreach 5 | #' @import markdown 6 | #' @importFrom stats cor density dist dnorm hclust median na.omit rbeta rbinom rnorm rpois runif setNames 7 | #' @importFrom Rtsne Rtsne 8 | #' @importFrom utils data write.csv 9 | #' @importFrom dplyr %>% 10 | #' @importFrom zeallot %<-% %->% 11 | #' @importFrom SummarizedExperiment SummarizedExperiment 12 | #' @importFrom BiocParallel bplapply MulticoreParam 13 | NULL 14 | -------------------------------------------------------------------------------- /R/results.R: -------------------------------------------------------------------------------- 1 | rna_velo_knn <- function(results, velocity, perplexity = 70, randseed = 0, raw = FALSE) { 2 | # set.seed(randseed) 3 | counts_s <- results$counts 4 | pop <- results$cell_meta$pop 5 | depth <- results$cell_meta$depth 6 | 7 | counts_s_lg <- t(log2(counts_s + 1)) 8 | 9 | if (is.null(results$velocity)) { 10 | stop("The result object is not produced in velocity mode.") 11 | } 12 | 13 | process_velocity <- function(v) { 14 | assertthat::assert_that( 15 | nrow(counts_s) == nrow(v), 16 | ncol(counts_s) == ncol(v) 17 | ) 18 | 19 | future_counts_s <- counts_s + v 20 | future_counts_s[future_counts_s < 0] <- 0 21 | future_counts_s_lg <- t(log2(future_counts_s + 1)) 22 | future_counts_s_lg - counts_s_lg 23 | } 24 | 25 | 26 | normalize_velocity <- function(v) { 27 | v_normalizer <- apply(v, 2, \(vi) vi^2) %>% rowSums() %>% sqrt() 28 | t(t(v) / v_normalizer) 29 | } 30 | 31 | if (raw) { 32 | return( 33 | paired_simil(velocity, results$velocity, method = "cosine") 34 | ) 35 | } 36 | 37 | dist_obj <- dist(counts_s_lg) 38 | dist_mat <- 
as.matrix(dist_obj) 39 | n_cells <- nrow(dist_mat) 40 | k <- ceiling(n_cells / 50) 41 | 42 | v_knn <- process_velocity(velocity) %>% 43 | apply(2, \(vi) 44 | distMat.KernelKnn(dist_mat, TEST_indices = NULL, 45 | weights_function = 'gaussian', 46 | y = vi, k = k, regression = TRUE) 47 | ) %>% 48 | normalize_velocity() 49 | 50 | v_true_knn <- process_velocity(results$velocity) %>% 51 | apply(2, \(vi) 52 | distMat.KernelKnn(dist_mat, TEST_indices = NULL, 53 | weights_function = 'gaussian', 54 | y = vi, k = k, regression = TRUE) 55 | ) %>% 56 | normalize_velocity() 57 | 58 | sim <- paired_simil(v_knn, v_true_knn, method = "cosine") 59 | 60 | mean(sim) 61 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scMultiSim 2 | 3 | **Table of contents** 4 | 5 | - [Tutorials and documentation](#tutorials-and-documentation) 6 | - [Installation](#installation) 7 | - [Shiny App](#shiny-app) 8 | - [FAQ](#faq) 9 | - [Contact and reference](#contact) 10 | 11 | scMultiSim is an in silico simulator that generates multi-modality data of single-cells, including gene expression, chromatin accessibility, RNA velocity, and spatial location of cells. It takes a cell differential tree and a gene regulatory network (GRN) as input, and simulates spliced and unspliced counts while accounting for the relationships between modalities. The output single cell gene expression data is determined by three factors: cell-cell interactions, within-cell GRNs and chromatin accessibility. Users can tune the effect of each factor on the output data and set various parameters for the underlying model. Furthermore, the GRN can be set in a time-varying mode where the network's structure changes temporally to reflect the dynamic nature of biological networks. We also provide options to simulate technical variations such as batch effects. 
scMultiSim can be used to benchmark challenging computational tasks on single-cell multi-omics data, including the inference of GRNs, estimation of RNA velocity, integration of single-cell datasets from multiple batches and modalities, and analysis of cell-cell interaction using the cell spatial location data. 12 | 13 | ![Overview](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/scMultisim.png) 14 | 15 | The following figure briefly shows results from the same cell differential tree: 16 | 17 | 1. Connected scATAC-seq and scRNA-seq, in continuous or discrete mode. Visualized by t-SNE. 18 | 2. GRN correlation heatmap, where genes regulated by the same regulator have similar correlations with others. 19 | 3. Unspliced counts and RNA velocity ground truth visualized by t-SNE. 20 | 4. Spatial cell locations and cell-cell interaction ground truth. 21 | 5. Discrete cell population with added batch effects. 22 | 23 | ![Results](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/results.png) 24 | 25 | ## Tutorials and documentation 26 | 27 | Please check out the [tutorials](https://zhanglabgt.github.io/scMultiSim/articles) 28 | for detailed instructions on how to use scMultiSim. 29 | 30 | ## Installation 31 | 32 | scMultiSim can be installed from Bioconductor using the following command: 33 | 34 | ```R 35 | if (!require("BiocManager")) { 36 | install.packages("BiocManager") 37 | } 38 | 39 | BiocManager::install("scMultiSim") 40 | ``` 41 | 42 | ## Shiny App 43 | 44 | A Shiny app is provided to help users visualize the effect of each parameter and adjust the simulation options. 45 | To run the app, simply call `run_shiny()`. 46 | 47 | 48 | 49 | ## FAQ 50 | 51 | ### Running Speed 52 | 53 | Simulations should finish in a reasonable time in most cases. On a machine with an i7-12700K CPU and 64GB RAM, using 1000 cells, 100 genes and 50 CIFs, the simulation took under 1 minute to generate both scRNA-seq and scATAC-seq data. 
If also generating unspliced and spliced counts, or enabling cell-cell interactions, the running time is longer (~3 minutes when RNA velocity is enabled, and 30 minutes for 500 cells with spatial cell-cell interaction enabled). 54 | 55 | ## Contact and reference 56 | 57 | GitHub issues are welcomed. 58 | It is also possible to send email to the main author 59 | `Hechen Li (hli691 at gatech.edu)`. 60 | 61 | ### Please cite 62 | 63 | Hechen Li, Ziqi Zhang, Michael Squires, Xi Chen, and Xiuwei Zhang. 2023. “scMultiSim: Simulation of Multi-Modality Single Cell Data Guided by Cell-Cell Interactions and Gene Regulatory Networks.” bioRxiv. 64 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://zhanglabgt.github.io/scMultiSim/ 2 | template: 3 | bootstrap: 5 4 | light-switch: true 5 | articles: 6 | - title: Tutorials 7 | navbar: ~ 8 | contents: 9 | - workflow 10 | - basics 11 | - spatialCCI 12 | - options 13 | reference: 14 | - title: Simulation 15 | desc: Functions for simulating single-cell data. 16 | - contents: 17 | - sim_true_counts 18 | - add_expr_noise 19 | - divide_batches 20 | - add_outliers 21 | - title: Visualization 22 | desc: Functions for visualizing the results. 23 | - contents: 24 | - starts_with("plot_") 25 | - gene_corr_cci 26 | - gene_corr_regulator 27 | - title: Help 28 | desc: Functions for getting help. 29 | - contents: 30 | - run_shiny 31 | - scmultisim_help 32 | - title: Utilities 33 | desc: Utility functions that can be useful for simulating data. 34 | - contents: 35 | - cci_cell_type_params 36 | - gen_clutter 37 | - title: Data 38 | desc: Default data provided by scMultiSim 39 | - contents: 40 | - starts_with("Phyla") 41 | - has_keyword("datasets") 42 | - title: Internal helpers 43 | desc: Internal helper functions, but can be useful for advanced customization. 
44 | - contents: 45 | - Get_1region_ATAC_correlation 46 | - Get_ATAC_correlation 47 | - True2ObservedATAC 48 | - True2ObservedCounts 49 | - sim_example 50 | - sim_example_spatial 51 | -------------------------------------------------------------------------------- /data/GRN_params_100.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/GRN_params_100.RData -------------------------------------------------------------------------------- /data/GRN_params_1139.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/GRN_params_1139.RData -------------------------------------------------------------------------------- /data/dens_nonzero.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/dens_nonzero.RData -------------------------------------------------------------------------------- /data/gene_len_pool.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/gene_len_pool.RData -------------------------------------------------------------------------------- /data/len2nfrag.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhangLabGT/scMultiSim/12e3799a445316c93df9fc357909c796cfc61f6e/data/len2nfrag.RData -------------------------------------------------------------------------------- /data/param_realdata.zeisel.imputed.RData: -------------------------------------------------------------------------------- 
library(shiny)

# Describe every object in the global environment.
#
# The result is sent to the front end as the "RObjects" custom message.
# Returns an unnamed list with one element per object; each element is a list
# holding `name`, `type` ("data.frame", "list", "vector" or "other") and, for
# lists and vectors, `length`.
getGlobalObjects <- function() {
  lapply(ls(envir = .GlobalEnv), function(n) {
    obj <- get(n, envir = .GlobalEnv)
    if (is.data.frame(obj) || is.matrix(obj)) {
      list(name = n, type = "data.frame")
    } else if (is.list(obj)) {
      # Tested before is.vector(): is.vector() is TRUE for plain lists, which
      # used to make this branch unreachable and label lists as "vector".
      list(name = n, type = "list", length = length(obj))
    } else if (is.vector(obj)) {
      list(name = n, type = "vector", length = length(obj))
    } else {
      list(name = n, type = "other")
    }
  })
}

# Collect values from `.opt_list()` keyed by option name.
#
# For every named entry, the entry's first element `val` is inspected: when
# `val[[1]]` is falsy, `val[[2]]` is stored under the entry's name.
# NOTE(review): `val[[1]]` presumably flags a required option and `val[[2]]`
# is its default value -- confirm against the definition of `.opt_list()`.
getOptDefaults <- function() {
  opt_list <- .opt_list()
  opt_names <- names(opt_list)
  res <- list()

  for (i in seq_along(opt_names)) {
    n <- opt_names[i]
    if (n == "") next
    val <- opt_list[[i]][[1]]
    if (!val[[1]]) {
      res[[n]] <- val[[2]]
    }
  }

  res
}


# Shiny server: validates the selected GRN and tree, generates spatial
# layouts on request, and exchanges custom messages with the HTML front end.
server <- function(input, output, session) {
  # Summary of the GRN object named by `input$GRN`.
  # Returns list(error = <message>) on failure, otherwise the data frame plus
  # edge/gene counts with error = FALSE.
  grn_info <- reactive({
    grn_name <- input$GRN
    print(grn_name)
    # Guard the lookup: get() throws on NULL / empty / unknown names, and an
    # error inside the observer that consumes this reactive is unhandled.
    name_ok <- is.character(grn_name) && length(grn_name) == 1 &&
      !is.na(grn_name) && nzchar(grn_name)
    if (!name_ok || !exists(grn_name)) {
      return(list(error = "GRN object not found"))
    }
    grn_df <- get(grn_name)
    if (!is.data.frame(grn_df)) {
      return(list(error = "GRN must be a data frame"))
    }
    if (ncol(grn_df) != 3) {
      return(list(error = "GRN must have 3 columns"))
    }
    # Column 1 holds target genes, column 2 their regulators.
    rg_genes <- unique(grn_df[, 2])
    tg_genes <- unique(grn_df[, 1])
    all_genes <- unique(c(rg_genes, tg_genes))
    list(
      data = grn_df,
      ngenes = length(all_genes),
      nrows = nrow(grn_df),
      nregulators = length(rg_genes),
      ntargets = length(tg_genes),
      error = FALSE
    )
  })

  # Tree selected in the UI: one of the built-in examples or a custom R
  # expression. Returns list(error = <message>) on failure, otherwise the tree
  # and its edge table / node sets with error = FALSE.
  tree_info <- reactive({
    tree_name <- input$tree
    if (is.null(tree_name)) {
      return(list(error = "No tree selected"))
    }
    tree <- switch(tree_name,
      phyla1 = Phyla1(),
      phyla3 = Phyla3(),
      phyla5 = Phyla5(),
      # SECURITY: evaluates arbitrary R code received from the browser. Only
      # acceptable while the app is bound to a trusted local session.
      tryCatch(
        eval(parse(text = tree_name), envir = .GlobalEnv),
        error = function(e) NULL
      )
    )
    # Also rejects expressions that evaluate to something other than a phylo
    # object, which previously crashed below on `tree$edge`.
    if (!inherits(tree, "phylo")) {
      return(list(error = paste0("Error loading tree: ", tree_name, ", please check syntax")))
    }
    edges <- cbind(seq_len(nrow(tree$edge)), tree$edge, tree$edge.length)
    colnames(edges) <- c("id", "from", "to", "len")
    parents <- unique(edges[, 2])
    children <- unique(edges[, 3])
    root <- as.numeric(setdiff(parents, children))
    tips <- as.numeric(setdiff(children, parents))
    # NOTE(review): this is the union of all parent and child nodes, i.e.
    # every node rather than only internal ones -- kept as-is because the
    # front end's expectation is not visible here.
    internal <- as.numeric(union(parents, children))

    list(error = FALSE, tree = tree, edges = edges, root = root, tips = tips, internal = internal)
  })

  # Spatial layout, regenerated whenever the front end sets
  # `input$submit_spatial` (whose value carries the layout options).
  grid <- eventReactive(input$submit_spatial, {
    opt <- input$submit_spatial
    g <- generateSpatialLoc(list(
      layout = opt$layout,
      tree = tree_info()$tree,
      step_size = opt$stepSize,
      ncell = opt$ncell,
      is_discrete = FALSE,
      lr_num = 0,
      ctype_lr = 0,
      grid.size = opt$gridSize,
      same.type.prob = opt$sameTypeProb,
      max_nbs = 4
    ))

    list(
      locs = g$locs,
      size = g$grid_size,
      final_types = g$final_types
    )
  })

  # HTML table with the first rows of the GRN ----
  output$grn_head <- renderTable({
    info <- grn_info()
    if (is.character(info$error)) {
      NULL
    } else if (is.null(info$data)) {
      "No data available"
    } else {
      head(data.frame(info$data))
    }
  }, html.table.attributes = 'class="table table-sm"')

  # One-line textual summary of the GRN ----
  output$grn_summary <- renderText({
    info <- grn_info()
    if (is.character(info$error)) {
      paste0("Error: ", info$error)
    } else if (is.null(info$data)) {
      "No data available"
    } else {
      paste("GRN with", info$nrows, "edges and" , info$ngenes, "genes, incl.", info$nregulators, "regulators and", info$ntargets, "targets")
    }
  })

  # Plot of the selected tree ----
  output$tree_plot <- renderPlot({
    info <- tree_info()
    # tree_info() reports failures through a character `error` field and never
    # returns NULL, so the error field must be checked before plotting.
    if (is.null(info) || is.character(info$error)) {
      NULL
    } else {
      plot(info$tree, no.margin = TRUE)
      nodelabels()
    }
  })

  # Push server-side state to the front end ----
  observe({
    session$sendCustomMessage(type = "RObjects", getGlobalObjects())
  })
  observe({
    session$sendCustomMessage(type = "GRNInfo", grn_info())
  })
  observe({
    session$sendCustomMessage(type = "TreeInfo", tree_info())
  })
  observe({
    session$sendCustomMessage(type = "Grid", grid())
  })
  observe({
    session$sendCustomMessage(type = "Defaults", getOptDefaults())
  })

  # Evaluate the option list generated in the browser in the global
  # environment.
  observe({
    code <- input$generatedOptions
    print(code)
    if (!is.null(code)) {
      # SECURITY: evaluates arbitrary R code received from the browser. Only
      # acceptable while the app is bound to a trusted local session.
      tryCatch(
        eval(parse(text = code), envir = .GlobalEnv),
        error = function(e) {
          warning("Could not evaluate generated options: ", conditionMessage(e), call. = FALSE)
        }
      )
    }
  })

  # Stop the app when the front end asks for it ----
  observe({
    print(input$stopApp)
    if (is.character(input$stopApp) && input$stopApp == "YES") {
      stopApp()
    }
  })
}

a <- shinyApp(ui = htmlTemplate("www/index.html"), server)
// Option defaults pushed by the server through the "Defaults" custom message.
const defaultValues = {};

/**
 * Tell whether a form value equals the server-side default of an option.
 *
 * Form values are strings (checkboxes are rendered as "TRUE"/"FALSE"), while
 * defaults arrive as JSON, so numbers and booleans are compared by value
 * rather than with strict equality.
 *
 * @param {string} value - value about to be written into the option list
 * @param {*} defaultValue - entry of `defaultValues`, possibly undefined
 * @returns {boolean} true when the option can be omitted
 */
function isDefaultValue(value, defaultValue) {
  if (defaultValue === undefined || defaultValue === null) return false;
  if (typeof defaultValue === "boolean") {
    return value === (defaultValue ? "TRUE" : "FALSE");
  }
  if (typeof defaultValue === "number") {
    // Number("") is 0, so blank input must not match a default of 0.
    return String(value).trim() !== "" && Number(value) === defaultValue;
  }
  return value === defaultValue;
}

/**
 * Read the form, build the R statement `options <- list(...)` containing every
 * option that differs from its default, send it to the server as the
 * `generatedOptions` input and open the output modal.
 */
function outputOptions() {
  const options = {};

  /**
   * Record one top-level option.
   *
   * @param {string} name - option name in the generated R list
   * @param {string|Function|null} getter - literal R text, or a function
   *   returning it; when omitted the value is read from the form
   * @param {string|null} inputName - form field name if it differs from `name`
   */
  function add(name, getter = null, inputName = null) {
    let value;
    if (getter) {
      value = typeof getter === "function" ? getter() : getter;
    } else {
      const fieldName = inputName || name;
      const el = document.querySelector(`[name="${fieldName}"]`);
      if (el === null) {
        // Report the selector that was actually used, not just the option name.
        console.error(`Element with name ${fieldName} not found`);
        return;
      }
      value = el.type === "checkbox" ? el.checked.toString().toUpperCase() : el.value;
    }
    if (!isDefaultValue(value, defaultValues[name])) {
      options[name] = value;
    }
  }

  const isDiscrete = document.querySelector("[name=_cellPop]:checked").value === "discrete";
  const grnEnabled = document.querySelector("[name=_grnEnabled]:checked").value === "on";

  add("seed");
  add("speed.up");

  add("num.cifs");
  add("diff.cif.fraction");
  add("cif.center");
  add("cif.sigma");
  add("giv.mean");
  add("giv.sd");
  add("giv.prob");
  add("num.cells");

  if (isDiscrete) {
    add("discrete.cif", "TRUE");
    const numCluster = document.querySelector("[name=_numCluster]").value;
    add("tree", `rtree(${numCluster})`);
    add("discrete.min.pop.size");
    const discretePopSize = document.querySelector("[name=discretePopSize]").value;
    if (discretePopSize.length > 0) {
      add("discrete.pop.size", `as.integer(c(${discretePopSize}))`);
    }
  } else {
    add("tree", () => {
      const treeVal = document.querySelector("[name=tree]:checked").value;
      if (treeVal === "custom") {
        return document.querySelector("[name='_treeCustom']").value;
      }
      return {
        phyla1: "Phyla1()",
        phyla3: "Phyla3()",
        phyla5: "Phyla5()",
      }[treeVal];
    });
    add("use.impulse");
  }

  if (grnEnabled) {
    add("GRN");
    add("num.genes");
  } else {
    add("GRN", "NA");
    add("num.genes", null, "num.genes2");
  }

  const useCustomATACDensity = document.querySelector("[name=useCustomATACDensity]").checked;

  add("riv.mean");
  add("riv.sd");
  add("riv.prob");

  add("region.distrib", () => {
    const v = [0, 1, 2].map((i) => document.querySelector(`[name=_regionDist${i}]`).value).join(", ");
    return `c(${v})`;
  });
  add("atac.effect");
  add("atac.p_zero");
  if (useCustomATACDensity) {
    add("atac.density");
  }

  add("scale.s");
  add("bimod");

  const velocityEnabled = document.querySelector("[name=velocity]:checked").value === "on";
  if (velocityEnabled) {
    add("do.velocity", "TRUE");
    add("beta");
    add("d");
    add("num.cycles");
    add("cycle.len");
  } else {
    add("intrinsic.noise");
  }

  // Spatial (CCI) options; stays null when spatial simulation is disabled.
  let cciOptions = null;

  /**
   * Record one spatial option; form fields are named `sp.<name>`.
   *
   * @param {string} name - option name inside the generated `cci` list
   * @param {string|Function|null} getter - literal R text, or a function
   *   returning it; when omitted the value is read from the form
   */
  function add_sp(name, getter = null) {
    let value;
    if (getter) {
      value = typeof getter === "function" ? getter() : getter;
    } else {
      const el = document.querySelector(`[name="sp.${name}"]`);
      if (el === null) {
        console.error(`Element with name ${name} not found`);
        return;
      }
      value = el.type === "checkbox" ? el.checked.toString().toUpperCase() : el.value;
    }
    cciOptions[name] = value;
  }

  const spatialEnabled = document.querySelector("[name=_spatialEnabled]").checked;
  if (spatialEnabled) {
    cciOptions = {};

    add_sp("grid.size");
    add_sp("step.size");
    add_sp("max.neighbors");
    add_sp("params", () => document.querySelector("[name=_CCI]").value);

    const spLayout = document.querySelector("[name=_spLayout]").value;
    if (spLayout === "normal") {
      add_sp("layout", `"enhanced"`);
      add_sp("same.type.prob");
    } else if (spLayout === "islands") {
      const spIslands = document.querySelector("[name=_spIslands]").value;
      add_sp("layout", `"islands:${spIslands}"`);
    } else if (spLayout === "layers") {
      add_sp("layout", `"layers"`);
    }

    if (cciOptions.params.length === 0) {
      alert("Please select a CCI dataframe to simulate spatial data.");
      // Stop here: continuing would emit `params = ` (an empty R argument)
      // and hand unparsable code to the server.
      return;
    }
  }

  console.log(options, cciOptions);

  const toAssignment = ([k, v]) => `${k} = ${v}`;
  const parts = Object.entries(options).map(toAssignment);
  if (cciOptions) {
    parts.push(`cci = list(${Object.entries(cciOptions).map(toAssignment).join(", ")})`);
  }
  // Join all arguments in one pass so an empty top-level option set cannot
  // produce a leading comma (`list( , cci = ...)`).
  const optString = `options <- list(
  ${parts.join(", ")}
)`;
  Shiny.setInputValue("generatedOptions", optString);
  const modal = new bootstrap.Modal(document.getElementById("outputModal"));
  modal.show();
}

/**
 * Wire up the output / reset / stop buttons and the handler that receives the
 * option defaults from the server.
 */
function init() {
  const outputButton = document.querySelector("#outputButton");
  outputButton.addEventListener("click", outputOptions);

  document.querySelector("#resetButton").addEventListener("click", () => {
    location.reload();
  });

  document.querySelector("#stopApp").addEventListener("click", () => {
    Shiny.setInputValue("stopApp", "YES");
    close();
  });

  Shiny.addCustomMessageHandler("Defaults", (v) => {
    console.log("Def", v);
    Object.assign(defaultValues, v);
  });
}

document.addEventListener("DOMContentLoaded", init);
/**
 * Mark an input as invalid and show `msg` in the next sibling element that
 * carries the `invalid-feedback` class.
 *
 * @param {Element} el - the input element
 * @param {string|null} msg - message to display
 */
function setInvalid(el, msg) {
  el.classList.add("is-invalid");
  let sb = el.nextElementSibling;
  while (sb !== null && !sb.classList.contains("invalid-feedback")) {
    sb = sb.nextElementSibling;
  }
  if (sb === null) {
    console.error("No sibling with class 'invalid-feedback' found");
    // Nothing to write the message into; bail out instead of dereferencing null.
    return;
  }
  sb.innerText = msg;
}

/**
 * Clear the invalid state of an input.
 *
 * @param {Element} el - the input element
 */
function setValid(el) {
  el.classList.remove("is-invalid");
}

/**
 * Validate a group of inputs together.
 *
 * @param {Element[]} els - the grouped input elements
 * @param {Function} custom_fn - receives `els`, returns `[valid, message]`
 */
function validateMultipleElements(els, custom_fn) {
  const [valid, msg] = custom_fn(els);
  if (valid) {
    els.forEach((el) => setValid(el));
  } else {
    els.forEach((el) => setInvalid(el, msg));
  }
}

/**
 * Validate one input, either with `custom_fn` (receives the element, returns
 * `[valid, message]`) or from its `data-v` attribute, which has the form
 * `type[:min-max]`: type "i" = integer, "f" = float, "n" = no validation.
 *
 * @param {HTMLInputElement} el - the input element
 * @param {Function} [custom_fn] - optional custom validator
 */
function validateSingleElement(el, custom_fn) {
  if (typeof custom_fn === "function") {
    const [valid, msg] = custom_fn(el);
    if (valid) {
      setValid(el);
    } else {
      setInvalid(el, msg);
    }
    return;
  }

  const data = el.dataset.v;
  if (typeof data !== "string") {
    console.warn("Validation: data attribute not found", el);
    return;
  }
  const [type, valueRange] = data.split(":");

  let value = el.value;
  const isNumeric = type === "i" || type === "f";
  if (isNumeric) {
    if (type === "i" && /[^\d]/.test(value)) {
      setInvalid(el, "Please enter an integer number");
      return;
    }
    value = type === "i" ? parseInt(value, 10) : parseFloat(value);
    if (isNaN(value)) {
      setInvalid(el, `Please enter a number`);
      return;
    }
  } else if (type === "n") {
    return;
  }

  if (valueRange !== undefined) {
    const [min, max] = valueRange.split("-");
    // For numeric types make the string -> number conversion explicit; a
    // missing bound becomes NaN, for which the comparison is always false.
    const lo = isNumeric ? Number(min) : min;
    const hi = isNumeric ? Number(max) : max;
    if (value < lo || value > hi) {
      setInvalid(el, `Please enter a value between ${min} and ${max}`);
      return;
    }
  }

  setValid(el);
}

/**
 * Validate now and re-validate on every `input` event.
 *
 * @param {string|string[]} name - a field name, or several names that are
 *   validated as one group (the group form requires `custom_fn`)
 * @param {Function} [custom_fn] - custom validator, see the functions above
 */
function validate(name, custom_fn) {
  if (Array.isArray(name)) {
    const els = name.map((n) => document.querySelector(`[name="${n}"]`));
    const missing = name.filter((_, i) => els[i] === null);
    if (missing.length > 0) {
      console.error(`Element with name ${missing.join(", ")} not found`);
      return;
    }
    validateMultipleElements(els, custom_fn);
    for (const el of els) {
      el.addEventListener("input", () => {
        validateMultipleElements(els, custom_fn);
      });
    }
    return;
  }
  const el = document.querySelectorAll(`[name="${name}"]`);
  if (el.length === 0) {
    console.error(`Element with name ${name} not found`);
  } else if (el.length === 1) {
    validateSingleElement(el[0], custom_fn);
    el[0].addEventListener("input", () => {
      validateSingleElement(el[0], custom_fn);
    });
  }
}

/**
 * Register the custom validators, then attach attribute-driven validation to
 * every remaining input that carries a `data-v` attribute.
 */
function init() {
  const validatedNames = new Set();

  function customValidate(n, fn) {
    validate(n, fn);
    // Record each field name individually; storing the array itself would
    // never match `input.name` in the loop below.
    (Array.isArray(n) ? n : [n]).forEach((fieldName) => validatedNames.add(fieldName));
  }

  customValidate("discretePopSize", (el) => {
    const popType = document.querySelector("input[name='_cellPop']:checked").value;
    if (popType === "continuous" || el.value.length === 0) return [true, null];
    const nClus = parseInt(document.querySelector("input[name='_numCluster']").value, 10);
    const nCell = parseInt(document.querySelector("input[name='num.cells']").value, 10);
    const values = el.value.split(",").map((v) => parseInt(v, 10));
    if (values.length !== nClus) return [false, "Number of clusters not matching"];
    if (values.reduce((a, b) => a + b, 0) !== nCell) return [false, "Number of cells not matching"];
    return [true, null];
  });

  // Group validator: it receives the array of the three inputs.
  customValidate(["_regionDist0", "_regionDist1", "_regionDist2"], (els) => {
    const distValues = els.map((el) => parseFloat(el.value));
    if (distValues.some((v) => isNaN(v) || v < 0 || v > 1)) {
      return [false, "Please enter a value between 0 and 1"];
    }
    const total = distValues.reduce((a, b) => a + b, 0);
    // Compare with a tolerance: decimal fractions rarely sum to exactly 1.
    if (Math.abs(total - 1) > 1e-9) return [false, "Sum of all values should be 1"];
    return [true, null];
  });

  const allInputs = document.querySelectorAll("input");
  for (const input of allInputs) {
    if (typeof input.dataset.v === "string") {
      if (input.dataset.v !== "c" && !validatedNames.has(input.name)) {
        validate(input.name);
        validatedNames.add(input.name);
      }
    } else {
      if (!(input.type === "radio" || input.type === "checkbox")) {
        console.warn("Validation: data attribute not found", input);
      }
    }
  }
}

document.addEventListener("DOMContentLoaded", init);
9 | } 10 | \usage{ 11 | data(GRN_params_100) 12 | } 13 | \value{ 14 | a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene. 15 | } 16 | \description{ 17 | 100_gene_GRN is a matrix of GRN params consisting of 100 genes where: # - column 1 is the target gene ID, # - column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene # - column 3 is the effect of the column 2 gene ID on the column 1 gene ID 18 | } 19 | \examples{ 20 | data(GRN_params_100) 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/GRN_params_1139.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{GRN_params_1139} 5 | \alias{GRN_params_1139} 6 | \title{GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: # - column 1 is the target gene ID, # - column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene # - column 3 is the effect of the column 2 gene ID on the column 1 gene ID} 7 | \format{ 8 | a data frame. 9 | } 10 | \usage{ 11 | data(GRN_params_1139) 12 | } 13 | \value{ 14 | a data frame with three columns: target gene ID, TF gene ID, and the effect of TF on target gene. 
15 | } 16 | \description{ 17 | GRN_params_1139 is a matrix of GRN params consisting of 1139 genes where: # - column 1 is the target gene ID, # - column 2 is the gene ID which acts as a transcription factor for the target (regulated) gene # - column 3 is the effect of the column 2 gene ID on the column 1 gene ID 18 | } 19 | \examples{ 20 | data(GRN_params_1139) 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/Get_1region_ATAC_correlation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{Get_1region_ATAC_correlation} 4 | \alias{Get_1region_ATAC_correlation} 5 | \title{This function gets the average correlation rna seq counts and region effect on genes for genes which are only associated with 1 chromatin region} 6 | \usage{ 7 | Get_1region_ATAC_correlation(counts, atacseq_data, region2gene) 8 | } 9 | \arguments{ 10 | \item{counts}{rna seq counts} 11 | 12 | \item{atacseq_data}{atac seq data} 13 | 14 | \item{region2gene}{a 0 1 coupling matrix between regions and genes of shape (nregions) x (num_genes), where a value of 1 indicates the gene is affected by a particular region} 15 | } 16 | \value{ 17 | the correlation value 18 | } 19 | \description{ 20 | This function gets the average correlation rna seq counts and region effect on genes for genes which are only associated with 1 chromatin region 21 | } 22 | \examples{ 23 | \donttest{ 24 | results <- sim_example(ncells = 10) 25 | Get_1region_ATAC_correlation(results$counts, results$atacseq_data, results$region_to_gene) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/Get_ATAC_correlation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/7_benchmark.R 3 | \name{Get_ATAC_correlation} 4 | \alias{Get_ATAC_correlation} 5 | \title{This function gets the average correlation rna seq counts and chromatin region effect on genes} 6 | \usage{ 7 | Get_ATAC_correlation(counts, atacseq_data, num_genes) 8 | } 9 | \arguments{ 10 | \item{counts}{rna seq counts} 11 | 12 | \item{atacseq_data}{atac seq data} 13 | 14 | \item{num_genes}{number of genes} 15 | } 16 | \value{ 17 | the correlation value 18 | } 19 | \description{ 20 | This function gets the average correlation rna seq counts and chromatin region effect on genes 21 | } 22 | \examples{ 23 | results <- sim_example(ncells = 10) 24 | Get_ATAC_correlation(results$counts, results$atacseq_data, results$num_genes) 25 | } 26 | -------------------------------------------------------------------------------- /man/OP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/0_opts.R 3 | \name{OP} 4 | \alias{OP} 5 | \title{Get option from an object in the current environment} 6 | \usage{ 7 | OP(..., .name = "options") 8 | } 9 | \arguments{ 10 | \item{...}{the parameter name} 11 | 12 | \item{.name}{get option from this object} 13 | } 14 | \value{ 15 | the parameter value 16 | } 17 | \description{ 18 | Get option from an object in the current environment 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/Phyla1.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{Phyla1} 4 | \alias{Phyla1} 5 | \title{Creating a linear example tree} 6 | \usage{ 7 | Phyla1(len = 1) 8 | } 9 | \arguments{ 10 | \item{len}{length of the tree} 11 | } 12 | \value{ 13 | a R phylo object 14 | } 15 | \description{ 16 | Creating a linear example tree 17 | } 18 | \examples{ 19 | 
Phyla1(len = 1) 20 | } 21 | -------------------------------------------------------------------------------- /man/Phyla3.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{Phyla3} 4 | \alias{Phyla3} 5 | \title{Creating an example tree with 3 tips} 6 | \usage{ 7 | Phyla3(plotting = FALSE) 8 | } 9 | \arguments{ 10 | \item{plotting}{True for plotting the tree on console, False for no plot} 11 | } 12 | \value{ 13 | a R phylo object 14 | } 15 | \description{ 16 | Creating an example tree with 3 tips 17 | } 18 | \examples{ 19 | Phyla3() 20 | } 21 | -------------------------------------------------------------------------------- /man/Phyla5.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{Phyla5} 4 | \alias{Phyla5} 5 | \title{Creating an example tree with 5 tips} 6 | \usage{ 7 | Phyla5(plotting = FALSE) 8 | } 9 | \arguments{ 10 | \item{plotting}{True for plotting the tree on console, False for no plot} 11 | } 12 | \value{ 13 | a R phylo object 14 | } 15 | \description{ 16 | Creating an example tree with 5 tips 17 | } 18 | \examples{ 19 | Phyla5() 20 | } 21 | -------------------------------------------------------------------------------- /man/SampleDen.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{SampleDen} 4 | \alias{SampleDen} 5 | \title{sample from smoothed density function} 6 | \usage{ 7 | SampleDen(nsample, den_fun, reduce.mem = FALSE) 8 | } 9 | \arguments{ 10 | \item{nsample}{number of samples needed} 11 | 12 | \item{den_fun}{density function estimated from density() from R default} 13 | 14 | \item{reduce.mem}{use alternative implementation to 
reduce memory usage} 15 | } 16 | \value{ 17 | a vector of samples 18 | } 19 | \description{ 20 | sample from smoothed density function 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/True2ObservedATAC.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{True2ObservedATAC} 4 | \alias{True2ObservedATAC} 5 | \title{Simulate observed ATAC-seq matrix given technical noise and the true counts} 6 | \usage{ 7 | True2ObservedATAC( 8 | atacseq_data, 9 | randseed, 10 | observation_prob = 0.3, 11 | sd_frac = 0.1 12 | ) 13 | } 14 | \arguments{ 15 | \item{atacseq_data}{true ATAC-seq data} 16 | 17 | \item{randseed}{(should produce same result if nregions, nevf and randseed are all the same)} 18 | 19 | \item{observation_prob}{for each integer count of a particular region for a particular cell, the probability the count will be observed} 20 | 21 | \item{sd_frac}{the fraction of ATAC-seq data value used as the standard deviation of added normally distributed noise} 22 | } 23 | \value{ 24 | a matrix of observed ATAC-seq data 25 | } 26 | \description{ 27 | Simulate observed ATAC-seq matrix given technical noise and the true counts 28 | } 29 | \examples{ 30 | results <- sim_example(ncells = 10) 31 | True2ObservedATAC(results$atac_counts, randseed = 1) 32 | } 33 | -------------------------------------------------------------------------------- /man/True2ObservedCounts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{True2ObservedCounts} 4 | \alias{True2ObservedCounts} 5 | \title{Simulate observed count matrix given technical biases and the true counts} 6 | \usage{ 7 | True2ObservedCounts( 8 | true_counts, 9 | meta_cell, 10 |
protocol, 11 | randseed, 12 | alpha_mean = 0.1, 13 | alpha_sd = 0.002, 14 | alpha_gene_mean = 1, 15 | alpha_gene_sd = 0, 16 | gene_len, 17 | depth_mean, 18 | depth_sd, 19 | lenslope = 0.02, 20 | nbins = 20, 21 | amp_bias_limit = c(-0.2, 0.2), 22 | rate_2PCR = 0.8, 23 | nPCR1 = 16, 24 | nPCR2 = 10, 25 | LinearAmp = FALSE, 26 | LinearAmp_coef = 2000 27 | ) 28 | } 29 | \arguments{ 30 | \item{true_counts}{gene cell matrix} 31 | 32 | \item{meta_cell}{the meta information related to cells, will be combined with technical cell level information and returned} 33 | 34 | \item{protocol}{a string, can be "nonUMI" or "UMI"} 35 | 36 | \item{randseed}{(should produce same result if nregions, nevf and randseed are all the same)} 37 | 38 | \item{alpha_mean}{the mean of rate of subsampling of transcripts during capture step, default at 10 percent efficiency} 39 | 40 | \item{alpha_sd}{the std of rate of subsampling of transcripts} 41 | 42 | \item{alpha_gene_mean}{the per-gene scale factor of the alpha parameter, default at 1} 43 | 44 | \item{alpha_gene_sd}{the standard deviation of the per-gene scale factor of the alpha parameter, default at 0} 45 | 46 | \item{gene_len}{a vector with lengths of all genes} 47 | 48 | \item{depth_mean}{mean of sequencing depth} 49 | 50 | \item{depth_sd}{std of sequencing depth} 51 | 52 | \item{lenslope}{amount of length bias} 53 | 54 | \item{nbins}{number of bins for gene length} 55 | 56 | \item{amp_bias_limit}{range of amplification bias for each gene, a vector of length ngenes} 57 | 58 | \item{rate_2PCR}{PCR efficiency, usually very high, default is 0.8} 59 | 60 | \item{nPCR1}{the number of PCR cycles in "pre-amplification" step, default is 16} 61 | 62 | \item{nPCR2}{the number of PCR cycles used after fragmentation.} 63 | 64 | \item{LinearAmp}{if linear amplification is used for pre-amplification step, default is FALSE} 65 | 66 | \item{LinearAmp_coef}{the coefficient of linear amplification, that is, how many times each molecule is amplified by} 67 |
} 68 | \value{ 69 | if UMI, a list with two elements, the first is the observed count matrix, the second is the metadata; if nonUMI, a matrix 70 | } 71 | \description{ 72 | Simulate observed count matrix given technical biases and the true counts 73 | } 74 | \examples{ 75 | \donttest{ 76 | results <- sim_example(ncells = 10) 77 | data(gene_len_pool) 78 | gene_len <- sample(gene_len_pool, results$num_genes, replace = FALSE) 79 | True2ObservedCounts( 80 | results$counts, results$cell_meta, protocol = "nonUMI", randseed = 1, 81 | alpha_mean = 0.1, alpha_sd = 0.05, gene_len = gene_len, depth_mean = 1e5, depth_sd = 3e3 82 | ) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /man/add_expr_noise.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{add_expr_noise} 4 | \alias{add_expr_noise} 5 | \title{Add experimental noise to true counts} 6 | \usage{ 7 | add_expr_noise(results, ...) 
8 | } 9 | \arguments{ 10 | \item{results}{The scMultisim result object} 11 | 12 | \item{...}{\code{randseed}: The random seed 13 | \code{protocol}: \code{UMI} or \code{non-UMI} 14 | \code{gene_len}: A vector with lengths of all genes 15 | \code{alpha_mean}, \code{alpha_sd}: rate of subsampling of transcripts during capture step 16 | \code{depth_mean}, \code{depth_sd}: The sequencing depth} 17 | } 18 | \value{ 19 | none 20 | } 21 | \description{ 22 | Add experimental noise to true counts 23 | } 24 | \examples{ 25 | results <- sim_example(ncells = 10) 26 | add_expr_noise(results) 27 | } 28 | \seealso{ 29 | The underlying methods 30 | \link{True2ObservedCounts} and \link{True2ObservedATAC} 31 | } 32 | -------------------------------------------------------------------------------- /man/add_outliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{add_outliers} 4 | \alias{add_outliers} 5 | \title{Add outliers to the observed counts} 6 | \usage{ 7 | add_outliers( 8 | res, 9 | prob = 0.01, 10 | factor = 2, 11 | sd = 0.5, 12 | cell.num = 1, 13 | max.var = Inf 14 | ) 15 | } 16 | \arguments{ 17 | \item{res}{The scMultisim result object} 18 | 19 | \item{prob}{The probability of adding outliers for each gene} 20 | 21 | \item{factor}{The factor of the outliers} 22 | 23 | \item{sd}{The standard deviation of the outliers} 24 | 25 | \item{cell.num}{For a gene, the number of cells chosen to add outliers} 26 | 27 | \item{max.var}{The maximum variance allowed} 28 | } 29 | \value{ 30 | none 31 | } 32 | \description{ 33 | Add outliers to the observed counts 34 | } 35 | -------------------------------------------------------------------------------- /man/cci_cell_type_params.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/3.1_spatial.R 3 | \name{cci_cell_type_params} 4 | \alias{cci_cell_type_params} 5 | \title{Generate cell-type level CCI parameters} 6 | \usage{ 7 | cci_cell_type_params( 8 | tree, 9 | total.lr, 10 | ctype.lr = 4:6, 11 | step.size = 1, 12 | rand = TRUE, 13 | discrete = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{tree}{Use the same value for \code{sim_true_counts()}.} 18 | 19 | \item{total.lr}{Total number of LR pairs in the database. Use the same value for \code{sim_true_counts()}.} 20 | 21 | \item{ctype.lr}{If \code{rand} is \code{TRUE}, how many LR pairs should be enabled between each cell type pair. Should be a range, e.g. 4:6.} 22 | 23 | \item{step.size}{Use the same value for \code{sim_true_counts()}.} 24 | 25 | \item{rand}{Whether fill the matrix randomly} 26 | 27 | \item{discrete}{Whether the cell population is discrete. Use the same value for \code{sim_true_counts()}.} 28 | } 29 | \value{ 30 | A 3D matrix of (n_cell_type, n_cell_type, n_lr). The value at (i, j, k) is 1 if there exist CCI of LR-pair k between cell type i and cell type j. 31 | } 32 | \description{ 33 | See the return value if you want to specify the cell-type level ground truth. 34 | } 35 | \examples{ 36 | cci_cell_type_params(Phyla3(), 100, 4:6, 0.5, TRUE, FALSE) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/dens_nonzero.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{dens_nonzero} 5 | \alias{dens_nonzero} 6 | \title{this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data} 7 | \format{ 8 | a vector. 9 | } 10 | \usage{ 11 | data(dens_nonzero) 12 | } 13 | \value{ 14 | a vector. 
15 | } 16 | \description{ 17 | this is the density function of log(x+1), where x is the non-zero values for ATAC-SEQ data 18 | } 19 | \examples{ 20 | data(dens_nonzero) 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/divide_batches.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{divide_batches} 4 | \alias{divide_batches} 5 | \title{Divide batches for observed counts} 6 | \usage{ 7 | divide_batches(results, nbatch = 2, effect = 3, randseed = 0) 8 | } 9 | \arguments{ 10 | \item{results}{The scMultisim result object, after running \code{add_expr_noise()}} 11 | 12 | \item{nbatch}{Number of batches} 13 | 14 | \item{effect}{Batch effect size, default is 3} 15 | 16 | \item{randseed}{Random seed} 17 | } 18 | \value{ 19 | none 20 | } 21 | \description{ 22 | Divide batches for observed counts 23 | } 24 | \examples{ 25 | results <- sim_example(ncells = 10) 26 | add_expr_noise(results) 27 | divide_batches(results) 28 | } 29 | -------------------------------------------------------------------------------- /man/dot-amplifyOneCell.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{.amplifyOneCell} 4 | \alias{.amplifyOneCell} 5 | \title{This function simulates the amplification, library prep, and the sequencing processes.} 6 | \usage{ 7 | .amplifyOneCell( 8 | true_counts_1cell, 9 | protocol, 10 | rate_2cap, 11 | gene_len, 12 | amp_bias, 13 | rate_2PCR, 14 | nPCR1, 15 | nPCR2, 16 | LinearAmp, 17 | LinearAmp_coef, 18 | N_molecules_SEQ 19 | ) 20 | } 21 | \arguments{ 22 | \item{true_counts_1cell}{the true transcript counts for one cell (one vector)} 23 | 24 | \item{protocol}{a string, can be "nonUMI" or "UMI"} 25 | 26 | 
\item{rate_2cap}{the capture efficiency for this cell} 27 | 28 | \item{gene_len}{gene lengths for the genes/transcripts, sampled from real human transcript length} 29 | 30 | \item{amp_bias}{amplification bias for each gene, a vector of length ngenes} 31 | 32 | \item{rate_2PCR}{PCR efficiency, usually very high} 33 | 34 | \item{nPCR1}{the number of PCR cycles} 35 | 36 | \item{nPCR2}{the number of PCR cycles} 37 | 38 | \item{LinearAmp}{if linear amplification is used for pre-amplification step, default is FALSE} 39 | 40 | \item{LinearAmp_coef}{the coefficient of linear amplification, that is, how many times each molecule is amplified by} 41 | 42 | \item{N_molecules_SEQ}{number of molecules sent for sequencing; sequencing depth} 43 | } 44 | \value{ 45 | read counts (if protocol="nonUMI") or UMI counts (if protocol="UMI") 46 | } 47 | \description{ 48 | This function simulates the amplification, library prep, and the sequencing processes. 49 | } 50 | \keyword{internal} 51 | -------------------------------------------------------------------------------- /man/dot-calAmpBias.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{.calAmpBias} 4 | \alias{.calAmpBias} 5 | \title{Simulate technical biases} 6 | \usage{ 7 | .calAmpBias(lenslope, nbins, gene_len, amp_bias_limit) 8 | } 9 | \arguments{ 10 | \item{lenslope}{amount of length bias. 
This value should be less than 2*amp_bias_limit[2]/(nbins-1)} 11 | 12 | \item{nbins}{number of bins for gene length} 13 | 14 | \item{gene_len}{transcript length of each gene} 15 | 16 | \item{amp_bias_limit}{range of amplification bias for each gene, a vector of length ngenes} 17 | } 18 | \value{ 19 | a vector 20 | } 21 | \description{ 22 | Simulate technical biases 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/dot-continuousCIF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/1_main.R 3 | \name{.continuousCIF} 4 | \alias{.continuousCIF} 5 | \title{Generates cifs for cells sampled along the trajectory of cell development} 6 | \usage{ 7 | .continuousCIF( 8 | seed, 9 | N, 10 | options, 11 | ncell_key = "cell", 12 | is_spatial = FALSE, 13 | spatial_params = NULL, 14 | .plot = FALSE, 15 | .plot.name = "cont_cif.pdf" 16 | ) 17 | } 18 | \arguments{ 19 | \item{seed}{random seed} 20 | 21 | \item{N}{the number list} 22 | 23 | \item{options}{the option list} 24 | 25 | \item{ncell_key}{the key for the number of cells in N} 26 | 27 | \item{is_spatial}{return a list of cifs for spatial} 28 | 29 | \item{spatial_params}{the spatial parameters} 30 | 31 | \item{.plot}{save the CIF plot} 32 | 33 | \item{.plot.name}{plot name} 34 | } 35 | \value{ 36 | a list containing the cif and meta data 37 | } 38 | \description{ 39 | Generates cifs for cells sampled along the trajectory of cell development 40 | } 41 | \keyword{internal} 42 | -------------------------------------------------------------------------------- /man/dot-divideBatchesImpl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{.divideBatchesImpl} 4 | \alias{.divideBatchesImpl} 5 | 
\title{Divide the observed counts into multiple batches by adding batch effect to each batch} 6 | \usage{ 7 | .divideBatchesImpl( 8 | counts, 9 | meta_cell, 10 | nbatch, 11 | batch_effect_size = 1, 12 | randseed = 0 13 | ) 14 | } 15 | \arguments{ 16 | \item{counts}{gene cell matrix} 17 | 18 | \item{meta_cell}{the meta information related to cells, will be combined with technical cell level information and returned} 19 | 20 | \item{nbatch}{number of batches} 21 | 22 | \item{batch_effect_size}{amount of batch effects. Larger values result in bigger differences between batches. Default is 1.} 23 | 24 | \item{randseed}{random seed} 25 | } 26 | \value{ 27 | a list with two elements: counts and meta_cell 28 | } 29 | \description{ 30 | Divide the observed counts into multiple batches by adding batch effect to each batch 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/dot-expandToBinary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{.expandToBinary} 4 | \alias{.expandToBinary} 5 | \title{expand transcript counts to a vector of binaries of the same length as the number of transcripts} 6 | \usage{ 7 | .expandToBinary(true_counts_1cell) 8 | } 9 | \arguments{ 10 | \item{true_counts_1cell}{number of transcript in one cell} 11 | } 12 | \value{ 13 | a list of two vectors, the first vector is a vector of 1s, the second vector is the index of transcripts 14 | } 15 | \description{ 16 | expand transcript counts to a vector of binaries of the same length as the number of transcripts 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/dot-getCountCorrMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | 
% Please edit documentation in R/7_benchmark.R 3 | \name{.getCountCorrMatrix} 4 | \alias{.getCountCorrMatrix} 5 | \title{This function finds the correlation between every pair of genes} 6 | \usage{ 7 | .getCountCorrMatrix(counts) 8 | } 9 | \arguments{ 10 | \item{counts}{rna seq counts} 11 | } 12 | \value{ 13 | the correlation matrix 14 | } 15 | \description{ 16 | This function finds the correlation between every pair of genes 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/dot-getParams.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/2_sim.R 3 | \name{.getParams} 4 | \alias{.getParams} 5 | \title{Get Kinetic Parameters for all cells and genes} 6 | \usage{ 7 | .getParams(seed, sim, sp_cell_i = NULL, sp_path_i = NULL) 8 | } 9 | \arguments{ 10 | \item{seed}{random seed} 11 | 12 | \item{sim}{the simulation environment} 13 | 14 | \item{sp_cell_i}{spatial cell index} 15 | 16 | \item{sp_path_i}{the pre-sampled path along the tree for this cell} 17 | } 18 | \value{ 19 | the kinetic parameters 20 | } 21 | \description{ 22 | Get Kinetic Parameters for all cells and genes 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /man/dot-normalizeGRNParams.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/1_main.R 3 | \name{.normalizeGRNParams} 4 | \alias{.normalizeGRNParams} 5 | \title{Rename the original gene IDs in the GRN table to integers.} 6 | \usage{ 7 | .normalizeGRNParams(params) 8 | } 9 | \arguments{ 10 | \item{params}{GRN parameters.} 11 | } 12 | \value{ 13 | list 14 | } 15 | \description{ 16 | Rename the original gene IDs in the GRN table to integers. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/dot-rnormTrunc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/6_technoise.R 3 | \name{.rnormTrunc} 4 | \alias{.rnormTrunc} 5 | \title{sample from truncated normal distribution} 6 | \usage{ 7 | .rnormTrunc(n, mean, sd, a, b) 8 | } 9 | \arguments{ 10 | \item{n}{number of values to create} 11 | 12 | \item{mean}{mean of the normal distribution} 13 | 14 | \item{sd}{standard deviation of the normal distribution} 15 | 16 | \item{a}{the minimum value allowed} 17 | 18 | \item{b}{the maximum value allowed} 19 | } 20 | \value{ 21 | a vector of length n 22 | } 23 | \description{ 24 | sample from truncated normal distribution 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/gen_1branch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/2_sim.R 3 | \name{gen_1branch} 4 | \alias{gen_1branch} 5 | \title{Generate true transcript counts for linear structure} 6 | \usage{ 7 | gen_1branch( 8 | kinet_params, 9 | start_state, 10 | start_s, 11 | start_u, 12 | randpoints1, 13 | ncells1, 14 | ngenes, 15 | beta_vec, 16 | d_vec, 17 | cycle_length_factor, 18 | cell 19 | ) 20 | } 21 | \arguments{ 22 | \item{kinet_params}{kinetic parameters, include k_on, k_off, s and beta} 23 | 24 | \item{start_state}{the starting state: on or off of each gene} 25 | 26 | \item{start_s}{spliced count of the root cell in the branch} 27 | 28 | \item{start_u}{unspliced count of the root cell in the branch} 29 | 30 | \item{randpoints1}{the value which evf mean is generated from} 31 | 32 | \item{ncells1}{number of cells in the branch} 33 | 34 | \item{ngenes}{number of genes} 35 | 36 | 
\item{beta_vec}{splicing rate of each gene} 37 | 38 | \item{d_vec}{degradation rate of each gene} 39 | 40 | \item{cycle_length_factor}{for generating velocity data, a factor which is multiplied by the expected time to transition from kon to koff and back to form the length of a cycle} 41 | 42 | \item{cell}{the cell number currently having counts generated} 43 | } 44 | \value{ 45 | a list of 4 elements, the first element is true counts, second is the gene level meta information, the third is cell level meta information, including a matrix of evf and a vector of cell identity, and the fourth is the parameters kon, koff and s used to simulate the true counts 46 | } 47 | \description{ 48 | Generate true transcript counts for linear structure 49 | } 50 | \keyword{internal} 51 | -------------------------------------------------------------------------------- /man/gen_clutter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/3.1_spatial.R 3 | \name{gen_clutter} 4 | \alias{gen_clutter} 5 | \title{generate a clutter of cells by growing from the center} 6 | \usage{ 7 | gen_clutter( 8 | n_cell, 9 | grid_size = NA, 10 | center = c(0, 0), 11 | existing_loc = NULL, 12 | existing_grid = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{n_cell}{the number of cells} 17 | 18 | \item{grid_size}{the width and height of the grid} 19 | 20 | \item{center}{the center of the grid} 21 | 22 | \item{existing_loc}{only place cells on the specified existing locations} 23 | 24 | \item{existing_grid}{manually specify what locations are in the grid} 25 | } 26 | \value{ 27 | a matrix of locations 28 | } 29 | \description{ 30 | generate a clutter of cells by growing from the center 31 | } 32 | \examples{ 33 | gen_clutter(10, 10, c(5, 5)) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /man/gene_corr_cci.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{gene_corr_cci} 4 | \alias{gene_corr_cci} 5 | \title{Plot the ligand-receptor correlation summary} 6 | \usage{ 7 | gene_corr_cci( 8 | results = .getResultsFromGlobal(), 9 | all.genes = FALSE, 10 | .pair = NULL, 11 | .exclude.same.types = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{results}{The scMultisim result object} 16 | 17 | \item{all.genes}{Whether to use all genes or only the ligand/receptor genes} 18 | 19 | \item{.pair}{Return the raw data for the given LR pair} 20 | 21 | \item{.exclude.same.types}{Whether to exclude neighbor cells with same cell type} 22 | } 23 | \value{ 24 | none 25 | } 26 | \description{ 27 | Plot the ligand-receptor correlation summary 28 | } 29 | \examples{ 30 | results <- sim_example_spatial(ncells = 10) 31 | gene_corr_cci(results) 32 | } 33 | -------------------------------------------------------------------------------- /man/gene_corr_regulator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{gene_corr_regulator} 4 | \alias{gene_corr_regulator} 5 | \title{Print the correlations between targets of each regulator} 6 | \usage{ 7 | gene_corr_regulator(results = .getResultsFromGlobal(), regulator) 8 | } 9 | \arguments{ 10 | \item{results}{The scMultisim result object} 11 | 12 | \item{regulator}{The regulator ID in the GRN params} 13 | } 14 | \value{ 15 | none 16 | } 17 | \description{ 18 | Print the correlations between targets of each regulator 19 | } 20 | \examples{ 21 | results <- sim_example(ncells = 10) 22 | gene_corr_regulator(results, 2) 23 | } 24 | -------------------------------------------------------------------------------- /man/gene_len_pool.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{gene_len_pool} 5 | \alias{gene_len_pool} 6 | \title{a pool of gene lengths to sample from} 7 | \format{ 8 | a vector. 9 | } 10 | \usage{ 11 | data(gene_len_pool) 12 | } 13 | \value{ 14 | a vector of gene lengths. 15 | } 16 | \description{ 17 | a pool of gene lengths to sample from 18 | } 19 | \examples{ 20 | data(gene_len_pool) 21 | } 22 | \keyword{datasets} 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /man/len2nfrag.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{len2nfrag} 5 | \alias{len2nfrag} 6 | \title{from transcript length to number of fragments (for the nonUMI protocol)} 7 | \format{ 8 | a vector. 9 | } 10 | \usage{ 11 | data(len2nfrag) 12 | } 13 | \value{ 14 | a vector. 15 | } 16 | \description{ 17 | from transcript length to number of fragments (for the nonUMI protocol) 18 | } 19 | \examples{ 20 | data(len2nfrag) 21 | } 22 | \keyword{datasets} 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /man/match_params.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{match_params} 5 | \alias{match_params} 6 | \title{distribution of kinetic parameters learned from the Zeisel UMI cortex datasets} 7 | \format{ 8 | a data frame. 9 | } 10 | \usage{ 11 | data(param_realdata.zeisel.imputed) 12 | } 13 | \value{ 14 | a data frame. 
15 | } 16 | \description{ 17 | distribution of kinetic parameters learned from the Zeisel UMI cortex datasets 18 | } 19 | \examples{ 20 | data(param_realdata.zeisel.imputed) 21 | } 22 | \keyword{datasets} 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /man/plot_cell_loc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_cell_loc} 4 | \alias{plot_cell_loc} 5 | \title{Plot cell locations} 6 | \usage{ 7 | plot_cell_loc( 8 | results = .getResultsFromGlobal(), 9 | size = 4, 10 | show.label = FALSE, 11 | show.arrows = TRUE, 12 | lr.pair = 1, 13 | .cell.pop = NULL, 14 | .locs = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{results}{The scMultisim result object} 19 | 20 | \item{size}{Fig size} 21 | 22 | \item{show.label}{Show cell numbers} 23 | 24 | \item{show.arrows}{Show arrows representing cell-cell interactions} 25 | 26 | \item{lr.pair}{The ligand-receptor pair used to plot CCI arrows 27 | \code{results$cci_cell_type_param[lr.pair]}} 28 | 29 | \item{.cell.pop}{Specify the cell population metadata} 30 | 31 | \item{.locs}{Manually specify the cell locations as a 2x\code{ncells} matrix} 32 | } 33 | \value{ 34 | none 35 | } 36 | \description{ 37 | Plot cell locations 38 | } 39 | \examples{ 40 | results <- sim_example_spatial(ncells = 10) 41 | plot_cell_loc(results) 42 | } 43 | -------------------------------------------------------------------------------- /man/plot_gene_module_cor_heatmap.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_gene_module_cor_heatmap} 4 | \alias{plot_gene_module_cor_heatmap} 5 | \title{Plot the gene module correlation heatmap} 6 | \usage{ 7 | plot_gene_module_cor_heatmap( 8 | results = 
.getResultsFromGlobal(), 9 | seed = 0, 10 | grn.genes.only = TRUE, 11 | save = FALSE 12 | ) 13 | } 14 | \arguments{ 15 | \item{results}{The scMultisim result object} 16 | 17 | \item{seed}{The random seed} 18 | 19 | \item{grn.genes.only}{Plot the GRN genes only} 20 | 21 | \item{save}{save the plot as pdf} 22 | } 23 | \value{ 24 | none 25 | } 26 | \description{ 27 | Plot the gene module correlation heatmap 28 | } 29 | \examples{ 30 | results <- sim_example(ncells = 10) 31 | plot_gene_module_cor_heatmap(results) 32 | } 33 | -------------------------------------------------------------------------------- /man/plot_grid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_grid} 4 | \alias{plot_grid} 5 | \title{Plot the CCI grid} 6 | \usage{ 7 | plot_grid(results = .getResultsFromGlobal()) 8 | } 9 | \arguments{ 10 | \item{results}{The scMultisim result object} 11 | } 12 | \value{ 13 | none 14 | } 15 | \description{ 16 | In normal cases, please use \code{plotCellLoc} instead. 
17 | } 18 | \examples{ 19 | results <- sim_example_spatial(ncells = 10) 20 | plot_grid(results) 21 | } 22 | -------------------------------------------------------------------------------- /man/plot_grn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_grn} 4 | \alias{plot_grn} 5 | \title{Plot the GRN network} 6 | \usage{ 7 | plot_grn(params) 8 | } 9 | \arguments{ 10 | \item{params}{The GRN params data frame} 11 | } 12 | \value{ 13 | none 14 | } 15 | \description{ 16 | Plot the GRN network 17 | } 18 | \examples{ 19 | data(GRN_params_100, envir = environment()) 20 | plot_grn(GRN_params_100) 21 | } 22 | -------------------------------------------------------------------------------- /man/plot_phyla.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_phyla} 4 | \alias{plot_phyla} 5 | \title{Plot an R phylogenic tree} 6 | \usage{ 7 | plot_phyla(tree) 8 | } 9 | \arguments{ 10 | \item{tree}{The tree} 11 | } 12 | \value{ 13 | none 14 | } 15 | \description{ 16 | Plot an R phylogenic tree 17 | } 18 | \examples{ 19 | plot_phyla(Phyla5()) 20 | } 21 | -------------------------------------------------------------------------------- /man/plot_rna_velocity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_rna_velocity} 4 | \alias{plot_rna_velocity} 5 | \title{Plot RNA velocity as arrows on tSNE plot} 6 | \usage{ 7 | plot_rna_velocity( 8 | results = .getResultsFromGlobal(), 9 | velocity = results$velocity, 10 | perplexity = 70, 11 | arrow.length = 1, 12 | save = FALSE, 13 | randseed = 0, 14 | ... 
15 | ) 16 | } 17 | \arguments{ 18 | \item{results}{The scMultiSim result object} 19 | 20 | \item{velocity}{The velocity matrix, by default using the velocity matrix in the result object} 21 | 22 | \item{perplexity}{The perplexity for tSNE} 23 | 24 | \item{arrow.length}{The length scaler of the arrow} 25 | 26 | \item{save}{Whether to save the plot} 27 | 28 | \item{randseed}{The random seed} 29 | 30 | \item{...}{Other parameters passed to ggplot} 31 | } 32 | \value{ 33 | The plot 34 | } 35 | \description{ 36 | Plot RNA velocity as arrows on tSNE plot 37 | } 38 | \examples{ 39 | results <- sim_example(ncells = 10, velocity = TRUE) 40 | plot_rna_velocity(results, perplexity = 3) 41 | } 42 | -------------------------------------------------------------------------------- /man/plot_tsne.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/7_benchmark.R 3 | \name{plot_tsne} 4 | \alias{plot_tsne} 5 | \title{Plot t-SNE visualization of a data matrix} 6 | \usage{ 7 | plot_tsne( 8 | data, 9 | labels, 10 | perplexity = 60, 11 | legend = "", 12 | plot.name = "", 13 | save = FALSE, 14 | rand.seed = 0, 15 | continuous = FALSE, 16 | labels2 = NULL, 17 | lim = NULL, 18 | runPCA = FALSE, 19 | alpha = 1 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{The \code{d}x\code{n} matrix} 24 | 25 | \item{labels}{A vector of length \code{n}, usually cell clusters} 26 | 27 | \item{perplexity}{Perplexity value used for t-SNE} 28 | 29 | \item{legend}{A list of colors for the labels} 30 | 31 | \item{plot.name}{The plot title} 32 | 33 | \item{save}{If \code{TRUE}, save as \code{plot.name}.pdf} 34 | 35 | \item{rand.seed}{The random seed} 36 | 37 | \item{continuous}{Whether \code{labels} should be treated as continuous, e.g. 
pseudotime} 38 | 39 | \item{labels2}{Additional label} 40 | 41 | \item{lim}{Specify the xlim and y lim c(x_min, x_max, y_min, y_max)} 42 | 43 | \item{runPCA}{Whether to run PCA before t-SNE} 44 | 45 | \item{alpha}{The alpha value for the points} 46 | } 47 | \value{ 48 | the figure if not \code{save}, otherwise save the figure as \code{plot.name}.pdf 49 | } 50 | \description{ 51 | Plot t-SNE visualization of a data matrix 52 | } 53 | \examples{ 54 | results <- sim_example(ncells = 10) 55 | plot_tsne(log2(results$counts + 1), results$cell_meta$pop, perplexity = 3) 56 | } 57 | -------------------------------------------------------------------------------- /man/run_shiny.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/9.1_shiny.R 3 | \name{run_shiny} 4 | \alias{run_shiny} 5 | \title{Launch the Shiny App to configure the simulation} 6 | \usage{ 7 | run_shiny() 8 | } 9 | \description{ 10 | Launch the Shiny App to configure the simulation 11 | } 12 | -------------------------------------------------------------------------------- /man/scmultisim_help.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/9_meta.R 3 | \name{scmultisim_help} 4 | \alias{scmultisim_help} 5 | \title{Show detailed documentations of scMultiSim's parameters} 6 | \usage{ 7 | scmultisim_help(topic = NULL) 8 | } 9 | \arguments{ 10 | \item{topic}{Can be \code{options}, \code{dynamic.GRN}, or \code{cci}} 11 | } 12 | \value{ 13 | none 14 | } 15 | \description{ 16 | Show detailed documentations of scMultiSim's parameters 17 | } 18 | \examples{ 19 | scmultisim_help() 20 | } 21 | -------------------------------------------------------------------------------- /man/sim_example.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{sim_example} 4 | \alias{sim_example} 5 | \title{Simulate a small example dataset with 200 cells and the 100-gene GRN} 6 | \usage{ 7 | sim_example(ncells = 10, velocity = FALSE) 8 | } 9 | \arguments{ 10 | \item{ncells}{number of cells, please increase this number on your machine} 11 | 12 | \item{velocity}{whether to simulate RNA velocity} 13 | } 14 | \value{ 15 | the simulation result 16 | } 17 | \description{ 18 | Simulate a small example dataset with 200 cells and the 100-gene GRN 19 | } 20 | \examples{ 21 | sim_example(ncells = 10) 22 | } 23 | -------------------------------------------------------------------------------- /man/sim_example_spatial.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/8_utils.R 3 | \name{sim_example_spatial} 4 | \alias{sim_example_spatial} 5 | \title{Simulate a small example dataset with 200 cells and the 100-gene GRN, with CCI enabled} 6 | \usage{ 7 | sim_example_spatial(ncells = 10) 8 | } 9 | \arguments{ 10 | \item{ncells}{number of cells, please increase this number on your machine} 11 | } 12 | \value{ 13 | the simulation result 14 | } 15 | \description{ 16 | Simulate a small example dataset with 200 cells and the 100-gene GRN, with CCI enabled 17 | } 18 | \examples{ 19 | sim_example_spatial(ncells = 10) 20 | } 21 | -------------------------------------------------------------------------------- /man/sim_true_counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/1_main.R 3 | \name{sim_true_counts} 4 | \alias{sim_true_counts} 5 | \title{Simulate true scRNA and scATAC counts from the parameters} 6 | \usage{ 7 | sim_true_counts(options, return_summarized_exp = FALSE) 8 | } 9 | \arguments{ 10 | 
\item{options}{See scmultisim_help().} 11 | 12 | \item{return_summarized_exp}{Whether to return a SummarizedExperiment object.} 13 | } 14 | \value{ 15 | scMultiSim returns an environment with the following fields: 16 | \itemize{ 17 | \item \code{counts}: Gene-by-cell scRNA-seq counts. 18 | \item \code{atac_counts}: Region-by-cell scATAC-seq counts. 19 | \item \code{region_to_gene}: Region-by-gene 0-1 matrix indicating the corresponding relationship between chromatin regions and genes. 20 | \item \code{atacseq_data}: The "clean" scATAC-seq counts without added intrinsic noise. 21 | \item \code{cell_meta}: A dataframe containing cell type labels and pseudotime information. 22 | \item \code{cif}: The CIF used during the simulation. 23 | \item \code{giv}: The GIV used during the simulation. 24 | \item \code{kinetic_params}: The kinetic parameters used during the simulation. 25 | \item \code{.grn}: The GRN used during the simulation. 26 | \item \code{.grn$regulators}: The list of TFs used by all gene-by-TF matrices. 27 | \item \code{.grn$geff}: Gene-by-TF matrix representing the GRN used during the simulation. 28 | \item \code{.n}: Other metadata, e.g. \code{.n$cells} is the number of cells. 29 | } 30 | 31 | If \code{do.velocity} is enabled, it has these additional fields: 32 | \itemize{ 33 | \item \code{unspliced_counts}: Gene-by-cell unspliced RNA counts. 34 | \item \code{velocity}: Gene-by-cell RNA velocity ground truth. 35 | \item \code{cell_time}: The pseudotime at which the cell counts were generated. 36 | } 37 | 38 | If dynamic GRN is enabled, it has these additional fields: 39 | \itemize{ 40 | \item \code{cell_specific_grn}: A list of length \code{n_cells}. Each element is a gene-by-TF matrix, indicating the cell's GRN. 41 | } 42 | 43 | If cell-cell interaction is enabled, it has these additional fields: 44 | \itemize{ 45 | \item \code{grid}: The grid object used during the simulation. 
46 | \itemize{ 47 | \item \code{grid$get_neighbours(i)}: Get the neighbour cells of cell \code{i}. 48 | } 49 | \item \code{cci_locs}: A dataframe containing the X and Y coordinates of each cell. 50 | \item \code{cci_cell_type_param}: A dataframe containing the CCI network ground truth: all ligand-receptor pairs between each pair of cell types. 51 | \item \code{cci_cell_types}: For continuous cell population, the sub-divided cell types along the trajectory used when simulating CCI. 52 | } 53 | 54 | If it is a debug session (\code{debug = TRUE}), a \code{sim} field is available, 55 | which is an environment containing all internal states and data structures. 56 | } 57 | \description{ 58 | Simulate true scRNA and scATAC counts from the parameters 59 | } 60 | \examples{ 61 | data(GRN_params_100, envir = environment()) 62 | sim_true_counts(list( 63 | rand.seed = 0, 64 | GRN = GRN_params_100, 65 | num.cells = 100, 66 | num.cifs = 50, 67 | tree = Phyla5() 68 | )) 69 | } 70 | -------------------------------------------------------------------------------- /man/spatialGrid-class.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/3.1_spatial.R 3 | \docType{class} 4 | \name{spatialGrid-class} 5 | \alias{spatialGrid-class} 6 | \alias{.SpatialGrid} 7 | \title{The class for spatial grids} 8 | \value{ 9 | a spatialGrid object 10 | } 11 | \description{ 12 | The class for spatial grids 13 | } 14 | \section{Fields}{ 15 | 16 | \describe{ 17 | \item{\code{method}}{the method to generate the cell layout} 18 | 19 | \item{\code{grid_size}}{the width and height of the grid} 20 | 21 | \item{\code{ncells}}{the number of cells} 22 | 23 | \item{\code{grid}}{the grid matrix} 24 | 25 | \item{\code{locs}}{a list containing the locations of all cells} 26 | 27 | \item{\code{loc_order}}{deprecated, don't use; the order of the locations} 28 | 29 | \item{\code{cell_types}}{a map to 
save the cell type of each allocated cell} 30 | 31 | \item{\code{same_type_prob}}{the probability of a new cell placed next to a cell with the same type} 32 | 33 | \item{\code{max_nbs}}{the maximum number of neighbors for each cell} 34 | 35 | \item{\code{nb_map}}{a list containing the neighbors for each cell} 36 | 37 | \item{\code{nb_adj}}{adjacency matrix for neighbors} 38 | 39 | \item{\code{nb_radius}}{the radius of neighbors} 40 | 41 | \item{\code{final_types}}{the final cell types after the final time step} 42 | 43 | \item{\code{pre_allocated_pos}}{the pre-allocated positions for each cell, if any} 44 | 45 | \item{\code{method_param}}{additional parameters for the layout method} 46 | }} 47 | 48 | 49 | -------------------------------------------------------------------------------- /pkgdown/extra.css: -------------------------------------------------------------------------------- 1 | h2 { 2 | margin-top: 2rem; 3 | margin-bottom: 1.5rem; 4 | } 5 | 6 | h3 { 7 | margin-top: 1.5rem; 8 | margin-bottom: 1rem; 9 | } 10 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(scMultiSim) 11 | 12 | test_check("scMultiSim") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-1_main.R: -------------------------------------------------------------------------------- 1 | # test_that("simulates data using Kinetic model", { 2 | # data(GRN_params_100, envir = environment()) 3 | # 4 | # set.seed(0) 5 | # options_ <- list( 6 | # GRN = GRN_params_100, 7 | # num.cells = 100, 8 | # num.cifs = 20, 9 | # tree = Phyla5(), 10 | # diff.cif.fraction = 0.8, 11 | # do.velocity = T 12 | # ) 13 | # 14 | # res <- sim_true_counts(options_) 15 | # 16 | # selectedIndicies <- c(1:5, 1000:1005, 10000:10005) 17 | # expect_equal(dim(res$counts), c(110, 100)) 18 | # expect_equal( 19 | # res$counts[selectedIndicies], 20 | # c(34, 5, 28, 21, 49, 0, 0, 10, 4, 90, 18, 18, 0, 12, 0, 37, 171) 21 | # ) 22 | # 23 | # add_expr_noise(res) 24 | # divide_batches(res, nbatch = 2) 25 | # 26 | # expect_equal( 27 | # res$counts_obs[selectedIndicies], 28 | # c(585, 307, 141, 187, 309, 0, 0, 326, 0, 2692, 0, 401, 0, 22, 0, 187, 2291) 29 | # ) 30 | # expect_equal( 31 | # res$counts_with_batches[selectedIndicies], 32 | # c(2331, 5031, 180, 1263, 131, 0, 0, 93, 0, 21462, 0, 467, 0, 357, 0, 33 | # 1020, 1495) 34 | # ) 35 | # 36 | # expect_equal( 37 | # res$atac_counts[selectedIndicies], 38 | # c(0.0000000, 0.0000000, 0.6887022, 0.7694152, 0.0000000, 2.5172858, 39 | # 0.3549220, 5.6001554, 3.5924741, 0.2475742, 0.0000000, 1.0374884, 40 | # 3.4453020, 4.0073263, 0.0000000, 0.0000000, 0.0000000) 41 | # ) 42 | # 43 | # expect_equal( 44 | # res$velocity[selectedIndicies], 45 | # c(4.3146894, 0.4199234, -2.1558627, 2.0012763, -23.2763955, 0.0000000, 46 | # 0.0000000, -2.0025398, 1.2857669, 2.8214759, 2.5935024, 1.3315967, 47 | # 
0.0000000, -2.5724074, 0.2749523, 1.9946433, -2.0456734) 48 | # ) 49 | # 50 | # expect_no_error(plot_gene_module_cor_heatmap(res, save = FALSE)) 51 | # expect_no_error(gene_corr_regulator(res, 2)) 52 | # expect_no_error(plot_rna_velocity(res, perplexity = 20)) 53 | # }) 54 | # 55 | # 56 | # test_that("simulates data using Beta-Poisson model", { 57 | # data(GRN_params_100, envir = environment()) 58 | # 59 | # set.seed(0) 60 | # options_ <- list( 61 | # GRN = GRN_params_100, 62 | # num.cells = 100, 63 | # num.cifs = 20, 64 | # tree = Phyla5(), 65 | # diff.cif.fraction = 0.8, 66 | # do.velocity = F 67 | # ) 68 | # 69 | # res <- sim_true_counts(options_) 70 | # 71 | # selectedIndicies <- c(1:5, 101:105, 10001:10005) 72 | # expect_equal(dim(res$counts), c(110, 100)) 73 | # expect_equal( 74 | # res$counts[selectedIndicies], 75 | # c(120, 5, 18, 33, 88, 5, 4, 0, 8, 96, 0, 18, 0, 15, 146) 76 | # ) 77 | # }) 78 | # 79 | # 80 | # test_that("simulates spatial data", { 81 | # data(GRN_params_100, envir = environment()) 82 | # 83 | # lig_params <- data.frame( 84 | # target = c(101, 102), 85 | # regulator = c(103, 104), 86 | # effect = c(5.2, 5.9) 87 | # ) 88 | # 89 | # options_ <- list2( 90 | # GRN = GRN_params_100, 91 | # num.genes = 200, 92 | # num.cells = 100, 93 | # num.cifs = 20, 94 | # tree = Phyla3(), 95 | # intrinsic.noise = 0.5, 96 | # cci = list( 97 | # params = lig_params, 98 | # max.neighbors = 4, 99 | # cell.type.interaction = "random", 100 | # step.size = 0.5 101 | # ) 102 | # ) 103 | # 104 | # set.seed(0) 105 | # res <- sim_true_counts(options_) 106 | # 107 | # selectedIndicies <- c(1:5, 1000:1005, 10000:10005) 108 | # expect_equal( 109 | # res$counts[selectedIndicies], 110 | # c(40.675564, 30.876988, 29.984167, 49.430348, 25.113605, 4.093944, 111 | # 45.194247, 29.063519, 47.389263, 42.516067, 43.014273, 7.110385, 112 | # 55.992341, 13.604489, 14.811897, 10.213004, 24.046141) 113 | # ) 114 | # 115 | # expect_no_error(plot_cell_loc(res)) 116 | # 
expect_no_error(gene_corr_cci(res)) 117 | # }) 118 | # 119 | # 120 | # test_that("simulates spatial data with discrete population and HGE", { 121 | # data(GRN_params_100, envir = environment()) 122 | # 123 | # lig_params <- data.frame( 124 | # target = c(101, 102), 125 | # regulator = c(103, 104), 126 | # effect = c(5.2, 5.9) 127 | # ) 128 | # 129 | # options_ <- list2( 130 | # GRN = GRN_params_100, 131 | # num.genes = 200, 132 | # num.cells = 100, 133 | # num.cifs = 20, 134 | # tree = Phyla3(), 135 | # discrete.cif = T, 136 | # discrete.min.pop.size = 20, 137 | # intrinsic.noise = 0.5, 138 | # hge.prop = 0.05, 139 | # cci = list( 140 | # params = lig_params, 141 | # max.neighbors = 4, 142 | # cell.type.interaction = "random", 143 | # step.size = 0.5 144 | # ) 145 | # ) 146 | # 147 | # set.seed(0) 148 | # res <- sim_true_counts(options_) 149 | # 150 | # selectedIndicies <- c(1:5, 1000:1005, 10000:10005) 151 | # expect_equal( 152 | # res$counts[selectedIndicies], 153 | # c(109.0693303, 60.4151790, 91.9120934, 0.5816326, 177.6741585, 197.0663584, 154 | # 102.3145704, 65.9978484, 89.1613630, 3.1734446, 156.9202179, 29.8315553, 155 | # 92.3944947, 66.1421921, 105.4677530, 0.5729707, 110.5115346) 156 | # ) 157 | # }) 158 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/basics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "2. Simulating Multimodal Single-cell Datasets" 3 | output: 4 | BiocStyle::html_document: 5 | toc: true 6 | toc_depth: 2 7 | vignette: > 8 | %\VignetteEngine{knitr::knitr} 9 | %\VignetteIndexEntry{2. 
Simulating Multimodal Single-cell Datasets} 10 | %\usepackage[UTF-8]{inputenc} 11 | --- 12 | ```{r "setup", include=FALSE} 13 | require("knitr") 14 | opts_chunk$set(fig.width=4, fig.height=3) 15 | ``` 16 | 17 | ```{r install-packages, include=FALSE, message=FALSE, warning=FALSE, eval=FALSE} 18 | # The following chunk will install all the required packages. 19 | (function() { 20 | installed <- installed.packages()[,"Package"] 21 | install <- function(list, fn) { 22 | pkg <- setdiff(list, installed) 23 | if (length(pkg)) fn(pkg, dependencies=TRUE) 24 | } 25 | 26 | r_packages <- c( 27 | "devtools", "dplyr", "ggplot2", "Rtsne", "rlang", 28 | "reshape", "ape", "phytools", "repr", "KernelKnn", 29 | "gridExtra", "parallel", 'foreach', 'phytools', "doParallel", 30 | "zeallot", "gtools", "gplots", "roxygen2", "usethis" 31 | ) 32 | install(r_packages, install.packages) 33 | 34 | if (requireNamespace("BiocManager", quietly = TRUE)) { 35 | bioc_packages <- c('Biobase') 36 | install(bioc_packages, BiocManager::install) 37 | } 38 | })() 39 | ``` 40 | 41 | In this tutorial, we will demonstrate how to use scMultiSim to simulate multi-omics data 42 | with different biological effects, including: 43 | 44 | - Simulating true RNA counts and ATAC-seq data 45 | - Controlling the cell population and GRN effects 46 | - Adding technical variation and batch effect to the true counts 47 | - Adjusting the parameters to control different biological effects 48 | 49 | We first load the package: 50 | 51 | ```{r load-package, quietly=TRUE, message=FALSE, warning=FALSE} 52 | library("scMultiSim") 53 | ``` 54 | 55 | # Simulating True Counts 56 | 57 | scMultiSim first generates the true RNA counts, and then add technical variation and batch effect to the true counts. 58 | To simulate true counts, call `sim_true_counts(options)` where `options` is a 59 | list. 
You can use `scmultisim_help()` to get help on the options, 60 | or a call like `scmultisim_help("num.cells")` to get help on a specific option. 61 | 62 | ```{r scmultisim-help, echo = TRUE, results = "hide"} 63 | scmultisim_help("options") 64 | ``` 65 | 66 | ## GRN and Differentiation Tree 67 | 68 | Before we start, we define a utility function to modify a list. 69 | ```{r load-dplyr, quietly=TRUE, message=FALSE, warning=FALSE} 70 | library(dplyr) 71 | ``` 72 | ```{r define-list-modify} 73 | list_modify <- function (curr_list, ...) { 74 | args <- list(...) 75 | for (i in names(args)) { 76 | curr_list[[i]] <- args[[i]] 77 | } 78 | curr_list 79 | } 80 | ``` 81 | 82 | The minimal input to scMultiSim is a **differentiation tree**, and you can optionally provide 83 | ground truth for GRN and cell-cell interactions. 84 | The differentiation tree is an R phylo object, which can be created using e.g. 85 | `ape::read.tree()` or `ape::rtree()`. 86 | It controls the cell population structure: each node of the tree should represent a cell type, 87 | and connected nodes indicate the differentiation relationship between cell types. 88 | _scMultiSim provides this explicit control on the cell population structure 89 | while preserving all other effects (such as GRN and Cell-Cell Interactions)_, 90 | so you can generate any cell trajectory or clustering structure you want, which is especially useful 91 | for benchmarking trajectory inference and clustering methods. 92 | 93 | If generating a continuous population, this tree 94 | specifies the cell differentiation trajectory; if generating a discrete population, the 95 | tips of this tree will be the clusters (cell types are the terminal cell states). 96 | 97 | scMultiSim also provides three differentiation trees. 98 | `Phyla5()` and `Phyla3()` return bifurcating trees with 5 and 3 leaves respectively. 99 | `Phyla1()` returns only a single branch, which can be useful when we don't want any specific trajectory.
100 | ```{r plot-tree, fig.width = 8, fig.height = 4} 101 | par(mfrow=c(1,2)) 102 | Phyla5(plotting = TRUE) 103 | Phyla3(plotting = TRUE) 104 | 105 | # It's not possible to plot Phyla1() because it only contains 1 branch connecting two nodes. 106 | Phyla1() 107 | ``` 108 | 109 | If you only need `n` cell clusters without any specific trajectory, you can use code like below to generate a simple tree with `n` leaves. 110 | ```{r random-tree} 111 | # tree with four leaves 112 | ape::read.tree(text = "(A:1,B:1,C:1,D:1);") 113 | ``` 114 | 115 | The GRN should be a data frame with 3 columns, each representing the `target`, `regulator`, and `effect`. 116 | The target and regulator should be gene names, which can be integers or strings. 117 | The effect should be a numeric value, indicating the effect of the regulator on the target. 118 | 119 | scMultiSim provides two sample GRNs, `GRN_params_100` and `GRN_params_1139`, 120 | which contain 100 and 1139 genes respectively. 121 | Let's load the smaller one first. 122 | ```{r load-grn} 123 | data(GRN_params_100) 124 | GRN_params <- GRN_params_100 125 | head(GRN_params) 126 | ``` 127 | 128 | ## Simulating True Counts 129 | 130 | Now, we create the options list for the simulation session. 131 | In the following example, we simulate 300 cells with 20 CIFs. 132 | 133 | The number of genes is determined by the option `num.genes` or the number of genes in the GRN. 134 | If `num.genes` is not specified, the number of genes will be the number of unique genes in the GRN, 135 | plus a fraction of genes that are not regulated by any other genes. 136 | This fraction is controlled by the option `unregulated.gene.ratio` (default is 0.1). 137 | Since our `GRN_params` contains 100 gene names, 10% more genes will be added to the simulation, 138 | and the number of genes in the simulated data will be 110. 139 | If you don't need to simulate GRN effects, simply set `GRN = NA`. 140 | 141 | The `cif.sigma` controls the variance of the CIFs.
Usually, with `cif.sigma` = 0.1, 142 | the trajectory will be very clear, while with `cif.sigma` = 1, the trajectory will be more 143 | noisy. We use `cif.sigma` = 1 in this example. 144 | 145 | We also set the `do.velocity` option to use the Kinetic model to simulate RNA velocity data. 146 | 147 | ```{r define-options} 148 | set.seed(42) 149 | 150 | options <- list( 151 | GRN = GRN_params, 152 | num.cells = 300, 153 | num.cifs = 20, 154 | cif.sigma = 1, 155 | tree = Phyla5(), 156 | diff.cif.fraction = 0.8, 157 | do.velocity = TRUE 158 | ) 159 | ``` 160 | 161 | ### Omitting the GRN 162 | 163 | Note that the minimal input to scMultiSim is the cell population structure (differentiation tree) and number of cells. 164 | You can omit the GRN by using `GRN = NA`: 165 | ``` 166 | options <- list( 167 | GRN = NA, 168 | num.cells = 1000, 169 | num.genes = 500, 170 | tree = Phyla5() 171 | ) 172 | ``` 173 | 174 | ### Running the Simulation 175 | 176 | Now we run the simulation and check what kind of data is in the returned result: 177 | ```{r run-simulation} 178 | results <- sim_true_counts(options) 179 | names(results) 180 | ``` 181 | 182 | ## Accessing the Results 183 | 184 | The return value will be a `scMultiSim Environment` object, 185 | and you can access various data and parameters using the `$` operator. 186 | 187 | - `counts`: Gene-by-cell scRNA-seq counts. 188 | - `atac_counts`: Region-by-cell scATAC-seq counts. 189 | - `region_to_gene`: Region-by-gene 0-1 matrix indicating the corresponding relationship between chromatin regions and genes. 190 | - `atacseq_data`: The "clean" scATAC-seq counts without added intrinsic noise. 191 | - `cell_meta`: A dataframe containing cell type labels and pseudotime information. 192 | - `cif`: The CIF used during the simulation. 193 | - `giv`: The GIV used during the simulation. 194 | - `kinetic_params`: The kinetic parameters used during the simulation. 195 | - `.grn`: The GRN used during the simulation.
196 | - `.grn$regulators`: The list of TFs used by all gene-by-TF matrices. 197 | - `.grn$geff`: Gene-by-TF matrix representing the GRN used during the simulation. 198 | - `.n`: Other metadata, e.g. `.n$cells` is the number of cells. 199 | 200 | If `do.velocity` is enabled, it has these additional fields: 201 | 202 | - `unspliced_counts`: Gene-by-cell unspliced RNA counts. 203 | - `velocity`: Gene-by-cell RNA velocity ground truth. 204 | - `cell_time`: The pseudotime at which the cell counts were generated. 205 | 206 | If dynamic GRN is enabled, it has these additional fields: 207 | 208 | - `cell_specific_grn`: A list of length `n_cells`. Each element is a gene-by-TF matrix, indicating the cell's GRN. 209 | 210 | If cell-cell interaction is enabled, it has these additional fields: 211 | 212 | - `grid`: The grid object used during the simulation. 213 | - `grid$get_neighbours(i)`: Get the neighbour cells of cell `i`. 214 | - `cci_locs`: A dataframe containing the X and Y coordinates of each cell. 215 | - `cci_cell_type_param`: A dataframe containing the CCI network ground truth: all ligand-receptor pairs between each pair of cell types. 216 | - `cci_cell_types`: For continuous cell population, the sub-divided cell types along the trajectory used when simulating CCI. 217 | 218 | If it is a debug session (`debug = TRUE`), a `sim` field is available, 219 | which is an environment contains all internal states and data structures. 
220 | 221 | ## Visualizing the Results 222 | 223 | We can visualize the true counts and ATAC-seq data using `plot_tsne()`: 224 | ```{r plot-counts, fig.width = 4, fig.height = 3.5, out.width = "60%"} 225 | plot_tsne(log2(results$counts + 1), 226 | results$cell_meta$pop, 227 | legend = 'pop', plot.name = 'True RNA Counts Tsne') 228 | plot_tsne(log2(results$atacseq_data + 1), 229 | results$cell_meta$pop, 230 | legend = 'pop', plot.name = 'True ATAC-seq Tsne') 231 | ``` 232 | 233 | 234 | Since we also have RNA velocity enabled, the `results` also contains the following data: 235 | - `velocity`: the true RNA velocity (genes x cells) 236 | - `unspliced_counts`: the true unspliced RNA counts (genes x cells) 237 | 238 | ```{r plot-velocity, fig.width = 4, fig.height = 3.5, out.width = "60%"} 239 | plot_rna_velocity(results, arrow.length = 2) 240 | ``` 241 | 242 | We can inspect the gene-gene correlation using `plot_gene_module_cor_heatmap(results)`: 243 | ```{r plot-gene-correlation, fig.width = 8, fig.height = 8} 244 | plot_gene_module_cor_heatmap(results) 245 | ``` 246 | 247 | # Adding Technical Variation and Batch Effect 248 | 249 | We can also add the technical variation and batch effect to the true counts. 250 | 251 | ## Adding technical noise 252 | 253 | Simply use the `add_expr_noise` function to add technical noise to the dataset. 254 | 255 | ```{r technical-noise} 256 | add_expr_noise( 257 | results, 258 | # options go here 259 | alpha_mean = 1e4 260 | ) 261 | ``` 262 | 263 | A `counts_obs` field will be added to the `results` object. 264 | 265 | This function also accepts a list of options. See the documentation for more details. 266 | 267 | - `protocol`: `"umi"` or `"nonUMI"`, whether simulate the UMI protocol. 268 | - `alpha_mean`, `alpha_sd`: Mean and deviation of rate of subsampling of transcripts during capture step. 269 | - `alpha_gene_mean`, `alpha_gene_sd`: `alpha` parameters, but gene-wise. 
270 | - `depth_mean`, `depth_sd`: Mean and standard deviation of sequencing depth. 271 | - `gene_len`: A vector with lengths of all genes. 272 | - `atac.obs.prob`: For each integer count of a particular region for a particular cell, the probability the count will be observed. 273 | - `atac.sd.frac`: The fraction of ATAC-seq data value used as the standard deviation of added normally distributed noise. 274 | - `randseed`: random seed. 275 | 276 | ## Adding batch effects 277 | 278 | Finally, use the `divide_batches` function to add batch effects. 279 | 280 | ```{r batch-effects} 281 | divide_batches( 282 | results, 283 | nbatch = 2, 284 | effect = 1 285 | ) 286 | ``` 287 | 288 | A `counts_with_batches` field will be added to the `results` object. 289 | 290 | The available options are: 291 | 292 | - `nbatch`: Number of batches. 293 | - `effect`: The batch effect size. 294 | 295 | We can visualize the result with technical noise and batches: 296 | 297 | ```{r add-expr-noise, fig.width = 4, fig.height = 3.5, out.width = "60%"} 298 | plot_tsne(log2(results$counts_with_batches + 1), 299 | results$cell_meta$pop, 300 | legend = 'pop', plot.name = 'RNA Counts Tsne with Batches') 301 | ``` 302 | 303 | # Adjusting Parameters 304 | 305 | scMultiSim provides various parameters to control each type of biological effect. 306 | Here, we describe the most important parameters and how they affect the simulation results: 307 | 308 | - `num.cifs`, `diff.cif.fraction` 309 | - `cif.mean`, `cif.sigma` 310 | - `discrete.cif` 311 | - `intrinsic.noise` 312 | 313 | For a complete list of parameters, please check out the [Parameter Guide](https://zhanglabgt.github.io/scMultiSim/articles/options) 314 | page in the documentation. 315 | 316 | ## The Shiny App 317 | 318 | scMultiSim provides a Shiny app to help you generate the options list and visualize the effects of different parameters. 319 | It is highly recommended to use the Shiny app to explore the available parameters.
320 | You can run the app by calling `run_shiny()`. 321 | 322 | ```{r run-shiny, eval=FALSE} 323 | run_shiny() 324 | ``` 325 | 326 | ![Shiny App](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/shiny_app_sc.png) 327 | 328 | ## Deciding Number of CIFs: `num.cifs` 329 | 330 | In scMultiSim, users set `num.cifs` to control the total number of diff-CIFs and non-diff-CIFs. 331 | The number of CIFs should be large enough to represent the cell population structure and gene information. 332 | By default, `num.cifs` is set to 50, which is a good starting point for most cases. 333 | However, each gene's base expression is affected by two random diff-CIF entries, 334 | therefore if you have a large number of genes, they may have similar expression patterns, which may not be ideal. 335 | It is recommended to increase `num.cifs` to 50-100 if you have more than 2000 genes. 336 | If you have a small number of genes (less than 1000), you can also decrease `num.cifs` to 20-40. 337 | 338 | ## Discrete Cell Population: `discrete.cif` 339 | 340 | We can also simulate a discrete cell population by setting `discrete.cif = TRUE`. 341 | In this case, each tip of the tree will be one cell type, 342 | therefore there will be 5 clusters in the following result. 343 | 344 | ```{r simulate-discrete, fig.width = 4, fig.height = 3.5, out.width = "60%"} 345 | set.seed(42) 346 | 347 | options <- list( 348 | GRN = GRN_params, 349 | num.cells = 400, 350 | num.cifs = 20, 351 | tree = Phyla5(), 352 | diff.cif.fraction = 0.8, 353 | discrete.cif = TRUE 354 | ) 355 | 356 | results <- sim_true_counts(options) 357 | 358 | plot_tsne(log2(results$counts + 1), 359 | results$cell_meta$pop, 360 | legend = 'pop', plot.name = 'True RNA Counts Tsne') 361 | ``` 362 | 363 | ## Adjusting the Effect of Cell Population: `diff.cif.fraction` 364 | 365 | In scMultiSim, the differentiation tree provides explicit control of the cell population.
366 | The effect of the tree can be adjusted by the option `diff.cif.fraction`, 367 | which controls how many CIFs are affected by the cell population. 368 | With a larger `diff.cif.fraction`, the effect of cell population will be larger 369 | and you may see a clearer trajectory or well separated clusters. 370 | With a smaller `diff.cif.fraction`, the resulting RNA counts will be more affected by 371 | other factors, such as the GRN. 372 | 373 | Now let's visualize the trajectory with different `diff.cif.fraction` values: 374 | 375 | ```{r adjust-diff-cif-fraction, fig.width = 4, fig.height = 3.5, out.width = "60%"} 376 | set.seed(42) 377 | 378 | options <- list( 379 | GRN = GRN_params, 380 | num.cells = 300, 381 | num.cifs = 20, 382 | tree = Phyla5(), 383 | diff.cif.fraction = 0.8 384 | ) 385 | 386 | results <- sim_true_counts( 387 | options %>% list_modify(diff.cif.fraction = 0.4)) 388 | plot_tsne(log2(results$counts + 1), 389 | results$cell_meta$pop, 390 | legend = 'pop', plot.name = 'RNA Counts (diff.cif.fraction = 0.4)') 391 | 392 | results <- sim_true_counts( 393 | options %>% list_modify(diff.cif.fraction = 0.9)) 394 | plot_tsne(log2(results$counts + 1), 395 | results$cell_meta$pop, 396 | legend = 'pop', plot.name = 'RNA Counts (diff.cif.fraction = 0.9)') 397 | ``` 398 | 399 | ## Adjusting the Inherent Cell Heterogeneity: `cif.mean` and `cif.sigma` 400 | 401 | The inherent cell heterogeneity is controlled by the non-diff-CIF, 402 | which is sampled from a normal distribution with mean `cif.mean` and standard deviation `cif.sigma`. 403 | Therefore, the larger `cif.sigma` is, the larger the inherent cell heterogeneity is.
404 | 405 | Now, let's visualize the effect of `cif.sigma`: 406 | 407 | ```{r adjust-cif-sigma, fig.width = 4, fig.height = 3.5, out.width = "60%"} 408 | set.seed(42) 409 | 410 | options <- list( 411 | GRN = GRN_params, 412 | num.cells = 300, 413 | num.cifs = 20, 414 | tree = Phyla5(), 415 | diff.cif.fraction = 0.8, 416 | cif.sigma = 0.5 417 | ) 418 | 419 | results <- sim_true_counts( 420 | options %>% list_modify(cif.sigma = 0.1)) 421 | plot_tsne(log2(results$counts + 1), 422 | results$cell_meta$pop, 423 | legend = 'pop', plot.name = 'RNA Counts (cif.sigma = 0.1)') 424 | 425 | results <- sim_true_counts( 426 | options %>% list_modify(cif.sigma = 1.0)) 427 | plot_tsne(log2(results$counts + 1), 428 | results$cell_meta$pop, 429 | legend = 'pop', plot.name = 'RNA Counts (cif.sigma = 1.0)') 430 | ``` 431 | 432 | ## Adjusting the Intrinsic Noise: `intrinsic.noise` 433 | 434 | If we set `do.velocity = FALSE`, scMultiSim will simulate the RNA counts using the Beta-Poisson model, 435 | which is faster but doesn't output RNA velocity. 436 | When using the Beta-Poisson model, scMultiSim provides an `intrinsic.noise` parameter to control the 437 | intrinsic noise during the transcription process. 438 | By default, `intrinsic.noise` is set to 1, which means the true counts will be sampled from the Beta-Poisson 439 | model. If we set `intrinsic.noise` to a smaller value like 0.5, 440 | the true counts will be 0.5 * (theoretical mean) + 0.5 * (sampled from the Beta-Poisson model). 441 | _More intrinsic noise will make the encoded effects (e.g.
GRN) harder to be inferred._ 442 | 443 | 444 | ```{r adjust-intrinsic-noise, fig.width = 4, fig.height = 3.5, out.width = "60%"} 445 | set.seed(42) 446 | 447 | options <- list( 448 | GRN = GRN_params, 449 | num.cells = 300, 450 | num.cifs = 20, 451 | tree = Phyla5(), 452 | diff.cif.fraction = 0.8, 453 | intrinsic.noise = 1 454 | ) 455 | 456 | results <- sim_true_counts( 457 | options %>% list_modify(intrinsic.noise = 0.5)) 458 | plot_tsne(log2(results$counts + 1), 459 | results$cell_meta$pop, 460 | legend = 'pop', plot.name = 'RNA Counts (intrinsic.noise = 0.5)') 461 | 462 | results <- sim_true_counts( 463 | options %>% list_modify(intrinsic.noise = 1)) 464 | plot_tsne(log2(results$counts + 1), 465 | results$cell_meta$pop, 466 | legend = 'pop', plot.name = 'RNA Counts (intrinsic.noise = 1)') 467 | ``` 468 | 469 | ## Adjust the effect of chromatin accessibility: `atac.effect` 470 | 471 | `atac.effect` Controls the contribution of the chromatin accessibility. 472 | A higher `atac.effect` means the RNA counts are more affected by the ATAC-seq data, 473 | therefore the correlation between the ATAC-seq and RNA-seq data will be higher. 474 | 475 | # Simulating Dynamic GRN 476 | 477 | First, call the following function to check the usage of dynamic GRN. 478 | ```{r help-dynamic-grn} 479 | scmultisim_help("dynamic.GRN") 480 | ``` 481 | 482 | Here we use `Phyla1()` as the differentiation tree to remove the effect of the trajectory. Additionally, we can use `discrete.cif = TRUE` to simulate discrete cell population. 
483 | ```{r define-options-dynamic-grn} 484 | set.seed(42) 485 | 486 | options_ <- list( 487 | GRN = GRN_params, 488 | num.cells = 300, 489 | num.cifs = 20, 490 | tree = Phyla1(), 491 | diff.cif.fraction = 0.8, 492 | do.velocity = FALSE, 493 | dynamic.GRN = list( 494 | cell.per.step = 3, 495 | num.changing.edges = 5, 496 | weight.mean = 0, 497 | weight.sd = 4 498 | ) 499 | ) 500 | 501 | results <- sim_true_counts(options_) 502 | ``` 503 | 504 | `results$cell_specific_grn` is a list containing the gene effects matrix for each cell. Each row is a target and each column is a regulator. The corresponding gene names are displayed as column and row names. 505 | ```{r show-cell-specific-grn} 506 | # GRN for cell 1 (first 10 rows) 507 | results$cell_specific_grn[[1]][1:10,] 508 | ``` 509 | 510 | Since we set `cell.per.step = 3`, we expect each adjacent 3 cells share the same GRN: 511 | ```{r check-cell-specific-grn} 512 | print(all(results$cell_specific_grn[[1]] == results$cell_specific_grn[[2]])) 513 | print(all(results$cell_specific_grn[[2]] == results$cell_specific_grn[[3]])) 514 | print(all(results$cell_specific_grn[[3]] == results$cell_specific_grn[[4]])) 515 | ``` 516 | 517 | # Session Information 518 | 519 | ```{r session-info} 520 | sessionInfo() 521 | ``` -------------------------------------------------------------------------------- /vignettes/options.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "4. Parameter Guide" 3 | output: 4 | BiocStyle::html_document: 5 | toc: true 6 | toc_depth: 2 7 | vignette: > 8 | %\VignetteEngine{knitr::knitr} 9 | %\VignetteIndexEntry{4. Parameter Guide} 10 | %\usepackage[UTF-8]{inputenc} 11 | --- 12 | 13 | ```{r, include = FALSE} 14 | knitr::opts_chunk$set( 15 | collapse = TRUE, 16 | comment = "#>" 17 | ) 18 | ``` 19 | 20 | This article introduces the available options in `scMultiSim`. 
21 | 22 | The following flow chart shows the workflow of `scMultiSim` and each parameter's role in the simulation. 23 | 24 | ![scMultiSim parameters flow chart](https://github.com/ZhangLabGT/scMultiSim/raw/img/img/params.png) 25 | 26 | ## Options: General 27 | 28 | ### rand.seed 29 | 30 | > integer (default: `0`) 31 | 32 | scMultiSim should produce the same result if all other parameters are the same. 33 | 34 | ### threads 35 | 36 | > integer (default: `1`) 37 | 38 | Use multithreading only when generating the CIF matrix. 39 | It will not speed up the simulation a lot, thus not recommended. 40 | 41 | ### speed.up 42 | 43 | > logical (default: `FALSE`) 44 | 45 | Enable experimental speed-up mode. 46 | It is recommended to **enable** this option, and it will be the default in the future. 47 | Currently, it is disabled for reproducibility. 48 | 49 | ## Options: Genes 50 | 51 | ### GRN 52 | 53 | > A data frame with 3 columns as below. 54 | > Supply `NA` to disable the GRN effect. (required) 55 | 56 | | Column | Value | 57 | | ------ | ------------------------------------------ | 58 | | 1 | target gene ID: `integer or character`; | 59 | | 2 | regulator gene ID: `integer or character`; | 60 | | 3 | effect: `number`. | 61 | 62 | If `num.genes` is present, the gene IDs should not exceed this number. 63 | The gene IDs should start from 1 and should not skip any intermediate numbers. 64 | 65 | Two sample datasets `GRN_params_100` and `GRN_params_1139` from 66 | [Dibaeinia, P., & Sinha, S. (2020)](https://doi.org/10.1016/j.cels.2020.08.003) are provided for testing and inspection. 67 | 68 | ### num.genes 69 | 70 | > integer (default: `NULL`) 71 | 72 | If a GRN is supplied, override the total number of genes. 73 | It should be larger than the largest gene ID in the GRN. 74 | Otherwise, the number of genes will be determined by `N_genes * (1 + r_u)`, 75 | where `r_u` is `unregulated.gene.ratio`. 76 | 77 | If GRN is disabled, 78 | this option specifies the total number of genes.
79 | 80 | ### unregulated.gene.ratio 81 | 82 | > number > 0 (default: `0.1`) 83 | 84 | Ratio of unregulated to regulated genes. 85 | When a GRN is supplied with `N` genes, 86 | scMultiSim will simulate `N * r_u` extra (unregulated) genes. 87 | 88 | ### giv.mean, giv.sd, giv.prob 89 | 90 | > (default: `0, 1, 0.3`) 91 | 92 | The parameters used to sample the GIV matrix. 93 | With probability `giv.prob`, the value is sampled from N(`giv.mean`, `giv.sd`). 94 | Otherwise the value is 0. 95 | 96 | ### dynamic.GRN 97 | 98 | > list (default: `NULL`) 99 | 100 | Enables dynamic (cell-specific) GRN. 101 | Run `scmultisim_help("dynamic.GRN")` to see more explanation. 102 | 103 | ### hge.prop, hge.mean, hge.sd 104 | 105 | > (default: `0, 5, 1`) 106 | 107 | Treat some random genes as highly-expressed (house-keeping) genes. 108 | A proportion of `hge.prop` genes will have expression scaled by a 109 | multiplier sampled from N(`hge.mean`, `hge.sd`). 110 | 111 | ### hge.range 112 | 113 | > integer (default: `1`) 114 | 115 | When selecting highly-expressed genes, only choose genes with ID > `hge.range`. 116 | 117 | ### hge.max.var 118 | 119 | > number (default: `500`) 120 | 121 | When selecting highly-expressed genes, only choose genes 122 | with variation < `hge.max.var`. 123 | 124 | ## Options: Cells 125 | 126 | ### num.cells 127 | 128 | > integer (default: `1000`) 129 | 130 | The number of cells to be simulated. 131 | 132 | ### tree 133 | 134 | > phylo (default: `Phyla5()`) 135 | 136 | The cell differentiation tree, 137 | which will be used to generate cell trajectories (if `discrete.cif = FALSE`) 138 | or clusters (if `discrete.cif = TRUE`). 139 | In discrete population mode, only the tree tips will be used. 140 | Three demo trees, `Phyla5()`, `Phyla3()` and `Phyla1()`, are provided. 141 | 142 | ### discrete.cif 143 | 144 | > logical (default: `FALSE`) 145 | 146 | Whether the cell population is discrete (continuous otherwise).
147 | 148 | ### discrete.min.pop.size, discrete.min.pop.index 149 | 150 | > integer, integer (default: `70, 1`) 151 | 152 | In discrete population mode, specify one cluster to have the 153 | smallest cell population. 154 | The cluster will contain `discrete.min.pop.size` cells. 155 | `discrete.min.pop.index` should be a valid cluster index (tree tip number). 156 | 157 | ### discrete.pop.size 158 | 159 | > integer vector (default: `NA`); e.g. `c(200, 250, 300)` 160 | 161 | Manually specify the size of each cluster. 162 | 163 | ## Options: CIF 164 | 165 | ### num.cifs 166 | 167 | > integer (default: `50`) 168 | 169 | Total number of differential and non-differential CIFs, 170 | which can be viewed as a latent representation of cells. 171 | 172 | ### diff.cif.fraction 173 | 174 | > number (default: `0.9`) 175 | 176 | Fraction of differential CIFs. 177 | Differential CIFs encode the cell type information, 178 | while non-differential CIFs are randomly sampled for each cell. 179 | 180 | ### cif.center, cif.sigma 181 | 182 | > (default: `1, 0.1`) 183 | 184 | The distribution used to sample CIF values. 185 | 186 | ### use.impulse 187 | 188 | > logical (default: `FALSE`) 189 | 190 | In continuous population mode, when sampling CIFs along the tree, 191 | use the impulse model rather than the default Gaussian random walk. 192 | 193 | ## Options: Simulation - ATAC 194 | 195 | ### atac.effect 196 | 197 | > number ∈ [0, 1] (default: `0.5`) 198 | 199 | The influence of chromatin accessibility data on gene expression. 200 | 201 | ### region.distrib 202 | 203 | > vector of length 3, should sum to 1 (default: `c(0.1, 0.5, 0.4)`) 204 | 205 | The probability that a gene is regulated by 0, 1, 2 206 | consecutive regions, respectively. 207 | 208 | ### atac.p_zero 209 | 210 | > number ∈ [0, 1] (default: `0.8`) 211 | 212 | The proportion of zeros we see in the simulated scATAC-seq data.
213 | 214 | ### riv.mean, riv.sd, riv.prob 215 | 216 | > (default: `0, 1, 0.3`) 217 | 218 | The parameters used to sample the RIV (Region Identity Vectors). 219 | With probability `riv.prob`, the value is sampled from N(`riv.mean`, `riv.sd`). 220 | Otherwise the value is 0. 221 | 222 | ## Customization 223 | 224 | ### mod.cif.giv 225 | 226 | > function (default: `NULL`) 227 | 228 | Modify the generated CIF and GIV. 229 | The function takes four arguments: the kinetic parameter index (1=kon, 2=koff, 3=s), 230 | the current CIF matrix, the GIV matrix, and the cell metadata dataframe. 231 | It should return a list of two elements: the modified CIF matrix and the modified GIV matrix. 232 | 233 | ```R 234 | sim_true_counts(list( 235 | # ... 236 | mod.cif.giv = function(i, cif, giv, meta) { 237 | # modify cif and giv 238 | return(list(cif, giv)) 239 | } 240 | )) 241 | ``` 242 | 243 | ### ext.cif.giv 244 | 245 | > function (default: `NULL`) 246 | 247 | Add extra CIF and GIV. 248 | The function takes one argument, the kinetic parameter index (1=kon, 2=koff, 3=s). 249 | It should return a list of two elements: the extra CIF matrix `(n_extra_cif x n_cells)` 250 | and the GIV matrix `(n_genes x n_extra_cif)`. Return `NULL` for no extra CIF and GIV. 251 | 252 | ```R 253 | sim_true_counts(list( 254 | # ... 255 | ext.cif.giv = function(i) { 256 | # add extra cif and giv 257 | return(list(extra_cif, extra_giv)) 258 | } 259 | )) 260 | ``` 261 | 262 | ## Options: Simulation 263 | 264 | ### vary 265 | 266 | > character (default: `"s"`) 267 | 268 | Can be `"all", "kon", "koff", "s", "except_kon", "except_koff", "except_s"`. 269 | It specifies which kinetic parameters to vary across cells, i.e. which kinetic parameters have differential CIFs 270 | sampled from the tree. 271 | 272 | ### bimod 273 | 274 | > number (default: `0`) 275 | 276 | A number between 0 and 1, which adjust the bimodality of the gene expression distribution.
277 | 278 | ### scale.s 279 | 280 | > number (default: `1`) 281 | 282 | Manually scale the final `s` parameter, thus the gene expression. 283 | When `discrete.cif = T`, it can be a vector specifying the `scale.s` for each cluster. 284 | In this case, you can use a smaller value for cell types known to be small (like naive cells). 285 | 286 | ### intrinsic.noise 287 | 288 | > number (default: `1`) 289 | 290 | A number between 0 and 1, which specifies the weight of the random sample from the Beta-Poisson distribution. 291 | 292 | ``` 293 | 0 <----------------------> 1 294 | Theoretical mean Random sample from 295 | Beta-Poisson distribution 296 | ``` 297 | 298 | ## Options: Simulation - RNA Velocity 299 | 300 | ### do.velocity 301 | 302 | > logical (default: `FALSE`) 303 | 304 | When set to `TRUE`, 305 | simulate using the full kinetic model and generate RNA velocity data. 306 | Otherwise, the Beta-Poisson model will be used. 307 | 308 | ### beta 309 | 310 | > number (default: `0.4`) 311 | 312 | The splicing rate of each gene in the kinetic model. 313 | 314 | ### d 315 | 316 | > number (default: `1`) 317 | 318 | The degradation rate of each gene in the kinetic model. 319 | 320 | ### num.cycles 321 | 322 | > number (default: `3`) 323 | 324 | The number of cycles run before sampling the gene expression of a cell. 325 | 326 | ### cycle.len 327 | 328 | > number (default: `1`) 329 | 330 | In velocity mode, a multiplier for the cell cycle length. 331 | It is multiplied by the expected time to 332 | transition from k_on to k_off and back to form the length of a cycle. 333 | 334 | ## Options: Simulation - Spatial Cell-Cell Interaction 335 | 336 | The simulation of cell-cell interaction can be enabled by passing a `list` as the `cci` option. 337 | In this list, you can specify the following options: 338 | 339 | ### grid.size 340 | 341 | > integer 342 | 343 | Manually specify the width and height of the grid.
344 | 345 | ### layout 346 | 347 | > "enhanced", "layers", "islands", or a function (default: `"enhanced"`) 348 | 349 | Specify the layout of the cell types. 350 | scMultiSim provides three built-in layouts: `"enhanced"`, `"layers"`, and `"islands"`. 351 | 352 | If set to `"islands"`, you can specify which cell types are the islands, e.g. `"islands:1,2"`. 353 | 354 | If using a custom function, it should take two arguments: `function (grid_size, cell_types)` 355 | - grid_size: (integer) The width and height of the grid. 356 | - cell_types: (integer vector) Each cell's cell type. 357 | 358 | It should return a `n_cell x 2` matrix, where each row is the x and y coordinates of a cell. 359 | 360 | ### step.size 361 | 362 | > number 363 | 364 | If using continuous population, use this step size to further divide the 365 | cell types on the tree. For example, if the tree only has one branch `a -> b` 366 | and the branch length is 1 while the step size is 0.34, there will be totally three cell types: a_b_1, a_b_2, a_b_3. 367 | 368 | ### params 369 | 370 | > data.frame 371 | 372 | The spatial effect between a ligand and a receptor gene. 373 | It should be a data frame similar to the GRN parameter, i.e. with columns `receptor`, `ligand`, and `effect`. 374 | 375 | Example: 376 | ```R 377 | cci = list( 378 | params = data.frame( 379 | target = c(2, 6, 10, 8, 20, 30), 380 | regulator = c(101, 102, 103, 104, 105, 106), 381 | effect = 20 382 | ) 383 | ) 384 | ``` 385 | 386 | ### cell.type.interaction 387 | 388 | > "random" or a matrix 389 | 390 | Specify which cell types can communicate using which ligand-receptor pair. 391 | It should be a 3d `n_cell_types x n_cell_types x n_ligand_pair` numeric matrix. 392 | The value at (i, j, k) is 1 if there exist CCI of LR-pair k between cell type i and cell type j. 393 | 394 | This matrix can be generated using the `cci_cell_type_params()` function. 395 | It can fill the matrix randomly, or return an empty matrix for you to fill manually. 
396 | If you want to fill it randomly, you can simply supply `"random"` for this option. 397 | 398 | ### cell.type.lr.pairs 399 | 400 | > integer vector 401 | 402 | If `cell.type.interaction` is `"random"`, specify how many LR pairs should be enabled between each cell type pair. 403 | Should be a range, e.g. `4:6`. The actual number of LR pairs will be uniformly sampled from this range. 404 | 405 | ### max.neighbors 406 | 407 | > integer 408 | 409 | The number of interacting cells for each cell. 410 | If the cell's available neighbor count is not large enough, the actual interacting cells may be smaller than this value. 411 | 412 | ### radius 413 | 414 | > number (default: `1`), or "gaussian:sigma" 415 | 416 | Which cells should be considered as neighbors. 417 | The interacting cells are those within these neighbors. 418 | 419 | When it is a number, it controls the maximum distance between two cells for them to interact. 420 | 421 | When it is a string, it should be in the format `gaussian:sigma`, for example, `gaussian:1.2`. 422 | In this case, the probability of two cells interacting is proportional to the distance with a Gaussian kernel applied. 423 | 424 | ### start.layer 425 | 426 | > integer 427 | 428 | From which layer (time step) the simulation should start. 429 | If set to 1, the simulation will start with one cell in the grid and add one more cell in each following layer. 430 | If set to `num_cells`, the simulation will start from all cells available in the grid 431 | and only continues for a few static layers, which will greatly speed up the simulation. 432 | -------------------------------------------------------------------------------- /vignettes/spatialCCI.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "3. 
Simulating Spatial Cell-Cell Interactions" 3 | output: 4 | BiocStyle::html_document: 5 | toc: true 6 | toc_depth: 2 7 | vignette: > 8 | %\VignetteEngine{knitr::knitr} 9 | %\VignetteIndexEntry{3. Simulating Spatial Cell-Cell Interactions} 10 | %\usepackage[UTF-8]{inputenc} 11 | --- 12 | 13 | ```{r "setup", include=FALSE} 14 | require("knitr") 15 | opts_chunk$set(fig.width=4, fig.height=3) 16 | 17 | # devtools::load_all(".") 18 | ``` 19 | 20 | 21 | ## Simulating Spatial Cell-Cell Interactions 22 | 23 | scMultiSim can simulate spatial cell-cell interactions. 24 | To do so, we need to provide the `cci` option as a list. 25 | The following code will print more instructions on how to use the `cci` option. 26 | 27 | ```{r help-cci} 28 | library(scMultiSim) 29 | 30 | scmultisim_help("cci") 31 | ``` 32 | 33 | Now, we prepare a ligand-receptor interaction database. 34 | This is pretty similar to the GRN network: it is a data frame with three columns, 35 | specifying `target`, `regulator`, and `effect`, respectively. 36 | The target and regulator columns should contain the IDs of the target and regulator genes. 37 | In the following example, we have two ligand-receptor pairs interacting between two neighboring cells. 38 | 39 | ```{r cci-network} 40 | lig_params <- data.frame( 41 | target = c(101, 102), 42 | regulator = c(103, 104), 43 | effect = c(5.2, 5.9) 44 | ) 45 | ``` 46 | 47 | We can now simulate the spatial cell-cell interactions. 48 | In scMultiSim, the CCI network is cell-type based, which means that between each cell type pair, 49 | we can have a different CCI network sampled from the database defined above. 50 | Here, we set the `step.size` to 0.5, so the differentiation tree is divided into segments of length 0.5, 51 | and each segment is treated as a cell type in CCI. 52 | We set `cell.type.interaction` to `random`, so the CCI network between each cell type pair is randomly sampled from the database. 53 | 54 | Here, we use only 80 cells to speed up the simulation.
Feel free to try a larger number of cells when running this vignette locally. 55 | 56 | ```{r} 57 | data(GRN_params_100) 58 | set.seed(42) 59 | 60 | options_ <- list( 61 | GRN = GRN_params_100, 62 | speed.up = TRUE, 63 | num.genes = 120, 64 | num.cells = 80, 65 | num.cifs = 20, 66 | cif.sigma = 0.2, 67 | tree = Phyla3(), 68 | intrinsic.noise = 0.5, 69 | cci = list( 70 | params = lig_params, 71 | max.neighbors = 4, 72 | grid.size = 13, 73 | cell.type.interaction = "random", 74 | step.size = 0.5 75 | ) 76 | ) 77 | 78 | results <- sim_true_counts(options_) 79 | ``` 80 | 81 | The `results$cell_meta` will contain the cell type information used in CCI. 82 | We can plot the cell spatial locations using `plot_cell_loc()`. 83 | The arrows indicate cell-cell interactions between two cells (for the first ligand-receptor pair). 84 | 85 | ```{r plot-cell-loc, fig.width=6, fig.height=6} 86 | plot_cell_loc(results) 87 | ``` 88 | 89 | The cell locations are available in `results$cci_locs`. 90 | 91 | ```{r print-cell-loc} 92 | head(results$cci_locs) 93 | ``` 94 | 95 | ### Speeding up the Simulation 96 | 97 | Simulating spatial cell-cell interactions can be computationally expensive. 98 | Setting these two options can speed up the simulation: 99 | 100 | ``` 101 | options_ <- list( 102 | # ... 103 | speed.up = T, 104 | cci = list( 105 | # ... 106 | start.layer = ncells 107 | ) 108 | ) 109 | ``` 110 | 111 | First of all, it is recommended to set the experimental `speed.up = T` option. This option will become default in later versions of scMultiSim. 112 | 113 | Next, it is possible to set the CCI option `start.layer = n_cells`, where `n_cells` is the number of cells. 114 | scMultiSim simulates a spatial dataset by following `n_cells` steps, adding one more cell to the spatial grid in each step. 115 | Only the final step is outputted as the result. 116 | The CCI option `start.layer` can be used to start simulation from a specific time step. 
117 | When set to `n_cells`, the simulation will skip all previous steps by adding all cells at once. 118 | By default, `start.layer` will be set to `n_cells` when number of cells is greater than 800. 119 | 120 | 121 | ## Spatial layouts 122 | 123 | scMultiSim provides powerful customization options for spatial cell layouts. 124 | 125 | ### Built-in layouts 126 | 127 | scMultiSim ships with several built-in spatial layouts. 128 | The `enhanced` layout is the default layout, where cells are added to the grid one by one. 129 | When adding a new cell, it has a higher probability of being placed near the existing cells of the same cell type. 130 | ```{r layout-enhanced, fig.width=6, fig.height=6} 131 | # helper function to add `layout` to options, to make the code more readable 132 | spatial_options <- function (...) { 133 | cci_opt <- list( 134 | params = lig_params, 135 | max.neighbors = 4, 136 | start.layer = 300, 137 | grid.size = 28, 138 | cell.type.interaction = "random" 139 | ) 140 | list( 141 | rand.seed = 0, 142 | GRN = GRN_params_100, 143 | speed.up = TRUE, 144 | num.genes = 200, 145 | num.cells = 300, 146 | num.cifs = 50, 147 | tree = Phyla3(), 148 | cci = c(cci_opt, list(...)) 149 | ) 150 | } 151 | 152 | 153 | results <- sim_true_counts(spatial_options( 154 | layout = "enhanced" 155 | )) 156 | plot_cell_loc(results, show.arrows = FALSE) 157 | ``` 158 | 159 | An option `same.type.prob` decides the probability of a new cell being placed near the existing cells of the same cell type. 160 | By default, it is 0.8; and if we use a lower value, the new cell will be placed more randomly. 161 | ```{r layout-random, fig.width=6, fig.height=6} 162 | 163 | results <- sim_true_counts(spatial_options( 164 | layout = "enhanced", 165 | same.type.prob = 0.1 166 | )) 167 | plot_cell_loc(results, show.arrows = FALSE) 168 | ``` 169 | 170 | The `layers` layout arranges cells in layers. 
171 | 172 | ```{r layout-layers, fig.width=6, fig.height=6} 173 | results <- sim_true_counts(spatial_options( 174 | layout = "layers" 175 | )) 176 | plot_cell_loc(results, show.arrows = FALSE) 177 | ``` 178 | 179 | The `islands` layout will put some cell types in the center like islands, and others around them. 180 | You may specify which cell type should be islands in the format `islands:1,2,3`. 181 | The number here can be looked up in `results$cci_cell_types`. 182 | 183 | ```{r} 184 | results$cci_cell_types 185 | ``` 186 | 187 | ```{r layout-islands, fig.width=6, fig.height=6} 188 | results <- sim_true_counts(spatial_options( 189 | # cell type 4_1_2 should be the island 190 | layout = "islands:5" 191 | )) 192 | plot_cell_loc(results, show.arrows = FALSE) 193 | ``` 194 | 195 | ### Custom layouts 196 | 197 | It is also possible to layout the cells programmatically. 198 | The `layout` option can be a function that takes the cell type information and returns the spatial locations of the cells: 199 | ``` 200 | # grid_size is a number 201 | # cell_types is an integer vector, representing the cell types 202 | function(grids_size, cell_types) { 203 | # return a matrix with two columns, representing the x and y coordinates of the cells 204 | return matrix(nrow = 2, ncol = ncells) 205 | } 206 | ``` 207 | 208 | For example, the following layout function will place the cells sequentially in the grid, 209 | starting from the bottom-left corner. 210 | 211 | ```{r layout-custom, fig.width=6, fig.height=6} 212 | results <- sim_true_counts(spatial_options( 213 | layout = function (grid_size, cell_types) { 214 | ncells <- length(cell_types) 215 | new_locs <- matrix(nrow = ncells, ncol = 2) 216 | # for each cell... 
217 | for (i in 1:ncells) { 218 | # ...place it in the grid 219 | new_locs[i,] <- c(i %% grid_size, i %/% grid_size) 220 | } 221 | return(new_locs) 222 | } 223 | )) 224 | plot_cell_loc(results, show.arrows = FALSE) 225 | ``` 226 | 227 | ## Spatial domains 228 | 229 | Next, we demonstrate how to use custom layout function to create spatial domains. 230 | We want to have three spatial domains in a layered layout, and we have four cell types. 231 | Each cell type has a different probability of being in each domain. 232 | 233 | The following layout function will do this job: First of all, it generates a set of locations that form a circular shape. 234 | Next, it assigns cells to these locations; the leftmost cell is selected as the origin. 235 | Then, we can create a layered layout by sorting the locations based on their euclidian distance to the origin. 236 | The three domains are determined by the distance to the origin. 237 | We have a matrix `ct_matrix` that specifies the probability of each cell type being in each domain. 238 | Finally, we sample the cells based on the probabilities and assign them to the domains. 
239 | 240 | ```{r layout-domains} 241 | layout_fn <- function(grid_size, final_types) { 242 | ncells <- length(final_types) 243 | grid_center <- c(round(grid_size / 2), round(grid_size / 2)) 244 | all_locs <- gen_clutter(ncells, grid_size, grid_center) 245 | # center is bottom-left 246 | left_ones <- which(all_locs[,1] == min(all_locs[,1])) 247 | new_center <<- all_locs[left_ones[which.min(all_locs[left_ones, 2])],] 248 | dist_to_center <- sqrt(colSums((t(all_locs) - new_center)^2)) 249 | new_locs <- all_locs[order(dist_to_center),] 250 | # prob of a cell type being in a zone (cell_type x zone) 251 | ct_matrix <- matrix(c( 252 | 0.9, 0.1, 0.0, 253 | 0.1, 0.8, 0.1, 254 | 0.1, 0.7, 0.2, 255 | 0.0, 0.1, 0.9 256 | ), nrow = 4, byrow = TRUE) 257 | # number of cells per type 258 | ct_pop <- c(160, 80, 100, 140) 259 | pop_mtx <- round(ct_matrix * ct_pop) 260 | if (sum(pop_mtx) != ncells) { 261 | diffrence <- ncells - sum(pop_mtx) 262 | pop_mtx[1, 1] <- pop_mtx[1, 1] + diffrence 263 | } 264 | # number of cells per zone 265 | zone_pop <- colSums(pop_mtx) 266 | # assign cells to zones 267 | cs <- cumsum(zone_pop) 268 | # sample cells 269 | cell_idx <- unlist(lapply(1:3, function(izone) { 270 | sample(rep(1:4, pop_mtx[,izone]), zone_pop[izone]) 271 | })) 272 | locs <<- new_locs[order(cell_idx),] 273 | zone_gt <<- rep(1:3, zone_pop)[order(cell_idx)] 274 | return(locs) 275 | } 276 | ``` 277 | 278 | Inspecting the result, we can see the three spatial domains, where the middle one contains a mix of two cell types. 
279 | 280 | ```{r layout-domains-plot, fig.width=6, fig.height=6} 281 | results <- sim_true_counts(list( 282 | num.cells = 500, 283 | num.genes = 300, 284 | num.cifs = 40, 285 | GRN = NA, 286 | speed.up = T, 287 | cif.sigma = 0.8, 288 | tree = ape::read.tree(text = "(A:1,B:1,C:1,D:1);"), 289 | diff.cif.fraction = 0.8, 290 | discrete.cif = T, 291 | discrete.pop.size = as.integer(c(120,150,100,130)), 292 | cci = list( 293 | params = lig_params, 294 | max.neighbors = 4, 295 | start.layer = 500, 296 | cell.type.interaction = "random", 297 | layout = layout_fn, 298 | step.size = 1 299 | ) 300 | )) 301 | 302 | plot_cell_loc(results, show.arrows = FALSE) 303 | ``` 304 | 305 | ## Spatially variable genes 306 | 307 | The `ext.cif.giv` option allows us to append custom CIF and GIV entries for each cell and gene. 308 | We can use this option to simulate spatially variable genes. 309 | This option should be a function that takes the kinetic parameter index and returns a list of extra CIF and GIV matrices. 310 | 311 | ```{r} 312 | scmultisim_help("ext.cif.giv") 313 | ``` 314 | 315 | Using the previous layout function, we can add extra CIF with value based on the distance to the origin. 
316 | 317 | ```{r} 318 | ext_cif <- function(i) { 319 | # We manually set genes 290-300 to be spatially variable 320 | spatial_genes <- 290:300 321 | dist_to_center <- colSums((t(locs) - new_center)^2) 322 | dist_to_center <- dist_to_center / max(dist_to_center) 323 | # 3 is the s parameter 324 | if (i == 3) { 325 | # n_extra_cif x n_cells 326 | ex_cif <- cbind( 327 | # the two CIFs have large values when distance to the center is near 0.5 328 | rnorm(500, 0.5 * dnorm(abs(dist_to_center - 0.5), 0, 0.04), 0.02), 329 | rnorm(500, 0.5 * dnorm(abs(dist_to_center - 0.5), 0, 0.04), 0.02) 330 | ) 331 | # n_genes x n_extra_cif 332 | ex_giv <- matrix(0, nrow = 300, ncol = 2) 333 | for (i in spatial_genes) { 334 | # odd genes affected by the first two CIF, even genes affected by the last two CIF 335 | ex_giv[i, ] <- rnorm(2, 1, 0.5) 336 | } 337 | list(ex_cif, ex_giv * 2) 338 | } else { 339 | NULL 340 | } 341 | } 342 | ``` 343 | 344 | ```{r} 345 | results <- sim_true_counts(list( 346 | num.cells = 500, 347 | num.genes = 300, 348 | num.cifs = 40, 349 | GRN = NA, 350 | speed.up = T, 351 | cif.sigma = 0.8, 352 | tree = ape::read.tree(text = "(A:1,B:1,C:1,D:1);"), 353 | diff.cif.fraction = 0.8, 354 | ext.cif.giv = ext_cif, 355 | discrete.cif = T, 356 | discrete.pop.size = as.integer(c(120,150,100,130)), 357 | cci = list( 358 | params = lig_params, 359 | max.neighbors = 4, 360 | start.layer = 500, 361 | cell.type.interaction = "random", 362 | layout = layout_fn, 363 | step.size = 1 364 | ) 365 | )) 366 | ``` 367 | 368 | Try plotting one of the spatially variable genes. We can see that the gene expression is higher in the specific spatial 369 | region. 
370 | ```{r spatially-variable-gene, fig.width=6, fig.height=6} 371 | library(ggplot2) 372 | 373 | plot_cell_loc(results, show.arrows = FALSE, 374 | .cell.pop = log(results$counts[299,] + 1)) + scale_colour_viridis_c() 375 | ``` 376 | 377 | ## Long-distance Cell-Cell Interactions 378 | 379 | scMultiSim also supports simulation of long-distance cell-cell interactions. 380 | 381 | The CCI option `radius` controls the maximum distance between two cells for them to interact. 382 | It can be a number or a string. 383 | When it is a number, it specifies the maximum distance. 384 | When it is a string it should be in the format `gaussian:sigma`, for example, `gaussian:1.2`. 385 | In this case, the probability of two cells interacting is proportional to the distance with a Gaussian kernel applied. 386 | 387 | By default, `radius = 1`, which means scMultiSim only consider the four nearest neighbors. 388 | 389 | We can compare the result with different sigma values 1 and 3: 390 | 391 | ```{r long-distance-cci} 392 | 393 | options <- lapply(c(1, 3), \(sigma) { 394 | list( 395 | rand.seed = 1, 396 | GRN = NA, 397 | num.genes = 200, 398 | num.cells = 500, 399 | num.cifs = 50, 400 | tree = Phyla5(), 401 | discrete.cif = T, 402 | discrete.min.pop.size = 20, 403 | discrete.pop.size = as.integer(c(110, 80, 140, 40, 130)), 404 | do.velocity = F, 405 | scale.s = 1, 406 | cci = list( 407 | params = lig_params, 408 | max.neighbors = 4, 409 | cell.type.interaction = "random", 410 | cell.type.lr.pairs = 3:6, 411 | step.size = 0.3, 412 | grid.size = 35, 413 | start.layer = 500, 414 | radius = paste0("gaussian:", sigma), 415 | layout = "layers" 416 | ) 417 | ) 418 | 419 | }) 420 | 421 | results_1 <- sim_true_counts(options[[1]]) 422 | results_3 <- sim_true_counts(options[[2]]) 423 | 424 | ``` 425 | 426 | ```{r plot-long-distance-cci, fig.width=6, fig.height=6} 427 | plot_cell_loc(results_1, show.arrows = T, .cell.pop = as.character(results$grid$final_types)) 428 | plot_cell_loc(results_3, 
show.arrows = T, .cell.pop = as.character(results$grid$final_types)) 429 | ``` 430 | 431 | ## Session Information 432 | 433 | ```{r session-info} 434 | sessionInfo() 435 | ``` 436 | -------------------------------------------------------------------------------- /vignettes/workflow.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "1. Getting Started" 3 | output: 4 | BiocStyle::html_document: 5 | toc: true 6 | toc_depth: 2 7 | vignette: > 8 | %\VignetteEngine{knitr::knitr} 9 | %\VignetteIndexEntry{1. Getting Started} 10 | %\usepackage[UTF-8]{inputenc} 11 | --- 12 | 13 | scMultiSim is a simulation tool for single-cell multi-omics data. 14 | It can simulate RNA counts, ATAC-seq data, RNA velocity, 15 | and spatial locations of continuous or discrete cell populations. 16 | It can model the effects of gene regulatory networks (GRN), chromatin accessibility, 17 | and cell-cell interactions on the simulated data. 18 | 19 | This article introduces the basic workflow of `scMultiSim`. 20 | 21 | ```{r, include = FALSE} 22 | knitr::opts_chunk$set( 23 | collapse = TRUE, 24 | comment = "#>" 25 | ) 26 | ``` 27 | 28 | # Installation 29 | 30 | It is recommended to install `scMultiSim` from Bioconductor with: 31 | ```R 32 | if (!requireNamespace("BiocManager", quietly = TRUE)) 33 | install.packages("BiocManager") 34 | BiocManager::install("scMultiSim") 35 | ``` 36 | 37 | You can also install the development version of `scMultiSim` from GitHub with: 38 | ```R 39 | devtools::install_github("ZhangLabGT/scMultiSim@main") 40 | ``` 41 | 42 | # Running Simulation 43 | 44 | Once installed, you can load the package with: 45 | 46 | ```{r setup} 47 | library(scMultiSim) 48 | ``` 49 | 50 | A typical workflow consists two main steps: 51 | 52 | 1. Simulate the true counts; 53 | 2. Add technical noise and batch to the dataset. 54 | 55 | The `sim_true_counts` function generates the true counts. 56 | It accepts a list of options as input. 
57 | You are able to control most of the simulated effects here. 58 | 59 | ```{r true-counts} 60 | data(GRN_params_100) 61 | 62 | results <- sim_true_counts(list( 63 | # required options 64 | GRN = GRN_params_100, 65 | tree = Phyla3(), 66 | num.cells = 500, 67 | # optional options 68 | num.cif = 20, 69 | discrete.cif = F, 70 | cif.sigma = 0.1 71 | # ... other options 72 | )) 73 | ``` 74 | 75 | scMultiSim requires users to provide the following options: 76 | 77 | - `GRN`: The Gene Regulatory Network. 78 | - `tree`: The cell differential tree. 79 | 80 | Typically, you may also want to adjust the following options to control other important factors: 81 | 82 | - `num.cells`: Specify the number of cells. 83 | - `unregulated.gene.ratio` or `num.genes`: Control the total number of genes. 84 | - `discrete.cif`: Whether to generate a discrete or continuous cell population. 85 | - `diff.cif.fraction`: Control the contribution of the trajectory/cluster specified by the tree. 86 | - `cif.sigma`: Control the variation of cells along the trajectory. 87 | 88 | The [Simulating Multimodal Single-Cell Data](https://zhanglabgt.github.io/scMultiSim/articles/basics.html) 89 | tutorial will introduce these functions in more detail, 90 | including how to simulate RNA velocity data and ATAC-seq data. 91 | The [Simulating Spatial Cell-Cell Interactions](https://zhanglabgt.github.io/scMultiSim/articles/spatialCCI.html) 92 | tutorial will focus on simulating spatial cell locations and cell-cell interactions. 93 | You may also want to check the [Parameter Guide](https://zhanglabgt.github.io/scMultiSim/articles/options.html) 94 | or run the `scmultisim_help()` function for a complete list of options. 95 | 96 | ## The Shiny app 97 | 98 | Don't forget that scMultiSim provides a Shiny app to help you explore the options interactively. 99 | Simply run `run_shiny()` to start the app.
100 | 101 | ```{r run-shiny, eval = FALSE} 102 | run_shiny() 103 | ``` 104 | 105 | 106 | ## Add technical noise and batch effect 107 | 108 | You can use `add_expr_noise` to add technical noise to the dataset, and `divide_batches` to add batch effects. 109 | 110 | ```{r technical-noise} 111 | add_expr_noise(results) 112 | divide_batches(results, nbatch = 2) 113 | ``` 114 | 115 | ## Visualize the results 116 | 117 | scMultiSim provides various visualization functions to help you understand the simulated data. 118 | 119 | For example, `plot_tsne()` visualizes the cells using t-SNE. 120 | 121 | ```{r visualize} 122 | plot_tsne(results$counts, results$cell_meta$pop) 123 | ``` 124 | --------------------------------------------------------------------------------