├── .Rprofile ├── .gitignore ├── README.md ├── Snakefile ├── code ├── 00-get_data-CellBench.R ├── 00-get_data-Ding20.R ├── 00-get_data-Gierahn17.R ├── 00-get_data-Kang18.R ├── 00-get_data-Koh16.R ├── 00-get_data-MCA20.gland.R ├── 00-get_data-MCA20.lung.R ├── 00-get_data-Mereu20.R ├── 00-get_data-Oetjen18.R ├── 00-get_data-TabulaMuris.R ├── 00-get_data-Tung17.R ├── 00-get_data-Zheng17.R ├── 00-get_data-panc8.R ├── 00-get_data.R ├── 01-fil_data.R ├── 02-sub_data.R ├── 03-est_pars-BASiCS.R ├── 03-est_pars-ESCO.R ├── 03-est_pars-POWSC.R ├── 03-est_pars-SCRIP.R ├── 03-est_pars-SPARSim.R ├── 03-est_pars-SPsimSeq.R ├── 03-est_pars-SymSim.R ├── 03-est_pars-ZINB-WaVE.R ├── 03-est_pars-hierarchicell.R ├── 03-est_pars-muscat.R ├── 03-est_pars-powsimR.R ├── 03-est_pars-scDD.R ├── 03-est_pars-scDesign.R ├── 03-est_pars-scDesign2.R ├── 03-est_pars-splatter.R ├── 03-est_pars-zingeR.R ├── 03-est_pars.R ├── 04-sim_data-BASiCS.R ├── 04-sim_data-ESCO.R ├── 04-sim_data-POWSC.R ├── 04-sim_data-SCRIP.R ├── 04-sim_data-SPARSim.R ├── 04-sim_data-SPsimSeq.R ├── 04-sim_data-SymSim.R ├── 04-sim_data-ZINB-WaVE.R ├── 04-sim_data-hierarchicell.R ├── 04-sim_data-muscat.R ├── 04-sim_data-powsimR.R ├── 04-sim_data-scDD.R ├── 04-sim_data-scDesign.R ├── 04-sim_data-scDesign2.R ├── 04-sim_data-splatter.R ├── 04-sim_data-zingeR.R ├── 04-sim_data.R ├── 05-calc_batch-ComBat.R ├── 05-calc_batch-Harmony.R ├── 05-calc_batch-Seurat.R ├── 05-calc_batch-fastMNN.R ├── 05-calc_batch-limma.R ├── 05-calc_batch-mnnCorrect.R ├── 05-calc_batch.R ├── 05-calc_clust-CIDR.R ├── 05-calc_clust-PCA+HC.R ├── 05-calc_clust-PCA+KM.R ├── 05-calc_clust-SC3.R ├── 05-calc_clust-SC3svm.R ├── 05-calc_clust-Seurat.R ├── 05-calc_clust-monocle.R ├── 05-calc_clust-pcaReduce.R ├── 05-calc_clust-tSNE+KM.R ├── 05-calc_clust.R ├── 05-calc_dr.R ├── 05-calc_qc-cell_cms.R ├── 05-calc_qc-cell_cor.R ├── 05-calc_qc-cell_frq.R ├── 05-calc_qc-cell_knn.R ├── 05-calc_qc-cell_ldf.R ├── 05-calc_qc-cell_lls.R ├── 05-calc_qc-cell_pcd.R ├── 05-calc_qc-cell_sw.R ├── 05-calc_qc-gene_avg.R ├── 05-calc_qc-gene_cor.R ├── 05-calc_qc-gene_cv.R ├── 05-calc_qc-gene_frq.R ├── 05-calc_qc-gene_pve.R ├── 05-calc_qc-gene_var.R ├── 05-calc_qc.R ├── 05-runtimes.R ├── 06-dr_batch.R ├── 06-eval_batch.R ├── 06-eval_clust.R ├── 06-stat_1d-ks.R ├── 06-stat_1d-ws.R ├── 06-stat_1d.R ├── 06-stat_2d-emd.R ├── 06-stat_2d-ks2.R ├── 06-stat_2d.R ├── 07-plot_batch-boxplot_by_method.R ├── 07-plot_batch-boxplot_dX.R ├── 07-plot_batch-correlations.R ├── 07-plot_batch-densities.R ├── 07-plot_batch-heatmap_by_method.R ├── 07-plot_batch-heatmap_by_refset.R ├── 07-plot_clust-boxplot_by_method.R ├── 07-plot_clust-boxplot_dF1.R ├── 07-plot_clust-correlations.R ├── 07-plot_clust-heatmap_by_method.R ├── 07-plot_corr-stat_2d.R ├── 07-plot_dimred.R ├── 07-plot_dimred_batch.R ├── 07-plot_memory.R ├── 07-plot_qc_ref-correlations.R ├── 07-plot_runtimes.R ├── 07-plot_stat_1d-scatters.R ├── 07-plot_stat_1d_by_reftyp-boxplot.R ├── 07-plot_stat_1d_by_reftyp-dimEst.R ├── 07-plot_stat_1d_by_reftyp-heatmap.R ├── 07-plot_stat_1d_by_reftyp-mds.R ├── 07-plot_stat_1d_by_reftyp-pca.R ├── 07-plot_stat_1d_by_stat1d-boxplot_by_method.R ├── 07-plot_stat_1d_by_stat1d-boxplot_by_metric.R ├── 07-plot_stat_1d_by_stat1d-correlations.R ├── 07-plot_stat_1d_by_stat1d-correlations_by_method.R ├── 07-plot_stat_1d_by_stat1d-correlations_by_metric.R ├── 07-plot_stat_1d_by_stat1d-dimEst.R ├── 07-plot_stat_1d_by_stat1d-mds.R ├── 07-plot_stat_1d_by_stat1d-pca.R ├── 07-plot_stat_2d-scatters.R ├── 07-plot_stat_2d_by_reftyp-boxplot.R ├── 07-plot_stat_2d_by_reftyp-heatmap.R ├── 08-fig_boxplots.R ├── 08-fig_clustering.R ├── 08-fig_heatmaps.R ├── 08-fig_integration.R ├── 08-fig_mds.R ├── 08-fig_memory.R ├── 08-fig_runtimes.R ├── 08-fig_scalability.R ├── 08-fig_scatters.R ├── 08-fig_stat1d.R ├── 08-fig_stat2d.R ├── 08-fig_summaries.R ├── 09-write_fns.R ├── 09-write_obj.R ├── 10-session_info.R ├── utils-clustering.R ├── utils-integration.R ├── utils-plotting.R └── utils-summaries.R ├── config.yaml ├── data └── .gitkeep ├── figs └── .gitkeep ├── logs └── .gitkeep ├── meta ├── methods.json ├── runtimes.json └── subsets.json ├── outs └── .gitkeep ├── plts └── .gitkeep └── schematic.png /.Rprofile: -------------------------------------------------------------------------------- 1 | data.table::setDTthreads(threads = 1) 2 | 3 | .get_wcs <- function(wcs) { 4 | wcs <- gsub("(,)(\\w+=)", ";\\2", wcs) 5 | ss <- strsplit(wcs, ";")[[1]] 6 | ss <- sapply(ss, strsplit, "=") 7 | keys <- sapply(ss, .subset, 1) 8 | vals <- sapply(ss, .subset, 2) 9 | wcs <- as.list(vals) 10 | names(wcs) <- keys 11 | return(wcs) 12 | } 13 | 14 | args <- R.utils::commandArgs( 15 | trailingOnly = TRUE, 16 | asValues = TRUE) 17 | 18 | if (!is.null(args$wcs)) { 19 | wcs <- .get_wcs(args$wcs) 20 | args$wcs <- NULL 21 | } else wcs <- NULL 22 | 23 | args <- lapply(args, function(u) 24 | unlist(strsplit(u, ";"))) 25 | 26 | cat("WILDCARDS:\n\n"); print(wcs); cat("\n") 27 | cat("ARGUMENTS:\n\n"); print(args) 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .snakemake 3 | 4 | ._* 5 | *.txt 6 | *.html 7 | 8 | logs/* 9 | data/* 10 | outs/* 11 | plts/* 12 | figs/* 13 | !*/.gitkeep -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `Snakemake` workflow to benchmark
scRNA-seq data simulators 2 | 3 | - [Setup](#setup) 4 | - [Dependencies](#dependencies) 5 | - [Structure](#structure) 6 | - [Workflow](#workflow) 7 | - [Preproccessing](#preprocessing) 8 | - [Simulation](#simulation) 9 | - [Summaries](#summaries) 10 | - [Statistics](#statistics) 11 | - [Downstream](#downstream) 12 | - [Clustering](#clustering) 13 | - [Integration](#integration) 14 | - [Visualization](#visualization) 15 | - [Customization](#customization) 16 | - [Datasets](#datasets) 17 | - [Methods](#methods) 18 | 19 | *** 20 | 21 | # Setup 22 | 23 | ## Dependencies 24 | 25 | The current code was implemented using R v4.1.0, Bioconductor v3.13, Snakemake v5.5.0, and Python v3.6.8. All R dependencies (from GitHub, CRAN and Bioconductor) are listed under *code/10-session_info.R* and may be installed using the command contained therein. 26 | 27 | ## Structure 28 | 29 | * `config.yaml` specifies the R library and version to use 30 | * `code` contains all R scripts used in the *Snakemake* workflow 31 | * `data` contains raw, filtered and simulated scRNA-seq datasets, 32 | as well as simulation parameter estimates 33 | * `meta` contains two *.json* files that specify simulation method (`methods.json`) and reference subset (`subsets.json`) configurations 34 | * `outs` contains all results from computations (typically `data.frame`s) as *.rds* files 35 | * `figs` contains all visual outputs as *.pdf* files, and corresponding `ggplot` objects as *.rds* files (for subsequent arrangement into 'super'-figures) 36 | 37 | Simulation methods are tagged with *one or many* of the following labels, according to which scenario(s) they can accommodate: 38 | 39 | * `n` for none: no clusters or batches 40 | * `b` for batch: multiple batches, no clusters 41 | * `k` for cluster: multiple clusters, no batches 42 | 43 | Similarly, we tag subsets (see below) with *exactly one* of these labels. This allows running each method on subsets they are capable of simulating. 44 | 45 | *** 46 | 47 | # Workflow 48 | 49 | ![Schematic of the computational workflow used to benchmark scRNA-seq simulators. (1) Methods are grouped according to which level of complexity they can accommodate: type *n* (`singular'), *b* (batches), *k* (clusters). (2) Raw datasets are retrieved reproducibly from a public source, filtered, and subsetted into various datasets that serve as reference for (3) parameter estimation and simulation. (4) Various gene-, cell-level and global summaries are computed from reference and simulated data, and (5) compared in a one- and two-dimensional setting using two statistics each. (6) Integration and clustering methods are applied to type *b* and *k* references and simulations, respectively, and relative performances compared between reference-simulation and simulation-simulation pairs.](schematic.png) 50 | 51 | ## Preprocessing 52 | 53 | **1. Data retrieval** 54 | 55 | Each `code/00-get_data-.R` script retrieves a publicly available scRNA-seq dataset through from which a *SingleCellExperiment* is constructed and written to `data/00-raw/.rds` 56 | 57 | **2. Filtering** 58 | 59 | `code/01-fil_data.R` is applied to each raw dataset as to 60 | 61 | * remove batches, cluster, or batch-cluster instances with fewer than 50 cells (depending on the dataset's complexity) 62 | * keep genes with a count of at least 1 in at least 10 cells, and remove cells with fewer than 100 detected genes 63 | 64 | Filtered data are written to `data/01-fil/.rds`. 65 | 66 | **3. Subsetting** 67 | 68 | Because different methods can accommodate only some features (e.g. multiple batches or clusters, both or neither), `code/02-sub_data.R` creates specific subsets in `data/02-sub/.,rds`. We term these *ref(erence)set*s (i.e. `. = `), as they serve as the input reference data for simulation. 69 | 70 | ## Simulation 71 | 72 | **1. Parameter estimation** 73 | 74 | Simulation parameters are estimated with `code/03-est_pars.R`, which in term sources a `code/03-est_pars-.R` script that executes a method's parameter estimation function(s). In cases where no separate estimation takes place, this returns `NULL`. Parameter estimates for each combination of ` = ` are written to `data/04-est/.rds`. 75 | 76 | **2. Data simulation** 77 | 78 | Data is simulated with `code/04-sim_data.R`, which in term sources a `code/04-sim_data-.R` script that executes a method's simulation function. Simulations for each combination of `` and `method_id` are written to `data/05-sim/,.rds`. 79 | 80 | ## Summaries 81 | 82 | Various quality control (QC) summaries are computed with `code/05-calc_qc.R`, which in term sources a set of `code/05-calc_qc-.R` scripts. QC results for reference and simulated data are written to `outs/qc_ref-,.rds` and `outs/qc_sim-,.rds`, respectively. At current, we consider: 83 | 84 | **1. Gene-level** 85 | 86 | * `frq`: detection frequency (i.e., fraction of cells with non-zero counts) 87 | * `avg/var`: average/variance of logCPM 88 | * `cv`: coefficient of variation 89 | * `cor`: gene-to-gene correlation 90 | 91 | **2. Cell-level** 92 | 93 | * `frq`: detection frequency (i.e., fraction of genes with non-zero counts) 94 | * `lls`: log-transformed library size (total counts) 95 | * `cor`: cell-to-cell correlation 96 | * `pcd`: cell-to-cell distance (in PCA space) 97 | * `knn`: number of KNN occurrences 98 | * `ldf`: local density factor 99 | 100 | **3. Global** 101 | 102 | * `sw`: Silhouette width (using batch/cluster labels as classes) 103 | * `cms`: cell-specific mixing score (using batch/cluster labels as batches) 104 | * `pve`: percent variance explained (of gene expression = logCPM, by batch/cluster) 105 | 106 | Noteworthily, we compute each summary for different groupings of cells (depending on the dataset's complexity): 107 | 108 | 1. globally, i.e. across all cells 109 | 2. at the batch-level, i.e. for each batch 110 | 3. at the cluster-level, i.e. for each cluster 111 | 112 | Global summaries are computed at the batch-/cluster-level only, as they require a grouping variable. 113 | 114 | ## Statistics 115 | 116 | We compare summaries between reference and simulated data in both one- (`code/06-stat_1d.R`) and two-dimensional settings (`code/06-statl_2d.R`). For the latter, every combination of gene- and cell-level metrics is considered, excluding correlations and global summaries. Furthermore, metrics are evaluated for each cell grouping, i.e. we perform a test globally, for each batch and cluster (again, depending on the dataset's complexity). Test results are written to `outs/stat_1d,,,.rds` for 1D, and `outs/stat_2d,,,,.rds` for 2D tests. 117 | 118 | **1. One-dimensional** 119 | 120 | * Kolmogorov-Smirnov (KS) test 121 | * Wasserstein metric 122 | 123 | **2. Two-dimensional** 124 | 125 | * two-dimensional KS test 126 | * Earth Mover's Distance (EMD) 127 | 128 | ## Downstream 129 | 130 | ### Integration 131 | 132 | Each `05-calc_batch-x.R` script wraps around an integration method that is applied in `05-calc_batch.R` to the set of type *b* subsets. The output corrected assay data or integrated cell embeddings (depending on the method) are written to `outs/batch_ref/sim-,.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_batch.R`, which computes the following set of metrics: 133 | 134 | - cell-specific mixing score (CMS) 135 | - difference in local density factor ($\Delta$LDF) 136 | - batch correction score (BCS) 137 | 138 | ### Clustering 139 | 140 | Each `05-calc_clust-x.R` script wraps around an integration method that is applied in `05-calc_clust.R` to the set of type *b* subsets. The output cluster assignments are written to `outs/clust_ref/sim-,.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_clust.R`, which computes the following set of metrics: 141 | 142 | - precision (P) and recall (R) 143 | - F1 score (harmonic mean of P and R) 144 | 145 | ## Visualization 146 | 147 | Finally, results are collected across `refset_id`s and `method_id`s (jointly or separated by type), and visualized in various ways using as set of `07-plot_x.R` scripts. Output figures are written to `plts` as *.pdf* files, along with the corresponding `ggplot` objects as *.rds* files. Lastly, `08-fig_x.R` scripts are used to combined various `ggplot`s into figures that are saved to `figs` as *.pdf* files. 148 | 149 | *** 150 | 151 | # Customization 152 | 153 | ## Datasets 154 | 155 | In principle, any dataset for which a `code/00-get_data-.R` script exists will be accessible to the workflow. However, data will only be retrieved if the dataset appears in `meta/subsets.json`. Hence, 156 | 157 | ### Removing 158 | 159 | To exclude a dataset from the workflow, i) (re)move the corresponding `code/00-get_data-.R` script; or, ii) remove or comment out any associated `meta/subsets.json` entries. 160 | 161 | ### Adding 162 | 163 | Similarly, a new dataset can be added by supplying an adequate `code/00-get_data-.R` script, and adding an entry to the `meta/subsets.json` configuration that specifies the subset ID, the number of genes/cells to sample (`NULL` for all), which batch(es)/cluster(s) to retain, as well as the resulting subset's type (one of n,b,k,g). 164 | 165 | ## Methods 166 | 167 | The *Snakemake* will automatically include any simulation method for which a `code/03-est_pars-.R` and `code/04-sim_data-.R` script exists. Secondly, `meta/methods.json` will determine on which type(s) of dataset(s) each method should be run. Thus, 168 | 169 | ### Removing 170 | 171 | To exclude a method from the workflow, either i) set `"": "x"` in the `meta/methods.json` file (or anything other than n,b,k,g); or, ii) (re)move the parameter estimation and/or simulation script from the `code` directory. 172 | 173 | ### Adding 174 | 175 | Analogous to the above, adding a method to the benchmark requires i) adding a `code/03-est_pars-.R` and `code/04-sim-data-.R` script; and, ii) adding an entry for the `method_id` to the `meta/methods.json` file. Importantly, the R script for parameter estimation should handle batches (`colData` column `batch`), clusters (`colData` column `cluster`), both or neither. And the method's type(s) should be specified accordingly (`n` for neither, `b/k` for batches/clusters, `g` for groups), e.g. `"": ["n", "k"]` for a method that supports 'singular' datasets, as well as ones with multiple clusters. 176 | -------------------------------------------------------------------------------- /code/00-get_data-CellBench.R: -------------------------------------------------------------------------------- 1 | # load required packages 2 | suppressPackageStartupMessages({ 3 | library(CellBench) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | # load SCE from EH 10 | sce <- load_sc_data() 11 | ids <- names(sce) 12 | 13 | # get genes shared across batches 14 | gs <- lapply(sce, rownames) 15 | gs <- Reduce(intersect, gs) 16 | 17 | sce <- lapply(ids, function(id) 18 | { 19 | # subset shared genes 20 | x <- sce[[id]][gs, ] 21 | 22 | # simplify metadata 23 | rowData(x) <- NULL 24 | colData(x) <- DataFrame( 25 | batch = id, 26 | cluster = x$cell_line) 27 | metadata(x) <- list() 28 | 29 | # make counts sparse & drop drop log-normalized counts 30 | y <- as(counts(x), "dgCMatrix") 31 | assays(x) <- list(counts = y) 32 | print(dim(x)) 33 | return(x) 34 | }) 35 | 36 | # concatenate batches into single dataset 37 | do.call(cbind, sce) 38 | } -------------------------------------------------------------------------------- /code/00-get_data-Ding20.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ExperimentHub) 3 | library(SingleCellExperiment) 4 | library(Seurat) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | eh <- ExperimentHub() 10 | q <- query(eh, "SimBenchData") 11 | 12 | x <- lapply( 13 | grep("^Neural", q$title), 14 | function(i) { 15 | so <- eh[[q$ah_id[i]]] 16 | sce <- as.SingleCellExperiment(so) 17 | # keep 1st experiment only 18 | sce[, sce$ident == "Cortex1"] 19 | }) 20 | 21 | x <- do.call(cbind, x) 22 | 23 | colData(x) <- DataFrame( 24 | batch = x$technology, 25 | cluster = x$celltype) 26 | 27 | assays(x) <- list(counts = counts(x)) 28 | 29 | return(x) 30 | } -------------------------------------------------------------------------------- /code/00-get_data-Gierahn17.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ExperimentHub) 3 | library(Seurat) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | eh <- ExperimentHub() 10 | q <- query(eh, "SimBenchData") 11 | i <- grep("HEK", q$title) 12 | x <- eh[[q$ah_id[i]]] 13 | 14 | x <- as.SingleCellExperiment(x) 15 | assays(x) <- assays(x)["counts"] 16 | colData(x) <- NULL 17 | 18 | return(x) 19 | } -------------------------------------------------------------------------------- /code/00-get_data-Kang18.R: -------------------------------------------------------------------------------- 1 | # load required packages 2 | suppressPackageStartupMessages({ 3 | library(ExperimentHub) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | # load SCE from EH 10 | eh <- ExperimentHub() 11 | q <- query(eh, "Kang18_8vs8") 12 | x <- eh[[q$ah_id]] 13 | 14 | # keep reference samples only 15 | x <- x[, x$stim == "ctrl"] 16 | 17 | # drop unassigned & multiplet cells 18 | x <- x[, !is.na(x$cell)] 19 | x <- x[, x$multiplets == "singlet"] 20 | 21 | # drop undetected genes 22 | x <- x[rowSums(counts(x)) > 0, ] 23 | 24 | # convert counts to sparse matrix 25 | counts(x) <- as(counts(x), "dgCMatrix") 26 | 27 | # drop feature metadata 28 | rowData(x) <- NULL 29 | 30 | # subset & rename cell metadata 31 | colData(x) <- DataFrame( 32 | batch = factor(x$ind), 33 | cluster = x$cell, 34 | row.names = NULL) 35 | 36 | return(x) 37 | } -------------------------------------------------------------------------------- /code/00-get_data-Koh16.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(Biobase) 3 | library(GEOquery) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM2257302&format=file&file=GSM2257302%5FAll%5Fsamples%5Fsc%5Ftpm%2Etxt%2Egz" 10 | fnm <- tempfile(fileext = ".txt.gz") 11 | download.file(url, fnm, quiet = TRUE) 12 | 13 | # read in raw data 14 | y <- read.delim(gzfile(fnm), header = TRUE) 15 | 16 | # make feature metadata 17 | rd <- data.frame( 18 | ensembl = y$geneID, 19 | symbol = y$geneSymbol) 20 | 21 | ex <- grep("^gene", names(y)) 22 | y <- as.matrix(y[, -ex]) 23 | 24 | # make cell metadata 25 | cd <- data.frame(cluster = factor( 26 | gsub("\\..*$", "", colnames(y)), 27 | levels = c("H7hESC", "APS", "MPS3", "D2_25somitomere", "DLL1PXM", "LatM", "Earlysomite", "cDM", "Sclerotome"), 28 | labels = c("hESC", "APS", "MPS", "D2.25 Somitomere", "DLL1+PXM", "LatM", "Somite", "Dermo", "Sclerotome"))) 29 | 30 | # convert TPM to integers 31 | y <- matrix(as.integer(y), nrow(y), ncol(y)) 32 | 33 | # construct 'SingleCellExperiment' 34 | SingleCellExperiment( 35 | assays = list(counts = y), 36 | rowData = rd, colData = cd) 37 | } -------------------------------------------------------------------------------- /code/00-get_data-MCA20.gland.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | fun <- \() 6 | { 7 | # bump timeout limit of 60 sec to 5 min 8 | # to assure download isn't interrupted 9 | options(timeout = max(300, getOption("timeout"))) 10 | 11 | # get count data 12 | url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b" 13 | dir <- dirname(fnm <- tempfile()) 14 | download.file(url, fnm, quiet = TRUE) 15 | fns <- untar(fnm, list = TRUE) 16 | 17 | # read in counts 18 | sub <- grep("/MammaryGland.Virgin[0-9]", fns, value = TRUE) 19 | untar(fnm, files = sub, exdir = dir) 20 | ys <- lapply(file.path(dir, sub), \(.) { 21 | y <- read.csv(., sep = " ") 22 | as(as.matrix(y), "dgCMatrix") 23 | }) 24 | 25 | # get shared features 26 | gs <- lapply(ys, rownames) 27 | gs <- Reduce(intersect, gs) 28 | 29 | # join & construct SCE 30 | y <- do.call(cbind, lapply(ys, \(.) .[gs, ])) 31 | x <- SingleCellExperiment(assays = list(counts = y)) 32 | 33 | # get cell metadata 34 | url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b" 35 | fnm <- tempfile() 36 | download.file(url, fnm, quiet = TRUE) 37 | cd <- read.csv(fnm) 38 | 39 | # drop cells w/o metadata 40 | i <- match(colnames(x), cd$Cell.name, nomatch = 0) 41 | x <- x[, i != 0] 42 | cd <- cd[i, ] 43 | 44 | # simplify annotations 45 | cluster <- gsub("\\(.*\\)$", "", cd$Annotation) 46 | 47 | # add cell metadata 48 | colData(x) <- DataFrame( 49 | tissue = cd$Tissue, 50 | batch = cd$Batch, 51 | cluster) 52 | 53 | return(x) 54 | } 55 | -------------------------------------------------------------------------------- /code/00-get_data-MCA20.lung.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | fun <- \() 6 | { 7 | # bump timeout limit of 60 sec to 5 min 8 | # to assure download isn't interrupted 9 | options(timeout = max(300, getOption("timeout"))) 10 | 11 | # get count data 12 | url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b" 13 | dir <- dirname(fnm <- tempfile()) 14 | download.file(url, fnm, quiet = TRUE) 15 | fns <- untar(fnm, list = TRUE) 16 | 17 | # read in counts 18 | sub <- grep("/Lung[0-9]", fns, value = TRUE) 19 | untar(fnm, files = sub, exdir = dir) 20 | ys <- lapply(file.path(dir, sub), \(.) { 21 | y <- read.csv(., sep = " ") 22 | as(as.matrix(y), "dgCMatrix") 23 | }) 24 | 25 | # get shared features 26 | gs <- lapply(ys, rownames) 27 | gs <- Reduce(intersect, gs) 28 | 29 | # join & construct SCE 30 | y <- do.call(cbind, lapply(ys, \(.) .[gs, ])) 31 | x <- SingleCellExperiment(assays = list(counts = y)) 32 | 33 | # get cell metadata 34 | url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b" 35 | fnm <- tempfile() 36 | download.file(url, fnm, quiet = TRUE) 37 | cd <- read.csv(fnm) 38 | 39 | # drop cells w/o metadata 40 | i <- match(colnames(x), cd$Cell.name, nomatch = 0) 41 | x <- x[, i != 0] 42 | cd <- cd[i, ] 43 | 44 | # simplify annotations 45 | cluster <- gsub("\\(.*\\)$", "", cd$Annotation) 46 | 47 | # add cell metadata 48 | colData(x) <- DataFrame( 49 | tissue = cd$Tissue, 50 | batch = cd$Batch, 51 | cluster) 52 | 53 | return(x) 54 | } 55 | -------------------------------------------------------------------------------- /code/00-get_data-Mereu20.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | fun <- \() 6 | { 7 | # load SCE from URL 8 | con <- url("https://www.dropbox.com/s/i8mwmyymchx8mn8/x.all_classified.technologies.RData?raw=1") 9 | x <- get(load(con)) 10 | close(con) 11 | 12 | # drop log-normalized counts 13 | assay(x, "logcounts") <- NULL 14 | 15 | # drop feature metadata 16 | rowData(x) <- NULL 17 | 18 | # subset & rename cell metadata 19 | colData(x) <- DataFrame( 20 | batch = x$batch, 21 | cluster = x$nnet2) 22 | 23 | return(x) 24 | } -------------------------------------------------------------------------------- /code/00-get_data-Oetjen18.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(Matrix) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \() 7 | { 8 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE120221&format=file" 9 | fnm <- tempfile(fileext = ".tar") 10 | download.file(url, fnm, quiet = TRUE) 11 | 12 | # unpack data 13 | dir <- file.path(dirname(fnm), "GSE120221") 14 | untar(fnm, exdir = dir) 15 | 16 | # split files by sample 17 | fns <- list.files(dir, full.names = TRUE) 18 | ss <- strsplit(fns, "_") 19 | ids <- sapply(ss, .subset, 3) 20 | ids <- gsub("\\..*", "", ids) 21 | fns <- split(fns, ids) 22 | 23 | # removes replicated samples 24 | pat <- "(C|S1|2)|(C|Sk)" 25 | fns <- fns[!grepl(pat, unique(ids))] 26 | 27 | l <- lapply(names(fns), \(.) { 28 | # read in gene metadata 29 | rd <- grep("genes", fns[[.]], value = TRUE) 30 | rd <- read.delim(rd, header = FALSE) 31 | names(rd) <- c("ensembl", "symbol") 32 | 33 | # read in cell metadata 34 | cd <- grep("barcodes", fns[[.]], value = TRUE) 35 | cd <- read.delim(cd, header = FALSE) 36 | names(cd) <- "barcode" 37 | cd$batch <- . 38 | 39 | # read in counts 40 | y <- grep("matrix", fns[[.]], value = TRUE) 41 | y <- readMM(y) 42 | dimnames(y) <- list( 43 | rd$ensembl, 44 | cd$barcode) 45 | 46 | # construct SCE 47 | SingleCellExperiment( 48 | assays = list(counts = y), 49 | rowData = rd, colData = cd) 50 | }) 51 | # concatenate samples 52 | do.call(cbind, l) 53 | } -------------------------------------------------------------------------------- /code/00-get_data-TabulaMuris.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ExperimentHub) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \() 7 | { 8 | eh <- ExperimentHub() 9 | q <- query(eh, "TabulaMurisDroplet") 10 | x <- eh[[q$ah_id]] 11 | 12 | x <- x[, x$mouse_id == "3-F-56"] 13 | x <- x[, !is.na(x$cell_ontology_class)] 14 | 15 | cd <- data.frame( 16 | tissue = x$tissue, 17 | cluster = x$cell_ontology_class) 18 | colData(x) <- DataFrame(cd) 19 | rowData(x) <- NULL 20 | 21 | return(x) 22 | } -------------------------------------------------------------------------------- /code/00-get_data-Tung17.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | fun <- \() 6 | { 7 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE77288&format=file&file=GSE77288%5Freads%2Draw%2Dsingle%2Dper%2Dsample%2Etxt%2Egz" 8 | tmp <- tempfile(fileext = ".txt.gz") 9 | download.file(url, destfile = tmp) 10 | 11 | y <- read.delim(gzfile(tmp)) 12 | cd <- DataFrame(batch = y$individual) 13 | 14 | # simplify gene & cell names 15 | y <- y[, grep("^ENSG", names(y))] 16 | y <- t(as.matrix(y)) 17 | 18 | SingleCellExperiment( 19 | assays = list(counts = y), 20 | colData = cd) 21 | } -------------------------------------------------------------------------------- /code/00-get_data-Zheng17.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(SingleCellExperiment) 4 | library(TENxPBMCData) 5 | }) 6 | 7 | fun <- \() 8 | { 9 | sce <- TENxPBMCData("pbmc68k") 10 | counts(sce) <- as(counts(sce), "dgCMatrix") 11 | 12 | url <- "https://github.com/10XGenomics/single-cell-3prime-paper/raw/master/pbmc68k_analysis/68k_pbmc_barcodes_annotation.tsv" 13 | fnm <- file.path(tempdir(), "foo.tsv") 14 | download.file(url, fnm, quiet = TRUE) 15 | cd <- read.delim(fnm) 16 | 17 | rowData(sce)[3] <- NULL 18 | names(rowData(sce)) <- c("ensembl", "symbol") 19 | 20 | old <- c( 21 | "CD8+ Cytotoxic T", 22 | "CD8+/CD45RA+ Naive Cytotoxic", 23 | "CD4+/CD45RO+ Memory", 24 | "CD19+ B", 25 | "CD4+/CD25 T Reg", 26 | "CD56+ NK", 27 | "CD4+ T Helper2", 28 | "CD4+/CD45RA+/CD25- Naive T", 29 | "CD34+", 30 | "Dendritic", 31 | "CD14+ Monocyte") 32 | new <- c( 33 | "T CD8+", 34 | "T CD8+", 35 | "T CD4+", 36 | "B CD19+", 37 | "T CD4+", 38 | "NK CD56+", 39 | "T CD4+", 40 | "T CD4+", 41 | "HSCs CD34+", 42 | "Dendritic", 43 | "Monocytes CD14+") 44 | 45 | cluster <- new[match(cd$celltype, old)] 46 | colData(sce) <- DataFrame(cluster) 47 | 48 | return(sce) 49 | } -------------------------------------------------------------------------------- /code/00-get_data-panc8.R: -------------------------------------------------------------------------------- 1 | # load required packages 2 | suppressPackageStartupMessages({ 3 | library(Seurat) 4 | library(SeuratData) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | fun <- \() 9 | { 10 | if (!require(panc8.SeuratData)) 11 | InstallData("panc8.SeuratData") 12 | 13 | # load data as 'Seurat' object 14 | data("panc8") 15 | x <- as.SingleCellExperiment(panc8) 16 | 17 | # drop log-normalized counts 18 | # (these are identical to counts) 19 | assay(x, "logcounts") <- NULL 20 | 21 | # exclude datasets with non-integer counts 22 | x <- x[, !x$tech %in% c("celseq", "celseq2", "fluidigmc1")] 23 | 24 | # drop spike-ins 25 | x <- x[-grep("ERCC", rownames(x)), ] 26 | 27 | # drop undetected genes 28 | x <- x[rowSums(assay(x)) > 0, ] 29 | 30 | # subset & rename cell metadata 31 | colData(x) <- DataFrame( 32 | batch = x$dataset, 33 | cluster = x$celltype) 34 | 35 | return(x) 36 | } -------------------------------------------------------------------------------- /code/00-get_data.R: -------------------------------------------------------------------------------- 1 | # retrieve dataset 2 | source(args[[1]]) 3 | x <- fun() 4 | 5 | # number of features & observations 6 | dim(x) 7 | 8 | # tabulate number of cells by 9 | # batch, cluster, batch-cluster 10 | b <- !is.null(x$batch) 11 | k <- !is.null(x$cluster) 12 | if (b) { 13 | table(x$batch) 14 | if (k) { 15 | table(x$cluster) 16 | table(x$batch, x$cluster) 17 | } 18 | } else if (k) { 19 | table(x$cluster) 20 | } 21 | 22 | # simplify gene & cell names 23 | dimnames(x) <- list( 24 | paste0("gene", seq_len(nrow(x))), 25 | paste0("cell", seq_len(ncol(x)))) 26 | 27 | # save SCE to .rds 28 | saveRDS(x, args[[2]]) -------------------------------------------------------------------------------- /code/01-fil_data.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(Matrix) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | # read in raw data 7 | x <- readRDS(args[[1]]) 8 | 9 | # filter out instances with less than 50 cells 10 | by <- c("cluster", "sample", "batch") 11 | by <- intersect(by, names(colData(x))) 12 | 13 | if (length(by) != 0) { 14 | cs <- split(seq(ncol(x)), colData(x)[by]) 15 | rmv <- vapply(cs, length, numeric(1)) < 50 16 | if (any(rmv)) x <- x[, -unlist(cs[rmv])] 17 | } 18 | 19 | # keep genes with count > 1 in at least 10 cells, 20 | # and cells with at least 100 detected genes 21 | x <- x[ 22 | rowSums(counts(x) > 1) >= 10, 23 | colSums(counts(x) > 0) >= 100] 24 | 25 | # drop missing factor levels 26 | for (. in by) x[[.]] <- droplevels(factor(x[[.]])) 27 | 28 | # save filtered data to .rds 29 | saveRDS(x, args[[2]]) -------------------------------------------------------------------------------- /code/02-sub_data.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | suppressPackageStartupMessages({ 3 | library(jsonlite) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | # read in reference dataset & subsetting parameters 8 | x <- readRDS(args[[1]]) 9 | y <- fromJSON(args[[2]]) 10 | y <- y[[wcs$datset]][[wcs$subset]] 11 | 12 | # subset cells according to configuration 13 | for (i in names(y)) { 14 | if (is.null(x[[i]])) next 15 | x <- x[, x[[i]] %in% y[[i]]] 16 | x[[i]] <- droplevels(factor(x[[i]])) 17 | # drop cell metadata variable when 18 | # there's only one level remaining 19 | if (nlevels(x[[i]]) == 1) 20 | x[[i]] <- NULL 21 | } 22 | 23 | # keep cells with at least 100 detected genes 24 | x <- x[, colSums(counts(x) > 0) >= 100] 25 | 26 | # downsample to at most 'n_cells' per instance 27 | if (!is.null(y$n_cells)) { 28 | by <- intersect( 29 | c("cluster", "sample", "batch"), 30 | names(colData(x))) 31 | if (length(by) > 0) { 32 | cs <- lapply( 33 | split(seq(ncol(x)), colData(x)[by]), 34 | function(.) { 35 | n <- as.numeric(y$n_cells) 36 | n <- min(n, length(.)) 37 | sample(., n) 38 | }) 39 | x <- x[, unlist(cs)] 40 | } else { 41 | n <- as.numeric(y$n_cells) 42 | n <- min(n, ncol(x)) 43 | x <- x[, sample(ncol(x), n)] 44 | } 45 | } 46 | 47 | # keep genes with count > 1 in at least 10 cells 48 | x <- x[rowSums(counts(x) > 1) >= 10, ] 49 | 50 | # downsample to at most 'n_genes' 51 | if (!is.null(y$n_genes)) { 52 | n <- as.numeric(y$n_genes) 53 | n <- min(n, nrow(x)) 54 | x <- x[sample(nrow(x), n), ] 55 | } 56 | 57 | # save subsetted data to .rds 58 | saveRDS(x, args[[3]]) -------------------------------------------------------------------------------- /code/03-est_pars-BASiCS.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BASiCS) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | suppressMessages({ 8 | foo <- capture.output({ 9 | x$BatchInfo <- as.numeric(x$batch) 10 | mcmc <- BASiCS_MCMC(Data = x, 11 | N = 4000, Thin = 10, Burn = 2000, 12 | Regression = TRUE, WithSpikes = FALSE) 13 | }) 14 | }) 15 | mcmc <- Summary(mcmc) 16 | names(ps) <- ps <- c("mu", "delta", "s", "theta") 17 | ps <- lapply(ps, function(p) displaySummaryBASiCS(mcmc, p)[, 1]) 18 | c(ps, list(bi = x$BatchInfo, batch = x$batch)) 19 | } -------------------------------------------------------------------------------- /code/03-est_pars-ESCO.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ESCO) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | dir <- tempdir() 11 | if (!is.null(x$batch)) { 12 | type <- "batch" 13 | } else if (!is.null(x$cluster)) { 14 | type <- "cluster" 15 | } else type <- "single" 16 | group <- type != "single" 17 | cellinfo <- x[[type]] 18 | suppressMessages(z <- escoEstimate(y, dir, group, cellinfo)) 19 | list(params = z, type = type, groups = unique(x[[type]])) 20 | } -------------------------------------------------------------------------------- /code/03-est_pars-POWSC.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(POWSC) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | if (!is.matrix(y <- counts(x))) 8 | counts(x) <- as.matrix(y) 9 | if (is.null(x$cluster)) { 10 | y <- Est2Phase(x) 11 | } else { 12 | i <- split(seq(ncol(x)), x$cluster, drop = TRUE) 13 | y <- lapply(i, function(.) Est2Phase(x[, .])) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /code/03-est_pars-SCRIP.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(splatter) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | 11 | cd <- data.frame(colData(x)) 12 | i <- c("batch", "cluster") 13 | i <- intersect(i, names(cd)) 14 | t <- ifelse(length(i) == 0, "foo", i) 15 | 16 | p <- if (t == "foo") { 17 | list( 18 | data = y, 19 | mode = "GP-trendedBCV", 20 | params = splatEstimate(y)) 21 | } else { 22 | cd$cellType <- x[[i]] 23 | list( 24 | expre_data = y, 25 | mode = "GP-trendedBCV", 26 | pheno_data = cd, 27 | CTlist = unique(x[[i]]), 28 | nfeatures = nrow(x)) 29 | } 30 | x <- list(p = p, t = t) 31 | } 32 | -------------------------------------------------------------------------------- /code/03-est_pars-SPARSim.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(SPARSim) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- function(x) 8 | { 9 | y <- counts(x) 10 | if (!is.matrix(y)) 11 | y <- as.matrix(y) 12 | 13 | if(!is.null(x$batch)){ 14 | conditions <- list() 15 | for (batch_name in unique(x$batch)) { 16 | conditions[[batch_name]] <- which(x$batch %in% batch_name) 17 | } 18 | }else{ 19 | conditions <- list(seq_len(ncol(x))) 20 | } 21 | 22 | estimate <- SPARSim_estimate_parameter_from_data( 23 | raw_data = y, 24 | norm_data = normalizeCounts(y), 25 | conditions = conditions) 26 | 27 | if(!is.null(x$batch)){ 28 | list(estimate = estimate, batch=x$batch) 29 | }else{ 30 | list(estimate=estimate) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /code/03-est_pars-SPsimSeq.R: -------------------------------------------------------------------------------- 1 | fun <- function(x) { } -------------------------------------------------------------------------------- /code/03-est_pars-SymSim.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SymSim) 3 | library(SingleCellExperiment) 4 | }) 5 | #type (k)? 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | z <- BestMatchParams( 11 | tech = "UMI", 12 | counts = y, 13 | plotfilename = "foo", 14 | n_optimal = 1) 15 | file.remove("foo.pdf") 16 | z$ngenes <- nrow(x) 17 | z$ncells_total <- ncol(x) 18 | n <- length(unique(x$batch)) 19 | z$nbatch <- ifelse(n == 0, 1, n) 20 | list(params = z, batch = x$batch) 21 | } -------------------------------------------------------------------------------- /code/03-est_pars-ZINB-WaVE.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BiocParallel) 3 | library(SingleCellExperiment) 4 | library(zinbwave) 5 | }) 6 | 7 | fun <- function(x) { 8 | cd <- data.frame(colData(x)) 9 | i <- c("cluster", "batch") 10 | i <- intersect(i, names(cd)) 11 | 12 | y <- if (length(i) == 0) { 13 | zinbFit(x, 14 | verbose = FALSE, 15 | BPPARAM = SerialParam()) 16 | } else { 17 | zinbFit(x, 18 | model.matrix(~cd[[i]]), 19 | verbose = FALSE, 20 | BPPARAM = SerialParam()) 21 | } 22 | x <- list(obj = y, cd = colData(x)) 23 | } 24 | -------------------------------------------------------------------------------- /code/03-est_pars-hierarchicell.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(hierarchicell) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | if (is.null(x$batch)) { 11 | # split cells into random batches 12 | batch_ids <- sample(2, ncol(x), TRUE) 13 | n_batches <- 0 14 | } else { 15 | batch_ids <- x$batch 16 | n_batches <- length(unique(batch_ids)) 17 | } 18 | df <- data.frame(seq(ncol(x)), batch_ids, t(y)) 19 | z <- filter_counts(df, gene_thresh = 0, cell_thresh = 0) 20 | p <- compute_data_summaries(expr = z, type = "Raw") 21 | # single group 2 cell is required 22 | # for simulation to pass successfully 23 | list( 24 | data_summaries = p, 25 | n_genes = nrow(x), 26 | n_cases = 1, 27 | cells_per_case = 1, 28 | n_controls = length(unique(batch_ids)), 29 | cells_per_control = tabulate(batch_ids), 30 | ncells_variation_type = "Fixed", 31 | n_batches = n_batches, 32 | batch_ids = batch_ids) 33 | } 34 | -------------------------------------------------------------------------------- /code/03-est_pars-muscat.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(muscat) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | vars <- c("cluster", "batch", "group") 8 | vars <- setdiff(vars, names(colData(x))) 9 | for (v in vars) x[[v]] <- "foo" 10 | 11 | y <- prepSCE(x, 12 | kid = "cluster", 13 | sid = "batch", 14 | gid = "group", 15 | drop = TRUE) 16 | 17 | z <- prepSim(y, 18 | min_size = NULL, 19 | verbose = FALSE) 20 | } 21 | -------------------------------------------------------------------------------- /code/03-est_pars-powsimR.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(powsimR) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | z <- estimateParam( 11 | countData = y, 12 | RNAseq = "singlecell", 13 | Protocol = "UMI", 14 | Distribution = "NB", 15 | Normalisation = "scran", 16 | GeneFilter = 0, 17 | SampleFilter = Inf, 18 | NCores = 1, 19 | verbose = FALSE) 20 | Setup( 21 | ngenes = z$totalG, 22 | estParamRes = z, 23 | n1 = z$totalS, 24 | n2 = 2, # has to be at least 2 25 | p.DE = 0, 26 | pLFC = 0, 27 | p.G = 1, 28 | nsims = 1, 29 | setup.seed = 1234) 30 | } 31 | -------------------------------------------------------------------------------- /code/03-est_pars-scDD.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BiocParallel) 3 | library(scDD) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- function(x) { 8 | if (!is.matrix(y <- counts(x))) 9 | counts(x) <- as.matrix(y) 10 | # randomly split cells into 2 groups 11 | x$foo <- sample(2, ncol(x), TRUE) 12 | x <- preprocess(x, 13 | condition = "foo", 14 | scran_norm = TRUE) 15 | list( 16 | SCdat = x, 17 | condition = "foo", 18 | plots = FALSE, 19 | param = SerialParam(), 20 | nDE = 0, nDP = 0, nDM = 0, 21 | nDB = 0, nEE = nrow(x), nEP = 0, 22 | numSamples = ceiling(ncol(x)/2)) 23 | } 24 | -------------------------------------------------------------------------------- /code/03-est_pars-scDesign.R: -------------------------------------------------------------------------------- 1 | fun <- function(x) { } -------------------------------------------------------------------------------- /code/03-est_pars-scDesign2.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scDesign2) 3 | }) 4 | 5 | fun <- function(x) 6 | { 7 | if (is.null(x$cluster)) { 8 | x$id <- "foo" 9 | } else { 10 | x$id <- x$cluster 11 | } 12 | colnames(x) <- x$id 13 | y <- fit_model_scDesign2( 14 | data_mat = counts(x), 15 | cell_type_sel = unique(x$id)) 16 | } -------------------------------------------------------------------------------- /code/03-est_pars-splatter.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(splatter) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | if (!is.matrix(y <- counts(x))) 9 | counts(x) <- as.matrix(y) 10 | g <- if (!is.null(x$batch)) { 11 | "batch" 12 | } else if (!is.null(x$cluster)) { 13 | "cluster" 14 | } 15 | if (is.null(g)) { 16 | p <- splatEstimate(x) 17 | } else { 18 | i <- split(seq(ncol(x)), x[[g]], drop = TRUE) 19 | x <- lapply(i, function(.) x[, .]) 20 | p <- lapply(x, splatEstimate) 21 | } 22 | list(pars = p, type = g) 23 | } -------------------------------------------------------------------------------- /code/03-est_pars-zingeR.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(gamlss) 3 | library(gamlss.tr) 4 | library(SingleCellExperiment) 5 | library(zingeR) 6 | }) 7 | 8 | fun <- function(x) { 9 | y <- counts(x) 10 | if (!is.matrix(x)) 11 | y <- as.matrix(y) 12 | g <- sample(2, ncol(y), TRUE) 13 | mm <- model.matrix(~g) 14 | p <- getDatasetZTNB(y, mm) 15 | list( 16 | dataset = y, 17 | group = g, 18 | nTags = p$dataset.nTags, 19 | nlibs = length(p$dataset.lib.size), 20 | pUp = 0, 21 | verbose = FALSE, 22 | params = p) 23 | } 24 | -------------------------------------------------------------------------------- /code/03-est_pars.R: -------------------------------------------------------------------------------- 1 | x <- readRDS(args$sub) 2 | 3 | source(args$fun) 4 | y <- tryCatch(fun(x), 5 | error = function(e) NA) 6 | 7 | saveRDS(y, args$est) -------------------------------------------------------------------------------- /code/04-sim_data-BASiCS.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BASiCS) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | suppressMessages({ 8 | y <- BASiCS_Sim( 9 | Mu = x$mu, 10 | Mu_spikes = NULL, 11 | Delta = x$delta, 12 | Phi = NULL, 13 | S = x$s, 14 | Theta = x$theta, 15 | BatchInfo = x$bi) 16 | }) 17 | y$batch <- x$batch 18 | y$BatchInfo <- NULL 19 | return(y) 20 | } -------------------------------------------------------------------------------- /code/04-sim_data-ESCO.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(ESCO) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | type <- ifelse(x$type == "single", "single", "group") 8 | y <- escoSimulate(x$params, type, verbose = FALSE, numCores = 1) 9 | assays(y) <- list(counts = assay(y, "counts")) 10 | if (type == "group") { 11 | groups <- factor(y$Group, labels = x$groups) 12 | cd <- DataFrame(groups) 13 | names(cd) <- x$type 14 | colData(y) <- cd 15 | } else { 16 | colData(y) <- NULL 17 | } 18 | return(y) 19 | } 20 | -------------------------------------------------------------------------------- /code/04-sim_data-POWSC.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(POWSC) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | if (!is.list(x[[1]])) { 8 | y <- Simulate2SCE( 9 | n = ncol(x$exprs), 10 | perDE = 0, 11 | estParas1 = x, 12 | estParas2 = x) 13 | 14 | y <- y$sce 15 | rowData(y) <- NULL 16 | colData(y) <- NULL 17 | } else { 18 | ns <- vapply(x, function(.) 19 | ncol(.$exprs), numeric(1)) 20 | 21 | y <- SimulateMultiSCEs( 22 | n = sum(ns)/(length(x)-1), 23 | estParas_set = x, 24 | multiProb = ns) 25 | 26 | y <- lapply(y, function(.) .$sce) 27 | y <- do.call(cbind, y) 28 | 29 | y$cluster <- y$cellTypes 30 | y$cellTypes <- NULL 31 | rowData(y) <- NULL 32 | } 33 | return(y) 34 | } -------------------------------------------------------------------------------- /code/04-sim_data-SCRIP.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(checkmate) 3 | library(SCRIP) 4 | library(Seurat) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | fun <- function(x) { 9 | if (x$t == "foo") { 10 | y <- do.call(SCRIPsimu, x$p) 11 | metadata(y) <- list() 12 | rowData(y) <- colData(y) <- NULL 13 | assays(y) <- assays(y)["counts"] 14 | } else { 15 | # code bug requires 'CTlist' 16 | # to be passed via environment 17 | env <- parent.frame() 18 | env$CTlist <- x$p$CTlist 19 | x$p$CTlist <- NULL 20 | sink(tempfile()) 21 | y <- do.call(simu_cluster, x$p, envir = env) 22 | sink() 23 | cd <- DataFrame(y$CT.infor) 24 | names(cd) <- x$t 25 | y <- SingleCellExperiment( 26 | list(counts = y$final), 27 | colData = cd) 28 | # make cell names unique across clusters 29 | colnames(y) <- paste0("cell", seq(ncol(y))) 30 | } 31 | return(y) 32 | } -------------------------------------------------------------------------------- /code/04-sim_data-SPARSim.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SPARSim) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | 8 | sink(tempfile()) 9 | y <- SPARSim_simulation(x$estimate, output_batch_matrix = TRUE) 10 | sink() 11 | 12 | y <- y$count_matrix 13 | 14 | if(is.null(x$batch)){ 15 | SingleCellExperiment( 16 | assays = list(counts = y)) 17 | }else{ 18 | SingleCellExperiment( 19 | assays = list(counts = y), 20 | colData=data.frame(batch=x$batch)) 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /code/04-sim_data-SPsimSeq.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SPsimSeq) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | if (!is.matrix(y <- counts(x))) 8 | counts(x) <- as.matrix(y) 9 | if (is.null(colnames(x))) 10 | colnames(x) <- seq(ncol(x)) 11 | if (is.null(x$batch)) { 12 | batch <- rep(1, ncol(x)) 13 | batch.config <- 1 14 | } else { 15 | # keep genes detected in at least 10 cells per batch 16 | i <- split(seq(ncol(x)), x$batch) 17 | x <- x[rowAlls(vapply(i, function(.) 18 | rowSums(counts(x[, .]) > 0) >= 10, 19 | logical(nrow(x)))), ] 20 | batch <- as.numeric(x$batch) 21 | batch.config <- tabulate(x$batch)/ncol(x) 22 | } 23 | y <- SPsimSeq( 24 | s.data = counts(x), 25 | n.genes = nrow(x), 26 | tot.samples = ncol(x), 27 | batch = batch, 28 | batch.config = batch.config, 29 | model.zero.prob = TRUE, 30 | genewiseCor = TRUE, 31 | result.format = "SCE", 32 | return.details = FALSE, 33 | verbose = FALSE) 34 | y <- y[[1]] 35 | rowData(y) <- NULL 36 | colData(y) <- colData(x) 37 | return(y) 38 | } 39 | -------------------------------------------------------------------------------- /code/04-sim_data-SymSim.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SymSim) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | args <- intersect( 8 | names(x$params), 9 | names(formals(SimulateTrueCounts))) 10 | args <- c(x$params[args], list(randseed = 1234)) 11 | y <- do.call(SimulateTrueCounts, args) 12 | 13 | data("gene_len_pool") 14 | gene_len <- sample(gene_len_pool, args$ngenes, TRUE) 15 | 16 | args <- intersect( 17 | names(x$params), 18 | names(formals(True2ObservedCounts))) 19 | args <- c(x$params[args], list( 20 | true_counts = y$counts, 21 | meta_cell = y$cell_meta, 22 | gene_len = gene_len)) 23 | z <- do.call(True2ObservedCounts, args) 24 | 25 | if (x$params$nbatch > 1) { 26 | z <- DivideBatches( 27 | observed_counts_res = z, 28 | nbatch = x$params$nbatch) 29 | cd <- DataFrame(batch = x$batch) 30 | } else cd <- make_zero_col_DFrame(ncol(z$counts)) 31 | SingleCellExperiment(list(counts = z$counts), colData = cd) 32 | } -------------------------------------------------------------------------------- /code/04-sim_data-ZINB-WaVE.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | library(zinbwave) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- zinbSim(x$obj) 8 | SingleCellExperiment( 9 | list(counts = y$counts), 10 | colData = x$cd) 11 | } 12 | -------------------------------------------------------------------------------- /code/04-sim_data-hierarchicell.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(hierarchicell) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | args <- formals("simulate_hierarchicell") 8 | y <- do.call( 9 | simulate_hierarchicell, 10 | x[names(x) %in% names(args)]) 11 | # keep control cells only 12 | y <- y[y$Status == "Control", ] 13 | y <- t(as.matrix(y[, grep("^Gene", names(y))])) 14 | cd <- if (x$n_batches != 0) { 15 | data.frame(batch = x$batch_ids) 16 | } else make_zero_col_DFrame(ncol(y)) 17 | SingleCellExperiment(list(counts = y), colData = cd) 18 | } -------------------------------------------------------------------------------- /code/04-sim_data-muscat.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(muscat) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- simData(x, dd = FALSE) 8 | md <- metadata(y) 9 | 10 | if (!is.null(x$cluster_id)) { 11 | kids <- factor(y$cluster_id, labels = md$ref_kids) 12 | kids <- factor(kids, levels(x$cluster_id)) 13 | y$cluster <- kids 14 | y$cluster_id <- NULL 15 | } 16 | if (!is.null(x$sample_id)) { 17 | bids <- factor(y$sample_id, labels = md$ref_sids) 18 | bids <- factor(bids, levels(x$sample_id)) 19 | y$batch <- bids 20 | y$sample_id <- NULL 21 | } 22 | metadata(y) <- list() 23 | rowData(y) <- NULL 24 | return(y) 25 | } 26 | -------------------------------------------------------------------------------- /code/04-sim_data-powsimR.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(powsimR) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- simulateDE( 8 | SetupRes = x, 9 | Normalisation = "scran", 10 | DEmethod = "DESeq2", 11 | Counts = TRUE, 12 | NCores = NULL, 13 | verbose = TRUE) 14 | # remove group 2 cells 15 | z <- y$Counts[[1]][[1]][, -c(1, 2)] 16 | SingleCellExperiment(assays = list(counts = z)) 17 | } 18 | 19 | -------------------------------------------------------------------------------- /code/04-sim_data-scDD.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BiocParallel) 3 | library(scDD) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- function(x) { 8 | sink(tempfile()) 9 | y <- do.call(simulateSet, x) 10 | sink() 11 | colData(y) <- rowData(y) <- NULL 12 | return(y) 13 | } 14 | -------------------------------------------------------------------------------- /code/04-sim_data-scDesign.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scDesign) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) { 7 | y <- counts(x) 8 | if (!is.matrix(y)) 9 | y <- as.matrix(y) 10 | foo <- capture.output( 11 | z <- design_data( 12 | realcount = y, 13 | S = sum(y), 14 | ncell = ncol(y), 15 | ngroup = 1, 16 | ncores = 1)) 17 | SingleCellExperiment( 18 | assays = list(counts = z)) 19 | } -------------------------------------------------------------------------------- /code/04-sim_data-scDesign2.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scDesign2) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- function(x) 7 | { 8 | # get number of cells per cluster 9 | n <- vapply(x, function(.) .$n_cell, numeric(1)) 10 | # simulate data with equal number 11 | # & proportion of cells per cluster 12 | y <- simulate_count_scDesign2( 13 | model_params = x, 14 | n_cell_new = sum(n), 15 | cell_type_prop = prop.table(n)) 16 | # construct SCE 17 | cd <- if (length(x) == 1) { 18 | make_zero_col_DFrame(ncol(y)) 19 | } else { 20 | data.frame(cluster = colnames(y)) 21 | } 22 | SingleCellExperiment( 23 | assay = list(counts = unname(y)), 24 | colData = cd) 25 | } 26 | -------------------------------------------------------------------------------- /code/04-sim_data-splatter.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(splatter) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | f <- function(p) { 9 | y <- splatSimulate(p, verbose = FALSE) 10 | assays(y) <- assays(y)["counts"] 11 | rowData(y) <- colData(y) <- NULL 12 | metadata(y) <- list() 13 | return(y) 14 | } 15 | if (!is.null(x$type)) { 16 | y <- lapply(names(x$pars), 17 | function(i) { 18 | y <- f(x$pars[[i]]) 19 | y[[x$type]] <- i 20 | return(y) 21 | } 22 | ) 23 | do.call(cbind, y) 24 | } else { 25 | f(x$pars) 26 | } 27 | } -------------------------------------------------------------------------------- /code/04-sim_data-zingeR.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(gamlss) 3 | library(gamlss.tr) 4 | library(mgcv) 5 | library(SingleCellExperiment) 6 | library(zingeR) 7 | }) 8 | 9 | fun <- function(x) { 10 | y <- do.call(NBsimSingleCell, x) 11 | SingleCellExperiment(list(counts = y$counts)) 12 | } 13 | -------------------------------------------------------------------------------- /code/04-sim_data.R: -------------------------------------------------------------------------------- 1 | x <- readRDS(args$est) 2 | 3 | 4 | y <- if (!isTRUE(is.na(x))) { # skip simulation if estimation failed 5 | if (is.null(x)) # use dataset if there's no separate estimation step 6 | x <- readRDS(args$sub) 7 | 8 | source(args$fun) 9 | tryCatch(fun(x), 10 | error = function(e) NULL) 11 | } 12 | 13 | if (!is.null(y)) { 14 | z <- assay(y) 15 | y <- y[ 16 | rowSums(z) > 0, 17 | colSums(z) > 0] 18 | dimnames(y) <- list( 19 | paste0("gene", seq(nrow(y))), 20 | paste0("cell", seq(ncol(y)))) 21 | } 22 | 23 | saveRDS(y, args$sim) -------------------------------------------------------------------------------- /code/05-calc_batch-ComBat.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(sva) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | y <- normalizeCounts(x, log = TRUE) 9 | if (!is.matrix(y)) y <- as.matrix(y) 10 | 11 | df <- data.frame(t(y)) 12 | mm <- model.matrix(~ 1, df) 13 | 14 | sink(tempfile()) 15 | suppressMessages({ 16 | z <- ComBat( 17 | dat = y, 18 | batch = x$batch, 19 | mod = mm, 20 | par.prior = TRUE, 21 | prior.plots = FALSE) 22 | }) 23 | sink() 24 | 25 | list( 26 | assay_in = y, 27 | assay_out = z) 28 | } -------------------------------------------------------------------------------- /code/05-calc_batch-Harmony.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(harmony) 3 | library(scater) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | y <- calculatePCA( 9 | logNormCounts(x), 10 | ncomponents = 20) 11 | 12 | z <- HarmonyMatrix( 13 | data_mat = y, 14 | meta_data = x$batch, 15 | do_pca = FALSE, 16 | verbose = FALSE) 17 | 18 | list( 19 | dimred_in = y, 20 | dimred_out = z) 21 | } -------------------------------------------------------------------------------- /code/05-calc_batch-Seurat.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(Seurat) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | x <- logNormCounts(x) 9 | k <- min(table(x$batch)/2) 10 | 11 | l <- SplitObject( 12 | as.Seurat(x), 13 | split.by = "batch") 14 | l <- lapply(l, \(.) { 15 | . <- NormalizeData(., verbose = FALSE) 16 | . <- FindVariableFeatures(., 17 | verbose = FALSE, 18 | nfeatures = 1000, 19 | selection.method = "vst") 20 | }) 21 | fs <- SelectIntegrationFeatures(l) 22 | as <- FindIntegrationAnchors(l, 23 | k.filter = k, 24 | dims = seq(20), 25 | verbose = FALSE, 26 | anchor.features = fs) 27 | y <- IntegrateData(as, 28 | k.weight = k, 29 | verbose = FALSE, 30 | features.to.integrate = rownames(x)) 31 | 32 | list( 33 | assay_in = logcounts(x), 34 | assay_out = GetAssayData(y)) 35 | } -------------------------------------------------------------------------------- /code/05-calc_batch-fastMNN.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(batchelor) 3 | library(scater) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | x <- logNormCounts(x) 9 | y <- calculatePCA(x, ncomponents = 20) 10 | 11 | k <- min(table(x$batch)/2) 12 | z <- fastMNN(x, 13 | batch = x$batch, 14 | k = k, d = 20) 15 | 16 | list( 17 | dimred_in = y, 18 | dimred_out = reducedDim(z)) 19 | } -------------------------------------------------------------------------------- /code/05-calc_batch-limma.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(edgeR) 3 | library(limma) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | mm <- model.matrix(~ x$batch) 9 | y <- DGEList(counts(x)) 10 | 11 | y <- calcNormFactors(y, method = "TMMwsp") 12 | v <- voom(y, mm, plot = FALSE) 13 | z <- removeBatchEffect(v, x$batch) 14 | 15 | list( 16 | assay_in = v$E, 17 | assay_out = z) 18 | } -------------------------------------------------------------------------------- /code/05-calc_batch-mnnCorrect.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(batchelor) 3 | library(scater) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | x <- logNormCounts(x) 9 | k <- min(table(x$batch)/2) 10 | y <- mnnCorrect(x, batch = x$batch, k = k) 11 | 12 | list( 13 | assay_in = logcounts(x), 14 | assay_out = assay(y)) 15 | } -------------------------------------------------------------------------------- /code/05-calc_batch.R: -------------------------------------------------------------------------------- 1 | source(args$fun) 2 | sce <- readRDS(args$sce) 3 | 4 | # skip if simulation failed (return NULL) 5 | res <- if (!is.null(sce)) fun(sce) 6 | 7 | saveRDS(res, args$res) 8 | -------------------------------------------------------------------------------- /code/05-calc_clust-CIDR.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(cidr) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | y <- counts(x) 9 | if (!is.matrix(y)) 10 | y <- as.matrix(y) 11 | k <- length(unique(x$cluster)) 12 | 13 | z <- scDataConstructor(y, tagType = "raw") 14 | z <- determineDropoutCandidates(z) 15 | z <- wThreshold(z) 16 | z <- scDissim(z, threads = 1) 17 | z <- scPCA(z, plotPC = FALSE) 18 | z <- nPC(z) 19 | 20 | scCluster( 21 | object = z, 22 | nCluster = k, 23 | nPC = z@nPC, 24 | cMethod = "ward.D2")@clusters 25 | } -------------------------------------------------------------------------------- /code/05-calc_clust-PCA+HC.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | y <- normalizeCounts(x, log = TRUE) 9 | pca <- prcomp(t(y), center = TRUE, scale. = FALSE) 10 | pca <- pca$x[, seq(20), drop = FALSE] 11 | hc <- hclust(dist(pca), method = "ward.D2") 12 | cutree(hc, k = length(unique(x$cluster))) 13 | } -------------------------------------------------------------------------------- /code/05-calc_clust-PCA+KM.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | }) 4 | 5 | fun <- \(x) 6 | { 7 | y <- normalizeCounts(x, log = TRUE) 8 | k <- length(unique(x$cluster)) 9 | pca <- prcomp(t(y), center = TRUE, scale. = FALSE) 10 | pca <- pca$x[, seq(30), drop = FALSE] 11 | kmeans(pca, centers = k, nstart = 25)$cluster 12 | } -------------------------------------------------------------------------------- /code/05-calc_clust-SC3.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SC3) 3 | library(scater) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \(x) 8 | { 9 | if (is.null(rownames(x))) 10 | rownames(x) <- paste0("gene", seq(nrow(x))) 11 | 12 | if (!is.matrix(y <- counts(x))) 13 | counts(x) <- as.matrix(y) 14 | 15 | x <- logNormCounts(x) 16 | k <- length(unique(x$cluster)) 17 | rowData(x)$feature_symbol <- rownames(x) 18 | 19 | y <- sc3_prepare(x, 20 | gene_filter = FALSE, 21 | svm_max = 1e6, 22 | n_cores = 1, 23 | rand_seed = 1) 24 | 25 | z <- sc3(y, 26 | ks = k, 27 | gene_filter = FALSE, 28 | biology = FALSE, 29 | k_estimator = FALSE, 30 | svm_max = 1e6, 31 | n_cores = 1, 32 | rand_seed = 1) 33 | 34 | z[[grep("sc3", names(colData(z)))]] 35 | } -------------------------------------------------------------------------------- /code/05-calc_clust-SC3svm.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SC3) 3 | library(scater) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | fun <- \(x) 8 | { 9 | if (is.null(rownames(x))) 10 | rownames(x) <- paste0("gene", seq(nrow(x))) 11 | 12 | if (!is.matrix(y <- counts(x))) 13 | counts(x) <- as.matrix(y) 14 | 15 | n <- round(ncol(x)/2) 16 | x <- logNormCounts(x) 17 | k <- length(unique(x$cluster)) 18 | rowData(x)$feature_symbol <- rownames(x) 19 | 20 | y <- sc3_prepare(x, 21 | gene_filter = FALSE, 22 | svm_max = 0, 23 | svm_num_cells = n, 24 | n_cores = 1, 25 | rand_seed = 1) 26 | 27 | y <- sc3(y, 28 | ks = k, 29 | gene_filter = FALSE, 30 | biology = FALSE, 31 | k_estimator = FALSE, 32 | svm_max = 0, 33 | svm_num_cells = n, 34 | n_cores = 1, 35 | rand_seed = 1) 36 | 37 | y <- sc3_run_svm(y, k) 38 | y[[grep("sc3", names(colData(y)))]] 39 | } -------------------------------------------------------------------------------- /code/05-calc_clust-Seurat.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(Seurat) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | if (is.null(rownames(x))) 9 | rownames(x) <- paste0("gene", seq(nrow(x))) 10 | if (is.null(colnames(x))) 11 | colnames(x) <- paste0("cell", seq(ncol(x))) 12 | 13 | y <- CreateSeuratObject( 14 | counts = counts(x), 15 | meta.data = colData(x)) 16 | 17 | y <- NormalizeData(y, verbose = FALSE) 18 | y <- ScaleData(y, verbose = FALSE) 19 | y <- FindVariableFeatures(y, verbose = FALSE) 20 | hvgs <- VariableFeatures(y) 21 | 22 | y <- RunPCA(y, npcs = n <- 30, features = hvgs, verbose = FALSE) 23 | y <- FindNeighbors(y, dims = seq(n), verbose = FALSE) 24 | y <- FindClusters(y, res = 0.8, verbose = FALSE) 25 | Idents(y) 26 | } 27 | -------------------------------------------------------------------------------- /code/05-calc_clust-monocle.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(monocle) 3 | library(scran) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | if (!is.matrix(y <- counts(x))) 9 | counts(x) <- as.matrix(y) 10 | if (is.null(rownames(x))) 11 | rownames(x) <- paste0("gene", seq(nrow(x))) 12 | k <- length(unique(x$cluster)) 13 | rowData(x)$gene_short_name <- rownames(x) 14 | y <- convertTo(x, type = "monocle") 15 | y <- estimateSizeFactors(y) 16 | y <- tryCatch( 17 | estimateDispersions(y), 18 | error = function(e) y) 19 | y <- reduceDimension(y, 20 | max_components = 3, 21 | num_dim = 50, 22 | reduction_method = "tSNE", 23 | verbose = TRUE) 24 | monocle::clusterCells(y, 25 | num_clusters = k+1, 26 | method = "densityPeak")$Cluster 27 | } -------------------------------------------------------------------------------- /code/05-calc_clust-pcaReduce.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(clue) 3 | library(pcaReduce) 4 | library(scater) 5 | }) 6 | 7 | fun <- \(x) 8 | { 9 | k <- length(unique(x$cluster)) 10 | y <- normalizeCounts(x, log = TRUE) 11 | z <- PCAreduce(t(y), 12 | nbt = 100, 13 | q = q <- 30, 14 | method = "S") 15 | l <- lapply(z, function(.) { 16 | colnames(.) <- paste0("k", seq(q+1, 2)) 17 | as.cl_partition(.[, paste0("k", k)]) 18 | }) 19 | e <- as.cl_ensemble(l) 20 | res <- cl_consensus(e, 21 | method = "SE", 22 | control = list(nruns = 50)) 23 | c(cl_class_ids(res)) 24 | } -------------------------------------------------------------------------------- /code/05-calc_clust-tSNE+KM.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(Rtsne) 3 | library(scater) 4 | }) 5 | 6 | fun <- \(x) 7 | { 8 | k <- length(unique(x$cluster)) 9 | y <- normalizeCounts(x, log = TRUE) 10 | if (!is.matrix(y)) y <- as.matrix(y) 11 | tsne <- Rtsne(t(y), 12 | pca = TRUE, initial_dims = 50, dims = 3, 13 | perplexity = 30, check_duplicates = FALSE) 14 | kmeans(tsne$Y, centers = k, nstart = 25)$cluster 15 | } -------------------------------------------------------------------------------- /code/05-calc_clust.R: -------------------------------------------------------------------------------- 1 | source(args$fun) 2 | sce <- readRDS(args$sce) 3 | 4 | # skip if simulation failed (return NULL) 5 | res <- if (!is.null(sce)) 6 | data.frame(wcs, 7 | row.names = NULL, 8 | pred = as.integer(as.integer(fun(sce))), 9 | true = as.integer(droplevels(factor(sce$cluster)))) 10 | 11 | saveRDS(res, args$res) 12 | -------------------------------------------------------------------------------- /code/05-calc_dr.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(scran) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | set.seed(7043) 8 | x <- readRDS(args[[1]]) 9 | 10 | # skip if simulation failed (return NULL) 11 | df <- if (!is.null(x)) { 12 | if ("normcounts" %in% assayNames(x)) { 13 | logcounts(x) <- log(normcounts(x)+1) 14 | } else x <- logNormCounts(x) 15 | 16 | stats <- modelGeneVar(x) 17 | hvgs <- getTopHVGs(stats, n = 500) 18 | 19 | x <- runPCA(x, subset_row = hvgs) 20 | x <- runTSNE(x, dimred = "PCA") 21 | x <- runUMAP(x, dimred = "PCA") 22 | 23 | tsne <- reducedDim(x, "TSNE") 24 | umap <- reducedDim(x, "UMAP") 25 | colnames(tsne) <- paste0("TSNE", seq(2)) 26 | colnames(umap) <- paste0("UMAP", seq(2)) 27 | 28 | x$lls <- if ("counts" %in% assayNames(x)) 29 | x$lls <- log(colSums(counts(x))+1) else NA 30 | 31 | i <- c("cluster", "batch", "lls") 32 | i <- intersect(names(colData(x)), i) 33 | cd <- colData(x)[i] 34 | 35 | data.frame(wcs, cd, tsne, umap) 36 | } 37 | 38 | saveRDS(df, args[[2]]) 39 | -------------------------------------------------------------------------------- /code/05-calc_qc-cell_cms.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(CellMixS) 3 | library(scater) 4 | library(scran) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | ppFUN <- function(sce) { 9 | if ("normcounts" %in% assayNames(sce)) { 10 | logcounts(sce) <- log(normcounts(sce)+1) 11 | } else sce <- logNormCounts(sce) 12 | stats <- modelGeneVar(sce) 13 | hvgs <- getTopHVGs(stats, n = 500) 14 | sce <- runPCA(sce, subset_row = hvgs) 15 | } 16 | 17 | qcFUN <- function(sce) { 18 | i <- c("batch", "cluster") 19 | i <- intersect(i, names(colData(sce))) 20 | if (length(i) == 0) return(NULL) 21 | ids <- factor(sce[[i]]) 22 | n <- tabulate(ids) 23 | k <- min(n[n != 0])/2 24 | cms(sce, k, i)$cms 25 | } 26 | 27 | groups <- "global" 28 | n_genes <- NULL 29 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-cell_cor.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | if (!is.matrix(y <- assay(sce))) 7 | assay(sce) <- as.matrix(y) 8 | y <- if ("normcounts" %in% assayNames(sce)) { 9 | normcounts(sce) 10 | } else calculateCPM(sce) 11 | assay(sce, "exprs") <- log(y+1) 12 | return(sce) 13 | } 14 | 15 | qcFUN <- function(sce) { 16 | cor <- cor( 17 | assay(sce, "exprs"), 18 | method = "spearman", 19 | use = "pairwise.complete.obs") 20 | cor[upper.tri(cor)] 21 | } 22 | 23 | groups <- NULL 24 | n_genes <- NULL 25 | n_cells <- 200 -------------------------------------------------------------------------------- /code/05-calc_qc-cell_frq.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | sce 7 | } 8 | 9 | qcFUN <- function(sce) { 10 | colMeans(assay(sce) != 0) 11 | } 12 | 13 | groups <- NULL 14 | n_genes <- NULL 15 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-cell_knn.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(RANN) 3 | library(scater) 4 | library(scran) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | ppFUN <- function(sce) { 9 | if ("normcounts" %in% assayNames(sce)) { 10 | logcounts(sce) <- log(normcounts(sce)+1) 11 | } else sce <- logNormCounts(sce) 12 | stats <- modelGeneVar(sce) 13 | hvgs <- getTopHVGs(stats, n = 500) 14 | sce <- runPCA(sce, subset_row = hvgs) 15 | } 16 | 17 | qcFUN <- function(sce) { 18 | # build KNN-graph on PCA 19 | # (where k = 5% of cells) 20 | pca <- reducedDim(sce, "PCA") 21 | k <- round(0.05*ncol(sce)) 22 | knn <- nn2(pca, k = k+1) 23 | idx <- knn$nn.idx[, seq(2, k+1)] 24 | # count how often each cell is a KNN 25 | vapply(seq(ncol(sce)), \(i) sum(idx == i), numeric(1)) 26 | } 27 | 28 | groups <- NULL 29 | n_genes <- NULL 30 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-cell_ldf.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BiocNeighbors) 3 | library(CellMixS) 4 | library(dplyr) 5 | library(scater) 6 | library(scran) 7 | library(SingleCellExperiment) 8 | }) 9 | 10 | ppFUN <- function(sce) { 11 | if ("normcounts" %in% assayNames(sce)) { 12 | logcounts(sce) <- log(normcounts(sce)+1) 13 | } else sce <- logNormCounts(sce) 14 | stats <- modelGeneVar(sce) 15 | hvgs <- getTopHVGs(stats, n = 500) 16 | sce <- runPCA(sce, subset_row = hvgs) 17 | } 18 | 19 | qcFUN <- function(sce) { 20 | i <- c("batch", "cluster") 21 | i <- intersect(i, names(colData(sce))) 22 | k <- if (length(i) != 0) { 23 | # half of smallest group 24 | ids <- factor(sce[[i]]) 25 | n <- tabulate(ids) 26 | min(n[n != 0])/2 27 | } else { 28 | # 5% of total cells 29 | 0.05*ncol(sce) 30 | } 31 | # KNN on PCA 32 | pca <- reducedDim(sce, "PCA") 33 | knn <- findKNN(pca, k) 34 | # fix naming for indexing 35 | cs <- colnames(sce) 36 | rownames(knn$index) <- cs 37 | rownames(knn$distance) <- cs 38 | knn$cell_name <- knn$index 39 | # compute local density factors 40 | c(CellMixS:::.ldfKnn(pca, knn, k)$LDF) 41 | } 42 | 43 | groups <- NULL 44 | n_genes <- NULL 45 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-cell_lls.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | sce 7 | } 8 | 9 | qcFUN <- function(sce) { 10 | if ("counts" %in% assayNames(sce)) 11 | log(colSums(counts(sce) + 1)) 12 | } 13 | 14 | groups <- NULL 15 | n_genes <- NULL 16 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-cell_pcd.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(scran) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | ppFUN <- function(sce) { 8 | if ("normcounts" %in% assayNames(sce)) { 9 | logcounts(sce) <- log(normcounts(sce)+1) 10 | } else sce <- logNormCounts(sce) 11 | stats <- modelGeneVar(sce) 12 | hvgs <- getTopHVGs(stats, n = 500) 13 | sce <- runPCA(sce, subset_row = hvgs) 14 | } 15 | 16 | qcFUN <- function(sce) { 17 | pca <- reducedDim(sce, "PCA") 18 | c(dist(pca, upper = TRUE)) 19 | } 20 | 21 | groups <- NULL 22 | n_genes <- NULL 23 | n_cells <- 200 -------------------------------------------------------------------------------- /code/05-calc_qc-cell_sw.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(cluster) 3 | library(scater) 4 | library(scran) 5 | library(SingleCellExperiment) 6 | }) 7 | 8 | ppFUN <- function(sce) { 9 | if ("normcounts" %in% assayNames(sce)) { 10 | logcounts(sce) <- log(normcounts(sce)+1) 11 | } else sce <- logNormCounts(sce) 12 | stats <- modelGeneVar(sce) 13 | hvgs <- getTopHVGs(stats, n = 500) 14 | sce <- runPCA(sce, subset_row = hvgs) 15 | } 16 | 17 | qcFUN <- function(sce) { 18 | i <- c("batch", "cluster") 19 | i <- intersect(i, names(colData(sce))) 20 | if (length(i) == 0) return(NULL) 21 | ids <- as.integer(factor(sce[[i]])) 22 | mtx <- dist(reducedDim(sce, "PCA")) 23 | res <- silhouette(ids, mtx) 24 | return(res[, "sil_width"]) 25 | } 26 | 27 | groups <- "global" 28 | n_genes <- NULL 29 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_avg.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | if (!is.matrix(y <- assay(sce))) 7 | assay(sce) <- as.matrix(y) 8 | y <- if ("normcounts" %in% assayNames(sce)) { 9 | normcounts(sce) 10 | } else calculateCPM(sce) 11 | assay(sce, "exprs") <- log(y+1) 12 | return(sce) 13 | } 14 | 15 | qcFUN <- function(sce) { 16 | rowMeans(assay(sce, "exprs")) 17 | } 18 | 19 | groups <- NULL 20 | n_genes <- NULL 21 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_cor.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(matrixStats) 3 | library(scater) 4 | }) 5 | 6 | ppFUN <- function(sce) { 7 | if (!is.matrix(y <- assay(sce))) 8 | assay(sce) <- as.matrix(y) 9 | y <- if ("normcounts" %in% assayNames(sce)) { 10 | normcounts(sce) 11 | } else calculateCPM(sce) 12 | assay(sce, "exprs") <- log(y+1) 13 | sce[rowVars(y) > 0, ] 14 | } 15 | 16 | qcFUN <- function(sce) { 17 | cor <- cor( 18 | t(assay(sce, "exprs")), 19 | method = "spearman", 20 | use = "pairwise.complete.obs") 21 | cor[upper.tri(cor)] 22 | } 23 | 24 | groups <- NULL 25 | n_genes <- 400 26 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_cv.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | if (!is.matrix(y <- assay(sce))) 7 | assay(sce) <- as.matrix(y) 8 | y <- if ("normcounts" %in% assayNames(sce)) { 9 | normcounts(sce) 10 | } else calculateCPM(sce) 11 | assay(sce, "exprs") <- log(y+1) 12 | return(sce) 13 | } 14 | 15 | qcFUN <- function(sce) { 16 | es <- assay(sce, "exprs") 17 | sd <- sqrt(rowVars(es)) 18 | mu <- rowMeans(es) 19 | cv <- sd / mu 20 | cv[!is.na(cv)] 21 | } 22 | 23 | groups <- NULL 24 | n_genes <- NULL 25 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_frq.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(SingleCellExperiment) 3 | }) 4 | 5 | ppFUN <- function(sce) { 6 | sce 7 | } 8 | 9 | qcFUN <- function(sce) { 10 | rowMeans(assay(sce) != 0) 11 | } 12 | 13 | groups <- NULL 14 | n_genes <- NULL 15 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_pve.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(BiocParallel) 3 | library(scater) 4 | library(SingleCellExperiment) 5 | library(variancePartition) 6 | }) 7 | 8 | ppFUN <- function(sce) { 9 | if ("normcounts" %in% assayNames(sce)) { 10 | logcounts(sce) <- log(normcounts(sce)+1) 11 | } else sce <- logNormCounts(sce) 12 | return(sce) 13 | } 14 | 15 | qcFUN <- function(sce) { 16 | i <- c("batch", "cluster") 17 | i <- intersect(i, names(colData(sce))) 18 | if (length(i) == 0) return(NULL) 19 | y <- logcounts(sce) 20 | if (!is.matrix(y)) 21 | y <- as.matrix(y) 22 | y <- y[rowSums(y) != 0, ] 23 | f <- as.formula(sprintf("~(1|%s)", i)) 24 | cd <- data.frame(colData(sce)[i]) 25 | pve <- fitExtractVarPartModel(y, f, cd, 26 | quiet = TRUE, BPPARAM = SerialParam()) 27 | return(pve[[i]]) 28 | } 29 | 30 | groups <- "global" 31 | n_genes <- NULL 32 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc-gene_var.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(matrixStats) 3 | library(scater) 4 | }) 5 | 6 | ppFUN <- function(sce) { 7 | if (!is.matrix(y <- assay(sce))) 8 | assay(sce) <- as.matrix(y) 9 | y <- if ("normcounts" %in% assayNames(sce)) { 10 | normcounts(sce) 11 | } else calculateCPM(sce) 12 | assay(sce, "exprs") <- log(y+1) 13 | return(sce) 14 | } 15 | 16 | qcFUN <- function(sce) { 17 | rowVars(assay(sce, "exprs")) 18 | } 19 | 20 | groups <- NULL 21 | n_genes <- NULL 22 | n_cells <- NULL -------------------------------------------------------------------------------- /code/05-calc_qc.R: -------------------------------------------------------------------------------- 1 | source(args$uts) 2 | source(args$fun) 3 | 4 | # each QC script should specify 5 | # ppFUN: function to apply to the SCE for PreProcessing 6 | # qcFUN: function to compute the Quality Control metric 7 | # (both FUNs should input & output a SCE) 8 | # groups: for which to compute the QC metric 9 | # (global, cluster, batch, NULL for all) 10 | # n_genes: number of genes to sample - globally (NULL for all) 11 | # n_cells: number of cells to sample - per group (NULL for all) 12 | 13 | if (exists("ppFUN")) { 14 | stopifnot(is.function(ppFUN)) 15 | } else { 16 | # do nothing 17 | ppFUN <- \(.) . 18 | } 19 | 20 | if (exists("qcFUN")) { 21 | stopifnot(is.function(qcFUN)) 22 | } else { 23 | stop("'qcFUN' needs to be specified") 24 | } 25 | 26 | choices <- eval(formals(.calc_qc)$groups) 27 | if (exists("groups") && !is.null(groups)) { 28 | groups <- match.arg(groups, choices, several.ok = TRUE) 29 | } else { 30 | groups <- choices 31 | } 32 | 33 | if (exists("n_genes") && !is.null(n_genes)) { 34 | stopifnot(is.numeric(n_genes), length(n_genes) == 1) 35 | } else { 36 | # use all 37 | n_genes <- NULL 38 | } 39 | 40 | if (exists("n_cells") && !is.null(n_cells)) { 41 | stopifnot(is.numeric(n_cells), length(n_cells) == 1) 42 | } else { 43 | # use all 44 | n_cells <- NULL 45 | } 46 | 47 | # ------------------------------------------------------------------------------ 48 | 49 | sce <- readRDS(args$sce) 50 | 51 | # skip if simulation failed (return NULL) 52 | res <- if (!is.null(sce)) { 53 | sce <- ppFUN(sce) 54 | .calc_qc(sce, 55 | fun = qcFUN, 56 | groups = groups, 57 | n_genes = n_genes, 58 | n_cells = n_cells) 59 | } 60 | 61 | res <- if (!is.null(res)) { 62 | if (is.null(wcs$method)) 63 | wcs$method <- NA 64 | data.frame(wcs, res) 65 | } 66 | 67 | print(head(res)) 68 | saveRDS(res, args$res) -------------------------------------------------------------------------------- /code/05-runtimes.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(ngs = "x", ncs = 325, rep = 3) 2 | # args <- list( 3 | # sce = "data/02-sub/Mereu20,CD4T.rds", 4 | # est = "code/03-est_pars-SCRIP.R", 5 | # sim = "code/04-sim_data-SCRIP.R") 6 | 7 | suppressPackageStartupMessages({ 8 | library(R.utils) 9 | library(SingleCellExperiment) 10 | }) 11 | 12 | # set seed to current rep(licate) for reproducibility 13 | set.seed(as.numeric(wcs$rep)) 14 | 15 | # read in dataset 16 | x <- readRDS(args$sce) 17 | 18 | # if down-sampling cells, use all 19 | # genes & store NA in output data.frame 20 | if (wcs$ngs == "x") { 21 | wcs$ngs <- NA 22 | ngs <- nrow(x) 23 | } else wcs$ngs <- ngs <- as.numeric(wcs$ngs) 24 | 25 | # if down-sampling genes, use all 26 | # cells & store NA in output data.frame 27 | if (wcs$ncs == "x") { 28 | wcs$ncs <- NA 29 | ncs <- ncol(x) 30 | } else wcs$ncs <- ncs <- as.numeric(wcs$ncs) 31 | 32 | # downsample number of genes / cells 33 | x <- x[ 34 | sample(nrow(x), ngs), 35 | sample(ncol(x), ncs)] 36 | 37 | sink(tempfile()) # suppress printing... 38 | 39 | # set time limit (s) until timeout 40 | t <- 1e4 41 | 42 | # time estimation 43 | source(args$est) 44 | est <- withTimeout( 45 | { 46 | tryCatch( 47 | system.time(y <- fun(x))[[3]], 48 | error = function(e) e) 49 | }, 50 | timeout = t, 51 | onTimeout = "warning") 52 | 53 | if (is.character(est)) { 54 | # timed out 55 | est <- t 56 | } else if (inherits(est, "error")) { 57 | # Inf if estimation failed 58 | est <- Inf 59 | } else if (is.null(y)) { 60 | # NA if included in simulation 61 | est <- NA_real_ 62 | # pass SCE to simulation 63 | y <- x 64 | } 65 | 66 | # time simulation 67 | source(args$sim) 68 | sim <- withTimeout( 69 | { 70 | tryCatch( 71 | system.time(fun(y))[[3]], 72 | error = function(e) e) 73 | }, 74 | timeout = t, 75 | onTimeout = "warning") 76 | 77 | if (is.character(sim)) { 78 | # timed out 79 | sim <- t 80 | } else if (inherits(sim, "error")) { 81 | # Inf if estimation failed 82 | sim <- Inf 83 | } else if (is.null(sim)) { 84 | # NA if included in simulation 85 | sim <- NA_real_ 86 | } 87 | 88 | sink() # ...until here 89 | 90 | # construct table including wildcards (wcs), 91 | # timing of est(imation) & sim(ulation), 92 | # and number of genes/cells (ng/cs) 93 | res <- data.frame(wcs, est, sim, row.names = NULL) 94 | 95 | # ...and write to .rds 96 | saveRDS(res, args$res) -------------------------------------------------------------------------------- /code/06-dr_batch.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(scater) 3 | library(scran) 4 | library(SingleCellExperiment) 5 | }) 6 | 7 | # args <- list( 8 | # sce = "data/04-sim/Oetjen18,foo,SPARSim.rds", 9 | # res = "outs/batch_sim-Oetjen18,foo,SPARSim,mnnCorrect.rds") 10 | 11 | x <- readRDS(args[[1]]) 12 | y <- readRDS(args[[2]]) 13 | 14 | df <- if (!is.null(x) && !is.null(y)) { 15 | if (!is.null(y$dimred_in) && 16 | !is.null(y$dimred_out)) { 17 | # use integrated dimension reduction 18 | reducedDim(x, "PCA") <- y$dimred_out 19 | } else if ( 20 | !is.null(y$assay_in) && 21 | !is.null(y$assay_out)) { 22 | # run PCA on integrated data 23 | pca <- calculatePCA(y$assay_out) 24 | reducedDim(x, "PCA") <- pca 25 | } 26 | 27 | x <- runTSNE(x, dimred = "PCA") 28 | x <- runUMAP(x, dimred = "PCA") 29 | 30 | tsne <- reducedDim(x, "TSNE") 31 | umap <- reducedDim(x, "UMAP") 32 | colnames(tsne) <- paste0("TSNE", seq(2)) 33 | colnames(umap) <- paste0("UMAP", seq(2)) 34 | 35 | data.frame(wcs, tsne, umap, batch = x$batch) 36 | } 37 | saveRDS(df, args[[3]]) 38 | -------------------------------------------------------------------------------- /code/06-eval_batch.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(CellMixS) 3 | library(SingleCellExperiment) 4 | }) 5 | 6 | # args <- list( 7 | # sce = "data/02-sub/panc8,inDrop.ductal.rds", 8 | # res = "outs/batch_sim-panc8,inDrop.ductal,muscat,Seurat.rds") 9 | 10 | sce <- readRDS(args[[1]]) 11 | res <- readRDS(args[[2]]) 12 | 13 | df <- if (!is.null(sce) && !is.null(res)) { 14 | 15 | if (!is.factor(sce$batch)) 16 | sce$batch <- factor(sce$batch) 17 | 18 | group <- "batch" 19 | k <- min(table(sce$batch))/2 20 | 21 | # x = uncorrected, 22 | # y = integrated 23 | if (!is.null(res$dimred_in) && 24 | !is.null(res$dimred_out)) { 25 | # compute scores io dimension reductions 26 | dimred_in <- dimred_out <- "foo" 27 | assay_in <- assay_out <- "logcounts" 28 | # setup input data 29 | x <- sce 30 | reducedDim(x, "foo") <- res$dimred_in 31 | # setup output data 32 | y <- sce 33 | reducedDim(y, "foo") <- res$dimred_out 34 | } else if ( 35 | !is.null(res$assay_in) && 36 | !is.null(res$assay_out)) { 37 | # compute scores on integrated assay data 38 | assay_in <- assay_out <- "foo" 39 | dimred_in <- dimred_out <- "PCA" 40 | # setup input data 41 | x <- sce 42 | assay(x, "foo", FALSE) <- res$assay_in 43 | # setup output data 44 | y <- sce 45 | assay(y, "foo", FALSE) <- res$assay_out 46 | } 47 | # split input data by batch 48 | idx <- split(seq(ncol(x)), x$batch) 49 | x <- lapply(idx, \(.) x[, .]) 50 | 51 | suppressMessages({ 52 | # cell-specific changes in Local Density 53 | # Factor (LDF) before vs. after integration 54 | ldf <- ldfDiff(x, y, group, k, dimred_in, dimred_out, assay_in, assay_out) 55 | 56 | # cell-specific mixing scores based on 57 | # euclidean distances within integrated data 58 | cms <- cms(y, k, group, dimred_in, assay_in) 59 | }) 60 | 61 | data.frame(wcs, 62 | row.names = NULL, 63 | batch = sce$batch, 64 | ldf = ldf$diff_ldf, 65 | cms = cms$cms) 66 | } 67 | 68 | saveRDS(df, args[[3]]) -------------------------------------------------------------------------------- /code/06-eval_clust.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(dplyr) 3 | library(tidyr) 4 | }) 5 | 6 | source(args$uts) 7 | 8 | sce <- readRDS(args$sce) 9 | true <- droplevels(factor(sce$cluster)) 10 | 11 | res <- args[c("ref", "sim")] %>% 12 | unlist() %>% 13 | lapply(readRDS) %>% 14 | bind_rows() %>% 15 | replace_na(list(method = "ref")) 16 | 17 | # for each simulators & clustering method, match prediction w/ truth 18 | # using Hungarian algorithm & compute precision, recall, F1 score 19 | res <- group_by(res, 20 | across(contains("method"))) %>% 21 | group_modify(~{ 22 | res <- .hungarian(.x$pred, .x$true) 23 | data.frame( 24 | cluster = levels(true), 25 | res[c("pr", "re", "F1")]) 26 | }) %>% data.frame(wcs) 27 | 28 | saveRDS(res, args$res) 29 | -------------------------------------------------------------------------------- /code/06-stat_1d-ks.R: -------------------------------------------------------------------------------- 1 | fun <- function(x, y) { 2 | suppressWarnings(z <- ks.test(x, y)) 3 | as.numeric(z$statistic) 4 | } -------------------------------------------------------------------------------- /code/06-stat_1d-ws.R: -------------------------------------------------------------------------------- 1 | fun <- function(x, y) waddR::wasserstein_metric(x, y) -------------------------------------------------------------------------------- /code/06-stat_1d.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(dplyr) 3 | library(purrr) 4 | library(tidyr) 5 | }) 6 | 7 | # read in QC results for reference (ref) & simulation (sim) 8 | ref <- readRDS(args$ref) 9 | sim <- readRDS(args$sim) 10 | 11 | # check that QC is available for reference & simulation 12 | res <- if (!(is.null(ref) || is.null(sim))) { 13 | # source test statistic function 14 | source(args$fun) 15 | # test ref vs. sim for each grouping 16 | bind_rows(ref, sim) %>% 17 | pivot_wider( 18 | names_from = "method", 19 | values_from = "value", 20 | values_fn = list) %>% 21 | rename( 22 | ref = "NA", 23 | sim = wcs$method) %>% 24 | rowwise() %>% 25 | mutate(stat = fun(ref, sim)) %>% 26 | ungroup() %>% 27 | select(negate(is.list)) %>% 28 | mutate(.before = 1, data.frame(wcs)) 29 | } 30 | 31 | print(head(res)) 32 | saveRDS(res, args$res) 33 | -------------------------------------------------------------------------------- /code/06-stat_2d-emd.R: -------------------------------------------------------------------------------- 1 | fun <- function(x, y, n = 25) { 2 | stopifnot(is.numeric(n), length(n) == 1, n == as.integer(n)) 3 | if (is.null(dim(x))) { 4 | # ONE-DIMENSIONAL 5 | # smoothing 6 | x <- density(x, n = n)$x 7 | y <- density(y, n = n)$x 8 | # compute EMD 9 | ws <- rep(1/n, n) 10 | x <- cbind(ws, x) 11 | y <- cbind(ws, y) 12 | emd(x, y) 13 | } else { 14 | # TWO-DIMENSIONAL 15 | if (!is.matrix(x)) x <- as.matrix(x) 16 | if (!is.matrix(y)) y <- as.matrix(y) 17 | # smoothing over common range 18 | rng <- c( 19 | range(c(x[, 1], y[, 1])), 20 | range(c(x[, 2], y[, 2]))) 21 | x <- MASS::kde2d(x[, 1], x[, 2], n = n, lims = rng) 22 | y <- MASS::kde2d(y[, 1], y[, 2], n = n, lims = rng) 23 | # compute EMD 24 | emdist::emd2d(x$z, y$z)/n 25 | } 26 | } -------------------------------------------------------------------------------- /code/06-stat_2d-ks2.R: -------------------------------------------------------------------------------- 1 | fun <- function(x, y) Peacock.test::peacock2(x, y) -------------------------------------------------------------------------------- /code/06-stat_2d.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(dplyr) 3 | library(purrr) 4 | library(tidyr) 5 | }) 6 | 7 | x <- lapply(args$x, readRDS) %>% bind_rows() 8 | y <- lapply(args$y, readRDS) %>% bind_rows() 9 | 10 | # check that QCs are available for reference & simulation 11 | res <- if (!(all(is.na(x$method)) || all(is.na(y$method)))) { 12 | # source evaluation function 13 | source(args$fun) 14 | # for each grouping, test simulation vs. reference 15 | list(x = x, y = y) %>% 16 | bind_rows(.id = "dim") %>% 17 | pivot_wider( 18 | id_cols = c("group", "id"), 19 | names_from = c("dim", "method"), 20 | values_from = "value", 21 | values_fn = list) %>% 22 | rename_at( 23 | vars(contains("NA")), 24 | function(.) gsub("NA", "ref", .)) %>% 25 | rename_at( 26 | vars(contains(wcs$method)), 27 | function(.) gsub(wcs$method, "sim", .)) %>% 28 | rowwise() %>% 29 | mutate(stat = fun( 30 | cbind(x_ref, y_ref), 31 | cbind(x_sim, y_sim))) %>% 32 | ungroup() %>% 33 | select_if(negate(is.list)) %>% 34 | mutate(.before = 1, data.frame(wcs)) 35 | } 36 | 37 | print(head(res)) 38 | saveRDS(res, args$res) 39 | -------------------------------------------------------------------------------- /code/07-plot_batch-boxplot_by_method.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "cms") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # res = list.files("outs", "^batch_res-", full.names = TRUE), 6 | # rds = sprintf("plts/batch-boxplot_by_method_%s.rds", wcs$val), 7 | # pdf = sprintf("plts/batch-boxplot_by_method_%s.pdf", wcs$val)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | df <- res %>% 15 | .cms_ldf() %>% 16 | .bcs(n = 1) # average across cells but not batches 17 | 18 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5)) 19 | 20 | plt <- ggplot(df, aes( 21 | reorder_within(batch_method, .data[[wcs$val]], sim_method, median), 22 | .data[[wcs$val]], col = batch_method, fill = batch_method)) + 23 | facet_grid(~ sim_method, scales = "free_x") + 24 | geom_hline(yintercept = c(0, 1), size = 0.1) + 25 | geom_boxplot( 26 | size = 0.25, outlier.size = 0.25, 27 | alpha = 0.25, key_glyph = "point") + 28 | scale_y_continuous( 29 | .batch_labs[wcs$val], 30 | limits = lim, 31 | n.breaks = 3) 32 | 33 | thm <- theme( 34 | legend.title = element_blank(), 35 | axis.text.x = element_blank(), 36 | axis.title.x = element_blank(), 37 | axis.ticks.x = element_blank()) 38 | 39 | fig <- .prettify(plt, thm) 40 | 41 | saveRDS(fig, args$rds) 42 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm") 43 | -------------------------------------------------------------------------------- /code/07-plot_batch-boxplot_dX.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "ldf") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"), 6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"), 7 | # res = list.files("outs", "^batch_res", full.names = TRUE)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | df <- res %>% 15 | .cms_ldf() %>% 16 | .bcs(n = 1) %>% # average across cells but not batches 17 | group_by(refset, batch_method, batch) %>% 18 | mutate(val = .data[[wcs$val]]-.data[[wcs$val]][.data$sim_method == "ref"]) %>% 19 | filter(sim_method != "ref") %>% 20 | mutate(sim_method = droplevels(sim_method)) 21 | 22 | lim <- c( 23 | floor(min(df$val)/0.1)*0.1, 24 | ceiling(max(df$val)/0.1)*0.1) 25 | lab <- eval(.batch_labs[wcs$val]) 26 | 27 | plt <- ggplot(df, aes(reorder(sim_method, val, median), val)) + 28 | geom_hline(yintercept = 0, size = 0.1, col = "red") + 29 | geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") + 30 | geom_violin(fill = NA, col = "grey", size = 0.2) + 31 | scale_y_continuous(limits = lim, breaks = c(0, lim)) + 32 | labs(x = NULL, y = bquote(Delta*.(lab)*"(sim-ref)")) 33 | 34 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),) 35 | 36 | fig <- .prettify(plt, thm) 37 | 38 | saveRDS(fig, args$rds) 39 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm") 40 | -------------------------------------------------------------------------------- /code/07-plot_batch-correlations.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "cms") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"), 6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"), 7 | # res = list.files("outs", "^batch_res", full.names = TRUE)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | df <- res %>% 15 | .cms_ldf() %>% 16 | .bcs(n = 1) # average across cells & batches 17 | 18 | mat <- df %>% 19 | ungroup() %>% 20 | pivot_wider( 21 | values_from = wcs$val, 22 | names_from = "sim_method", 23 | id_cols = c("refset", "batch_method", "batch")) %>% 24 | select(any_of(c("ref", names(.methods_pal)))) %>% 25 | cor(method = "pearson", use = "pairwise.complete.obs") 26 | 27 | mat[is.na(mat)] <- 0 28 | xo <- rownames(mat)[hclust(dist(mat))$order] 29 | yo <- rownames(mat)[hclust(dist(t(mat)))$order] 30 | 31 | df <- mat %>% 32 | data.frame( 33 | from = rownames(.), 34 | check.names = FALSE) %>% 35 | pivot_longer( 36 | cols = -from, 37 | names_to = "to", 38 | values_to = "corr") %>% 39 | mutate( 40 | to = factor(to, levels = xo), 41 | from = factor(from, levels = yo)) %>% 42 | filter(as.numeric(to) <= as.numeric(from)) 43 | 44 | fd <- filter(df, from != to) 45 | min <- floor(min(fd$corr)/0.1)*0.1 46 | max <- ceiling(max(fd$corr)/0.1)*0.1 47 | plt <- ggplot(fd, aes(from, to, fill = corr)) + 48 | geom_tile() + 49 | scale_fill_gradientn("r", 50 | colors = rev(hcl.colors(11, "Grays")), 51 | limits = c(min, max), breaks = c(min, max)) + 52 | coord_equal(expand = FALSE) + 53 | scale_x_discrete(limits = rev(xo[-1])) 54 | 55 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1] 56 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1] 57 | 58 | thm <- theme( 59 | axis.text.y = element_text(face = fy), 60 | axis.text.x = element_text(angle = 45, hjust = 1, face = fx), 61 | axis.title = element_blank(), 62 | panel.border = element_blank(), 63 | legend.title = element_text(vjust = 1), 64 | legend.direction = "horizontal", 65 | legend.position = c(1, 1), 66 | legend.justification = c(1, 1)) 67 | 68 | fig <- .prettify(plt, thm) 69 | 70 | saveRDS(fig, args$rds) 71 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_batch-densities.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "cms") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # rds = paste0("plts/batch-densities_", wcs$val, ".rds"), 6 | # pdf = paste0("plts/batch-densities_", wcs$val, ".pdf"), 7 | # res = list.files("outs", "^batch_res", full.names = TRUE)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5)) 15 | 16 | fig <- if (wcs$val != "bcs") { 17 | df <- .cms_ldf(res) 18 | plt <- ggplot(df, aes( 19 | .data[[wcs$val]], ..ndensity.., 20 | col = batch_method, fill = batch_method)) + 21 | facet_grid(sim_method ~ refset) + 22 | geom_vline(xintercept = 0, size = 0.2) + 23 | geom_density(alpha = 0, key_glyph = "point") + 24 | scale_x_continuous(limits = lim, n.breaks = 3) + 25 | scale_y_continuous(limits = c(0, 1), n.breaks = 3) + 26 | labs(x = .batch_labs[wcs$val], y = "scaled density") 27 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1)) 28 | .prettify(plt, thm) 29 | } 30 | 31 | saveRDS(fig, args$rds) 32 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_batch-heatmap_by_method.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "cms") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"), 6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"), 7 | # res = list.files("outs", "^batch_res", full.names = TRUE)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | df <- res %>% 15 | .cms_ldf() %>% 16 | .bcs(n = 2) # average across cells & batches 17 | 18 | max <- ceiling(max(df$bcs)/0.1)*0.1 19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5)) 20 | 21 | plt <- ggplot(df, aes( 22 | reorder_within(batch_method, .data[[wcs$val]], sim_method, median), 23 | refset, fill = .data[[wcs$val]])) + 24 | facet_grid(~ sim_method, scales = "free_x") + 25 | geom_tile(col = "white") + 26 | scale_fill_distiller( 27 | .batch_labs[wcs$val], 28 | palette = "RdYlBu", 29 | na.value = "lightgrey", 30 | limits = lim, 31 | n.breaks = 3) + 32 | coord_cartesian(expand = FALSE) + 33 | scale_x_reordered() + 34 | scale_y_reordered() 35 | 36 | thm <- theme( 37 | axis.title = element_blank(), 38 | axis.text.x = element_text(angle = 45, hjust = 1)) 39 | 40 | fig <- .prettify(plt, thm) 41 | 42 | saveRDS(fig, args$rds) 43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_batch-heatmap_by_refset.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(val = "cms") 2 | # args <- list( 3 | # uts1 = "code/utils-plotting.R", 4 | # uts2 = "code/utils-integration.R", 5 | # rds = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".rds"), 6 | # pdf = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".pdf"), 7 | # res = list.files("outs", "^batch_res", full.names = TRUE)) 8 | 9 | source(args$uts1) 10 | source(args$uts2) 11 | 12 | res <- .read_res(args$res) 13 | 14 | df <- res %>% 15 | .cms_ldf() %>% 16 | .bcs(n = 2) # average across cells & batches 17 | 18 | max <- ceiling(max(df$bcs)/0.1)*0.1 19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5)) 20 | 21 | plt <- ggplot(df, aes( 22 | reorder_within(batch_method, .data[[wcs$val]], refset, median), 23 | sim_method, fill = .data[[wcs$val]])) + 24 | facet_grid(~ refset, scales = "free_x") + 25 | geom_tile(col = "white") + 26 | scale_fill_distiller( 27 | .batch_labs[wcs$val], 28 | palette = "RdYlBu", 29 | na.value = "lightgrey", 30 | limits = lim, 31 | n.breaks = 3) + 32 | coord_cartesian(expand = FALSE) + 33 | scale_x_reordered() + 34 | scale_y_reordered() 35 | plt 36 | thm <- theme( 37 | axis.title = element_blank(), 38 | axis.text.x = element_text(angle = 45, hjust = 1)) 39 | 40 | fig <- .prettify(plt, thm) 41 | 42 | saveRDS(fig, args$rds) 43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm") 44 | -------------------------------------------------------------------------------- /code/07-plot_clust-boxplot_by_method.R: -------------------------------------------------------------------------------- 1 | source(args$uts) 2 | 3 | res <- .read_res(args$res) 4 | 5 | # order according to average F1 score 6 | res$sim_method <- factor( 7 | res$sim_method, 8 | levels = res %>% 9 | group_by(sim_method) %>% 10 | summarise_at("F1", mean) %>% 11 | arrange(F1) %>% 12 | pull("sim_method")) 13 | 14 | plt <- ggplot(res, aes( 15 | reorder_within(clust_method, -F1, sim_method, median), 16 | F1, col = clust_method, fill = clust_method)) + 17 | facet_grid(~ sim_method, scales = "free_x") + 18 | geom_hline(yintercept = c(0, 1), size = 0.1) + 19 | geom_boxplot( 20 | size = 0.25, outlier.size = 0.25, 21 | alpha = 0.25, key_glyph = "point") + 22 | scale_x_reordered(NULL) + 23 | scale_y_continuous(breaks = seq(0, 1, 0.2)) 24 | 25 | thm <- theme( 26 | legend.title = element_blank(), 27 | axis.text.x = element_blank(), 28 | axis.ticks.x = element_blank()) 29 | 30 | fig <- .prettify(plt, thm) 31 | 32 | saveRDS(fig, args$rds) 33 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm") 34 | -------------------------------------------------------------------------------- /code/07-plot_clust-boxplot_dF1.R: -------------------------------------------------------------------------------- 1 | source(args$uts) 2 | 3 | res <- .read_res(args$res) 4 | 5 | df <- res %>% 6 | group_by(refset, cluster, clust_method) %>% 7 | mutate(F1 = F1-.data$F1[.data$sim_method == "ref"]) %>% 8 | filter(sim_method != "ref") %>% 9 | mutate(sim_method = droplevels(sim_method)) 10 | 11 | plt <- ggplot(df, aes(reorder(sim_method, F1, median), F1)) + 12 | geom_hline(yintercept = 0, size = 0.1, col = "red") + 13 | geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") + 14 | geom_violin(fill = NA, col = "grey", size = 0.2) + 15 | scale_y_continuous(limits = c(-1, 1), n.breaks = 5) + 16 | labs(x = NULL, y = expression(Delta*"F1(sim-ref)")) 17 | 18 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),) 19 | 20 | fig <- .prettify(plt, thm) 21 | 22 | saveRDS(fig, args$rds) 23 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm") 24 | -------------------------------------------------------------------------------- /code/07-plot_clust-correlations.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # rds = "plts/clust-correlations.rds", 4 | # pdf = "plts/clust-correlations.pdf", 5 | # res = list.files("outs", "^clust_res", full.names = TRUE)) 6 | 7 | source(args$uts) 8 | 9 | res <- .read_res(args$res) 10 | 11 | mat <- res %>% 12 | pivot_wider( 13 | id_cols = c("refset", "cluster", "clust_method"), 14 | names_from = "sim_method", 15 | values_from = "F1") %>% 16 | select(any_of(c("ref", names(.methods_pal)))) %>% 17 | cor(method = "pearson", use = "pairwise.complete.obs") 18 | 19 | xo <- rownames(mat)[hclust(dist(mat))$order] 20 | yo <- rownames(mat)[hclust(dist(t(mat)))$order] 21 | 22 | df <- mat %>% 23 | data.frame( 24 | from = rownames(.), 25 | check.names = FALSE) %>% 26 | pivot_longer( 27 | cols = -from, 28 | names_to = "to", 29 | values_to = "corr") %>% 30 | mutate( 31 | to = factor(to, levels = xo), 32 | from = factor(from, levels = yo)) %>% 33 | filter(as.numeric(to) <= as.numeric(from)) 34 | 35 | fd <- filter(df, from != to) 36 | min <- floor(min(fd$corr)/0.1)*0.1 37 | max <- ceiling(max(fd$corr)/0.1)*0.1 38 | plt <- ggplot(fd, aes(from, to, fill = corr)) + 39 | geom_tile() + 40 | scale_fill_gradientn("r", 41 | colors = rev(hcl.colors(11, "Grays")), 42 | limits = c(min, max), breaks = c(min, max)) + 43 | coord_equal(expand = FALSE) + 44 | scale_x_discrete(limits = rev(xo[-1])) 45 | 46 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1] 47 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1] 48 | 49 | thm <- theme( 50 | axis.text.y = element_text(face = fy), 51 | axis.text.x = element_text(angle = 45, hjust = 1, face = fx), 52 | axis.title = element_blank(), 53 | panel.border = element_blank(), 54 | legend.title = element_text(vjust = 1), 55 | legend.direction = "horizontal", 56 | legend.position = c(1, 1), 57 | legend.justification = c(1, 1)) 58 | 59 | fig <- .prettify(plt, thm) 60 | 61 | saveRDS(fig, args$rds) 62 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_clust-heatmap_by_method.R: -------------------------------------------------------------------------------- 1 | source(args$uts) 2 | 3 | res <- .read_res(args$res) 4 | 5 | df <- res %>% 6 | group_by( 7 | refset, 8 | sim_method, 9 | clust_method) %>% 10 | summarise_at("F1", mean) %>% 11 | mutate(rank = rank(-F1)) 12 | 13 | plt <- ggplot(df, aes( 14 | reorder_within(clust_method, rank, sim_method), 15 | reorder(refset, rank), fill = rank)) + 16 | facet_wrap(~ sim_method, nrow = 1, scales = "free_x") + 17 | geom_tile(col = "white", key_glyph = "point") + 18 | scale_fill_gradientn( 19 | colors = rev(hcl.colors(5, "Spectral")), 20 | limits = range(df$rank), 21 | breaks = range(df$rank), 22 | labels = c("best", "worst")) + 23 | guides(fill = guide_colorbar(reverse = TRUE)) + 24 | coord_cartesian(expand = FALSE) + 25 | scale_x_reordered() 26 | 27 | thm <- theme( 28 | axis.title = element_blank(), 29 | axis.text.x = element_text(size = 4, angle = 45, hjust = 1)) 30 | 31 | fig <- .prettify(plt, thm) 32 | 33 | saveRDS(fig, args$rds) 34 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm") 35 | -------------------------------------------------------------------------------- /code/07-plot_corr-stat_2d.R: -------------------------------------------------------------------------------- 1 | source(args$fun) 2 | 3 | # args <- list(res = list.files("outs", "stat_2d-.*(ks2|emd)\\.rds", full.names = TRUE)) 4 | 5 | res <- .read_res(args$res) 6 | 7 | df <- res %>% 8 | filter(!is.na(stat)) %>% 9 | mutate( 10 | refset = paste(datset, subset, sep = ","), 11 | metrics = paste(metric1, metric2, sep = "\n")) %>% 12 | select(-c(datset, subset, metric1, metric2)) %>% 13 | # for type != n, keep batch-/cluster-level comparisons only 14 | group_by(stat2d, refset, method, metrics) %>% 15 | mutate(n = n()) %>% 16 | filter(group != "global" | n == 1) %>% 17 | select(-c(group, n)) 18 | 19 | gg <- pivot_wider(df, 20 | names_from = stat2d, 21 | values_from = stat) %>% 22 | group_by(refset, metrics) %>% 23 | mutate( 24 | ks2 = ks2/max(ks2, na.rm = TRUE), 25 | emd = emd/max(emd, na.rm = TRUE)) 26 | 27 | plt <- ggplot(gg, aes(emd, ks2)) + 28 | facet_wrap(~ metrics, nrow = 2) + 29 | geom_abline(intercept = 0, slope = 1, size = 0.2, lty = 2) + 30 | stat_smooth(method = "lm", formula = y ~ x, size = 0.4, col = "red") + 31 | geom_point(alpha = 0.2, size = 1, shape = 16, col = "tomato") + 32 | stat_cor(method = "pearson", size = 1.5, 33 | hjust = 1, vjust = 0, label.x.npc = 1, label.y.npc = 0) + 34 | scale_x_continuous(limits = c(0, 1), breaks = c(0, 1)) + 35 | scale_y_continuous(limits = c(0, 1), breaks = c(0, 1)) + 36 | labs(x = "scaled EMD", y = "scaled KS statistic") 37 | 38 | thm <- theme( 39 | aspect.ratio = 1, 40 | legend.position = "none") 41 | 42 | fig <- .prettify(plt, thm) 43 | 44 | saveRDS(fig, args$ggp) 45 | ggsave(args$plt, fig, width = 16, height = 9.25, units = "cm") 46 | -------------------------------------------------------------------------------- /code/07-plot_dimred.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "k") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = list.files("outs", "dr_ref", full.names = TRUE), 5 | # pdf = sprintf("figs/dimred_%s.pdf", wcs$reftyp)) 6 | 7 | source(args$fun) 8 | 9 | dfs <- args$res %>% 10 | lapply(readRDS) %>% 11 | bind_rows() %>% 12 | mutate( 13 | refset = paste(datset, subset, sep = ","), 14 | type = ifelse(!is.na(batch), "b", ifelse(!is.na(cluster), "k", "n"))) %>% 15 | filter(type == wcs$reftyp) %>% 16 | group_split(refset) 17 | 18 | # number of rows & columns for facetting 19 | nc <- ceiling(length(dfs)/(nr <- 3)) 20 | 21 | # get variable to color by 22 | # - log-library size (lls) for type n 23 | # - batch for type b, cluster for type k 24 | col <- switch(wcs$reftyp, n = "lls", b = "batch", k = "cluster") 25 | 26 | fig <- lapply(seq_along(dfs), \(i) { 27 | plt <- ggplot(dfs[[i]], aes_string( 28 | "TSNE1", "TSNE2", col = col, fill = col)) + 29 | geom_point_rast(size = 0.1, alpha = 0.2) + 30 | (if (col == "lls") scale_color_viridis_c()) + 31 | guides(fill = "none") + ggtitle(dfs[[i]]$refset[1]) 32 | thm <- theme( 33 | aspect.ratio = 1, 34 | axis.text = element_blank(), 35 | axis.ticks = element_blank(), 36 | legend.title = element_blank(), 37 | legend.justification = c(0, 0.5), 38 | plot.title = element_text(hjust = 0.5), 39 | axis.title.x = if (i > length(dfs) - nc) element_text() else element_blank(), 40 | axis.title.y = if (i %% nc != 1) element_blank() else element_text(angle = 90)) 41 | .prettify(plt, thm) 42 | }) %>% wrap_plots(nrow = nr) 43 | 44 | ggsave(args$pdf, fig, width = 16, height = 4*length(dfs)/n, units = "cm") 45 | -------------------------------------------------------------------------------- /code/07-plot_dimred_batch.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # rds = "plts/batch-dimred.rds", 3 | # pdf = "plts/batch-dimred.pdf", 4 | # uts = "code/utils-plotting.R", 5 | # res = list.files("outs", "^dr_", full.names = TRUE)) 6 | # args$res <- grep("CellBench,H", args$res, value = TRUE) 7 | 8 | source(args$uts) 9 | 10 | res <- .read_res(args$res) 11 | 12 | dfs <- res %>% 13 | replace_na(list(batch_method = "none")) %>% 14 | mutate( 15 | batch_method = factor(batch_method), 16 | batch_method = relevel(batch_method, "none")) %>% 17 | group_split(refset) 18 | 19 | lys <- lapply(dfs, \(df) { 20 | plt <- ggplot(df, aes(TSNE1, TSNE2, col = batch, fill = batch)) + 21 | facet_grid(batch_method ~ sim_method, scales = "free") + 22 | geom_point_rast(shape = 21, size = 0.05, alpha = 0.2) + 23 | ggtitle(df$refset[1]) 24 | thm <- theme( 25 | aspect.ratio = 1, 26 | axis.text = element_blank(), 27 | axis.ticks = element_blank(), 28 | axis.title = element_text(hjust = 0)) 29 | fig <- .prettify(plt, thm) 30 | }) 31 | 32 | saveRDS(lys, args$rds) 33 | 34 | pdf(args$pdf, width = 16/2.54, height = 12/2.54) 35 | for (p in lys) print(p); dev.off() 36 | -------------------------------------------------------------------------------- /code/07-plot_memory.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # pdf = "plts/memory.pdf", 3 | # fun = "code/utils-plotting.R", 4 | # res = list.files("logs", ".*\\.txt", full.names = TRUE)) 5 | 6 | source(args$fun) 7 | res <- lapply(args$res, read.table, header = TRUE) 8 | mbs <- do.call(rbind, res)[["max_rss"]] 9 | 10 | # get metadata from output names 11 | # rts_{reftyp}-{datset},{subset}, 12 | # {method},{ngs},{ncs},{rep}.rds 13 | ss <- strsplit(basename(gsub("\\.txt", "", args$res)), ",") 14 | reftyp <- gsub("rts_([a-z])-.*", "\\1", sapply(ss, .subset, 1)) 15 | refset <- paste(sep = ",", 16 | gsub(".*-", "", sapply(ss, .subset, 1)), 17 | sapply(ss, .subset, 2)) 18 | vars <- seq(3, 6) 19 | names(vars) <- c("method", "ngs", "ncs", "rep") 20 | vars <- lapply(vars, \(.) sapply(ss, .subset, .)) 21 | 22 | df <- data.frame(mbs, reftyp, refset, vars) %>% 23 | pivot_longer( 24 | values_to = "n", 25 | names_to = "dim", 26 | cols = c("ngs", "ncs")) %>% 27 | filter(n != "x") %>% 28 | group_by(dim) %>% 29 | mutate(n = factor(n, sort(as.numeric(unique(n))))) %>% 30 | ungroup() %>% 31 | mutate( 32 | dim = factor(dim, 33 | levels = c("ngs", "ncs"), 34 | labels = c("# genes", "# cells")), 35 | method = droplevels(factor(method, names(.methods_pal)))) %>% 36 | group_by(method, dim, n) %>% 37 | summarise_at("mbs", mean) 38 | 39 | # order methods by average across subsets 40 | ms <- df %>% 41 | group_by(method) %>% 42 | summarise_at("mbs", mean) %>% 43 | arrange(mbs) %>% pull("method") 44 | df$method <- factor(df$method, levels = ms) 45 | 46 | pal <- .methods_pal[levels(df$method)] 47 | 48 | lab <- parse(text = paste( 49 | sep = "~", 50 | sprintf("bold(%s)", LETTERS), 51 | gsub("\\s", "~", names(pal)))) 52 | 53 | anno <- mutate(df, letter = LETTERS[ 54 | match(method, levels(method))]) 55 | 56 | plt <- ggplot(df, aes(factor(n), mbs, 57 | col = method, fill = method)) + 58 | facet_grid(~ dim, scales = "free") + 59 | geom_bar( 60 | width = 0.9, 61 | stat = "identity", 62 | position = "dodge") + 63 | geom_text(data = anno, 64 | aes(label = letter, y = -250), 65 | size = 1.5, color = "black", 66 | position = position_dodge(0.9)) + 67 | scale_fill_manual(values = pal, labels = lab) + 68 | scale_color_manual(values = pal, labels = lab) + 69 | xlab(NULL) + scale_y_continuous( 70 | "memory usage (MBs)", 71 | limits = c(-500, 4e3), 72 | expand = c(0, 0)) 73 | 74 | thm <- theme( 75 | axis.ticks.x = element_blank(), 76 | legend.title = element_blank(), 77 | legend.text = element_text(hjust = 0), 78 | panel.grid.major.y = element_line(color = "grey")) 79 | 80 | fig <- .prettify(plt, thm) 81 | 82 | saveRDS(fig, args$rds) 83 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_qc_ref-correlations.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-qc_ref.rds", 5 | # rds = "plts/qc_ref-correlations.rds", 6 | # pdf = "plts/qc_ref-correlations.pdf") 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | # exclude summaries that include sampling 12 | ex <- .metrics_lab[grep("cor|pcd", names(.metrics_lab))] 13 | df <- res %>% 14 | select(-c(datset, subset)) %>% 15 | filter(!metric %in% ex) %>% 16 | mutate( 17 | # add summary short names 18 | .metric = names(.metrics_lab)[match(metric, .metrics_lab)], 19 | # add summary type 20 | type = ifelse(grepl("cell", .metric), "cell", "gene")) %>% 21 | # keep group-level comparisons only 22 | mutate(across(c(group, id), as.character)) %>% 23 | filter(group != id | .metric %in% .none_metrics) %>% 24 | select(-c(.metric, group, id)) %>% 25 | # correlate summaries of same type for each refset 26 | group_by(refset, metric) %>% 27 | mutate(n = row_number()) %>% 28 | group_by(refset, type) %>% 29 | group_map(\(df, keys) { 30 | pivot_wider(df, 31 | names_from = metric, 32 | values_from = value) %>% 33 | select(any_of(.metrics_lab)) %>% 34 | cor(method = "spearman", 35 | use = "pairwise.complete.obs") %>% 36 | data.frame(from = rownames(.)) %>% 37 | pivot_longer(-from, names_to = "to") %>% 38 | mutate(keys) 39 | }) %>% 40 | bind_rows() %>% 41 | # average across refsets 42 | group_by(from, to) %>% 43 | mutate(value = mean(value)) %>% 44 | ungroup() %>% 45 | distinct(from, to, .keep_all = TRUE) %>% 46 | select(-refset) 47 | 48 | # do hierarchical clustering for each type 49 | mat <- df %>% 50 | split(.$type) %>% 51 | lapply(\(df) df %>% 52 | pivot_wider( 53 | names_from = to, 54 | values_from = value) %>% 55 | select_if(is.numeric)) 56 | 57 | o <- unlist(lapply(mat, \(.) colnames(.)[hclust(dist(.))$order])) 58 | df <- mutate(df, across(c(from, to), factor, o, .metrics_lab[o])) 59 | 60 | plt <- ggplot(df, aes(from, to, fill = value)) + 61 | geom_tile() + 62 | scale_fill_distiller("r", 63 | palette = "RdYlBu", 64 | limits = c(-0.5, 1), 65 | breaks = c(0, 1)) + 66 | coord_equal(expand = FALSE) + 67 | scale_x_discrete(limits = rev(levels(df$to))) 68 | 69 | thm <- theme( 70 | axis.title = element_blank(), 71 | axis.text.x = element_text(angle = 45, hjust = 1)) 72 | 73 | fig <- .prettify(plt, thm) 74 | 75 | saveRDS(fig, args$rds) 76 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm") 77 | -------------------------------------------------------------------------------- /code/07-plot_runtimes.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "n") 2 | # args <- list( 3 | # rds = sprintf("plts/rts_%s.rds", wcs$reftyp), 4 | # pdf = sprintf("plts/rts_%s.pdf", wcs$reftyp), 5 | # fun = "code/utils-plotting.R", 6 | # res = list.files("outs", sprintf("^rts_%s-", wcs$reftyp), full.names = TRUE)) 7 | 8 | source(args$fun) 9 | res <- .read_res(args$res) 10 | 11 | df <- res %>% 12 | # drop SCRIP estimation for 'reftyp' other than 'n' 13 | # as there's none, it's just setting up parameters 14 | { if (wcs$reftyp != "n") 15 | mutate(., est = case_when( 16 | method == "SCRIP" ~ NA_real_, 17 | TRUE ~ est)) else . } %>% 18 | # sum up estimation & simulation timings 19 | rowwise() %>% 20 | mutate(tot = { 21 | if (is.finite(est)) { 22 | if (is.finite(sim)) { 23 | est + sim # both available 24 | } else est # only estimation 25 | } else if (is.finite(sim)) { 26 | sim # only simulation 27 | } else NA # neither available 28 | }) %>% 29 | ungroup() %>% 30 | pivot_longer( 31 | values_to = "n", 32 | names_to = "dim", 33 | cols = c("ngs", "ncs")) %>% 34 | pivot_longer( 35 | values_to = "t", 36 | names_to = "step", 37 | cols = c("est", "sim", "tot")) %>% 38 | mutate( 39 | dim = factor(dim, 40 | levels = c("ngs", "ncs"), 41 | labels = c("# genes", "# cells")), 42 | step = factor(step, 43 | levels = c("est", "sim", "tot"), 44 | labels = c("estimation", "simulation", "overall"))) %>% 45 | filter(!is.na(n), !is.na(t), is.finite(t)) %>% 46 | group_by(method, step, dim, n) %>% 47 | summarise_at("t", mean) 48 | 49 | # order methods by overall average across subsets 50 | ms <- df %>% 51 | filter(step == "overall") %>% 52 | group_by(method) %>% 53 | summarise_at("t", mean) %>% 54 | arrange(t) %>% pull("method") 55 | df$method <- factor(df$method, levels = ms) 56 | 57 | pal <- .methods_pal[levels(df$method)] 58 | 59 | lab <- parse(text = paste( 60 | sep = "~", 61 | sprintf("bold(%s)", LETTERS), 62 | gsub("\\s", "~", names(pal)))) 63 | 64 | anno <- df %>% 65 | group_by(step, dim) %>% 66 | mutate(letter = LETTERS[match(method, levels(method))]) 67 | 68 | plt <- ggplot(df, aes(factor(n), t, 69 | col = method, fill = method)) + 70 | facet_grid(step ~ dim, scales = "free") + 71 | geom_bar( 72 | width = 0.9, 73 | stat = "identity", 74 | position = "dodge") + 75 | geom_text(data = anno, 76 | aes(label = letter, y = 0), 77 | size = 1.5, vjust = 2, color = "black", 78 | position = position_dodge(0.9)) + 79 | scale_fill_manual(values = pal, labels = lab) + 80 | scale_color_manual(values = pal, labels = lab) + 81 | xlab(NULL) + scale_y_sqrt("runtime (s)", 82 | expand = expansion(mult = c(0.2, 0.1))) 83 | 84 | thm <- theme( 85 | axis.ticks.x = element_blank(), 86 | legend.title = element_blank(), 87 | legend.text = element_text(hjust = 0), 88 | panel.grid.major.y = element_line(color = "grey")) 89 | 90 | fig <- .prettify(plt, thm) 91 | 92 | saveRDS(fig, args$rds) 93 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm") 94 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d-scatters.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # fun = "code/utils.R", 3 | # res = "outs/obj-stat_1d.rds", 4 | # rds = "plts/stat_1d-scatters.rds", 5 | # pdf = "plts/stat_1d-scatters.pdf") 6 | 7 | source(args$fun) 8 | 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | mutate(refset = paste(datset, subset, sep = ",")) %>% 13 | select(-c(datset, subset)) %>% 14 | # for type != n, keep batch-/cluster-level comparisons only 15 | group_by(stat1d, refset, method, metric) %>% 16 | mutate(n = n()) %>% 17 | filter(group != "global" | n == 1) %>% 18 | select(-c(group, n)) %>% 19 | # for each statistic & metric, re-scale b/w 0 & 1 20 | group_by(stat1d, metric) %>% 21 | mutate(stat = stat/max(stat, na.rm = TRUE)) %>% 22 | pivot_wider(names_from = stat1d, values_from = stat) %>% 23 | filter(!is.na(ks), !is.na(ws)) 24 | 25 | plt <- ggplot(df, aes(ks, ws, col = refset, fill = refset)) + 26 | facet_wrap(~ metric, nrow = 2) + 27 | stat_cor( 28 | aes(group = metric), method = "spearman", 29 | label.x = 0, label.y = 1, vjust = 1, size = 1.5) + 30 | geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) + 31 | scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) + 32 | scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) + 33 | labs(x = "KS statistics", y = "scaled Wasserstein metric") 34 | 35 | thm <- theme(aspect.ratio = 1) 36 | 37 | fig <- .prettify(plt, thm) 38 | 39 | saveRDS(fig, args$rds) 40 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm") 41 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_reftyp-boxplot.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "n", stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d), 6 | # pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter( 14 | reftyp == wcs$reftyp, 15 | stat1d == wcs$stat1d) %>% 16 | .filter_res() %>% 17 | # scale values b/w 0 and 1 for visualization 18 | group_by(metric) %>% 19 | mutate(stat = stat/max(stat, na.rm = TRUE)) 20 | 21 | pal <- .methods_pal[levels(df$method)] 22 | lab <- parse(text = paste( 23 | sep = "~", 24 | sprintf("bold(%s)", LETTERS), 25 | gsub("\\s", "~", names(pal)))) 26 | 27 | anno <- df %>% 28 | group_by(method, metric) %>% 29 | summarize_at("stat", median) %>% 30 | mutate(letter = LETTERS[match(method, levels(method))]) 31 | 32 | plt <- ggplot(df, aes( 33 | reorder_within(method, stat, metric, median), 34 | stat, col = method, fill = method)) + 35 | facet_wrap(~ metric, nrow = 3, scales = "free_x") + 36 | geom_boxplot( 37 | outlier.size = 0.25, outlier.alpha = 1, 38 | size = 0.25, alpha = 0.25, key_glyph = "point") + 39 | geom_text(data = anno, 40 | size = 1.5, color = "black", 41 | aes(label = letter, y = -0.075)) + 42 | scale_fill_manual(values = pal, labels = lab) + 43 | scale_color_manual(values = pal, labels = lab) + 44 | scale_x_reordered(NULL) + 45 | scale_y_continuous( 46 | paste(ifelse(wcs$stat1d == "ws", "scaled", ""), 47 | .stats1d_lab[wcs$stat1d]), 48 | limits = c(-0.1, 1), n.breaks = 3) 49 | 50 | thm <- theme( 51 | legend.text.align = 0, 52 | legend.title = element_blank(), 53 | axis.text.x = element_blank(), 54 | axis.title.x = element_blank(), 55 | axis.ticks.x = element_blank()) 56 | 57 | fig <- .prettify(plt, thm) 58 | 59 | saveRDS(fig, args$rds) 60 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_reftyp-dimEst.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "n", stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.rds", wcs$reftyp, wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.pdf", wcs$reftyp, wcs$stat1d)) 7 | 8 | suppressPackageStartupMessages({ 9 | library(intrinsicDimension) 10 | }) 11 | 12 | source(args$fun) 13 | res <- readRDS(args$res) 14 | 15 | df <- res %>% 16 | # keep data of interest 17 | filter( 18 | reftyp == wcs$reftyp, 19 | stat1d == wcs$stat1d) %>% 20 | .filter_res() %>% 21 | # average across groups 22 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>% 23 | # estimate dimensionality for each refset 24 | pivot_wider( 25 | names_from = metric, 26 | values_from = stat) %>% 27 | split(.$refset) %>% 28 | lapply(\(df) { 29 | if (nrow(df) < 3) return(NULL) 30 | ks <- seq(2, nrow(df)-1) 31 | mat <- as.matrix(select(df, any_of(.metrics_lab))) 32 | mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))] 33 | mat[is.na(mat)] <- 1 34 | est <- sapply(ks, maxLikGlobalDimEst, data = mat) 35 | data.frame(dim = unlist(est), k = ks) 36 | }) %>% 37 | bind_rows(.id = "refset") 38 | 39 | plt <- ggplot(df, aes(k, dim, col = refset, fill = refset)) + 40 | geom_point(shape = 21, size = 0.5) + 41 | geom_line(alpha = 0.5, show.legend = FALSE) + 42 | scale_x_continuous( 43 | "number of nearest neighbors (k)", 44 | limits = range(df$k), 45 | breaks = seq(2, 20, 2)) + 46 | scale_y_continuous( 47 | "estimated dimensionality", 48 | limits = range(df$dim), 49 | breaks = seq(0, 10, 2)) + 50 | ggtitle(paste("reftyp:", wcs$reftyp)) 51 | 52 | thm <- theme( 53 | legend.title = element_blank(), 54 | panel.grid.major = element_line(color = "grey")) 55 | 56 | fig <- .prettify(plt, thm) 57 | 58 | saveRDS(fig, args$rds) 59 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") 60 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_reftyp-heatmap.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "k", stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d), 6 | # pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter( 14 | reftyp == wcs$reftyp, 15 | stat1d == wcs$stat1d) %>% 16 | .filter_res() %>% 17 | # for each method & summary, 18 | # average across groups & refsets 19 | group_by(metric, method, refset, group) %>% .avg(n = 3) %>% 20 | complete(method, metric, fill = list(stat = NA)) 21 | 22 | # order methods by average across metrics 23 | ox <- df %>% 24 | group_by(method) %>% 25 | summarise_at("stat", mean, na.rm = TRUE) %>% 26 | arrange(desc(stat)) %>% 27 | pull("method") 28 | 29 | # order metrics by average across methods 30 | oy <- df %>% 31 | group_by(metric) %>% 32 | summarise_at("stat", mean, na.rm = TRUE) %>% 33 | arrange(desc(stat)) %>% 34 | pull("metric") 35 | 36 | plt <- ggplot(df, 37 | aes(method, metric, fill = stat)) + 38 | geom_tile(col = "white") + 39 | scale_fill_distiller( 40 | .stats1d_lab[wcs$stat1d], 41 | palette = "RdYlBu", 42 | na.value = "grey", 43 | limits = c(0, 1), 44 | breaks = c(0, 1)) + 45 | coord_equal(expand = FALSE) + 46 | scale_x_discrete(limits = rev(ox)) + 47 | scale_y_discrete(limits = rev(oy)) 48 | 49 | thm <- theme( 50 | axis.ticks = element_blank(), 51 | axis.title = element_blank(), 52 | panel.border = element_blank(), 53 | axis.text.x = element_text(angle = 45, hjust = 1)) 54 | 55 | fig <- .prettify(plt, thm) 56 | 57 | saveRDS(fig, args$rds) 58 | ggsave(args$pdf, fig, width = 8, height = 6, units = "cm") -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_reftyp-mds.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "b", stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.rds", wcs$reftyp, wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.pdf", wcs$reftyp, wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter( 14 | reftyp == wcs$reftyp, 15 | stat1d == wcs$stat1d) %>% 16 | .filter_res() %>% 17 | # average across groups 18 | group_by(refset, reftyp, method, metric, group) %>% 19 | .avg(n = 2) 20 | 21 | mds <- df %>% 22 | pivot_wider( 23 | id_cols = c(refset, method), 24 | names_from = metric, 25 | values_from = stat) %>% 26 | select(any_of(.metrics_lab)) %>% 27 | as.matrix() %>% t() %>% 28 | dist() %>% cmdscale() 29 | 30 | gg <- mds %>% 31 | data.frame(rownames(.)) %>% 32 | set_colnames(c("x", "y", "metric")) %>% 33 | mutate( 34 | type = case_when( 35 | metric %in% .none_metrics ~ "global", 36 | grepl("gene", metric) ~ "gene", 37 | grepl("cell", metric) ~ "cell"), 38 | type = factor(type, c("gene", "cell", "global")), 39 | metric = .metrics_lab[metric]) 40 | 41 | plt <- ggplot(gg, aes(x, y, 42 | col = type, fill = type, label = metric)) + 43 | geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) + 44 | geom_text_repel(size = 2, show.legend = FALSE) + 45 | scale_fill_manual(values = c("red", "blue", "green3")) + 46 | scale_color_manual(values = c("red", "blue", "green3")) + 47 | scale_x_continuous(expand = expansion(mult = 0.1)) + 48 | scale_y_continuous(expand = expansion(mult = 0.1)) + 49 | labs(x = "MDS dim. 1", y = "MDS dim. 2") + 50 | coord_fixed() 51 | 52 | thm <- theme( 53 | legend.position = "bottom", 54 | legend.title = element_blank(), 55 | panel.grid.major = element_line(color = "grey")) 56 | 57 | fig <- .prettify(plt, thm) 58 | 59 | saveRDS(fig, args$rds) 60 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm") 61 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_reftyp-pca.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(reftyp = "b", stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.rds", wcs$reftyp, wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.pdf", wcs$reftyp, wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter( 14 | reftyp == wcs$reftyp, 15 | stat1d == wcs$stat1d) %>% 16 | .filter_res() %>% 17 | # average across groups 18 | group_by(refset, method, metric, group) %>% 19 | .avg(n = 2) %>% 20 | pivot_wider( 21 | names_from = metric, 22 | values_from = stat) %>% 23 | drop_na(any_of(.metrics_lab)) 24 | 25 | # do PCA 26 | pc <- df %>% 27 | select(any_of(.metrics_lab)) %>% 28 | as.matrix() %>% prcomp() 29 | 30 | # get coordinates 31 | xy <- pc$x %>% 32 | data.frame() %>% 33 | # add metadata 34 | mutate( 35 | select(df, !any_of(.metrics_lab)), 36 | method = droplevels(method), 37 | reftyp = res$reftyp[match(refset, res$refset)]) %>% 38 | mutate_at("reftyp", factor, c("n", "b", "k")) 39 | 40 | # get loadings 41 | rot <- pc$rotation %>% 42 | data.frame() %>% 43 | mutate( 44 | metric = .metrics_lab[rownames(.)], 45 | metric = factor(metric, metric)) 46 | ij <- c("PC1", "PC2") 47 | m1 <- max(abs(xy[, ij])) 48 | m2 <- max(abs(rot[, ij])) 49 | rot[, ij] <- 0.5*rot[, ij]*m1/m2 50 | 51 | # get percentages 52 | var <- prop.table(pc$sdev^2) 53 | var_lab <- round(100*var, 1) 54 | 55 | # plotting 56 | lab <- xy %>% 57 | group_by(method) %>% 58 | select(starts_with("PC")) %>% 59 | summarise_all(mean, .groups = "drop") 60 | 61 | p0 <- ggplot(xy, aes(PC1, PC2)) + 62 | geom_vline(xintercept = 0, size = 0.1) + 63 | geom_hline(yintercept = 0, size = 0.1) + 64 | coord_fixed() + labs( 65 | x = sprintf("PC1 (%s%%)", var_lab[1]), 66 | y = sprintf("PC2 (%s%%)", var_lab[2])) + 67 | scale_x_continuous( 68 | limits = range(xy$PC1), 69 | expand = expansion(mult = 0.2)) + 70 | scale_y_continuous( 71 | limits = range(xy$PC2), 72 | expand = expansion(mult = 0.2)) 73 | 74 | p1 <- p0 + 75 | geom_point( 76 | aes(fill = method), shape = 21, 77 | size = 2, stroke = 0, alpha = 0.4) + 78 | geom_point(data = lab, 79 | aes(fill = method), shape = 21, 80 | size = 3, stroke = 0, alpha = 0.8) + 81 | geom_label_repel(data = lab, 82 | aes(label = method, col = method), 83 | label.padding = unit(0.75, "mm"), 84 | size = 2, fontface = "bold", show.legend = FALSE) + 85 | scale_fill_manual(values = .methods_pal[levels(xy$method)]) + 86 | scale_color_manual(values = .methods_pal[levels(xy$method)]) 87 | 88 | p2 <- p0 + 89 | geom_segment(data = rot, 90 | aes(0, 0, xend = PC1, yend = PC2, col = metric), 91 | size = 0.5, arrow = arrow(length = unit(1, "mm"))) + 92 | geom_label_repel(data = rot, 93 | aes(label = metric, col = metric), 94 | label.padding = unit(0.75, "mm"), 95 | size = 2, fontface = "bold", show.legend = FALSE) + 96 | scale_color_manual(values = .metrics_pal[levels(rot$metric)]) 97 | 98 | thm <- theme( 99 | legend.justification = c(0, 0.5), 100 | panel.grid.major = element_line(size = 0.1, color = "grey")) 101 | 102 | f1 <- .prettify(p1, thm) 103 | f2 <- .prettify(p2, thm) 104 | f2$guides$colour$override.aes$size <- 0.5 105 | 106 | fig <- f1 / f2 + 107 | plot_annotation(tag_levels = "a") & 108 | theme(plot.tag = element_text(size = 9, face = "bold")) 109 | 110 | saveRDS(fig, args$rds) 111 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm") 112 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-boxplot_by_method.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.pdf", wcs$stat2d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # average across groups 16 | group_by(refset, method, metric) %>% 17 | .avg(n = 1) 18 | 19 | # average statistics across refsets & summaries 20 | # statistic across all summaries 21 | o <- df %>% 22 | group_by(method, refset, metric) %>% 23 | .avg(n = 3) %>% 24 | # order methods (panels) by average 25 | arrange(stat) %>% 26 | pull("method") 27 | df <- df %>% mutate_at("method", factor, o) 28 | 29 | pal <- .metrics_pal[levels(df$metric)] 30 | lab <- parse(text = paste( 31 | sep = "~", 32 | sprintf("bold(%s)", LETTERS), 33 | gsub("\\s", "~", names(pal)))) 34 | 35 | anno <- df %>% 36 | group_by(method, metric) %>% 37 | summarize_at("stat", median) %>% 38 | mutate(letter = LETTERS[match(metric, levels(metric))]) 39 | 40 | plt <- ggplot(df, aes( 41 | reorder_within(metric, stat, method, median), 42 | stat, col = metric, fill = metric)) + 43 | facet_wrap(~ method, nrow = 4, scales = "free_x") + 44 | geom_boxplot( 45 | size = 0.25, outlier.size = 0.25, 46 | alpha = 0.25, key_glyph = "point") + 47 | geom_text(data = anno, 48 | size = 1.5, color = "black", 49 | aes(label = letter, y = -0.075)) + 50 | scale_fill_manual(values = pal, labels = lab) + 51 | scale_color_manual(values = pal, labels = lab) + 52 | scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) + 53 | labs(x = NULL, y = .stats1d_lab[wcs$stat1d]) 54 | 55 | thm <- theme( 56 | legend.text.align = 0, 57 | axis.text.x = element_blank(), 58 | axis.ticks.x = element_blank()) 59 | 60 | fig <- .prettify(plt, thm) 61 | 62 | saveRDS(fig, args$rds) 63 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") 64 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-boxplot_by_metric.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # rds = "plts/stat_1d_by_reftyp-boxplot.rds", 5 | # pdf = "plts/stat_1d_by_reftyp-boxplot.pdf", 6 | # res = list.files("outs", paste0("^stat_1d.*", wcs$stat1d, "\\."), full.names = TRUE)) 7 | # args$res <- sample(args$res, 100) 8 | 9 | source(args$fun) 10 | res <- readRDS(args$res) 11 | 12 | df <- res %>% 13 | # keep data of interest 14 | filter(stat1d == wcs$stat1d) %>% 15 | .filter_res() %>% 16 | # average across groups 17 | group_by(refset, method, metric, group) %>% .avg(n = 1) 18 | 19 | pal <- .methods_pal[levels(df$method)] 20 | lab <- parse(text = paste( 21 | sep = "~", 22 | sprintf("bold(%s)", LETTERS), 23 | gsub("\\s", "~", names(pal)))) 24 | 25 | anno <- df %>% 26 | group_by(method, metric) %>% 27 | summarize_at("stat", median) %>% 28 | mutate(letter = LETTERS[match(method, levels(method))]) 29 | 30 | plt <- ggplot(df, aes( 31 | reorder_within(method, stat, metric, median), 32 | stat, col = method, fill = method)) + 33 | facet_wrap(~ metric, nrow = 3, scales = "free_x") + 34 | geom_boxplot( 35 | outlier.size = 0.25, outlier.alpha = 1, 36 | size = 0.25, alpha = 0.25, key_glyph = "point") + 37 | geom_text(data = anno, 38 | size = 1.5, color = "black", 39 | aes(label = letter, y = -0.075)) + 40 | scale_fill_manual(values = pal, labels = lab) + 41 | scale_color_manual(values = pal, labels = lab) + 42 | scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) + 43 | labs(x = NULL, y = .stats1d_lab[wcs$stat1d]) 44 | 45 | thm <- theme( 46 | legend.text.align = 0, 47 | axis.text.x = element_blank(), 48 | axis.ticks.x = element_blank()) 49 | 50 | fig <- .prettify(plt, thm) 51 | 52 | saveRDS(fig, args$rds) 53 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") 54 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-correlations.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations,%s.pdf", wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | mat <- res %>% 12 | # keep data of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # average across groups & subsets 16 | group_by(method, metric, refset, group) %>% 17 | .avg(n = 3) %>% 18 | # correlate b/w summaries, across methods & refsets 19 | pivot_wider( 20 | names_from = metric, 21 | values_from = stat) %>% 22 | select(any_of(.metrics_lab)) %>% 23 | cor(method = "spearman", 24 | use = "pairwise.complete.obs") 25 | 26 | foo <- mat 27 | foo[is.na(foo)] <- 1 28 | xo <- rownames(mat)[hclust(dist(foo))$order] 29 | yo <- rownames(mat)[hclust(dist(t(foo)))$order] 30 | 31 | df <- mat %>% 32 | data.frame(from = rownames(.)) %>% 33 | pivot_longer(-from, names_to = "to") %>% 34 | complete(from, to, fill = list(value = NA)) 35 | 36 | plt <- ggplot(df, aes(from, to, fill = value)) + 37 | geom_tile() + 38 | scale_fill_distiller( 39 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"), 40 | palette = "RdYlBu", 41 | limits = c(-1, 1), 42 | n.breaks = 3) + 43 | coord_equal(expand = FALSE) + 44 | scale_x_discrete(limits = rev(xo), labels = .metrics_lab) + 45 | scale_y_discrete(limits = yo, labels = .metrics_lab) 46 | 47 | thm <- theme( 48 | axis.title = element_blank(), 49 | axis.text.x = element_text(angle = 45, hjust = 1)) 50 | 51 | fig <- .prettify(plt, thm) 52 | 53 | saveRDS(fig, args$rds) 54 | ggsave(args$pdf, fig, width = 9, height = 7.5, units = "cm") 55 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-correlations_by_method.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.pdf", wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | mat <- res %>% 12 | # keep data of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # for each method, method & summary, average across groups 16 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>% 17 | # for each method, correlate b/w summaries & across refsets 18 | split(.$method) %>% 19 | lapply(\(.) 20 | pivot_wider(., 21 | names_from = metric, 22 | values_from = stat) %>% 23 | select(any_of(.metrics_lab)) %>% 24 | cor(method = "spearman", 25 | use = "pairwise.complete.obs")) 26 | 27 | df <- lapply(mat, \(u) u %>% 28 | data.frame(from = rownames(.)) %>% 29 | pivot_longer(-from, names_to = "to")) %>% 30 | bind_rows(.id = "method") %>% 31 | complete(from, to, method, fill = list(value = NA)) 32 | 33 | foo <- df %>% 34 | group_by(from, to) %>% 35 | summarise(value = mean(value), .groups = "drop") %>% 36 | pivot_wider(names_from = to, values_from = value) %>% 37 | select(-from) 38 | foo[is.na(foo)] <- 1 39 | o <- colnames(foo)[hclust(dist(as.matrix(foo)))$order] 40 | 41 | plt <- ggplot(df, aes(from, to, fill = value)) + 42 | facet_wrap(~ method, nrow = 4) + 43 | scale_fill_distiller( 44 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"), 45 | palette = "RdYlBu", 46 | na.value = "grey", 47 | limits = c(-1, 1), 48 | n.breaks = 3) + 49 | scale_x_discrete(limits = rev(o), labels = .metrics_lab) + 50 | scale_y_discrete(limits = o, labels = .metrics_lab) + 51 | coord_equal(expand = FALSE) + 52 | geom_tile() 53 | 54 | thm <- theme( 55 | axis.title = element_blank(), 56 | axis.text.x = element_text(angle = 45, hjust = 1)) 57 | 58 | fig <- .prettify(plt, thm) 59 | 60 | saveRDS(fig, args$rds) 61 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm") 62 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-correlations_by_metric.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.pdf", wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | mat <- res %>% 12 | # keep method of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # for each method, method & summary, average across groups & refsets 16 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>% 17 | # for each method, correlate b/w summaries & across refsets 18 | split(.$metric) %>% 19 | lapply(\(.) 20 | pivot_wider(., 21 | names_from = method, 22 | values_from = stat) %>% 23 | select(any_of(names(.methods_pal))) %>% 24 | cor(method = "spearman", 25 | use = "pairwise.complete.obs")) 26 | 27 | df <- lapply(mat, \(u) u %>% 28 | data.frame(from = rownames(.)) %>% 29 | pivot_longer(-from, names_to = "to")) %>% 30 | bind_rows(.id = "metric") %>% 31 | complete(from, to, metric, fill = list(value = NA)) 32 | 33 | foo <- df %>% 34 | group_by(from, to) %>% 35 | summarise(value = mean(value), .groups = "drop") %>% 36 | pivot_wider(names_from = to, values_from = value) %>% 37 | select(-from) %>% 38 | as.matrix() 39 | foo[is.na(foo)] <- 1 40 | o <- colnames(foo)[hclust(dist(foo))$order] 41 | 42 | plt <- ggplot(df, aes(from, to, fill = value)) + 43 | facet_wrap(~ metric, nrow = 2) + 44 | scale_fill_distiller( 45 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"), 46 | palette = "RdYlBu", 47 | na.value = "grey", 48 | limits = c(-1, 1), 49 | n.breaks = 3) + 50 | scale_x_discrete(limits = rev(o), labels = names(.methods_pal)) + 51 | scale_y_discrete(limits = o, labels = names(.methods_pal)) + 52 | coord_equal(expand = FALSE) + 53 | geom_tile() 54 | 55 | thm <- theme( 56 | axis.title = element_blank(), 57 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 58 | 59 | fig <- .prettify(plt, thm) 60 | 61 | saveRDS(fig, args$rds) 62 | ggsave(args$pdf, fig, width = 24, height = 8.5, units = "cm") 63 | 64 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-dimEst.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.pdf", wcs$stat1d)) 7 | 8 | suppressPackageStartupMessages({ 9 | library(intrinsicDimension) 10 | }) 11 | 12 | source(args$fun) 13 | res <- readRDS(args$res) 14 | 15 | df <- res %>% 16 | # keep data of interest 17 | filter(stat1d == wcs$stat1d) %>% 18 | .filter_res() %>% 19 | # average across groups 20 | group_by(refset, reftyp, method, metric, group) %>% .avg(n = 1) %>% 21 | # estimate dimensionality for each refset 22 | pivot_wider( 23 | names_from = metric, 24 | values_from = stat) %>% 25 | split(.$refset) %>% 26 | lapply(\(df) { 27 | if (nrow(df) < 3) return(NULL) 28 | ks <- seq(2, nrow(df)-1) 29 | mat <- as.matrix(select(df, any_of(.metrics_lab))) 30 | mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))] 31 | mat[is.na(mat)] <- 1 32 | est <- sapply(ks, maxLikGlobalDimEst, data = mat) 33 | data.frame(dim = unlist(est), k = ks) 34 | }) %>% 35 | bind_rows(.id = "refset") %>% 36 | mutate(type = res$reftyp[match(refset, res$refset)]) 37 | 38 | plt <- ggplot(df, aes(factor(k), dim, col = type, fill = type)) + 39 | geom_boxplot( 40 | size = 0.25, outlier.size = 0.25, 41 | alpha = 0.25, key_glyph = "point") + 42 | scale_x_discrete( 43 | "number of nearest neighbors (k)", 44 | breaks = seq(0, 20, 2)) + 45 | scale_y_continuous( 46 | "estimated dimensionality", 47 | limits = range(df$dim), 48 | breaks = seq(0, 10, 2)) 49 | 50 | thm <- theme(panel.grid.major = element_line(color = "grey")) 51 | 52 | fig <- .prettify(plt, thm) 53 | 54 | saveRDS(fig, args$rds) 55 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm") 56 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-mds.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-mds,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-mds,%s.pdf", wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # average across groups & refsets 16 | group_by(method, metric, refset, group) %>% 17 | .avg(n = 3) %>% 18 | pivot_wider( 19 | names_from = metric, 20 | values_from = stat) %>% 21 | rowwise() %>% 22 | drop_na(any_of(.metrics_lab)) %>% 23 | select(any_of(.metrics_lab)) %>% 24 | as.matrix() %>% 25 | t() %>% 26 | dist() %>% 27 | cmdscale() %>% 28 | data.frame(rownames(.)) %>% 29 | set_colnames(c("x", "y", "metric")) %>% 30 | mutate( 31 | type = case_when( 32 | metric %in% .none_metrics ~ "global", 33 | grepl("gene", metric) ~ "gene", 34 | grepl("cell", metric) ~ "cell"), 35 | metric = .metrics_lab[metric]) 36 | 37 | plt <- ggplot(df, aes(x, y, 38 | col = type, fill = type, label = metric)) + 39 | geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) + 40 | geom_text_repel(size = 2, show.legend = FALSE) + 41 | scale_fill_manual(values = c("red", "blue", "green3")) + 42 | scale_color_manual(values = c("red", "blue", "green3")) + 43 | scale_x_continuous(expand = expansion(mult = 0.1)) + 44 | scale_y_continuous(expand = expansion(mult = 0.1)) + 45 | labs(x = "MDS dim. 1", y = "MDS dim. 2") + 46 | coord_fixed() 47 | 48 | thm <- theme( 49 | legend.position = "bottom", 50 | legend.title = element_blank(), 51 | panel.grid.major = element_line(color = "grey")) 52 | 53 | fig <- .prettify(plt, thm) 54 | 55 | saveRDS(fig, args$rds) 56 | ggsave(args$pdf, fig, width = 12, height = 8, units = "cm") 57 | -------------------------------------------------------------------------------- /code/07-plot_stat_1d_by_stat1d-pca.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat1d = "ks") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_1d.rds", 5 | # rds = sprintf("plts/stat_1d_by_stat1d-pca,%s.rds", wcs$stat1d), 6 | # pdf = sprintf("plts/stat_1d_by_stat1d-pca,%s.pdf", wcs$stat1d)) 7 | 8 | source(args$fun) 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | # keep data of interest 13 | filter(stat1d == wcs$stat1d) %>% 14 | .filter_res() %>% 15 | # average across groups & refsets 16 | group_by(method, metric, refset, group) %>% 17 | .avg(n = 3) %>% 18 | pivot_wider( 19 | names_from = metric, 20 | values_from = stat) %>% 21 | rowwise() %>% 22 | drop_na(any_of(.metrics_lab)) 23 | 24 | # do PCA 25 | pc <- df %>% 26 | select(any_of(.metrics_lab)) %>% 27 | as.matrix() %>% prcomp() 28 | 29 | # get coordinates 30 | xy <- pc$x %>% 31 | data.frame() %>% 32 | # add metadata 33 | mutate( 34 | select(df, !any_of(.metrics_lab)), 35 | method = droplevels(method)) 36 | 37 | # get loadings 38 | rot <- pc$rotation %>% 39 | data.frame() %>% 40 | mutate( 41 | metric = .metrics_lab[rownames(.)], 42 | metric = factor(metric, metric)) 43 | ij <- c("PC1", "PC2") 44 | m1 <- max(abs(xy[, ij])) 45 | m2 <- max(abs(rot[, ij])) 46 | rot[, ij] <- 0.5*rot[, ij]*m1/m2 47 | 48 | # get percentages 49 | var <- prop.table(pc$sdev^2) 50 | var_lab <- round(100*var, 1) 51 | 52 | p0 <- ggplot(xy, aes(PC1, PC2)) + 53 | geom_vline(xintercept = 0, size = 0.1) + 54 | geom_hline(yintercept = 0, size = 0.1) + 55 | coord_fixed() + labs( 56 | x = sprintf("PC1 (%s%%)", var_lab[1]), 57 | y = sprintf("PC2 (%s%%)", var_lab[2])) + 58 | scale_x_continuous( 59 | limits = range(xy$PC1), 60 | expand = expansion(mult = 0.2)) + 61 | scale_y_continuous( 62 | limits = range(xy$PC2), 63 | expand = expansion(mult = 0.2)) 64 | 65 | p1 <- p0 + 66 | geom_point( 67 | aes(fill = method), shape = 21, 68 | size = 3, stroke = 0, alpha = 0.8) + 69 | geom_label_repel( 70 | aes(label = method, col = method), 71 | label.padding = unit(0.75, "mm"), 72 | size = 2, fontface = "bold", show.legend = FALSE) + 73 | scale_fill_manual(values = .methods_pal[levels(xy$method)]) + 74 | scale_color_manual(values = .methods_pal[levels(xy$method)]) 75 | 76 | p2 <- p0 + 77 | geom_segment(data = rot, 78 | aes(0, 0, xend = PC1, yend = PC2, col = metric), 79 | size = 0.5, arrow = arrow(length = unit(1, "mm"))) + 80 | geom_label_repel(data = rot, 81 | aes(label = metric, col = metric), 82 | label.padding = unit(0.75, "mm"), 83 | size = 2, fontface = "bold", show.legend = FALSE) + 84 | scale_color_manual(values = .metrics_pal[levels(rot$metric)]) 85 | 86 | thm <- theme( 87 | legend.justification = c(0, 0.5), 88 | panel.grid.major = element_line(size = 0.1, color = "grey")) 89 | 90 | f1 <- .prettify(p1, thm) 91 | f2 <- .prettify(p2, thm) 92 | f2$guides$colour$override.aes$size <- 0.5 93 | 94 | fig <- f1 / f2 + 95 | plot_annotation(tag_levels = "a") & 96 | theme(plot.tag = element_text(size = 9, face = "bold")) 97 | 98 | saveRDS(fig, args$rds) 99 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm") 100 | -------------------------------------------------------------------------------- /code/07-plot_stat_2d-scatters.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # fun = "code/utils.R", 3 | # res = "outs/obj-stat_2d.rds", 4 | # rds = "plts/stat_2d-scatters.rds", 5 | # pdf = "plts/stat_2d-scatters.pdf") 6 | 7 | source(args$fun) 8 | 9 | res <- readRDS(args$res) 10 | 11 | df <- res %>% 12 | mutate( 13 | refset = paste(datset, subset, sep = ","), 14 | metrics = paste(metric1, metric2, sep = "\n")) %>% 15 | select(-c(datset, subset, metric1, metric2)) %>% 16 | # for type != n, keep batch-/cluster-level comparisons only 17 | group_by(stat2d, refset, method, metrics) %>% 18 | mutate(n = n()) %>% 19 | filter(group != "global" | n == 1) %>% 20 | select(-c(group, n)) %>% 21 | # for each statistic & metric, re-scale b/w 0 & 1 22 | group_by(stat2d, metrics) %>% 23 | mutate(stat = stat/max(stat, na.rm = TRUE)) %>% 24 | pivot_wider(names_from = stat2d, values_from = stat) %>% 25 | filter(!is.na(ks2), !is.na(emd)) 26 | 27 | plt <- ggplot(df, aes(ks2, emd, col = refset, fill = refset)) + 28 | facet_wrap(~ metrics, nrow = 1) + 29 | stat_cor( 30 | aes(group = metrics), method = "spearman", 31 | label.x = 0, label.y = 1, vjust = 1, size = 1.5) + 32 | geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) + 33 | scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) + 34 | scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) + 35 | labs(x = "KS statistics", y = "scaled EMD") 36 | 37 | thm <- theme(aspect.ratio = 1) 38 | 39 | fig <- .prettify(plt, thm) 40 | 41 | saveRDS(fig, args$rds) 42 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm") 43 | -------------------------------------------------------------------------------- /code/07-plot_stat_2d_by_reftyp-boxplot.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat2d = "emd", reftyp = "n") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_2d.rds", 5 | # rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds", 6 | # pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf") 7 | 8 | source(args$fun) 9 | res <- .read_res(args$res) 10 | 11 | df <- res %>% 12 | mutate(metrics = paste(metric1, metric2, sep = "\n")) %>% 13 | # keep statistic of interest 14 | filter(stat2d == wcs$stat2d) %>% 15 | # keep group-level comparisons only 16 | { if (wcs$reftyp == "n") . else 17 | mutate(., across( 18 | c(group, id), 19 | as.character)) %>% 20 | filter(group != id) } %>% 21 | # scale values b/w 0 and 1 for visualization 22 | group_by(metrics) %>% 23 | mutate(stat = stat/max(stat, na.rm = TRUE)) 24 | 25 | pal <- .methods_pal[levels(df$method)] 26 | lab <- parse(text = paste( 27 | sep = "~", 28 | sprintf("bold(%s)", LETTERS), 29 | gsub("\\s", "~", names(pal)))) 30 | 31 | anno <- df %>% 32 | group_by(method, metrics) %>% 33 | summarize_at("stat", median) %>% 34 | mutate(letter = LETTERS[match(method, levels(method))]) 35 | 36 | plt <- ggplot(df, aes( 37 | reorder_within(method, stat, metrics, median), 38 | stat, col = method, fill = method)) + 39 | facet_wrap(~ metrics, nrow = 3, scales = "free_x") + 40 | geom_boxplot( 41 | outlier.size = 0.25, outlier.alpha = 1, 42 | size = 0.25, alpha = 0.25, key_glyph = "point") + 43 | geom_text(data = anno, 44 | size = 1.5, color = "black", 45 | aes(label = letter, y = -0.075)) + 46 | scale_fill_manual(values = pal, labels = lab) + 47 | scale_color_manual(values = pal, labels = lab) + 48 | scale_x_reordered(NULL) + 49 | scale_y_continuous( 50 | paste(ifelse(wcs$stat2d == "emd", "scaled", ""), 51 | .stats2d_lab[wcs$stat2d]), 52 | limits = c(-0.1, 1), n.breaks = 3) 53 | 54 | thm <- theme( 55 | legend.text.align = 0, 56 | legend.title = element_blank(), 57 | axis.text.x = element_blank(), 58 | axis.title.x = element_blank(), 59 | axis.ticks.x = element_blank()) 60 | 61 | fig <- .prettify(plt, thm) 62 | 63 | saveRDS(fig, args$rds) 64 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") 65 | -------------------------------------------------------------------------------- /code/07-plot_stat_2d_by_reftyp-heatmap.R: -------------------------------------------------------------------------------- 1 | # wcs <- list(stat2d = "emd", reftyp = "b") 2 | # args <- list( 3 | # fun = "code/utils-plotting.R", 4 | # res = "outs/obj-stat_2d.rds", 5 | # rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds", 6 | # pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf") 7 | 8 | source(args$fun) 9 | res <- .read_res(args$res) 10 | 11 | df <- res %>% 12 | mutate(metrics = paste(metric1, metric2, sep = "\n")) %>% 13 | # except for KNN & global summaries, 14 | # keep group-level comparisons only 15 | { if (wcs$reftyp == "n") . else 16 | mutate(., across( 17 | c(group, id), 18 | as.character)) %>% 19 | filter( 20 | (!grepl("KNN", metrics) & group != id) | 21 | (grepl("KNN", metrics) & group == "global") | 22 | any(c(metric1, metric2) %in% .metrics_lab[.none_metrics])) } %>% 23 | # aggregation 24 | group_by(method, metrics, group, datset, subset) %>% 25 | summarize_at("stat", mean, na.rm = TRUE) %>% # average across groups 26 | summarize_at("stat", mean, na.rm = TRUE) %>% # average across subsets 27 | summarize_at("stat", mean, na.rm = TRUE) # average across datsets 28 | 29 | # order methods by average across metrics 30 | ox <- df %>% 31 | group_by(method) %>% 32 | summarise_at("stat", mean) %>% 33 | arrange(desc(stat)) %>% 34 | pull("method") 35 | 36 | # order metrics by average across methods 37 | oy <- df %>% 38 | group_by(metrics) %>% 39 | summarise_at("stat", mean) %>% 40 | arrange(desc(stat)) %>% 41 | pull("metrics") 42 | 43 | df <- complete(df, method, metrics, fill = list(stat = NA)) 44 | 45 | plt <- ggplot(df, 46 | aes(method, metrics, fill = stat)) + 47 | geom_tile(col = "white") + 48 | scale_fill_distiller( 49 | .stats2d_lab[wcs$stat2d], 50 | palette = "RdYlBu", 51 | na.value = "grey", 52 | limits = c(0, 1), 53 | breaks = c(0, 1)) + 54 | coord_equal(3/2, expand = FALSE) + 55 | scale_x_discrete(limits = rev(ox)) + 56 | scale_y_discrete(limits = rev(oy)) 57 | 58 | thm <- theme( 59 | axis.ticks = element_blank(), 60 | axis.title = element_blank(), 61 | panel.border = element_blank(), 62 | axis.text.x = element_text(angle = 45, hjust = 1)) 63 | 64 | fig <- .prettify(plt, thm) 65 | 66 | saveRDS(fig, args$rds) 67 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm") 68 | -------------------------------------------------------------------------------- /code/08-fig_boxplots.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # rds = c( 4 | # "plts/stat_1d_by_stat1d-boxplot_by_metric,ks.rds", 5 | # "plts/stat_1d_by_stat1d-boxplot_by_method,ks.rds"), 6 | # pdf = "figs/boxplots.pdf") 7 | 8 | source(args$uts) 9 | 10 | ps <- lapply(args$rds, readRDS) 11 | 12 | fig <- wrap_plots(ps, ncol = 1, heights = c(3, 4)) + 13 | plot_annotation(tag_levels = "a") & 14 | theme( 15 | plot.margin = margin(), 16 | panel.spacing = unit(1, "mm"), 17 | plot.tag = element_text(face = "bold", size = 9)) 18 | 19 | ggsave(args$pdf, fig, width = 16, height = 14, units = "cm") 20 | -------------------------------------------------------------------------------- /code/08-fig_clustering.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # rds = file.path("plts", c( 3 | # "clust-boxplot_by_method.rds", 4 | # "clust-boxplot_dF1.rds", 5 | # "clust-heatmap_by_method.rds", 6 | # "clust-correlations.rds")), 7 | # uts = "code/utils-plotting.R", 8 | # pdf = "figs/clustering.pdf") 9 | 10 | source(args$uts) 11 | 12 | ps <- lapply(args$rds, readRDS) 13 | 14 | # fix methods order according to ref-sim 15 | # difference for boxplots & heatmaps 16 | o <- c("ref", ggplot_build(ps[[2]])$layout$panel_params[[1]]$x$breaks) 17 | ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o) 18 | ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o) 19 | 20 | ps[[3]]$layers[[1]]$aes_params$colour <- NA 21 | 22 | ps[[3]] <- ps[[3]] + theme( 23 | axis.text.y = element_text(size = 3, angle = 60, hjust = 1), 24 | axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5)) 25 | 26 | ps[[4]] <- ps[[4]] + theme( 27 | legend.background = element_blank()) 28 | 29 | ps <- lapply(ps, \(.) wrap_elements(full = .)) 30 | 31 | ws <- c(3, 1) 32 | 33 | fig <- 34 | ((ps[[1]] + ps[[2]]) + plot_layout(widths = ws)) / 35 | ((ps[[3]] + ps[[4]]) + plot_layout(widths = ws)) + 36 | plot_annotation(tag_levels = "a") & 37 | theme( 38 | plot.margin = margin(), 39 | plot.tag = element_text(face = "bold", size = 9)) 40 | 41 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm") 42 | -------------------------------------------------------------------------------- /code/08-fig_heatmaps.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # pdf = "figs/heatmaps.pdf", 3 | # uts = "code/utils-plotting.R", 4 | # rds = list.files("plts", "stat_(1|2)d_by_reftyp-heatmap.*ks2?\\.rds", full.names = TRUE)) 5 | 6 | source(args$uts) 7 | ps <- lapply(fns <- args$rds, readRDS) 8 | 9 | # re-order by dimension & type 10 | dim <- gsub(".*(1|2)d.*", "\\1", fns) 11 | typ <- gsub(".*,(n|b|k),ks.*", "\\1", fns) 12 | names(ps) <- paste0(typ, dim) 13 | ps <- ps[paste0( 14 | c("n", "b", "k"), 15 | rep(c(1, 2), each = 3))] 16 | 17 | # assure global metrics are 18 | # included across all panels 19 | 20 | ps[1:3] <- lapply(ps[1:3], \(.) { 21 | y <- .$data$metric 22 | y <- factor(y, .metrics_lab) 23 | .$data$metric <- y 24 | return(.) 25 | }) 26 | 27 | # fix y-axis order across panels and add rectangles 28 | # highlighting gene-/cell-level & global summaries 29 | 30 | s <- 0.50 # border size 31 | d <- 0.00 # offset b/w boxes 32 | 33 | ps[1:3] <- lapply(ps[1:3], \(p) 34 | p + scale_y_discrete(limits = rev(.metrics_lab)) + 35 | geom_rect( 36 | xmin = 0.5 + d, 37 | xmax = length(unique(p$data$method)) + 0.5 - d, 38 | ymax = length(.metrics_lab) + 0.5 - d, 39 | ymin = length(.none_metrics) + length(.cell_metrics) + 0.5 + d, 40 | size = s, fill = NA, col = "red") + 41 | geom_rect( 42 | xmin = 0.5 + d, 43 | xmax = length(unique(p$data$method)) + 0.5 - d, 44 | ymin = length(.none_metrics) + 0.5 + d, 45 | ymax = length(.none_metrics) + length(.cell_metrics) + 0.5 - d, 46 | size = s, fill = NA, col = "blue") + 47 | geom_rect( 48 | xmin = 0.5 + d, 49 | xmax = length(unique(p$data$method)) + 0.5 - d, 50 | ymin = 0.5 - d, 51 | ymax = length(.none_metrics) + 0.5 - d, 52 | size = s, fill = NA, col = "green3")) 53 | 54 | # get x-axis (methods) ordering 55 | xo <- lapply(ps[1:3], \(p) ggplot_build(p)$layout$panel_scales_x[[1]]$limits) 56 | 57 | .metric_pairs <- unique(ps[[5]]$data$metrics) 58 | .metric_pairs <- .metric_pairs[c(3, 2, 1, 9, 8, 6, 5, 4, 7)] 59 | 60 | ps[4:6] <- lapply(seq_along(ps[4:6]), \(i) { 61 | p <- ps[4:6][[i]] 62 | p + scale_x_discrete(limits = xo[[i]]) + 63 | scale_y_discrete(limits = rev(.metric_pairs)) + 64 | geom_rect( 65 | xmin = 0.5 - d, 66 | xmax = length(unique(p$data$method)) + 0.5 - d, 67 | ymax = length(.metric_pairs) + 0.5 - d, 68 | ymin = 3.5 + d, 69 | size = s, fill = NA, col = "red") + 70 | geom_rect( 71 | xmin = 0.5 - d, 72 | xmax = length(unique(p$data$method)) + 0.5 - d, 73 | ymin = 0.5 + d, 74 | ymax = 3.5 - d, 75 | size = s, fill = NA, col = "blue") 76 | }) 77 | 78 | # drop y-axis labels from all but left-most panels 79 | ps[-c(1, 4)] <- lapply(ps[-c(1, 4)], "+", 80 | theme(axis.text.y = element_blank())) 81 | 82 | # re-size relative to number of 83 | # rows (metrics) & columns (methods) 84 | hs <- c( 85 | length(unique(ps[[1]]$data$metric)), 86 | 1.5 + length(unique(ps[[5]]$data$metrics))) 87 | ws <- sapply(ps[1:3], \(.) nlevels(.$data$method)) 88 | 89 | x <- c("gene-level" = "red", "cell-level" = "blue", "global" = "green3") 90 | ps[[1]] <- ps[[1]] + 91 | geom_point( 92 | inherit.aes = FALSE, 93 | data = data.frame(x), 94 | aes_string( 95 | ps[[1]]$data$method[1], 96 | ps[[1]]$data$metric[1], 97 | col = "x"), alpha = 0) + 98 | scale_color_manual("type of\nsummary", values = x) + 99 | guides(color = guide_legend(order = 1, override.aes = list(alpha = 1))) 100 | 101 | fig <- wrap_plots(ps, 102 | nrow = 2, heights = hs, widths = ws) + 103 | plot_layout(guides = "collect") + 104 | plot_annotation(tag_levels = "a") & 105 | coord_cartesian() & 106 | theme( 107 | axis.text.x = element_text(angle = 30), 108 | plot.margin = margin(l = 2, unit = "mm"), 109 | plot.tag = element_text(face = "bold", size = 9)) 110 | 111 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm") 112 | -------------------------------------------------------------------------------- /code/08-fig_integration.R: -------------------------------------------------------------------------------- 1 | # val <- rep(c("cms", "ldf", "bcs"), each = 4) 2 | # args <- list( 3 | # uts = "code/utils-plotting.R", 4 | # pdf = "figs/integration.pdf", 5 | # rds = sprintf(c( 6 | # "plts/batch-boxplot_by_method_%s.rds", 7 | # "plts/batch-boxplot_dX_%s.rds", 8 | # "plts/batch-heatmap_by_method_%s.rds", 9 | # "plts/batch-correlations_%s.rds"), val)) 10 | 11 | source(args$uts) 12 | ps <- lapply(args$rds, readRDS) 13 | 14 | # split plots by value 15 | pat <- ".*_(.*)\\.rds$" 16 | lys <- split(ps, gsub(pat, "\\1", args$rds)) 17 | 18 | # fix methods order according to ref-sim 19 | # difference for boxplots & heatmaps 20 | o <- c("ref", ggplot_build(lys[[1]][[2]])$layout$panel_params[[1]]$x$breaks) 21 | lys <- lapply(lys, \(ps) { 22 | ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o) 23 | ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o) 24 | return(ps) 25 | }) 26 | 27 | # generate separate figure for each 28 | lys <- lapply(lys, \(p) { 29 | p[[3]]$layers[[1]]$aes_params$colour <- NA 30 | p[[3]] <- p[[3]] + theme( 31 | axis.text.y = element_text(size = 3, angle = 60, hjust = 1), 32 | axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5)) 33 | p[[4]] <- p[[4]] + theme( 34 | legend.background = element_blank()) 35 | p <- lapply(p, \(.) wrap_elements(full = .)) 36 | ws <- c(3, 1) 37 | fig <- ((p[[1]] + p[[2]]) + plot_layout(widths = ws)) / 38 | ((p[[3]] + p[[4]]) + plot_layout(widths = ws)) + 39 | plot_annotation(tag_levels = "a") & 40 | theme( 41 | plot.margin = margin(), 42 | plot.tag = element_text(face = "bold", size = 9)) 43 | }) 44 | 45 | pdf(args$pdf, width = 17.6/2.54, height = 9.9/2.54) 46 | for (p in lys) print(p); dev.off() 47 | -------------------------------------------------------------------------------- /code/08-fig_mds.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # pdf = "figs/mds.pdf", 3 | # uts = "code/utils-plotting.R", 4 | # rds = list.files("plts", "reftyp-mds.*ks\\.rds", full.names = TRUE)) 5 | 6 | source(args$uts) 7 | ps <- lapply(args$rds, readRDS) 8 | ps[[1]]$guides$fill <- "none" 9 | 10 | fig <- wrap_plots(ps, ncol = 1) + 11 | plot_layout(guides = "collect") + 12 | plot_annotation(tag_levels = "a") & 13 | theme( 14 | plot.margin = margin(), 15 | legend.position = "bottom", 16 | plot.tag = element_text(size = 9, face = "bold")) 17 | 18 | ggsave(args$pdf, fig, width = 9, height = 17, units = "cm") -------------------------------------------------------------------------------- /code/08-fig_memory.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # rds = list.files("plts", "mbs.*\\.rds", full.names = TRUE), 3 | # pdf = "figs/memory.pdf", 4 | # uts = "code/utils-plotting.R") 5 | 6 | source(args$uts) 7 | 8 | ps <- lapply(args$rds, readRDS) 9 | 10 | pat <- paste0("_", c("n", "b", "k"), ".rds") 11 | ps <- ps[sapply(pat, grep, args$rds)] 12 | 13 | fig <- wrap_plots(ps, ncol = 1) + 14 | plot_annotation(tag_levels = "a") & 15 | theme( 16 | plot.margin = margin(), 17 | panel.spacing = unit(1, "mm"), 18 | legend.title = element_blank(), 19 | plot.tag = element_text(size = 9, face = "bold")) 20 | 21 | ggsave(args$pdf, fig, width = 18, height = 16, units = "cm") 22 | -------------------------------------------------------------------------------- /code/08-fig_runtimes.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # rds = list.files("plts", "rts.*\\.rds", full.names = TRUE), 3 | # pdf = "figs/runtimes.pdf", 4 | # uts = "code/utils-plotting.R") 5 | 6 | source(args$uts) 7 | 8 | ps <- lapply(args$rds, readRDS) 9 | 10 | pat <- paste0("_", c("n", "b", "k"), ".rds") 11 | ps <- ps[sapply(pat, grep, args$rds)] 12 | 13 | fig <- wrap_plots(ps, ncol = 1) + 14 | plot_annotation(tag_levels = "a") & 15 | theme( 16 | plot.margin = margin(), 17 | panel.spacing = unit(1, "mm"), 18 | legend.title = element_blank(), 19 | plot.tag = element_text(size = 9, face = "bold")) 20 | 21 | ggsave(args$pdf, fig, width = 18, height = 21, units = "cm") 22 | -------------------------------------------------------------------------------- /code/08-fig_scalability.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # rds = list.files("plts", "(rts|mbs).*\\.rds", full.names = TRUE), 3 | # pdf = "figs/rts_vs_mbs.pdf", 4 | # uts = "code/utils-plotting.R") 5 | 6 | source(args$uts) 7 | ps <- lapply(args$rds, readRDS) 8 | 9 | pat <- ".*_([a-z])\\.rds" 10 | reftyp <- gsub(pat, "\\1", basename(args$rds)) 11 | is <- split(seq_along(ps), reftyp)[c("n", "b", "k")] 12 | 13 | ps <- lapply(is, \(i) { 14 | 15 | nm <- c("rts", "mbs") 16 | for (j in seq_along(nm)) { 17 | k <- grep(nm[[j]], args$rds[i]) 18 | df <- paste0("df_", nm[[j]]) 19 | assign(df, ps[i][[k]]$data) 20 | } 21 | 22 | i <- c("method", "dim", "n", j <- c("t", "mbs")) 23 | #f <- \(.) factor(., sort(as.numeric(as.character(unique(.))))) 24 | f <- \(.) as.numeric(as.character(.)) 25 | df_rts <- df_rts %>% 26 | filter(step == "overall") %>% 27 | mutate(n = f(n)) %>% ungroup() %>% select(any_of(i)) 28 | 29 | df_mbs <- df_mbs %>% 30 | mutate(n = f(n)) %>% ungroup() %>% select(any_of(i)) 31 | 32 | df <- full_join( 33 | df_rts, df_mbs, 34 | by = setdiff(i, j)) %>% 35 | filter(!is.na(t), !is.na(mbs)) 36 | pal <- .methods_pal[levels(df$method)] 37 | 38 | p <- ggplot(df, aes(t, mbs, col = method, fill = method)) + 39 | scale_color_manual(values = pal) + 40 | scale_fill_manual(values = pal) + 41 | facet_grid(~ dim) + 42 | geom_point(alpha = 0.8) + 43 | geom_path(size = 0.2) + 44 | scale_x_continuous("runtime (s)", limits = c(0, NA), trans = "sqrt") + 45 | scale_y_continuous("memory usage (MBs)", limits = c(0, NA)) 46 | 47 | .prettify(p) + theme( 48 | panel.grid.major = element_line(color = "grey", size = 0.2)) 49 | }) 50 | 51 | fig <- wrap_plots(ps, ncol = 1) + 52 | plot_annotation(tag_levels = "a") & 53 | theme( 54 | plot.margin = margin(), 55 | panel.spacing = unit(1, "mm"), 56 | legend.title = element_blank(), 57 | plot.tag = element_text(size = 9, face = "bold")) 58 | 59 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm") 60 | -------------------------------------------------------------------------------- /code/08-fig_scatters.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # pdf = "figs/scatters.pdf", 4 | # rds = list.files("plts", "scatters.*\\.rds", full.names = TRUE)) 5 | 6 | source(args$uts) 7 | 8 | ps <- lapply(args$rds, readRDS) 9 | ps[[1]]$facet$params$nrow <- 2 10 | ps[[2]]$facet$params$nrow <- 1 11 | 12 | fig <- 13 | wrap_plots(ps, ncol = 1, heights = c(2, 1)) + 14 | plot_annotation(tag_levels = "a") & 15 | scale_x_sqrt(breaks = c(0.2, 0.5, 1)) & 16 | theme( 17 | plot.margin = margin(r = 1, unit = "mm"), 18 | plot.tag = element_text(size = 9, face = "bold")) 19 | 20 | ggsave(args$pdf, fig, width = 20, height = 11, units = "cm") 21 | -------------------------------------------------------------------------------- /code/08-fig_stat1d.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # pdf = "figs/stat1d.pdf", 4 | # rds = list.files("plts", "stat_1d_by_reftyp-boxplot.*\\.rds", full.names = TRUE)) 5 | 6 | source(args$uts) 7 | 8 | ps <- lapply(c("n", "b", "k"), function(type) { 9 | pat <- sprintf(",%s,", type) 10 | fns <- grep(pat, args$rds, value = TRUE) 11 | lapply(rev(fns), readRDS) %>% 12 | wrap_plots(ncol = 1) + 13 | plot_layout(guides = "collect") + 14 | plot_annotation(tag_levels = "a") & 15 | theme( 16 | panel.spacing = unit(1, unit = "mm"), 17 | plot.margin = margin(t = 1, unit = "mm"), 18 | plot.tag = element_text(size = 9, face = "bold")) 19 | }) 20 | 21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...) 22 | fig <- structure(ps, class = c("gglist", "ggplot")) 23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm") 24 | -------------------------------------------------------------------------------- /code/08-fig_stat2d.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # pdf = "figs/stat2d.pdf", 4 | # rds = list.files("plts", "stat_2d_by_reftyp-boxplot.*\\.rds", full.names = TRUE)) 5 | 6 | source(args$uts) 7 | 8 | ps <- lapply(c("n", "b", "k"), function(type) { 9 | pat <- sprintf(",%s,", type) 10 | fns <- grep(pat, args$rds, value = TRUE) 11 | lapply(rev(fns), readRDS) %>% 12 | wrap_plots(ncol = 1) + 13 | plot_layout(guides = "collect") + 14 | plot_annotation(tag_levels = "a") & 15 | theme( 16 | panel.spacing = unit(1, unit = "mm"), 17 | plot.margin = margin(t = 1, unit = "mm"), 18 | plot.tag = element_text(size = 9, face = "bold")) 19 | }) 20 | 21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...) 22 | fig <- structure(ps, class = c("gglist", "ggplot")) 23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm") -------------------------------------------------------------------------------- /code/08-fig_summaries.R: -------------------------------------------------------------------------------- 1 | # args <- list( 2 | # uts = "code/utils-plotting.R", 3 | # pdf = "figs/summaries.pdf", 4 | # rds = file.path("plts", c( 5 | # "qc_ref-correlations.rds", 6 | # "stat_1d_by_stat1d-correlations,ks.rds", 7 | # "stat_1d_by_stat1d-mds,ks.rds", 8 | # "stat_1d_by_stat1d-pca,ks.rds"))) 9 | 10 | source(args$uts) 11 | ps <- lapply(args$rds, readRDS) 12 | 13 | ps[c(1, 2)] <- lapply(ps[c(1, 2)], "+", theme( 14 | axis.text.x = element_text(size = 3, angle = 30))) 15 | 16 | ps[[3]]$layers[[2]]$aes_params$size <- 1.25 17 | ps[[3]] <- ps[[3]] + theme(legend.position = "right") 18 | ps[[3]] <- wrap_elements(full = ps[[3]]) 19 | 20 | ps[[4]][[1]] <- ps[[4]][[1]] + 21 | theme(legend.position = "none") 22 | ps[[4]][[1]]$coordinates$expand <- TRUE 23 | ps[[4]][[2]]$layers <- ps[[4]][[2]]$layers[-4] 24 | ps[[4]][[2]] <- ps[[4]][[2]] + 25 | theme( 26 | legend.position = c(0.95, 0.5), 27 | legend.justification = c(1, 0.5)) 28 | 29 | ps[[4]] <- ps[[4]] & theme(plot.tag = element_blank()) 30 | ps[[4]] <- wrap_elements(full = ps[[4]]) 31 | 32 | fig <- (wrap_plots(ps[1:3], ncol = 1) | 33 | wrap_plots(ps[4], ncol = 1)) + 34 | plot_layout(widths = c(1, 2.5)) + 35 | plot_annotation(tag_levels = "a") & 36 | theme( 37 | plot.margin = margin(l = 1, t = 1, unit = "mm"), 38 | plot.tag = element_text(size = 9, face = "bold")) 39 | 40 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm") 41 | -------------------------------------------------------------------------------- /code/09-write_fns.R: -------------------------------------------------------------------------------- 1 | pat <- paste0("^", wcs$pat) 2 | fns <- list.files("outs", pat, full.names = TRUE) 3 | writeLines(fns, args$txt) -------------------------------------------------------------------------------- /code/09-write_obj.R: -------------------------------------------------------------------------------- 1 | source(args$fun) 2 | fns <- readLines(args$txt) 3 | res <- .read_res(fns) 4 | saveRDS(res, args$rds) -------------------------------------------------------------------------------- /code/10-session_info.R: -------------------------------------------------------------------------------- 1 | x <- c( 2 | # GENERAL 3 | # Bioconductor 4 | "Biobase", 5 | "BiocParallel", 6 | "CellBench", 7 | "ExperimentHub", 8 | "GEOquery", 9 | "scater", 10 | "scran", 11 | "SingleCellExperiment", 12 | "TENxPBMCData", 13 | "variancePartition", 14 | "waddR", 15 | # CRAN 16 | "cluster", 17 | "dplyr", 18 | "emdist", 19 | "jsonlite", 20 | "ggplot2", 21 | "ggpubr", 22 | "ggrastr", 23 | "Matrix", 24 | "matrixStats", 25 | "patchwork", 26 | "Peacock.test", 27 | "RANN", 28 | "RColorBrewer", 29 | "Seurat", 30 | "SeuratData", 31 | "tidyr", 32 | "tidytext", 33 | # SIMULATION 34 | # Bioconductor 35 | "BASiCS", 36 | "muscat", 37 | "scDD", 38 | "splatter", 39 | "SPsimSeq", 40 | "zinbwave", 41 | # GitHub 42 | "JINJINT/ESCO", 43 | "suke18/POWSC", 44 | "bvieth/powsimR", 45 | "Vivianstats/scDesign", 46 | "JSB-UCLA/scDesign2", 47 | "YosefLab/SymSim", 48 | "statOmics/zingeR", 49 | # INTEGRATION 50 | # Bioconductor 51 | "batchelor", 52 | "CellMixS", 53 | "edgeR", 54 | "limma", 55 | "sva", 56 | # CRAN 57 | "harmony", 58 | # CLUSTERING 59 | # Bioconductor 60 | "ConsensusClusterPlus", 61 | "flowCore", 62 | "monocle", 63 | "SC3", 64 | # CRAN 65 | "clue", 66 | "Rtsne", 67 | # GitHub 68 | "VCCRI/cidr", 69 | "JustinaZ/pcaReduce" 70 | ) 71 | 72 | # # TO INSTALL ALL DEPENDENCIES: 73 | # if (!require(BiocManager)) 74 | # install.packages("BiocManager") 75 | # for (. in x) 76 | # if (!require(., character.only = TRUE)) 77 | # BiocManager::install(., ask = FALSE, update = TRUE) 78 | 79 | # TO CAPTURE SESSIO INFO: 80 | for (. in x) { 81 | . <- gsub(".*/", "", .) 82 | suppressPackageStartupMessages( 83 | library(., character.only = TRUE)) 84 | } 85 | si <- capture.output(sessionInfo()) 86 | writeLines(si, args[[1]]) 87 | -------------------------------------------------------------------------------- /code/utils-clustering.R: -------------------------------------------------------------------------------- 1 | # Hungarian algorithm 2 | # (LM Weber, Aug '16) 3 | 4 | .hungarian <- function(clus_algorithm, clus_truth) { 5 | 6 | # number of detected clusters 7 | n_clus <- length(table(clus_algorithm)) 8 | 9 | # remove unassigned cells (NA's in clus_truth) 10 | unassigned <- is.na(clus_truth) 11 | clus_algorithm <- clus_algorithm[!unassigned] 12 | clus_truth <- clus_truth[!unassigned] 13 | if (length(clus_algorithm) != length(clus_truth)) 14 | warning("vector lengths are not equal") 15 | 16 | tbl_algorithm <- table(clus_algorithm) 17 | tbl_truth <- table(clus_truth) 18 | 19 | # detected clusters in rows, true populations in columns 20 | pr_mat <- re_mat <- F1_mat <- matrix(NA, nrow = length(tbl_algorithm), ncol = length(tbl_truth)) 21 | 22 | for (i in 1:length(tbl_algorithm)) { 23 | for (j in 1:length(tbl_truth)) { 24 | i_int <- as.integer(names(tbl_algorithm))[i] # cluster number from algorithm 25 | j_int <- as.integer(names(tbl_truth))[j] # cluster number from true labels 26 | 27 | true_positives <- sum(clus_algorithm == i_int & clus_truth == j_int, na.rm = TRUE) 28 | detected <- sum(clus_algorithm == i_int, na.rm = TRUE) 29 | truth <- sum(clus_truth == j_int, na.rm = TRUE) 30 | 31 | # calculate precision, recall, and F1 score 32 | precision_ij <- true_positives / detected 33 | recall_ij <- true_positives / truth 34 | F1_ij <- 2 * (precision_ij * recall_ij) / (precision_ij + recall_ij) 35 | 36 | if (F1_ij == "NaN") F1_ij <- 0 37 | 38 | pr_mat[i, j] <- precision_ij 39 | re_mat[i, j] <- recall_ij 40 | F1_mat[i, j] <- F1_ij 41 | } 42 | } 43 | 44 | # put back cluster labels (note some row names may be missing due to removal of unassigned cells) 45 | rownames(pr_mat) <- rownames(re_mat) <- rownames(F1_mat) <- names(tbl_algorithm) 46 | colnames(pr_mat) <- colnames(re_mat) <- colnames(F1_mat) <- names(tbl_truth) 47 | 48 | # match labels using Hungarian algorithm applied to matrix of F1 scores (Hungarian 49 | # algorithm calculates an optimal one-to-one assignment) 50 | 51 | # use transpose matrix (Hungarian algorithm assumes n_rows <= n_cols) 52 | F1_mat_trans <- t(F1_mat) 53 | 54 | if (nrow(F1_mat_trans) <= ncol(F1_mat_trans)) { 55 | # if fewer (or equal no.) true populations than detected clusters, can match all true populations 56 | labels_matched <- clue::solve_LSAP(F1_mat_trans, maximum = TRUE) 57 | # use row and column names since some labels may have been removed due to unassigned cells 58 | labels_matched <- as.numeric(colnames(F1_mat_trans)[as.numeric(labels_matched)]) 59 | names(labels_matched) <- rownames(F1_mat_trans) 60 | 61 | } else { 62 | # if fewer detected clusters than true populations, use transpose matrix and assign 63 | # NAs for true populations without any matching clusters 64 | labels_matched_flipped <- clue::solve_LSAP(F1_mat, maximum = TRUE) 65 | # use row and column names since some labels may have been removed due to unassigned cells 66 | labels_matched_flipped <- as.numeric(rownames(F1_mat_trans)[as.numeric(labels_matched_flipped)]) 67 | names(labels_matched_flipped) <- rownames(F1_mat) 68 | 69 | labels_matched <- rep(NA, ncol(F1_mat)) 70 | names(labels_matched) <- rownames(F1_mat_trans) 71 | labels_matched[as.character(labels_matched_flipped)] <- as.numeric(names(labels_matched_flipped)) 72 | } 73 | 74 | # precision, recall, F1 score, and number of cells for each matched cluster 75 | pr <- re <- F1 <- n_cells_matched <- rep(NA, ncol(F1_mat)) 76 | names(pr) <- names(re) <- names(F1) <- names(n_cells_matched) <- names(labels_matched) 77 | 78 | for (i in 1:ncol(F1_mat)) { 79 | # set to 0 if no matching cluster (too few detected clusters); use character names 80 | # for row and column indices in case subsampling completely removes some clusters 81 | pr[i] <- ifelse(is.na(labels_matched[i]), 0, pr_mat[as.character(labels_matched[i]), names(labels_matched)[i]]) 82 | re[i] <- ifelse(is.na(labels_matched[i]), 0, re_mat[as.character(labels_matched[i]), names(labels_matched)[i]]) 83 | F1[i] <- ifelse(is.na(labels_matched[i]), 0, F1_mat[as.character(labels_matched[i]), names(labels_matched)[i]]) 84 | 85 | n_cells_matched[i] <- sum(clus_algorithm == labels_matched[i], na.rm = TRUE) 86 | } 87 | 88 | # means across populations 89 | mean_pr <- mean(pr) 90 | mean_re <- mean(re) 91 | mean_F1 <- mean(F1) 92 | 93 | return(list(n_clus = n_clus, 94 | pr = pr, 95 | re = re, 96 | F1 = F1, 97 | labels_matched = labels_matched, 98 | n_cells_matched = n_cells_matched, 99 | mean_pr = mean_pr, 100 | mean_re = mean_re, 101 | mean_F1 = mean_F1)) 102 | } -------------------------------------------------------------------------------- /code/utils-integration.R: -------------------------------------------------------------------------------- 1 | .cms_ldf <- \(df) df %>% 2 | group_by(refset, sim_method) %>% 3 | mutate( 4 | # center around 0 5 | cms = cms - 0.5, 6 | # scale to range 1 7 | ldf = ldf / diff(range(ldf)), 8 | # center around 0 9 | ldf = ldf - min(ldf) - 0.5) 10 | 11 | .bcs <- \(df, n = 1) { 12 | stopifnot(any(n == c(1, 2))) 13 | .avg <- \(df) summarise(df, 14 | across(c(cms, ldf), mean), 15 | .groups = "drop_last") 16 | df <- df %>% 17 | group_by(refset, sim_method, batch_method, batch) %>% 18 | # average across cells 19 | .avg() 20 | if (n == 2) 21 | # average across batches 22 | df <- df %>% .avg() 23 | # batch correction score 24 | df %>% mutate(bcs = abs(cms) + abs(ldf)) 25 | } 26 | 27 | .batch_labs <- c( 28 | cms = "CMS*", bcs = "BCS", 29 | ldf = expression(Delta~"LDF*")) -------------------------------------------------------------------------------- /code/utils-plotting.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages({ 2 | library(dplyr) 3 | library(ggplot2) 4 | library(ggpubr) 5 | library(ggrastr) 6 | library(ggrepel) 7 | library(magrittr) 8 | library(patchwork) 9 | library(RColorBrewer) 10 | library(tidyr) 11 | library(tidytext) 12 | }) 13 | 14 | .read_res <- function(x) { 15 | df <- bind_rows(lapply(x, readRDS)) %>% 16 | replace_na(list(method = "ref")) %>% 17 | mutate(method = droplevels(factor(method, names(.methods_pal)))) 18 | if (all(c("datset", "subset") %in% names(df))) 19 | df <- mutate(df, refset = paste(datset, subset, sep = ",")) 20 | if ("id" %in% names(df)) 21 | df <- mutate(df, 22 | group = relevel(factor(group), ref = "global"), 23 | id = case_when(group == "global" ~ "global", TRUE ~ id), 24 | id = relevel(factor(id), ref = "global")) 25 | if (all(c("refset", "group") %in% names(df))) 26 | df <- group_by(df, refset) %>% 27 | mutate(.group = as.character(group)) %>% 28 | mutate(reftyp = ifelse(any(.group == "batch"), "b", 29 | ifelse(any(.group == "cluster"), "k", "n")), 30 | reftyp = factor(reftyp, c("n", "b", "k"))) %>% 31 | select(-.group) 32 | if (sum(grepl("method", names(df))) > 1) 33 | df <- rename(df, sim_method = method) 34 | if (any(grepl("^metric", names(df)))) 35 | df <- mutate(df, across( 36 | starts_with("metric"), 37 | ~droplevels(factor(.x, 38 | levels = names(.metrics_lab), 39 | labels = .metrics_lab)))) 40 | return(df) 41 | } 42 | 43 | .filter_res <- function(df) df %>% 44 | mutate( 45 | .metric = names(.metrics_lab)[ 46 | match(metric, .metrics_lab)], 47 | across(c(group, id), as.character)) %>% 48 | filter( 49 | # keep everything for type 'n' 50 | reftyp == "n" | 51 | # keep all groupings for global summaries, 52 | # gene-level (-correlations), cell-level 53 | # (-log-library size & cell detection frequency 54 | .metric %in% .none_metrics | 55 | .metric %in% setdiff(.gene_metrics, "gene_cor") | 56 | .metric %in% setdiff(.cell_metrics, c("cell_lls", "cell_frq")) | 57 | # for gene-gene correlation, keep global only 58 | (group == id & .metric == "gene_cor") | 59 | # for log-library size & cell detection 60 | # frequency, keep group-level results only 61 | (group != id & grepl("cell_(lls|frq)", .metric))) %>% 62 | ungroup() %>% 63 | select(-.metric) %>% 64 | mutate(across(c(metric, method), droplevels)) 65 | 66 | # repeatedly average statistics, e.g., 67 | # across groups, subsets, datsets, etc. 68 | .avg <- \(df, n) { 69 | .fun <- \(df) summarise_at(df, "stat", mean, na.rm = TRUE) 70 | res <- Reduce(\(df, foo) .fun(df), seq(n), init = df, accumulate = TRUE) 71 | ungroup(res[[n + 1]]) 72 | } 73 | 74 | .prettify <- function(plt, thm = NULL, base_size = 6) 75 | { 76 | col <- !is.null(plt$labels$colour) && !is.numeric(plt$data[[plt$labels$colour]]) 77 | fill <- !is.null(plt$labels$fill) && !is.numeric(plt$data[[plt$labels$fill]]) 78 | 79 | plt <- plt + if (col && fill) { 80 | tmp <- if (plt$labels$colour != plt$labels$fill) { 81 | guide_legend(order = 1, override.aes = list(alpha = 1, size = 2, stroke = 0.5, shape = 21)) 82 | } else "none" 83 | guides( 84 | col = tmp, 85 | fill = guide_legend(override.aes = list(alpha = 1, size = 2, col = NA, shape = 21))) 86 | } else if (col) { guides( 87 | col = guide_legend(override.aes = list(alpha = 1, size = 2, shape = 21))) 88 | } else if (fill) { guides( 89 | fill = guide_legend(override.aes = list(alpha = 1, size = 2))) 90 | } 91 | plt <- plt + 92 | theme_linedraw(base_size) + theme( 93 | panel.grid = element_blank(), 94 | legend.key.size = unit(0.5, "lines"), 95 | strip.background = element_blank(), 96 | strip.text = element_text(color = "black")) 97 | return(plt + thm) 98 | } 99 | 100 | # aesthetics ---- 101 | 102 | .groups_pal <- c(global = "grey40", batch = "grey80", cluster = "grey80", group = "grey80") 103 | 104 | pat <- ".*-sim_data-(.*)\\.R" 105 | methods <- gsub(pat, "\\1", list.files("code", pat)) 106 | .methods_pal <- colorRampPalette(brewer.pal(12, "Paired"))(length(methods)) 107 | .methods_pal <- c(ref = "black", setNames(.methods_pal, methods)) 108 | 109 | pat <- ".*calc_qc-(.*)\\.R" 110 | .metrics <- gsub(pat, "\\1", list.files("code", pat)) 111 | .none_metrics <- c("gene_pve", "cell_cms", "cell_sw") 112 | .metrics <- setdiff(.metrics, .none_metrics) 113 | .gene_metrics <- grep("gene", .metrics, value = TRUE) 114 | .cell_metrics <- grep("cell", .metrics, value = TRUE) 115 | 116 | .metrics_lab <- c( 117 | gene_avg = "average of logCPM", 118 | gene_var = "variance of logCPM", 119 | gene_cv = "coefficient of variation", 120 | gene_frq = "gene detection frequency", 121 | gene_cor = "gene-to-gene correlation", 122 | cell_lls = "log-library size", 123 | cell_frq = "cell detection frequency", 124 | cell_cor = "cell-to-cell correlation", 125 | cell_ldf = "local density factor", 126 | cell_pcd = "cell-to-cell distance", 127 | cell_knn = "KNN occurences", 128 | gene_pve = "percent variance explained", 129 | cell_cms = "cell-specific mixing score", 130 | cell_sw = "silhouette width") 131 | 132 | n <- 3 133 | .metrics_pal <- c( 134 | setNames(hcl.colors(n*length(.gene_metrics), "Reds" )[seq(1, n*length(.gene_metrics), n)], .gene_metrics), 135 | setNames(hcl.colors(n*length(.cell_metrics), "Blues")[seq(1, n*length(.cell_metrics), n)], .cell_metrics), 136 | setNames(brewer.pal(1+length(.none_metrics), "Greens")[-1], .none_metrics)) 137 | names(.metrics_pal) <- .metrics_lab 138 | 139 | pat <- ".*-stat_1d-(.*)\\.R" 140 | .stats1d <- gsub(pat, "\\1", list.files("code", pat)) 141 | .stats1d_lab <- c(ks = "KS", ws = "W") 142 | 143 | pat <- ".*-stat_2d-(.*)\\.R" 144 | .stats2d <- gsub(pat, "\\1", list.files("code", pat)) 145 | .stats2d_lab <- c(emd = "EMD", ks2 = "KS") 146 | -------------------------------------------------------------------------------- /code/utils-summaries.R: -------------------------------------------------------------------------------- 1 | # split cell indices into list 2 | # - global = list w/ one element (all cells) 3 | # - batch = list w/ one element per batch (optional) 4 | # - cluster = list w/ one element per cluster (optional) 5 | .split_cells <- function(sce, 6 | i = c("global", "batch", "cluster")) 7 | { 8 | names(j) <- j <- intersect(i, names(colData(sce))) 9 | cs <- lapply(j, function(.) { 10 | ids <- droplevels(factor(sce[[.]])) 11 | split(seq(ncol(sce)), ids) 12 | }) 13 | if ("global" %in% i) 14 | cs <- c(list(global = list(foo = seq(ncol(sce)))), cs) 15 | return(cs) 16 | } 17 | 18 | # split genes by group = "cluster","batch", "sample". 19 | # Then per group calculate the qc with the FUN function(which is a metric) 20 | # returns a dataframe with cols: group | id | metric_name 21 | .calc_qc <- function(sce, fun, 22 | n_genes = NULL, n_cells = NULL, 23 | groups = c("global", "cluster", "batch")) 24 | { 25 | suppressPackageStartupMessages({ 26 | library(dplyr) 27 | library(purrr) 28 | }) 29 | if (is.null(groups)) { 30 | groups <- eval(formals(.calc_qc)$groups) 31 | } else { 32 | groups <- match.arg(groups, several.ok = TRUE) 33 | } 34 | # split cells into groups 35 | idx <- .split_cells(sce, groups) 36 | # downsample to at most 'n_genes' in total 37 | if (!is.null(n_genes)) { 38 | n_genes <- min(n_genes, nrow(sce)) 39 | gs <- sample(nrow(sce), n_genes) 40 | sce <- sce[gs, ] 41 | } 42 | # downsample to at most 'n_cells' per group 43 | if (!is.null(n_cells)) { 44 | idx <- map_depth(idx, -1, ~{ 45 | n_cells <- min(n_cells, length(.x)) 46 | sample(.x, n_cells) 47 | }) 48 | } 49 | # compute QC metric per group 50 | res <- map_depth(idx, -1, ~{ 51 | data.frame( 52 | row.names = NULL, 53 | value = fun(sce[, .x])) 54 | }) 55 | # join into single table 56 | res <- map_depth(res, 1, bind_rows, .id = "id") 57 | res <- bind_rows(res, .id = "group") 58 | if (nrow(res) != 0) return(res) 59 | } -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | R: "R_LIBS_USER=~/libs/bioc313/ /usr/local/R/R-4.1.0/bin/R" -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/data/.gitkeep -------------------------------------------------------------------------------- /figs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/figs/.gitkeep -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/logs/.gitkeep -------------------------------------------------------------------------------- /meta/methods.json: -------------------------------------------------------------------------------- 1 | { 2 | "BASiCS": ["b"], 3 | "ESCO": ["n", "b", "k"], 4 | "hierarchicell": ["n", "b"], 5 | "muscat": ["n", "b", "k"], 6 | "POWSC": ["n", "k"], 7 | "powsimR": ["n"], 8 | "scDD": ["n"], 9 | "scDesign": ["n"], 10 | "scDesign2": ["n", "k"], 11 | "SCRIP": ["n", "b", "k"], 12 | "SPARSim": ["n", "b"], 13 | "SPsimSeq": ["n", "b"], 14 | "splatter": ["n"], 15 | "SymSim": ["n", "b"], 16 | "ZINB-WaVE": ["n", "b", "k"], 17 | "zingeR": ["n"] 18 | } -------------------------------------------------------------------------------- /meta/runtimes.json: -------------------------------------------------------------------------------- 1 | { 2 | "panc8,inDrop1.beta": { 3 | "type": "n", 4 | "n_genes": ["4000", "2000", "1000", "500"], 5 | "n_cells": ["800", "400", "200", "100"] 6 | }, 7 | "Mereu20,CD4T": { 8 | "type": "b", 9 | "n_genes": ["4000", "2000", "1000", "500"], 10 | "n_cells": ["2600", "1300", "650", "325"] 11 | }, 12 | "Zheng17,foo": { 13 | "type": "k", 14 | "n_genes": ["3200", "1600", "800", "400"], 15 | "n_cells": ["2600", "1300", "650", "325"] 16 | } 17 | } -------------------------------------------------------------------------------- /meta/subsets.json: -------------------------------------------------------------------------------- 1 | { 2 | "CellBench": { 3 | "H2228": { 4 | "type": "b", 5 | "n_cells": "400", 6 | "n_genes": "4000", 7 | "cluster": "H2228" 8 | }, 9 | "CELSeq": { 10 | "type": "k", 11 | "n_cells": "400", 12 | "n_genes": "4000", 13 | "batch": "sc_celseq" 14 | } 15 | }, 16 | "Ding20": { 17 | "10x.InhibNeuron": { 18 | "type": "n", 19 | "n_cells": null, 20 | "n_genes": "4000", 21 | "batch": "10x Chromium", 22 | "cluster": "Inhibitory neuron" 23 | }, 24 | "ExcitNeuron": { 25 | "type": "b", 26 | "n_cells": "200", 27 | "n_genes": "4000", 28 | "cluster": "Excitatory neuron" 29 | }, 30 | "DroNcSeq": { 31 | "type": "k", 32 | "n_cells": "200", 33 | "n_genes": "4000", 34 | "batch": "DroNc-seq" 35 | } 36 | }, 37 | "Gierahn17" : { 38 | "foo": { 39 | "type": "n", 40 | "n_cells": null, 41 | "n_genes": "4000" 42 | } 43 | }, 44 | "Kang18": { 45 | "1015": { 46 | "type": "k", 47 | "n_cells": "400", 48 | "n_genes": "4000", 49 | "batch": "1015" 50 | }, 51 | "B": { 52 | "type": "n", 53 | "n_cells": null, 54 | "n_genes": "4000", 55 | "batch": "1015", 56 | "cluster": "B cells" 57 | }, 58 | "NK": { 59 | "type": "n", 60 | "n_cells": null, 61 | "n_genes": "4000", 62 | "batch": "1015", 63 | "cluster": "NK cells" 64 | } 65 | }, 66 | "Koh16": { 67 | "foo": { 68 | "type": "k", 69 | "n_cells": null, 70 | "n_genes": "4000" 71 | } 72 | }, 73 | "MCA20.gland": { 74 | "T": { 75 | "type": "b", 76 | "n_cells": null, 77 | "n_genes": 4000, 78 | "cluster": "T cell_Cd8b1 high" 79 | } 80 | }, 81 | "MCA20.lung": { 82 | "AT2": { 83 | "type": "b", 84 | "n_cells": null, 85 | "n_genes": 4000, 86 | "cluster": "AT2 Cell" 87 | } 88 | }, 89 | "Mereu20":{ 90 | "CD4T": { 91 | "type": "b", 92 | "n_cells": "200", 93 | "n_genes": "4000", 94 | "cluster": "CD4 T cells" 95 | }, 96 | "ddSeq": { 97 | "type": "k", 98 | "n_cells": "400", 99 | "n_genes": "4000", 100 | "batch": "ddSEQ" 101 | } 102 | }, 103 | "Oetjen18": { 104 | "R": { 105 | "type": "n", 106 | "n_cells": null, 107 | "n_genes": "4000", 108 | "batch": "R" 109 | }, 110 | "foo": { 111 | "type": "b", 112 | "n_cells": "200", 113 | "n_genes": "4000" 114 | } 115 | }, 116 | "panc8": { 117 | "inDrop1.beta": { 118 | "type": "n", 119 | "n_cells": null, 120 | "n_genes": "4000", 121 | "batch": "indrop1", 122 | "cluster": "beta" 123 | }, 124 | "inDrop.ductal": { 125 | "type": "b", 126 | "n_cells": null, 127 | "n_genes": "4000", 128 | "batch": ["indrop1", "indrop2", "indrop3", "indrop4"], 129 | "cluster": "ductal" 130 | }, 131 | "SmartSeq2": { 132 | "type": "k", 133 | "n_cells": "400", 134 | "n_genes": "4000", 135 | "batch": "smartseq2" 136 | } 137 | }, 138 | "TabulaMuris": { 139 | "limb.MSCs": { 140 | "type": "n", 141 | "n_cells": null, 142 | "n_genes": "4000", 143 | "tissue": "Limb_Muscle", 144 | "cluster": "mesenchymal stem cell" 145 | }, 146 | "spleen": { 147 | "type": "k", 148 | "n_cells": "400", 149 | "n_genes": "4000", 150 | "tissue": "Spleen" 151 | } 152 | }, 153 | "Tung17": { 154 | "foo": { 155 | "type": "b", 156 | "n_cells": "400", 157 | "n_genes": "4000" 158 | }, 159 | "NA19101": { 160 | "type": "n", 161 | "n_cells": null, 162 | "n_genes": "4000", 163 | "batch": "NA19101" 164 | } 165 | }, 166 | "Zheng17": { 167 | "foo": { 168 | "type": "k", 169 | "n_cells": "400", 170 | "n_genes": "4000" 171 | }, 172 | "HSCs": { 173 | "type": "n", 174 | "n_cells": null, 175 | "n_genes": "4000", 176 | "cluster": "HSCs CD34+" 177 | }, 178 | "Monocytes": { 179 | "type": "n", 180 | "n_cells": "400", 181 | "n_genes": "4000", 182 | "cluster": "Monocytes CD14+" 183 | } 184 | } 185 | } -------------------------------------------------------------------------------- /outs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/outs/.gitkeep -------------------------------------------------------------------------------- /plts/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/plts/.gitkeep -------------------------------------------------------------------------------- /schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/schematic.png --------------------------------------------------------------------------------