├── .Rprofile
├── .gitignore
├── README.md
├── Snakefile
├── code
    ├── 00-get_data-CellBench.R
    ├── 00-get_data-Ding20.R
    ├── 00-get_data-Gierahn17.R
    ├── 00-get_data-Kang18.R
    ├── 00-get_data-Koh16.R
    ├── 00-get_data-MCA20.gland.R
    ├── 00-get_data-MCA20.lung.R
    ├── 00-get_data-Mereu20.R
    ├── 00-get_data-Oetjen18.R
    ├── 00-get_data-TabulaMuris.R
    ├── 00-get_data-Tung17.R
    ├── 00-get_data-Zheng17.R
    ├── 00-get_data-panc8.R
    ├── 00-get_data.R
    ├── 01-fil_data.R
    ├── 02-sub_data.R
    ├── 03-est_pars-BASiCS.R
    ├── 03-est_pars-ESCO.R
    ├── 03-est_pars-POWSC.R
    ├── 03-est_pars-SCRIP.R
    ├── 03-est_pars-SPARSim.R
    ├── 03-est_pars-SPsimSeq.R
    ├── 03-est_pars-SymSim.R
    ├── 03-est_pars-ZINB-WaVE.R
    ├── 03-est_pars-hierarchicell.R
    ├── 03-est_pars-muscat.R
    ├── 03-est_pars-powsimR.R
    ├── 03-est_pars-scDD.R
    ├── 03-est_pars-scDesign.R
    ├── 03-est_pars-scDesign2.R
    ├── 03-est_pars-splatter.R
    ├── 03-est_pars-zingeR.R
    ├── 03-est_pars.R
    ├── 04-sim_data-BASiCS.R
    ├── 04-sim_data-ESCO.R
    ├── 04-sim_data-POWSC.R
    ├── 04-sim_data-SCRIP.R
    ├── 04-sim_data-SPARSim.R
    ├── 04-sim_data-SPsimSeq.R
    ├── 04-sim_data-SymSim.R
    ├── 04-sim_data-ZINB-WaVE.R
    ├── 04-sim_data-hierarchicell.R
    ├── 04-sim_data-muscat.R
    ├── 04-sim_data-powsimR.R
    ├── 04-sim_data-scDD.R
    ├── 04-sim_data-scDesign.R
    ├── 04-sim_data-scDesign2.R
    ├── 04-sim_data-splatter.R
    ├── 04-sim_data-zingeR.R
    ├── 04-sim_data.R
    ├── 05-calc_batch-ComBat.R
    ├── 05-calc_batch-Harmony.R
    ├── 05-calc_batch-Seurat.R
    ├── 05-calc_batch-fastMNN.R
    ├── 05-calc_batch-limma.R
    ├── 05-calc_batch-mnnCorrect.R
    ├── 05-calc_batch.R
    ├── 05-calc_clust-CIDR.R
    ├── 05-calc_clust-PCA+HC.R
    ├── 05-calc_clust-PCA+KM.R
    ├── 05-calc_clust-SC3.R
    ├── 05-calc_clust-SC3svm.R
    ├── 05-calc_clust-Seurat.R
    ├── 05-calc_clust-monocle.R
    ├── 05-calc_clust-pcaReduce.R
    ├── 05-calc_clust-tSNE+KM.R
    ├── 05-calc_clust.R
    ├── 05-calc_dr.R
    ├── 05-calc_qc-cell_cms.R
    ├── 05-calc_qc-cell_cor.R
    ├── 05-calc_qc-cell_frq.R
    ├── 05-calc_qc-cell_knn.R
    ├── 05-calc_qc-cell_ldf.R
    ├── 05-calc_qc-cell_lls.R
    ├── 05-calc_qc-cell_pcd.R
    ├── 05-calc_qc-cell_sw.R
    ├── 05-calc_qc-gene_avg.R
    ├── 05-calc_qc-gene_cor.R
    ├── 05-calc_qc-gene_cv.R
    ├── 05-calc_qc-gene_frq.R
    ├── 05-calc_qc-gene_pve.R
    ├── 05-calc_qc-gene_var.R
    ├── 05-calc_qc.R
    ├── 05-runtimes.R
    ├── 06-dr_batch.R
    ├── 06-eval_batch.R
    ├── 06-eval_clust.R
    ├── 06-stat_1d-ks.R
    ├── 06-stat_1d-ws.R
    ├── 06-stat_1d.R
    ├── 06-stat_2d-emd.R
    ├── 06-stat_2d-ks2.R
    ├── 06-stat_2d.R
    ├── 07-plot_batch-boxplot_by_method.R
    ├── 07-plot_batch-boxplot_dX.R
    ├── 07-plot_batch-correlations.R
    ├── 07-plot_batch-densities.R
    ├── 07-plot_batch-heatmap_by_method.R
    ├── 07-plot_batch-heatmap_by_refset.R
    ├── 07-plot_clust-boxplot_by_method.R
    ├── 07-plot_clust-boxplot_dF1.R
    ├── 07-plot_clust-correlations.R
    ├── 07-plot_clust-heatmap_by_method.R
    ├── 07-plot_corr-stat_2d.R
    ├── 07-plot_dimred.R
    ├── 07-plot_dimred_batch.R
    ├── 07-plot_memory.R
    ├── 07-plot_qc_ref-correlations.R
    ├── 07-plot_runtimes.R
    ├── 07-plot_stat_1d-scatters.R
    ├── 07-plot_stat_1d_by_reftyp-boxplot.R
    ├── 07-plot_stat_1d_by_reftyp-dimEst.R
    ├── 07-plot_stat_1d_by_reftyp-heatmap.R
    ├── 07-plot_stat_1d_by_reftyp-mds.R
    ├── 07-plot_stat_1d_by_reftyp-pca.R
    ├── 07-plot_stat_1d_by_stat1d-boxplot_by_method.R
    ├── 07-plot_stat_1d_by_stat1d-boxplot_by_metric.R
    ├── 07-plot_stat_1d_by_stat1d-correlations.R
    ├── 07-plot_stat_1d_by_stat1d-correlations_by_method.R
    ├── 07-plot_stat_1d_by_stat1d-correlations_by_metric.R
    ├── 07-plot_stat_1d_by_stat1d-dimEst.R
    ├── 07-plot_stat_1d_by_stat1d-mds.R
    ├── 07-plot_stat_1d_by_stat1d-pca.R
    ├── 07-plot_stat_2d-scatters.R
    ├── 07-plot_stat_2d_by_reftyp-boxplot.R
    ├── 07-plot_stat_2d_by_reftyp-heatmap.R
    ├── 08-fig_boxplots.R
    ├── 08-fig_clustering.R
    ├── 08-fig_heatmaps.R
    ├── 08-fig_integration.R
    ├── 08-fig_mds.R
    ├── 08-fig_memory.R
    ├── 08-fig_runtimes.R
    ├── 08-fig_scalability.R
    ├── 08-fig_scatters.R
    ├── 08-fig_stat1d.R
    ├── 08-fig_stat2d.R
    ├── 08-fig_summaries.R
    ├── 09-write_fns.R
    ├── 09-write_obj.R
    ├── 10-session_info.R
    ├── utils-clustering.R
    ├── utils-integration.R
    ├── utils-plotting.R
    └── utils-summaries.R
├── config.yaml
├── data
    └── .gitkeep
├── figs
    └── .gitkeep
├── logs
    └── .gitkeep
├── meta
    ├── methods.json
    ├── runtimes.json
    └── subsets.json
├── outs
    └── .gitkeep
├── plts
    └── .gitkeep
└── schematic.png


/.Rprofile:
--------------------------------------------------------------------------------
 1 | data.table::setDTthreads(threads = 1)
 2 | 
 3 | .get_wcs <- function(wcs) {
 4 | 	wcs <- gsub("(,)(\\w+=)", ";\\2", wcs)
 5 |     ss <- strsplit(wcs, ";")[[1]]
 6 |     ss <- sapply(ss, strsplit, "=")
 7 |     keys <- sapply(ss, .subset, 1)
 8 |     vals <- sapply(ss, .subset, 2)
 9 | 	wcs <- as.list(vals)
10 | 	names(wcs) <- keys
11 |     return(wcs)
12 | }
13 | 
14 | args <- R.utils::commandArgs(
15 | 	trailingOnly = TRUE, 
16 | 	asValues = TRUE)
17 | 
18 | if (!is.null(args$wcs)) {
19 | 	wcs <- .get_wcs(args$wcs)
20 | 	args$wcs <- NULL
21 | } else wcs <- NULL
22 | 
23 | args <- lapply(args, function(u) 
24 | 	unlist(strsplit(u, ";")))
25 | 
26 | cat("WILDCARDS:\n\n"); print(wcs); cat("\n")
27 | cat("ARGUMENTS:\n\n"); print(args)
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | .snakemake
 3 | 
 4 | ._*
 5 | *.txt
 6 | *.html
 7 | 
 8 | logs/*
 9 | data/*
10 | outs/*
11 | plts/*
12 | figs/*
13 | !*/.gitkeep


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # `Snakemake` workflow to benchmark <br> scRNA-seq data simulators
  2 | 
  3 | - [Setup](#setup)
  4 |   - [Dependencies](#dependencies)
  5 |   - [Structure](#structure)
  6 | - [Workflow](#workflow)
  7 |   - [Preproccessing](#preprocessing)
  8 |   - [Simulation](#simulation)
  9 |   - [Summaries](#summaries)
 10 |   - [Statistics](#statistics)
 11 |   - [Downstream](#downstream)
 12 |     - [Clustering](#clustering)
 13 |     - [Integration](#integration)
 14 |   - [Visualization](#visualization)
 15 | - [Customization](#customization)
 16 |   - [Datasets](#datasets)
 17 |   - [Methods](#methods)
 18 | 
 19 | *** 
 20 | 
 21 | # Setup
 22 | 
 23 | ## Dependencies
 24 | 
 25 | The current code was implemented using R v4.1.0, Bioconductor v3.13, Snakemake v5.5.0, and Python v3.6.8. All R dependencies (from GitHub, CRAN and Bioconductor) are listed under *code/10-session_info.R* and may be installed using the command contained therein.
 26 | 
 27 | ## Structure
 28 | 
 29 | * `config.yaml` specifies the R library and version to use
 30 | * `code` contains all R scripts used in the *Snakemake* workflow
 31 | * `data` contains raw, filtered and simulated scRNA-seq datasets,  
 32 | as well as simulation parameter estimates
 33 | * `meta` contains two *.json* files that specify simulation method (`methods.json`) and reference subset (`subsets.json`) configurations
 34 | * `outs` contains all results from computations (typically `data.frame`s) as *.rds* files
 35 | * `figs` contains all visual outputs as *.pdf* files, and corresponding `ggplot` objects as *.rds* files (for subsequent arrangement into 'super'-figures)
 36 | 
 37 | Simulation methods are tagged with *one or many* of the following labels, according to which scenario(s) they can accommodate: 
 38 | 
 39 | * `n` for none: no clusters or batches
 40 | * `b` for batch: multiple batches, no clusters
 41 | * `k` for cluster: multiple clusters, no batches
 42 | 
 43 | Similarly, we tag subsets (see below) with *exactly one* of these labels. This allows running each method on subsets they are capable of simulating.
 44 | 
 45 | ***
 46 | 
 47 | # Workflow
 48 | 
 49 | ![Schematic of the computational workflow used to benchmark scRNA-seq simulators. (1) Methods are grouped according to which level of complexity they can accommodate: type *n* (`singular'), *b* (batches), *k* (clusters). (2) Raw datasets are retrieved reproducibly from a public source, filtered, and subsetted into various datasets that serve as reference for (3) parameter estimation and simulation. (4) Various gene-, cell-level and global summaries are computed from reference and simulated data, and (5) compared in a one- and two-dimensional setting using two statistics each. (6) Integration and clustering methods are applied to type *b* and *k* references and simulations, respectively, and relative performances compared between reference-simulation and simulation-simulation pairs.](schematic.png)
 50 | 
 51 | ## Preprocessing
 52 | 
 53 | **1. Data retrieval**
 54 | 
 55 | Each `code/00-get_data-<datset_id>.R` script retrieves a publicly available scRNA-seq dataset through from which a *SingleCellExperiment* is constructed and written to `data/00-raw/<datset_id>.rds`
 56 | 
 57 | **2. Filtering**
 58 | 
 59 | `code/01-fil_data.R` is applied to each raw dataset as to 
 60 | 
 61 |   * remove batches, cluster, or batch-cluster instances with fewer than 50 cells (depending on the dataset's complexity)
 62 |   * keep genes with a count of at least 1 in at least 10 cells, and remove cells with fewer than 100 detected genes 
 63 |   
 64 | Filtered data are written to `data/01-fil/<datset_id>.rds`.
 65 | 
 66 | **3. Subsetting**
 67 | 
 68 | Because different methods can accommodate only some features (e.g. multiple batches or clusters, both or neither), `code/02-sub_data.R` creates specific subsets in `data/02-sub/<datset_id>.<subset_id>,rds`. We term these *ref(erence)set*s (i.e. `<datset_id>.<subset_id> = <refset_id>`), as they serve as the input reference data for simulation.
 69 | 
 70 | ## Simulation
 71 | 
 72 | **1. Parameter estimation**
 73 | 
 74 | Simulation parameters are estimated with `code/03-est_pars.R`, which in term sources a `code/03-est_pars-<method_id>.R` script that executes a method's parameter estimation function(s). In cases where no separate estimation takes place, this returns `NULL`. Parameter estimates for each combination of `<refset_id.<method_id> = <simset_id>` are written to `data/04-est/<simset_id>.rds`.
 75 | 
 76 | **2. Data simulation**
 77 | 
 78 | Data is simulated with `code/04-sim_data.R`, which in term sources a `code/04-sim_data-<method_id>.R` script that executes a method's simulation function. Simulations for each combination of `<refset_id>` and `method_id` are written to `data/05-sim/<refset_id>,<method_id>.rds`.
 79 | 
 80 | ## Summaries
 81 | 
 82 | Various quality control (QC) summaries are computed with `code/05-calc_qc.R`, which in term sources a set of `code/05-calc_qc-<metric_id>.R` scripts. QC results for reference and simulated data are written to `outs/qc_ref-<refset_id>,<metric_id>.rds` and `outs/qc_sim-<simset_id>,<metric_id>.rds`, respectively. At current, we consider:
 83 | 
 84 | **1. Gene-level**
 85 | 
 86 | * `frq`: detection frequency (i.e., fraction of cells with non-zero counts)
 87 | * `avg/var`: average/variance of logCPM
 88 | * `cv`: coefficient of variation
 89 | * `cor`: gene-to-gene correlation
 90 | 
 91 | **2. Cell-level**
 92 | 
 93 | * `frq`: detection frequency (i.e., fraction of genes with non-zero counts)
 94 | * `lls`: log-transformed library size (total counts)
 95 | * `cor`: cell-to-cell correlation
 96 | * `pcd`: cell-to-cell distance (in PCA space)
 97 | * `knn`: number of KNN occurrences
 98 | * `ldf`: local density factor
 99 | 
100 | **3. Global**
101 | 
102 | * `sw`: Silhouette width (using batch/cluster labels as classes)
103 | * `cms`: cell-specific mixing score (using batch/cluster labels as batches)
104 | * `pve`: percent variance explained (of gene expression = logCPM, by batch/cluster)
105 | 
106 | Noteworthily, we compute each summary for different groupings of cells (depending on the dataset's complexity): 
107 | 
108 | 1. globally, i.e. across all cells
109 | 2. at the batch-level, i.e. for each batch
110 | 3. at the cluster-level, i.e. for each cluster
111 | 
112 | Global summaries are computed at the batch-/cluster-level only, as they require a grouping variable. 
113 | 
114 | ## Statistics
115 | 
116 | We compare summaries between reference and simulated data in both one- (`code/06-stat_1d.R`) and two-dimensional settings (`code/06-statl_2d.R`). For the latter, every combination of gene- and cell-level metrics is considered, excluding correlations and global summaries. Furthermore, metrics are evaluated for each cell grouping, i.e. we perform a test globally, for each batch and cluster (again, depending on the dataset's complexity). Test results are written to `outs/stat_1d,<refset_id>,<metric_id>,<stat1d_id>.rds` for 1D, and `outs/stat_2d,<refset_id>,<metric1_id>,<metric2_id>,<stat2d_id>.rds` for 2D tests.
117 | 
118 | **1. One-dimensional**
119 | 
120 | * Kolmogorov-Smirnov (KS) test
121 | * Wasserstein metric
122 | 
123 | **2. Two-dimensional**
124 | 
125 | * two-dimensional KS test
126 | * Earth Mover's Distance (EMD)
127 | 
128 | ## Downstream
129 | 
130 | ### Integration
131 | 
132 | Each `05-calc_batch-x.R` script wraps around an integration method that is applied in `05-calc_batch.R` to the set of type *b* subsets. The output corrected assay data or integrated cell embeddings (depending on the method) are written to `outs/batch_ref/sim-<ref/simset_id>,<batch_method>.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_batch.R`, which computes the following set of metrics:
133 | 
134 | - cell-specific mixing score (CMS)
135 | - difference in local density factor ($\Delta$LDF) 
136 | - batch correction score (BCS)
137 | 
138 | ### Clustering
139 | 
140 | Each `05-calc_clust-x.R` script wraps around an integration method that is applied in `05-calc_clust.R` to the set of type *b* subsets. The output cluster assignments are written to `outs/clust_ref/sim-<ref/simset_id>,<clust_method>.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_clust.R`, which computes the following set of metrics:
141 | 
142 | - precision (P) and recall (R)
143 | - F1 score (harmonic mean of P and R)
144 | 
145 | ## Visualization
146 | 
147 | Finally, results are collected across `refset_id`s and `method_id`s (jointly or separated by type), and visualized in various ways using as set of `07-plot_x.R` scripts. Output figures are written to `plts` as *.pdf* files, along with the corresponding `ggplot` objects as *.rds* files. Lastly, `08-fig_x.R` scripts are used to combined various `ggplot`s into figures that are saved to `figs` as *.pdf* files.
148 | 
149 | ***
150 | 
151 | # Customization
152 | 
153 | ## Datasets
154 | 
155 | In principle, any dataset for which a `code/00-get_data-<dataset_id>.R` script exists will be accessible to the workflow. However, data will only be retrieved if the dataset appears in `meta/subsets.json`. Hence,
156 | 
157 | ### Removing
158 | 
159 | To exclude a dataset from the workflow, i) (re)move the corresponding `code/00-get_data-<dataset_id>.R` script; or, ii) remove or comment out any associated `meta/subsets.json` entries.
160 | 
161 | ### Adding
162 | 
163 | Similarly, a new dataset can be added by supplying an adequate `code/00-get_data-<dataset_id>.R` script, and adding an entry to the `meta/subsets.json` configuration that specifies the subset ID, the number of genes/cells to sample (`NULL` for all), which batch(es)/cluster(s) to retain, as well as the resulting subset's type (one of n,b,k,g).
164 | 
165 | ## Methods
166 | 
167 | The *Snakemake* will automatically include any simulation method for which a `code/03-est_pars-<method_id>.R` and `code/04-sim_data-<method_id>.R` script exists. Secondly, `meta/methods.json` will determine on which type(s) of dataset(s) each method should be run. Thus, 
168 | 
169 | ### Removing
170 | 
171 | To exclude a method from the workflow, either i) set `"<method_id>": "x"` in the `meta/methods.json` file (or anything other than n,b,k,g); or, ii) (re)move the parameter estimation and/or simulation script from the `code` directory.
172 | 
173 | ### Adding
174 | 
175 | Analogous to the above, adding a method to the benchmark requires i) adding a `code/03-est_pars-<method_id>.R` and `code/04-sim-data-<method_id>.R` script; and, ii) adding an entry for the `method_id` to the `meta/methods.json` file. Importantly, the R script for parameter estimation should handle batches (`colData` column `batch`), clusters (`colData` column `cluster`), both or neither. And the method's type(s) should be specified accordingly (`n` for neither, `b/k` for batches/clusters, `g` for groups), e.g. `"<method_id>": ["n", "k"]` for a method that supports 'singular' datasets, as well as ones with multiple clusters.
176 | 


--------------------------------------------------------------------------------
/code/00-get_data-CellBench.R:
--------------------------------------------------------------------------------
 1 | # load required packages
 2 | suppressPackageStartupMessages({
 3 |     library(CellBench)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     # load SCE from EH
10 |     sce <- load_sc_data()
11 |     ids <- names(sce)
12 |     
13 |     # get genes shared across batches
14 |     gs <- lapply(sce, rownames)
15 |     gs <- Reduce(intersect, gs)
16 |     
17 |     sce <- lapply(ids, function(id) 
18 |     {
19 |         # subset shared genes
20 |         x <- sce[[id]][gs, ]
21 |         
22 |         # simplify metadata
23 |         rowData(x) <- NULL
24 |         colData(x) <- DataFrame(
25 |             batch = id,
26 |             cluster = x$cell_line)
27 |         metadata(x) <- list()
28 |         
29 |         # make counts sparse & drop drop log-normalized counts
30 |         y <- as(counts(x), "dgCMatrix")
31 |         assays(x) <- list(counts = y)
32 |         print(dim(x))
33 |         return(x)
34 |     })
35 |     
36 |     # concatenate batches into single dataset
37 |     do.call(cbind, sce)
38 | }


--------------------------------------------------------------------------------
/code/00-get_data-Ding20.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(ExperimentHub)
 3 |     library(SingleCellExperiment)
 4 |     library(Seurat)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     eh <- ExperimentHub()
10 |     q <- query(eh, "SimBenchData")
11 |     
12 |     x <- lapply(
13 |         grep("^Neural", q$title),
14 |         function(i) {
15 |             so <- eh[[q$ah_id[i]]]
16 |             sce <- as.SingleCellExperiment(so)
17 |             # keep 1st experiment only
18 |             sce[, sce$ident == "Cortex1"]
19 |         })
20 |     
21 |     x <- do.call(cbind, x)
22 |     
23 |     colData(x) <- DataFrame(
24 |         batch = x$technology,
25 |         cluster = x$celltype)
26 |     
27 |     assays(x) <- list(counts = counts(x))
28 |     
29 |     return(x)
30 | }


--------------------------------------------------------------------------------
/code/00-get_data-Gierahn17.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(ExperimentHub)
 3 |     library(Seurat)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     eh <- ExperimentHub()
10 |     q <- query(eh, "SimBenchData")
11 |     i <- grep("HEK", q$title)
12 |     x <- eh[[q$ah_id[i]]]
13 |     
14 |     x <- as.SingleCellExperiment(x)
15 |     assays(x) <- assays(x)["counts"]
16 |     colData(x) <- NULL
17 |     
18 |     return(x)
19 | }


--------------------------------------------------------------------------------
/code/00-get_data-Kang18.R:
--------------------------------------------------------------------------------
 1 | # load required packages
 2 | suppressPackageStartupMessages({
 3 |     library(ExperimentHub)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     # load SCE from EH
10 |     eh <- ExperimentHub()
11 |     q <- query(eh, "Kang18_8vs8")
12 |     x <- eh[[q$ah_id]]
13 |     
14 |     # keep reference samples only
15 |     x <- x[, x$stim == "ctrl"]
16 |     
17 |     # drop unassigned & multiplet cells
18 |     x <- x[, !is.na(x$cell)]
19 |     x <- x[, x$multiplets == "singlet"]
20 |     
21 |     # drop undetected genes
22 |     x <- x[rowSums(counts(x)) > 0, ]
23 |     
24 |     # convert counts to sparse matrix
25 |     counts(x) <- as(counts(x), "dgCMatrix")
26 |     
27 |     # drop feature metadata
28 |     rowData(x) <- NULL
29 |     
30 |     # subset & rename cell metadata
31 |     colData(x) <- DataFrame(
32 |         batch = factor(x$ind),
33 |         cluster = x$cell,
34 |         row.names = NULL)
35 |     
36 |     return(x)
37 | }


--------------------------------------------------------------------------------
/code/00-get_data-Koh16.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(Biobase)
 3 |     library(GEOquery)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM2257302&format=file&file=GSM2257302%5FAll%5Fsamples%5Fsc%5Ftpm%2Etxt%2Egz"
10 |     fnm <- tempfile(fileext = ".txt.gz")
11 |     download.file(url, fnm, quiet = TRUE)
12 |     
13 |     # read in raw data 
14 |     y <- read.delim(gzfile(fnm), header = TRUE)
15 |     
16 |     # make feature metadata
17 |     rd <- data.frame(
18 |         ensembl = y$geneID, 
19 |         symbol = y$geneSymbol)
20 |     
21 |     ex <- grep("^gene", names(y))
22 |     y <- as.matrix(y[, -ex])
23 |     
24 |     # make cell metadata
25 |     cd <- data.frame(cluster = factor(
26 |         gsub("\\..*$", "", colnames(y)),
27 |         levels = c("H7hESC", "APS", "MPS3", "D2_25somitomere", "DLL1PXM", "LatM", "Earlysomite", "cDM", "Sclerotome"),
28 |         labels = c("hESC", "APS", "MPS", "D2.25 Somitomere", "DLL1+PXM", "LatM", "Somite", "Dermo", "Sclerotome")))
29 |     
30 |     # convert TPM to integers
31 |     y <- matrix(as.integer(y), nrow(y), ncol(y))
32 |     
33 |     # construct 'SingleCellExperiment'
34 |     SingleCellExperiment(
35 |         assays = list(counts = y),
36 |         rowData = rd, colData = cd)
37 | }


--------------------------------------------------------------------------------
/code/00-get_data-MCA20.gland.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | fun <- \()
 6 | {
 7 |     # bump timeout limit of 60 sec to 5 min
 8 |     # to assure download isn't interrupted
 9 |     options(timeout = max(300, getOption("timeout")))
10 |     
11 |     # get count data
12 |     url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b"
13 |     dir <- dirname(fnm <- tempfile())
14 |     download.file(url, fnm, quiet = TRUE)
15 |     fns <- untar(fnm, list = TRUE)
16 |     
17 |     # read in counts
18 |     sub <- grep("/MammaryGland.Virgin[0-9]", fns, value = TRUE)
19 |     untar(fnm, files = sub, exdir = dir)
20 |     ys <- lapply(file.path(dir, sub), \(.) {
21 |         y <- read.csv(., sep = " ")
22 |         as(as.matrix(y), "dgCMatrix")
23 |     })
24 |     
25 |     # get shared features
26 |     gs <- lapply(ys, rownames)
27 |     gs <- Reduce(intersect, gs)
28 |     
29 |     # join & construct SCE
30 |     y <- do.call(cbind, lapply(ys, \(.) .[gs, ]))
31 |     x <- SingleCellExperiment(assays = list(counts = y))
32 |     
33 |     # get cell metadata
34 |     url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b"
35 |     fnm <- tempfile()
36 |     download.file(url, fnm, quiet = TRUE)
37 |     cd <- read.csv(fnm)
38 |     
39 |     # drop cells w/o metadata
40 |     i <- match(colnames(x), cd$Cell.name, nomatch = 0)
41 |     x <- x[, i != 0]
42 |     cd <- cd[i, ]
43 |     
44 |     # simplify annotations
45 |     cluster <- gsub("\\(.*\\)$", "", cd$Annotation)
46 |     
47 |     # add cell metadata
48 |     colData(x) <- DataFrame(
49 |         tissue = cd$Tissue,
50 |         batch = cd$Batch, 
51 |         cluster)
52 |     
53 |     return(x)
54 | }
55 | 


--------------------------------------------------------------------------------
/code/00-get_data-MCA20.lung.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | fun <- \()
 6 | {
 7 |     # bump timeout limit of 60 sec to 5 min
 8 |     # to assure download isn't interrupted
 9 |     options(timeout = max(300, getOption("timeout")))
10 |     
11 |     # get count data
12 |     url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b"
13 |     dir <- dirname(fnm <- tempfile())
14 |     download.file(url, fnm, quiet = TRUE)
15 |     fns <- untar(fnm, list = TRUE)
16 |     
17 |     # read in counts
18 |     sub <- grep("/Lung[0-9]", fns, value = TRUE)
19 |     untar(fnm, files = sub, exdir = dir)
20 |     ys <- lapply(file.path(dir, sub), \(.) {
21 |         y <- read.csv(., sep = " ")
22 |         as(as.matrix(y), "dgCMatrix")
23 |     })
24 |     
25 |     # get shared features
26 |     gs <- lapply(ys, rownames)
27 |     gs <- Reduce(intersect, gs)
28 |     
29 |     # join & construct SCE
30 |     y <- do.call(cbind, lapply(ys, \(.) .[gs, ]))
31 |     x <- SingleCellExperiment(assays = list(counts = y))
32 |     
33 |     # get cell metadata
34 |     url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b"
35 |     fnm <- tempfile()
36 |     download.file(url, fnm, quiet = TRUE)
37 |     cd <- read.csv(fnm)
38 |     
39 |     # drop cells w/o metadata
40 |     i <- match(colnames(x), cd$Cell.name, nomatch = 0)
41 |     x <- x[, i != 0]
42 |     cd <- cd[i, ]
43 |     
44 |     # simplify annotations
45 |     cluster <- gsub("\\(.*\\)$", "", cd$Annotation)
46 |     
47 |     # add cell metadata
48 |     colData(x) <- DataFrame(
49 |         tissue = cd$Tissue,
50 |         batch = cd$Batch, 
51 |         cluster)
52 |     
53 |     return(x)
54 | }
55 | 


--------------------------------------------------------------------------------
/code/00-get_data-Mereu20.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | fun <- \()
 6 | {
 7 |     # load SCE from URL
 8 |     con <- url("https://www.dropbox.com/s/i8mwmyymchx8mn8/x.all_classified.technologies.RData?raw=1")
 9 |     x <- get(load(con))
10 |     close(con)
11 |     
12 |     # drop log-normalized counts
13 |     assay(x, "logcounts") <- NULL
14 |     
15 |     # drop feature metadata
16 |     rowData(x) <- NULL
17 |     
18 |     # subset & rename cell metadata
19 |     colData(x) <- DataFrame(
20 |         batch = x$batch,
21 |         cluster = x$nnet2)
22 |     
23 |     return(x)
24 | }


--------------------------------------------------------------------------------
/code/00-get_data-Oetjen18.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(Matrix)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \()
 7 | {
 8 |     url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE120221&format=file"
 9 |     fnm <- tempfile(fileext = ".tar")
10 |     download.file(url, fnm, quiet = TRUE)
11 | 
12 |     # unpack data
13 |     dir <- file.path(dirname(fnm), "GSE120221")
14 |     untar(fnm, exdir = dir)
15 |     
16 |     # split files by sample
17 |     fns <- list.files(dir, full.names = TRUE)
18 |     ss <- strsplit(fns, "_")
19 |     ids <- sapply(ss, .subset, 3) 
20 |     ids <- gsub("\\..*", "", ids)
21 |     fns <- split(fns, ids)
22 |     
23 |     # removes replicated samples
24 |     pat <- "(C|S1|2)|(C|Sk)"
25 |     fns <- fns[!grepl(pat, unique(ids))]
26 |     
27 |     l <- lapply(names(fns), \(.) {
28 |         # read in gene metadata
29 |         rd <- grep("genes", fns[[.]], value = TRUE)
30 |         rd <- read.delim(rd, header = FALSE)
31 |         names(rd) <- c("ensembl", "symbol")
32 |         
33 |         # read in cell metadata
34 |         cd <- grep("barcodes", fns[[.]], value = TRUE)
35 |         cd <- read.delim(cd, header = FALSE)
36 |         names(cd) <- "barcode"
37 |         cd$batch <- .
38 |         
39 |         # read in counts
40 |         y <- grep("matrix", fns[[.]], value = TRUE)
41 |         y <- readMM(y)
42 |         dimnames(y) <- list(
43 |             rd$ensembl,
44 |             cd$barcode)
45 |         
46 |         # construct SCE
47 |         SingleCellExperiment(
48 |             assays = list(counts = y),
49 |             rowData = rd, colData = cd)
50 |     })
51 |     # concatenate samples
52 |     do.call(cbind, l)
53 | }


--------------------------------------------------------------------------------
/code/00-get_data-TabulaMuris.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(ExperimentHub)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \()
 7 | {
 8 |     eh <- ExperimentHub()
 9 |     q <- query(eh, "TabulaMurisDroplet")
10 |     x <- eh[[q$ah_id]]
11 |     
12 |     x <- x[, x$mouse_id == "3-F-56"]
13 |     x <- x[, !is.na(x$cell_ontology_class)]
14 |     
15 |     cd <- data.frame(
16 |         tissue = x$tissue,
17 |         cluster = x$cell_ontology_class)
18 |     colData(x) <- DataFrame(cd)
19 |     rowData(x) <- NULL
20 |     
21 |     return(x)
22 | }


--------------------------------------------------------------------------------
/code/00-get_data-Tung17.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | fun <- \()
 6 | {
 7 |     url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE77288&format=file&file=GSE77288%5Freads%2Draw%2Dsingle%2Dper%2Dsample%2Etxt%2Egz"
 8 |     tmp <- tempfile(fileext = ".txt.gz")
 9 |     download.file(url, destfile = tmp)
10 |     
11 |     y <- read.delim(gzfile(tmp))
12 |     cd <- DataFrame(batch = y$individual)
13 |     
14 |     # simplify gene & cell names
15 |     y <- y[, grep("^ENSG", names(y))]
16 |     y <- t(as.matrix(y))
17 |     
18 |     SingleCellExperiment(
19 |         assays = list(counts = y),
20 |         colData = cd)
21 | }


--------------------------------------------------------------------------------
/code/00-get_data-Zheng17.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(SingleCellExperiment)
 4 |     library(TENxPBMCData)
 5 | })
 6 | 
 7 | fun <- \()
 8 | {
 9 |     sce <- TENxPBMCData("pbmc68k")
10 |     counts(sce) <- as(counts(sce), "dgCMatrix")
11 |     
12 |     url <- "https://github.com/10XGenomics/single-cell-3prime-paper/raw/master/pbmc68k_analysis/68k_pbmc_barcodes_annotation.tsv"
13 |     fnm <- file.path(tempdir(), "foo.tsv")
14 |     download.file(url, fnm, quiet = TRUE)
15 |     cd <- read.delim(fnm)
16 |     
17 |     rowData(sce)[3] <- NULL
18 |     names(rowData(sce)) <- c("ensembl", "symbol")
19 |     
20 |     old <- c(
21 |         "CD8+ Cytotoxic T",
22 |         "CD8+/CD45RA+ Naive Cytotoxic",
23 |         "CD4+/CD45RO+ Memory",
24 |         "CD19+ B",
25 |         "CD4+/CD25 T Reg",
26 |         "CD56+ NK",
27 |         "CD4+ T Helper2",
28 |         "CD4+/CD45RA+/CD25- Naive T",
29 |         "CD34+",
30 |         "Dendritic",
31 |         "CD14+ Monocyte")
32 |     new <- c(
33 |         "T CD8+",
34 |         "T CD8+",
35 |         "T CD4+",
36 |         "B CD19+",
37 |         "T CD4+",
38 |         "NK CD56+",
39 |         "T CD4+",
40 |         "T CD4+",
41 |         "HSCs CD34+",
42 |         "Dendritic",
43 |         "Monocytes CD14+")
44 |     
45 |     cluster <- new[match(cd$celltype, old)]
46 |     colData(sce) <- DataFrame(cluster)
47 |     
48 |     return(sce)
49 | }


--------------------------------------------------------------------------------
/code/00-get_data-panc8.R:
--------------------------------------------------------------------------------
 1 | # load required packages
 2 | suppressPackageStartupMessages({
 3 |     library(Seurat)
 4 |     library(SeuratData)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | fun <- \()
 9 | {
10 |     if (!require(panc8.SeuratData))
11 |         InstallData("panc8.SeuratData")
12 |     
13 |     # load data as 'Seurat' object
14 |     data("panc8")
15 |     x <- as.SingleCellExperiment(panc8)
16 |     
17 |     # drop log-normalized counts
18 |     # (these are identical to counts)
19 |     assay(x, "logcounts") <- NULL
20 |     
21 |     # exclude datasets with non-integer counts
22 |     x <- x[, !x$tech %in% c("celseq", "celseq2", "fluidigmc1")]
23 |     
24 |     # drop spike-ins
25 |     x <- x[-grep("ERCC", rownames(x)), ]
26 |     
27 |     # drop undetected genes
28 |     x <- x[rowSums(assay(x)) > 0, ]
29 |     
30 |     # subset & rename cell metadata
31 |     colData(x) <- DataFrame(
32 |         batch = x$dataset, 
33 |         cluster = x$celltype)
34 |     
35 |     return(x)
36 | }


--------------------------------------------------------------------------------
/code/00-get_data.R:
--------------------------------------------------------------------------------
 1 | # retrieve dataset
 2 | source(args[[1]])
 3 | x <- fun()
 4 | 
 5 | # number of features & observations
 6 | dim(x)
 7 | 
 8 | # tabulate number of cells by
 9 | # batch, cluster, batch-cluster
10 | b <- !is.null(x$batch)
11 | k <- !is.null(x$cluster)
12 | if (b) {
13 |     table(x$batch)
14 |     if (k) {
15 |         table(x$cluster)
16 |         table(x$batch, x$cluster)
17 |     }
18 | } else if (k) {
19 |     table(x$cluster)
20 | }
21 | 
22 | # simplify gene & cell names
23 | dimnames(x) <- list(
24 |     paste0("gene", seq_len(nrow(x))),
25 |     paste0("cell", seq_len(ncol(x))))
26 | 
27 | # save SCE to .rds
28 | saveRDS(x, args[[2]])


--------------------------------------------------------------------------------
/code/01-fil_data.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(Matrix)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | # read in raw data
 7 | x <- readRDS(args[[1]])
 8 | 
 9 | # filter out instances with less than 50 cells
10 | by <- c("cluster", "sample", "batch")
11 | by <- intersect(by, names(colData(x)))
12 | 
13 | if (length(by) != 0) {
14 |     cs <- split(seq(ncol(x)), colData(x)[by])
15 |     rmv <- vapply(cs, length, numeric(1)) < 50
16 |     if (any(rmv)) x <- x[, -unlist(cs[rmv])]   
17 | }
18 | 
19 | # keep genes with count > 1 in at least 10 cells,
20 | # and cells with at least 100 detected genes
21 | x <- x[
22 |     rowSums(counts(x) > 1) >= 10,
23 |     colSums(counts(x) > 0) >= 100]
24 | 
25 | # drop missing factor levels
26 | for (. in by) x[[.]] <- droplevels(factor(x[[.]]))
27 | 
28 | # save filtered data to .rds
29 | saveRDS(x, args[[2]])


--------------------------------------------------------------------------------
/code/02-sub_data.R:
--------------------------------------------------------------------------------
 1 | set.seed(1)
 2 | suppressPackageStartupMessages({
 3 |     library(jsonlite)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | # read in reference dataset & subsetting parameters 
 8 | x <- readRDS(args[[1]])
 9 | y <- fromJSON(args[[2]])
10 | y <- y[[wcs$datset]][[wcs$subset]]
11 | 
12 | # subset cells according to configuration
13 | for (i in names(y)) {
14 |     if (is.null(x[[i]])) next 
15 |     x <- x[, x[[i]] %in% y[[i]]]
16 |     x[[i]] <- droplevels(factor(x[[i]]))
17 |     # drop cell metadata variable when 
18 |     # there's only one level remaining
19 |     if (nlevels(x[[i]]) == 1) 
20 |         x[[i]] <- NULL
21 | }
22 | 
23 | # keep cells with at least 100 detected genes
24 | x <- x[, colSums(counts(x) > 0) >= 100]
25 | 
26 | # downsample to at most 'n_cells' per instance
27 | if (!is.null(y$n_cells)) {
28 |     by <- intersect(
29 |         c("cluster", "sample", "batch"),
30 |         names(colData(x)))
31 |     if (length(by) > 0) {
32 |         cs <- lapply(
33 |             split(seq(ncol(x)), colData(x)[by]),
34 |             function(.) {
35 |                 n <- as.numeric(y$n_cells)
36 |                 n <- min(n, length(.))
37 |                 sample(., n)
38 |             })
39 |         x <- x[, unlist(cs)]
40 |     } else {
41 |         n <- as.numeric(y$n_cells)
42 |         n <- min(n, ncol(x))
43 |         x <- x[, sample(ncol(x), n)]
44 |     }
45 | }
46 | 
47 | # keep genes with count > 1 in at least 10 cells
48 | x <- x[rowSums(counts(x) > 1) >= 10, ]
49 | 
50 | # downsample to at most 'n_genes'
51 | if (!is.null(y$n_genes)) {
52 |     n <- as.numeric(y$n_genes)
53 |     n <- min(n, nrow(x))
54 |     x <- x[sample(nrow(x), n), ]
55 | }
56 | 
57 | # save subsetted data to .rds
58 | saveRDS(x, args[[3]])


--------------------------------------------------------------------------------
/code/03-est_pars-BASiCS.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BASiCS)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     suppressMessages({
 8 |         foo <- capture.output({
 9 |             x$BatchInfo <- as.numeric(x$batch)
10 |             mcmc <- BASiCS_MCMC(Data = x,
11 |                 N = 4000, Thin = 10, Burn = 2000,
12 |                 Regression = TRUE, WithSpikes = FALSE)
13 |         })
14 |     })
15 |     mcmc <- Summary(mcmc)
16 |     names(ps) <- ps <- c("mu", "delta", "s", "theta")
17 |     ps <- lapply(ps, function(p) displaySummaryBASiCS(mcmc, p)[, 1])
18 |     c(ps, list(bi = x$BatchInfo, batch = x$batch))
19 | }


--------------------------------------------------------------------------------
/code/03-est_pars-ESCO.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(ESCO)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- counts(x)
 8 |     if (!is.matrix(y))
 9 |         y <- as.matrix(y)
10 |     dir <- tempdir()
11 |     if (!is.null(x$batch)) {
12 |         type <- "batch"
13 |     } else if (!is.null(x$cluster)) {
14 |         type <- "cluster"
15 |     } else type <- "single"
16 |     group <- type != "single"
17 |     cellinfo <- x[[type]]
18 |     suppressMessages(z <- escoEstimate(y, dir, group, cellinfo))
19 |     list(params = z, type = type, groups = unique(x[[type]]))
20 | }


--------------------------------------------------------------------------------
/code/03-est_pars-POWSC.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(POWSC)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     if (!is.matrix(y <- counts(x)))
 8 |         counts(x) <- as.matrix(y)
 9 |     if (is.null(x$cluster)) {
10 |         y <- Est2Phase(x)
11 |     } else {
12 |         i <- split(seq(ncol(x)), x$cluster, drop = TRUE)
13 |         y <- lapply(i, function(.) Est2Phase(x[, .]))
14 |     }
15 | }
16 | 


--------------------------------------------------------------------------------
/code/03-est_pars-SCRIP.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(splatter)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- counts(x)
 8 |     if (!is.matrix(y))
 9 |         y <- as.matrix(y)
10 |     
11 |     cd <- data.frame(colData(x))
12 |     i <- c("batch", "cluster")
13 |     i <- intersect(i, names(cd))
14 |     t <- ifelse(length(i) == 0, "foo", i)
15 | 
16 |     p <- if (t == "foo") {
17 |         list(
18 |             data = y,
19 |             mode = "GP-trendedBCV",
20 |             params = splatEstimate(y))
21 |     } else {
22 |         cd$cellType <- x[[i]]
23 |         list(
24 |             expre_data = y,
25 |             mode = "GP-trendedBCV",
26 |             pheno_data = cd,
27 |             CTlist = unique(x[[i]]),
28 |             nfeatures = nrow(x)) 
29 |     }
30 |     x <- list(p = p, t = t)
31 | }
32 | 


--------------------------------------------------------------------------------
/code/03-est_pars-SPARSim.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(scater)
 3 |   library(SPARSim)
 4 |   library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- function(x)
 8 | {
 9 |     y <- counts(x)
10 |     if (!is.matrix(y))
11 |         y <- as.matrix(y)
12 |     
13 |     if(!is.null(x$batch)){
14 |       conditions <- list()
15 |       for (batch_name in unique(x$batch)) {
16 |         conditions[[batch_name]] <- which(x$batch %in% batch_name)
17 |       }
18 |     }else{
19 |       conditions <- list(seq_len(ncol(x)))
20 |     }
21 |     
22 |     estimate <- SPARSim_estimate_parameter_from_data(
23 |         raw_data = y,
24 |         norm_data = normalizeCounts(y),
25 |         conditions = conditions)
26 |     
27 |     if(!is.null(x$batch)){
28 |       list(estimate = estimate, batch=x$batch)
29 |     }else{
30 |       list(estimate=estimate)
31 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/code/03-est_pars-SPsimSeq.R:
--------------------------------------------------------------------------------
1 | fun <- function(x) { }


--------------------------------------------------------------------------------
/code/03-est_pars-SymSim.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(SymSim)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | #type (k)?
 6 | fun <- function(x) {
 7 |     y <- counts(x)
 8 |     if (!is.matrix(y))
 9 |         y <- as.matrix(y)
10 |     z <- BestMatchParams(
11 |         tech = "UMI",
12 |         counts = y,
13 |         plotfilename = "foo",
14 |         n_optimal = 1)
15 |     file.remove("foo.pdf")
16 |     z$ngenes <- nrow(x)
17 |     z$ncells_total <- ncol(x)
18 |     n <- length(unique(x$batch))
19 |     z$nbatch <- ifelse(n == 0, 1, n)
20 |     list(params = z, batch = x$batch)
21 | }


--------------------------------------------------------------------------------
/code/03-est_pars-ZINB-WaVE.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BiocParallel)
 3 |     library(SingleCellExperiment)
 4 |     library(zinbwave)
 5 | })
 6 | 
 7 | fun <- function(x) {
 8 |     cd <- data.frame(colData(x))
 9 |     i <- c("cluster", "batch")
10 |     i <- intersect(i, names(cd))
11 |     
12 |     y <- if (length(i) == 0) {
13 |         zinbFit(x, 
14 |             verbose = FALSE,
15 |             BPPARAM = SerialParam())
16 |     } else {
17 |         zinbFit(x, 
18 |             model.matrix(~cd[[i]]),
19 |             verbose = FALSE,
20 |             BPPARAM = SerialParam())
21 |     }
22 |     x <- list(obj = y, cd = colData(x))
23 | }
24 | 


--------------------------------------------------------------------------------
/code/03-est_pars-hierarchicell.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(hierarchicell)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- counts(x)
 8 |     if (!is.matrix(y))
 9 |         y <- as.matrix(y)
10 |     if (is.null(x$batch)) {
11 |         # split cells into random batches
12 |         batch_ids <- sample(2, ncol(x), TRUE)
13 |         n_batches <- 0
14 |     } else {
15 |         batch_ids <- x$batch
16 |         n_batches <- length(unique(batch_ids))
17 |     }
18 |     df <- data.frame(seq(ncol(x)), batch_ids, t(y))
19 |     z <- filter_counts(df, gene_thresh = 0, cell_thresh = 0)
20 |     p <- compute_data_summaries(expr = z, type = "Raw")
21 |     # single group 2 cell is required
22 |     # for simulation to pass successfully
23 |     list(
24 |         data_summaries = p,
25 |         n_genes = nrow(x),
26 |         n_cases = 1,
27 |         cells_per_case = 1,
28 |         n_controls = length(unique(batch_ids)),
29 |         cells_per_control = tabulate(batch_ids),
30 |         ncells_variation_type = "Fixed",
31 |         n_batches = n_batches,
32 |         batch_ids = batch_ids)
33 | }
34 | 


--------------------------------------------------------------------------------
/code/03-est_pars-muscat.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(muscat)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     vars <- c("cluster", "batch", "group")
 8 |     vars <- setdiff(vars, names(colData(x)))
 9 |     for (v in vars) x[[v]] <- "foo"
10 |     
11 |     y <- prepSCE(x, 
12 |         kid = "cluster",
13 |         sid = "batch",
14 |         gid = "group",
15 |         drop = TRUE)
16 | 
17 |     z <- prepSim(y,
18 |         min_size = NULL,
19 |         verbose = FALSE)
20 | }
21 | 


--------------------------------------------------------------------------------
/code/03-est_pars-powsimR.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(powsimR)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |   y <- counts(x) 
 8 |   if (!is.matrix(y))
 9 |     y <- as.matrix(y)
10 |   z <- estimateParam(
11 |     countData = y,
12 |     RNAseq = "singlecell",
13 |     Protocol = "UMI",
14 |     Distribution = "NB",
15 |     Normalisation = "scran",
16 |     GeneFilter = 0,
17 |     SampleFilter = Inf,
18 |     NCores = 1,
19 |     verbose = FALSE)
20 |   Setup(
21 |     ngenes = z$totalG,
22 |     estParamRes = z,
23 |     n1 = z$totalS, 
24 |     n2 = 2, # has to be at least 2
25 |     p.DE = 0,
26 |     pLFC = 0,
27 |     p.G = 1,
28 |     nsims = 1,  
29 |     setup.seed = 1234)
30 | }
31 | 


--------------------------------------------------------------------------------
/code/03-est_pars-scDD.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BiocParallel)
 3 |     library(scDD)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- function(x) {
 8 |     if (!is.matrix(y <- counts(x)))
 9 |         counts(x) <- as.matrix(y)
10 |     # randomly split cells into 2 groups
11 |     x$foo <- sample(2, ncol(x), TRUE)
12 |     x <- preprocess(x, 
13 |         condition = "foo", 
14 |         scran_norm = TRUE)
15 |     list(
16 |         SCdat = x,
17 |         condition = "foo",
18 |         plots = FALSE,
19 |         param = SerialParam(),
20 |         nDE = 0, nDP = 0, nDM = 0, 
21 |         nDB = 0, nEE = nrow(x), nEP = 0,
22 |         numSamples = ceiling(ncol(x)/2))
23 | }
24 | 


--------------------------------------------------------------------------------
/code/03-est_pars-scDesign.R:
--------------------------------------------------------------------------------
1 | fun <- function(x) { }


--------------------------------------------------------------------------------
/code/03-est_pars-scDesign2.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scDesign2)
 3 | })
 4 | 
 5 | fun <- function(x) 
 6 | {
 7 |     if (is.null(x$cluster)) {
 8 |         x$id <- "foo" 
 9 |     } else {
10 |         x$id <- x$cluster
11 |     }
12 |     colnames(x) <- x$id
13 |     y <- fit_model_scDesign2(
14 |         data_mat = counts(x),
15 |         cell_type_sel = unique(x$id))
16 | }


--------------------------------------------------------------------------------
/code/03-est_pars-splatter.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(splatter)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     if (!is.matrix(y <- counts(x)))
 9 |         counts(x) <- as.matrix(y)
10 |     g <- if (!is.null(x$batch)) {
11 |         "batch"
12 |     } else if (!is.null(x$cluster)) {
13 |         "cluster"
14 |     }
15 |     if (is.null(g)) {
16 |         p <- splatEstimate(x)
17 |     } else {
18 |         i <- split(seq(ncol(x)), x[[g]], drop = TRUE)
19 |         x <- lapply(i, function(.) x[, .])
20 |         p <- lapply(x, splatEstimate)
21 |     }
22 |     list(pars = p, type = g)
23 | }


--------------------------------------------------------------------------------
/code/03-est_pars-zingeR.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(gamlss)
 3 |     library(gamlss.tr)
 4 |     library(SingleCellExperiment)
 5 |     library(zingeR)
 6 | })
 7 | 
 8 | fun <- function(x) {
 9 |     y <- counts(x)
10 |     if (!is.matrix(x))
11 |         y <- as.matrix(y)
12 |     g <- sample(2, ncol(y), TRUE)
13 |     mm <- model.matrix(~g)
14 |     p <- getDatasetZTNB(y, mm)
15 |     list(
16 |         dataset = y,
17 |         group = g,
18 |         nTags = p$dataset.nTags,
19 |         nlibs = length(p$dataset.lib.size),
20 |         pUp = 0,
21 |         verbose = FALSE,
22 |         params = p)
23 | }
24 | 


--------------------------------------------------------------------------------
/code/03-est_pars.R:
--------------------------------------------------------------------------------
1 | x <- readRDS(args$sub)
2 | 
3 | source(args$fun)
4 | y <- tryCatch(fun(x),
5 |     error = function(e) NA)
6 | 
7 | saveRDS(y, args$est)


--------------------------------------------------------------------------------
/code/04-sim_data-BASiCS.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BASiCS)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     suppressMessages({
 8 |         y <- BASiCS_Sim(
 9 |             Mu = x$mu,
10 |             Mu_spikes = NULL,
11 |             Delta = x$delta,
12 |             Phi = NULL,
13 |             S = x$s,
14 |             Theta = x$theta,
15 |             BatchInfo = x$bi)
16 |     })
17 |     y$batch <- x$batch
18 |     y$BatchInfo <- NULL
19 |     return(y)
20 | }


--------------------------------------------------------------------------------
/code/04-sim_data-ESCO.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(ESCO)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |   type <- ifelse(x$type == "single", "single", "group")
 8 |   y <- escoSimulate(x$params, type, verbose = FALSE, numCores = 1)
 9 |   assays(y) <- list(counts = assay(y, "counts"))
10 |   if (type == "group") {
11 |     groups <- factor(y$Group, labels = x$groups)
12 |     cd <- DataFrame(groups)
13 |     names(cd) <- x$type
14 |     colData(y) <- cd
15 |   } else {
16 |     colData(y) <- NULL
17 |   }
18 |   return(y)
19 | }
20 | 


--------------------------------------------------------------------------------
/code/04-sim_data-POWSC.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(POWSC)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     if (!is.list(x[[1]])) {
 8 |         y <- Simulate2SCE(
 9 |             n = ncol(x$exprs),
10 |             perDE = 0,
11 |             estParas1 = x,
12 |             estParas2 = x)
13 |         
14 |         y <- y$sce
15 |         rowData(y) <- NULL
16 |         colData(y) <- NULL
17 |     } else {
18 |         ns <- vapply(x, function(.) 
19 |             ncol(.$exprs), numeric(1))
20 |         
21 |         y <- SimulateMultiSCEs(
22 |             n = sum(ns)/(length(x)-1),
23 |             estParas_set = x,
24 |             multiProb = ns)
25 | 
26 |         y <- lapply(y, function(.) .$sce)
27 |         y <- do.call(cbind, y)
28 |         
29 |         y$cluster <- y$cellTypes
30 |         y$cellTypes <- NULL
31 |         rowData(y) <- NULL
32 |     }
33 |     return(y)
34 | }


--------------------------------------------------------------------------------
/code/04-sim_data-SCRIP.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(checkmate)
 3 |     library(SCRIP)
 4 |     library(Seurat)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | fun <- function(x) {
 9 |     if (x$t == "foo") {
10 |         y <- do.call(SCRIPsimu, x$p)
11 |         metadata(y) <- list()
12 |         rowData(y) <- colData(y) <- NULL
13 |         assays(y) <- assays(y)["counts"]
14 |     } else {
15 |         # code bug requires 'CTlist' 
16 |         # to be passed via environment
17 |         env <- parent.frame()
18 |         env$CTlist <- x$p$CTlist
19 |         x$p$CTlist <- NULL
20 |         sink(tempfile())
21 |         y <- do.call(simu_cluster, x$p, envir = env)
22 |         sink()
23 |         cd <- DataFrame(y$CT.infor)
24 |         names(cd) <- x$t
25 |         y <- SingleCellExperiment(
26 |             list(counts = y$final), 
27 |             colData = cd)
28 |         # make cell names unique across clusters
29 |         colnames(y) <- paste0("cell", seq(ncol(y)))
30 |     }
31 |     return(y)
32 | }


--------------------------------------------------------------------------------
/code/04-sim_data-SPARSim.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SPARSim)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     
 8 |     sink(tempfile())
 9 |     y <- SPARSim_simulation(x$estimate, output_batch_matrix = TRUE)
10 |     sink()
11 |     
12 |     y <- y$count_matrix  
13 |     
14 |     if(is.null(x$batch)){
15 |         SingleCellExperiment(
16 |             assays = list(counts = y))
17 |     }else{
18 |         SingleCellExperiment(
19 |             assays = list(counts = y),
20 |             colData=data.frame(batch=x$batch))
21 |     }
22 |     
23 | }


--------------------------------------------------------------------------------
/code/04-sim_data-SPsimSeq.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SPsimSeq)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     if (!is.matrix(y <- counts(x)))
 8 |         counts(x) <- as.matrix(y)
 9 |     if (is.null(colnames(x)))
10 |         colnames(x) <- seq(ncol(x))
11 |     if (is.null(x$batch)) {
12 |         batch <- rep(1, ncol(x))
13 |         batch.config <- 1
14 |     } else {
15 |         # keep genes detected in at least 10 cells per batch
16 |         i <- split(seq(ncol(x)), x$batch)
17 |         x <- x[rowAlls(vapply(i, function(.) 
18 |             rowSums(counts(x[, .]) > 0) >= 10,
19 |             logical(nrow(x)))), ]
20 |         batch <- as.numeric(x$batch)
21 |         batch.config <- tabulate(x$batch)/ncol(x)
22 |     }
23 |     y <- SPsimSeq(
24 |         s.data = counts(x), 
25 |         n.genes = nrow(x), 
26 |         tot.samples = ncol(x),
27 |         batch = batch, 
28 |         batch.config = batch.config,
29 |         model.zero.prob = TRUE, 
30 |         genewiseCor = TRUE,
31 |         result.format = "SCE", 
32 |         return.details = FALSE,
33 |         verbose = FALSE)
34 |     y <- y[[1]]
35 |     rowData(y) <- NULL
36 |     colData(y) <- colData(x)
37 |     return(y)
38 | }
39 | 


--------------------------------------------------------------------------------
/code/04-sim_data-SymSim.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(SymSim)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     args <- intersect(
 8 |         names(x$params),
 9 |         names(formals(SimulateTrueCounts)))
10 |     args <- c(x$params[args], list(randseed = 1234))
11 |     y <- do.call(SimulateTrueCounts, args)
12 |     
13 |     data("gene_len_pool")
14 |     gene_len <- sample(gene_len_pool, args$ngenes, TRUE)
15 |     
16 |     args <- intersect(
17 |         names(x$params),
18 |         names(formals(True2ObservedCounts)))
19 |     args <- c(x$params[args], list(
20 |         true_counts = y$counts, 
21 |         meta_cell = y$cell_meta, 
22 |         gene_len = gene_len))
23 |     z <- do.call(True2ObservedCounts, args)
24 |     
25 |     if (x$params$nbatch > 1) {
26 |         z <- DivideBatches(
27 |             observed_counts_res = z, 
28 |             nbatch = x$params$nbatch)
29 |         cd <- DataFrame(batch = x$batch)
30 |     } else cd <- make_zero_col_DFrame(ncol(z$counts))
31 |     SingleCellExperiment(list(counts = z$counts), colData = cd)
32 | }


--------------------------------------------------------------------------------
/code/04-sim_data-ZINB-WaVE.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 |     library(zinbwave)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- zinbSim(x$obj)
 8 |     SingleCellExperiment(
 9 |         list(counts = y$counts), 
10 |         colData = x$cd)
11 | }
12 | 


--------------------------------------------------------------------------------
/code/04-sim_data-hierarchicell.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(hierarchicell)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     args <- formals("simulate_hierarchicell")
 8 |     y <- do.call(
 9 |         simulate_hierarchicell, 
10 |         x[names(x) %in% names(args)])
11 |     # keep control cells only
12 |     y <- y[y$Status == "Control", ] 
13 |     y <- t(as.matrix(y[, grep("^Gene", names(y))]))
14 |     cd <- if (x$n_batches != 0) {
15 |         data.frame(batch = x$batch_ids)
16 |     } else make_zero_col_DFrame(ncol(y))
17 |     SingleCellExperiment(list(counts = y), colData = cd)
18 | }


--------------------------------------------------------------------------------
/code/04-sim_data-muscat.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(muscat)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- simData(x, dd = FALSE)
 8 |     md <- metadata(y)
 9 | 
10 |     if (!is.null(x$cluster_id)) {
11 |         kids <- factor(y$cluster_id, labels = md$ref_kids)
12 |         kids <- factor(kids, levels(x$cluster_id))
13 |         y$cluster <- kids
14 |         y$cluster_id <- NULL
15 |     }
16 |     if (!is.null(x$sample_id)) {
17 |         bids <- factor(y$sample_id, labels = md$ref_sids)
18 |         bids <- factor(bids, levels(x$sample_id))
19 |         y$batch <- bids
20 |         y$sample_id <- NULL
21 |     }
22 |     metadata(y) <- list()
23 |     rowData(y) <- NULL
24 |     return(y)
25 | }
26 | 


--------------------------------------------------------------------------------
/code/04-sim_data-powsimR.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(powsimR)
 3 |   library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |   y <- simulateDE(
 8 |     SetupRes = x,
 9 |     Normalisation = "scran",
10 |     DEmethod = "DESeq2",
11 |     Counts = TRUE,
12 |     NCores = NULL,
13 |     verbose = TRUE)
14 |   # remove group 2 cells
15 |   z <- y$Counts[[1]][[1]][, -c(1, 2)]
16 |   SingleCellExperiment(assays = list(counts = z))
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/code/04-sim_data-scDD.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BiocParallel)
 3 |     library(scDD)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- function(x) {
 8 |     sink(tempfile())
 9 |     y <- do.call(simulateSet, x)
10 |     sink()
11 |     colData(y) <- rowData(y) <- NULL
12 |     return(y)
13 | }
14 | 


--------------------------------------------------------------------------------
/code/04-sim_data-scDesign.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scDesign)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x) {
 7 |     y <- counts(x)
 8 |     if (!is.matrix(y))
 9 |         y <- as.matrix(y)
10 |     foo <- capture.output(
11 |         z <- design_data(
12 |             realcount = y,
13 |             S = sum(y),
14 |             ncell = ncol(y),
15 |             ngroup = 1,
16 |             ncores = 1))
17 |     SingleCellExperiment(
18 |         assays = list(counts = z))
19 | }


--------------------------------------------------------------------------------
/code/04-sim_data-scDesign2.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scDesign2)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- function(x)
 7 | {
 8 |     # get number of cells per cluster
 9 |     n <- vapply(x, function(.) .$n_cell, numeric(1))
10 |     # simulate data with equal number 
11 |     # & proportion of cells per cluster
12 |     y <- simulate_count_scDesign2(
13 |         model_params = x, 
14 |         n_cell_new = sum(n), 
15 |         cell_type_prop = prop.table(n))
16 |     # construct SCE
17 |     cd <- if (length(x) == 1) {
18 |         make_zero_col_DFrame(ncol(y))
19 |     } else {
20 |         data.frame(cluster = colnames(y))
21 |     }
22 |     SingleCellExperiment(
23 |         assay = list(counts = unname(y)),
24 |         colData = cd)
25 | }
26 | 


--------------------------------------------------------------------------------
/code/04-sim_data-splatter.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(splatter)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     f <- function(p) {
 9 |         y <- splatSimulate(p, verbose = FALSE)
10 |         assays(y) <- assays(y)["counts"]
11 |         rowData(y) <- colData(y) <- NULL
12 |         metadata(y) <- list()
13 |         return(y)
14 |     }
15 |     if (!is.null(x$type)) {
16 |         y <- lapply(names(x$pars), 
17 |             function(i) {
18 |                 y <- f(x$pars[[i]])
19 |                 y[[x$type]] <- i
20 |                 return(y)
21 |             }
22 |         )
23 |         do.call(cbind, y)
24 |     } else {
25 |         f(x$pars)
26 |     }
27 | }


--------------------------------------------------------------------------------
/code/04-sim_data-zingeR.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(gamlss)
 3 |     library(gamlss.tr)
 4 |     library(mgcv)
 5 |     library(SingleCellExperiment)
 6 |     library(zingeR)
 7 | })
 8 | 
 9 | fun <- function(x) {
10 |     y <- do.call(NBsimSingleCell, x)
11 |     SingleCellExperiment(list(counts = y$counts))
12 | }
13 | 


--------------------------------------------------------------------------------
/code/04-sim_data.R:
--------------------------------------------------------------------------------
 1 | x <- readRDS(args$est)
 2 | 
 3 | 
 4 | y <- if (!isTRUE(is.na(x))) { # skip simulation if estimation failed
 5 |     if (is.null(x))   # use dataset if there's no separate estimation step
 6 |         x <- readRDS(args$sub)
 7 |     
 8 |     source(args$fun)
 9 |     tryCatch(fun(x), 
10 |         error = function(e) NULL)
11 | }
12 | 
13 | if (!is.null(y)) {
14 |     z <- assay(y)
15 |     y <- y[
16 |         rowSums(z) > 0,
17 |         colSums(z) > 0]
18 |     dimnames(y) <- list(
19 |         paste0("gene", seq(nrow(y))),
20 |         paste0("cell", seq(ncol(y))))
21 | }
22 | 
23 | saveRDS(y, args$sim)


--------------------------------------------------------------------------------
/code/05-calc_batch-ComBat.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(sva)
 4 | })
 5 | 
 6 | fun <- \(x)
 7 | {
 8 |     y <- normalizeCounts(x, log = TRUE)
 9 |     if (!is.matrix(y)) y <- as.matrix(y)
10 |     
11 |     df <- data.frame(t(y))
12 |     mm <- model.matrix(~ 1, df)
13 |     
14 |     sink(tempfile())
15 |     suppressMessages({
16 |         z <- ComBat(
17 |             dat = y, 
18 |             batch = x$batch, 
19 |             mod = mm, 
20 |             par.prior = TRUE, 
21 |             prior.plots = FALSE)
22 |     })
23 |     sink()
24 |     
25 |     list(
26 |         assay_in = y,
27 |         assay_out = z)
28 | }


--------------------------------------------------------------------------------
/code/05-calc_batch-Harmony.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(harmony)
 3 |     library(scater)
 4 | })
 5 | 
 6 | fun <- \(x)
 7 | {
 8 |     y <- calculatePCA(
 9 |         logNormCounts(x), 
10 |         ncomponents = 20)
11 |     
12 |     z <- HarmonyMatrix(
13 |         data_mat = y, 
14 |         meta_data = x$batch, 
15 |         do_pca = FALSE, 
16 |         verbose = FALSE)
17 |     
18 |     list(
19 |         dimred_in = y,
20 |         dimred_out = z)
21 | }


--------------------------------------------------------------------------------
/code/05-calc_batch-Seurat.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(Seurat)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     x <- logNormCounts(x)
 9 |     k <- min(table(x$batch)/2)
10 |     
11 |     l <- SplitObject(
12 |         as.Seurat(x), 
13 |         split.by = "batch")
14 |     l <- lapply(l, \(.) {
15 |         . <- NormalizeData(., verbose = FALSE)
16 |         . <- FindVariableFeatures(., 
17 |             verbose = FALSE,
18 |             nfeatures = 1000,
19 |             selection.method = "vst")
20 |     })
21 |     fs <- SelectIntegrationFeatures(l)
22 |     as <- FindIntegrationAnchors(l, 
23 |         k.filter = k,
24 |         dims = seq(20), 
25 |         verbose = FALSE,
26 |         anchor.features = fs)
27 |     y <- IntegrateData(as, 
28 |         k.weight = k, 
29 |         verbose = FALSE,
30 |         features.to.integrate = rownames(x))
31 |     
32 |     list(
33 |         assay_in = logcounts(x),
34 |         assay_out = GetAssayData(y))
35 | }


--------------------------------------------------------------------------------
/code/05-calc_batch-fastMNN.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(batchelor)
 3 |     library(scater)
 4 | })
 5 | 
 6 | fun <- \(x)
 7 | {
 8 |     x <- logNormCounts(x)
 9 |     y <- calculatePCA(x, ncomponents = 20)
10 |     
11 |     k <- min(table(x$batch)/2)
12 |     z <- fastMNN(x, 
13 |         batch = x$batch, 
14 |         k = k, d = 20)
15 |     
16 |     list(
17 |         dimred_in = y,
18 |         dimred_out = reducedDim(z))
19 | }


--------------------------------------------------------------------------------
/code/05-calc_batch-limma.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(edgeR)
 3 |     library(limma)
 4 | })
 5 | 
 6 | fun <- \(x)
 7 | {
 8 |     mm <- model.matrix(~ x$batch) 
 9 |     y <- DGEList(counts(x))
10 |     
11 |     y <- calcNormFactors(y, method = "TMMwsp")
12 |     v <- voom(y, mm, plot = FALSE)
13 |     z <- removeBatchEffect(v, x$batch)
14 |     
15 |     list(
16 |         assay_in = v$E,
17 |         assay_out = z) 
18 | }


--------------------------------------------------------------------------------
/code/05-calc_batch-mnnCorrect.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(batchelor)
 3 |     library(scater)
 4 | })
 5 | 
 6 | fun <- \(x)
 7 | {
 8 |     x <- logNormCounts(x)
 9 |     k <- min(table(x$batch)/2)
10 |     y <- mnnCorrect(x, batch = x$batch, k = k)
11 |     
12 |     list(
13 |         assay_in = logcounts(x),
14 |         assay_out = assay(y))
15 | }


--------------------------------------------------------------------------------
/code/05-calc_batch.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 | sce <- readRDS(args$sce)
3 | 
4 | # skip if simulation failed (return NULL)
5 | res <- if (!is.null(sce)) fun(sce)
6 | 
7 | saveRDS(res, args$res)
8 | 


--------------------------------------------------------------------------------
/code/05-calc_clust-CIDR.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(cidr)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     y <- counts(x)
 9 |     if (!is.matrix(y))
10 |         y <- as.matrix(y)
11 |     k <- length(unique(x$cluster))
12 |     
13 |     z <- scDataConstructor(y, tagType = "raw")
14 |     z <- determineDropoutCandidates(z)
15 |     z <- wThreshold(z)
16 |     z <- scDissim(z, threads = 1)
17 |     z <- scPCA(z, plotPC = FALSE)
18 |     z <- nPC(z)
19 |     
20 |     scCluster(
21 |         object = z, 
22 |         nCluster = k, 
23 |         nPC = z@nPC, 
24 |         cMethod = "ward.D2")@clusters
25 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-PCA+HC.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     y <- normalizeCounts(x, log = TRUE)
 9 |     pca <- prcomp(t(y), center = TRUE, scale. = FALSE)
10 |     pca <- pca$x[, seq(20), drop = FALSE]
11 |     hc <- hclust(dist(pca), method = "ward.D2")
12 |     cutree(hc, k = length(unique(x$cluster)))
13 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-PCA+KM.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 | })
 4 | 
 5 | fun <- \(x) 
 6 | {
 7 |     y <- normalizeCounts(x, log = TRUE)
 8 |     k <- length(unique(x$cluster))
 9 |     pca <- prcomp(t(y), center = TRUE, scale. = FALSE)
10 |     pca <- pca$x[, seq(30), drop = FALSE]
11 |     kmeans(pca, centers = k, nstart = 25)$cluster
12 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-SC3.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SC3)
 3 |     library(scater)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \(x) 
 8 | {
 9 |     if (is.null(rownames(x)))
10 |         rownames(x) <- paste0("gene", seq(nrow(x)))
11 |     
12 |     if (!is.matrix(y <- counts(x)))
13 |         counts(x) <- as.matrix(y)
14 |     
15 |     x <- logNormCounts(x)
16 |     k <- length(unique(x$cluster))
17 |     rowData(x)$feature_symbol <- rownames(x)
18 |     
19 |     y <- sc3_prepare(x,
20 |         gene_filter = FALSE,
21 |         svm_max = 1e6,
22 |         n_cores = 1,
23 |         rand_seed = 1)
24 | 
25 |     z <- sc3(y, 
26 |         ks = k, 
27 |         gene_filter = FALSE,
28 |         biology = FALSE,
29 |         k_estimator = FALSE,
30 |         svm_max = 1e6,
31 |         n_cores = 1,
32 |         rand_seed = 1)
33 |     
34 |     z[[grep("sc3", names(colData(z)))]]
35 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-SC3svm.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SC3)
 3 |     library(scater)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | fun <- \(x) 
 8 | {
 9 |     if (is.null(rownames(x)))
10 |         rownames(x) <- paste0("gene", seq(nrow(x)))
11 |     
12 |     if (!is.matrix(y <- counts(x)))
13 |         counts(x) <- as.matrix(y)
14 |     
15 |     n <- round(ncol(x)/2)
16 |     x <- logNormCounts(x)
17 |     k <- length(unique(x$cluster))
18 |     rowData(x)$feature_symbol <- rownames(x)
19 |     
20 |     y <- sc3_prepare(x,
21 |         gene_filter = FALSE,
22 |         svm_max = 0,
23 |         svm_num_cells = n,
24 |         n_cores = 1,
25 |         rand_seed = 1)
26 | 
27 |     y <- sc3(y, 
28 |         ks = k, 
29 |         gene_filter = FALSE,
30 |         biology = FALSE,
31 |         k_estimator = FALSE,
32 |         svm_max = 0,
33 |         svm_num_cells = n,
34 |         n_cores = 1,
35 |         rand_seed = 1)
36 |     
37 |     y <- sc3_run_svm(y, k)
38 |     y[[grep("sc3", names(colData(y)))]]
39 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-Seurat.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(Seurat)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     if (is.null(rownames(x))) 
 9 |         rownames(x) <- paste0("gene", seq(nrow(x)))
10 |     if (is.null(colnames(x))) 
11 |         colnames(x) <- paste0("cell", seq(ncol(x)))
12 |     
13 |     y <- CreateSeuratObject(
14 |         counts = counts(x), 
15 |         meta.data = colData(x))
16 |     
17 |     y <- NormalizeData(y, verbose = FALSE)
18 |     y <- ScaleData(y, verbose = FALSE) 
19 |     y <- FindVariableFeatures(y, verbose = FALSE) 
20 |     hvgs <- VariableFeatures(y)
21 |     
22 |     y <- RunPCA(y, npcs = n <- 30, features = hvgs, verbose = FALSE) 
23 |     y <- FindNeighbors(y, dims = seq(n), verbose = FALSE) 
24 |     y <- FindClusters(y, res = 0.8, verbose = FALSE) 
25 |     Idents(y)
26 | }
27 | 


--------------------------------------------------------------------------------
/code/05-calc_clust-monocle.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(monocle)
 3 |     library(scran)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     if (!is.matrix(y <- counts(x)))
 9 |         counts(x) <- as.matrix(y)
10 |     if (is.null(rownames(x)))
11 |         rownames(x) <- paste0("gene", seq(nrow(x)))
12 |     k <- length(unique(x$cluster))
13 |     rowData(x)$gene_short_name <- rownames(x)
14 |     y <- convertTo(x, type = "monocle")
15 |     y <- estimateSizeFactors(y)
16 |     y <- tryCatch(
17 |         estimateDispersions(y),
18 |         error = function(e) y)
19 |     y <- reduceDimension(y, 
20 |         max_components = 3,
21 |         num_dim = 50,
22 |         reduction_method = "tSNE",
23 |         verbose = TRUE)
24 |     monocle::clusterCells(y, 
25 |         num_clusters = k+1,
26 |         method = "densityPeak")$Cluster
27 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-pcaReduce.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(clue)
 3 |     library(pcaReduce)
 4 |     library(scater)
 5 | })
 6 | 
 7 | fun <- \(x) 
 8 | {
 9 |     k <- length(unique(x$cluster))
10 |     y <- normalizeCounts(x, log = TRUE)
11 |     z <- PCAreduce(t(y),
12 |         nbt = 100,
13 |         q = q <- 30,
14 |         method = "S")
15 |     l <- lapply(z, function(.) {
16 |         colnames(.) <- paste0("k", seq(q+1, 2))
17 |         as.cl_partition(.[, paste0("k", k)])
18 |     })
19 |     e <- as.cl_ensemble(l)
20 |     res <- cl_consensus(e,
21 |         method = "SE",
22 |         control = list(nruns = 50))
23 |     c(cl_class_ids(res))
24 | }


--------------------------------------------------------------------------------
/code/05-calc_clust-tSNE+KM.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(Rtsne)
 3 |     library(scater)
 4 | })
 5 | 
 6 | fun <- \(x) 
 7 | {
 8 |     k <- length(unique(x$cluster))
 9 |     y <- normalizeCounts(x, log = TRUE)
10 |     if (!is.matrix(y)) y <- as.matrix(y)
11 |     tsne <- Rtsne(t(y), 
12 |         pca = TRUE, initial_dims = 50, dims = 3,
13 |         perplexity = 30, check_duplicates = FALSE)
14 |     kmeans(tsne$Y, centers = k, nstart = 25)$cluster
15 | }


--------------------------------------------------------------------------------
/code/05-calc_clust.R:
--------------------------------------------------------------------------------
 1 | source(args$fun)
 2 | sce <- readRDS(args$sce)
 3 | 
 4 | # skip if simulation failed (return NULL)
 5 | res <- if (!is.null(sce))
 6 |     data.frame(wcs, 
 7 |         row.names = NULL,
 8 |         pred = as.integer(as.integer(fun(sce))),
 9 |         true = as.integer(droplevels(factor(sce$cluster))))
10 | 
11 | saveRDS(res, args$res)
12 | 


--------------------------------------------------------------------------------
/code/05-calc_dr.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(scran)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | set.seed(7043)
 8 | x <- readRDS(args[[1]])
 9 | 
10 | # skip if simulation failed (return NULL)
11 | df <- if (!is.null(x)) {
12 |     if ("normcounts" %in% assayNames(x)) {
13 |         logcounts(x) <- log(normcounts(x)+1)
14 |     } else x <- logNormCounts(x)
15 |     
16 |     stats <- modelGeneVar(x)
17 |     hvgs <- getTopHVGs(stats, n = 500)
18 |     
19 |     x <- runPCA(x, subset_row = hvgs)
20 |     x <- runTSNE(x, dimred = "PCA")
21 |     x <- runUMAP(x, dimred = "PCA")
22 |     
23 |     tsne <- reducedDim(x, "TSNE")
24 |     umap <- reducedDim(x, "UMAP")
25 |     colnames(tsne) <- paste0("TSNE", seq(2))
26 |     colnames(umap) <- paste0("UMAP", seq(2))
27 |     
28 |     x$lls <- if ("counts" %in% assayNames(x))
29 |         x$lls <- log(colSums(counts(x))+1) else NA
30 |     
31 |     i <- c("cluster", "batch", "lls")
32 |     i <- intersect(names(colData(x)), i)
33 |     cd <- colData(x)[i]
34 |     
35 |     data.frame(wcs, cd, tsne, umap)
36 | }
37 | 
38 | saveRDS(df, args[[2]])
39 | 


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_cms.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(CellMixS)
 3 |     library(scater)
 4 |     library(scran)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | ppFUN <- function(sce) {
 9 |     if ("normcounts" %in% assayNames(sce)) {
10 |         logcounts(sce) <- log(normcounts(sce)+1)
11 |     } else sce <- logNormCounts(sce)
12 |     stats <- modelGeneVar(sce)
13 |     hvgs <- getTopHVGs(stats, n = 500)
14 |     sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 | 
17 | qcFUN <- function(sce) {
18 |     i <- c("batch", "cluster")
19 |     i <- intersect(i, names(colData(sce)))
20 |     if (length(i) == 0) return(NULL)
21 |     ids <- factor(sce[[i]])
22 |     n <- tabulate(ids)
23 |     k <- min(n[n != 0])/2
24 |     cms(sce, k, i)$cms
25 | }
26 | 
27 | groups <- "global"
28 | n_genes <- NULL
29 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_cor.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(scater)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     if (!is.matrix(y <- assay(sce))) 
 7 |         assay(sce) <- as.matrix(y)
 8 |     y <- if ("normcounts" %in% assayNames(sce)) {
 9 |         normcounts(sce)
10 |     } else calculateCPM(sce)
11 |     assay(sce, "exprs") <- log(y+1)
12 |     return(sce)
13 | }
14 | 
15 | qcFUN <- function(sce) {
16 |     cor <- cor(
17 |         assay(sce, "exprs"),
18 |         method = "spearman",
19 |         use = "pairwise.complete.obs")
20 |     cor[upper.tri(cor)]
21 | }
22 | 
23 | groups <- NULL
24 | n_genes <- NULL
25 | n_cells <- 200


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_frq.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     sce
 7 | }
 8 | 
 9 | qcFUN <- function(sce) {
10 |     colMeans(assay(sce) != 0)
11 | }
12 | 
13 | groups <- NULL
14 | n_genes <- NULL
15 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_knn.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(RANN)
 3 |     library(scater)
 4 |     library(scran)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | ppFUN <- function(sce) {
 9 |     if ("normcounts" %in% assayNames(sce)) {
10 |         logcounts(sce) <- log(normcounts(sce)+1)
11 |     } else sce <- logNormCounts(sce)
12 |     stats <- modelGeneVar(sce)
13 |     hvgs <- getTopHVGs(stats, n = 500)
14 |     sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 | 
17 | qcFUN <- function(sce) {
18 |     # build KNN-graph on PCA
19 |     # (where k = 5% of cells)
20 |     pca <- reducedDim(sce, "PCA")
21 |     k <- round(0.05*ncol(sce))
22 |     knn <- nn2(pca, k = k+1)
23 |     idx <- knn$nn.idx[, seq(2, k+1)]
24 |     # count how often each cell is a KNN
25 |     vapply(seq(ncol(sce)), \(i) sum(idx == i), numeric(1))
26 | }
27 | 
28 | groups <- NULL
29 | n_genes <- NULL
30 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_ldf.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(BiocNeighbors)
 3 |     library(CellMixS)
 4 |     library(dplyr)
 5 |     library(scater)
 6 |     library(scran)
 7 |     library(SingleCellExperiment)
 8 | })
 9 | 
10 | ppFUN <- function(sce) {
11 |     if ("normcounts" %in% assayNames(sce)) {
12 |         logcounts(sce) <- log(normcounts(sce)+1)
13 |     } else sce <- logNormCounts(sce)
14 |     stats <- modelGeneVar(sce)
15 |     hvgs <- getTopHVGs(stats, n = 500)
16 |     sce <- runPCA(sce, subset_row = hvgs)
17 | }
18 | 
19 | qcFUN <- function(sce) {
20 |     i <- c("batch", "cluster")
21 |     i <- intersect(i, names(colData(sce)))
22 |     k <- if (length(i) != 0) {
23 |         # half of smallest group
24 |         ids <- factor(sce[[i]])
25 |         n <- tabulate(ids)
26 |         min(n[n != 0])/2
27 |     } else {
28 |         # 5% of total cells
29 |         0.05*ncol(sce)
30 |     }
31 |     # KNN on PCA
32 |     pca <- reducedDim(sce, "PCA")
33 |     knn <- findKNN(pca, k)
34 |     # fix naming for indexing
35 |     cs <- colnames(sce)
36 |     rownames(knn$index) <- cs
37 |     rownames(knn$distance) <- cs
38 |     knn$cell_name <- knn$index
39 |     # compute local density factors
40 |     c(CellMixS:::.ldfKnn(pca, knn, k)$LDF)
41 | }
42 | 
43 | groups <- NULL
44 | n_genes <- NULL
45 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_lls.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     sce
 7 | }
 8 | 
 9 | qcFUN <- function(sce) {
10 |     if ("counts" %in% assayNames(sce))
11 |         log(colSums(counts(sce) + 1))
12 | }
13 | 
14 | groups <- NULL
15 | n_genes <- NULL
16 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_pcd.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(scran)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | ppFUN <- function(sce) {
 8 |     if ("normcounts" %in% assayNames(sce)) {
 9 |         logcounts(sce) <- log(normcounts(sce)+1)
10 |     } else sce <- logNormCounts(sce)
11 |     stats <- modelGeneVar(sce)
12 |     hvgs <- getTopHVGs(stats, n = 500)
13 |     sce <- runPCA(sce, subset_row = hvgs)
14 | }
15 | 
16 | qcFUN <- function(sce) {
17 |     pca <- reducedDim(sce, "PCA")
18 |     c(dist(pca, upper = TRUE))
19 | }
20 | 
21 | groups <- NULL
22 | n_genes <- NULL
23 | n_cells <- 200


--------------------------------------------------------------------------------
/code/05-calc_qc-cell_sw.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(cluster)
 3 |     library(scater)
 4 |     library(scran)
 5 |     library(SingleCellExperiment)
 6 | })
 7 | 
 8 | ppFUN <- function(sce) {
 9 |     if ("normcounts" %in% assayNames(sce)) {
10 |         logcounts(sce) <- log(normcounts(sce)+1)
11 |     } else sce <- logNormCounts(sce)
12 |     stats <- modelGeneVar(sce)
13 |     hvgs <- getTopHVGs(stats, n = 500)
14 |     sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 | 
17 | qcFUN <- function(sce) {
18 |     i <- c("batch", "cluster")
19 |     i <- intersect(i, names(colData(sce)))
20 |     if (length(i) == 0) return(NULL)
21 |     ids <- as.integer(factor(sce[[i]]))
22 |     mtx <- dist(reducedDim(sce, "PCA"))
23 |     res <- silhouette(ids, mtx)
24 |     return(res[, "sil_width"])
25 | }
26 | 
27 | groups <- "global"
28 | n_genes <- NULL
29 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_avg.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     if (!is.matrix(y <- assay(sce))) 
 7 |         assay(sce) <- as.matrix(y)
 8 |     y <- if ("normcounts" %in% assayNames(sce)) {
 9 |         normcounts(sce)
10 |     } else calculateCPM(sce)
11 |     assay(sce, "exprs") <- log(y+1)
12 |     return(sce)
13 | }
14 | 
15 | qcFUN <- function(sce) {
16 |     rowMeans(assay(sce, "exprs"))
17 | }
18 | 
19 | groups <- NULL
20 | n_genes <- NULL
21 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_cor.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(matrixStats)
 3 |   library(scater)
 4 | })
 5 | 
 6 | ppFUN <- function(sce) {
 7 |     if (!is.matrix(y <- assay(sce))) 
 8 |         assay(sce) <- as.matrix(y)
 9 |     y <- if ("normcounts" %in% assayNames(sce)) {
10 |         normcounts(sce)
11 |     } else calculateCPM(sce)
12 |     assay(sce, "exprs") <- log(y+1)
13 |     sce[rowVars(y) > 0, ]
14 | }
15 | 
16 | qcFUN <- function(sce) {
17 |     cor <- cor(
18 |         t(assay(sce, "exprs")),
19 |         method = "spearman",
20 |         use = "pairwise.complete.obs")
21 |     cor[upper.tri(cor)]
22 | }
23 | 
24 | groups <- NULL
25 | n_genes <- 400
26 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_cv.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     if (!is.matrix(y <- assay(sce))) 
 7 |         assay(sce) <- as.matrix(y)
 8 |     y <- if ("normcounts" %in% assayNames(sce)) {
 9 |         normcounts(sce)
10 |     } else calculateCPM(sce)
11 |     assay(sce, "exprs") <- log(y+1)
12 |     return(sce)
13 | }
14 | 
15 | qcFUN <- function(sce) {
16 |     es <- assay(sce, "exprs")
17 |     sd <- sqrt(rowVars(es))
18 |     mu <- rowMeans(es)
19 |     cv <- sd / mu
20 |     cv[!is.na(cv)]
21 | }
22 | 
23 | groups <- NULL
24 | n_genes <- NULL
25 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_frq.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(SingleCellExperiment)
 3 | })
 4 | 
 5 | ppFUN <- function(sce) {
 6 |     sce
 7 | }
 8 | 
 9 | qcFUN <- function(sce) {
10 |     rowMeans(assay(sce) != 0)
11 | }
12 | 
13 | groups <- NULL
14 | n_genes <- NULL
15 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_pve.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(BiocParallel)
 3 |   library(scater)
 4 |   library(SingleCellExperiment)
 5 |   library(variancePartition)
 6 | })
 7 | 
 8 | ppFUN <- function(sce) {
 9 |     if ("normcounts" %in% assayNames(sce)) {
10 |         logcounts(sce) <- log(normcounts(sce)+1)
11 |     } else sce <- logNormCounts(sce)
12 |     return(sce)
13 | }
14 | 
15 | qcFUN <- function(sce) {
16 |     i <- c("batch", "cluster")
17 |     i <- intersect(i, names(colData(sce)))
18 |     if (length(i) == 0) return(NULL)
19 |     y <- logcounts(sce)
20 |     if (!is.matrix(y))
21 |         y <- as.matrix(y)
22 |     y <- y[rowSums(y) != 0, ]    
23 |     f <- as.formula(sprintf("~(1|%s)", i))
24 |     cd <- data.frame(colData(sce)[i])
25 |     pve <- fitExtractVarPartModel(y, f, cd, 
26 |         quiet = TRUE, BPPARAM = SerialParam())
27 |     return(pve[[i]])
28 | }
29 | 
30 | groups <- "global"
31 | n_genes <- NULL
32 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc-gene_var.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(matrixStats)
 3 |   library(scater)
 4 | })
 5 | 
 6 | ppFUN <- function(sce) {
 7 |     if (!is.matrix(y <- assay(sce))) 
 8 |         assay(sce) <- as.matrix(y)
 9 |     y <- if ("normcounts" %in% assayNames(sce)) {
10 |         normcounts(sce)
11 |     } else calculateCPM(sce)
12 |     assay(sce, "exprs") <- log(y+1)
13 |     return(sce)
14 | }
15 | 
16 | qcFUN <- function(sce) {
17 |     rowVars(assay(sce, "exprs"))
18 | }
19 | 
20 | groups <- NULL
21 | n_genes <- NULL
22 | n_cells <- NULL


--------------------------------------------------------------------------------
/code/05-calc_qc.R:
--------------------------------------------------------------------------------
 1 | source(args$uts)
 2 | source(args$fun)
 3 | 
 4 | # each QC script should specify
 5 | # ppFUN:   function to apply to the SCE for PreProcessing
 6 | # qcFUN:   function to compute the Quality Control metric
 7 | #          (both FUNs should input & output a SCE)
 8 | # groups:  for which to compute the QC metric 
 9 | #          (global, cluster, batch, NULL for all)
10 | # n_genes: number of genes to sample - globally  (NULL for all)
11 | # n_cells: number of cells to sample - per group (NULL for all)
12 | 
13 | if (exists("ppFUN")) {
14 |     stopifnot(is.function(ppFUN)) 
15 | } else {
16 |     # do nothing
17 |     ppFUN <- \(.) . 
18 | }
19 | 
20 | if (exists("qcFUN")) {
21 |     stopifnot(is.function(qcFUN)) 
22 | } else {
23 |     stop("'qcFUN' needs to be specified")
24 | }
25 | 
26 | choices <- eval(formals(.calc_qc)$groups)
27 | if (exists("groups") && !is.null(groups)) {
28 |     groups <- match.arg(groups, choices, several.ok = TRUE)
29 | } else {
30 |     groups <- choices
31 | }
32 | 
33 | if (exists("n_genes") && !is.null(n_genes)) {
34 |     stopifnot(is.numeric(n_genes), length(n_genes) == 1)
35 | } else {
36 |     # use all
37 |     n_genes <- NULL 
38 | }
39 | 
40 | if (exists("n_cells") && !is.null(n_cells)) {
41 |     stopifnot(is.numeric(n_cells), length(n_cells) == 1)
42 | } else {
43 |     # use all
44 |     n_cells <- NULL
45 | }
46 | 
47 | # ------------------------------------------------------------------------------
48 | 
49 | sce <- readRDS(args$sce)
50 | 
51 | # skip if simulation failed (return NULL)
52 | res <- if (!is.null(sce)) {
53 |     sce <- ppFUN(sce)
54 |     .calc_qc(sce, 
55 |         fun = qcFUN, 
56 |         groups = groups,
57 |         n_genes = n_genes,
58 |         n_cells = n_cells)
59 | }
60 | 
61 | res <- if (!is.null(res)) {
62 |     if (is.null(wcs$method)) 
63 |         wcs$method <- NA
64 |     data.frame(wcs, res)
65 | }
66 | 
67 | print(head(res))
68 | saveRDS(res, args$res)


--------------------------------------------------------------------------------
/code/05-runtimes.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(ngs = "x", ncs = 325, rep = 3)
 2 | # args <- list(
 3 | #     sce = "data/02-sub/Mereu20,CD4T.rds",
 4 | #     est = "code/03-est_pars-SCRIP.R",
 5 | #     sim = "code/04-sim_data-SCRIP.R")
 6 | 
 7 | suppressPackageStartupMessages({
 8 |     library(R.utils)
 9 |     library(SingleCellExperiment)
10 | })
11 | 
12 | # set seed to current rep(licate) for reproducibility
13 | set.seed(as.numeric(wcs$rep))
14 | 
15 | # read in dataset
16 | x <- readRDS(args$sce)
17 | 
18 | # if down-sampling cells, use all 
19 | # genes & store NA in output data.frame
20 | if (wcs$ngs == "x") {
21 |     wcs$ngs <- NA 
22 |     ngs <- nrow(x) 
23 | } else wcs$ngs <- ngs <- as.numeric(wcs$ngs)
24 | 
25 | # if down-sampling genes, use all 
26 | # cells & store NA in output data.frame
27 | if (wcs$ncs == "x") {
28 |     wcs$ncs <- NA 
29 |     ncs <- ncol(x) 
30 | } else wcs$ncs <- ncs <- as.numeric(wcs$ncs)
31 | 
32 | # downsample number of genes / cells
33 | x <- x[
34 |     sample(nrow(x), ngs), 
35 |     sample(ncol(x), ncs)]
36 | 
37 | sink(tempfile()) # suppress printing...
38 | 
39 | # set time limit (s) until timeout
40 | t <- 1e4
41 | 
42 | # time estimation 
43 | source(args$est)
44 | est <- withTimeout(
45 |     {
46 |     tryCatch(
47 |         system.time(y <- fun(x))[[3]],
48 |         error = function(e) e)
49 |     }, 
50 |     timeout = t, 
51 |     onTimeout = "warning")
52 | 
53 | if (is.character(est)) {
54 |     # timed out
55 |     est <- t
56 | } else if (inherits(est, "error")) {
57 |     # Inf if estimation failed
58 |     est <- Inf
59 | } else if (is.null(y)) {
60 |     # NA if included in simulation
61 |     est <- NA_real_
62 |     # pass SCE to simulation
63 |     y <- x
64 | }
65 | 
66 | # time simulation
67 | source(args$sim)
68 | sim <- withTimeout(
69 |     {
70 |     tryCatch(
71 |         system.time(fun(y))[[3]],
72 |         error = function(e) e)
73 |     }, 
74 |     timeout = t, 
75 |     onTimeout = "warning")
76 | 
77 | if (is.character(sim)) {
78 |     # timed out
79 |     sim <- t
80 | } else if (inherits(sim, "error")) {
81 |     # Inf if estimation failed
82 |     sim <- Inf
83 | } else if (is.null(sim)) {
84 |     # NA if included in simulation
85 |     sim <- NA_real_
86 | }
87 | 
88 | sink() # ...until here
89 | 
90 | # construct table including wildcards (wcs),
91 | # timing of est(imation) & sim(ulation), 
92 | # and number of genes/cells (ng/cs)
93 | res <- data.frame(wcs, est, sim, row.names = NULL)
94 | 
95 | # ...and write to .rds
96 | saveRDS(res, args$res)


--------------------------------------------------------------------------------
/code/06-dr_batch.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(scater)
 3 |     library(scran)
 4 |     library(SingleCellExperiment)
 5 | })
 6 | 
 7 | # args <- list(
 8 | #     sce = "data/04-sim/Oetjen18,foo,SPARSim.rds",
 9 | #     res = "outs/batch_sim-Oetjen18,foo,SPARSim,mnnCorrect.rds")
10 | 
11 | x <- readRDS(args[[1]])
12 | y <- readRDS(args[[2]])
13 | 
14 | df <- if (!is.null(x) && !is.null(y)) {
15 |     if (!is.null(y$dimred_in) && 
16 |         !is.null(y$dimred_out)) {
17 |         # use integrated dimension reduction
18 |         reducedDim(x, "PCA") <- y$dimred_out
19 |     } else if (
20 |         !is.null(y$assay_in) && 
21 |         !is.null(y$assay_out)) {
22 |         # run PCA on integrated data
23 |         pca <- calculatePCA(y$assay_out)
24 |         reducedDim(x, "PCA") <- pca
25 |     }
26 |     
27 |     x <- runTSNE(x, dimred = "PCA")
28 |     x <- runUMAP(x, dimred = "PCA")
29 |     
30 |     tsne <- reducedDim(x, "TSNE")
31 |     umap <- reducedDim(x, "UMAP")
32 |     colnames(tsne) <- paste0("TSNE", seq(2))
33 |     colnames(umap) <- paste0("UMAP", seq(2))
34 |     
35 |     data.frame(wcs, tsne, umap, batch = x$batch)
36 | }
37 | saveRDS(df, args[[3]])
38 | 


--------------------------------------------------------------------------------
/code/06-eval_batch.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(CellMixS)
 3 |     library(SingleCellExperiment)
 4 | })
 5 | 
 6 | # args <- list(
 7 | #     sce = "data/02-sub/panc8,inDrop.ductal.rds",
 8 | #     res = "outs/batch_sim-panc8,inDrop.ductal,muscat,Seurat.rds")
 9 | 
10 | sce <- readRDS(args[[1]])
11 | res <- readRDS(args[[2]])
12 | 
13 | df <- if (!is.null(sce) && !is.null(res)) {
14 |     
15 |     if (!is.factor(sce$batch))
16 |         sce$batch <- factor(sce$batch)
17 |     
18 |     group <- "batch"
19 |     k <- min(table(sce$batch))/2
20 |     
21 |     # x = uncorrected, 
22 |     # y = integrated
23 |     if (!is.null(res$dimred_in) && 
24 |         !is.null(res$dimred_out)) {
25 |         # compute scores io dimension reductions
26 |         dimred_in <- dimred_out <- "foo"
27 |         assay_in <- assay_out <- "logcounts"
28 |         # setup input data
29 |         x <- sce
30 |         reducedDim(x, "foo") <- res$dimred_in
31 |         # setup output data
32 |         y <- sce
33 |         reducedDim(y, "foo") <- res$dimred_out 
34 |     } else if (
35 |         !is.null(res$assay_in) && 
36 |         !is.null(res$assay_out)) {
37 |         # compute scores on integrated assay data
38 |         assay_in <- assay_out <- "foo"
39 |         dimred_in <- dimred_out <- "PCA"
40 |         # setup input data
41 |         x <- sce
42 |         assay(x, "foo", FALSE) <- res$assay_in
43 |         # setup output data
44 |         y <- sce
45 |         assay(y, "foo", FALSE) <- res$assay_out
46 |     }
47 |     # split input data by batch
48 |     idx <- split(seq(ncol(x)), x$batch)
49 |     x <- lapply(idx, \(.) x[, .])
50 |         
51 |     suppressMessages({
52 |         # cell-specific changes in Local Density 
53 |         # Factor (LDF) before vs. after integration
54 |         ldf <- ldfDiff(x, y, group, k, dimred_in, dimred_out, assay_in, assay_out)
55 |         
56 |         # cell-specific mixing scores based on 
57 |         # euclidean distances within integrated data
58 |         cms <- cms(y, k, group, dimred_in, assay_in)
59 |     })
60 |     
61 |     data.frame(wcs,
62 |         row.names = NULL,
63 |         batch = sce$batch,
64 |         ldf = ldf$diff_ldf,
65 |         cms = cms$cms)
66 | }
67 | 
68 | saveRDS(df, args[[3]])


--------------------------------------------------------------------------------
/code/06-eval_clust.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(dplyr)
 3 |     library(tidyr)
 4 | })
 5 | 
 6 | source(args$uts)
 7 | 
 8 | sce <- readRDS(args$sce)
 9 | true <- droplevels(factor(sce$cluster))
10 | 
11 | res <- args[c("ref", "sim")] %>% 
12 |     unlist() %>% 
13 |     lapply(readRDS) %>% 
14 |     bind_rows() %>% 
15 |     replace_na(list(method = "ref"))
16 |     
17 | # for each simulators & clustering method, match prediction w/ truth 
18 | # using Hungarian algorithm & compute precision, recall, F1 score 
19 | res <- group_by(res,
20 |     across(contains("method"))) %>% 
21 |     group_modify(~{
22 |         res <- .hungarian(.x$pred, .x$true)
23 |         data.frame(
24 |             cluster = levels(true),
25 |             res[c("pr", "re", "F1")])
26 |     }) %>% data.frame(wcs)
27 | 
28 | saveRDS(res, args$res)
29 | 


--------------------------------------------------------------------------------
/code/06-stat_1d-ks.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) {
2 |     suppressWarnings(z <- ks.test(x, y))
3 |     as.numeric(z$statistic)
4 | }


--------------------------------------------------------------------------------
/code/06-stat_1d-ws.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) waddR::wasserstein_metric(x, y)


--------------------------------------------------------------------------------
/code/06-stat_1d.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |   library(dplyr)
 3 |   library(purrr)
 4 |   library(tidyr)
 5 | })
 6 | 
 7 | # read in QC results for reference (ref) & simulation (sim)
 8 | ref <- readRDS(args$ref)
 9 | sim <- readRDS(args$sim)
10 | 
11 | # check that QC is available for reference & simulation
12 | res <- if (!(is.null(ref) || is.null(sim))) {
13 |   # source test statistic function
14 |   source(args$fun)
15 |   # test ref vs. sim for each grouping
16 |   bind_rows(ref, sim) %>% 
17 |     pivot_wider(
18 |       names_from = "method", 
19 |       values_from = "value", 
20 |       values_fn = list) %>% 
21 |     rename(
22 |       ref = "NA", 
23 |       sim = wcs$method) %>% 
24 |     rowwise() %>% 
25 |     mutate(stat = fun(ref, sim)) %>% 
26 |     ungroup() %>% 
27 |     select(negate(is.list)) %>% 
28 |     mutate(.before = 1, data.frame(wcs))
29 | }
30 | 
31 | print(head(res))
32 | saveRDS(res, args$res)
33 | 


--------------------------------------------------------------------------------
/code/06-stat_2d-emd.R:
--------------------------------------------------------------------------------
 1 | fun <- function(x, y, n = 25) {
 2 |     stopifnot(is.numeric(n), length(n) == 1, n == as.integer(n))
 3 |     if (is.null(dim(x))) {
 4 |         # ONE-DIMENSIONAL
 5 |         # smoothing
 6 |         x <- density(x, n = n)$x
 7 |         y <- density(y, n = n)$x
 8 |         # compute EMD
 9 |         ws <- rep(1/n, n)
10 |         x <- cbind(ws, x)
11 |         y <- cbind(ws, y)
12 |         emd(x, y)
13 |     } else {
14 |         # TWO-DIMENSIONAL
15 |         if (!is.matrix(x)) x <- as.matrix(x)
16 |         if (!is.matrix(y)) y <- as.matrix(y)
17 |         # smoothing over common range
18 |         rng <- c(
19 |             range(c(x[, 1], y[, 1])),
20 |             range(c(x[, 2], y[, 2])))
21 |         x <- MASS::kde2d(x[, 1], x[, 2], n = n, lims = rng)
22 |         y <- MASS::kde2d(y[, 1], y[, 2], n = n, lims = rng)
23 |         # compute EMD
24 |         emdist::emd2d(x$z, y$z)/n
25 |     }
26 | }


--------------------------------------------------------------------------------
/code/06-stat_2d-ks2.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) Peacock.test::peacock2(x, y)


--------------------------------------------------------------------------------
/code/06-stat_2d.R:
--------------------------------------------------------------------------------
 1 | suppressPackageStartupMessages({
 2 |     library(dplyr)
 3 |     library(purrr)
 4 |     library(tidyr)
 5 | })
 6 | 
 7 | x <- lapply(args$x, readRDS) %>% bind_rows()
 8 | y <- lapply(args$y, readRDS) %>% bind_rows()
 9 | 
10 | # check that QCs are available for reference & simulation
11 | res <- if (!(all(is.na(x$method)) || all(is.na(y$method)))) {
12 |     # source evaluation function
13 |     source(args$fun)
14 |     # for each grouping, test simulation vs. reference
15 |     list(x = x, y = y) %>% 
16 |     bind_rows(.id = "dim") %>% 
17 |     pivot_wider(
18 |         id_cols = c("group", "id"),
19 |         names_from = c("dim", "method"),
20 |         values_from = "value",
21 |         values_fn = list) %>% 
22 |     rename_at(
23 |         vars(contains("NA")),
24 |         function(.) gsub("NA", "ref", .)) %>% 
25 |     rename_at(
26 |         vars(contains(wcs$method)), 
27 |         function(.) gsub(wcs$method, "sim", .)) %>% 
28 |     rowwise() %>% 
29 |     mutate(stat = fun(
30 |         cbind(x_ref, y_ref),
31 |         cbind(x_sim, y_sim))) %>% 
32 |     ungroup() %>% 
33 |     select_if(negate(is.list)) %>% 
34 |     mutate(.before = 1, data.frame(wcs))
35 | }
36 | 
37 | print(head(res))
38 | saveRDS(res, args$res)
39 | 


--------------------------------------------------------------------------------
/code/07-plot_batch-boxplot_by_method.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "cms")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     res = list.files("outs", "^batch_res-", full.names = TRUE),
 6 | #     rds = sprintf("plts/batch-boxplot_by_method_%s.rds", wcs$val),
 7 | #     pdf = sprintf("plts/batch-boxplot_by_method_%s.pdf", wcs$val))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res)
13 | 
14 | df <- res %>% 
15 |     .cms_ldf() %>% 
16 |     .bcs(n = 1) # average across cells but not batches
17 | 
18 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5))
19 | 
20 | plt <- ggplot(df, aes(
21 |     reorder_within(batch_method, .data[[wcs$val]], sim_method, median), 
22 |     .data[[wcs$val]], col = batch_method, fill = batch_method)) +
23 |     facet_grid(~ sim_method, scales = "free_x") +
24 |     geom_hline(yintercept = c(0, 1), size = 0.1) +
25 |     geom_boxplot(
26 |         size = 0.25, outlier.size = 0.25,
27 |         alpha = 0.25, key_glyph = "point") +
28 |     scale_y_continuous(
29 |         .batch_labs[wcs$val], 
30 |         limits = lim, 
31 |         n.breaks = 3)
32 | 
33 | thm <- theme(
34 |     legend.title = element_blank(),
35 |     axis.text.x = element_blank(),
36 |     axis.title.x = element_blank(),
37 |     axis.ticks.x = element_blank())
38 | 
39 | fig <- .prettify(plt, thm)
40 | 
41 | saveRDS(fig, args$rds)
42 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")
43 | 


--------------------------------------------------------------------------------
/code/07-plot_batch-boxplot_dX.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "ldf")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
 6 | #     pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
 7 | #     res = list.files("outs", "^batch_res", full.names = TRUE))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res)
13 | 
14 | df <- res %>% 
15 |     .cms_ldf() %>% 
16 |     .bcs(n = 1) %>% # average across cells but not batches
17 |     group_by(refset, batch_method, batch) %>% 
18 |     mutate(val = .data[[wcs$val]]-.data[[wcs$val]][.data$sim_method == "ref"]) %>%
19 |     filter(sim_method != "ref") %>%
20 |     mutate(sim_method = droplevels(sim_method))
21 | 
22 | lim <- c(
23 |     floor(min(df$val)/0.1)*0.1, 
24 |     ceiling(max(df$val)/0.1)*0.1)
25 | lab <- eval(.batch_labs[wcs$val])
26 | 
27 | plt <- ggplot(df, aes(reorder(sim_method, val, median), val)) +
28 |     geom_hline(yintercept = 0, size = 0.1, col = "red") +
29 |     geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") +
30 |     geom_violin(fill = NA, col = "grey", size = 0.2) +
31 |     scale_y_continuous(limits = lim, breaks = c(0, lim)) +
32 |     labs(x = NULL, y = bquote(Delta*.(lab)*"(sim-ref)"))
33 | 
34 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),)
35 | 
36 | fig <- .prettify(plt, thm)
37 | 
38 | saveRDS(fig, args$rds)
39 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
40 | 


--------------------------------------------------------------------------------
/code/07-plot_batch-correlations.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "cms")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
 6 | #     pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
 7 | #     res = list.files("outs", "^batch_res", full.names = TRUE))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res)
13 | 
14 | df <- res %>% 
15 |     .cms_ldf() %>% 
16 |     .bcs(n = 1) # average across cells & batches
17 | 
18 | mat <- df %>% 
19 |     ungroup() %>% 
20 |     pivot_wider(
21 |         values_from = wcs$val,
22 |         names_from = "sim_method",
23 |         id_cols = c("refset", "batch_method", "batch")) %>% 
24 |     select(any_of(c("ref", names(.methods_pal)))) %>% 
25 |     cor(method = "pearson", use = "pairwise.complete.obs")
26 | 
27 | mat[is.na(mat)] <- 0
28 | xo <- rownames(mat)[hclust(dist(mat))$order]
29 | yo <- rownames(mat)[hclust(dist(t(mat)))$order]
30 | 
31 | df <- mat %>% 
32 |     data.frame(
33 |         from = rownames(.), 
34 |         check.names = FALSE) %>% 
35 |     pivot_longer(
36 |         cols = -from, 
37 |         names_to = "to",
38 |         values_to = "corr") %>% 
39 |     mutate(
40 |         to = factor(to, levels = xo),
41 |         from = factor(from, levels = yo)) %>% 
42 |     filter(as.numeric(to) <= as.numeric(from))
43 | 
44 | fd <- filter(df, from != to)
45 | min <- floor(min(fd$corr)/0.1)*0.1
46 | max <- ceiling(max(fd$corr)/0.1)*0.1
47 | plt <- ggplot(fd, aes(from, to, fill = corr)) +
48 |     geom_tile() +
49 |     scale_fill_gradientn("r",
50 |         colors = rev(hcl.colors(11, "Grays")),
51 |         limits = c(min, max), breaks = c(min, max)) +
52 |     coord_equal(expand = FALSE) +
53 |     scale_x_discrete(limits = rev(xo[-1])) 
54 | 
55 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1]
56 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1]
57 | 
58 | thm <- theme(
59 |     axis.text.y = element_text(face = fy),
60 |     axis.text.x = element_text(angle = 45, hjust = 1, face = fx),
61 |     axis.title = element_blank(),
62 |     panel.border = element_blank(),
63 |     legend.title = element_text(vjust = 1),
64 |     legend.direction = "horizontal",
65 |     legend.position = c(1, 1),
66 |     legend.justification = c(1, 1))
67 | 
68 | fig <- .prettify(plt, thm)
69 | 
70 | saveRDS(fig, args$rds)
71 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_batch-densities.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "cms")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     rds = paste0("plts/batch-densities_", wcs$val, ".rds"),
 6 | #     pdf = paste0("plts/batch-densities_", wcs$val, ".pdf"),
 7 | #     res = list.files("outs", "^batch_res", full.names = TRUE))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res)
13 | 
14 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5))
15 | 
16 | fig <- if (wcs$val != "bcs") {
17 |     df <- .cms_ldf(res)
18 |     plt <- ggplot(df, aes(
19 |         .data[[wcs$val]], ..ndensity.., 
20 |         col = batch_method, fill = batch_method)) +
21 |         facet_grid(sim_method ~ refset) +
22 |         geom_vline(xintercept = 0, size = 0.2) +
23 |         geom_density(alpha = 0, key_glyph = "point") +
24 |         scale_x_continuous(limits = lim, n.breaks = 3) +
25 |         scale_y_continuous(limits = c(0, 1), n.breaks = 3) +
26 |         labs(x = .batch_labs[wcs$val], y = "scaled density")
27 |     thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
28 |     .prettify(plt, thm)
29 | }
30 | 
31 | saveRDS(fig, args$rds)
32 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_batch-heatmap_by_method.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "cms")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
 6 | #     pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
 7 | #     res = list.files("outs", "^batch_res", full.names = TRUE))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res)
13 | 
14 | df <- res %>% 
15 |     .cms_ldf() %>% 
16 |     .bcs(n = 2) # average across cells & batches
17 | 
18 | max <- ceiling(max(df$bcs)/0.1)*0.1
19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5))
20 | 
21 | plt <- ggplot(df, aes(
22 |     reorder_within(batch_method, .data[[wcs$val]], sim_method, median), 
23 |     refset, fill = .data[[wcs$val]])) +
24 |     facet_grid(~ sim_method, scales = "free_x") +
25 |     geom_tile(col = "white") +
26 |     scale_fill_distiller(
27 |         .batch_labs[wcs$val],
28 |         palette = "RdYlBu",
29 |         na.value = "lightgrey",
30 |         limits = lim,
31 |         n.breaks = 3) +
32 |     coord_cartesian(expand = FALSE) +
33 |     scale_x_reordered() +
34 |     scale_y_reordered() 
35 | 
36 | thm <- theme(
37 |     axis.title = element_blank(),
38 |     axis.text.x = element_text(angle = 45, hjust = 1))
39 | 
40 | fig <- .prettify(plt, thm)
41 | 
42 | saveRDS(fig, args$rds)
43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_batch-heatmap_by_refset.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(val = "cms")
 2 | # args <- list(
 3 | #     uts1 = "code/utils-plotting.R",
 4 | #     uts2 = "code/utils-integration.R",
 5 | #     rds = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".rds"),
 6 | #     pdf = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".pdf"),
 7 | #     res = list.files("outs", "^batch_res", full.names = TRUE))
 8 | 
 9 | source(args$uts1)
10 | source(args$uts2)
11 | 
12 | res <- .read_res(args$res) 
13 | 
14 | df <- res %>% 
15 |     .cms_ldf() %>% 
16 |     .bcs(n = 2) # average across cells & batches
17 | 
18 | max <- ceiling(max(df$bcs)/0.1)*0.1
19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5))
20 | 
21 | plt <- ggplot(df, aes(
22 |     reorder_within(batch_method, .data[[wcs$val]], refset, median), 
23 |     sim_method, fill = .data[[wcs$val]])) +
24 |     facet_grid(~ refset, scales = "free_x") +
25 |     geom_tile(col = "white") +
26 |     scale_fill_distiller(
27 |         .batch_labs[wcs$val],
28 |         palette = "RdYlBu",
29 |         na.value = "lightgrey",
30 |         limits = lim,
31 |         n.breaks = 3) +
32 |     coord_cartesian(expand = FALSE) +
33 |     scale_x_reordered() +
34 |     scale_y_reordered() 
35 | plt
36 | thm <- theme(
37 |     axis.title = element_blank(),
38 |     axis.text.x = element_text(angle = 45, hjust = 1))
39 | 
40 | fig <- .prettify(plt, thm)
41 | 
42 | saveRDS(fig, args$rds)
43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")
44 | 


--------------------------------------------------------------------------------
/code/07-plot_clust-boxplot_by_method.R:
--------------------------------------------------------------------------------
 1 | source(args$uts)
 2 | 
 3 | res <- .read_res(args$res)
 4 | 
 5 | # order according to average F1 score
 6 | res$sim_method <- factor(
 7 |     res$sim_method, 
 8 |     levels = res %>% 
 9 |         group_by(sim_method) %>% 
10 |         summarise_at("F1", mean) %>% 
11 |         arrange(F1) %>% 
12 |         pull("sim_method"))
13 | 
14 | plt <- ggplot(res, aes(
15 |     reorder_within(clust_method, -F1, sim_method, median), 
16 |     F1, col = clust_method, fill = clust_method)) +
17 |     facet_grid(~ sim_method, scales = "free_x") +
18 |     geom_hline(yintercept = c(0, 1), size = 0.1) +
19 |     geom_boxplot(
20 |         size = 0.25, outlier.size = 0.25,
21 |         alpha = 0.25, key_glyph = "point") +
22 |     scale_x_reordered(NULL) +
23 |     scale_y_continuous(breaks = seq(0, 1, 0.2)) 
24 | 
25 | thm <- theme(
26 |     legend.title = element_blank(),
27 |     axis.text.x = element_blank(),
28 |     axis.ticks.x = element_blank())
29 | 
30 | fig <- .prettify(plt, thm)
31 | 
32 | saveRDS(fig, args$rds)
33 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm")
34 | 


--------------------------------------------------------------------------------
/code/07-plot_clust-boxplot_dF1.R:
--------------------------------------------------------------------------------
 1 | source(args$uts)
 2 | 
 3 | res <- .read_res(args$res)
 4 | 
 5 | df <- res %>%
 6 |     group_by(refset, cluster, clust_method) %>%
 7 |     mutate(F1 = F1-.data$F1[.data$sim_method == "ref"]) %>%
 8 |     filter(sim_method != "ref") %>%
 9 |     mutate(sim_method = droplevels(sim_method))
10 | 
11 | plt <- ggplot(df, aes(reorder(sim_method, F1, median), F1)) +
12 |     geom_hline(yintercept = 0, size = 0.1, col = "red") +
13 |     geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") +
14 |     geom_violin(fill = NA, col = "grey", size = 0.2) +
15 |     scale_y_continuous(limits = c(-1, 1), n.breaks = 5) +
16 |     labs(x = NULL, y = expression(Delta*"F1(sim-ref)"))
17 | 
18 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),)
19 | 
20 | fig <- .prettify(plt, thm)
21 | 
22 | saveRDS(fig, args$rds)
23 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
24 | 


--------------------------------------------------------------------------------
/code/07-plot_clust-correlations.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     rds = "plts/clust-correlations.rds",
 4 | #     pdf = "plts/clust-correlations.pdf",
 5 | #     res = list.files("outs", "^clust_res", full.names = TRUE))
 6 | 
 7 | source(args$uts)
 8 | 
 9 | res <- .read_res(args$res)
10 | 
11 | mat <- res %>%
12 |     pivot_wider(
13 |         id_cols = c("refset", "cluster", "clust_method"),
14 |         names_from = "sim_method",
15 |         values_from = "F1") %>%
16 |     select(any_of(c("ref", names(.methods_pal)))) %>%
17 |     cor(method = "pearson", use = "pairwise.complete.obs")
18 | 
19 | xo <- rownames(mat)[hclust(dist(mat))$order]
20 | yo <- rownames(mat)[hclust(dist(t(mat)))$order]
21 | 
22 | df <- mat %>% 
23 |     data.frame(
24 |         from = rownames(.), 
25 |         check.names = FALSE) %>% 
26 |     pivot_longer(
27 |         cols = -from, 
28 |         names_to = "to",
29 |         values_to = "corr") %>% 
30 |     mutate(
31 |         to = factor(to, levels = xo),
32 |         from = factor(from, levels = yo)) %>% 
33 |     filter(as.numeric(to) <= as.numeric(from))
34 | 
35 | fd <- filter(df, from != to)
36 | min <- floor(min(fd$corr)/0.1)*0.1
37 | max <- ceiling(max(fd$corr)/0.1)*0.1
38 | plt <- ggplot(fd, aes(from, to, fill = corr)) +
39 |     geom_tile() +
40 |     scale_fill_gradientn("r",
41 |         colors = rev(hcl.colors(11, "Grays")),
42 |         limits = c(min, max), breaks = c(min, max)) +
43 |     coord_equal(expand = FALSE) +
44 |     scale_x_discrete(limits = rev(xo[-1])) 
45 | 
46 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1]
47 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1]
48 | 
49 | thm <- theme(
50 |     axis.text.y = element_text(face = fy),
51 |     axis.text.x = element_text(angle = 45, hjust = 1, face = fx),
52 |     axis.title = element_blank(),
53 |     panel.border = element_blank(),
54 |     legend.title = element_text(vjust = 1),
55 |     legend.direction = "horizontal",
56 |     legend.position = c(1, 1),
57 |     legend.justification = c(1, 1))
58 | 
59 | fig <- .prettify(plt, thm)
60 | 
61 | saveRDS(fig, args$rds)
62 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_clust-heatmap_by_method.R:
--------------------------------------------------------------------------------
 1 | source(args$uts)
 2 | 
 3 | res <- .read_res(args$res)
 4 | 
 5 | df <- res %>% 
 6 |     group_by(
 7 |         refset,
 8 |         sim_method,
 9 |         clust_method) %>% 
10 |     summarise_at("F1", mean) %>% 
11 |     mutate(rank = rank(-F1))
12 | 
13 | plt <- ggplot(df, aes(
14 |     reorder_within(clust_method, rank, sim_method), 
15 |     reorder(refset, rank), fill = rank)) +
16 |     facet_wrap(~ sim_method, nrow = 1, scales = "free_x") +
17 |     geom_tile(col = "white", key_glyph = "point") +
18 |     scale_fill_gradientn(
19 |         colors = rev(hcl.colors(5, "Spectral")),
20 |         limits = range(df$rank), 
21 |         breaks = range(df$rank),
22 |         labels = c("best", "worst")) +
23 |     guides(fill = guide_colorbar(reverse = TRUE)) +
24 |     coord_cartesian(expand = FALSE) +
25 |     scale_x_reordered()
26 | 
27 | thm <- theme(
28 |     axis.title = element_blank(),
29 |     axis.text.x = element_text(size = 4, angle = 45, hjust = 1))
30 | 
31 | fig <- .prettify(plt, thm)
32 | 
33 | saveRDS(fig, args$rds)
34 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm")
35 | 


--------------------------------------------------------------------------------
/code/07-plot_corr-stat_2d.R:
--------------------------------------------------------------------------------
 1 | source(args$fun)
 2 | 
 3 | # args <- list(res = list.files("outs", "stat_2d-.*(ks2|emd)\\.rds", full.names = TRUE))
 4 | 
 5 | res <- .read_res(args$res)
 6 | 
 7 | df <- res %>% 
 8 |   filter(!is.na(stat)) %>%
 9 |   mutate(
10 |     refset = paste(datset, subset, sep = ","),
11 |     metrics = paste(metric1, metric2, sep = "\n")) %>%
12 |   select(-c(datset, subset, metric1, metric2)) %>% 
13 |   # for type != n, keep batch-/cluster-level comparisons only
14 |   group_by(stat2d, refset, method, metrics) %>% 
15 |   mutate(n = n()) %>% 
16 |   filter(group != "global" | n == 1) %>%  
17 |   select(-c(group, n))
18 | 
19 | gg <- pivot_wider(df,
20 |   names_from = stat2d,
21 |   values_from = stat) %>% 
22 |   group_by(refset, metrics) %>%
23 |   mutate(
24 |     ks2 = ks2/max(ks2, na.rm = TRUE),
25 |     emd = emd/max(emd, na.rm = TRUE))
26 | 
27 | plt <- ggplot(gg, aes(emd, ks2)) +
28 |   facet_wrap(~ metrics, nrow = 2) + 
29 |   geom_abline(intercept = 0, slope = 1, size = 0.2, lty = 2) +
30 |   stat_smooth(method = "lm", formula = y ~ x, size = 0.4, col = "red") +
31 |   geom_point(alpha = 0.2, size = 1, shape = 16, col = "tomato") + 
32 |   stat_cor(method = "pearson", size = 1.5,
33 |     hjust = 1, vjust = 0, label.x.npc = 1, label.y.npc = 0) +
34 |   scale_x_continuous(limits = c(0, 1), breaks = c(0, 1)) +
35 |   scale_y_continuous(limits = c(0, 1), breaks = c(0, 1)) +
36 |   labs(x = "scaled EMD", y = "scaled KS statistic")
37 | 
38 | thm <- theme(
39 |   aspect.ratio = 1,
40 |   legend.position = "none")
41 | 
42 | fig <- .prettify(plt, thm)
43 | 
44 | saveRDS(fig, args$ggp)
45 | ggsave(args$plt, fig, width = 16, height = 9.25, units = "cm")
46 | 


--------------------------------------------------------------------------------
/code/07-plot_dimred.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "k")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = list.files("outs", "dr_ref", full.names = TRUE),
 5 | #     pdf = sprintf("figs/dimred_%s.pdf", wcs$reftyp))
 6 | 
 7 | source(args$fun)
 8 | 
 9 | dfs <- args$res %>%
10 |     lapply(readRDS) %>%
11 |     bind_rows() %>%
12 |     mutate(
13 |         refset = paste(datset, subset, sep = ","),
14 |         type = ifelse(!is.na(batch), "b", ifelse(!is.na(cluster), "k", "n"))) %>% 
15 |     filter(type == wcs$reftyp) %>% 
16 |     group_split(refset) 
17 | 
18 | # number of rows & columns for facetting
19 | nc <- ceiling(length(dfs)/(nr <- 3))
20 | 
21 | # get variable to color by
22 | # - log-library size (lls) for type n
23 | # - batch for type b, cluster for type k
24 | col <- switch(wcs$reftyp, n = "lls", b = "batch", k = "cluster")
25 | 
26 | fig <- lapply(seq_along(dfs), \(i) {
27 |     plt <- ggplot(dfs[[i]], aes_string(
28 |         "TSNE1", "TSNE2", col = col, fill = col)) +
29 |         geom_point_rast(size = 0.1, alpha = 0.2) +
30 |         (if (col == "lls") scale_color_viridis_c()) +
31 |         guides(fill = "none") + ggtitle(dfs[[i]]$refset[1]) 
32 |     thm <- theme(
33 |         aspect.ratio = 1,
34 |         axis.text = element_blank(),
35 |         axis.ticks = element_blank(),
36 |         legend.title = element_blank(),
37 |         legend.justification = c(0, 0.5),
38 |         plot.title = element_text(hjust = 0.5),
39 |         axis.title.x = if (i > length(dfs) - nc) element_text() else element_blank(),
40 |         axis.title.y = if (i %% nc != 1) element_blank() else element_text(angle = 90))
41 |     .prettify(plt, thm)
42 | }) %>% wrap_plots(nrow = nr)
43 | 
44 | ggsave(args$pdf, fig, width = 16, height = 4*length(dfs)/n, units = "cm")
45 | 


--------------------------------------------------------------------------------
/code/07-plot_dimred_batch.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     rds = "plts/batch-dimred.rds",
 3 | #     pdf = "plts/batch-dimred.pdf",
 4 | #     uts = "code/utils-plotting.R",
 5 | #     res = list.files("outs", "^dr_", full.names = TRUE))
 6 | # args$res <- grep("CellBench,H", args$res, value = TRUE)
 7 | 
 8 | source(args$uts)
 9 | 
10 | res <- .read_res(args$res)
11 |     
12 | dfs <- res %>% 
13 |     replace_na(list(batch_method = "none")) %>%
14 |     mutate(
15 |         batch_method = factor(batch_method),
16 |         batch_method = relevel(batch_method, "none")) %>% 
17 |     group_split(refset)
18 | 
19 | lys <- lapply(dfs, \(df) {
20 |     plt <- ggplot(df, aes(TSNE1, TSNE2, col = batch, fill = batch)) +
21 |         facet_grid(batch_method ~ sim_method, scales = "free") +
22 |         geom_point_rast(shape = 21, size = 0.05, alpha = 0.2) +
23 |         ggtitle(df$refset[1])
24 |     thm <- theme(
25 |         aspect.ratio = 1,
26 |         axis.text = element_blank(),
27 |         axis.ticks = element_blank(),
28 |         axis.title = element_text(hjust = 0))
29 |     fig <- .prettify(plt, thm)
30 | })
31 | 
32 | saveRDS(lys, args$rds)
33 | 
34 | pdf(args$pdf, width = 16/2.54, height = 12/2.54)
35 | for (p in lys) print(p); dev.off()
36 | 


--------------------------------------------------------------------------------
/code/07-plot_memory.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     pdf = "plts/memory.pdf",
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = list.files("logs", ".*\\.txt", full.names = TRUE))
 5 | 
 6 | source(args$fun)
 7 | res <- lapply(args$res, read.table, header = TRUE)
 8 | mbs <- do.call(rbind, res)[["max_rss"]]
 9 | 
10 | # get metadata from output names
11 | # rts_{reftyp}-{datset},{subset},
12 | # {method},{ngs},{ncs},{rep}.rds
13 | ss <- strsplit(basename(gsub("\\.txt", "", args$res)), ",")
14 | reftyp <- gsub("rts_([a-z])-.*", "\\1", sapply(ss, .subset, 1))
15 | refset <- paste(sep = ",",
16 |     gsub(".*-", "", sapply(ss, .subset, 1)),
17 |     sapply(ss, .subset, 2))
18 | vars <- seq(3, 6)
19 | names(vars) <- c("method", "ngs", "ncs", "rep")
20 | vars <- lapply(vars, \(.) sapply(ss, .subset, .))
21 | 
22 | df <- data.frame(mbs, reftyp, refset, vars) %>% 
23 |     pivot_longer(
24 |         values_to = "n",
25 |         names_to = "dim",
26 |         cols = c("ngs", "ncs")) %>% 
27 |     filter(n != "x") %>% 
28 |     group_by(dim) %>%
29 |     mutate(n = factor(n, sort(as.numeric(unique(n))))) %>%
30 |     ungroup() %>%
31 |     mutate(
32 |         dim = factor(dim,
33 |             levels = c("ngs", "ncs"),
34 |             labels = c("# genes", "# cells")),
35 |         method = droplevels(factor(method, names(.methods_pal)))) %>% 
36 |     group_by(method, dim, n) %>%
37 |     summarise_at("mbs", mean)
38 | 
39 | # order methods by average across subsets
40 | ms <- df %>% 
41 |     group_by(method) %>% 
42 |     summarise_at("mbs", mean) %>% 
43 |     arrange(mbs) %>% pull("method")
44 | df$method <- factor(df$method, levels = ms)
45 | 
46 | pal <- .methods_pal[levels(df$method)]
47 | 
48 | lab <- parse(text = paste(
49 |     sep = "~",
50 |     sprintf("bold(%s)", LETTERS), 
51 |     gsub("\\s", "~", names(pal))))
52 | 
53 | anno <- mutate(df, letter = LETTERS[
54 |     match(method, levels(method))])    
55 | 
56 | plt <- ggplot(df, aes(factor(n), mbs, 
57 |     col = method, fill = method)) +
58 |     facet_grid(~ dim, scales = "free") +
59 |     geom_bar(
60 |         width = 0.9,
61 |         stat = "identity",
62 |         position = "dodge") +
63 |     geom_text(data = anno, 
64 |         aes(label = letter, y = -250),
65 |         size = 1.5, color = "black",
66 |         position = position_dodge(0.9)) + 
67 |     scale_fill_manual(values = pal, labels = lab) +
68 |     scale_color_manual(values = pal, labels = lab) +
69 |     xlab(NULL) + scale_y_continuous(
70 |         "memory usage (MBs)",
71 |         limits = c(-500, 4e3),
72 |         expand = c(0, 0))
73 | 
74 | thm <- theme(
75 |     axis.ticks.x = element_blank(),
76 |     legend.title = element_blank(),
77 |     legend.text = element_text(hjust = 0),
78 |     panel.grid.major.y = element_line(color = "grey"))
79 | 
80 | fig <- .prettify(plt, thm)
81 | 
82 | saveRDS(fig, args$rds)
83 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_qc_ref-correlations.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-qc_ref.rds",
 5 | #     rds = "plts/qc_ref-correlations.rds",
 6 | #     pdf = "plts/qc_ref-correlations.pdf")
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | # exclude summaries that include sampling
12 | ex <- .metrics_lab[grep("cor|pcd", names(.metrics_lab))]
13 | df <- res %>% 
14 |     select(-c(datset, subset)) %>% 
15 |     filter(!metric %in% ex) %>% 
16 |     mutate(
17 |         # add summary short names
18 |         .metric = names(.metrics_lab)[match(metric, .metrics_lab)],
19 |         # add summary type
20 |         type = ifelse(grepl("cell", .metric), "cell", "gene")) %>% 
21 |     # keep group-level comparisons only
22 |     mutate(across(c(group, id), as.character)) %>%
23 |     filter(group != id | .metric %in% .none_metrics) %>% 
24 |     select(-c(.metric, group, id)) %>% 
25 |     # correlate summaries of same type for each refset
26 |     group_by(refset, metric) %>% 
27 |     mutate(n = row_number()) %>% 
28 |     group_by(refset, type) %>% 
29 |     group_map(\(df, keys) {
30 |         pivot_wider(df,
31 |             names_from = metric, 
32 |             values_from = value) %>% 
33 |         select(any_of(.metrics_lab)) %>% 
34 |         cor(method = "spearman", 
35 |             use = "pairwise.complete.obs") %>% 
36 |         data.frame(from = rownames(.)) %>% 
37 |         pivot_longer(-from, names_to = "to") %>% 
38 |         mutate(keys)
39 |     }) %>% 
40 |     bind_rows() %>% 
41 |     # average across refsets
42 |     group_by(from, to) %>% 
43 |     mutate(value = mean(value)) %>% 
44 |     ungroup() %>% 
45 |     distinct(from, to, .keep_all = TRUE) %>% 
46 |     select(-refset)
47 | 
48 | # do hierarchical clustering for each type
49 | mat <- df %>% 
50 |     split(.$type) %>% 
51 |     lapply(\(df) df %>% 
52 |         pivot_wider(
53 |             names_from = to,
54 |             values_from = value) %>%
55 |             select_if(is.numeric))
56 | 
57 | o <- unlist(lapply(mat, \(.) colnames(.)[hclust(dist(.))$order]))
58 | df <- mutate(df, across(c(from, to), factor, o, .metrics_lab[o]))
59 | 
60 | plt <- ggplot(df, aes(from, to, fill = value)) +
61 |     geom_tile() +
62 |     scale_fill_distiller("r",
63 |         palette = "RdYlBu",
64 |         limits = c(-0.5, 1),
65 |         breaks = c(0, 1)) +
66 |     coord_equal(expand = FALSE) +
67 |     scale_x_discrete(limits = rev(levels(df$to))) 
68 | 
69 | thm <- theme(
70 |     axis.title = element_blank(),
71 |     axis.text.x = element_text(angle = 45, hjust = 1))
72 | 
73 | fig <- .prettify(plt, thm)
74 | 
75 | saveRDS(fig, args$rds)
76 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm")
77 | 


--------------------------------------------------------------------------------
/code/07-plot_runtimes.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "n")
 2 | # args <- list(
 3 | #     rds = sprintf("plts/rts_%s.rds", wcs$reftyp),
 4 | #     pdf = sprintf("plts/rts_%s.pdf", wcs$reftyp),
 5 | #     fun = "code/utils-plotting.R",
 6 | #     res = list.files("outs", sprintf("^rts_%s-", wcs$reftyp), full.names = TRUE))
 7 | 
 8 | source(args$fun)
 9 | res <- .read_res(args$res)
10 | 
11 | df <- res %>% 
12 |     # drop SCRIP estimation for 'reftyp' other than 'n'
13 |     # as there's none, it's just setting up parameters
14 |     { if (wcs$reftyp != "n")
15 |         mutate(., est = case_when(
16 |             method == "SCRIP" ~ NA_real_,
17 |             TRUE ~ est)) else . } %>%
18 |     # sum up estimation & simulation timings
19 |     rowwise() %>% 
20 |     mutate(tot = {
21 |         if (is.finite(est)) {
22 |             if (is.finite(sim)) {
23 |                 est + sim # both available
24 |             } else est # only estimation
25 |         } else if (is.finite(sim)) {
26 |             sim # only simulation
27 |         } else NA # neither available
28 |     }) %>% 
29 |     ungroup() %>% 
30 |     pivot_longer(
31 |         values_to = "n",
32 |         names_to = "dim",
33 |         cols = c("ngs", "ncs")) %>%
34 |     pivot_longer(
35 |         values_to = "t",
36 |         names_to = "step",
37 |         cols = c("est", "sim", "tot")) %>%
38 |     mutate(
39 |         dim = factor(dim,
40 |             levels = c("ngs", "ncs"),
41 |             labels = c("# genes", "# cells")),
42 |         step = factor(step,
43 |             levels = c("est", "sim", "tot"),
44 |             labels = c("estimation", "simulation", "overall"))) %>%
45 |     filter(!is.na(n), !is.na(t), is.finite(t)) %>%
46 |     group_by(method, step, dim, n) %>%
47 |     summarise_at("t", mean)
48 | 
49 | # order methods by overall average across subsets
50 | ms <- df %>% 
51 |     filter(step == "overall") %>% 
52 |     group_by(method) %>% 
53 |     summarise_at("t", mean) %>% 
54 |     arrange(t) %>% pull("method")
55 | df$method <- factor(df$method, levels = ms)
56 | 
57 | pal <- .methods_pal[levels(df$method)]
58 | 
59 | lab <- parse(text = paste(
60 |     sep = "~",
61 |     sprintf("bold(%s)", LETTERS), 
62 |     gsub("\\s", "~", names(pal))))
63 | 
64 | anno <- df %>% 
65 |     group_by(step, dim) %>% 
66 |     mutate(letter = LETTERS[match(method, levels(method))])
67 | 
68 | plt <- ggplot(df, aes(factor(n), t, 
69 |     col = method, fill = method)) +
70 |     facet_grid(step ~ dim, scales = "free") +
71 |     geom_bar(
72 |         width = 0.9,
73 |         stat = "identity",
74 |         position = "dodge") +
75 |     geom_text(data = anno, 
76 |         aes(label = letter, y = 0),
77 |         size = 1.5, vjust = 2, color = "black",
78 |         position = position_dodge(0.9)) + 
79 |     scale_fill_manual(values = pal, labels = lab) +
80 |     scale_color_manual(values = pal, labels = lab) +
81 |     xlab(NULL) + scale_y_sqrt("runtime (s)", 
82 |         expand = expansion(mult = c(0.2, 0.1)))
83 | 
84 | thm <- theme(
85 |     axis.ticks.x = element_blank(),
86 |     legend.title = element_blank(),
87 |     legend.text = element_text(hjust = 0),
88 |     panel.grid.major.y = element_line(color = "grey"))
89 | 
90 | fig <- .prettify(plt, thm)
91 | 
92 | saveRDS(fig, args$rds)
93 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm")
94 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d-scatters.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #   fun = "code/utils.R",
 3 | #   res = "outs/obj-stat_1d.rds",
 4 | #   rds = "plts/stat_1d-scatters.rds",
 5 | #   pdf = "plts/stat_1d-scatters.pdf")
 6 | 
 7 | source(args$fun)
 8 | 
 9 | res <- readRDS(args$res)
10 |   
11 | df <- res %>% 
12 |   mutate(refset = paste(datset, subset, sep = ",")) %>%
13 |   select(-c(datset, subset)) %>% 
14 |   # for type != n, keep batch-/cluster-level comparisons only
15 |   group_by(stat1d, refset, method, metric) %>% 
16 |   mutate(n = n()) %>% 
17 |   filter(group != "global" | n == 1) %>%  
18 |   select(-c(group, n)) %>% 
19 |   # for each statistic & metric, re-scale b/w 0 & 1 
20 |   group_by(stat1d, metric) %>% 
21 |   mutate(stat = stat/max(stat, na.rm = TRUE)) %>% 
22 |   pivot_wider(names_from = stat1d, values_from = stat) %>% 
23 |   filter(!is.na(ks), !is.na(ws))
24 | 
25 | plt <- ggplot(df, aes(ks, ws, col = refset, fill = refset)) +
26 |   facet_wrap(~ metric, nrow = 2) +
27 |   stat_cor(
28 |     aes(group = metric), method = "spearman",
29 |     label.x = 0, label.y = 1, vjust = 1, size = 1.5) +
30 |   geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) +
31 |   scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
32 |   scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
33 |   labs(x = "KS statistics", y = "scaled Wasserstein metric")
34 | 
35 | thm <- theme(aspect.ratio = 1)
36 | 
37 | fig <- .prettify(plt, thm)
38 | 
39 | saveRDS(fig, args$rds)
40 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm")
41 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-boxplot.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "n", stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d),
 6 | #     pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | df <- res %>% 
12 |     # keep data of interest
13 |     filter(
14 |         reftyp == wcs$reftyp, 
15 |         stat1d == wcs$stat1d) %>%
16 |     .filter_res() %>% 
17 |     # scale values b/w 0 and 1 for visualization
18 |     group_by(metric) %>% 
19 |     mutate(stat = stat/max(stat, na.rm = TRUE))
20 | 
21 | pal <- .methods_pal[levels(df$method)]
22 | lab <- parse(text = paste(
23 |     sep = "~",
24 |     sprintf("bold(%s)", LETTERS), 
25 |     gsub("\\s", "~", names(pal))))
26 | 
27 | anno <- df %>% 
28 |     group_by(method, metric) %>% 
29 |     summarize_at("stat", median) %>% 
30 |     mutate(letter = LETTERS[match(method, levels(method))])
31 | 
32 | plt <- ggplot(df, aes(
33 |     reorder_within(method, stat, metric, median), 
34 |     stat, col = method, fill = method)) +
35 |     facet_wrap(~ metric, nrow = 3, scales = "free_x") +
36 |     geom_boxplot(
37 |         outlier.size = 0.25, outlier.alpha = 1,
38 |         size = 0.25, alpha = 0.25, key_glyph = "point") + 
39 |     geom_text(data = anno, 
40 |         size = 1.5, color = "black", 
41 |         aes(label = letter, y = -0.075)) + 
42 |     scale_fill_manual(values = pal, labels = lab) +
43 |     scale_color_manual(values = pal, labels = lab) +
44 |     scale_x_reordered(NULL) +
45 |     scale_y_continuous(
46 |         paste(ifelse(wcs$stat1d == "ws", "scaled", ""),
47 |             .stats1d_lab[wcs$stat1d]),
48 |         limits = c(-0.1, 1), n.breaks = 3)
49 | 
50 | thm <- theme(
51 |     legend.text.align = 0,
52 |     legend.title = element_blank(),
53 |     axis.text.x = element_blank(),
54 |     axis.title.x = element_blank(),
55 |     axis.ticks.x = element_blank())
56 | 
57 | fig <- .prettify(plt, thm)
58 | 
59 | saveRDS(fig, args$rds)
60 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-dimEst.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "n", stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.rds", wcs$reftyp, wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
 7 | 
 8 | suppressPackageStartupMessages({
 9 |     library(intrinsicDimension)
10 | })
11 | 
12 | source(args$fun)
13 | res <- readRDS(args$res)
14 | 
15 | df <- res %>% 
16 |     # keep data of interest
17 |     filter(
18 |         reftyp == wcs$reftyp, 
19 |         stat1d == wcs$stat1d) %>%
20 |     .filter_res() %>% 
21 |     # average across groups
22 |     group_by(refset, method, metric, group) %>% .avg(n = 1) %>%
23 |     # estimate dimensionality for each refset
24 |     pivot_wider(
25 |         names_from = metric, 
26 |         values_from = stat) %>% 
27 |     split(.$refset) %>% 
28 |     lapply(\(df) {
29 |         if (nrow(df) < 3) return(NULL)
30 |         ks <- seq(2, nrow(df)-1)
31 |         mat <- as.matrix(select(df, any_of(.metrics_lab)))
32 |         mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))]
33 |         mat[is.na(mat)] <- 1
34 |         est <- sapply(ks, maxLikGlobalDimEst, data = mat)
35 |         data.frame(dim = unlist(est), k = ks)
36 |     }) %>% 
37 |     bind_rows(.id = "refset")
38 | 
39 | plt <- ggplot(df, aes(k, dim, col = refset, fill = refset)) +
40 |     geom_point(shape = 21, size = 0.5) + 
41 |     geom_line(alpha = 0.5, show.legend = FALSE) +
42 |     scale_x_continuous(
43 |         "number of nearest neighbors (k)",
44 |         limits = range(df$k),
45 |         breaks = seq(2, 20, 2)) +
46 |     scale_y_continuous(
47 |         "estimated dimensionality",
48 |         limits = range(df$dim),
49 |         breaks = seq(0, 10, 2)) +
50 |     ggtitle(paste("reftyp:", wcs$reftyp))
51 | 
52 | thm <- theme(
53 |     legend.title = element_blank(),
54 |     panel.grid.major = element_line(color = "grey"))
55 | 
56 | fig <- .prettify(plt, thm)
57 | 
58 | saveRDS(fig, args$rds)
59 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
60 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-heatmap.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "k", stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d),
 6 | #     pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | df <- res %>%
12 |     # keep data of interest
13 |     filter(
14 |         reftyp == wcs$reftyp, 
15 |         stat1d == wcs$stat1d) %>%
16 |     .filter_res() %>% 
17 |     # for each method & summary, 
18 |     # average across groups & refsets
19 |     group_by(metric, method, refset, group) %>% .avg(n = 3) %>% 
20 |     complete(method, metric, fill = list(stat = NA))
21 | 
22 | # order methods by average across metrics
23 | ox <- df %>% 
24 |     group_by(method) %>% 
25 |     summarise_at("stat", mean, na.rm = TRUE) %>% 
26 |     arrange(desc(stat)) %>% 
27 |     pull("method")
28 | 
29 | # order metrics by average across methods
30 | oy <- df %>% 
31 |     group_by(metric) %>% 
32 |     summarise_at("stat", mean, na.rm = TRUE) %>% 
33 |     arrange(desc(stat)) %>% 
34 |     pull("metric")
35 | 
36 | plt <- ggplot(df, 
37 |     aes(method, metric, fill = stat)) +
38 |     geom_tile(col = "white") + 
39 |     scale_fill_distiller(
40 |         .stats1d_lab[wcs$stat1d],
41 |         palette = "RdYlBu",
42 |         na.value = "grey",
43 |         limits = c(0, 1),
44 |         breaks = c(0, 1)) +
45 |     coord_equal(expand = FALSE) +
46 |     scale_x_discrete(limits = rev(ox)) + 
47 |     scale_y_discrete(limits = rev(oy))
48 | 
49 | thm <- theme(
50 |     axis.ticks = element_blank(),
51 |     axis.title = element_blank(),
52 |     panel.border = element_blank(),
53 |     axis.text.x = element_text(angle = 45, hjust = 1))
54 | 
55 | fig <- .prettify(plt, thm)
56 | 
57 | saveRDS(fig, args$rds)
58 | ggsave(args$pdf, fig, width = 8, height = 6, units = "cm")


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-mds.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(reftyp = "b", stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.rds", wcs$reftyp, wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | df <- res %>%
12 |     # keep data of interest
13 |     filter(
14 |         reftyp == wcs$reftyp,
15 |         stat1d == wcs$stat1d) %>%
16 |     .filter_res() %>% 
17 |     # average across groups
18 |     group_by(refset, reftyp, method, metric, group) %>%
19 |     .avg(n = 2)
20 | 
21 | mds <- df %>% 
22 |     pivot_wider(
23 |         id_cols = c(refset, method),
24 |         names_from = metric, 
25 |         values_from = stat) %>% 
26 |     select(any_of(.metrics_lab)) %>%
27 |     as.matrix() %>% t() %>% 
28 |     dist() %>% cmdscale()
29 | 
30 | gg <- mds %>% 
31 |     data.frame(rownames(.)) %>% 
32 |     set_colnames(c("x", "y", "metric")) %>% 
33 |     mutate(
34 |         type = case_when(
35 |             metric %in% .none_metrics ~ "global",
36 |             grepl("gene", metric) ~ "gene",
37 |             grepl("cell", metric) ~ "cell"),
38 |         type = factor(type, c("gene", "cell", "global")),
39 |         metric = .metrics_lab[metric])
40 |             
41 | plt <- ggplot(gg, aes(x, y, 
42 |     col = type, fill = type, label = metric)) +
43 |     geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) +
44 |     geom_text_repel(size = 2, show.legend = FALSE) +
45 |     scale_fill_manual(values = c("red", "blue", "green3")) +
46 |     scale_color_manual(values = c("red", "blue", "green3")) +
47 |     scale_x_continuous(expand = expansion(mult = 0.1)) +
48 |     scale_y_continuous(expand = expansion(mult = 0.1)) +
49 |     labs(x = "MDS dim. 1", y = "MDS dim. 2") +
50 |     coord_fixed()
51 | 
52 | thm <- theme(
53 |     legend.position = "bottom",
54 |     legend.title = element_blank(),
55 |     panel.grid.major = element_line(color = "grey"))
56 | 
57 | fig <- .prettify(plt, thm)
58 | 
59 | saveRDS(fig, args$rds)
60 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm")
61 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-pca.R:
--------------------------------------------------------------------------------
  1 | # wcs <- list(reftyp = "b", stat1d = "ks")
  2 | # args <- list(
  3 | #     fun = "code/utils-plotting.R",
  4 | #     res = "outs/obj-stat_1d.rds",
  5 | #     rds = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.rds", wcs$reftyp, wcs$stat1d),
  6 | #     pdf = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
  7 | 
  8 | source(args$fun)
  9 | res <- readRDS(args$res)
 10 | 
 11 | df <- res %>%
 12 |     # keep data of interest
 13 |     filter(
 14 |         reftyp == wcs$reftyp,
 15 |         stat1d == wcs$stat1d) %>%
 16 |     .filter_res() %>% 
 17 |     # average across groups
 18 |     group_by(refset, method, metric, group) %>%
 19 |     .avg(n = 2) %>%
 20 |     pivot_wider(
 21 |         names_from = metric, 
 22 |         values_from = stat) %>% 
 23 |     drop_na(any_of(.metrics_lab))
 24 | 
 25 | # do PCA
 26 | pc <- df %>% 
 27 |     select(any_of(.metrics_lab)) %>%
 28 |     as.matrix() %>% prcomp()
 29 | 
 30 | # get coordinates
 31 | xy <- pc$x %>% 
 32 |     data.frame() %>%
 33 |     # add metadata
 34 |     mutate(
 35 |         select(df, !any_of(.metrics_lab)),
 36 |         method = droplevels(method),
 37 |         reftyp = res$reftyp[match(refset, res$refset)]) %>% 
 38 |     mutate_at("reftyp", factor, c("n", "b", "k"))
 39 | 
 40 | # get loadings
 41 | rot <- pc$rotation %>% 
 42 |     data.frame() %>% 
 43 |     mutate(
 44 |         metric = .metrics_lab[rownames(.)],
 45 |         metric = factor(metric, metric))
 46 | ij <- c("PC1", "PC2")
 47 | m1 <- max(abs(xy[, ij]))
 48 | m2 <- max(abs(rot[, ij]))
 49 | rot[, ij] <- 0.5*rot[, ij]*m1/m2
 50 | 
 51 | # get percentages
 52 | var <- prop.table(pc$sdev^2)
 53 | var_lab <- round(100*var, 1)
 54 | 
 55 | # plotting
 56 | lab <- xy %>%
 57 |     group_by(method) %>%
 58 |     select(starts_with("PC")) %>%
 59 |     summarise_all(mean, .groups = "drop")
 60 | 
 61 | p0 <- ggplot(xy, aes(PC1, PC2)) +
 62 |     geom_vline(xintercept = 0, size = 0.1) +
 63 |     geom_hline(yintercept = 0, size = 0.1) +
 64 |     coord_fixed() + labs(
 65 |         x = sprintf("PC1 (%s%%)", var_lab[1]),
 66 |         y = sprintf("PC2 (%s%%)", var_lab[2])) +
 67 |     scale_x_continuous(
 68 |         limits = range(xy$PC1), 
 69 |         expand = expansion(mult = 0.2)) +
 70 |     scale_y_continuous(
 71 |         limits = range(xy$PC2),
 72 |         expand = expansion(mult = 0.2))
 73 | 
 74 | p1 <- p0 +
 75 |     geom_point(
 76 |         aes(fill = method), shape = 21, 
 77 |         size = 2, stroke = 0, alpha = 0.4) +
 78 |     geom_point(data = lab,
 79 |         aes(fill = method), shape = 21, 
 80 |         size = 3, stroke = 0, alpha = 0.8) +
 81 |     geom_label_repel(data = lab, 
 82 |         aes(label = method, col = method),
 83 |         label.padding = unit(0.75, "mm"),
 84 |         size = 2, fontface = "bold", show.legend = FALSE) +
 85 |     scale_fill_manual(values = .methods_pal[levels(xy$method)]) +
 86 |     scale_color_manual(values = .methods_pal[levels(xy$method)]) 
 87 |     
 88 | p2 <- p0 +
 89 |     geom_segment(data = rot, 
 90 |         aes(0, 0, xend = PC1, yend = PC2, col = metric),
 91 |         size = 0.5, arrow = arrow(length = unit(1, "mm"))) +
 92 |     geom_label_repel(data = rot,
 93 |         aes(label = metric, col = metric), 
 94 |         label.padding = unit(0.75, "mm"),
 95 |         size = 2, fontface = "bold", show.legend = FALSE) +
 96 |     scale_color_manual(values = .metrics_pal[levels(rot$metric)]) 
 97 | 
 98 | thm <- theme(
 99 |     legend.justification = c(0, 0.5),
100 |     panel.grid.major = element_line(size = 0.1, color = "grey"))
101 | 
102 | f1 <- .prettify(p1, thm)
103 | f2 <- .prettify(p2, thm)
104 | f2$guides$colour$override.aes$size <- 0.5
105 | 
106 | fig <- f1 / f2 +
107 |     plot_annotation(tag_levels = "a") &
108 |     theme(plot.tag = element_text(size = 9, face = "bold"))
109 | 
110 | saveRDS(fig, args$rds)
111 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm")
112 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-boxplot_by_method.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.pdf", wcs$stat2d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | df <- res %>% 
12 |     # keep data of interest
13 |     filter(stat1d == wcs$stat1d) %>% 
14 |     .filter_res() %>% 
15 |     # average across groups
16 |     group_by(refset, method, metric) %>% 
17 |     .avg(n = 1)
18 | 
19 | # average statistics across refsets & summaries 
20 | # statistic across all summaries
21 | o <- df %>% 
22 |     group_by(method, refset, metric) %>%
23 |     .avg(n = 3) %>% 
24 |     # order methods (panels) by average 
25 |     arrange(stat) %>% 
26 |     pull("method")
27 | df <- df %>% mutate_at("method", factor, o)
28 | 
29 | pal <- .metrics_pal[levels(df$metric)]
30 | lab <- parse(text = paste(
31 |     sep = "~",
32 |     sprintf("bold(%s)", LETTERS), 
33 |     gsub("\\s", "~", names(pal))))
34 | 
35 | anno <- df %>% 
36 |     group_by(method, metric) %>% 
37 |     summarize_at("stat", median) %>% 
38 |     mutate(letter = LETTERS[match(metric, levels(metric))])
39 | 
40 | plt <- ggplot(df, aes(
41 |     reorder_within(metric, stat, method, median), 
42 |     stat, col = metric, fill = metric)) +
43 |     facet_wrap(~ method, nrow = 4, scales = "free_x") +
44 |     geom_boxplot(
45 |         size = 0.25, outlier.size = 0.25,
46 |         alpha = 0.25, key_glyph = "point") +
47 |     geom_text(data = anno, 
48 |         size = 1.5, color = "black", 
49 |         aes(label = letter, y = -0.075)) + 
50 |     scale_fill_manual(values = pal, labels = lab) +
51 |     scale_color_manual(values = pal, labels = lab) +
52 |     scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) +
53 |     labs(x = NULL, y = .stats1d_lab[wcs$stat1d])
54 | 
55 | thm <- theme(
56 |     legend.text.align = 0,
57 |     axis.text.x = element_blank(),
58 |     axis.ticks.x = element_blank())
59 | 
60 | fig <- .prettify(plt, thm)
61 | 
62 | saveRDS(fig, args$rds)
63 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
64 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-boxplot_by_metric.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     rds = "plts/stat_1d_by_reftyp-boxplot.rds",
 5 | #     pdf = "plts/stat_1d_by_reftyp-boxplot.pdf",
 6 | #     res = list.files("outs", paste0("^stat_1d.*", wcs$stat1d, "\\."), full.names = TRUE))
 7 | # args$res <- sample(args$res, 100)
 8 | 
 9 | source(args$fun)
10 | res <- readRDS(args$res)
11 | 
12 | df <- res %>%
13 |     # keep data of interest
14 |     filter(stat1d == wcs$stat1d) %>% 
15 |     .filter_res() %>% 
16 |     # average across groups
17 |     group_by(refset, method, metric, group) %>% .avg(n = 1)
18 | 
19 | pal <- .methods_pal[levels(df$method)]
20 | lab <- parse(text = paste(
21 |     sep = "~",
22 |     sprintf("bold(%s)", LETTERS), 
23 |     gsub("\\s", "~", names(pal))))
24 | 
25 | anno <- df %>% 
26 |     group_by(method, metric) %>% 
27 |     summarize_at("stat", median) %>% 
28 |     mutate(letter = LETTERS[match(method, levels(method))])
29 | 
30 | plt <- ggplot(df, aes(
31 |     reorder_within(method, stat, metric, median), 
32 |     stat, col = method, fill = method)) +
33 |     facet_wrap(~ metric, nrow = 3, scales = "free_x") +
34 |     geom_boxplot(
35 |         outlier.size = 0.25, outlier.alpha = 1,
36 |         size = 0.25, alpha = 0.25, key_glyph = "point") + 
37 |     geom_text(data = anno, 
38 |         size = 1.5, color = "black", 
39 |         aes(label = letter, y = -0.075)) + 
40 |     scale_fill_manual(values = pal, labels = lab) +
41 |     scale_color_manual(values = pal, labels = lab) +
42 |     scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) +
43 |     labs(x = NULL, y = .stats1d_lab[wcs$stat1d])
44 | 
45 | thm <- theme(
46 |     legend.text.align = 0,
47 |     axis.text.x = element_blank(),
48 |     axis.ticks.x = element_blank())
49 | 
50 | fig <- .prettify(plt, thm)
51 | 
52 | saveRDS(fig, args$rds)
53 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
54 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-correlations,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-correlations,%s.pdf", wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | mat <- res %>% 
12 |     # keep data of interest
13 |     filter(stat1d == wcs$stat1d) %>% 
14 |     .filter_res() %>% 
15 |     # average across groups & subsets
16 |     group_by(method, metric, refset, group) %>% 
17 |     .avg(n = 3) %>% 
18 |     # correlate b/w summaries, across methods & refsets
19 |     pivot_wider(
20 |         names_from = metric, 
21 |         values_from = stat) %>% 
22 |     select(any_of(.metrics_lab)) %>% 
23 |     cor(method = "spearman", 
24 |         use = "pairwise.complete.obs")
25 | 
26 | foo <- mat
27 | foo[is.na(foo)] <- 1
28 | xo <- rownames(mat)[hclust(dist(foo))$order]
29 | yo <- rownames(mat)[hclust(dist(t(foo)))$order]
30 | 
31 | df <- mat %>% 
32 |     data.frame(from = rownames(.)) %>% 
33 |     pivot_longer(-from, names_to = "to") %>% 
34 |     complete(from, to, fill = list(value = NA))
35 |     
36 | plt <- ggplot(df, aes(from, to, fill = value)) +
37 |     geom_tile() +
38 |     scale_fill_distiller(
39 |         bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
40 |         palette = "RdYlBu",
41 |         limits = c(-1, 1),
42 |         n.breaks = 3) +
43 |     coord_equal(expand = FALSE) +
44 |     scale_x_discrete(limits = rev(xo), labels = .metrics_lab) +
45 |     scale_y_discrete(limits = yo, labels = .metrics_lab)
46 | 
47 | thm <- theme(
48 |     axis.title = element_blank(),
49 |     axis.text.x = element_text(angle = 45, hjust = 1))
50 | 
51 | fig <- .prettify(plt, thm)
52 | 
53 | saveRDS(fig, args$rds)
54 | ggsave(args$pdf, fig, width = 9, height = 7.5, units = "cm")
55 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations_by_method.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.pdf", wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | mat <- res %>% 
12 |     # keep data of interest
13 |     filter(stat1d == wcs$stat1d) %>% 
14 |     .filter_res() %>% 
15 |     # for each method, method & summary, average across groups
16 |     group_by(refset, method, metric, group) %>% .avg(n = 1) %>% 
17 |     # for each method, correlate b/w summaries & across refsets
18 |     split(.$method) %>% 
19 |     lapply(\(.)
20 |         pivot_wider(.,
21 |             names_from = metric, 
22 |             values_from = stat) %>% 
23 |         select(any_of(.metrics_lab)) %>% 
24 |         cor(method = "spearman", 
25 |             use = "pairwise.complete.obs"))
26 | 
27 | df <- lapply(mat, \(u) u %>% 
28 |     data.frame(from = rownames(.)) %>% 
29 |     pivot_longer(-from, names_to = "to")) %>% 
30 |     bind_rows(.id = "method") %>% 
31 |     complete(from, to, method, fill = list(value = NA))
32 |     
33 | foo <- df %>% 
34 |     group_by(from, to) %>% 
35 |     summarise(value = mean(value), .groups = "drop") %>% 
36 |     pivot_wider(names_from = to, values_from = value) %>% 
37 |     select(-from)
38 | foo[is.na(foo)] <- 1
39 | o <- colnames(foo)[hclust(dist(as.matrix(foo)))$order]
40 | 
41 | plt <- ggplot(df, aes(from, to, fill = value)) +
42 |     facet_wrap(~ method, nrow = 4) +
43 |     scale_fill_distiller(
44 |         bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
45 |         palette = "RdYlBu", 
46 |         na.value = "grey",
47 |         limits = c(-1, 1), 
48 |         n.breaks = 3) +
49 |     scale_x_discrete(limits = rev(o), labels = .metrics_lab) +
50 |     scale_y_discrete(limits = o, labels = .metrics_lab) +
51 |     coord_equal(expand = FALSE) +
52 |     geom_tile()
53 | 
54 | thm <- theme(
55 |     axis.title = element_blank(),
56 |     axis.text.x = element_text(angle = 45, hjust = 1))
57 | 
58 | fig <- .prettify(plt, thm)
59 | 
60 | saveRDS(fig, args$rds)
61 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
62 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations_by_metric.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.pdf", wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | mat <- res %>% 
12 |     # keep method of interest
13 |     filter(stat1d == wcs$stat1d) %>% 
14 |     .filter_res() %>% 
15 |     # for each method, method & summary, average across groups & refsets
16 |     group_by(refset, method, metric, group) %>% .avg(n = 1) %>% 
17 |     # for each method, correlate b/w summaries & across refsets
18 |     split(.$metric) %>% 
19 |     lapply(\(.)
20 |         pivot_wider(.,
21 |             names_from = method, 
22 |             values_from = stat) %>% 
23 |         select(any_of(names(.methods_pal))) %>% 
24 |         cor(method = "spearman", 
25 |             use = "pairwise.complete.obs"))
26 | 
27 | df <- lapply(mat, \(u) u %>% 
28 |     data.frame(from = rownames(.)) %>% 
29 |     pivot_longer(-from, names_to = "to")) %>% 
30 |     bind_rows(.id = "metric") %>% 
31 |     complete(from, to, metric, fill = list(value = NA))
32 |     
33 | foo <- df %>% 
34 |     group_by(from, to) %>% 
35 |     summarise(value = mean(value), .groups = "drop") %>% 
36 |     pivot_wider(names_from = to, values_from = value) %>% 
37 |     select(-from) %>% 
38 |     as.matrix()
39 | foo[is.na(foo)] <- 1
40 | o <- colnames(foo)[hclust(dist(foo))$order]
41 | 
42 | plt <- ggplot(df, aes(from, to, fill = value)) +
43 |     facet_wrap(~ metric, nrow = 2) +
44 |     scale_fill_distiller(
45 |         bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
46 |         palette = "RdYlBu", 
47 |         na.value = "grey",
48 |         limits = c(-1, 1), 
49 |         n.breaks = 3) +
50 |     scale_x_discrete(limits = rev(o), labels = names(.methods_pal)) +
51 |     scale_y_discrete(limits = o, labels = names(.methods_pal)) +
52 |     coord_equal(expand = FALSE) +
53 |     geom_tile()
54 | 
55 | thm <- theme(
56 |     axis.title = element_blank(),
57 |     axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
58 | 
59 | fig <- .prettify(plt, thm)
60 | 
61 | saveRDS(fig, args$rds)
62 | ggsave(args$pdf, fig, width = 24, height = 8.5, units = "cm")
63 | 
64 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-dimEst.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.pdf", wcs$stat1d))
 7 | 
 8 | suppressPackageStartupMessages({
 9 |     library(intrinsicDimension)
10 | })
11 | 
12 | source(args$fun)
13 | res <- readRDS(args$res)
14 | 
15 | df <- res %>% 
16 |     # keep data of interest
17 |     filter(stat1d == wcs$stat1d) %>%
18 |     .filter_res() %>% 
19 |     # average across groups
20 |     group_by(refset, reftyp, method, metric, group) %>% .avg(n = 1) %>% 
21 |     # estimate dimensionality for each refset
22 |     pivot_wider(
23 |         names_from = metric, 
24 |         values_from = stat) %>% 
25 |     split(.$refset) %>% 
26 |     lapply(\(df) {
27 |         if (nrow(df) < 3) return(NULL)
28 |         ks <- seq(2, nrow(df)-1)
29 |         mat <- as.matrix(select(df, any_of(.metrics_lab)))
30 |         mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))]
31 |         mat[is.na(mat)] <- 1
32 |         est <- sapply(ks, maxLikGlobalDimEst, data = mat)
33 |         data.frame(dim = unlist(est), k = ks)
34 |     }) %>% 
35 |     bind_rows(.id = "refset") %>% 
36 |     mutate(type = res$reftyp[match(refset, res$refset)])
37 | 
38 | plt <- ggplot(df, aes(factor(k), dim, col = type, fill = type)) +
39 |     geom_boxplot(
40 |         size = 0.25, outlier.size = 0.25,
41 |         alpha = 0.25, key_glyph = "point") +
42 |     scale_x_discrete(
43 |         "number of nearest neighbors (k)",
44 |         breaks = seq(0, 20, 2)) +
45 |     scale_y_continuous(
46 |         "estimated dimensionality",
47 |         limits = range(df$dim),
48 |         breaks = seq(0, 10, 2))
49 | 
50 | thm <- theme(panel.grid.major = element_line(color = "grey"))
51 | 
52 | fig <- .prettify(plt, thm)
53 | 
54 | saveRDS(fig, args$rds)
55 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm")
56 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-mds.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat1d = "ks")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_1d.rds",
 5 | #     rds = sprintf("plts/stat_1d_by_stat1d-mds,%s.rds", wcs$stat1d),
 6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-mds,%s.pdf", wcs$stat1d))
 7 | 
 8 | source(args$fun)
 9 | res <- readRDS(args$res)
10 | 
11 | df <- res %>%
12 |     # keep data of interest
13 |     filter(stat1d == wcs$stat1d) %>%
14 |     .filter_res() %>% 
15 |     # average across groups & refsets
16 |     group_by(method, metric, refset, group) %>%
17 |     .avg(n = 3) %>%
18 |     pivot_wider(
19 |         names_from = metric, 
20 |         values_from = stat) %>% 
21 |     rowwise() %>% 
22 |     drop_na(any_of(.metrics_lab)) %>% 
23 |     select(any_of(.metrics_lab)) %>% 
24 |     as.matrix() %>% 
25 |     t() %>% 
26 |     dist() %>% 
27 |     cmdscale() %>% 
28 |     data.frame(rownames(.)) %>% 
29 |     set_colnames(c("x", "y", "metric")) %>% 
30 |     mutate(
31 |         type = case_when(
32 |             metric %in% .none_metrics ~ "global",
33 |             grepl("gene", metric) ~ "gene",
34 |             grepl("cell", metric) ~ "cell"),
35 |         metric = .metrics_lab[metric])
36 |             
37 | plt <- ggplot(df, aes(x, y, 
38 |     col = type, fill = type, label = metric)) +
39 |     geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) +
40 |     geom_text_repel(size = 2, show.legend = FALSE) +
41 |     scale_fill_manual(values = c("red", "blue", "green3")) +
42 |     scale_color_manual(values = c("red", "blue", "green3")) +
43 |     scale_x_continuous(expand = expansion(mult = 0.1)) +
44 |     scale_y_continuous(expand = expansion(mult = 0.1)) +
45 |     labs(x = "MDS dim. 1", y = "MDS dim. 2") +
46 |     coord_fixed()
47 | 
48 | thm <- theme(
49 |     legend.position = "bottom",
50 |     legend.title = element_blank(),
51 |     panel.grid.major = element_line(color = "grey"))
52 | 
53 | fig <- .prettify(plt, thm)
54 | 
55 | saveRDS(fig, args$rds)
56 | ggsave(args$pdf, fig, width = 12, height = 8, units = "cm")
57 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-pca.R:
--------------------------------------------------------------------------------
  1 | # wcs <- list(stat1d = "ks")
  2 | # args <- list(
  3 | #     fun = "code/utils-plotting.R",
  4 | #     res = "outs/obj-stat_1d.rds",
  5 | #     rds = sprintf("plts/stat_1d_by_stat1d-pca,%s.rds", wcs$stat1d),
  6 | #     pdf = sprintf("plts/stat_1d_by_stat1d-pca,%s.pdf", wcs$stat1d))
  7 | 
  8 | source(args$fun)
  9 | res <- readRDS(args$res)
 10 | 
 11 | df <- res %>%
 12 |     # keep data of interest
 13 |     filter(stat1d == wcs$stat1d) %>%
 14 |     .filter_res() %>% 
 15 |     # average across groups & refsets
 16 |     group_by(method, metric, refset, group) %>%
 17 |     .avg(n = 3) %>%
 18 |     pivot_wider(
 19 |         names_from = metric, 
 20 |         values_from = stat) %>% 
 21 |     rowwise() %>% 
 22 |     drop_na(any_of(.metrics_lab))
 23 | 
 24 | # do PCA
 25 | pc <- df %>% 
 26 |     select(any_of(.metrics_lab)) %>%
 27 |     as.matrix() %>% prcomp()
 28 | 
 29 | # get coordinates
 30 | xy <- pc$x %>% 
 31 |     data.frame() %>%
 32 |     # add metadata
 33 |     mutate(
 34 |         select(df, !any_of(.metrics_lab)),
 35 |         method = droplevels(method))
 36 | 
 37 | # get loadings
 38 | rot <- pc$rotation %>% 
 39 |     data.frame() %>% 
 40 |     mutate(
 41 |         metric = .metrics_lab[rownames(.)],
 42 |         metric = factor(metric, metric))
 43 | ij <- c("PC1", "PC2")
 44 | m1 <- max(abs(xy[, ij]))
 45 | m2 <- max(abs(rot[, ij]))
 46 | rot[, ij] <- 0.5*rot[, ij]*m1/m2
 47 | 
 48 | # get percentages
 49 | var <- prop.table(pc$sdev^2)
 50 | var_lab <- round(100*var, 1)
 51 | 
 52 | p0 <- ggplot(xy, aes(PC1, PC2)) +
 53 |     geom_vline(xintercept = 0, size = 0.1) +
 54 |     geom_hline(yintercept = 0, size = 0.1) +
 55 |     coord_fixed() + labs(
 56 |         x = sprintf("PC1 (%s%%)", var_lab[1]),
 57 |         y = sprintf("PC2 (%s%%)", var_lab[2])) +
 58 |     scale_x_continuous(
 59 |         limits = range(xy$PC1), 
 60 |         expand = expansion(mult = 0.2)) +
 61 |     scale_y_continuous(
 62 |         limits = range(xy$PC2),
 63 |         expand = expansion(mult = 0.2))
 64 | 
 65 | p1 <- p0 +
 66 |     geom_point(
 67 |         aes(fill = method), shape = 21,
 68 |         size = 3, stroke = 0, alpha = 0.8) +
 69 |     geom_label_repel(
 70 |         aes(label = method, col = method),
 71 |         label.padding = unit(0.75, "mm"),
 72 |         size = 2, fontface = "bold", show.legend = FALSE) +
 73 |     scale_fill_manual(values = .methods_pal[levels(xy$method)]) +
 74 |     scale_color_manual(values = .methods_pal[levels(xy$method)]) 
 75 |     
 76 | p2 <- p0 +
 77 |     geom_segment(data = rot, 
 78 |         aes(0, 0, xend = PC1, yend = PC2, col = metric),
 79 |         size = 0.5, arrow = arrow(length = unit(1, "mm"))) +
 80 |     geom_label_repel(data = rot,
 81 |         aes(label = metric, col = metric), 
 82 |         label.padding = unit(0.75, "mm"),
 83 |         size = 2, fontface = "bold", show.legend = FALSE) +
 84 |     scale_color_manual(values = .metrics_pal[levels(rot$metric)]) 
 85 | 
 86 | thm <- theme(
 87 |     legend.justification = c(0, 0.5),
 88 |     panel.grid.major = element_line(size = 0.1, color = "grey"))
 89 | 
 90 | f1 <- .prettify(p1, thm)
 91 | f2 <- .prettify(p2, thm)
 92 | f2$guides$colour$override.aes$size <- 0.5
 93 | 
 94 | fig <- f1 / f2 +
 95 |     plot_annotation(tag_levels = "a") &
 96 |     theme(plot.tag = element_text(size = 9, face = "bold"))
 97 | 
 98 | saveRDS(fig, args$rds)
 99 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm")
100 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_2d-scatters.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #   fun = "code/utils.R",
 3 | #   res = "outs/obj-stat_2d.rds",
 4 | #   rds = "plts/stat_2d-scatters.rds",
 5 | #   pdf = "plts/stat_2d-scatters.pdf")
 6 | 
 7 | source(args$fun)
 8 | 
 9 | res <- readRDS(args$res)
10 |   
11 | df <- res %>% 
12 |   mutate(
13 |       refset = paste(datset, subset, sep = ","),
14 |       metrics = paste(metric1, metric2, sep = "\n")) %>%
15 |   select(-c(datset, subset, metric1, metric2)) %>% 
16 |   # for type != n, keep batch-/cluster-level comparisons only
17 |   group_by(stat2d, refset, method, metrics) %>% 
18 |   mutate(n = n()) %>% 
19 |   filter(group != "global" | n == 1) %>%  
20 |   select(-c(group, n)) %>% 
21 |   # for each statistic & metric, re-scale b/w 0 & 1 
22 |   group_by(stat2d, metrics) %>% 
23 |   mutate(stat = stat/max(stat, na.rm = TRUE)) %>% 
24 |   pivot_wider(names_from = stat2d, values_from = stat) %>% 
25 |   filter(!is.na(ks2), !is.na(emd))
26 | 
27 | plt <- ggplot(df, aes(ks2, emd, col = refset, fill = refset)) +
28 |   facet_wrap(~ metrics, nrow = 1) +
29 |   stat_cor(
30 |     aes(group = metrics), method = "spearman",
31 |     label.x = 0, label.y = 1, vjust = 1, size = 1.5) +
32 |   geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) +
33 |   scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
34 |   scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
35 |   labs(x = "KS statistics", y = "scaled EMD")
36 | 
37 | thm <- theme(aspect.ratio = 1)
38 | 
39 | fig <- .prettify(plt, thm)
40 | 
41 | saveRDS(fig, args$rds)
42 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm")
43 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_2d_by_reftyp-boxplot.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat2d = "emd", reftyp = "n")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_2d.rds",
 5 | #     rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds",
 6 | #     pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf")
 7 | 
 8 | source(args$fun)
 9 | res <- .read_res(args$res)
10 |     
11 | df <- res %>% 
12 |     mutate(metrics = paste(metric1, metric2, sep = "\n")) %>% 
13 |     # keep statistic of interest
14 |     filter(stat2d == wcs$stat2d) %>% 
15 |     # keep group-level comparisons only
16 |     { if (wcs$reftyp == "n") . else
17 |         mutate(., across(
18 |             c(group, id), 
19 |             as.character)) %>% 
20 |         filter(group != id) } %>% 
21 |     # scale values b/w 0 and 1 for visualization
22 |     group_by(metrics) %>% 
23 |     mutate(stat = stat/max(stat, na.rm = TRUE))
24 | 
25 | pal <- .methods_pal[levels(df$method)]
26 | lab <- parse(text = paste(
27 |     sep = "~",
28 |     sprintf("bold(%s)", LETTERS), 
29 |     gsub("\\s", "~", names(pal))))
30 | 
31 | anno <- df %>% 
32 |     group_by(method, metrics) %>% 
33 |     summarize_at("stat", median) %>% 
34 |     mutate(letter = LETTERS[match(method, levels(method))])
35 | 
36 | plt <- ggplot(df, aes(
37 |     reorder_within(method, stat, metrics, median), 
38 |     stat, col = method, fill = method)) +
39 |     facet_wrap(~ metrics, nrow = 3, scales = "free_x") +
40 |     geom_boxplot(
41 |         outlier.size = 0.25, outlier.alpha = 1,
42 |         size = 0.25, alpha = 0.25, key_glyph = "point") + 
43 |     geom_text(data = anno, 
44 |         size = 1.5, color = "black", 
45 |         aes(label = letter, y = -0.075)) + 
46 |     scale_fill_manual(values = pal, labels = lab) +
47 |     scale_color_manual(values = pal, labels = lab) +
48 |     scale_x_reordered(NULL) +
49 |     scale_y_continuous(
50 |         paste(ifelse(wcs$stat2d == "emd", "scaled", ""),
51 |             .stats2d_lab[wcs$stat2d]),
52 |         limits = c(-0.1, 1), n.breaks = 3)
53 | 
54 | thm <- theme(
55 |     legend.text.align = 0,
56 |     legend.title = element_blank(),
57 |     axis.text.x = element_blank(),
58 |     axis.title.x = element_blank(),
59 |     axis.ticks.x = element_blank())
60 | 
61 | fig <- .prettify(plt, thm)
62 | 
63 | saveRDS(fig, args$rds)
64 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
65 | 


--------------------------------------------------------------------------------
/code/07-plot_stat_2d_by_reftyp-heatmap.R:
--------------------------------------------------------------------------------
 1 | # wcs <- list(stat2d = "emd", reftyp = "b")
 2 | # args <- list(
 3 | #     fun = "code/utils-plotting.R",
 4 | #     res = "outs/obj-stat_2d.rds",
 5 | #     rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds",
 6 | #     pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf")
 7 | 
 8 | source(args$fun)
 9 | res <- .read_res(args$res)
10 | 
11 | df <-  res %>%
12 |      mutate(metrics = paste(metric1, metric2, sep = "\n")) %>% 
13 |     # except for KNN & global summaries,
14 |     # keep group-level comparisons only
15 |     { if (wcs$reftyp == "n") . else
16 |         mutate(., across(
17 |             c(group, id), 
18 |             as.character)) %>% 
19 |         filter(
20 |             (!grepl("KNN", metrics) & group != id) |
21 |             (grepl("KNN", metrics) & group == "global") |
22 |             any(c(metric1, metric2) %in% .metrics_lab[.none_metrics])) } %>% 
23 |     # aggregation
24 |     group_by(method, metrics, group, datset, subset) %>% 
25 |     summarize_at("stat", mean, na.rm = TRUE) %>% # average across groups
26 |     summarize_at("stat", mean, na.rm = TRUE) %>% # average across subsets
27 |     summarize_at("stat", mean, na.rm = TRUE)     # average across datsets
28 | 
29 | # order methods by average across metrics
30 | ox <- df %>% 
31 |     group_by(method) %>% 
32 |     summarise_at("stat", mean) %>% 
33 |     arrange(desc(stat)) %>% 
34 |     pull("method")
35 | 
36 | # order metrics by average across methods
37 | oy <- df %>% 
38 |     group_by(metrics) %>% 
39 |     summarise_at("stat", mean) %>% 
40 |     arrange(desc(stat)) %>% 
41 |     pull("metrics")
42 | 
43 | df <- complete(df, method, metrics, fill = list(stat = NA))
44 | 
45 | plt <- ggplot(df, 
46 |     aes(method, metrics, fill = stat)) +
47 |     geom_tile(col = "white") +
48 |     scale_fill_distiller(
49 |         .stats2d_lab[wcs$stat2d],
50 |         palette = "RdYlBu",
51 |         na.value = "grey",
52 |         limits = c(0, 1), 
53 |         breaks = c(0, 1)) +
54 |     coord_equal(3/2, expand = FALSE) +
55 |     scale_x_discrete(limits = rev(ox)) +
56 |     scale_y_discrete(limits = rev(oy)) 
57 | 
58 | thm <- theme(
59 |     axis.ticks = element_blank(),
60 |     axis.title = element_blank(),
61 |     panel.border = element_blank(),
62 |     axis.text.x = element_text(angle = 45, hjust = 1))
63 | 
64 | fig <- .prettify(plt, thm)
65 | 
66 | saveRDS(fig, args$rds)
67 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm")
68 | 


--------------------------------------------------------------------------------
/code/08-fig_boxplots.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     rds = c(
 4 | #         "plts/stat_1d_by_stat1d-boxplot_by_metric,ks.rds",
 5 | #         "plts/stat_1d_by_stat1d-boxplot_by_method,ks.rds"),
 6 | #     pdf = "figs/boxplots.pdf")
 7 | 
 8 | source(args$uts)
 9 | 
10 | ps <- lapply(args$rds, readRDS)
11 | 
12 | fig <- wrap_plots(ps, ncol = 1, heights = c(3, 4)) +
13 |     plot_annotation(tag_levels = "a") &
14 |     theme(
15 |         plot.margin = margin(),
16 |         panel.spacing = unit(1, "mm"),
17 |         plot.tag = element_text(face = "bold", size = 9))
18 | 
19 | ggsave(args$pdf, fig, width = 16, height = 14, units = "cm")
20 | 


--------------------------------------------------------------------------------
/code/08-fig_clustering.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     rds = file.path("plts", c(
 3 | #         "clust-boxplot_by_method.rds",
 4 | #         "clust-boxplot_dF1.rds",
 5 | #         "clust-heatmap_by_method.rds",
 6 | #         "clust-correlations.rds")),
 7 | #     uts = "code/utils-plotting.R",
 8 | #     pdf = "figs/clustering.pdf")
 9 | 
10 | source(args$uts)
11 | 
12 | ps <- lapply(args$rds, readRDS)
13 | 
14 | # fix methods order according to ref-sim 
15 | # difference for boxplots & heatmaps
16 | o <- c("ref", ggplot_build(ps[[2]])$layout$panel_params[[1]]$x$breaks)
17 | ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o)
18 | ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o)
19 | 
20 | ps[[3]]$layers[[1]]$aes_params$colour <- NA
21 | 
22 | ps[[3]] <- ps[[3]] + theme(
23 |     axis.text.y = element_text(size = 3, angle = 60, hjust = 1),
24 |     axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5))
25 | 
26 | ps[[4]] <- ps[[4]] + theme(
27 |     legend.background = element_blank())
28 | 
29 | ps <- lapply(ps, \(.) wrap_elements(full = .))
30 | 
31 | ws <- c(3, 1)
32 | 
33 | fig <- 
34 |     ((ps[[1]] + ps[[2]]) + plot_layout(widths = ws)) / 
35 |     ((ps[[3]] + ps[[4]]) + plot_layout(widths = ws)) +
36 |     plot_annotation(tag_levels = "a") &
37 |     theme(
38 |         plot.margin = margin(),
39 |         plot.tag = element_text(face = "bold", size = 9))
40 | 
41 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
42 | 


--------------------------------------------------------------------------------
/code/08-fig_heatmaps.R:
--------------------------------------------------------------------------------
  1 | # args <- list(
  2 | #     pdf = "figs/heatmaps.pdf",
  3 | #     uts = "code/utils-plotting.R",
  4 | #     rds = list.files("plts", "stat_(1|2)d_by_reftyp-heatmap.*ks2?\\.rds", full.names = TRUE))
  5 | 
  6 | source(args$uts)
  7 | ps <- lapply(fns <- args$rds, readRDS)
  8 | 
  9 | # re-order by dimension & type
 10 | dim <- gsub(".*(1|2)d.*", "\\1", fns)
 11 | typ <- gsub(".*,(n|b|k),ks.*", "\\1", fns)
 12 | names(ps) <- paste0(typ, dim)
 13 | ps <- ps[paste0(
 14 |     c("n", "b", "k"), 
 15 |     rep(c(1, 2), each = 3))]
 16 | 
 17 | # assure global metrics are 
 18 | # included across all panels
 19 | 
 20 | ps[1:3] <- lapply(ps[1:3], \(.) {
 21 |     y <- .$data$metric
 22 |     y <- factor(y, .metrics_lab)
 23 |     .$data$metric <- y
 24 |     return(.)
 25 | })
 26 | 
 27 | # fix y-axis order across panels and add rectangles 
 28 | # highlighting gene-/cell-level & global summaries
 29 | 
 30 | s <- 0.50 # border size
 31 | d <- 0.00 # offset b/w boxes
 32 | 
 33 | ps[1:3] <- lapply(ps[1:3], \(p)
 34 |     p + scale_y_discrete(limits = rev(.metrics_lab)) +
 35 |         geom_rect(
 36 |             xmin = 0.5 + d, 
 37 |             xmax = length(unique(p$data$method)) + 0.5 - d, 
 38 |             ymax = length(.metrics_lab) + 0.5 - d, 
 39 |             ymin = length(.none_metrics) + length(.cell_metrics) + 0.5 + d, 
 40 |             size = s, fill = NA, col = "red") +
 41 |         geom_rect(
 42 |             xmin = 0.5 + d, 
 43 |             xmax = length(unique(p$data$method)) + 0.5 - d, 
 44 |             ymin = length(.none_metrics) + 0.5 + d,
 45 |             ymax = length(.none_metrics) + length(.cell_metrics) + 0.5 - d, 
 46 |             size = s, fill = NA, col = "blue") +
 47 |         geom_rect(
 48 |             xmin = 0.5 + d, 
 49 |             xmax = length(unique(p$data$method)) + 0.5 - d, 
 50 |             ymin = 0.5 - d, 
 51 |             ymax = length(.none_metrics) + 0.5 - d, 
 52 |             size = s, fill = NA, col = "green3"))
 53 | 
 54 | # get x-axis (methods) ordering
 55 | xo <- lapply(ps[1:3], \(p) ggplot_build(p)$layout$panel_scales_x[[1]]$limits)
 56 | 
 57 | .metric_pairs <- unique(ps[[5]]$data$metrics)
 58 | .metric_pairs <- .metric_pairs[c(3, 2, 1, 9, 8, 6, 5, 4, 7)]
 59 | 
 60 | ps[4:6] <- lapply(seq_along(ps[4:6]), \(i) {
 61 |     p <- ps[4:6][[i]]
 62 |     p + scale_x_discrete(limits = xo[[i]]) +
 63 |         scale_y_discrete(limits = rev(.metric_pairs)) +
 64 |         geom_rect(
 65 |             xmin = 0.5 - d, 
 66 |             xmax = length(unique(p$data$method)) + 0.5 - d, 
 67 |             ymax = length(.metric_pairs) + 0.5 - d, 
 68 |             ymin = 3.5 + d, 
 69 |             size = s, fill = NA, col = "red") +
 70 |         geom_rect(
 71 |             xmin = 0.5 - d, 
 72 |             xmax = length(unique(p$data$method)) + 0.5 - d, 
 73 |             ymin = 0.5 + d,
 74 |             ymax = 3.5 - d, 
 75 |             size = s, fill = NA, col = "blue")
 76 | })
 77 | 
 78 | # drop y-axis labels from all but left-most panels
 79 | ps[-c(1, 4)] <- lapply(ps[-c(1, 4)], "+", 
 80 |     theme(axis.text.y = element_blank()))
 81 | 
 82 | # re-size relative to number of 
 83 | # rows (metrics) & columns (methods)
 84 | hs <- c(
 85 |     length(unique(ps[[1]]$data$metric)),
 86 |     1.5 + length(unique(ps[[5]]$data$metrics)))
 87 | ws <- sapply(ps[1:3], \(.) nlevels(.$data$method))
 88 | 
 89 | x <- c("gene-level" = "red", "cell-level" = "blue", "global" = "green3")
 90 | ps[[1]] <- ps[[1]] +
 91 |     geom_point(
 92 |         inherit.aes = FALSE,
 93 |         data = data.frame(x), 
 94 |         aes_string(
 95 |             ps[[1]]$data$method[1], 
 96 |             ps[[1]]$data$metric[1], 
 97 |             col = "x"), alpha = 0) +
 98 |     scale_color_manual("type of\nsummary", values = x) +
 99 |     guides(color = guide_legend(order = 1, override.aes = list(alpha = 1))) 
100 | 
101 | fig <- wrap_plots(ps,
102 |     nrow = 2, heights = hs, widths = ws) +
103 |     plot_layout(guides = "collect") +
104 |     plot_annotation(tag_levels = "a") &
105 |     coord_cartesian() &
106 |     theme(
107 |         axis.text.x = element_text(angle = 30),
108 |         plot.margin = margin(l = 2, unit = "mm"),
109 |         plot.tag = element_text(face = "bold", size = 9))
110 | 
111 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")
112 | 


--------------------------------------------------------------------------------
/code/08-fig_integration.R:
--------------------------------------------------------------------------------
 1 | # val <- rep(c("cms", "ldf", "bcs"), each = 4)
 2 | # args <- list(
 3 | #     uts = "code/utils-plotting.R",
 4 | #     pdf = "figs/integration.pdf",
 5 | #     rds = sprintf(c(
 6 | #         "plts/batch-boxplot_by_method_%s.rds",
 7 | #         "plts/batch-boxplot_dX_%s.rds",
 8 | #         "plts/batch-heatmap_by_method_%s.rds",
 9 | #         "plts/batch-correlations_%s.rds"), val))
10 | 
11 | source(args$uts)
12 | ps <- lapply(args$rds, readRDS)
13 | 
14 | # split plots by value
15 | pat <- ".*_(.*)\\.rds$"
16 | lys <- split(ps, gsub(pat, "\\1", args$rds))
17 | 
18 | # fix methods order according to ref-sim 
19 | # difference for boxplots & heatmaps
20 | o <- c("ref", ggplot_build(lys[[1]][[2]])$layout$panel_params[[1]]$x$breaks)
21 | lys <- lapply(lys, \(ps) {
22 |     ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o)
23 |     ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o)
24 |     return(ps)
25 | })
26 | 
27 | # generate separate figure for each
28 | lys <- lapply(lys, \(p) {
29 |     p[[3]]$layers[[1]]$aes_params$colour <- NA
30 |     p[[3]] <- p[[3]] + theme(
31 |         axis.text.y = element_text(size = 3, angle = 60, hjust = 1),
32 |         axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5))
33 |     p[[4]] <- p[[4]] + theme(
34 |         legend.background = element_blank())
35 |     p <- lapply(p, \(.) wrap_elements(full = .))
36 |     ws <- c(3, 1)
37 |     fig <- ((p[[1]] + p[[2]]) + plot_layout(widths = ws)) / 
38 |     ((p[[3]] + p[[4]]) + plot_layout(widths = ws)) +
39 |     plot_annotation(tag_levels = "a") &
40 |     theme(
41 |         plot.margin = margin(),
42 |         plot.tag = element_text(face = "bold", size = 9))
43 | })
44 | 
45 | pdf(args$pdf, width = 17.6/2.54, height = 9.9/2.54)
46 | for (p in lys) print(p); dev.off()
47 | 


--------------------------------------------------------------------------------
/code/08-fig_mds.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     pdf = "figs/mds.pdf",
 3 | #     uts = "code/utils-plotting.R",
 4 | #     rds = list.files("plts", "reftyp-mds.*ks\\.rds", full.names = TRUE))
 5 | 
 6 | source(args$uts)
 7 | ps <- lapply(args$rds, readRDS)
 8 | ps[[1]]$guides$fill <- "none"
 9 | 
10 | fig <- wrap_plots(ps, ncol = 1) +
11 |     plot_layout(guides = "collect") +
12 |     plot_annotation(tag_levels = "a") &
13 |     theme(
14 |         plot.margin = margin(),
15 |         legend.position = "bottom",
16 |         plot.tag = element_text(size = 9, face = "bold"))
17 | 
18 | ggsave(args$pdf, fig, width = 9, height = 17, units = "cm")


--------------------------------------------------------------------------------
/code/08-fig_memory.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     rds = list.files("plts", "mbs.*\\.rds", full.names = TRUE),
 3 | #     pdf = "figs/memory.pdf",
 4 | #     uts = "code/utils-plotting.R")
 5 | 
 6 | source(args$uts)
 7 | 
 8 | ps <- lapply(args$rds, readRDS)
 9 | 
10 | pat <- paste0("_", c("n", "b", "k"), ".rds")
11 | ps <- ps[sapply(pat, grep, args$rds)]
12 | 
13 | fig <- wrap_plots(ps, ncol = 1) +
14 |     plot_annotation(tag_levels = "a") &
15 |     theme(
16 |         plot.margin = margin(),
17 |         panel.spacing = unit(1, "mm"),
18 |         legend.title = element_blank(),
19 |         plot.tag = element_text(size = 9, face = "bold"))
20 | 
21 | ggsave(args$pdf, fig, width = 18, height = 16, units = "cm")
22 | 


--------------------------------------------------------------------------------
/code/08-fig_runtimes.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     rds = list.files("plts", "rts.*\\.rds", full.names = TRUE),
 3 | #     pdf = "figs/runtimes.pdf",
 4 | #     uts = "code/utils-plotting.R")
 5 | 
 6 | source(args$uts)
 7 | 
 8 | ps <- lapply(args$rds, readRDS)
 9 | 
10 | pat <- paste0("_", c("n", "b", "k"), ".rds")
11 | ps <- ps[sapply(pat, grep, args$rds)]
12 | 
13 | fig <- wrap_plots(ps, ncol = 1) +
14 |     plot_annotation(tag_levels = "a") &
15 |     theme(
16 |         plot.margin = margin(),
17 |         panel.spacing = unit(1, "mm"),
18 |         legend.title = element_blank(),
19 |         plot.tag = element_text(size = 9, face = "bold"))
20 | 
21 | ggsave(args$pdf, fig, width = 18, height = 21, units = "cm")
22 | 


--------------------------------------------------------------------------------
/code/08-fig_scalability.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     rds = list.files("plts", "(rts|mbs).*\\.rds", full.names = TRUE),
 3 | #     pdf = "figs/rts_vs_mbs.pdf",
 4 | #     uts = "code/utils-plotting.R")
 5 | 
 6 | source(args$uts)
 7 | ps <- lapply(args$rds, readRDS)
 8 | 
 9 | pat <- ".*_([a-z])\\.rds"
10 | reftyp <- gsub(pat, "\\1", basename(args$rds))
11 | is <- split(seq_along(ps), reftyp)[c("n", "b", "k")]
12 | 
13 | ps <- lapply(is, \(i) {
14 | 
15 |     nm <- c("rts", "mbs")
16 |     for (j in seq_along(nm)) {
17 |         k <- grep(nm[[j]], args$rds[i])
18 |         df <- paste0("df_", nm[[j]])
19 |         assign(df, ps[i][[k]]$data)
20 |     }
21 |     
22 |     i <- c("method", "dim", "n", j <- c("t", "mbs"))
23 |     #f <- \(.) factor(., sort(as.numeric(as.character(unique(.)))))
24 |     f <- \(.) as.numeric(as.character(.))
25 |     df_rts <- df_rts %>% 
26 |         filter(step == "overall") %>% 
27 |         mutate(n = f(n)) %>% ungroup() %>% select(any_of(i)) 
28 |     
29 |     df_mbs <- df_mbs %>% 
30 |         mutate(n = f(n)) %>% ungroup() %>% select(any_of(i)) 
31 |     
32 |     df <- full_join(
33 |         df_rts, df_mbs, 
34 |         by = setdiff(i, j)) %>% 
35 |         filter(!is.na(t), !is.na(mbs))
36 |     pal <- .methods_pal[levels(df$method)]
37 |     
38 |     p <- ggplot(df, aes(t, mbs, col = method, fill = method)) +
39 |         scale_color_manual(values = pal) +
40 |         scale_fill_manual(values = pal) +
41 |         facet_grid(~ dim) +
42 |         geom_point(alpha = 0.8) + 
43 |         geom_path(size = 0.2) +
44 |         scale_x_continuous("runtime (s)", limits = c(0, NA), trans = "sqrt") + 
45 |         scale_y_continuous("memory usage (MBs)", limits = c(0, NA)) 
46 |     
47 |     .prettify(p) + theme(
48 |         panel.grid.major = element_line(color = "grey", size = 0.2))
49 | })
50 | 
51 | fig <- wrap_plots(ps, ncol = 1) +
52 |     plot_annotation(tag_levels = "a") &
53 |     theme(
54 |         plot.margin = margin(),
55 |         panel.spacing = unit(1, "mm"),
56 |         legend.title = element_blank(),
57 |         plot.tag = element_text(size = 9, face = "bold"))
58 | 
59 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
60 | 


--------------------------------------------------------------------------------
/code/08-fig_scatters.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     pdf = "figs/scatters.pdf",
 4 | #     rds = list.files("plts", "scatters.*\\.rds", full.names = TRUE))
 5 | 
 6 | source(args$uts)
 7 | 
 8 | ps <- lapply(args$rds, readRDS)
 9 | ps[[1]]$facet$params$nrow <- 2
10 | ps[[2]]$facet$params$nrow <- 1
11 | 
12 | fig <- 
13 |     wrap_plots(ps, ncol = 1, heights = c(2, 1)) +
14 |     plot_annotation(tag_levels = "a") &
15 |     scale_x_sqrt(breaks = c(0.2, 0.5, 1)) &
16 |     theme(
17 |         plot.margin = margin(r = 1, unit = "mm"),
18 |         plot.tag = element_text(size = 9, face = "bold"))
19 | 
20 | ggsave(args$pdf, fig, width = 20, height = 11, units = "cm")
21 | 


--------------------------------------------------------------------------------
/code/08-fig_stat1d.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     pdf = "figs/stat1d.pdf",
 4 | #     rds = list.files("plts", "stat_1d_by_reftyp-boxplot.*\\.rds", full.names = TRUE))
 5 | 
 6 | source(args$uts)
 7 | 
 8 | ps <- lapply(c("n", "b", "k"), function(type) {
 9 |     pat <- sprintf(",%s,", type)
10 |     fns <- grep(pat, args$rds, value = TRUE)
11 |     lapply(rev(fns), readRDS) %>% 
12 |         wrap_plots(ncol = 1) +
13 |         plot_layout(guides = "collect") +
14 |         plot_annotation(tag_levels = "a") &
15 |         theme(
16 |             panel.spacing = unit(1, unit = "mm"),
17 |             plot.margin = margin(t = 1, unit = "mm"),
18 |             plot.tag = element_text(size = 9, face = "bold"))
19 | }) 
20 | 
21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...)
22 | fig <- structure(ps, class = c("gglist", "ggplot"))
23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
24 | 


--------------------------------------------------------------------------------
/code/08-fig_stat2d.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     pdf = "figs/stat2d.pdf",
 4 | #     rds = list.files("plts", "stat_2d_by_reftyp-boxplot.*\\.rds", full.names = TRUE))
 5 | 
 6 | source(args$uts)
 7 | 
 8 | ps <- lapply(c("n", "b", "k"), function(type) {
 9 |     pat <- sprintf(",%s,", type)
10 |     fns <- grep(pat, args$rds, value = TRUE)
11 |     lapply(rev(fns), readRDS) %>%
12 |         wrap_plots(ncol = 1) +
13 |         plot_layout(guides = "collect") +
14 |         plot_annotation(tag_levels = "a") &
15 |         theme(
16 |             panel.spacing = unit(1, unit = "mm"),
17 |             plot.margin = margin(t = 1, unit = "mm"),
18 |             plot.tag = element_text(size = 9, face = "bold"))
19 | })
20 | 
21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...)
22 | fig <- structure(ps, class = c("gglist", "ggplot"))
23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")


--------------------------------------------------------------------------------
/code/08-fig_summaries.R:
--------------------------------------------------------------------------------
 1 | # args <- list(
 2 | #     uts = "code/utils-plotting.R",
 3 | #     pdf = "figs/summaries.pdf",
 4 | #     rds = file.path("plts", c(
 5 | #         "qc_ref-correlations.rds",
 6 | #         "stat_1d_by_stat1d-correlations,ks.rds",
 7 | #         "stat_1d_by_stat1d-mds,ks.rds",
 8 | #         "stat_1d_by_stat1d-pca,ks.rds")))
 9 | 
10 | source(args$uts)
11 | ps <- lapply(args$rds, readRDS)
12 | 
13 | ps[c(1, 2)] <- lapply(ps[c(1, 2)], "+", theme(
14 |     axis.text.x = element_text(size = 3, angle = 30)))
15 | 
16 | ps[[3]]$layers[[2]]$aes_params$size <- 1.25
17 | ps[[3]] <- ps[[3]] + theme(legend.position = "right")
18 | ps[[3]] <- wrap_elements(full = ps[[3]])
19 | 
20 | ps[[4]][[1]] <- ps[[4]][[1]] +
21 |     theme(legend.position = "none")
22 | ps[[4]][[1]]$coordinates$expand <- TRUE
23 | ps[[4]][[2]]$layers <- ps[[4]][[2]]$layers[-4]
24 | ps[[4]][[2]] <- ps[[4]][[2]] +
25 |     theme(
26 |         legend.position = c(0.95, 0.5),
27 |         legend.justification = c(1, 0.5))
28 | 
29 | ps[[4]] <- ps[[4]] & theme(plot.tag = element_blank())
30 | ps[[4]] <- wrap_elements(full = ps[[4]])
31 | 
32 | fig <- (wrap_plots(ps[1:3], ncol = 1) |
33 |     wrap_plots(ps[4], ncol = 1)) +
34 |     plot_layout(widths = c(1, 2.5)) +
35 |     plot_annotation(tag_levels = "a") &
36 |     theme(
37 |         plot.margin = margin(l = 1, t = 1, unit = "mm"),
38 |         plot.tag = element_text(size = 9, face = "bold"))
39 | 
40 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")
41 | 


--------------------------------------------------------------------------------
/code/09-write_fns.R:
--------------------------------------------------------------------------------
1 | pat <- paste0("^", wcs$pat)
2 | fns <- list.files("outs", pat, full.names = TRUE)
3 | writeLines(fns, args$txt)


--------------------------------------------------------------------------------
/code/09-write_obj.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 | fns <- readLines(args$txt)
3 | res <- .read_res(fns)
4 | saveRDS(res, args$rds)


--------------------------------------------------------------------------------
/code/10-session_info.R:
--------------------------------------------------------------------------------
 1 | x <- c(
 2 | # GENERAL
 3 | # Bioconductor
 4 |     "Biobase",
 5 |     "BiocParallel",
 6 |     "CellBench",
 7 |     "ExperimentHub",
 8 |     "GEOquery",
 9 |     "scater",
10 |     "scran",
11 |     "SingleCellExperiment",
12 |     "TENxPBMCData",
13 |     "variancePartition",
14 |     "waddR",
15 | # CRAN
16 |     "cluster",
17 |     "dplyr",
18 |     "emdist",
19 |     "jsonlite",
20 |     "ggplot2",
21 |     "ggpubr",
22 |     "ggrastr",
23 |     "Matrix",
24 |     "matrixStats",
25 |     "patchwork",
26 |     "Peacock.test",
27 |     "RANN",
28 |     "RColorBrewer",
29 |     "Seurat",
30 |     "SeuratData",
31 |     "tidyr",
32 |     "tidytext",
33 | # SIMULATION
34 | # Bioconductor
35 |     "BASiCS",
36 |     "muscat",
37 |     "scDD",
38 |     "splatter",
39 |     "SPsimSeq",
40 |     "zinbwave",
41 | # GitHub
42 |     "JINJINT/ESCO",
43 |     "suke18/POWSC",
44 |     "bvieth/powsimR",
45 |     "Vivianstats/scDesign",
46 |     "JSB-UCLA/scDesign2",
47 |     "YosefLab/SymSim",
48 |     "statOmics/zingeR",
49 | # INTEGRATION
50 | # Bioconductor
51 |     "batchelor",
52 |     "CellMixS",
53 |     "edgeR",
54 |     "limma",
55 |     "sva",
56 |     # CRAN
57 |     "harmony",
58 | # CLUSTERING
59 | # Bioconductor
60 |     "ConsensusClusterPlus",
61 |     "flowCore",
62 |     "monocle",
63 |     "SC3",
64 | # CRAN
65 |     "clue",
66 |     "Rtsne",
67 | # GitHub
68 |     "VCCRI/cidr",
69 |     "JustinaZ/pcaReduce"
70 | )
71 | 
72 | # # TO INSTALL ALL DEPENDENCIES:
73 | # if (!require(BiocManager)) 
74 | #     install.packages("BiocManager")
75 | # for (. in x) 
76 | #     if (!require(., character.only = TRUE)) 
77 | #         BiocManager::install(., ask = FALSE, update = TRUE)
78 | 
79 | # TO CAPTURE SESSIO INFO:
80 | for (. in x) {
81 |     . <- gsub(".*/", "", .)
82 |     suppressPackageStartupMessages(
83 |         library(., character.only = TRUE))
84 | }
85 | si <- capture.output(sessionInfo())
86 | writeLines(si, args[[1]])
87 | 


--------------------------------------------------------------------------------
/code/utils-clustering.R:
--------------------------------------------------------------------------------
  1 | # Hungarian algorithm
  2 | # (LM Weber, Aug '16)
  3 | 
  4 | .hungarian <- function(clus_algorithm, clus_truth) {
  5 |   
  6 |   # number of detected clusters
  7 |   n_clus <- length(table(clus_algorithm))
  8 |   
  9 |   # remove unassigned cells (NA's in clus_truth)
 10 |   unassigned <- is.na(clus_truth)
 11 |   clus_algorithm <- clus_algorithm[!unassigned]
 12 |   clus_truth <- clus_truth[!unassigned]
 13 |   if (length(clus_algorithm) != length(clus_truth)) 
 14 |       warning("vector lengths are not equal")
 15 |   
 16 |   tbl_algorithm <- table(clus_algorithm)
 17 |   tbl_truth <- table(clus_truth)
 18 |   
 19 |   # detected clusters in rows, true populations in columns
 20 |   pr_mat <- re_mat <- F1_mat <- matrix(NA, nrow = length(tbl_algorithm), ncol = length(tbl_truth))
 21 |   
 22 |   for (i in 1:length(tbl_algorithm)) {
 23 |     for (j in 1:length(tbl_truth)) {
 24 |       i_int <- as.integer(names(tbl_algorithm))[i]  # cluster number from algorithm
 25 |       j_int <- as.integer(names(tbl_truth))[j]  # cluster number from true labels
 26 |       
 27 |       true_positives <- sum(clus_algorithm == i_int & clus_truth == j_int, na.rm = TRUE)
 28 |       detected <- sum(clus_algorithm == i_int, na.rm = TRUE)
 29 |       truth <- sum(clus_truth == j_int, na.rm = TRUE)
 30 |       
 31 |       # calculate precision, recall, and F1 score
 32 |       precision_ij <- true_positives / detected
 33 |       recall_ij <- true_positives / truth
 34 |       F1_ij <- 2 * (precision_ij * recall_ij) / (precision_ij + recall_ij)
 35 |       
 36 |       if (F1_ij == "NaN") F1_ij <- 0
 37 |       
 38 |       pr_mat[i, j] <- precision_ij
 39 |       re_mat[i, j] <- recall_ij
 40 |       F1_mat[i, j] <- F1_ij
 41 |     }
 42 |   }
 43 |   
 44 |   # put back cluster labels (note some row names may be missing due to removal of unassigned cells)
 45 |   rownames(pr_mat) <- rownames(re_mat) <- rownames(F1_mat) <- names(tbl_algorithm)
 46 |   colnames(pr_mat) <- colnames(re_mat) <- colnames(F1_mat) <- names(tbl_truth)
 47 |   
 48 |   # match labels using Hungarian algorithm applied to matrix of F1 scores (Hungarian
 49 |   # algorithm calculates an optimal one-to-one assignment)
 50 |   
 51 |   # use transpose matrix (Hungarian algorithm assumes n_rows <= n_cols)
 52 |   F1_mat_trans <- t(F1_mat)
 53 |   
 54 |   if (nrow(F1_mat_trans) <= ncol(F1_mat_trans)) {
 55 |     # if fewer (or equal no.) true populations than detected clusters, can match all true populations
 56 |     labels_matched <- clue::solve_LSAP(F1_mat_trans, maximum = TRUE)
 57 |     # use row and column names since some labels may have been removed due to unassigned cells
 58 |     labels_matched <- as.numeric(colnames(F1_mat_trans)[as.numeric(labels_matched)])
 59 |     names(labels_matched) <- rownames(F1_mat_trans)
 60 |     
 61 |   } else {
 62 |     # if fewer detected clusters than true populations, use transpose matrix and assign
 63 |     # NAs for true populations without any matching clusters
 64 |     labels_matched_flipped <- clue::solve_LSAP(F1_mat, maximum = TRUE)
 65 |     # use row and column names since some labels may have been removed due to unassigned cells
 66 |     labels_matched_flipped <- as.numeric(rownames(F1_mat_trans)[as.numeric(labels_matched_flipped)])
 67 |     names(labels_matched_flipped) <- rownames(F1_mat)
 68 |     
 69 |     labels_matched <- rep(NA, ncol(F1_mat))
 70 |     names(labels_matched) <- rownames(F1_mat_trans)
 71 |     labels_matched[as.character(labels_matched_flipped)] <- as.numeric(names(labels_matched_flipped))
 72 |   }
 73 |   
 74 |   # precision, recall, F1 score, and number of cells for each matched cluster
 75 |   pr <- re <- F1 <- n_cells_matched <- rep(NA, ncol(F1_mat))
 76 |   names(pr) <- names(re) <- names(F1) <- names(n_cells_matched) <- names(labels_matched)
 77 |   
 78 |   for (i in 1:ncol(F1_mat)) {
 79 |     # set to 0 if no matching cluster (too few detected clusters); use character names 
 80 |     # for row and column indices in case subsampling completely removes some clusters
 81 |     pr[i] <- ifelse(is.na(labels_matched[i]), 0, pr_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
 82 |     re[i] <- ifelse(is.na(labels_matched[i]), 0, re_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
 83 |     F1[i] <- ifelse(is.na(labels_matched[i]), 0, F1_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
 84 |     
 85 |     n_cells_matched[i] <- sum(clus_algorithm == labels_matched[i], na.rm = TRUE)
 86 |   }
 87 |   
 88 |   # means across populations
 89 |   mean_pr <- mean(pr)
 90 |   mean_re <- mean(re)
 91 |   mean_F1 <- mean(F1)
 92 |   
 93 |   return(list(n_clus = n_clus, 
 94 |               pr = pr, 
 95 |               re = re, 
 96 |               F1 = F1, 
 97 |               labels_matched = labels_matched, 
 98 |               n_cells_matched = n_cells_matched, 
 99 |               mean_pr = mean_pr, 
100 |               mean_re = mean_re, 
101 |               mean_F1 = mean_F1))
102 | }


--------------------------------------------------------------------------------
/code/utils-integration.R:
--------------------------------------------------------------------------------
 1 | .cms_ldf <- \(df) df %>% 
 2 |     group_by(refset, sim_method) %>% 
 3 |     mutate(
 4 |         # center around 0
 5 |         cms = cms - 0.5,
 6 |         # scale to range 1
 7 |         ldf = ldf / diff(range(ldf)),
 8 |         # center around 0
 9 |         ldf = ldf - min(ldf) - 0.5)
10 | 
11 | .bcs <- \(df, n = 1) {
12 |     stopifnot(any(n == c(1, 2)))
13 |     .avg <- \(df) summarise(df,
14 |         across(c(cms, ldf), mean),
15 |         .groups = "drop_last")
16 |     df <- df %>% 
17 |         group_by(refset, sim_method, batch_method, batch) %>% 
18 |         # average across cells
19 |         .avg()
20 |     if (n == 2)
21 |         # average across batches
22 |         df <- df %>% .avg()
23 |     # batch correction score
24 |     df %>% mutate(bcs = abs(cms) + abs(ldf))
25 | }
26 | 
27 | .batch_labs <- c(
28 |     cms = "CMS*", bcs = "BCS",
29 |     ldf = expression(Delta~"LDF*"))


--------------------------------------------------------------------------------
/code/utils-plotting.R:
--------------------------------------------------------------------------------
  1 | suppressPackageStartupMessages({
  2 |     library(dplyr)
  3 |     library(ggplot2)
  4 |     library(ggpubr)
  5 |     library(ggrastr)
  6 |     library(ggrepel)
  7 |     library(magrittr)
  8 |     library(patchwork)
  9 |     library(RColorBrewer)
 10 |     library(tidyr)
 11 |     library(tidytext)
 12 | })
 13 | 
 14 | .read_res <- function(x) {
 15 |     df <- bind_rows(lapply(x, readRDS)) %>% 
 16 |         replace_na(list(method = "ref")) %>% 
 17 |         mutate(method = droplevels(factor(method, names(.methods_pal))))
 18 |     if (all(c("datset", "subset") %in% names(df)))
 19 |         df <- mutate(df, refset = paste(datset, subset, sep = ","))
 20 |     if ("id" %in% names(df))
 21 |         df <- mutate(df,
 22 |             group = relevel(factor(group), ref = "global"),
 23 |             id = case_when(group == "global" ~ "global", TRUE ~ id),
 24 |             id = relevel(factor(id), ref = "global"))
 25 |     if (all(c("refset", "group") %in% names(df)))
 26 |         df <- group_by(df, refset) %>% 
 27 |             mutate(.group = as.character(group)) %>% 
 28 |             mutate(reftyp = ifelse(any(.group == "batch"), "b", 
 29 |                 ifelse(any(.group == "cluster"), "k", "n")),
 30 |                 reftyp = factor(reftyp, c("n", "b", "k"))) %>% 
 31 |             select(-.group)
 32 |     if (sum(grepl("method", names(df))) > 1)
 33 |         df <- rename(df, sim_method = method)
 34 |     if (any(grepl("^metric", names(df))))
 35 |         df <- mutate(df, across(
 36 |             starts_with("metric"), 
 37 |             ~droplevels(factor(.x, 
 38 |                 levels = names(.metrics_lab), 
 39 |                 labels = .metrics_lab))))
 40 |     return(df)
 41 | }
 42 | 
 43 | .filter_res <- function(df) df %>% 
 44 |     mutate(
 45 |         .metric = names(.metrics_lab)[
 46 |             match(metric, .metrics_lab)],
 47 |         across(c(group, id), as.character)) %>% 
 48 |     filter(
 49 |         # keep everything for type 'n'
 50 |         reftyp == "n" |
 51 |         # keep all groupings for global summaries, 
 52 |         # gene-level (-correlations), cell-level 
 53 |         # (-log-library size & cell detection frequency
 54 |         .metric %in% .none_metrics |
 55 |         .metric %in% setdiff(.gene_metrics, "gene_cor") |
 56 |         .metric %in% setdiff(.cell_metrics, c("cell_lls", "cell_frq")) |
 57 |         # for gene-gene correlation, keep global only
 58 |         (group == id & .metric == "gene_cor") |
 59 |         # for log-library size & cell detection 
 60 |         # frequency, keep group-level results only 
 61 |         (group != id & grepl("cell_(lls|frq)", .metric))) %>% 
 62 |     ungroup() %>% 
 63 |     select(-.metric) %>% 
 64 |     mutate(across(c(metric, method), droplevels))
 65 | 
 66 | # repeatedly average statistics, e.g., 
 67 | # across groups, subsets, datsets, etc. 
 68 | .avg <- \(df, n) {
 69 |     .fun <- \(df) summarise_at(df, "stat", mean, na.rm = TRUE)
 70 |     res <- Reduce(\(df, foo) .fun(df), seq(n), init = df, accumulate = TRUE)
 71 |    ungroup(res[[n + 1]])
 72 | }
 73 | 
 74 | .prettify <- function(plt, thm = NULL, base_size = 6) 
 75 | {
 76 |     col <- !is.null(plt$labels$colour) && !is.numeric(plt$data[[plt$labels$colour]])
 77 |     fill <- !is.null(plt$labels$fill) && !is.numeric(plt$data[[plt$labels$fill]])
 78 |     
 79 |     plt <- plt + if (col && fill) { 
 80 |         tmp <- if (plt$labels$colour != plt$labels$fill) {
 81 |             guide_legend(order = 1, override.aes = list(alpha = 1, size = 2, stroke = 0.5, shape = 21))
 82 |         } else "none"
 83 |         guides(
 84 |             col = tmp,
 85 |             fill = guide_legend(override.aes = list(alpha = 1, size = 2, col = NA, shape = 21)))
 86 |     } else if (col) { guides(
 87 |         col = guide_legend(override.aes = list(alpha = 1, size = 2, shape = 21)))
 88 |     } else if (fill) { guides(
 89 |         fill = guide_legend(override.aes = list(alpha = 1, size = 2)))
 90 |     }
 91 |     plt <- plt + 
 92 |         theme_linedraw(base_size) + theme(
 93 |             panel.grid = element_blank(),
 94 |             legend.key.size = unit(0.5, "lines"),
 95 |             strip.background = element_blank(),
 96 |             strip.text = element_text(color = "black"))
 97 |     return(plt + thm)
 98 | }
 99 | 
100 | # aesthetics ----
101 | 
102 | .groups_pal <- c(global = "grey40", batch = "grey80", cluster = "grey80", group = "grey80")
103 | 
104 | pat <- ".*-sim_data-(.*)\\.R"
105 | methods <- gsub(pat, "\\1", list.files("code", pat))
106 | .methods_pal <- colorRampPalette(brewer.pal(12, "Paired"))(length(methods))
107 | .methods_pal <- c(ref = "black", setNames(.methods_pal, methods))
108 | 
109 | pat <- ".*calc_qc-(.*)\\.R"
110 | .metrics <- gsub(pat, "\\1", list.files("code", pat))
111 | .none_metrics <- c("gene_pve", "cell_cms", "cell_sw")
112 | .metrics <- setdiff(.metrics, .none_metrics)
113 | .gene_metrics <- grep("gene", .metrics, value = TRUE)
114 | .cell_metrics <- grep("cell", .metrics, value = TRUE)
115 | 
116 | .metrics_lab <- c(
117 |     gene_avg = "average of logCPM",
118 |     gene_var = "variance of logCPM",
119 |     gene_cv  = "coefficient of variation",
120 |     gene_frq = "gene detection frequency",
121 |     gene_cor = "gene-to-gene correlation",
122 |     cell_lls = "log-library size",
123 |     cell_frq = "cell detection frequency",
124 |     cell_cor = "cell-to-cell correlation",
125 |     cell_ldf = "local density factor",
126 |     cell_pcd = "cell-to-cell distance",
127 |     cell_knn = "KNN occurences",
128 |     gene_pve = "percent variance explained",
129 |     cell_cms = "cell-specific mixing score",
130 |     cell_sw = "silhouette width")
131 | 
132 | n <- 3
133 | .metrics_pal <- c(
134 |     setNames(hcl.colors(n*length(.gene_metrics), "Reds" )[seq(1, n*length(.gene_metrics), n)], .gene_metrics),
135 |     setNames(hcl.colors(n*length(.cell_metrics), "Blues")[seq(1, n*length(.cell_metrics), n)], .cell_metrics),
136 |     setNames(brewer.pal(1+length(.none_metrics), "Greens")[-1], .none_metrics))
137 | names(.metrics_pal) <- .metrics_lab
138 | 
139 | pat <- ".*-stat_1d-(.*)\\.R"
140 | .stats1d <- gsub(pat, "\\1", list.files("code", pat))
141 | .stats1d_lab <- c(ks = "KS", ws = "W")
142 | 
143 | pat <- ".*-stat_2d-(.*)\\.R"
144 | .stats2d <- gsub(pat, "\\1", list.files("code", pat))
145 | .stats2d_lab <- c(emd = "EMD", ks2 = "KS")
146 | 


--------------------------------------------------------------------------------
/code/utils-summaries.R:
--------------------------------------------------------------------------------
 1 | # split cell indices into list
 2 | # - global = list w/ one element (all cells)
 3 | # - batch = list w/ one element per batch (optional)
 4 | # - cluster = list w/ one element per cluster (optional)
 5 | .split_cells <- function(sce, 
 6 |     i = c("global", "batch", "cluster"))
 7 | {
 8 |     names(j) <- j <- intersect(i, names(colData(sce)))
 9 |     cs <- lapply(j, function(.) {
10 |         ids <- droplevels(factor(sce[[.]]))
11 |         split(seq(ncol(sce)), ids)
12 |     })
13 |     if ("global" %in% i)
14 |         cs <- c(list(global = list(foo = seq(ncol(sce)))), cs)
15 |     return(cs)
16 | }
17 | 
18 | # split genes by group = "cluster","batch", "sample". 
19 | # Then per group calculate the qc with the FUN function(which is a metric)
20 | # returns a dataframe with cols:  group | id | metric_name
21 | .calc_qc <- function(sce, fun, 
22 |     n_genes = NULL, n_cells = NULL, 
23 |     groups = c("global", "cluster", "batch")) 
24 | {
25 |     suppressPackageStartupMessages({
26 |         library(dplyr)
27 |         library(purrr)
28 |     })
29 |     if (is.null(groups)) {
30 |         groups <- eval(formals(.calc_qc)$groups)
31 |     } else {
32 |         groups <- match.arg(groups, several.ok = TRUE)
33 |     }
34 |     # split cells into groups
35 |     idx <- .split_cells(sce, groups)
36 |     # downsample to at most 'n_genes' in total
37 |     if (!is.null(n_genes)) {
38 |         n_genes <- min(n_genes, nrow(sce))
39 |         gs <- sample(nrow(sce), n_genes)
40 |         sce <- sce[gs, ]
41 |     }
42 |     # downsample to at most 'n_cells' per group
43 |     if (!is.null(n_cells)) {
44 |         idx <- map_depth(idx, -1, ~{
45 |             n_cells <- min(n_cells, length(.x))
46 |             sample(.x, n_cells)
47 |         })
48 |     }
49 |     # compute QC metric per group
50 |     res <- map_depth(idx, -1, ~{
51 |         data.frame(
52 |             row.names = NULL, 
53 |             value = fun(sce[, .x]))
54 |     })
55 |     # join into single table
56 |     res <- map_depth(res, 1, bind_rows, .id = "id")
57 |     res <- bind_rows(res, .id = "group")
58 |     if (nrow(res) != 0) return(res)
59 | }


--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | R: "R_LIBS_USER=~/libs/bioc313/ /usr/local/R/R-4.1.0/bin/R"


--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/data/.gitkeep


--------------------------------------------------------------------------------
/figs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/figs/.gitkeep


--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/logs/.gitkeep


--------------------------------------------------------------------------------
/meta/methods.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"BASiCS": ["b"],
 3 | 	"ESCO": ["n", "b", "k"],
 4 | 	"hierarchicell": ["n", "b"],
 5 | 	"muscat": ["n", "b", "k"],
 6 | 	"POWSC": ["n", "k"],
 7 | 	"powsimR": ["n"],
 8 | 	"scDD": ["n"],
 9 | 	"scDesign": ["n"],
10 | 	"scDesign2": ["n", "k"],
11 | 	"SCRIP": ["n", "b", "k"],
12 | 	"SPARSim": ["n", "b"],
13 | 	"SPsimSeq": ["n", "b"],
14 | 	"splatter": ["n"],
15 | 	"SymSim": ["n", "b"],
16 | 	"ZINB-WaVE": ["n", "b", "k"],
17 | 	"zingeR": ["n"]
18 | }


--------------------------------------------------------------------------------
/meta/runtimes.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"panc8,inDrop1.beta": {
 3 | 		"type": "n",
 4 | 		"n_genes": ["4000", "2000", "1000", "500"],
 5 | 		"n_cells": ["800", "400", "200", "100"]
 6 | 	},
 7 | 	"Mereu20,CD4T": {
 8 | 		"type": "b",
 9 | 		"n_genes": ["4000", "2000", "1000", "500"],
10 | 		"n_cells": ["2600", "1300", "650", "325"]
11 | 	},
12 | 	"Zheng17,foo": {
13 | 		"type": "k",
14 | 		"n_genes": ["3200", "1600", "800", "400"],
15 | 		"n_cells": ["2600", "1300", "650", "325"]
16 | 	}
17 | }


--------------------------------------------------------------------------------
/meta/subsets.json:
--------------------------------------------------------------------------------
  1 | {
  2 | 	"CellBench": {
  3 | 		"H2228": {
  4 | 			"type": "b",
  5 | 			"n_cells": "400",
  6 | 			"n_genes": "4000",
  7 | 			"cluster": "H2228"
  8 | 		},
  9 | 		"CELSeq": {
 10 | 			"type": "k",
 11 | 			"n_cells": "400",
 12 | 			"n_genes": "4000",
 13 | 			"batch": "sc_celseq"
 14 | 		}
 15 | 	},
 16 | 	"Ding20": {
 17 | 		"10x.InhibNeuron": {
 18 | 			"type": "n",
 19 | 			"n_cells": null,
 20 | 			"n_genes": "4000",
 21 | 			"batch": "10x Chromium",
 22 | 			"cluster": "Inhibitory neuron"
 23 | 		},
 24 | 		"ExcitNeuron": {
 25 | 			"type": "b",
 26 | 			"n_cells": "200",
 27 | 			"n_genes": "4000",
 28 | 			"cluster": "Excitatory neuron"
 29 | 		},
 30 | 		"DroNcSeq": {
 31 | 			"type": "k",
 32 | 			"n_cells": "200",
 33 | 			"n_genes": "4000",
 34 | 			"batch": "DroNc-seq"
 35 | 		}
 36 | 	},
 37 | 	"Gierahn17" : {
 38 | 		"foo": {
 39 | 			"type": "n", 
 40 | 			"n_cells": null,
 41 | 			"n_genes": "4000"
 42 | 		}
 43 | 	},
 44 | 	"Kang18": {
 45 | 		"1015": {
 46 | 			"type": "k",
 47 | 			"n_cells": "400",
 48 | 			"n_genes": "4000",
 49 | 			"batch": "1015"
 50 | 		},
 51 | 		"B": {
 52 | 			"type": "n",
 53 | 			"n_cells": null,
 54 | 			"n_genes": "4000",
 55 | 			"batch": "1015",
 56 | 			"cluster": "B cells"
 57 | 		},
 58 | 		"NK": {
 59 | 			"type": "n",
 60 | 			"n_cells": null,
 61 | 			"n_genes": "4000",
 62 | 			"batch": "1015",
 63 | 			"cluster": "NK cells"
 64 | 		}
 65 | 	},
 66 | 	"Koh16": {
 67 | 		"foo": {
 68 | 			"type": "k",
 69 | 			"n_cells": null,
 70 | 			"n_genes": "4000"
 71 | 		}
 72 | 	},
 73 | 	"MCA20.gland": {
 74 | 		"T": {
 75 | 			"type": "b",
 76 | 			"n_cells": null,
 77 | 			"n_genes": 4000,
 78 | 			"cluster": "T cell_Cd8b1 high"
 79 | 		}
 80 | 	},
 81 | 	"MCA20.lung": {
 82 | 		"AT2": {
 83 | 			"type": "b",
 84 | 			"n_cells": null,
 85 | 			"n_genes": 4000,
 86 | 			"cluster": "AT2 Cell"
 87 | 		}
 88 | 	},
 89 | 	"Mereu20":{
 90 | 		"CD4T": {
 91 | 			"type": "b",
 92 | 			"n_cells": "200",
 93 | 			"n_genes": "4000",
 94 | 			"cluster": "CD4 T cells"
 95 | 		},
 96 | 		"ddSeq": {
 97 | 			"type": "k",
 98 | 			"n_cells": "400",
 99 | 			"n_genes": "4000",
100 | 			"batch": "ddSEQ"
101 | 		}
102 | 	},
103 | 	"Oetjen18": {
104 | 		"R": {
105 | 			"type": "n",
106 | 			"n_cells": null,
107 | 			"n_genes": "4000",
108 | 			"batch": "R"
109 | 		},
110 | 		"foo": {
111 | 			"type": "b",
112 | 			"n_cells": "200",
113 | 			"n_genes": "4000"
114 | 		}
115 | 	},
116 | 	"panc8": {
117 | 		"inDrop1.beta": {
118 | 			"type": "n",
119 | 			"n_cells": null,
120 | 			"n_genes": "4000",
121 | 			"batch": "indrop1",
122 | 			"cluster": "beta"
123 | 		},
124 | 		"inDrop.ductal": {
125 | 			"type": "b",
126 | 			"n_cells": null,
127 | 			"n_genes": "4000",
128 | 			"batch": ["indrop1", "indrop2", "indrop3", "indrop4"],
129 | 			"cluster": "ductal"
130 | 		},
131 | 		"SmartSeq2": {
132 | 			"type": "k",
133 | 			"n_cells": "400",
134 | 			"n_genes": "4000",
135 | 			"batch": "smartseq2"
136 | 		}
137 | 	},
138 | 	"TabulaMuris": {
139 | 		"limb.MSCs": {
140 | 			"type": "n",
141 | 			"n_cells": null,
142 | 			"n_genes": "4000",
143 | 			"tissue": "Limb_Muscle",
144 | 			"cluster": "mesenchymal stem cell"
145 | 		},
146 | 		"spleen": {
147 | 			"type": "k",
148 | 			"n_cells": "400",
149 | 			"n_genes": "4000",
150 | 			"tissue": "Spleen"
151 | 		}
152 | 	},
153 | 	"Tung17": {
154 | 		"foo": {
155 | 			"type": "b",
156 | 			"n_cells": "400",
157 | 			"n_genes": "4000"
158 | 		},
159 | 		"NA19101": {
160 | 			"type": "n",
161 | 			"n_cells": null,
162 | 			"n_genes": "4000",
163 | 			"batch": "NA19101"
164 | 		}
165 | 	},
166 | 	"Zheng17": {
167 | 		"foo": {
168 | 			"type": "k",
169 | 			"n_cells": "400",
170 | 			"n_genes": "4000"
171 | 		},
172 | 		"HSCs": {
173 | 			"type": "n",
174 | 			"n_cells": null,
175 | 			"n_genes": "4000",
176 | 			"cluster": "HSCs CD34+"
177 | 		},
178 | 		"Monocytes": {
179 | 			"type": "n",
180 | 			"n_cells": "400",
181 | 			"n_genes": "4000",
182 | 			"cluster": "Monocytes CD14+"
183 | 		}
184 | 	}
185 | }


--------------------------------------------------------------------------------
/outs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/outs/.gitkeep


--------------------------------------------------------------------------------
/plts/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/plts/.gitkeep


--------------------------------------------------------------------------------
/schematic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/schematic.png


--------------------------------------------------------------------------------