├── .Rprofile
├── .gitignore
├── README.md
├── Snakefile
├── code
├── 00-get_data-CellBench.R
├── 00-get_data-Ding20.R
├── 00-get_data-Gierahn17.R
├── 00-get_data-Kang18.R
├── 00-get_data-Koh16.R
├── 00-get_data-MCA20.gland.R
├── 00-get_data-MCA20.lung.R
├── 00-get_data-Mereu20.R
├── 00-get_data-Oetjen18.R
├── 00-get_data-TabulaMuris.R
├── 00-get_data-Tung17.R
├── 00-get_data-Zheng17.R
├── 00-get_data-panc8.R
├── 00-get_data.R
├── 01-fil_data.R
├── 02-sub_data.R
├── 03-est_pars-BASiCS.R
├── 03-est_pars-ESCO.R
├── 03-est_pars-POWSC.R
├── 03-est_pars-SCRIP.R
├── 03-est_pars-SPARSim.R
├── 03-est_pars-SPsimSeq.R
├── 03-est_pars-SymSim.R
├── 03-est_pars-ZINB-WaVE.R
├── 03-est_pars-hierarchicell.R
├── 03-est_pars-muscat.R
├── 03-est_pars-powsimR.R
├── 03-est_pars-scDD.R
├── 03-est_pars-scDesign.R
├── 03-est_pars-scDesign2.R
├── 03-est_pars-splatter.R
├── 03-est_pars-zingeR.R
├── 03-est_pars.R
├── 04-sim_data-BASiCS.R
├── 04-sim_data-ESCO.R
├── 04-sim_data-POWSC.R
├── 04-sim_data-SCRIP.R
├── 04-sim_data-SPARSim.R
├── 04-sim_data-SPsimSeq.R
├── 04-sim_data-SymSim.R
├── 04-sim_data-ZINB-WaVE.R
├── 04-sim_data-hierarchicell.R
├── 04-sim_data-muscat.R
├── 04-sim_data-powsimR.R
├── 04-sim_data-scDD.R
├── 04-sim_data-scDesign.R
├── 04-sim_data-scDesign2.R
├── 04-sim_data-splatter.R
├── 04-sim_data-zingeR.R
├── 04-sim_data.R
├── 05-calc_batch-ComBat.R
├── 05-calc_batch-Harmony.R
├── 05-calc_batch-Seurat.R
├── 05-calc_batch-fastMNN.R
├── 05-calc_batch-limma.R
├── 05-calc_batch-mnnCorrect.R
├── 05-calc_batch.R
├── 05-calc_clust-CIDR.R
├── 05-calc_clust-PCA+HC.R
├── 05-calc_clust-PCA+KM.R
├── 05-calc_clust-SC3.R
├── 05-calc_clust-SC3svm.R
├── 05-calc_clust-Seurat.R
├── 05-calc_clust-monocle.R
├── 05-calc_clust-pcaReduce.R
├── 05-calc_clust-tSNE+KM.R
├── 05-calc_clust.R
├── 05-calc_dr.R
├── 05-calc_qc-cell_cms.R
├── 05-calc_qc-cell_cor.R
├── 05-calc_qc-cell_frq.R
├── 05-calc_qc-cell_knn.R
├── 05-calc_qc-cell_ldf.R
├── 05-calc_qc-cell_lls.R
├── 05-calc_qc-cell_pcd.R
├── 05-calc_qc-cell_sw.R
├── 05-calc_qc-gene_avg.R
├── 05-calc_qc-gene_cor.R
├── 05-calc_qc-gene_cv.R
├── 05-calc_qc-gene_frq.R
├── 05-calc_qc-gene_pve.R
├── 05-calc_qc-gene_var.R
├── 05-calc_qc.R
├── 05-runtimes.R
├── 06-dr_batch.R
├── 06-eval_batch.R
├── 06-eval_clust.R
├── 06-stat_1d-ks.R
├── 06-stat_1d-ws.R
├── 06-stat_1d.R
├── 06-stat_2d-emd.R
├── 06-stat_2d-ks2.R
├── 06-stat_2d.R
├── 07-plot_batch-boxplot_by_method.R
├── 07-plot_batch-boxplot_dX.R
├── 07-plot_batch-correlations.R
├── 07-plot_batch-densities.R
├── 07-plot_batch-heatmap_by_method.R
├── 07-plot_batch-heatmap_by_refset.R
├── 07-plot_clust-boxplot_by_method.R
├── 07-plot_clust-boxplot_dF1.R
├── 07-plot_clust-correlations.R
├── 07-plot_clust-heatmap_by_method.R
├── 07-plot_corr-stat_2d.R
├── 07-plot_dimred.R
├── 07-plot_dimred_batch.R
├── 07-plot_memory.R
├── 07-plot_qc_ref-correlations.R
├── 07-plot_runtimes.R
├── 07-plot_stat_1d-scatters.R
├── 07-plot_stat_1d_by_reftyp-boxplot.R
├── 07-plot_stat_1d_by_reftyp-dimEst.R
├── 07-plot_stat_1d_by_reftyp-heatmap.R
├── 07-plot_stat_1d_by_reftyp-mds.R
├── 07-plot_stat_1d_by_reftyp-pca.R
├── 07-plot_stat_1d_by_stat1d-boxplot_by_method.R
├── 07-plot_stat_1d_by_stat1d-boxplot_by_metric.R
├── 07-plot_stat_1d_by_stat1d-correlations.R
├── 07-plot_stat_1d_by_stat1d-correlations_by_method.R
├── 07-plot_stat_1d_by_stat1d-correlations_by_metric.R
├── 07-plot_stat_1d_by_stat1d-dimEst.R
├── 07-plot_stat_1d_by_stat1d-mds.R
├── 07-plot_stat_1d_by_stat1d-pca.R
├── 07-plot_stat_2d-scatters.R
├── 07-plot_stat_2d_by_reftyp-boxplot.R
├── 07-plot_stat_2d_by_reftyp-heatmap.R
├── 08-fig_boxplots.R
├── 08-fig_clustering.R
├── 08-fig_heatmaps.R
├── 08-fig_integration.R
├── 08-fig_mds.R
├── 08-fig_memory.R
├── 08-fig_runtimes.R
├── 08-fig_scalability.R
├── 08-fig_scatters.R
├── 08-fig_stat1d.R
├── 08-fig_stat2d.R
├── 08-fig_summaries.R
├── 09-write_fns.R
├── 09-write_obj.R
├── 10-session_info.R
├── utils-clustering.R
├── utils-integration.R
├── utils-plotting.R
└── utils-summaries.R
├── config.yaml
├── data
└── .gitkeep
├── figs
└── .gitkeep
├── logs
└── .gitkeep
├── meta
├── methods.json
├── runtimes.json
└── subsets.json
├── outs
└── .gitkeep
├── plts
└── .gitkeep
└── schematic.png
/.Rprofile:
--------------------------------------------------------------------------------
1 | data.table::setDTthreads(threads = 1)
2 |
3 | .get_wcs <- function(wcs) {
4 | wcs <- gsub("(,)(\\w+=)", ";\\2", wcs)
5 | ss <- strsplit(wcs, ";")[[1]]
6 | ss <- sapply(ss, strsplit, "=")
7 | keys <- sapply(ss, .subset, 1)
8 | vals <- sapply(ss, .subset, 2)
9 | wcs <- as.list(vals)
10 | names(wcs) <- keys
11 | return(wcs)
12 | }
13 |
14 | args <- R.utils::commandArgs(
15 | trailingOnly = TRUE,
16 | asValues = TRUE)
17 |
18 | if (!is.null(args$wcs)) {
19 | wcs <- .get_wcs(args$wcs)
20 | args$wcs <- NULL
21 | } else wcs <- NULL
22 |
23 | args <- lapply(args, function(u)
24 | unlist(strsplit(u, ";")))
25 |
26 | cat("WILDCARDS:\n\n"); print(wcs); cat("\n")
27 | cat("ARGUMENTS:\n\n"); print(args)
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .snakemake
3 |
4 | ._*
5 | *.txt
6 | *.html
7 |
8 | logs/*
9 | data/*
10 | outs/*
11 | plts/*
12 | figs/*
13 | !*/.gitkeep
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `Snakemake` workflow to benchmark
scRNA-seq data simulators
2 |
3 | - [Setup](#setup)
4 | - [Dependencies](#dependencies)
5 | - [Structure](#structure)
6 | - [Workflow](#workflow)
7 | - [Preproccessing](#preprocessing)
8 | - [Simulation](#simulation)
9 | - [Summaries](#summaries)
10 | - [Statistics](#statistics)
11 | - [Downstream](#downstream)
12 | - [Clustering](#clustering)
13 | - [Integration](#integration)
14 | - [Visualization](#visualization)
15 | - [Customization](#customization)
16 | - [Datasets](#datasets)
17 | - [Methods](#methods)
18 |
19 | ***
20 |
21 | # Setup
22 |
23 | ## Dependencies
24 |
25 | The current code was implemented using R v4.1.0, Bioconductor v3.13, Snakemake v5.5.0, and Python v3.6.8. All R dependencies (from GitHub, CRAN and Bioconductor) are listed under *code/10-session_info.R* and may be installed using the command contained therein.
26 |
27 | ## Structure
28 |
29 | * `config.yaml` specifies the R library and version to use
30 | * `code` contains all R scripts used in the *Snakemake* workflow
31 | * `data` contains raw, filtered and simulated scRNA-seq datasets,
32 | as well as simulation parameter estimates
33 | * `meta` contains two *.json* files that specify simulation method (`methods.json`) and reference subset (`subsets.json`) configurations
34 | * `outs` contains all results from computations (typically `data.frame`s) as *.rds* files
35 | * `figs` contains all visual outputs as *.pdf* files, and corresponding `ggplot` objects as *.rds* files (for subsequent arrangement into 'super'-figures)
36 |
37 | Simulation methods are tagged with *one or many* of the following labels, according to which scenario(s) they can accommodate:
38 |
39 | * `n` for none: no clusters or batches
40 | * `b` for batch: multiple batches, no clusters
41 | * `k` for cluster: multiple clusters, no batches
42 |
43 | Similarly, we tag subsets (see below) with *exactly one* of these labels. This allows running each method on subsets they are capable of simulating.
44 |
45 | ***
46 |
47 | # Workflow
48 |
49 | 
50 |
51 | ## Preprocessing
52 |
53 | **1. Data retrieval**
54 |
55 | Each `code/00-get_data-.R` script retrieves a publicly available scRNA-seq dataset through from which a *SingleCellExperiment* is constructed and written to `data/00-raw/.rds`
56 |
57 | **2. Filtering**
58 |
59 | `code/01-fil_data.R` is applied to each raw dataset as to
60 |
61 | * remove batches, cluster, or batch-cluster instances with fewer than 50 cells (depending on the dataset's complexity)
62 | * keep genes with a count of at least 1 in at least 10 cells, and remove cells with fewer than 100 detected genes
63 |
64 | Filtered data are written to `data/01-fil/.rds`.
65 |
66 | **3. Subsetting**
67 |
68 | Because different methods can accommodate only some features (e.g. multiple batches or clusters, both or neither), `code/02-sub_data.R` creates specific subsets in `data/02-sub/.,rds`. We term these *ref(erence)set*s (i.e. `. = `), as they serve as the input reference data for simulation.
69 |
70 | ## Simulation
71 |
72 | **1. Parameter estimation**
73 |
74 | Simulation parameters are estimated with `code/03-est_pars.R`, which in term sources a `code/03-est_pars-.R` script that executes a method's parameter estimation function(s). In cases where no separate estimation takes place, this returns `NULL`. Parameter estimates for each combination of ` = ` are written to `data/04-est/.rds`.
75 |
76 | **2. Data simulation**
77 |
78 | Data is simulated with `code/04-sim_data.R`, which in term sources a `code/04-sim_data-.R` script that executes a method's simulation function. Simulations for each combination of `` and `method_id` are written to `data/05-sim/,.rds`.
79 |
80 | ## Summaries
81 |
82 | Various quality control (QC) summaries are computed with `code/05-calc_qc.R`, which in term sources a set of `code/05-calc_qc-.R` scripts. QC results for reference and simulated data are written to `outs/qc_ref-,.rds` and `outs/qc_sim-,.rds`, respectively. At current, we consider:
83 |
84 | **1. Gene-level**
85 |
86 | * `frq`: detection frequency (i.e., fraction of cells with non-zero counts)
87 | * `avg/var`: average/variance of logCPM
88 | * `cv`: coefficient of variation
89 | * `cor`: gene-to-gene correlation
90 |
91 | **2. Cell-level**
92 |
93 | * `frq`: detection frequency (i.e., fraction of genes with non-zero counts)
94 | * `lls`: log-transformed library size (total counts)
95 | * `cor`: cell-to-cell correlation
96 | * `pcd`: cell-to-cell distance (in PCA space)
97 | * `knn`: number of KNN occurrences
98 | * `ldf`: local density factor
99 |
100 | **3. Global**
101 |
102 | * `sw`: Silhouette width (using batch/cluster labels as classes)
103 | * `cms`: cell-specific mixing score (using batch/cluster labels as batches)
104 | * `pve`: percent variance explained (of gene expression = logCPM, by batch/cluster)
105 |
106 | Noteworthily, we compute each summary for different groupings of cells (depending on the dataset's complexity):
107 |
108 | 1. globally, i.e. across all cells
109 | 2. at the batch-level, i.e. for each batch
110 | 3. at the cluster-level, i.e. for each cluster
111 |
112 | Global summaries are computed at the batch-/cluster-level only, as they require a grouping variable.
113 |
114 | ## Statistics
115 |
116 | We compare summaries between reference and simulated data in both one- (`code/06-stat_1d.R`) and two-dimensional settings (`code/06-statl_2d.R`). For the latter, every combination of gene- and cell-level metrics is considered, excluding correlations and global summaries. Furthermore, metrics are evaluated for each cell grouping, i.e. we perform a test globally, for each batch and cluster (again, depending on the dataset's complexity). Test results are written to `outs/stat_1d,,,.rds` for 1D, and `outs/stat_2d,,,,.rds` for 2D tests.
117 |
118 | **1. One-dimensional**
119 |
120 | * Kolmogorov-Smirnov (KS) test
121 | * Wasserstein metric
122 |
123 | **2. Two-dimensional**
124 |
125 | * two-dimensional KS test
126 | * Earth Mover's Distance (EMD)
127 |
128 | ## Downstream
129 |
130 | ### Integration
131 |
132 | Each `05-calc_batch-x.R` script wraps around an integration method that is applied in `05-calc_batch.R` to the set of type *b* subsets. The output corrected assay data or integrated cell embeddings (depending on the method) are written to `outs/batch_ref/sim-[,.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_batch.R`, which computes the following set of metrics:
133 |
134 | - cell-specific mixing score (CMS)
135 | - difference in local density factor ($\Delta$LDF)
136 | - batch correction score (BCS)
137 |
138 | ### Clustering
139 |
140 | Each `05-calc_clust-x.R` script wraps around an integration method that is applied in `05-calc_clust.R` to the set of type *b* subsets. The output cluster assignments are written to `outs/clust_ref/sim-][,.rds` for every reference and simulation, respectively. Results are evaluated by `06-eval_clust.R`, which computes the following set of metrics:
141 |
142 | - precision (P) and recall (R)
143 | - F1 score (harmonic mean of P and R)
144 |
145 | ## Visualization
146 |
147 | Finally, results are collected across `refset_id`s and `method_id`s (jointly or separated by type), and visualized in various ways using as set of `07-plot_x.R` scripts. Output figures are written to `plts` as *.pdf* files, along with the corresponding `ggplot` objects as *.rds* files. Lastly, `08-fig_x.R` scripts are used to combined various `ggplot`s into figures that are saved to `figs` as *.pdf* files.
148 |
149 | ***
150 |
151 | # Customization
152 |
153 | ## Datasets
154 |
155 | In principle, any dataset for which a `code/00-get_data-.R` script exists will be accessible to the workflow. However, data will only be retrieved if the dataset appears in `meta/subsets.json`. Hence,
156 |
157 | ### Removing
158 |
159 | To exclude a dataset from the workflow, i) (re)move the corresponding `code/00-get_data-.R` script; or, ii) remove or comment out any associated `meta/subsets.json` entries.
160 |
161 | ### Adding
162 |
163 | Similarly, a new dataset can be added by supplying an adequate `code/00-get_data-.R` script, and adding an entry to the `meta/subsets.json` configuration that specifies the subset ID, the number of genes/cells to sample (`NULL` for all), which batch(es)/cluster(s) to retain, as well as the resulting subset's type (one of n,b,k,g).
164 |
165 | ## Methods
166 |
167 | The *Snakemake* will automatically include any simulation method for which a `code/03-est_pars-.R` and `code/04-sim_data-.R` script exists. Secondly, `meta/methods.json` will determine on which type(s) of dataset(s) each method should be run. Thus,
168 |
169 | ### Removing
170 |
171 | To exclude a method from the workflow, either i) set `"": "x"` in the `meta/methods.json` file (or anything other than n,b,k,g); or, ii) (re)move the parameter estimation and/or simulation script from the `code` directory.
172 |
173 | ### Adding
174 |
175 | Analogous to the above, adding a method to the benchmark requires i) adding a `code/03-est_pars-.R` and `code/04-sim-data-.R` script; and, ii) adding an entry for the `method_id` to the `meta/methods.json` file. Importantly, the R script for parameter estimation should handle batches (`colData` column `batch`), clusters (`colData` column `cluster`), both or neither. And the method's type(s) should be specified accordingly (`n` for neither, `b/k` for batches/clusters, `g` for groups), e.g. `"": ["n", "k"]` for a method that supports 'singular' datasets, as well as ones with multiple clusters.
176 |
--------------------------------------------------------------------------------
/code/00-get_data-CellBench.R:
--------------------------------------------------------------------------------
1 | # load required packages
2 | suppressPackageStartupMessages({
3 | library(CellBench)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \()
8 | {
9 | # load SCE from EH
10 | sce <- load_sc_data()
11 | ids <- names(sce)
12 |
13 | # get genes shared across batches
14 | gs <- lapply(sce, rownames)
15 | gs <- Reduce(intersect, gs)
16 |
17 | sce <- lapply(ids, function(id)
18 | {
19 | # subset shared genes
20 | x <- sce[[id]][gs, ]
21 |
22 | # simplify metadata
23 | rowData(x) <- NULL
24 | colData(x) <- DataFrame(
25 | batch = id,
26 | cluster = x$cell_line)
27 | metadata(x) <- list()
28 |
29 | # make counts sparse & drop drop log-normalized counts
30 | y <- as(counts(x), "dgCMatrix")
31 | assays(x) <- list(counts = y)
32 | print(dim(x))
33 | return(x)
34 | })
35 |
36 | # concatenate batches into single dataset
37 | do.call(cbind, sce)
38 | }
--------------------------------------------------------------------------------
/code/00-get_data-Ding20.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ExperimentHub)
3 | library(SingleCellExperiment)
4 | library(Seurat)
5 | })
6 |
7 | fun <- \()
8 | {
9 | eh <- ExperimentHub()
10 | q <- query(eh, "SimBenchData")
11 |
12 | x <- lapply(
13 | grep("^Neural", q$title),
14 | function(i) {
15 | so <- eh[[q$ah_id[i]]]
16 | sce <- as.SingleCellExperiment(so)
17 | # keep 1st experiment only
18 | sce[, sce$ident == "Cortex1"]
19 | })
20 |
21 | x <- do.call(cbind, x)
22 |
23 | colData(x) <- DataFrame(
24 | batch = x$technology,
25 | cluster = x$celltype)
26 |
27 | assays(x) <- list(counts = counts(x))
28 |
29 | return(x)
30 | }
--------------------------------------------------------------------------------
/code/00-get_data-Gierahn17.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ExperimentHub)
3 | library(Seurat)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \()
8 | {
9 | eh <- ExperimentHub()
10 | q <- query(eh, "SimBenchData")
11 | i <- grep("HEK", q$title)
12 | x <- eh[[q$ah_id[i]]]
13 |
14 | x <- as.SingleCellExperiment(x)
15 | assays(x) <- assays(x)["counts"]
16 | colData(x) <- NULL
17 |
18 | return(x)
19 | }
--------------------------------------------------------------------------------
/code/00-get_data-Kang18.R:
--------------------------------------------------------------------------------
1 | # load required packages
2 | suppressPackageStartupMessages({
3 | library(ExperimentHub)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \()
8 | {
9 | # load SCE from EH
10 | eh <- ExperimentHub()
11 | q <- query(eh, "Kang18_8vs8")
12 | x <- eh[[q$ah_id]]
13 |
14 | # keep reference samples only
15 | x <- x[, x$stim == "ctrl"]
16 |
17 | # drop unassigned & multiplet cells
18 | x <- x[, !is.na(x$cell)]
19 | x <- x[, x$multiplets == "singlet"]
20 |
21 | # drop undetected genes
22 | x <- x[rowSums(counts(x)) > 0, ]
23 |
24 | # convert counts to sparse matrix
25 | counts(x) <- as(counts(x), "dgCMatrix")
26 |
27 | # drop feature metadata
28 | rowData(x) <- NULL
29 |
30 | # subset & rename cell metadata
31 | colData(x) <- DataFrame(
32 | batch = factor(x$ind),
33 | cluster = x$cell,
34 | row.names = NULL)
35 |
36 | return(x)
37 | }
--------------------------------------------------------------------------------
/code/00-get_data-Koh16.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(Biobase)
3 | library(GEOquery)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \()
8 | {
9 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM2257302&format=file&file=GSM2257302%5FAll%5Fsamples%5Fsc%5Ftpm%2Etxt%2Egz"
10 | fnm <- tempfile(fileext = ".txt.gz")
11 | download.file(url, fnm, quiet = TRUE)
12 |
13 | # read in raw data
14 | y <- read.delim(gzfile(fnm), header = TRUE)
15 |
16 | # make feature metadata
17 | rd <- data.frame(
18 | ensembl = y$geneID,
19 | symbol = y$geneSymbol)
20 |
21 | ex <- grep("^gene", names(y))
22 | y <- as.matrix(y[, -ex])
23 |
24 | # make cell metadata
25 | cd <- data.frame(cluster = factor(
26 | gsub("\\..*$", "", colnames(y)),
27 | levels = c("H7hESC", "APS", "MPS3", "D2_25somitomere", "DLL1PXM", "LatM", "Earlysomite", "cDM", "Sclerotome"),
28 | labels = c("hESC", "APS", "MPS", "D2.25 Somitomere", "DLL1+PXM", "LatM", "Somite", "Dermo", "Sclerotome")))
29 |
30 | # convert TPM to integers
31 | y <- matrix(as.integer(y), nrow(y), ncol(y))
32 |
33 | # construct 'SingleCellExperiment'
34 | SingleCellExperiment(
35 | assays = list(counts = y),
36 | rowData = rd, colData = cd)
37 | }
--------------------------------------------------------------------------------
/code/00-get_data-MCA20.gland.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | fun <- \()
6 | {
7 | # bump timeout limit of 60 sec to 5 min
8 | # to assure download isn't interrupted
9 | options(timeout = max(300, getOption("timeout")))
10 |
11 | # get count data
12 | url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b"
13 | dir <- dirname(fnm <- tempfile())
14 | download.file(url, fnm, quiet = TRUE)
15 | fns <- untar(fnm, list = TRUE)
16 |
17 | # read in counts
18 | sub <- grep("/MammaryGland.Virgin[0-9]", fns, value = TRUE)
19 | untar(fnm, files = sub, exdir = dir)
20 | ys <- lapply(file.path(dir, sub), \(.) {
21 | y <- read.csv(., sep = " ")
22 | as(as.matrix(y), "dgCMatrix")
23 | })
24 |
25 | # get shared features
26 | gs <- lapply(ys, rownames)
27 | gs <- Reduce(intersect, gs)
28 |
29 | # join & construct SCE
30 | y <- do.call(cbind, lapply(ys, \(.) .[gs, ]))
31 | x <- SingleCellExperiment(assays = list(counts = y))
32 |
33 | # get cell metadata
34 | url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b"
35 | fnm <- tempfile()
36 | download.file(url, fnm, quiet = TRUE)
37 | cd <- read.csv(fnm)
38 |
39 | # drop cells w/o metadata
40 | i <- match(colnames(x), cd$Cell.name, nomatch = 0)
41 | x <- x[, i != 0]
42 | cd <- cd[i, ]
43 |
44 | # simplify annotations
45 | cluster <- gsub("\\(.*\\)$", "", cd$Annotation)
46 |
47 | # add cell metadata
48 | colData(x) <- DataFrame(
49 | tissue = cd$Tissue,
50 | batch = cd$Batch,
51 | cluster)
52 |
53 | return(x)
54 | }
55 |
--------------------------------------------------------------------------------
/code/00-get_data-MCA20.lung.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | fun <- \()
6 | {
7 | # bump timeout limit of 60 sec to 5 min
8 | # to assure download isn't interrupted
9 | options(timeout = max(300, getOption("timeout")))
10 |
11 | # get count data
12 | url <- "https://ndownloader.figshare.com/files/10756798?private_link=865e694ad06d5857db4b"
13 | dir <- dirname(fnm <- tempfile())
14 | download.file(url, fnm, quiet = TRUE)
15 | fns <- untar(fnm, list = TRUE)
16 |
17 | # read in counts
18 | sub <- grep("/Lung[0-9]", fns, value = TRUE)
19 | untar(fnm, files = sub, exdir = dir)
20 | ys <- lapply(file.path(dir, sub), \(.) {
21 | y <- read.csv(., sep = " ")
22 | as(as.matrix(y), "dgCMatrix")
23 | })
24 |
25 | # get shared features
26 | gs <- lapply(ys, rownames)
27 | gs <- Reduce(intersect, gs)
28 |
29 | # join & construct SCE
30 | y <- do.call(cbind, lapply(ys, \(.) .[gs, ]))
31 | x <- SingleCellExperiment(assays = list(counts = y))
32 |
33 | # get cell metadata
34 | url <- "https://ndownloader.figshare.com/files/11083451?private_link=865e694ad06d5857db4b"
35 | fnm <- tempfile()
36 | download.file(url, fnm, quiet = TRUE)
37 | cd <- read.csv(fnm)
38 |
39 | # drop cells w/o metadata
40 | i <- match(colnames(x), cd$Cell.name, nomatch = 0)
41 | x <- x[, i != 0]
42 | cd <- cd[i, ]
43 |
44 | # simplify annotations
45 | cluster <- gsub("\\(.*\\)$", "", cd$Annotation)
46 |
47 | # add cell metadata
48 | colData(x) <- DataFrame(
49 | tissue = cd$Tissue,
50 | batch = cd$Batch,
51 | cluster)
52 |
53 | return(x)
54 | }
55 |
--------------------------------------------------------------------------------
/code/00-get_data-Mereu20.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | fun <- \()
6 | {
7 | # load SCE from URL
8 | con <- url("https://www.dropbox.com/s/i8mwmyymchx8mn8/x.all_classified.technologies.RData?raw=1")
9 | x <- get(load(con))
10 | close(con)
11 |
12 | # drop log-normalized counts
13 | assay(x, "logcounts") <- NULL
14 |
15 | # drop feature metadata
16 | rowData(x) <- NULL
17 |
18 | # subset & rename cell metadata
19 | colData(x) <- DataFrame(
20 | batch = x$batch,
21 | cluster = x$nnet2)
22 |
23 | return(x)
24 | }
--------------------------------------------------------------------------------
/code/00-get_data-Oetjen18.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(Matrix)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \()
7 | {
8 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE120221&format=file"
9 | fnm <- tempfile(fileext = ".tar")
10 | download.file(url, fnm, quiet = TRUE)
11 |
12 | # unpack data
13 | dir <- file.path(dirname(fnm), "GSE120221")
14 | untar(fnm, exdir = dir)
15 |
16 | # split files by sample
17 | fns <- list.files(dir, full.names = TRUE)
18 | ss <- strsplit(fns, "_")
19 | ids <- sapply(ss, .subset, 3)
20 | ids <- gsub("\\..*", "", ids)
21 | fns <- split(fns, ids)
22 |
23 | # removes replicated samples
24 | pat <- "(C|S1|2)|(C|Sk)"
25 | fns <- fns[!grepl(pat, unique(ids))]
26 |
27 | l <- lapply(names(fns), \(.) {
28 | # read in gene metadata
29 | rd <- grep("genes", fns[[.]], value = TRUE)
30 | rd <- read.delim(rd, header = FALSE)
31 | names(rd) <- c("ensembl", "symbol")
32 |
33 | # read in cell metadata
34 | cd <- grep("barcodes", fns[[.]], value = TRUE)
35 | cd <- read.delim(cd, header = FALSE)
36 | names(cd) <- "barcode"
37 | cd$batch <- .
38 |
39 | # read in counts
40 | y <- grep("matrix", fns[[.]], value = TRUE)
41 | y <- readMM(y)
42 | dimnames(y) <- list(
43 | rd$ensembl,
44 | cd$barcode)
45 |
46 | # construct SCE
47 | SingleCellExperiment(
48 | assays = list(counts = y),
49 | rowData = rd, colData = cd)
50 | })
51 | # concatenate samples
52 | do.call(cbind, l)
53 | }
--------------------------------------------------------------------------------
/code/00-get_data-TabulaMuris.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ExperimentHub)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \()
7 | {
8 | eh <- ExperimentHub()
9 | q <- query(eh, "TabulaMurisDroplet")
10 | x <- eh[[q$ah_id]]
11 |
12 | x <- x[, x$mouse_id == "3-F-56"]
13 | x <- x[, !is.na(x$cell_ontology_class)]
14 |
15 | cd <- data.frame(
16 | tissue = x$tissue,
17 | cluster = x$cell_ontology_class)
18 | colData(x) <- DataFrame(cd)
19 | rowData(x) <- NULL
20 |
21 | return(x)
22 | }
--------------------------------------------------------------------------------
/code/00-get_data-Tung17.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | fun <- \()
6 | {
7 | url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE77288&format=file&file=GSE77288%5Freads%2Draw%2Dsingle%2Dper%2Dsample%2Etxt%2Egz"
8 | tmp <- tempfile(fileext = ".txt.gz")
9 | download.file(url, destfile = tmp)
10 |
11 | y <- read.delim(gzfile(tmp))
12 | cd <- DataFrame(batch = y$individual)
13 |
14 | # simplify gene & cell names
15 | y <- y[, grep("^ENSG", names(y))]
16 | y <- t(as.matrix(y))
17 |
18 | SingleCellExperiment(
19 | assays = list(counts = y),
20 | colData = cd)
21 | }
--------------------------------------------------------------------------------
/code/00-get_data-Zheng17.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(SingleCellExperiment)
4 | library(TENxPBMCData)
5 | })
6 |
7 | fun <- \()
8 | {
9 | sce <- TENxPBMCData("pbmc68k")
10 | counts(sce) <- as(counts(sce), "dgCMatrix")
11 |
12 | url <- "https://github.com/10XGenomics/single-cell-3prime-paper/raw/master/pbmc68k_analysis/68k_pbmc_barcodes_annotation.tsv"
13 | fnm <- file.path(tempdir(), "foo.tsv")
14 | download.file(url, fnm, quiet = TRUE)
15 | cd <- read.delim(fnm)
16 |
17 | rowData(sce)[3] <- NULL
18 | names(rowData(sce)) <- c("ensembl", "symbol")
19 |
20 | old <- c(
21 | "CD8+ Cytotoxic T",
22 | "CD8+/CD45RA+ Naive Cytotoxic",
23 | "CD4+/CD45RO+ Memory",
24 | "CD19+ B",
25 | "CD4+/CD25 T Reg",
26 | "CD56+ NK",
27 | "CD4+ T Helper2",
28 | "CD4+/CD45RA+/CD25- Naive T",
29 | "CD34+",
30 | "Dendritic",
31 | "CD14+ Monocyte")
32 | new <- c(
33 | "T CD8+",
34 | "T CD8+",
35 | "T CD4+",
36 | "B CD19+",
37 | "T CD4+",
38 | "NK CD56+",
39 | "T CD4+",
40 | "T CD4+",
41 | "HSCs CD34+",
42 | "Dendritic",
43 | "Monocytes CD14+")
44 |
45 | cluster <- new[match(cd$celltype, old)]
46 | colData(sce) <- DataFrame(cluster)
47 |
48 | return(sce)
49 | }
--------------------------------------------------------------------------------
/code/00-get_data-panc8.R:
--------------------------------------------------------------------------------
1 | # load required packages
2 | suppressPackageStartupMessages({
3 | library(Seurat)
4 | library(SeuratData)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | fun <- \()
9 | {
10 | if (!require(panc8.SeuratData))
11 | InstallData("panc8.SeuratData")
12 |
13 | # load data as 'Seurat' object
14 | data("panc8")
15 | x <- as.SingleCellExperiment(panc8)
16 |
17 | # drop log-normalized counts
18 | # (these are identical to counts)
19 | assay(x, "logcounts") <- NULL
20 |
21 | # exclude datasets with non-integer counts
22 | x <- x[, !x$tech %in% c("celseq", "celseq2", "fluidigmc1")]
23 |
24 | # drop spike-ins
25 | x <- x[-grep("ERCC", rownames(x)), ]
26 |
27 | # drop undetected genes
28 | x <- x[rowSums(assay(x)) > 0, ]
29 |
30 | # subset & rename cell metadata
31 | colData(x) <- DataFrame(
32 | batch = x$dataset,
33 | cluster = x$celltype)
34 |
35 | return(x)
36 | }
--------------------------------------------------------------------------------
/code/00-get_data.R:
--------------------------------------------------------------------------------
1 | # retrieve dataset
2 | source(args[[1]])
3 | x <- fun()
4 |
5 | # number of features & observations
6 | dim(x)
7 |
8 | # tabulate number of cells by
9 | # batch, cluster, batch-cluster
10 | b <- !is.null(x$batch)
11 | k <- !is.null(x$cluster)
12 | if (b) {
13 | table(x$batch)
14 | if (k) {
15 | table(x$cluster)
16 | table(x$batch, x$cluster)
17 | }
18 | } else if (k) {
19 | table(x$cluster)
20 | }
21 |
22 | # simplify gene & cell names
23 | dimnames(x) <- list(
24 | paste0("gene", seq_len(nrow(x))),
25 | paste0("cell", seq_len(ncol(x))))
26 |
27 | # save SCE to .rds
28 | saveRDS(x, args[[2]])
--------------------------------------------------------------------------------
/code/01-fil_data.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(Matrix)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | # read in raw data
7 | x <- readRDS(args[[1]])
8 |
9 | # filter out instances with less than 50 cells
10 | by <- c("cluster", "sample", "batch")
11 | by <- intersect(by, names(colData(x)))
12 |
13 | if (length(by) != 0) {
14 | cs <- split(seq(ncol(x)), colData(x)[by])
15 | rmv <- vapply(cs, length, numeric(1)) < 50
16 | if (any(rmv)) x <- x[, -unlist(cs[rmv])]
17 | }
18 |
19 | # keep genes with count > 1 in at least 10 cells,
20 | # and cells with at least 100 detected genes
21 | x <- x[
22 | rowSums(counts(x) > 1) >= 10,
23 | colSums(counts(x) > 0) >= 100]
24 |
25 | # drop missing factor levels
26 | for (. in by) x[[.]] <- droplevels(factor(x[[.]]))
27 |
28 | # save filtered data to .rds
29 | saveRDS(x, args[[2]])
--------------------------------------------------------------------------------
/code/02-sub_data.R:
--------------------------------------------------------------------------------
1 | set.seed(1)
2 | suppressPackageStartupMessages({
3 | library(jsonlite)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | # read in reference dataset & subsetting parameters
8 | x <- readRDS(args[[1]])
9 | y <- fromJSON(args[[2]])
10 | y <- y[[wcs$datset]][[wcs$subset]]
11 |
12 | # subset cells according to configuration
13 | for (i in names(y)) {
14 | if (is.null(x[[i]])) next
15 | x <- x[, x[[i]] %in% y[[i]]]
16 | x[[i]] <- droplevels(factor(x[[i]]))
17 | # drop cell metadata variable when
18 | # there's only one level remaining
19 | if (nlevels(x[[i]]) == 1)
20 | x[[i]] <- NULL
21 | }
22 |
23 | # keep cells with at least 100 detected genes
24 | x <- x[, colSums(counts(x) > 0) >= 100]
25 |
26 | # downsample to at most 'n_cells' per instance
27 | if (!is.null(y$n_cells)) {
28 | by <- intersect(
29 | c("cluster", "sample", "batch"),
30 | names(colData(x)))
31 | if (length(by) > 0) {
32 | cs <- lapply(
33 | split(seq(ncol(x)), colData(x)[by]),
34 | function(.) {
35 | n <- as.numeric(y$n_cells)
36 | n <- min(n, length(.))
37 | sample(., n)
38 | })
39 | x <- x[, unlist(cs)]
40 | } else {
41 | n <- as.numeric(y$n_cells)
42 | n <- min(n, ncol(x))
43 | x <- x[, sample(ncol(x), n)]
44 | }
45 | }
46 |
47 | # keep genes with count > 1 in at least 10 cells
48 | x <- x[rowSums(counts(x) > 1) >= 10, ]
49 |
50 | # downsample to at most 'n_genes'
51 | if (!is.null(y$n_genes)) {
52 | n <- as.numeric(y$n_genes)
53 | n <- min(n, nrow(x))
54 | x <- x[sample(nrow(x), n), ]
55 | }
56 |
57 | # save subsetted data to .rds
58 | saveRDS(x, args[[3]])
--------------------------------------------------------------------------------
/code/03-est_pars-BASiCS.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BASiCS)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | suppressMessages({
8 | foo <- capture.output({
9 | x$BatchInfo <- as.numeric(x$batch)
10 | mcmc <- BASiCS_MCMC(Data = x,
11 | N = 4000, Thin = 10, Burn = 2000,
12 | Regression = TRUE, WithSpikes = FALSE)
13 | })
14 | })
15 | mcmc <- Summary(mcmc)
16 | names(ps) <- ps <- c("mu", "delta", "s", "theta")
17 | ps <- lapply(ps, function(p) displaySummaryBASiCS(mcmc, p)[, 1])
18 | c(ps, list(bi = x$BatchInfo, batch = x$batch))
19 | }
--------------------------------------------------------------------------------
/code/03-est_pars-ESCO.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ESCO)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 | dir <- tempdir()
11 | if (!is.null(x$batch)) {
12 | type <- "batch"
13 | } else if (!is.null(x$cluster)) {
14 | type <- "cluster"
15 | } else type <- "single"
16 | group <- type != "single"
17 | cellinfo <- x[[type]]
18 | suppressMessages(z <- escoEstimate(y, dir, group, cellinfo))
19 | list(params = z, type = type, groups = unique(x[[type]]))
20 | }
--------------------------------------------------------------------------------
/code/03-est_pars-POWSC.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(POWSC)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | if (!is.matrix(y <- counts(x)))
8 | counts(x) <- as.matrix(y)
9 | if (is.null(x$cluster)) {
10 | y <- Est2Phase(x)
11 | } else {
12 | i <- split(seq(ncol(x)), x$cluster, drop = TRUE)
13 | y <- lapply(i, function(.) Est2Phase(x[, .]))
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/code/03-est_pars-SCRIP.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(splatter)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 |
11 | cd <- data.frame(colData(x))
12 | i <- c("batch", "cluster")
13 | i <- intersect(i, names(cd))
14 | t <- ifelse(length(i) == 0, "foo", i)
15 |
16 | p <- if (t == "foo") {
17 | list(
18 | data = y,
19 | mode = "GP-trendedBCV",
20 | params = splatEstimate(y))
21 | } else {
22 | cd$cellType <- x[[i]]
23 | list(
24 | expre_data = y,
25 | mode = "GP-trendedBCV",
26 | pheno_data = cd,
27 | CTlist = unique(x[[i]]),
28 | nfeatures = nrow(x))
29 | }
30 | x <- list(p = p, t = t)
31 | }
32 |
--------------------------------------------------------------------------------
/code/03-est_pars-SPARSim.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(SPARSim)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- function(x)
8 | {
9 | y <- counts(x)
10 | if (!is.matrix(y))
11 | y <- as.matrix(y)
12 |
13 | if(!is.null(x$batch)){
14 | conditions <- list()
15 | for (batch_name in unique(x$batch)) {
16 | conditions[[batch_name]] <- which(x$batch %in% batch_name)
17 | }
18 | }else{
19 | conditions <- list(seq_len(ncol(x)))
20 | }
21 |
22 | estimate <- SPARSim_estimate_parameter_from_data(
23 | raw_data = y,
24 | norm_data = normalizeCounts(y),
25 | conditions = conditions)
26 |
27 | if(!is.null(x$batch)){
28 | list(estimate = estimate, batch=x$batch)
29 | }else{
30 | list(estimate=estimate)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/code/03-est_pars-SPsimSeq.R:
--------------------------------------------------------------------------------
1 | fun <- function(x) { }
--------------------------------------------------------------------------------
/code/03-est_pars-SymSim.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SymSim)
3 | library(SingleCellExperiment)
4 | })
5 | #type (k)?
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 | z <- BestMatchParams(
11 | tech = "UMI",
12 | counts = y,
13 | plotfilename = "foo",
14 | n_optimal = 1)
15 | file.remove("foo.pdf")
16 | z$ngenes <- nrow(x)
17 | z$ncells_total <- ncol(x)
18 | n <- length(unique(x$batch))
19 | z$nbatch <- ifelse(n == 0, 1, n)
20 | list(params = z, batch = x$batch)
21 | }
--------------------------------------------------------------------------------
/code/03-est_pars-ZINB-WaVE.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BiocParallel)
3 | library(SingleCellExperiment)
4 | library(zinbwave)
5 | })
6 |
7 | fun <- function(x) {
8 | cd <- data.frame(colData(x))
9 | i <- c("cluster", "batch")
10 | i <- intersect(i, names(cd))
11 |
12 | y <- if (length(i) == 0) {
13 | zinbFit(x,
14 | verbose = FALSE,
15 | BPPARAM = SerialParam())
16 | } else {
17 | zinbFit(x,
18 | model.matrix(~cd[[i]]),
19 | verbose = FALSE,
20 | BPPARAM = SerialParam())
21 | }
22 | x <- list(obj = y, cd = colData(x))
23 | }
24 |
--------------------------------------------------------------------------------
/code/03-est_pars-hierarchicell.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(hierarchicell)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 | if (is.null(x$batch)) {
11 | # split cells into random batches
12 | batch_ids <- sample(2, ncol(x), TRUE)
13 | n_batches <- 0
14 | } else {
15 | batch_ids <- x$batch
16 | n_batches <- length(unique(batch_ids))
17 | }
18 | df <- data.frame(seq(ncol(x)), batch_ids, t(y))
19 | z <- filter_counts(df, gene_thresh = 0, cell_thresh = 0)
20 | p <- compute_data_summaries(expr = z, type = "Raw")
21 | # single group 2 cell is required
22 | # for simulation to pass successfully
23 | list(
24 | data_summaries = p,
25 | n_genes = nrow(x),
26 | n_cases = 1,
27 | cells_per_case = 1,
28 | n_controls = length(unique(batch_ids)),
29 | cells_per_control = tabulate(batch_ids),
30 | ncells_variation_type = "Fixed",
31 | n_batches = n_batches,
32 | batch_ids = batch_ids)
33 | }
34 |
--------------------------------------------------------------------------------
/code/03-est_pars-muscat.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(muscat)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | vars <- c("cluster", "batch", "group")
8 | vars <- setdiff(vars, names(colData(x)))
9 | for (v in vars) x[[v]] <- "foo"
10 |
11 | y <- prepSCE(x,
12 | kid = "cluster",
13 | sid = "batch",
14 | gid = "group",
15 | drop = TRUE)
16 |
17 | z <- prepSim(y,
18 | min_size = NULL,
19 | verbose = FALSE)
20 | }
21 |
--------------------------------------------------------------------------------
/code/03-est_pars-powsimR.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(powsimR)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 | z <- estimateParam(
11 | countData = y,
12 | RNAseq = "singlecell",
13 | Protocol = "UMI",
14 | Distribution = "NB",
15 | Normalisation = "scran",
16 | GeneFilter = 0,
17 | SampleFilter = Inf,
18 | NCores = 1,
19 | verbose = FALSE)
20 | Setup(
21 | ngenes = z$totalG,
22 | estParamRes = z,
23 | n1 = z$totalS,
24 | n2 = 2, # has to be at least 2
25 | p.DE = 0,
26 | pLFC = 0,
27 | p.G = 1,
28 | nsims = 1,
29 | setup.seed = 1234)
30 | }
31 |
--------------------------------------------------------------------------------
/code/03-est_pars-scDD.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BiocParallel)
3 | library(scDD)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- function(x) {
8 | if (!is.matrix(y <- counts(x)))
9 | counts(x) <- as.matrix(y)
10 | # randomly split cells into 2 groups
11 | x$foo <- sample(2, ncol(x), TRUE)
12 | x <- preprocess(x,
13 | condition = "foo",
14 | scran_norm = TRUE)
15 | list(
16 | SCdat = x,
17 | condition = "foo",
18 | plots = FALSE,
19 | param = SerialParam(),
20 | nDE = 0, nDP = 0, nDM = 0,
21 | nDB = 0, nEE = nrow(x), nEP = 0,
22 | numSamples = ceiling(ncol(x)/2))
23 | }
24 |
--------------------------------------------------------------------------------
/code/03-est_pars-scDesign.R:
--------------------------------------------------------------------------------
1 | fun <- function(x) { }
--------------------------------------------------------------------------------
/code/03-est_pars-scDesign2.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scDesign2)
3 | })
4 |
5 | fun <- function(x)
6 | {
7 | if (is.null(x$cluster)) {
8 | x$id <- "foo"
9 | } else {
10 | x$id <- x$cluster
11 | }
12 | colnames(x) <- x$id
13 | y <- fit_model_scDesign2(
14 | data_mat = counts(x),
15 | cell_type_sel = unique(x$id))
16 | }
--------------------------------------------------------------------------------
/code/03-est_pars-splatter.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(splatter)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | if (!is.matrix(y <- counts(x)))
9 | counts(x) <- as.matrix(y)
10 | g <- if (!is.null(x$batch)) {
11 | "batch"
12 | } else if (!is.null(x$cluster)) {
13 | "cluster"
14 | }
15 | if (is.null(g)) {
16 | p <- splatEstimate(x)
17 | } else {
18 | i <- split(seq(ncol(x)), x[[g]], drop = TRUE)
19 | x <- lapply(i, function(.) x[, .])
20 | p <- lapply(x, splatEstimate)
21 | }
22 | list(pars = p, type = g)
23 | }
--------------------------------------------------------------------------------
/code/03-est_pars-zingeR.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(gamlss)
3 | library(gamlss.tr)
4 | library(SingleCellExperiment)
5 | library(zingeR)
6 | })
7 |
8 | fun <- function(x) {
9 | y <- counts(x)
10 | if (!is.matrix(x))
11 | y <- as.matrix(y)
12 | g <- sample(2, ncol(y), TRUE)
13 | mm <- model.matrix(~g)
14 | p <- getDatasetZTNB(y, mm)
15 | list(
16 | dataset = y,
17 | group = g,
18 | nTags = p$dataset.nTags,
19 | nlibs = length(p$dataset.lib.size),
20 | pUp = 0,
21 | verbose = FALSE,
22 | params = p)
23 | }
24 |
--------------------------------------------------------------------------------
/code/03-est_pars.R:
--------------------------------------------------------------------------------
1 | x <- readRDS(args$sub)
2 |
3 | source(args$fun)
4 | y <- tryCatch(fun(x),
5 | error = function(e) NA)
6 |
7 | saveRDS(y, args$est)
--------------------------------------------------------------------------------
/code/04-sim_data-BASiCS.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BASiCS)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | suppressMessages({
8 | y <- BASiCS_Sim(
9 | Mu = x$mu,
10 | Mu_spikes = NULL,
11 | Delta = x$delta,
12 | Phi = NULL,
13 | S = x$s,
14 | Theta = x$theta,
15 | BatchInfo = x$bi)
16 | })
17 | y$batch <- x$batch
18 | y$BatchInfo <- NULL
19 | return(y)
20 | }
--------------------------------------------------------------------------------
/code/04-sim_data-ESCO.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(ESCO)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | type <- ifelse(x$type == "single", "single", "group")
8 | y <- escoSimulate(x$params, type, verbose = FALSE, numCores = 1)
9 | assays(y) <- list(counts = assay(y, "counts"))
10 | if (type == "group") {
11 | groups <- factor(y$Group, labels = x$groups)
12 | cd <- DataFrame(groups)
13 | names(cd) <- x$type
14 | colData(y) <- cd
15 | } else {
16 | colData(y) <- NULL
17 | }
18 | return(y)
19 | }
20 |
--------------------------------------------------------------------------------
/code/04-sim_data-POWSC.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(POWSC)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | if (!is.list(x[[1]])) {
8 | y <- Simulate2SCE(
9 | n = ncol(x$exprs),
10 | perDE = 0,
11 | estParas1 = x,
12 | estParas2 = x)
13 |
14 | y <- y$sce
15 | rowData(y) <- NULL
16 | colData(y) <- NULL
17 | } else {
18 | ns <- vapply(x, function(.)
19 | ncol(.$exprs), numeric(1))
20 |
21 | y <- SimulateMultiSCEs(
22 | n = sum(ns)/(length(x)-1),
23 | estParas_set = x,
24 | multiProb = ns)
25 |
26 | y <- lapply(y, function(.) .$sce)
27 | y <- do.call(cbind, y)
28 |
29 | y$cluster <- y$cellTypes
30 | y$cellTypes <- NULL
31 | rowData(y) <- NULL
32 | }
33 | return(y)
34 | }
--------------------------------------------------------------------------------
/code/04-sim_data-SCRIP.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(checkmate)
3 | library(SCRIP)
4 | library(Seurat)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | fun <- function(x) {
9 | if (x$t == "foo") {
10 | y <- do.call(SCRIPsimu, x$p)
11 | metadata(y) <- list()
12 | rowData(y) <- colData(y) <- NULL
13 | assays(y) <- assays(y)["counts"]
14 | } else {
15 | # code bug requires 'CTlist'
16 | # to be passed via environment
17 | env <- parent.frame()
18 | env$CTlist <- x$p$CTlist
19 | x$p$CTlist <- NULL
20 | sink(tempfile())
21 | y <- do.call(simu_cluster, x$p, envir = env)
22 | sink()
23 | cd <- DataFrame(y$CT.infor)
24 | names(cd) <- x$t
25 | y <- SingleCellExperiment(
26 | list(counts = y$final),
27 | colData = cd)
28 | # make cell names unique across clusters
29 | colnames(y) <- paste0("cell", seq(ncol(y)))
30 | }
31 | return(y)
32 | }
--------------------------------------------------------------------------------
/code/04-sim_data-SPARSim.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SPARSim)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 |
8 | sink(tempfile())
9 | y <- SPARSim_simulation(x$estimate, output_batch_matrix = TRUE)
10 | sink()
11 |
12 | y <- y$count_matrix
13 |
14 | if(is.null(x$batch)){
15 | SingleCellExperiment(
16 | assays = list(counts = y))
17 | }else{
18 | SingleCellExperiment(
19 | assays = list(counts = y),
20 | colData=data.frame(batch=x$batch))
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/code/04-sim_data-SPsimSeq.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SPsimSeq)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | if (!is.matrix(y <- counts(x)))
8 | counts(x) <- as.matrix(y)
9 | if (is.null(colnames(x)))
10 | colnames(x) <- seq(ncol(x))
11 | if (is.null(x$batch)) {
12 | batch <- rep(1, ncol(x))
13 | batch.config <- 1
14 | } else {
15 | # keep genes detected in at least 10 cells per batch
16 | i <- split(seq(ncol(x)), x$batch)
17 | x <- x[rowAlls(vapply(i, function(.)
18 | rowSums(counts(x[, .]) > 0) >= 10,
19 | logical(nrow(x)))), ]
20 | batch <- as.numeric(x$batch)
21 | batch.config <- tabulate(x$batch)/ncol(x)
22 | }
23 | y <- SPsimSeq(
24 | s.data = counts(x),
25 | n.genes = nrow(x),
26 | tot.samples = ncol(x),
27 | batch = batch,
28 | batch.config = batch.config,
29 | model.zero.prob = TRUE,
30 | genewiseCor = TRUE,
31 | result.format = "SCE",
32 | return.details = FALSE,
33 | verbose = FALSE)
34 | y <- y[[1]]
35 | rowData(y) <- NULL
36 | colData(y) <- colData(x)
37 | return(y)
38 | }
39 |
--------------------------------------------------------------------------------
/code/04-sim_data-SymSim.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SymSim)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | args <- intersect(
8 | names(x$params),
9 | names(formals(SimulateTrueCounts)))
10 | args <- c(x$params[args], list(randseed = 1234))
11 | y <- do.call(SimulateTrueCounts, args)
12 |
13 | data("gene_len_pool")
14 | gene_len <- sample(gene_len_pool, args$ngenes, TRUE)
15 |
16 | args <- intersect(
17 | names(x$params),
18 | names(formals(True2ObservedCounts)))
19 | args <- c(x$params[args], list(
20 | true_counts = y$counts,
21 | meta_cell = y$cell_meta,
22 | gene_len = gene_len))
23 | z <- do.call(True2ObservedCounts, args)
24 |
25 | if (x$params$nbatch > 1) {
26 | z <- DivideBatches(
27 | observed_counts_res = z,
28 | nbatch = x$params$nbatch)
29 | cd <- DataFrame(batch = x$batch)
30 | } else cd <- make_zero_col_DFrame(ncol(z$counts))
31 | SingleCellExperiment(list(counts = z$counts), colData = cd)
32 | }
--------------------------------------------------------------------------------
/code/04-sim_data-ZINB-WaVE.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | library(zinbwave)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- zinbSim(x$obj)
8 | SingleCellExperiment(
9 | list(counts = y$counts),
10 | colData = x$cd)
11 | }
12 |
--------------------------------------------------------------------------------
/code/04-sim_data-hierarchicell.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(hierarchicell)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | args <- formals("simulate_hierarchicell")
8 | y <- do.call(
9 | simulate_hierarchicell,
10 | x[names(x) %in% names(args)])
11 | # keep control cells only
12 | y <- y[y$Status == "Control", ]
13 | y <- t(as.matrix(y[, grep("^Gene", names(y))]))
14 | cd <- if (x$n_batches != 0) {
15 | data.frame(batch = x$batch_ids)
16 | } else make_zero_col_DFrame(ncol(y))
17 | SingleCellExperiment(list(counts = y), colData = cd)
18 | }
--------------------------------------------------------------------------------
/code/04-sim_data-muscat.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(muscat)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- simData(x, dd = FALSE)
8 | md <- metadata(y)
9 |
10 | if (!is.null(x$cluster_id)) {
11 | kids <- factor(y$cluster_id, labels = md$ref_kids)
12 | kids <- factor(kids, levels(x$cluster_id))
13 | y$cluster <- kids
14 | y$cluster_id <- NULL
15 | }
16 | if (!is.null(x$sample_id)) {
17 | bids <- factor(y$sample_id, labels = md$ref_sids)
18 | bids <- factor(bids, levels(x$sample_id))
19 | y$batch <- bids
20 | y$sample_id <- NULL
21 | }
22 | metadata(y) <- list()
23 | rowData(y) <- NULL
24 | return(y)
25 | }
26 |
--------------------------------------------------------------------------------
/code/04-sim_data-powsimR.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(powsimR)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- simulateDE(
8 | SetupRes = x,
9 | Normalisation = "scran",
10 | DEmethod = "DESeq2",
11 | Counts = TRUE,
12 | NCores = NULL,
13 | verbose = TRUE)
14 | # remove group 2 cells
15 | z <- y$Counts[[1]][[1]][, -c(1, 2)]
16 | SingleCellExperiment(assays = list(counts = z))
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/code/04-sim_data-scDD.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BiocParallel)
3 | library(scDD)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- function(x) {
8 | sink(tempfile())
9 | y <- do.call(simulateSet, x)
10 | sink()
11 | colData(y) <- rowData(y) <- NULL
12 | return(y)
13 | }
14 |
--------------------------------------------------------------------------------
/code/04-sim_data-scDesign.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scDesign)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x) {
7 | y <- counts(x)
8 | if (!is.matrix(y))
9 | y <- as.matrix(y)
10 | foo <- capture.output(
11 | z <- design_data(
12 | realcount = y,
13 | S = sum(y),
14 | ncell = ncol(y),
15 | ngroup = 1,
16 | ncores = 1))
17 | SingleCellExperiment(
18 | assays = list(counts = z))
19 | }
--------------------------------------------------------------------------------
/code/04-sim_data-scDesign2.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scDesign2)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- function(x)
7 | {
8 | # get number of cells per cluster
9 | n <- vapply(x, function(.) .$n_cell, numeric(1))
10 | # simulate data with equal number
11 | # & proportion of cells per cluster
12 | y <- simulate_count_scDesign2(
13 | model_params = x,
14 | n_cell_new = sum(n),
15 | cell_type_prop = prop.table(n))
16 | # construct SCE
17 | cd <- if (length(x) == 1) {
18 | make_zero_col_DFrame(ncol(y))
19 | } else {
20 | data.frame(cluster = colnames(y))
21 | }
22 | SingleCellExperiment(
23 | assay = list(counts = unname(y)),
24 | colData = cd)
25 | }
26 |
--------------------------------------------------------------------------------
/code/04-sim_data-splatter.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(splatter)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | f <- function(p) {
9 | y <- splatSimulate(p, verbose = FALSE)
10 | assays(y) <- assays(y)["counts"]
11 | rowData(y) <- colData(y) <- NULL
12 | metadata(y) <- list()
13 | return(y)
14 | }
15 | if (!is.null(x$type)) {
16 | y <- lapply(names(x$pars),
17 | function(i) {
18 | y <- f(x$pars[[i]])
19 | y[[x$type]] <- i
20 | return(y)
21 | }
22 | )
23 | do.call(cbind, y)
24 | } else {
25 | f(x$pars)
26 | }
27 | }
--------------------------------------------------------------------------------
/code/04-sim_data-zingeR.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(gamlss)
3 | library(gamlss.tr)
4 | library(mgcv)
5 | library(SingleCellExperiment)
6 | library(zingeR)
7 | })
8 |
9 | fun <- function(x) {
10 | y <- do.call(NBsimSingleCell, x)
11 | SingleCellExperiment(list(counts = y$counts))
12 | }
13 |
--------------------------------------------------------------------------------
/code/04-sim_data.R:
--------------------------------------------------------------------------------
1 | x <- readRDS(args$est)
2 |
3 |
4 | y <- if (!isTRUE(is.na(x))) { # skip simulation if estimation failed
5 | if (is.null(x)) # use dataset if there's no separate estimation step
6 | x <- readRDS(args$sub)
7 |
8 | source(args$fun)
9 | tryCatch(fun(x),
10 | error = function(e) NULL)
11 | }
12 |
13 | if (!is.null(y)) {
14 | z <- assay(y)
15 | y <- y[
16 | rowSums(z) > 0,
17 | colSums(z) > 0]
18 | dimnames(y) <- list(
19 | paste0("gene", seq(nrow(y))),
20 | paste0("cell", seq(ncol(y))))
21 | }
22 |
23 | saveRDS(y, args$sim)
--------------------------------------------------------------------------------
/code/05-calc_batch-ComBat.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(sva)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | y <- normalizeCounts(x, log = TRUE)
9 | if (!is.matrix(y)) y <- as.matrix(y)
10 |
11 | df <- data.frame(t(y))
12 | mm <- model.matrix(~ 1, df)
13 |
14 | sink(tempfile())
15 | suppressMessages({
16 | z <- ComBat(
17 | dat = y,
18 | batch = x$batch,
19 | mod = mm,
20 | par.prior = TRUE,
21 | prior.plots = FALSE)
22 | })
23 | sink()
24 |
25 | list(
26 | assay_in = y,
27 | assay_out = z)
28 | }
--------------------------------------------------------------------------------
/code/05-calc_batch-Harmony.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(harmony)
3 | library(scater)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | y <- calculatePCA(
9 | logNormCounts(x),
10 | ncomponents = 20)
11 |
12 | z <- HarmonyMatrix(
13 | data_mat = y,
14 | meta_data = x$batch,
15 | do_pca = FALSE,
16 | verbose = FALSE)
17 |
18 | list(
19 | dimred_in = y,
20 | dimred_out = z)
21 | }
--------------------------------------------------------------------------------
/code/05-calc_batch-Seurat.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(Seurat)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | x <- logNormCounts(x)
9 | k <- min(table(x$batch)/2)
10 |
11 | l <- SplitObject(
12 | as.Seurat(x),
13 | split.by = "batch")
14 | l <- lapply(l, \(.) {
15 | . <- NormalizeData(., verbose = FALSE)
16 | . <- FindVariableFeatures(.,
17 | verbose = FALSE,
18 | nfeatures = 1000,
19 | selection.method = "vst")
20 | })
21 | fs <- SelectIntegrationFeatures(l)
22 | as <- FindIntegrationAnchors(l,
23 | k.filter = k,
24 | dims = seq(20),
25 | verbose = FALSE,
26 | anchor.features = fs)
27 | y <- IntegrateData(as,
28 | k.weight = k,
29 | verbose = FALSE,
30 | features.to.integrate = rownames(x))
31 |
32 | list(
33 | assay_in = logcounts(x),
34 | assay_out = GetAssayData(y))
35 | }
--------------------------------------------------------------------------------
/code/05-calc_batch-fastMNN.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(batchelor)
3 | library(scater)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | x <- logNormCounts(x)
9 | y <- calculatePCA(x, ncomponents = 20)
10 |
11 | k <- min(table(x$batch)/2)
12 | z <- fastMNN(x,
13 | batch = x$batch,
14 | k = k, d = 20)
15 |
16 | list(
17 | dimred_in = y,
18 | dimred_out = reducedDim(z))
19 | }
--------------------------------------------------------------------------------
/code/05-calc_batch-limma.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(edgeR)
3 | library(limma)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | mm <- model.matrix(~ x$batch)
9 | y <- DGEList(counts(x))
10 |
11 | y <- calcNormFactors(y, method = "TMMwsp")
12 | v <- voom(y, mm, plot = FALSE)
13 | z <- removeBatchEffect(v, x$batch)
14 |
15 | list(
16 | assay_in = v$E,
17 | assay_out = z)
18 | }
--------------------------------------------------------------------------------
/code/05-calc_batch-mnnCorrect.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(batchelor)
3 | library(scater)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | x <- logNormCounts(x)
9 | k <- min(table(x$batch)/2)
10 | y <- mnnCorrect(x, batch = x$batch, k = k)
11 |
12 | list(
13 | assay_in = logcounts(x),
14 | assay_out = assay(y))
15 | }
--------------------------------------------------------------------------------
/code/05-calc_batch.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 | sce <- readRDS(args$sce)
3 |
4 | # skip if simulation failed (return NULL)
5 | res <- if (!is.null(sce)) fun(sce)
6 |
7 | saveRDS(res, args$res)
8 |
--------------------------------------------------------------------------------
/code/05-calc_clust-CIDR.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(cidr)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | y <- counts(x)
9 | if (!is.matrix(y))
10 | y <- as.matrix(y)
11 | k <- length(unique(x$cluster))
12 |
13 | z <- scDataConstructor(y, tagType = "raw")
14 | z <- determineDropoutCandidates(z)
15 | z <- wThreshold(z)
16 | z <- scDissim(z, threads = 1)
17 | z <- scPCA(z, plotPC = FALSE)
18 | z <- nPC(z)
19 |
20 | scCluster(
21 | object = z,
22 | nCluster = k,
23 | nPC = z@nPC,
24 | cMethod = "ward.D2")@clusters
25 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-PCA+HC.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | y <- normalizeCounts(x, log = TRUE)
9 | pca <- prcomp(t(y), center = TRUE, scale. = FALSE)
10 | pca <- pca$x[, seq(20), drop = FALSE]
11 | hc <- hclust(dist(pca), method = "ward.D2")
12 | cutree(hc, k = length(unique(x$cluster)))
13 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-PCA+KM.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | })
4 |
5 | fun <- \(x)
6 | {
7 | y <- normalizeCounts(x, log = TRUE)
8 | k <- length(unique(x$cluster))
9 | pca <- prcomp(t(y), center = TRUE, scale. = FALSE)
10 | pca <- pca$x[, seq(30), drop = FALSE]
11 | kmeans(pca, centers = k, nstart = 25)$cluster
12 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-SC3.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SC3)
3 | library(scater)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \(x)
8 | {
9 | if (is.null(rownames(x)))
10 | rownames(x) <- paste0("gene", seq(nrow(x)))
11 |
12 | if (!is.matrix(y <- counts(x)))
13 | counts(x) <- as.matrix(y)
14 |
15 | x <- logNormCounts(x)
16 | k <- length(unique(x$cluster))
17 | rowData(x)$feature_symbol <- rownames(x)
18 |
19 | y <- sc3_prepare(x,
20 | gene_filter = FALSE,
21 | svm_max = 1e6,
22 | n_cores = 1,
23 | rand_seed = 1)
24 |
25 | z <- sc3(y,
26 | ks = k,
27 | gene_filter = FALSE,
28 | biology = FALSE,
29 | k_estimator = FALSE,
30 | svm_max = 1e6,
31 | n_cores = 1,
32 | rand_seed = 1)
33 |
34 | z[[grep("sc3", names(colData(z)))]]
35 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-SC3svm.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SC3)
3 | library(scater)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | fun <- \(x)
8 | {
9 | if (is.null(rownames(x)))
10 | rownames(x) <- paste0("gene", seq(nrow(x)))
11 |
12 | if (!is.matrix(y <- counts(x)))
13 | counts(x) <- as.matrix(y)
14 |
15 | n <- round(ncol(x)/2)
16 | x <- logNormCounts(x)
17 | k <- length(unique(x$cluster))
18 | rowData(x)$feature_symbol <- rownames(x)
19 |
20 | y <- sc3_prepare(x,
21 | gene_filter = FALSE,
22 | svm_max = 0,
23 | svm_num_cells = n,
24 | n_cores = 1,
25 | rand_seed = 1)
26 |
27 | y <- sc3(y,
28 | ks = k,
29 | gene_filter = FALSE,
30 | biology = FALSE,
31 | k_estimator = FALSE,
32 | svm_max = 0,
33 | svm_num_cells = n,
34 | n_cores = 1,
35 | rand_seed = 1)
36 |
37 | y <- sc3_run_svm(y, k)
38 | y[[grep("sc3", names(colData(y)))]]
39 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-Seurat.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(Seurat)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | if (is.null(rownames(x)))
9 | rownames(x) <- paste0("gene", seq(nrow(x)))
10 | if (is.null(colnames(x)))
11 | colnames(x) <- paste0("cell", seq(ncol(x)))
12 |
13 | y <- CreateSeuratObject(
14 | counts = counts(x),
15 | meta.data = colData(x))
16 |
17 | y <- NormalizeData(y, verbose = FALSE)
18 | y <- ScaleData(y, verbose = FALSE)
19 | y <- FindVariableFeatures(y, verbose = FALSE)
20 | hvgs <- VariableFeatures(y)
21 |
22 | y <- RunPCA(y, npcs = n <- 30, features = hvgs, verbose = FALSE)
23 | y <- FindNeighbors(y, dims = seq(n), verbose = FALSE)
24 | y <- FindClusters(y, res = 0.8, verbose = FALSE)
25 | Idents(y)
26 | }
27 |
--------------------------------------------------------------------------------
/code/05-calc_clust-monocle.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(monocle)
3 | library(scran)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | if (!is.matrix(y <- counts(x)))
9 | counts(x) <- as.matrix(y)
10 | if (is.null(rownames(x)))
11 | rownames(x) <- paste0("gene", seq(nrow(x)))
12 | k <- length(unique(x$cluster))
13 | rowData(x)$gene_short_name <- rownames(x)
14 | y <- convertTo(x, type = "monocle")
15 | y <- estimateSizeFactors(y)
16 | y <- tryCatch(
17 | estimateDispersions(y),
18 | error = function(e) y)
19 | y <- reduceDimension(y,
20 | max_components = 3,
21 | num_dim = 50,
22 | reduction_method = "tSNE",
23 | verbose = TRUE)
24 | monocle::clusterCells(y,
25 | num_clusters = k+1,
26 | method = "densityPeak")$Cluster
27 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-pcaReduce.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(clue)
3 | library(pcaReduce)
4 | library(scater)
5 | })
6 |
7 | fun <- \(x)
8 | {
9 | k <- length(unique(x$cluster))
10 | y <- normalizeCounts(x, log = TRUE)
11 | z <- PCAreduce(t(y),
12 | nbt = 100,
13 | q = q <- 30,
14 | method = "S")
15 | l <- lapply(z, function(.) {
16 | colnames(.) <- paste0("k", seq(q+1, 2))
17 | as.cl_partition(.[, paste0("k", k)])
18 | })
19 | e <- as.cl_ensemble(l)
20 | res <- cl_consensus(e,
21 | method = "SE",
22 | control = list(nruns = 50))
23 | c(cl_class_ids(res))
24 | }
--------------------------------------------------------------------------------
/code/05-calc_clust-tSNE+KM.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(Rtsne)
3 | library(scater)
4 | })
5 |
6 | fun <- \(x)
7 | {
8 | k <- length(unique(x$cluster))
9 | y <- normalizeCounts(x, log = TRUE)
10 | if (!is.matrix(y)) y <- as.matrix(y)
11 | tsne <- Rtsne(t(y),
12 | pca = TRUE, initial_dims = 50, dims = 3,
13 | perplexity = 30, check_duplicates = FALSE)
14 | kmeans(tsne$Y, centers = k, nstart = 25)$cluster
15 | }
--------------------------------------------------------------------------------
/code/05-calc_clust.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 | sce <- readRDS(args$sce)
3 |
4 | # skip if simulation failed (return NULL)
5 | res <- if (!is.null(sce))
6 | data.frame(wcs,
7 | row.names = NULL,
8 | pred = as.integer(as.integer(fun(sce))),
9 | true = as.integer(droplevels(factor(sce$cluster))))
10 |
11 | saveRDS(res, args$res)
12 |
--------------------------------------------------------------------------------
/code/05-calc_dr.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(scran)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | set.seed(7043)
8 | x <- readRDS(args[[1]])
9 |
10 | # skip if simulation failed (return NULL)
11 | df <- if (!is.null(x)) {
12 | if ("normcounts" %in% assayNames(x)) {
13 | logcounts(x) <- log(normcounts(x)+1)
14 | } else x <- logNormCounts(x)
15 |
16 | stats <- modelGeneVar(x)
17 | hvgs <- getTopHVGs(stats, n = 500)
18 |
19 | x <- runPCA(x, subset_row = hvgs)
20 | x <- runTSNE(x, dimred = "PCA")
21 | x <- runUMAP(x, dimred = "PCA")
22 |
23 | tsne <- reducedDim(x, "TSNE")
24 | umap <- reducedDim(x, "UMAP")
25 | colnames(tsne) <- paste0("TSNE", seq(2))
26 | colnames(umap) <- paste0("UMAP", seq(2))
27 |
28 | x$lls <- if ("counts" %in% assayNames(x))
29 | x$lls <- log(colSums(counts(x))+1) else NA
30 |
31 | i <- c("cluster", "batch", "lls")
32 | i <- intersect(names(colData(x)), i)
33 | cd <- colData(x)[i]
34 |
35 | data.frame(wcs, cd, tsne, umap)
36 | }
37 |
38 | saveRDS(df, args[[2]])
39 |
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_cms.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(CellMixS)
3 | library(scater)
4 | library(scran)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | ppFUN <- function(sce) {
9 | if ("normcounts" %in% assayNames(sce)) {
10 | logcounts(sce) <- log(normcounts(sce)+1)
11 | } else sce <- logNormCounts(sce)
12 | stats <- modelGeneVar(sce)
13 | hvgs <- getTopHVGs(stats, n = 500)
14 | sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 |
17 | qcFUN <- function(sce) {
18 | i <- c("batch", "cluster")
19 | i <- intersect(i, names(colData(sce)))
20 | if (length(i) == 0) return(NULL)
21 | ids <- factor(sce[[i]])
22 | n <- tabulate(ids)
23 | k <- min(n[n != 0])/2
24 | cms(sce, k, i)$cms
25 | }
26 |
27 | groups <- "global"
28 | n_genes <- NULL
29 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_cor.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | if (!is.matrix(y <- assay(sce)))
7 | assay(sce) <- as.matrix(y)
8 | y <- if ("normcounts" %in% assayNames(sce)) {
9 | normcounts(sce)
10 | } else calculateCPM(sce)
11 | assay(sce, "exprs") <- log(y+1)
12 | return(sce)
13 | }
14 |
15 | qcFUN <- function(sce) {
16 | cor <- cor(
17 | assay(sce, "exprs"),
18 | method = "spearman",
19 | use = "pairwise.complete.obs")
20 | cor[upper.tri(cor)]
21 | }
22 |
23 | groups <- NULL
24 | n_genes <- NULL
25 | n_cells <- 200
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_frq.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | sce
7 | }
8 |
9 | qcFUN <- function(sce) {
10 | colMeans(assay(sce) != 0)
11 | }
12 |
13 | groups <- NULL
14 | n_genes <- NULL
15 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_knn.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(RANN)
3 | library(scater)
4 | library(scran)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | ppFUN <- function(sce) {
9 | if ("normcounts" %in% assayNames(sce)) {
10 | logcounts(sce) <- log(normcounts(sce)+1)
11 | } else sce <- logNormCounts(sce)
12 | stats <- modelGeneVar(sce)
13 | hvgs <- getTopHVGs(stats, n = 500)
14 | sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 |
17 | qcFUN <- function(sce) {
18 | # build KNN-graph on PCA
19 | # (where k = 5% of cells)
20 | pca <- reducedDim(sce, "PCA")
21 | k <- round(0.05*ncol(sce))
22 | knn <- nn2(pca, k = k+1)
23 | idx <- knn$nn.idx[, seq(2, k+1)]
24 | # count how often each cell is a KNN
25 | vapply(seq(ncol(sce)), \(i) sum(idx == i), numeric(1))
26 | }
27 |
28 | groups <- NULL
29 | n_genes <- NULL
30 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_ldf.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BiocNeighbors)
3 | library(CellMixS)
4 | library(dplyr)
5 | library(scater)
6 | library(scran)
7 | library(SingleCellExperiment)
8 | })
9 |
10 | ppFUN <- function(sce) {
11 | if ("normcounts" %in% assayNames(sce)) {
12 | logcounts(sce) <- log(normcounts(sce)+1)
13 | } else sce <- logNormCounts(sce)
14 | stats <- modelGeneVar(sce)
15 | hvgs <- getTopHVGs(stats, n = 500)
16 | sce <- runPCA(sce, subset_row = hvgs)
17 | }
18 |
19 | qcFUN <- function(sce) {
20 | i <- c("batch", "cluster")
21 | i <- intersect(i, names(colData(sce)))
22 | k <- if (length(i) != 0) {
23 | # half of smallest group
24 | ids <- factor(sce[[i]])
25 | n <- tabulate(ids)
26 | min(n[n != 0])/2
27 | } else {
28 | # 5% of total cells
29 | 0.05*ncol(sce)
30 | }
31 | # KNN on PCA
32 | pca <- reducedDim(sce, "PCA")
33 | knn <- findKNN(pca, k)
34 | # fix naming for indexing
35 | cs <- colnames(sce)
36 | rownames(knn$index) <- cs
37 | rownames(knn$distance) <- cs
38 | knn$cell_name <- knn$index
39 | # compute local density factors
40 | c(CellMixS:::.ldfKnn(pca, knn, k)$LDF)
41 | }
42 |
43 | groups <- NULL
44 | n_genes <- NULL
45 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_lls.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | sce
7 | }
8 |
9 | qcFUN <- function(sce) {
10 | if ("counts" %in% assayNames(sce))
11 | log(colSums(counts(sce) + 1))
12 | }
13 |
14 | groups <- NULL
15 | n_genes <- NULL
16 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_pcd.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(scran)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | ppFUN <- function(sce) {
8 | if ("normcounts" %in% assayNames(sce)) {
9 | logcounts(sce) <- log(normcounts(sce)+1)
10 | } else sce <- logNormCounts(sce)
11 | stats <- modelGeneVar(sce)
12 | hvgs <- getTopHVGs(stats, n = 500)
13 | sce <- runPCA(sce, subset_row = hvgs)
14 | }
15 |
16 | qcFUN <- function(sce) {
17 | pca <- reducedDim(sce, "PCA")
18 | c(dist(pca, upper = TRUE))
19 | }
20 |
21 | groups <- NULL
22 | n_genes <- NULL
23 | n_cells <- 200
--------------------------------------------------------------------------------
/code/05-calc_qc-cell_sw.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(cluster)
3 | library(scater)
4 | library(scran)
5 | library(SingleCellExperiment)
6 | })
7 |
8 | ppFUN <- function(sce) {
9 | if ("normcounts" %in% assayNames(sce)) {
10 | logcounts(sce) <- log(normcounts(sce)+1)
11 | } else sce <- logNormCounts(sce)
12 | stats <- modelGeneVar(sce)
13 | hvgs <- getTopHVGs(stats, n = 500)
14 | sce <- runPCA(sce, subset_row = hvgs)
15 | }
16 |
17 | qcFUN <- function(sce) {
18 | i <- c("batch", "cluster")
19 | i <- intersect(i, names(colData(sce)))
20 | if (length(i) == 0) return(NULL)
21 | ids <- as.integer(factor(sce[[i]]))
22 | mtx <- dist(reducedDim(sce, "PCA"))
23 | res <- silhouette(ids, mtx)
24 | return(res[, "sil_width"])
25 | }
26 |
27 | groups <- "global"
28 | n_genes <- NULL
29 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_avg.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | if (!is.matrix(y <- assay(sce)))
7 | assay(sce) <- as.matrix(y)
8 | y <- if ("normcounts" %in% assayNames(sce)) {
9 | normcounts(sce)
10 | } else calculateCPM(sce)
11 | assay(sce, "exprs") <- log(y+1)
12 | return(sce)
13 | }
14 |
15 | qcFUN <- function(sce) {
16 | rowMeans(assay(sce, "exprs"))
17 | }
18 |
19 | groups <- NULL
20 | n_genes <- NULL
21 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_cor.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(matrixStats)
3 | library(scater)
4 | })
5 |
6 | ppFUN <- function(sce) {
7 | if (!is.matrix(y <- assay(sce)))
8 | assay(sce) <- as.matrix(y)
9 | y <- if ("normcounts" %in% assayNames(sce)) {
10 | normcounts(sce)
11 | } else calculateCPM(sce)
12 | assay(sce, "exprs") <- log(y+1)
13 | sce[rowVars(y) > 0, ]
14 | }
15 |
16 | qcFUN <- function(sce) {
17 | cor <- cor(
18 | t(assay(sce, "exprs")),
19 | method = "spearman",
20 | use = "pairwise.complete.obs")
21 | cor[upper.tri(cor)]
22 | }
23 |
24 | groups <- NULL
25 | n_genes <- 400
26 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_cv.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | if (!is.matrix(y <- assay(sce)))
7 | assay(sce) <- as.matrix(y)
8 | y <- if ("normcounts" %in% assayNames(sce)) {
9 | normcounts(sce)
10 | } else calculateCPM(sce)
11 | assay(sce, "exprs") <- log(y+1)
12 | return(sce)
13 | }
14 |
15 | qcFUN <- function(sce) {
16 | es <- assay(sce, "exprs")
17 | sd <- sqrt(rowVars(es))
18 | mu <- rowMeans(es)
19 | cv <- sd / mu
20 | cv[!is.na(cv)]
21 | }
22 |
23 | groups <- NULL
24 | n_genes <- NULL
25 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_frq.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(SingleCellExperiment)
3 | })
4 |
5 | ppFUN <- function(sce) {
6 | sce
7 | }
8 |
9 | qcFUN <- function(sce) {
10 | rowMeans(assay(sce) != 0)
11 | }
12 |
13 | groups <- NULL
14 | n_genes <- NULL
15 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_pve.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(BiocParallel)
3 | library(scater)
4 | library(SingleCellExperiment)
5 | library(variancePartition)
6 | })
7 |
8 | ppFUN <- function(sce) {
9 | if ("normcounts" %in% assayNames(sce)) {
10 | logcounts(sce) <- log(normcounts(sce)+1)
11 | } else sce <- logNormCounts(sce)
12 | return(sce)
13 | }
14 |
15 | qcFUN <- function(sce) {
16 | i <- c("batch", "cluster")
17 | i <- intersect(i, names(colData(sce)))
18 | if (length(i) == 0) return(NULL)
19 | y <- logcounts(sce)
20 | if (!is.matrix(y))
21 | y <- as.matrix(y)
22 | y <- y[rowSums(y) != 0, ]
23 | f <- as.formula(sprintf("~(1|%s)", i))
24 | cd <- data.frame(colData(sce)[i])
25 | pve <- fitExtractVarPartModel(y, f, cd,
26 | quiet = TRUE, BPPARAM = SerialParam())
27 | return(pve[[i]])
28 | }
29 |
30 | groups <- "global"
31 | n_genes <- NULL
32 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc-gene_var.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(matrixStats)
3 | library(scater)
4 | })
5 |
6 | ppFUN <- function(sce) {
7 | if (!is.matrix(y <- assay(sce)))
8 | assay(sce) <- as.matrix(y)
9 | y <- if ("normcounts" %in% assayNames(sce)) {
10 | normcounts(sce)
11 | } else calculateCPM(sce)
12 | assay(sce, "exprs") <- log(y+1)
13 | return(sce)
14 | }
15 |
16 | qcFUN <- function(sce) {
17 | rowVars(assay(sce, "exprs"))
18 | }
19 |
20 | groups <- NULL
21 | n_genes <- NULL
22 | n_cells <- NULL
--------------------------------------------------------------------------------
/code/05-calc_qc.R:
--------------------------------------------------------------------------------
1 | source(args$uts)
2 | source(args$fun)
3 |
4 | # each QC script should specify
5 | # ppFUN: function to apply to the SCE for PreProcessing
6 | # qcFUN: function to compute the Quality Control metric
7 | # (both FUNs should input & output a SCE)
8 | # groups: for which to compute the QC metric
9 | # (global, cluster, batch, NULL for all)
10 | # n_genes: number of genes to sample - globally (NULL for all)
11 | # n_cells: number of cells to sample - per group (NULL for all)
12 |
13 | if (exists("ppFUN")) {
14 | stopifnot(is.function(ppFUN))
15 | } else {
16 | # do nothing
17 | ppFUN <- \(.) .
18 | }
19 |
20 | if (exists("qcFUN")) {
21 | stopifnot(is.function(qcFUN))
22 | } else {
23 | stop("'qcFUN' needs to be specified")
24 | }
25 |
26 | choices <- eval(formals(.calc_qc)$groups)
27 | if (exists("groups") && !is.null(groups)) {
28 | groups <- match.arg(groups, choices, several.ok = TRUE)
29 | } else {
30 | groups <- choices
31 | }
32 |
33 | if (exists("n_genes") && !is.null(n_genes)) {
34 | stopifnot(is.numeric(n_genes), length(n_genes) == 1)
35 | } else {
36 | # use all
37 | n_genes <- NULL
38 | }
39 |
40 | if (exists("n_cells") && !is.null(n_cells)) {
41 | stopifnot(is.numeric(n_cells), length(n_cells) == 1)
42 | } else {
43 | # use all
44 | n_cells <- NULL
45 | }
46 |
47 | # ------------------------------------------------------------------------------
48 |
49 | sce <- readRDS(args$sce)
50 |
51 | # skip if simulation failed (return NULL)
52 | res <- if (!is.null(sce)) {
53 | sce <- ppFUN(sce)
54 | .calc_qc(sce,
55 | fun = qcFUN,
56 | groups = groups,
57 | n_genes = n_genes,
58 | n_cells = n_cells)
59 | }
60 |
61 | res <- if (!is.null(res)) {
62 | if (is.null(wcs$method))
63 | wcs$method <- NA
64 | data.frame(wcs, res)
65 | }
66 |
67 | print(head(res))
68 | saveRDS(res, args$res)
--------------------------------------------------------------------------------
/code/05-runtimes.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(ngs = "x", ncs = 325, rep = 3)
2 | # args <- list(
3 | # sce = "data/02-sub/Mereu20,CD4T.rds",
4 | # est = "code/03-est_pars-SCRIP.R",
5 | # sim = "code/04-sim_data-SCRIP.R")
6 |
7 | suppressPackageStartupMessages({
8 | library(R.utils)
9 | library(SingleCellExperiment)
10 | })
11 |
12 | # set seed to current rep(licate) for reproducibility
13 | set.seed(as.numeric(wcs$rep))
14 |
15 | # read in dataset
16 | x <- readRDS(args$sce)
17 |
18 | # if down-sampling cells, use all
19 | # genes & store NA in output data.frame
20 | if (wcs$ngs == "x") {
21 | wcs$ngs <- NA
22 | ngs <- nrow(x)
23 | } else wcs$ngs <- ngs <- as.numeric(wcs$ngs)
24 |
25 | # if down-sampling genes, use all
26 | # cells & store NA in output data.frame
27 | if (wcs$ncs == "x") {
28 | wcs$ncs <- NA
29 | ncs <- ncol(x)
30 | } else wcs$ncs <- ncs <- as.numeric(wcs$ncs)
31 |
32 | # downsample number of genes / cells
33 | x <- x[
34 | sample(nrow(x), ngs),
35 | sample(ncol(x), ncs)]
36 |
37 | sink(tempfile()) # suppress printing...
38 |
39 | # set time limit (s) until timeout
40 | t <- 1e4
41 |
42 | # time estimation
43 | source(args$est)
44 | est <- withTimeout(
45 | {
46 | tryCatch(
47 | system.time(y <- fun(x))[[3]],
48 | error = function(e) e)
49 | },
50 | timeout = t,
51 | onTimeout = "warning")
52 |
53 | if (is.character(est)) {
54 | # timed out
55 | est <- t
56 | } else if (inherits(est, "error")) {
57 | # Inf if estimation failed
58 | est <- Inf
59 | } else if (is.null(y)) {
60 | # NA if included in simulation
61 | est <- NA_real_
62 | # pass SCE to simulation
63 | y <- x
64 | }
65 |
66 | # time simulation
67 | source(args$sim)
68 | sim <- withTimeout(
69 | {
70 | tryCatch(
71 | system.time(fun(y))[[3]],
72 | error = function(e) e)
73 | },
74 | timeout = t,
75 | onTimeout = "warning")
76 |
77 | if (is.character(sim)) {
78 | # timed out
79 | sim <- t
80 | } else if (inherits(sim, "error")) {
81 | # Inf if estimation failed
82 | sim <- Inf
83 | } else if (is.null(sim)) {
84 | # NA if included in simulation
85 | sim <- NA_real_
86 | }
87 |
88 | sink() # ...until here
89 |
90 | # construct table including wildcards (wcs),
91 | # timing of est(imation) & sim(ulation),
92 | # and number of genes/cells (ng/cs)
93 | res <- data.frame(wcs, est, sim, row.names = NULL)
94 |
95 | # ...and write to .rds
96 | saveRDS(res, args$res)
--------------------------------------------------------------------------------
/code/06-dr_batch.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(scater)
3 | library(scran)
4 | library(SingleCellExperiment)
5 | })
6 |
7 | # args <- list(
8 | # sce = "data/04-sim/Oetjen18,foo,SPARSim.rds",
9 | # res = "outs/batch_sim-Oetjen18,foo,SPARSim,mnnCorrect.rds")
10 |
11 | x <- readRDS(args[[1]])
12 | y <- readRDS(args[[2]])
13 |
14 | df <- if (!is.null(x) && !is.null(y)) {
15 | if (!is.null(y$dimred_in) &&
16 | !is.null(y$dimred_out)) {
17 | # use integrated dimension reduction
18 | reducedDim(x, "PCA") <- y$dimred_out
19 | } else if (
20 | !is.null(y$assay_in) &&
21 | !is.null(y$assay_out)) {
22 | # run PCA on integrated data
23 | pca <- calculatePCA(y$assay_out)
24 | reducedDim(x, "PCA") <- pca
25 | }
26 |
27 | x <- runTSNE(x, dimred = "PCA")
28 | x <- runUMAP(x, dimred = "PCA")
29 |
30 | tsne <- reducedDim(x, "TSNE")
31 | umap <- reducedDim(x, "UMAP")
32 | colnames(tsne) <- paste0("TSNE", seq(2))
33 | colnames(umap) <- paste0("UMAP", seq(2))
34 |
35 | data.frame(wcs, tsne, umap, batch = x$batch)
36 | }
37 | saveRDS(df, args[[3]])
38 |
--------------------------------------------------------------------------------
/code/06-eval_batch.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(CellMixS)
3 | library(SingleCellExperiment)
4 | })
5 |
6 | # args <- list(
7 | # sce = "data/02-sub/panc8,inDrop.ductal.rds",
8 | # res = "outs/batch_sim-panc8,inDrop.ductal,muscat,Seurat.rds")
9 |
10 | sce <- readRDS(args[[1]])
11 | res <- readRDS(args[[2]])
12 |
13 | df <- if (!is.null(sce) && !is.null(res)) {
14 |
15 | if (!is.factor(sce$batch))
16 | sce$batch <- factor(sce$batch)
17 |
18 | group <- "batch"
19 | k <- min(table(sce$batch))/2
20 |
21 | # x = uncorrected,
22 | # y = integrated
23 | if (!is.null(res$dimred_in) &&
24 | !is.null(res$dimred_out)) {
25 | # compute scores io dimension reductions
26 | dimred_in <- dimred_out <- "foo"
27 | assay_in <- assay_out <- "logcounts"
28 | # setup input data
29 | x <- sce
30 | reducedDim(x, "foo") <- res$dimred_in
31 | # setup output data
32 | y <- sce
33 | reducedDim(y, "foo") <- res$dimred_out
34 | } else if (
35 | !is.null(res$assay_in) &&
36 | !is.null(res$assay_out)) {
37 | # compute scores on integrated assay data
38 | assay_in <- assay_out <- "foo"
39 | dimred_in <- dimred_out <- "PCA"
40 | # setup input data
41 | x <- sce
42 | assay(x, "foo", FALSE) <- res$assay_in
43 | # setup output data
44 | y <- sce
45 | assay(y, "foo", FALSE) <- res$assay_out
46 | }
47 | # split input data by batch
48 | idx <- split(seq(ncol(x)), x$batch)
49 | x <- lapply(idx, \(.) x[, .])
50 |
51 | suppressMessages({
52 | # cell-specific changes in Local Density
53 | # Factor (LDF) before vs. after integration
54 | ldf <- ldfDiff(x, y, group, k, dimred_in, dimred_out, assay_in, assay_out)
55 |
56 | # cell-specific mixing scores based on
57 | # euclidean distances within integrated data
58 | cms <- cms(y, k, group, dimred_in, assay_in)
59 | })
60 |
61 | data.frame(wcs,
62 | row.names = NULL,
63 | batch = sce$batch,
64 | ldf = ldf$diff_ldf,
65 | cms = cms$cms)
66 | }
67 |
68 | saveRDS(df, args[[3]])
--------------------------------------------------------------------------------
/code/06-eval_clust.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(dplyr)
3 | library(tidyr)
4 | })
5 |
6 | source(args$uts)
7 |
8 | sce <- readRDS(args$sce)
9 | true <- droplevels(factor(sce$cluster))
10 |
11 | res <- args[c("ref", "sim")] %>%
12 | unlist() %>%
13 | lapply(readRDS) %>%
14 | bind_rows() %>%
15 | replace_na(list(method = "ref"))
16 |
17 | # for each simulators & clustering method, match prediction w/ truth
18 | # using Hungarian algorithm & compute precision, recall, F1 score
19 | res <- group_by(res,
20 | across(contains("method"))) %>%
21 | group_modify(~{
22 | res <- .hungarian(.x$pred, .x$true)
23 | data.frame(
24 | cluster = levels(true),
25 | res[c("pr", "re", "F1")])
26 | }) %>% data.frame(wcs)
27 |
28 | saveRDS(res, args$res)
29 |
--------------------------------------------------------------------------------
/code/06-stat_1d-ks.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) {
2 | suppressWarnings(z <- ks.test(x, y))
3 | as.numeric(z$statistic)
4 | }
--------------------------------------------------------------------------------
/code/06-stat_1d-ws.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) waddR::wasserstein_metric(x, y)
--------------------------------------------------------------------------------
/code/06-stat_1d.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(dplyr)
3 | library(purrr)
4 | library(tidyr)
5 | })
6 |
7 | # read in QC results for reference (ref) & simulation (sim)
8 | ref <- readRDS(args$ref)
9 | sim <- readRDS(args$sim)
10 |
11 | # check that QC is available for reference & simulation
12 | res <- if (!(is.null(ref) || is.null(sim))) {
13 | # source test statistic function
14 | source(args$fun)
15 | # test ref vs. sim for each grouping
16 | bind_rows(ref, sim) %>%
17 | pivot_wider(
18 | names_from = "method",
19 | values_from = "value",
20 | values_fn = list) %>%
21 | rename(
22 | ref = "NA",
23 | sim = wcs$method) %>%
24 | rowwise() %>%
25 | mutate(stat = fun(ref, sim)) %>%
26 | ungroup() %>%
27 | select(negate(is.list)) %>%
28 | mutate(.before = 1, data.frame(wcs))
29 | }
30 |
31 | print(head(res))
32 | saveRDS(res, args$res)
33 |
--------------------------------------------------------------------------------
/code/06-stat_2d-emd.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y, n = 25) {
2 | stopifnot(is.numeric(n), length(n) == 1, n == as.integer(n))
3 | if (is.null(dim(x))) {
4 | # ONE-DIMENSIONAL
5 | # smoothing
6 | x <- density(x, n = n)$x
7 | y <- density(y, n = n)$x
8 | # compute EMD
9 | ws <- rep(1/n, n)
10 | x <- cbind(ws, x)
11 | y <- cbind(ws, y)
12 | emd(x, y)
13 | } else {
14 | # TWO-DIMENSIONAL
15 | if (!is.matrix(x)) x <- as.matrix(x)
16 | if (!is.matrix(y)) y <- as.matrix(y)
17 | # smoothing over common range
18 | rng <- c(
19 | range(c(x[, 1], y[, 1])),
20 | range(c(x[, 2], y[, 2])))
21 | x <- MASS::kde2d(x[, 1], x[, 2], n = n, lims = rng)
22 | y <- MASS::kde2d(y[, 1], y[, 2], n = n, lims = rng)
23 | # compute EMD
24 | emdist::emd2d(x$z, y$z)/n
25 | }
26 | }
--------------------------------------------------------------------------------
/code/06-stat_2d-ks2.R:
--------------------------------------------------------------------------------
1 | fun <- function(x, y) Peacock.test::peacock2(x, y)
--------------------------------------------------------------------------------
/code/06-stat_2d.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(dplyr)
3 | library(purrr)
4 | library(tidyr)
5 | })
6 |
7 | x <- lapply(args$x, readRDS) %>% bind_rows()
8 | y <- lapply(args$y, readRDS) %>% bind_rows()
9 |
10 | # check that QCs are available for reference & simulation
11 | res <- if (!(all(is.na(x$method)) || all(is.na(y$method)))) {
12 | # source evaluation function
13 | source(args$fun)
14 | # for each grouping, test simulation vs. reference
15 | list(x = x, y = y) %>%
16 | bind_rows(.id = "dim") %>%
17 | pivot_wider(
18 | id_cols = c("group", "id"),
19 | names_from = c("dim", "method"),
20 | values_from = "value",
21 | values_fn = list) %>%
22 | rename_at(
23 | vars(contains("NA")),
24 | function(.) gsub("NA", "ref", .)) %>%
25 | rename_at(
26 | vars(contains(wcs$method)),
27 | function(.) gsub(wcs$method, "sim", .)) %>%
28 | rowwise() %>%
29 | mutate(stat = fun(
30 | cbind(x_ref, y_ref),
31 | cbind(x_sim, y_sim))) %>%
32 | ungroup() %>%
33 | select_if(negate(is.list)) %>%
34 | mutate(.before = 1, data.frame(wcs))
35 | }
36 |
37 | print(head(res))
38 | saveRDS(res, args$res)
39 |
--------------------------------------------------------------------------------
/code/07-plot_batch-boxplot_by_method.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "cms")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # res = list.files("outs", "^batch_res-", full.names = TRUE),
6 | # rds = sprintf("plts/batch-boxplot_by_method_%s.rds", wcs$val),
7 | # pdf = sprintf("plts/batch-boxplot_by_method_%s.pdf", wcs$val))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | df <- res %>%
15 | .cms_ldf() %>%
16 | .bcs(n = 1) # average across cells but not batches
17 |
18 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5))
19 |
20 | plt <- ggplot(df, aes(
21 | reorder_within(batch_method, .data[[wcs$val]], sim_method, median),
22 | .data[[wcs$val]], col = batch_method, fill = batch_method)) +
23 | facet_grid(~ sim_method, scales = "free_x") +
24 | geom_hline(yintercept = c(0, 1), size = 0.1) +
25 | geom_boxplot(
26 | size = 0.25, outlier.size = 0.25,
27 | alpha = 0.25, key_glyph = "point") +
28 | scale_y_continuous(
29 | .batch_labs[wcs$val],
30 | limits = lim,
31 | n.breaks = 3)
32 |
33 | thm <- theme(
34 | legend.title = element_blank(),
35 | axis.text.x = element_blank(),
36 | axis.title.x = element_blank(),
37 | axis.ticks.x = element_blank())
38 |
39 | fig <- .prettify(plt, thm)
40 |
41 | saveRDS(fig, args$rds)
42 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")
43 |
--------------------------------------------------------------------------------
/code/07-plot_batch-boxplot_dX.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "ldf")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
7 | # res = list.files("outs", "^batch_res", full.names = TRUE))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | df <- res %>%
15 | .cms_ldf() %>%
16 | .bcs(n = 1) %>% # average across cells but not batches
17 | group_by(refset, batch_method, batch) %>%
18 | mutate(val = .data[[wcs$val]]-.data[[wcs$val]][.data$sim_method == "ref"]) %>%
19 | filter(sim_method != "ref") %>%
20 | mutate(sim_method = droplevels(sim_method))
21 |
22 | lim <- c(
23 | floor(min(df$val)/0.1)*0.1,
24 | ceiling(max(df$val)/0.1)*0.1)
25 | lab <- eval(.batch_labs[wcs$val])
26 |
27 | plt <- ggplot(df, aes(reorder(sim_method, val, median), val)) +
28 | geom_hline(yintercept = 0, size = 0.1, col = "red") +
29 | geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") +
30 | geom_violin(fill = NA, col = "grey", size = 0.2) +
31 | scale_y_continuous(limits = lim, breaks = c(0, lim)) +
32 | labs(x = NULL, y = bquote(Delta*.(lab)*"(sim-ref)"))
33 |
34 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),)
35 |
36 | fig <- .prettify(plt, thm)
37 |
38 | saveRDS(fig, args$rds)
39 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
40 |
--------------------------------------------------------------------------------
/code/07-plot_batch-correlations.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "cms")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
7 | # res = list.files("outs", "^batch_res", full.names = TRUE))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | df <- res %>%
15 | .cms_ldf() %>%
16 | .bcs(n = 1) # average across cells & batches
17 |
18 | mat <- df %>%
19 | ungroup() %>%
20 | pivot_wider(
21 | values_from = wcs$val,
22 | names_from = "sim_method",
23 | id_cols = c("refset", "batch_method", "batch")) %>%
24 | select(any_of(c("ref", names(.methods_pal)))) %>%
25 | cor(method = "pearson", use = "pairwise.complete.obs")
26 |
27 | mat[is.na(mat)] <- 0
28 | xo <- rownames(mat)[hclust(dist(mat))$order]
29 | yo <- rownames(mat)[hclust(dist(t(mat)))$order]
30 |
31 | df <- mat %>%
32 | data.frame(
33 | from = rownames(.),
34 | check.names = FALSE) %>%
35 | pivot_longer(
36 | cols = -from,
37 | names_to = "to",
38 | values_to = "corr") %>%
39 | mutate(
40 | to = factor(to, levels = xo),
41 | from = factor(from, levels = yo)) %>%
42 | filter(as.numeric(to) <= as.numeric(from))
43 |
44 | fd <- filter(df, from != to)
45 | min <- floor(min(fd$corr)/0.1)*0.1
46 | max <- ceiling(max(fd$corr)/0.1)*0.1
47 | plt <- ggplot(fd, aes(from, to, fill = corr)) +
48 | geom_tile() +
49 | scale_fill_gradientn("r",
50 | colors = rev(hcl.colors(11, "Grays")),
51 | limits = c(min, max), breaks = c(min, max)) +
52 | coord_equal(expand = FALSE) +
53 | scale_x_discrete(limits = rev(xo[-1]))
54 |
55 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1]
56 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1]
57 |
58 | thm <- theme(
59 | axis.text.y = element_text(face = fy),
60 | axis.text.x = element_text(angle = 45, hjust = 1, face = fx),
61 | axis.title = element_blank(),
62 | panel.border = element_blank(),
63 | legend.title = element_text(vjust = 1),
64 | legend.direction = "horizontal",
65 | legend.position = c(1, 1),
66 | legend.justification = c(1, 1))
67 |
68 | fig <- .prettify(plt, thm)
69 |
70 | saveRDS(fig, args$rds)
71 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_batch-densities.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "cms")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # rds = paste0("plts/batch-densities_", wcs$val, ".rds"),
6 | # pdf = paste0("plts/batch-densities_", wcs$val, ".pdf"),
7 | # res = list.files("outs", "^batch_res", full.names = TRUE))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | lim <- switch(wcs$val, bcs = c(0, 1), c(-0.5, 0.5))
15 |
16 | fig <- if (wcs$val != "bcs") {
17 | df <- .cms_ldf(res)
18 | plt <- ggplot(df, aes(
19 | .data[[wcs$val]], ..ndensity..,
20 | col = batch_method, fill = batch_method)) +
21 | facet_grid(sim_method ~ refset) +
22 | geom_vline(xintercept = 0, size = 0.2) +
23 | geom_density(alpha = 0, key_glyph = "point") +
24 | scale_x_continuous(limits = lim, n.breaks = 3) +
25 | scale_y_continuous(limits = c(0, 1), n.breaks = 3) +
26 | labs(x = .batch_labs[wcs$val], y = "scaled density")
27 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
28 | .prettify(plt, thm)
29 | }
30 |
31 | saveRDS(fig, args$rds)
32 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_batch-heatmap_by_method.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "cms")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # rds = paste0("plts/batch-heatmap_by_method_", wcs$val, ".rds"),
6 | # pdf = paste0("plts/batch-heatmap_by_method_", wcs$val, ".pdf"),
7 | # res = list.files("outs", "^batch_res", full.names = TRUE))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | df <- res %>%
15 | .cms_ldf() %>%
16 | .bcs(n = 2) # average across cells & batches
17 |
18 | max <- ceiling(max(df$bcs)/0.1)*0.1
19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5))
20 |
21 | plt <- ggplot(df, aes(
22 | reorder_within(batch_method, .data[[wcs$val]], sim_method, median),
23 | refset, fill = .data[[wcs$val]])) +
24 | facet_grid(~ sim_method, scales = "free_x") +
25 | geom_tile(col = "white") +
26 | scale_fill_distiller(
27 | .batch_labs[wcs$val],
28 | palette = "RdYlBu",
29 | na.value = "lightgrey",
30 | limits = lim,
31 | n.breaks = 3) +
32 | coord_cartesian(expand = FALSE) +
33 | scale_x_reordered() +
34 | scale_y_reordered()
35 |
36 | thm <- theme(
37 | axis.title = element_blank(),
38 | axis.text.x = element_text(angle = 45, hjust = 1))
39 |
40 | fig <- .prettify(plt, thm)
41 |
42 | saveRDS(fig, args$rds)
43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_batch-heatmap_by_refset.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(val = "cms")
2 | # args <- list(
3 | # uts1 = "code/utils-plotting.R",
4 | # uts2 = "code/utils-integration.R",
5 | # rds = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".rds"),
6 | # pdf = paste0("plts/batch-heatmap_by_refset_", wcs$val, ".pdf"),
7 | # res = list.files("outs", "^batch_res", full.names = TRUE))
8 |
9 | source(args$uts1)
10 | source(args$uts2)
11 |
12 | res <- .read_res(args$res)
13 |
14 | df <- res %>%
15 | .cms_ldf() %>%
16 | .bcs(n = 2) # average across cells & batches
17 |
18 | max <- ceiling(max(df$bcs)/0.1)*0.1
19 | lim <- switch(wcs$val, bcs = c(0, max), c(-0.5, 0.5))
20 |
21 | plt <- ggplot(df, aes(
22 | reorder_within(batch_method, .data[[wcs$val]], refset, median),
23 | sim_method, fill = .data[[wcs$val]])) +
24 | facet_grid(~ refset, scales = "free_x") +
25 | geom_tile(col = "white") +
26 | scale_fill_distiller(
27 | .batch_labs[wcs$val],
28 | palette = "RdYlBu",
29 | na.value = "lightgrey",
30 | limits = lim,
31 | n.breaks = 3) +
32 | coord_cartesian(expand = FALSE) +
33 | scale_x_reordered() +
34 | scale_y_reordered()
35 | plt
36 | thm <- theme(
37 | axis.title = element_blank(),
38 | axis.text.x = element_text(angle = 45, hjust = 1))
39 |
40 | fig <- .prettify(plt, thm)
41 |
42 | saveRDS(fig, args$rds)
43 | ggsave(args$pdf, fig, width = 15, height = 6, units = "cm")
44 |
--------------------------------------------------------------------------------
/code/07-plot_clust-boxplot_by_method.R:
--------------------------------------------------------------------------------
1 | source(args$uts)
2 |
3 | res <- .read_res(args$res)
4 |
5 | # order according to average F1 score
6 | res$sim_method <- factor(
7 | res$sim_method,
8 | levels = res %>%
9 | group_by(sim_method) %>%
10 | summarise_at("F1", mean) %>%
11 | arrange(F1) %>%
12 | pull("sim_method"))
13 |
14 | plt <- ggplot(res, aes(
15 | reorder_within(clust_method, -F1, sim_method, median),
16 | F1, col = clust_method, fill = clust_method)) +
17 | facet_grid(~ sim_method, scales = "free_x") +
18 | geom_hline(yintercept = c(0, 1), size = 0.1) +
19 | geom_boxplot(
20 | size = 0.25, outlier.size = 0.25,
21 | alpha = 0.25, key_glyph = "point") +
22 | scale_x_reordered(NULL) +
23 | scale_y_continuous(breaks = seq(0, 1, 0.2))
24 |
25 | thm <- theme(
26 | legend.title = element_blank(),
27 | axis.text.x = element_blank(),
28 | axis.ticks.x = element_blank())
29 |
30 | fig <- .prettify(plt, thm)
31 |
32 | saveRDS(fig, args$rds)
33 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm")
34 |
--------------------------------------------------------------------------------
/code/07-plot_clust-boxplot_dF1.R:
--------------------------------------------------------------------------------
1 | source(args$uts)
2 |
3 | res <- .read_res(args$res)
4 |
5 | df <- res %>%
6 | group_by(refset, cluster, clust_method) %>%
7 | mutate(F1 = F1-.data$F1[.data$sim_method == "ref"]) %>%
8 | filter(sim_method != "ref") %>%
9 | mutate(sim_method = droplevels(sim_method))
10 |
11 | plt <- ggplot(df, aes(reorder(sim_method, F1, median), F1)) +
12 | geom_hline(yintercept = 0, size = 0.1, col = "red") +
13 | geom_boxplot(size = 0.2, outlier.size = 0.1, key_glyph = "point") +
14 | geom_violin(fill = NA, col = "grey", size = 0.2) +
15 | scale_y_continuous(limits = c(-1, 1), n.breaks = 5) +
16 | labs(x = NULL, y = expression(Delta*"F1(sim-ref)"))
17 |
18 | thm <- theme(axis.text.x = element_text(angle = 45, hjust = 1),)
19 |
20 | fig <- .prettify(plt, thm)
21 |
22 | saveRDS(fig, args$rds)
23 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
24 |
--------------------------------------------------------------------------------
/code/07-plot_clust-correlations.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # rds = "plts/clust-correlations.rds",
4 | # pdf = "plts/clust-correlations.pdf",
5 | # res = list.files("outs", "^clust_res", full.names = TRUE))
6 |
7 | source(args$uts)
8 |
9 | res <- .read_res(args$res)
10 |
11 | mat <- res %>%
12 | pivot_wider(
13 | id_cols = c("refset", "cluster", "clust_method"),
14 | names_from = "sim_method",
15 | values_from = "F1") %>%
16 | select(any_of(c("ref", names(.methods_pal)))) %>%
17 | cor(method = "pearson", use = "pairwise.complete.obs")
18 |
19 | xo <- rownames(mat)[hclust(dist(mat))$order]
20 | yo <- rownames(mat)[hclust(dist(t(mat)))$order]
21 |
22 | df <- mat %>%
23 | data.frame(
24 | from = rownames(.),
25 | check.names = FALSE) %>%
26 | pivot_longer(
27 | cols = -from,
28 | names_to = "to",
29 | values_to = "corr") %>%
30 | mutate(
31 | to = factor(to, levels = xo),
32 | from = factor(from, levels = yo)) %>%
33 | filter(as.numeric(to) <= as.numeric(from))
34 |
35 | fd <- filter(df, from != to)
36 | min <- floor(min(fd$corr)/0.1)*0.1
37 | max <- ceiling(max(fd$corr)/0.1)*0.1
38 | plt <- ggplot(fd, aes(from, to, fill = corr)) +
39 | geom_tile() +
40 | scale_fill_gradientn("r",
41 | colors = rev(hcl.colors(11, "Grays")),
42 | limits = c(min, max), breaks = c(min, max)) +
43 | coord_equal(expand = FALSE) +
44 | scale_x_discrete(limits = rev(xo[-1]))
45 |
46 | fy <- c("plain", "bold")[as.numeric(levels(df$to) == "ref") + 1]
47 | fx <- c("plain", "bold")[as.numeric(rev(levels(df$from)) == "ref") + 1]
48 |
49 | thm <- theme(
50 | axis.text.y = element_text(face = fy),
51 | axis.text.x = element_text(angle = 45, hjust = 1, face = fx),
52 | axis.title = element_blank(),
53 | panel.border = element_blank(),
54 | legend.title = element_text(vjust = 1),
55 | legend.direction = "horizontal",
56 | legend.position = c(1, 1),
57 | legend.justification = c(1, 1))
58 |
59 | fig <- .prettify(plt, thm)
60 |
61 | saveRDS(fig, args$rds)
62 | ggsave(args$pdf, fig, width = 6, height = 6, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_clust-heatmap_by_method.R:
--------------------------------------------------------------------------------
1 | source(args$uts)
2 |
3 | res <- .read_res(args$res)
4 |
5 | df <- res %>%
6 | group_by(
7 | refset,
8 | sim_method,
9 | clust_method) %>%
10 | summarise_at("F1", mean) %>%
11 | mutate(rank = rank(-F1))
12 |
13 | plt <- ggplot(df, aes(
14 | reorder_within(clust_method, rank, sim_method),
15 | reorder(refset, rank), fill = rank)) +
16 | facet_wrap(~ sim_method, nrow = 1, scales = "free_x") +
17 | geom_tile(col = "white", key_glyph = "point") +
18 | scale_fill_gradientn(
19 | colors = rev(hcl.colors(5, "Spectral")),
20 | limits = range(df$rank),
21 | breaks = range(df$rank),
22 | labels = c("best", "worst")) +
23 | guides(fill = guide_colorbar(reverse = TRUE)) +
24 | coord_cartesian(expand = FALSE) +
25 | scale_x_reordered()
26 |
27 | thm <- theme(
28 | axis.title = element_blank(),
29 | axis.text.x = element_text(size = 4, angle = 45, hjust = 1))
30 |
31 | fig <- .prettify(plt, thm)
32 |
33 | saveRDS(fig, args$rds)
34 | ggsave(args$pdf, fig, width = 16, height = 3.25, units = "cm")
35 |
--------------------------------------------------------------------------------
/code/07-plot_corr-stat_2d.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 |
3 | # args <- list(res = list.files("outs", "stat_2d-.*(ks2|emd)\\.rds", full.names = TRUE))
4 |
5 | res <- .read_res(args$res)
6 |
7 | df <- res %>%
8 | filter(!is.na(stat)) %>%
9 | mutate(
10 | refset = paste(datset, subset, sep = ","),
11 | metrics = paste(metric1, metric2, sep = "\n")) %>%
12 | select(-c(datset, subset, metric1, metric2)) %>%
13 | # for type != n, keep batch-/cluster-level comparisons only
14 | group_by(stat2d, refset, method, metrics) %>%
15 | mutate(n = n()) %>%
16 | filter(group != "global" | n == 1) %>%
17 | select(-c(group, n))
18 |
19 | gg <- pivot_wider(df,
20 | names_from = stat2d,
21 | values_from = stat) %>%
22 | group_by(refset, metrics) %>%
23 | mutate(
24 | ks2 = ks2/max(ks2, na.rm = TRUE),
25 | emd = emd/max(emd, na.rm = TRUE))
26 |
27 | plt <- ggplot(gg, aes(emd, ks2)) +
28 | facet_wrap(~ metrics, nrow = 2) +
29 | geom_abline(intercept = 0, slope = 1, size = 0.2, lty = 2) +
30 | stat_smooth(method = "lm", formula = y ~ x, size = 0.4, col = "red") +
31 | geom_point(alpha = 0.2, size = 1, shape = 16, col = "tomato") +
32 | stat_cor(method = "pearson", size = 1.5,
33 | hjust = 1, vjust = 0, label.x.npc = 1, label.y.npc = 0) +
34 | scale_x_continuous(limits = c(0, 1), breaks = c(0, 1)) +
35 | scale_y_continuous(limits = c(0, 1), breaks = c(0, 1)) +
36 | labs(x = "scaled EMD", y = "scaled KS statistic")
37 |
38 | thm <- theme(
39 | aspect.ratio = 1,
40 | legend.position = "none")
41 |
42 | fig <- .prettify(plt, thm)
43 |
44 | saveRDS(fig, args$ggp)
45 | ggsave(args$plt, fig, width = 16, height = 9.25, units = "cm")
46 |
--------------------------------------------------------------------------------
/code/07-plot_dimred.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "k")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = list.files("outs", "dr_ref", full.names = TRUE),
5 | # pdf = sprintf("figs/dimred_%s.pdf", wcs$reftyp))
6 |
7 | source(args$fun)
8 |
9 | dfs <- args$res %>%
10 | lapply(readRDS) %>%
11 | bind_rows() %>%
12 | mutate(
13 | refset = paste(datset, subset, sep = ","),
14 | type = ifelse(!is.na(batch), "b", ifelse(!is.na(cluster), "k", "n"))) %>%
15 | filter(type == wcs$reftyp) %>%
16 | group_split(refset)
17 |
18 | # number of rows & columns for facetting
19 | nc <- ceiling(length(dfs)/(nr <- 3))
20 |
21 | # get variable to color by
22 | # - log-library size (lls) for type n
23 | # - batch for type b, cluster for type k
24 | col <- switch(wcs$reftyp, n = "lls", b = "batch", k = "cluster")
25 |
26 | fig <- lapply(seq_along(dfs), \(i) {
27 | plt <- ggplot(dfs[[i]], aes_string(
28 | "TSNE1", "TSNE2", col = col, fill = col)) +
29 | geom_point_rast(size = 0.1, alpha = 0.2) +
30 | (if (col == "lls") scale_color_viridis_c()) +
31 | guides(fill = "none") + ggtitle(dfs[[i]]$refset[1])
32 | thm <- theme(
33 | aspect.ratio = 1,
34 | axis.text = element_blank(),
35 | axis.ticks = element_blank(),
36 | legend.title = element_blank(),
37 | legend.justification = c(0, 0.5),
38 | plot.title = element_text(hjust = 0.5),
39 | axis.title.x = if (i > length(dfs) - nc) element_text() else element_blank(),
40 | axis.title.y = if (i %% nc != 1) element_blank() else element_text(angle = 90))
41 | .prettify(plt, thm)
42 | }) %>% wrap_plots(nrow = nr)
43 |
44 | ggsave(args$pdf, fig, width = 16, height = 4*length(dfs)/n, units = "cm")
45 |
--------------------------------------------------------------------------------
/code/07-plot_dimred_batch.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # rds = "plts/batch-dimred.rds",
3 | # pdf = "plts/batch-dimred.pdf",
4 | # uts = "code/utils-plotting.R",
5 | # res = list.files("outs", "^dr_", full.names = TRUE))
6 | # args$res <- grep("CellBench,H", args$res, value = TRUE)
7 |
8 | source(args$uts)
9 |
10 | res <- .read_res(args$res)
11 |
12 | dfs <- res %>%
13 | replace_na(list(batch_method = "none")) %>%
14 | mutate(
15 | batch_method = factor(batch_method),
16 | batch_method = relevel(batch_method, "none")) %>%
17 | group_split(refset)
18 |
19 | lys <- lapply(dfs, \(df) {
20 | plt <- ggplot(df, aes(TSNE1, TSNE2, col = batch, fill = batch)) +
21 | facet_grid(batch_method ~ sim_method, scales = "free") +
22 | geom_point_rast(shape = 21, size = 0.05, alpha = 0.2) +
23 | ggtitle(df$refset[1])
24 | thm <- theme(
25 | aspect.ratio = 1,
26 | axis.text = element_blank(),
27 | axis.ticks = element_blank(),
28 | axis.title = element_text(hjust = 0))
29 | fig <- .prettify(plt, thm)
30 | })
31 |
32 | saveRDS(lys, args$rds)
33 |
34 | pdf(args$pdf, width = 16/2.54, height = 12/2.54)
35 | for (p in lys) print(p); dev.off()
36 |
--------------------------------------------------------------------------------
/code/07-plot_memory.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # pdf = "plts/memory.pdf",
3 | # fun = "code/utils-plotting.R",
4 | # res = list.files("logs", ".*\\.txt", full.names = TRUE))
5 |
6 | source(args$fun)
7 | res <- lapply(args$res, read.table, header = TRUE)
8 | mbs <- do.call(rbind, res)[["max_rss"]]
9 |
10 | # get metadata from output names
11 | # rts_{reftyp}-{datset},{subset},
12 | # {method},{ngs},{ncs},{rep}.rds
13 | ss <- strsplit(basename(gsub("\\.txt", "", args$res)), ",")
14 | reftyp <- gsub("rts_([a-z])-.*", "\\1", sapply(ss, .subset, 1))
15 | refset <- paste(sep = ",",
16 | gsub(".*-", "", sapply(ss, .subset, 1)),
17 | sapply(ss, .subset, 2))
18 | vars <- seq(3, 6)
19 | names(vars) <- c("method", "ngs", "ncs", "rep")
20 | vars <- lapply(vars, \(.) sapply(ss, .subset, .))
21 |
22 | df <- data.frame(mbs, reftyp, refset, vars) %>%
23 | pivot_longer(
24 | values_to = "n",
25 | names_to = "dim",
26 | cols = c("ngs", "ncs")) %>%
27 | filter(n != "x") %>%
28 | group_by(dim) %>%
29 | mutate(n = factor(n, sort(as.numeric(unique(n))))) %>%
30 | ungroup() %>%
31 | mutate(
32 | dim = factor(dim,
33 | levels = c("ngs", "ncs"),
34 | labels = c("# genes", "# cells")),
35 | method = droplevels(factor(method, names(.methods_pal)))) %>%
36 | group_by(method, dim, n) %>%
37 | summarise_at("mbs", mean)
38 |
39 | # order methods by average across subsets
40 | ms <- df %>%
41 | group_by(method) %>%
42 | summarise_at("mbs", mean) %>%
43 | arrange(mbs) %>% pull("method")
44 | df$method <- factor(df$method, levels = ms)
45 |
46 | pal <- .methods_pal[levels(df$method)]
47 |
48 | lab <- parse(text = paste(
49 | sep = "~",
50 | sprintf("bold(%s)", LETTERS),
51 | gsub("\\s", "~", names(pal))))
52 |
53 | anno <- mutate(df, letter = LETTERS[
54 | match(method, levels(method))])
55 |
56 | plt <- ggplot(df, aes(factor(n), mbs,
57 | col = method, fill = method)) +
58 | facet_grid(~ dim, scales = "free") +
59 | geom_bar(
60 | width = 0.9,
61 | stat = "identity",
62 | position = "dodge") +
63 | geom_text(data = anno,
64 | aes(label = letter, y = -250),
65 | size = 1.5, color = "black",
66 | position = position_dodge(0.9)) +
67 | scale_fill_manual(values = pal, labels = lab) +
68 | scale_color_manual(values = pal, labels = lab) +
69 | xlab(NULL) + scale_y_continuous(
70 | "memory usage (MBs)",
71 | limits = c(-500, 4e3),
72 | expand = c(0, 0))
73 |
74 | thm <- theme(
75 | axis.ticks.x = element_blank(),
76 | legend.title = element_blank(),
77 | legend.text = element_text(hjust = 0),
78 | panel.grid.major.y = element_line(color = "grey"))
79 |
80 | fig <- .prettify(plt, thm)
81 |
82 | saveRDS(fig, args$rds)
83 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_qc_ref-correlations.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-qc_ref.rds",
5 | # rds = "plts/qc_ref-correlations.rds",
6 | # pdf = "plts/qc_ref-correlations.pdf")
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | # exclude summaries that include sampling
12 | ex <- .metrics_lab[grep("cor|pcd", names(.metrics_lab))]
13 | df <- res %>%
14 | select(-c(datset, subset)) %>%
15 | filter(!metric %in% ex) %>%
16 | mutate(
17 | # add summary short names
18 | .metric = names(.metrics_lab)[match(metric, .metrics_lab)],
19 | # add summary type
20 | type = ifelse(grepl("cell", .metric), "cell", "gene")) %>%
21 | # keep group-level comparisons only
22 | mutate(across(c(group, id), as.character)) %>%
23 | filter(group != id | .metric %in% .none_metrics) %>%
24 | select(-c(.metric, group, id)) %>%
25 | # correlate summaries of same type for each refset
26 | group_by(refset, metric) %>%
27 | mutate(n = row_number()) %>%
28 | group_by(refset, type) %>%
29 | group_map(\(df, keys) {
30 | pivot_wider(df,
31 | names_from = metric,
32 | values_from = value) %>%
33 | select(any_of(.metrics_lab)) %>%
34 | cor(method = "spearman",
35 | use = "pairwise.complete.obs") %>%
36 | data.frame(from = rownames(.)) %>%
37 | pivot_longer(-from, names_to = "to") %>%
38 | mutate(keys)
39 | }) %>%
40 | bind_rows() %>%
41 | # average across refsets
42 | group_by(from, to) %>%
43 | mutate(value = mean(value)) %>%
44 | ungroup() %>%
45 | distinct(from, to, .keep_all = TRUE) %>%
46 | select(-refset)
47 |
48 | # do hierarchical clustering for each type
49 | mat <- df %>%
50 | split(.$type) %>%
51 | lapply(\(df) df %>%
52 | pivot_wider(
53 | names_from = to,
54 | values_from = value) %>%
55 | select_if(is.numeric))
56 |
57 | o <- unlist(lapply(mat, \(.) colnames(.)[hclust(dist(.))$order]))
58 | df <- mutate(df, across(c(from, to), factor, o, .metrics_lab[o]))
59 |
60 | plt <- ggplot(df, aes(from, to, fill = value)) +
61 | geom_tile() +
62 | scale_fill_distiller("r",
63 | palette = "RdYlBu",
64 | limits = c(-0.5, 1),
65 | breaks = c(0, 1)) +
66 | coord_equal(expand = FALSE) +
67 | scale_x_discrete(limits = rev(levels(df$to)))
68 |
69 | thm <- theme(
70 | axis.title = element_blank(),
71 | axis.text.x = element_text(angle = 45, hjust = 1))
72 |
73 | fig <- .prettify(plt, thm)
74 |
75 | saveRDS(fig, args$rds)
76 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm")
77 |
--------------------------------------------------------------------------------
/code/07-plot_runtimes.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "n")
2 | # args <- list(
3 | # rds = sprintf("plts/rts_%s.rds", wcs$reftyp),
4 | # pdf = sprintf("plts/rts_%s.pdf", wcs$reftyp),
5 | # fun = "code/utils-plotting.R",
6 | # res = list.files("outs", sprintf("^rts_%s-", wcs$reftyp), full.names = TRUE))
7 |
8 | source(args$fun)
9 | res <- .read_res(args$res)
10 |
11 | df <- res %>%
12 | # drop SCRIP estimation for 'reftyp' other than 'n'
13 | # as there's none, it's just setting up parameters
14 | { if (wcs$reftyp != "n")
15 | mutate(., est = case_when(
16 | method == "SCRIP" ~ NA_real_,
17 | TRUE ~ est)) else . } %>%
18 | # sum up estimation & simulation timings
19 | rowwise() %>%
20 | mutate(tot = {
21 | if (is.finite(est)) {
22 | if (is.finite(sim)) {
23 | est + sim # both available
24 | } else est # only estimation
25 | } else if (is.finite(sim)) {
26 | sim # only simulation
27 | } else NA # neither available
28 | }) %>%
29 | ungroup() %>%
30 | pivot_longer(
31 | values_to = "n",
32 | names_to = "dim",
33 | cols = c("ngs", "ncs")) %>%
34 | pivot_longer(
35 | values_to = "t",
36 | names_to = "step",
37 | cols = c("est", "sim", "tot")) %>%
38 | mutate(
39 | dim = factor(dim,
40 | levels = c("ngs", "ncs"),
41 | labels = c("# genes", "# cells")),
42 | step = factor(step,
43 | levels = c("est", "sim", "tot"),
44 | labels = c("estimation", "simulation", "overall"))) %>%
45 | filter(!is.na(n), !is.na(t), is.finite(t)) %>%
46 | group_by(method, step, dim, n) %>%
47 | summarise_at("t", mean)
48 |
49 | # order methods by overall average across subsets
50 | ms <- df %>%
51 | filter(step == "overall") %>%
52 | group_by(method) %>%
53 | summarise_at("t", mean) %>%
54 | arrange(t) %>% pull("method")
55 | df$method <- factor(df$method, levels = ms)
56 |
57 | pal <- .methods_pal[levels(df$method)]
58 |
59 | lab <- parse(text = paste(
60 | sep = "~",
61 | sprintf("bold(%s)", LETTERS),
62 | gsub("\\s", "~", names(pal))))
63 |
64 | anno <- df %>%
65 | group_by(step, dim) %>%
66 | mutate(letter = LETTERS[match(method, levels(method))])
67 |
68 | plt <- ggplot(df, aes(factor(n), t,
69 | col = method, fill = method)) +
70 | facet_grid(step ~ dim, scales = "free") +
71 | geom_bar(
72 | width = 0.9,
73 | stat = "identity",
74 | position = "dodge") +
75 | geom_text(data = anno,
76 | aes(label = letter, y = 0),
77 | size = 1.5, vjust = 2, color = "black",
78 | position = position_dodge(0.9)) +
79 | scale_fill_manual(values = pal, labels = lab) +
80 | scale_color_manual(values = pal, labels = lab) +
81 | xlab(NULL) + scale_y_sqrt("runtime (s)",
82 | expand = expansion(mult = c(0.2, 0.1)))
83 |
84 | thm <- theme(
85 | axis.ticks.x = element_blank(),
86 | legend.title = element_blank(),
87 | legend.text = element_text(hjust = 0),
88 | panel.grid.major.y = element_line(color = "grey"))
89 |
90 | fig <- .prettify(plt, thm)
91 |
92 | saveRDS(fig, args$rds)
93 | ggsave(args$pdf, fig, width = 18, height = 9, units = "cm")
94 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d-scatters.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # fun = "code/utils.R",
3 | # res = "outs/obj-stat_1d.rds",
4 | # rds = "plts/stat_1d-scatters.rds",
5 | # pdf = "plts/stat_1d-scatters.pdf")
6 |
7 | source(args$fun)
8 |
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | mutate(refset = paste(datset, subset, sep = ",")) %>%
13 | select(-c(datset, subset)) %>%
14 | # for type != n, keep batch-/cluster-level comparisons only
15 | group_by(stat1d, refset, method, metric) %>%
16 | mutate(n = n()) %>%
17 | filter(group != "global" | n == 1) %>%
18 | select(-c(group, n)) %>%
19 | # for each statistic & metric, re-scale b/w 0 & 1
20 | group_by(stat1d, metric) %>%
21 | mutate(stat = stat/max(stat, na.rm = TRUE)) %>%
22 | pivot_wider(names_from = stat1d, values_from = stat) %>%
23 | filter(!is.na(ks), !is.na(ws))
24 |
25 | plt <- ggplot(df, aes(ks, ws, col = refset, fill = refset)) +
26 | facet_wrap(~ metric, nrow = 2) +
27 | stat_cor(
28 | aes(group = metric), method = "spearman",
29 | label.x = 0, label.y = 1, vjust = 1, size = 1.5) +
30 | geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) +
31 | scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
32 | scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
33 | labs(x = "KS statistics", y = "scaled Wasserstein metric")
34 |
35 | thm <- theme(aspect.ratio = 1)
36 |
37 | fig <- .prettify(plt, thm)
38 |
39 | saveRDS(fig, args$rds)
40 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm")
41 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-boxplot.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "n", stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d),
6 | # pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(
14 | reftyp == wcs$reftyp,
15 | stat1d == wcs$stat1d) %>%
16 | .filter_res() %>%
17 | # scale values b/w 0 and 1 for visualization
18 | group_by(metric) %>%
19 | mutate(stat = stat/max(stat, na.rm = TRUE))
20 |
21 | pal <- .methods_pal[levels(df$method)]
22 | lab <- parse(text = paste(
23 | sep = "~",
24 | sprintf("bold(%s)", LETTERS),
25 | gsub("\\s", "~", names(pal))))
26 |
27 | anno <- df %>%
28 | group_by(method, metric) %>%
29 | summarize_at("stat", median) %>%
30 | mutate(letter = LETTERS[match(method, levels(method))])
31 |
32 | plt <- ggplot(df, aes(
33 | reorder_within(method, stat, metric, median),
34 | stat, col = method, fill = method)) +
35 | facet_wrap(~ metric, nrow = 3, scales = "free_x") +
36 | geom_boxplot(
37 | outlier.size = 0.25, outlier.alpha = 1,
38 | size = 0.25, alpha = 0.25, key_glyph = "point") +
39 | geom_text(data = anno,
40 | size = 1.5, color = "black",
41 | aes(label = letter, y = -0.075)) +
42 | scale_fill_manual(values = pal, labels = lab) +
43 | scale_color_manual(values = pal, labels = lab) +
44 | scale_x_reordered(NULL) +
45 | scale_y_continuous(
46 | paste(ifelse(wcs$stat1d == "ws", "scaled", ""),
47 | .stats1d_lab[wcs$stat1d]),
48 | limits = c(-0.1, 1), n.breaks = 3)
49 |
50 | thm <- theme(
51 | legend.text.align = 0,
52 | legend.title = element_blank(),
53 | axis.text.x = element_blank(),
54 | axis.title.x = element_blank(),
55 | axis.ticks.x = element_blank())
56 |
57 | fig <- .prettify(plt, thm)
58 |
59 | saveRDS(fig, args$rds)
60 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-dimEst.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "n", stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.rds", wcs$reftyp, wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_reftyp-dimEst,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
7 |
8 | suppressPackageStartupMessages({
9 | library(intrinsicDimension)
10 | })
11 |
12 | source(args$fun)
13 | res <- readRDS(args$res)
14 |
15 | df <- res %>%
16 | # keep data of interest
17 | filter(
18 | reftyp == wcs$reftyp,
19 | stat1d == wcs$stat1d) %>%
20 | .filter_res() %>%
21 | # average across groups
22 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>%
23 | # estimate dimensionality for each refset
24 | pivot_wider(
25 | names_from = metric,
26 | values_from = stat) %>%
27 | split(.$refset) %>%
28 | lapply(\(df) {
29 | if (nrow(df) < 3) return(NULL)
30 | ks <- seq(2, nrow(df)-1)
31 | mat <- as.matrix(select(df, any_of(.metrics_lab)))
32 | mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))]
33 | mat[is.na(mat)] <- 1
34 | est <- sapply(ks, maxLikGlobalDimEst, data = mat)
35 | data.frame(dim = unlist(est), k = ks)
36 | }) %>%
37 | bind_rows(.id = "refset")
38 |
39 | plt <- ggplot(df, aes(k, dim, col = refset, fill = refset)) +
40 | geom_point(shape = 21, size = 0.5) +
41 | geom_line(alpha = 0.5, show.legend = FALSE) +
42 | scale_x_continuous(
43 | "number of nearest neighbors (k)",
44 | limits = range(df$k),
45 | breaks = seq(2, 20, 2)) +
46 | scale_y_continuous(
47 | "estimated dimensionality",
48 | limits = range(df$dim),
49 | breaks = seq(0, 10, 2)) +
50 | ggtitle(paste("reftyp:", wcs$reftyp))
51 |
52 | thm <- theme(
53 | legend.title = element_blank(),
54 | panel.grid.major = element_line(color = "grey"))
55 |
56 | fig <- .prettify(plt, thm)
57 |
58 | saveRDS(fig, args$rds)
59 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
60 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-heatmap.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "k", stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.rds", wcs$reftyp, wcs$stat1d),
6 | # pdf = sprintf("stat_1d_by_reftyp-boxplot,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(
14 | reftyp == wcs$reftyp,
15 | stat1d == wcs$stat1d) %>%
16 | .filter_res() %>%
17 | # for each method & summary,
18 | # average across groups & refsets
19 | group_by(metric, method, refset, group) %>% .avg(n = 3) %>%
20 | complete(method, metric, fill = list(stat = NA))
21 |
22 | # order methods by average across metrics
23 | ox <- df %>%
24 | group_by(method) %>%
25 | summarise_at("stat", mean, na.rm = TRUE) %>%
26 | arrange(desc(stat)) %>%
27 | pull("method")
28 |
29 | # order metrics by average across methods
30 | oy <- df %>%
31 | group_by(metric) %>%
32 | summarise_at("stat", mean, na.rm = TRUE) %>%
33 | arrange(desc(stat)) %>%
34 | pull("metric")
35 |
36 | plt <- ggplot(df,
37 | aes(method, metric, fill = stat)) +
38 | geom_tile(col = "white") +
39 | scale_fill_distiller(
40 | .stats1d_lab[wcs$stat1d],
41 | palette = "RdYlBu",
42 | na.value = "grey",
43 | limits = c(0, 1),
44 | breaks = c(0, 1)) +
45 | coord_equal(expand = FALSE) +
46 | scale_x_discrete(limits = rev(ox)) +
47 | scale_y_discrete(limits = rev(oy))
48 |
49 | thm <- theme(
50 | axis.ticks = element_blank(),
51 | axis.title = element_blank(),
52 | panel.border = element_blank(),
53 | axis.text.x = element_text(angle = 45, hjust = 1))
54 |
55 | fig <- .prettify(plt, thm)
56 |
57 | saveRDS(fig, args$rds)
58 | ggsave(args$pdf, fig, width = 8, height = 6, units = "cm")
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-mds.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "b", stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.rds", wcs$reftyp, wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_reftyp-mds,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(
14 | reftyp == wcs$reftyp,
15 | stat1d == wcs$stat1d) %>%
16 | .filter_res() %>%
17 | # average across groups
18 | group_by(refset, reftyp, method, metric, group) %>%
19 | .avg(n = 2)
20 |
21 | mds <- df %>%
22 | pivot_wider(
23 | id_cols = c(refset, method),
24 | names_from = metric,
25 | values_from = stat) %>%
26 | select(any_of(.metrics_lab)) %>%
27 | as.matrix() %>% t() %>%
28 | dist() %>% cmdscale()
29 |
30 | gg <- mds %>%
31 | data.frame(rownames(.)) %>%
32 | set_colnames(c("x", "y", "metric")) %>%
33 | mutate(
34 | type = case_when(
35 | metric %in% .none_metrics ~ "global",
36 | grepl("gene", metric) ~ "gene",
37 | grepl("cell", metric) ~ "cell"),
38 | type = factor(type, c("gene", "cell", "global")),
39 | metric = .metrics_lab[metric])
40 |
41 | plt <- ggplot(gg, aes(x, y,
42 | col = type, fill = type, label = metric)) +
43 | geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) +
44 | geom_text_repel(size = 2, show.legend = FALSE) +
45 | scale_fill_manual(values = c("red", "blue", "green3")) +
46 | scale_color_manual(values = c("red", "blue", "green3")) +
47 | scale_x_continuous(expand = expansion(mult = 0.1)) +
48 | scale_y_continuous(expand = expansion(mult = 0.1)) +
49 | labs(x = "MDS dim. 1", y = "MDS dim. 2") +
50 | coord_fixed()
51 |
52 | thm <- theme(
53 | legend.position = "bottom",
54 | legend.title = element_blank(),
55 | panel.grid.major = element_line(color = "grey"))
56 |
57 | fig <- .prettify(plt, thm)
58 |
59 | saveRDS(fig, args$rds)
60 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm")
61 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_reftyp-pca.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(reftyp = "b", stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.rds", wcs$reftyp, wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_reftyp-pca,%s,%s.pdf", wcs$reftyp, wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(
14 | reftyp == wcs$reftyp,
15 | stat1d == wcs$stat1d) %>%
16 | .filter_res() %>%
17 | # average across groups
18 | group_by(refset, method, metric, group) %>%
19 | .avg(n = 2) %>%
20 | pivot_wider(
21 | names_from = metric,
22 | values_from = stat) %>%
23 | drop_na(any_of(.metrics_lab))
24 |
25 | # do PCA
26 | pc <- df %>%
27 | select(any_of(.metrics_lab)) %>%
28 | as.matrix() %>% prcomp()
29 |
30 | # get coordinates
31 | xy <- pc$x %>%
32 | data.frame() %>%
33 | # add metadata
34 | mutate(
35 | select(df, !any_of(.metrics_lab)),
36 | method = droplevels(method),
37 | reftyp = res$reftyp[match(refset, res$refset)]) %>%
38 | mutate_at("reftyp", factor, c("n", "b", "k"))
39 |
40 | # get loadings
41 | rot <- pc$rotation %>%
42 | data.frame() %>%
43 | mutate(
44 | metric = .metrics_lab[rownames(.)],
45 | metric = factor(metric, metric))
46 | ij <- c("PC1", "PC2")
47 | m1 <- max(abs(xy[, ij]))
48 | m2 <- max(abs(rot[, ij]))
49 | rot[, ij] <- 0.5*rot[, ij]*m1/m2
50 |
51 | # get percentages
52 | var <- prop.table(pc$sdev^2)
53 | var_lab <- round(100*var, 1)
54 |
55 | # plotting
56 | lab <- xy %>%
57 | group_by(method) %>%
58 | select(starts_with("PC")) %>%
59 | summarise_all(mean, .groups = "drop")
60 |
61 | p0 <- ggplot(xy, aes(PC1, PC2)) +
62 | geom_vline(xintercept = 0, size = 0.1) +
63 | geom_hline(yintercept = 0, size = 0.1) +
64 | coord_fixed() + labs(
65 | x = sprintf("PC1 (%s%%)", var_lab[1]),
66 | y = sprintf("PC2 (%s%%)", var_lab[2])) +
67 | scale_x_continuous(
68 | limits = range(xy$PC1),
69 | expand = expansion(mult = 0.2)) +
70 | scale_y_continuous(
71 | limits = range(xy$PC2),
72 | expand = expansion(mult = 0.2))
73 |
74 | p1 <- p0 +
75 | geom_point(
76 | aes(fill = method), shape = 21,
77 | size = 2, stroke = 0, alpha = 0.4) +
78 | geom_point(data = lab,
79 | aes(fill = method), shape = 21,
80 | size = 3, stroke = 0, alpha = 0.8) +
81 | geom_label_repel(data = lab,
82 | aes(label = method, col = method),
83 | label.padding = unit(0.75, "mm"),
84 | size = 2, fontface = "bold", show.legend = FALSE) +
85 | scale_fill_manual(values = .methods_pal[levels(xy$method)]) +
86 | scale_color_manual(values = .methods_pal[levels(xy$method)])
87 |
88 | p2 <- p0 +
89 | geom_segment(data = rot,
90 | aes(0, 0, xend = PC1, yend = PC2, col = metric),
91 | size = 0.5, arrow = arrow(length = unit(1, "mm"))) +
92 | geom_label_repel(data = rot,
93 | aes(label = metric, col = metric),
94 | label.padding = unit(0.75, "mm"),
95 | size = 2, fontface = "bold", show.legend = FALSE) +
96 | scale_color_manual(values = .metrics_pal[levels(rot$metric)])
97 |
98 | thm <- theme(
99 | legend.justification = c(0, 0.5),
100 | panel.grid.major = element_line(size = 0.1, color = "grey"))
101 |
102 | f1 <- .prettify(p1, thm)
103 | f2 <- .prettify(p2, thm)
104 | f2$guides$colour$override.aes$size <- 0.5
105 |
106 | fig <- f1 / f2 +
107 | plot_annotation(tag_levels = "a") &
108 | theme(plot.tag = element_text(size = 9, face = "bold"))
109 |
110 | saveRDS(fig, args$rds)
111 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm")
112 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-boxplot_by_method.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-boxplot,%s.pdf", wcs$stat2d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # average across groups
16 | group_by(refset, method, metric) %>%
17 | .avg(n = 1)
18 |
19 | # average statistics across refsets & summaries
20 | # statistic across all summaries
21 | o <- df %>%
22 | group_by(method, refset, metric) %>%
23 | .avg(n = 3) %>%
24 | # order methods (panels) by average
25 | arrange(stat) %>%
26 | pull("method")
27 | df <- df %>% mutate_at("method", factor, o)
28 |
29 | pal <- .metrics_pal[levels(df$metric)]
30 | lab <- parse(text = paste(
31 | sep = "~",
32 | sprintf("bold(%s)", LETTERS),
33 | gsub("\\s", "~", names(pal))))
34 |
35 | anno <- df %>%
36 | group_by(method, metric) %>%
37 | summarize_at("stat", median) %>%
38 | mutate(letter = LETTERS[match(metric, levels(metric))])
39 |
40 | plt <- ggplot(df, aes(
41 | reorder_within(metric, stat, method, median),
42 | stat, col = metric, fill = metric)) +
43 | facet_wrap(~ method, nrow = 4, scales = "free_x") +
44 | geom_boxplot(
45 | size = 0.25, outlier.size = 0.25,
46 | alpha = 0.25, key_glyph = "point") +
47 | geom_text(data = anno,
48 | size = 1.5, color = "black",
49 | aes(label = letter, y = -0.075)) +
50 | scale_fill_manual(values = pal, labels = lab) +
51 | scale_color_manual(values = pal, labels = lab) +
52 | scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) +
53 | labs(x = NULL, y = .stats1d_lab[wcs$stat1d])
54 |
55 | thm <- theme(
56 | legend.text.align = 0,
57 | axis.text.x = element_blank(),
58 | axis.ticks.x = element_blank())
59 |
60 | fig <- .prettify(plt, thm)
61 |
62 | saveRDS(fig, args$rds)
63 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
64 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-boxplot_by_metric.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # rds = "plts/stat_1d_by_reftyp-boxplot.rds",
5 | # pdf = "plts/stat_1d_by_reftyp-boxplot.pdf",
6 | # res = list.files("outs", paste0("^stat_1d.*", wcs$stat1d, "\\."), full.names = TRUE))
7 | # args$res <- sample(args$res, 100)
8 |
9 | source(args$fun)
10 | res <- readRDS(args$res)
11 |
12 | df <- res %>%
13 | # keep data of interest
14 | filter(stat1d == wcs$stat1d) %>%
15 | .filter_res() %>%
16 | # average across groups
17 | group_by(refset, method, metric, group) %>% .avg(n = 1)
18 |
19 | pal <- .methods_pal[levels(df$method)]
20 | lab <- parse(text = paste(
21 | sep = "~",
22 | sprintf("bold(%s)", LETTERS),
23 | gsub("\\s", "~", names(pal))))
24 |
25 | anno <- df %>%
26 | group_by(method, metric) %>%
27 | summarize_at("stat", median) %>%
28 | mutate(letter = LETTERS[match(method, levels(method))])
29 |
30 | plt <- ggplot(df, aes(
31 | reorder_within(method, stat, metric, median),
32 | stat, col = method, fill = method)) +
33 | facet_wrap(~ metric, nrow = 3, scales = "free_x") +
34 | geom_boxplot(
35 | outlier.size = 0.25, outlier.alpha = 1,
36 | size = 0.25, alpha = 0.25, key_glyph = "point") +
37 | geom_text(data = anno,
38 | size = 1.5, color = "black",
39 | aes(label = letter, y = -0.075)) +
40 | scale_fill_manual(values = pal, labels = lab) +
41 | scale_color_manual(values = pal, labels = lab) +
42 | scale_y_continuous(limits = c(-0.1, NA), n.breaks = 3) +
43 | labs(x = NULL, y = .stats1d_lab[wcs$stat1d])
44 |
45 | thm <- theme(
46 | legend.text.align = 0,
47 | axis.text.x = element_blank(),
48 | axis.ticks.x = element_blank())
49 |
50 | fig <- .prettify(plt, thm)
51 |
52 | saveRDS(fig, args$rds)
53 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
54 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations,%s.pdf", wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | mat <- res %>%
12 | # keep data of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # average across groups & subsets
16 | group_by(method, metric, refset, group) %>%
17 | .avg(n = 3) %>%
18 | # correlate b/w summaries, across methods & refsets
19 | pivot_wider(
20 | names_from = metric,
21 | values_from = stat) %>%
22 | select(any_of(.metrics_lab)) %>%
23 | cor(method = "spearman",
24 | use = "pairwise.complete.obs")
25 |
26 | foo <- mat
27 | foo[is.na(foo)] <- 1
28 | xo <- rownames(mat)[hclust(dist(foo))$order]
29 | yo <- rownames(mat)[hclust(dist(t(foo)))$order]
30 |
31 | df <- mat %>%
32 | data.frame(from = rownames(.)) %>%
33 | pivot_longer(-from, names_to = "to") %>%
34 | complete(from, to, fill = list(value = NA))
35 |
36 | plt <- ggplot(df, aes(from, to, fill = value)) +
37 | geom_tile() +
38 | scale_fill_distiller(
39 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
40 | palette = "RdYlBu",
41 | limits = c(-1, 1),
42 | n.breaks = 3) +
43 | coord_equal(expand = FALSE) +
44 | scale_x_discrete(limits = rev(xo), labels = .metrics_lab) +
45 | scale_y_discrete(limits = yo, labels = .metrics_lab)
46 |
47 | thm <- theme(
48 | axis.title = element_blank(),
49 | axis.text.x = element_text(angle = 45, hjust = 1))
50 |
51 | fig <- .prettify(plt, thm)
52 |
53 | saveRDS(fig, args$rds)
54 | ggsave(args$pdf, fig, width = 9, height = 7.5, units = "cm")
55 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations_by_method.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_method,%s.pdf", wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | mat <- res %>%
12 | # keep data of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # for each method, method & summary, average across groups
16 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>%
17 | # for each method, correlate b/w summaries & across refsets
18 | split(.$method) %>%
19 | lapply(\(.)
20 | pivot_wider(.,
21 | names_from = metric,
22 | values_from = stat) %>%
23 | select(any_of(.metrics_lab)) %>%
24 | cor(method = "spearman",
25 | use = "pairwise.complete.obs"))
26 |
27 | df <- lapply(mat, \(u) u %>%
28 | data.frame(from = rownames(.)) %>%
29 | pivot_longer(-from, names_to = "to")) %>%
30 | bind_rows(.id = "method") %>%
31 | complete(from, to, method, fill = list(value = NA))
32 |
33 | foo <- df %>%
34 | group_by(from, to) %>%
35 | summarise(value = mean(value), .groups = "drop") %>%
36 | pivot_wider(names_from = to, values_from = value) %>%
37 | select(-from)
38 | foo[is.na(foo)] <- 1
39 | o <- colnames(foo)[hclust(dist(as.matrix(foo)))$order]
40 |
41 | plt <- ggplot(df, aes(from, to, fill = value)) +
42 | facet_wrap(~ method, nrow = 4) +
43 | scale_fill_distiller(
44 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
45 | palette = "RdYlBu",
46 | na.value = "grey",
47 | limits = c(-1, 1),
48 | n.breaks = 3) +
49 | scale_x_discrete(limits = rev(o), labels = .metrics_lab) +
50 | scale_y_discrete(limits = o, labels = .metrics_lab) +
51 | coord_equal(expand = FALSE) +
52 | geom_tile()
53 |
54 | thm <- theme(
55 | axis.title = element_blank(),
56 | axis.text.x = element_text(angle = 45, hjust = 1))
57 |
58 | fig <- .prettify(plt, thm)
59 |
60 | saveRDS(fig, args$rds)
61 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
62 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-correlations_by_metric.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-correlations_by_metric,%s.pdf", wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | mat <- res %>%
12 | # keep method of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # for each method, method & summary, average across groups & refsets
16 | group_by(refset, method, metric, group) %>% .avg(n = 1) %>%
17 | # for each method, correlate b/w summaries & across refsets
18 | split(.$metric) %>%
19 | lapply(\(.)
20 | pivot_wider(.,
21 | names_from = method,
22 | values_from = stat) %>%
23 | select(any_of(names(.methods_pal))) %>%
24 | cor(method = "spearman",
25 | use = "pairwise.complete.obs"))
26 |
27 | df <- lapply(mat, \(u) u %>%
28 | data.frame(from = rownames(.)) %>%
29 | pivot_longer(-from, names_to = "to")) %>%
30 | bind_rows(.id = "metric") %>%
31 | complete(from, to, metric, fill = list(value = NA))
32 |
33 | foo <- df %>%
34 | group_by(from, to) %>%
35 | summarise(value = mean(value), .groups = "drop") %>%
36 | pivot_wider(names_from = to, values_from = value) %>%
37 | select(-from) %>%
38 | as.matrix()
39 | foo[is.na(foo)] <- 1
40 | o <- colnames(foo)[hclust(dist(foo))$order]
41 |
42 | plt <- ggplot(df, aes(from, to, fill = value)) +
43 | facet_wrap(~ metric, nrow = 2) +
44 | scale_fill_distiller(
45 | bquote("r("*.(.stats1d_lab[wcs$stat1d])*")"),
46 | palette = "RdYlBu",
47 | na.value = "grey",
48 | limits = c(-1, 1),
49 | n.breaks = 3) +
50 | scale_x_discrete(limits = rev(o), labels = names(.methods_pal)) +
51 | scale_y_discrete(limits = o, labels = names(.methods_pal)) +
52 | coord_equal(expand = FALSE) +
53 | geom_tile()
54 |
55 | thm <- theme(
56 | axis.title = element_blank(),
57 | axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
58 |
59 | fig <- .prettify(plt, thm)
60 |
61 | saveRDS(fig, args$rds)
62 | ggsave(args$pdf, fig, width = 24, height = 8.5, units = "cm")
63 |
64 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-dimEst.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-dimEst,%s.pdf", wcs$stat1d))
7 |
8 | suppressPackageStartupMessages({
9 | library(intrinsicDimension)
10 | })
11 |
12 | source(args$fun)
13 | res <- readRDS(args$res)
14 |
15 | df <- res %>%
16 | # keep data of interest
17 | filter(stat1d == wcs$stat1d) %>%
18 | .filter_res() %>%
19 | # average across groups
20 | group_by(refset, reftyp, method, metric, group) %>% .avg(n = 1) %>%
21 | # estimate dimensionality for each refset
22 | pivot_wider(
23 | names_from = metric,
24 | values_from = stat) %>%
25 | split(.$refset) %>%
26 | lapply(\(df) {
27 | if (nrow(df) < 3) return(NULL)
28 | ks <- seq(2, nrow(df)-1)
29 | mat <- as.matrix(select(df, any_of(.metrics_lab)))
30 | mat <- mat[, apply(mat, 2, \(.) !all(is.na(.)))]
31 | mat[is.na(mat)] <- 1
32 | est <- sapply(ks, maxLikGlobalDimEst, data = mat)
33 | data.frame(dim = unlist(est), k = ks)
34 | }) %>%
35 | bind_rows(.id = "refset") %>%
36 | mutate(type = res$reftyp[match(refset, res$refset)])
37 |
38 | plt <- ggplot(df, aes(factor(k), dim, col = type, fill = type)) +
39 | geom_boxplot(
40 | size = 0.25, outlier.size = 0.25,
41 | alpha = 0.25, key_glyph = "point") +
42 | scale_x_discrete(
43 | "number of nearest neighbors (k)",
44 | breaks = seq(0, 20, 2)) +
45 | scale_y_continuous(
46 | "estimated dimensionality",
47 | limits = range(df$dim),
48 | breaks = seq(0, 10, 2))
49 |
50 | thm <- theme(panel.grid.major = element_line(color = "grey"))
51 |
52 | fig <- .prettify(plt, thm)
53 |
54 | saveRDS(fig, args$rds)
55 | ggsave(args$pdf, fig, width = 9, height = 6, units = "cm")
56 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-mds.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-mds,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-mds,%s.pdf", wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # average across groups & refsets
16 | group_by(method, metric, refset, group) %>%
17 | .avg(n = 3) %>%
18 | pivot_wider(
19 | names_from = metric,
20 | values_from = stat) %>%
21 | rowwise() %>%
22 | drop_na(any_of(.metrics_lab)) %>%
23 | select(any_of(.metrics_lab)) %>%
24 | as.matrix() %>%
25 | t() %>%
26 | dist() %>%
27 | cmdscale() %>%
28 | data.frame(rownames(.)) %>%
29 | set_colnames(c("x", "y", "metric")) %>%
30 | mutate(
31 | type = case_when(
32 | metric %in% .none_metrics ~ "global",
33 | grepl("gene", metric) ~ "gene",
34 | grepl("cell", metric) ~ "cell"),
35 | metric = .metrics_lab[metric])
36 |
37 | plt <- ggplot(df, aes(x, y,
38 | col = type, fill = type, label = metric)) +
39 | geom_point(size = 2, shape = 21, col = "black", alpha = 0.5) +
40 | geom_text_repel(size = 2, show.legend = FALSE) +
41 | scale_fill_manual(values = c("red", "blue", "green3")) +
42 | scale_color_manual(values = c("red", "blue", "green3")) +
43 | scale_x_continuous(expand = expansion(mult = 0.1)) +
44 | scale_y_continuous(expand = expansion(mult = 0.1)) +
45 | labs(x = "MDS dim. 1", y = "MDS dim. 2") +
46 | coord_fixed()
47 |
48 | thm <- theme(
49 | legend.position = "bottom",
50 | legend.title = element_blank(),
51 | panel.grid.major = element_line(color = "grey"))
52 |
53 | fig <- .prettify(plt, thm)
54 |
55 | saveRDS(fig, args$rds)
56 | ggsave(args$pdf, fig, width = 12, height = 8, units = "cm")
57 |
--------------------------------------------------------------------------------
/code/07-plot_stat_1d_by_stat1d-pca.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat1d = "ks")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_1d.rds",
5 | # rds = sprintf("plts/stat_1d_by_stat1d-pca,%s.rds", wcs$stat1d),
6 | # pdf = sprintf("plts/stat_1d_by_stat1d-pca,%s.pdf", wcs$stat1d))
7 |
8 | source(args$fun)
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | # keep data of interest
13 | filter(stat1d == wcs$stat1d) %>%
14 | .filter_res() %>%
15 | # average across groups & refsets
16 | group_by(method, metric, refset, group) %>%
17 | .avg(n = 3) %>%
18 | pivot_wider(
19 | names_from = metric,
20 | values_from = stat) %>%
21 | rowwise() %>%
22 | drop_na(any_of(.metrics_lab))
23 |
24 | # do PCA
25 | pc <- df %>%
26 | select(any_of(.metrics_lab)) %>%
27 | as.matrix() %>% prcomp()
28 |
29 | # get coordinates
30 | xy <- pc$x %>%
31 | data.frame() %>%
32 | # add metadata
33 | mutate(
34 | select(df, !any_of(.metrics_lab)),
35 | method = droplevels(method))
36 |
37 | # get loadings
38 | rot <- pc$rotation %>%
39 | data.frame() %>%
40 | mutate(
41 | metric = .metrics_lab[rownames(.)],
42 | metric = factor(metric, metric))
43 | ij <- c("PC1", "PC2")
44 | m1 <- max(abs(xy[, ij]))
45 | m2 <- max(abs(rot[, ij]))
46 | rot[, ij] <- 0.5*rot[, ij]*m1/m2
47 |
48 | # get percentages
49 | var <- prop.table(pc$sdev^2)
50 | var_lab <- round(100*var, 1)
51 |
52 | p0 <- ggplot(xy, aes(PC1, PC2)) +
53 | geom_vline(xintercept = 0, size = 0.1) +
54 | geom_hline(yintercept = 0, size = 0.1) +
55 | coord_fixed() + labs(
56 | x = sprintf("PC1 (%s%%)", var_lab[1]),
57 | y = sprintf("PC2 (%s%%)", var_lab[2])) +
58 | scale_x_continuous(
59 | limits = range(xy$PC1),
60 | expand = expansion(mult = 0.2)) +
61 | scale_y_continuous(
62 | limits = range(xy$PC2),
63 | expand = expansion(mult = 0.2))
64 |
65 | p1 <- p0 +
66 | geom_point(
67 | aes(fill = method), shape = 21,
68 | size = 3, stroke = 0, alpha = 0.8) +
69 | geom_label_repel(
70 | aes(label = method, col = method),
71 | label.padding = unit(0.75, "mm"),
72 | size = 2, fontface = "bold", show.legend = FALSE) +
73 | scale_fill_manual(values = .methods_pal[levels(xy$method)]) +
74 | scale_color_manual(values = .methods_pal[levels(xy$method)])
75 |
76 | p2 <- p0 +
77 | geom_segment(data = rot,
78 | aes(0, 0, xend = PC1, yend = PC2, col = metric),
79 | size = 0.5, arrow = arrow(length = unit(1, "mm"))) +
80 | geom_label_repel(data = rot,
81 | aes(label = metric, col = metric),
82 | label.padding = unit(0.75, "mm"),
83 | size = 2, fontface = "bold", show.legend = FALSE) +
84 | scale_color_manual(values = .metrics_pal[levels(rot$metric)])
85 |
86 | thm <- theme(
87 | legend.justification = c(0, 0.5),
88 | panel.grid.major = element_line(size = 0.1, color = "grey"))
89 |
90 | f1 <- .prettify(p1, thm)
91 | f2 <- .prettify(p2, thm)
92 | f2$guides$colour$override.aes$size <- 0.5
93 |
94 | fig <- f1 / f2 +
95 | plot_annotation(tag_levels = "a") &
96 | theme(plot.tag = element_text(size = 9, face = "bold"))
97 |
98 | saveRDS(fig, args$rds)
99 | ggsave(args$pdf, fig, width = 16, height = 18, units = "cm")
100 |
--------------------------------------------------------------------------------
/code/07-plot_stat_2d-scatters.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # fun = "code/utils.R",
3 | # res = "outs/obj-stat_2d.rds",
4 | # rds = "plts/stat_2d-scatters.rds",
5 | # pdf = "plts/stat_2d-scatters.pdf")
6 |
7 | source(args$fun)
8 |
9 | res <- readRDS(args$res)
10 |
11 | df <- res %>%
12 | mutate(
13 | refset = paste(datset, subset, sep = ","),
14 | metrics = paste(metric1, metric2, sep = "\n")) %>%
15 | select(-c(datset, subset, metric1, metric2)) %>%
16 | # for type != n, keep batch-/cluster-level comparisons only
17 | group_by(stat2d, refset, method, metrics) %>%
18 | mutate(n = n()) %>%
19 | filter(group != "global" | n == 1) %>%
20 | select(-c(group, n)) %>%
21 | # for each statistic & metric, re-scale b/w 0 & 1
22 | group_by(stat2d, metrics) %>%
23 | mutate(stat = stat/max(stat, na.rm = TRUE)) %>%
24 | pivot_wider(names_from = stat2d, values_from = stat) %>%
25 | filter(!is.na(ks2), !is.na(emd))
26 |
27 | plt <- ggplot(df, aes(ks2, emd, col = refset, fill = refset)) +
28 | facet_wrap(~ metrics, nrow = 1) +
29 | stat_cor(
30 | aes(group = metrics), method = "spearman",
31 | label.x = 0, label.y = 1, vjust = 1, size = 1.5) +
32 | geom_point(size = 0.2, alpha = 0.4, show.legend = FALSE) +
33 | scale_x_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
34 | scale_y_sqrt(limits = c(0, 1), breaks = seq(0.2, 1, 0.2)) +
35 | labs(x = "KS statistics", y = "scaled EMD")
36 |
37 | thm <- theme(aspect.ratio = 1)
38 |
39 | fig <- .prettify(plt, thm)
40 |
41 | saveRDS(fig, args$rds)
42 | ggsave(args$pdf, fig, width = 16, height = 7.5, units = "cm")
43 |
--------------------------------------------------------------------------------
/code/07-plot_stat_2d_by_reftyp-boxplot.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat2d = "emd", reftyp = "n")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_2d.rds",
5 | # rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds",
6 | # pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf")
7 |
8 | source(args$fun)
9 | res <- .read_res(args$res)
10 |
11 | df <- res %>%
12 | mutate(metrics = paste(metric1, metric2, sep = "\n")) %>%
13 | # keep statistic of interest
14 | filter(stat2d == wcs$stat2d) %>%
15 | # keep group-level comparisons only
16 | { if (wcs$reftyp == "n") . else
17 | mutate(., across(
18 | c(group, id),
19 | as.character)) %>%
20 | filter(group != id) } %>%
21 | # scale values b/w 0 and 1 for visualization
22 | group_by(metrics) %>%
23 | mutate(stat = stat/max(stat, na.rm = TRUE))
24 |
25 | pal <- .methods_pal[levels(df$method)]
26 | lab <- parse(text = paste(
27 | sep = "~",
28 | sprintf("bold(%s)", LETTERS),
29 | gsub("\\s", "~", names(pal))))
30 |
31 | anno <- df %>%
32 | group_by(method, metrics) %>%
33 | summarize_at("stat", median) %>%
34 | mutate(letter = LETTERS[match(method, levels(method))])
35 |
36 | plt <- ggplot(df, aes(
37 | reorder_within(method, stat, metrics, median),
38 | stat, col = method, fill = method)) +
39 | facet_wrap(~ metrics, nrow = 3, scales = "free_x") +
40 | geom_boxplot(
41 | outlier.size = 0.25, outlier.alpha = 1,
42 | size = 0.25, alpha = 0.25, key_glyph = "point") +
43 | geom_text(data = anno,
44 | size = 1.5, color = "black",
45 | aes(label = letter, y = -0.075)) +
46 | scale_fill_manual(values = pal, labels = lab) +
47 | scale_color_manual(values = pal, labels = lab) +
48 | scale_x_reordered(NULL) +
49 | scale_y_continuous(
50 | paste(ifelse(wcs$stat2d == "emd", "scaled", ""),
51 | .stats2d_lab[wcs$stat2d]),
52 | limits = c(-0.1, 1), n.breaks = 3)
53 |
54 | thm <- theme(
55 | legend.text.align = 0,
56 | legend.title = element_blank(),
57 | axis.text.x = element_blank(),
58 | axis.title.x = element_blank(),
59 | axis.ticks.x = element_blank())
60 |
61 | fig <- .prettify(plt, thm)
62 |
63 | saveRDS(fig, args$rds)
64 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
65 |
--------------------------------------------------------------------------------
/code/07-plot_stat_2d_by_reftyp-heatmap.R:
--------------------------------------------------------------------------------
1 | # wcs <- list(stat2d = "emd", reftyp = "b")
2 | # args <- list(
3 | # fun = "code/utils-plotting.R",
4 | # res = "outs/obj-stat_2d.rds",
5 | # rds = "plts/stat_2d_by_reftyp-boxplot,b,ks2.rds",
6 | # pdf = "plts/stat_2d_by_reftyp-boxplot,b,ks2.pdf")
7 |
8 | source(args$fun)
9 | res <- .read_res(args$res)
10 |
11 | df <- res %>%
12 | mutate(metrics = paste(metric1, metric2, sep = "\n")) %>%
13 | # except for KNN & global summaries,
14 | # keep group-level comparisons only
15 | { if (wcs$reftyp == "n") . else
16 | mutate(., across(
17 | c(group, id),
18 | as.character)) %>%
19 | filter(
20 | (!grepl("KNN", metrics) & group != id) |
21 | (grepl("KNN", metrics) & group == "global") |
22 | any(c(metric1, metric2) %in% .metrics_lab[.none_metrics])) } %>%
23 | # aggregation
24 | group_by(method, metrics, group, datset, subset) %>%
25 | summarize_at("stat", mean, na.rm = TRUE) %>% # average across groups
26 | summarize_at("stat", mean, na.rm = TRUE) %>% # average across subsets
27 | summarize_at("stat", mean, na.rm = TRUE) # average across datsets
28 |
29 | # order methods by average across metrics
30 | ox <- df %>%
31 | group_by(method) %>%
32 | summarise_at("stat", mean) %>%
33 | arrange(desc(stat)) %>%
34 | pull("method")
35 |
36 | # order metrics by average across methods
37 | oy <- df %>%
38 | group_by(metrics) %>%
39 | summarise_at("stat", mean) %>%
40 | arrange(desc(stat)) %>%
41 | pull("metrics")
42 |
43 | df <- complete(df, method, metrics, fill = list(stat = NA))
44 |
45 | plt <- ggplot(df,
46 | aes(method, metrics, fill = stat)) +
47 | geom_tile(col = "white") +
48 | scale_fill_distiller(
49 | .stats2d_lab[wcs$stat2d],
50 | palette = "RdYlBu",
51 | na.value = "grey",
52 | limits = c(0, 1),
53 | breaks = c(0, 1)) +
54 | coord_equal(3/2, expand = FALSE) +
55 | scale_x_discrete(limits = rev(ox)) +
56 | scale_y_discrete(limits = rev(oy))
57 |
58 | thm <- theme(
59 | axis.ticks = element_blank(),
60 | axis.title = element_blank(),
61 | panel.border = element_blank(),
62 | axis.text.x = element_text(angle = 45, hjust = 1))
63 |
64 | fig <- .prettify(plt, thm)
65 |
66 | saveRDS(fig, args$rds)
67 | ggsave(args$pdf, fig, width = 7.5, height = 6, units = "cm")
68 |
--------------------------------------------------------------------------------
/code/08-fig_boxplots.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # rds = c(
4 | # "plts/stat_1d_by_stat1d-boxplot_by_metric,ks.rds",
5 | # "plts/stat_1d_by_stat1d-boxplot_by_method,ks.rds"),
6 | # pdf = "figs/boxplots.pdf")
7 |
8 | source(args$uts)
9 |
10 | ps <- lapply(args$rds, readRDS)
11 |
12 | fig <- wrap_plots(ps, ncol = 1, heights = c(3, 4)) +
13 | plot_annotation(tag_levels = "a") &
14 | theme(
15 | plot.margin = margin(),
16 | panel.spacing = unit(1, "mm"),
17 | plot.tag = element_text(face = "bold", size = 9))
18 |
19 | ggsave(args$pdf, fig, width = 16, height = 14, units = "cm")
20 |
--------------------------------------------------------------------------------
/code/08-fig_clustering.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # rds = file.path("plts", c(
3 | # "clust-boxplot_by_method.rds",
4 | # "clust-boxplot_dF1.rds",
5 | # "clust-heatmap_by_method.rds",
6 | # "clust-correlations.rds")),
7 | # uts = "code/utils-plotting.R",
8 | # pdf = "figs/clustering.pdf")
9 |
10 | source(args$uts)
11 |
12 | ps <- lapply(args$rds, readRDS)
13 |
14 | # fix methods order according to ref-sim
15 | # difference for boxplots & heatmaps
16 | o <- c("ref", ggplot_build(ps[[2]])$layout$panel_params[[1]]$x$breaks)
17 | ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o)
18 | ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o)
19 |
20 | ps[[3]]$layers[[1]]$aes_params$colour <- NA
21 |
22 | ps[[3]] <- ps[[3]] + theme(
23 | axis.text.y = element_text(size = 3, angle = 60, hjust = 1),
24 | axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5))
25 |
26 | ps[[4]] <- ps[[4]] + theme(
27 | legend.background = element_blank())
28 |
29 | ps <- lapply(ps, \(.) wrap_elements(full = .))
30 |
31 | ws <- c(3, 1)
32 |
33 | fig <-
34 | ((ps[[1]] + ps[[2]]) + plot_layout(widths = ws)) /
35 | ((ps[[3]] + ps[[4]]) + plot_layout(widths = ws)) +
36 | plot_annotation(tag_levels = "a") &
37 | theme(
38 | plot.margin = margin(),
39 | plot.tag = element_text(face = "bold", size = 9))
40 |
41 | ggsave(args$pdf, fig, width = 16, height = 9, units = "cm")
42 |
--------------------------------------------------------------------------------
/code/08-fig_heatmaps.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # pdf = "figs/heatmaps.pdf",
3 | # uts = "code/utils-plotting.R",
4 | # rds = list.files("plts", "stat_(1|2)d_by_reftyp-heatmap.*ks2?\\.rds", full.names = TRUE))
5 |
6 | source(args$uts)
7 | ps <- lapply(fns <- args$rds, readRDS)
8 |
9 | # re-order by dimension & type
10 | dim <- gsub(".*(1|2)d.*", "\\1", fns)
11 | typ <- gsub(".*,(n|b|k),ks.*", "\\1", fns)
12 | names(ps) <- paste0(typ, dim)
13 | ps <- ps[paste0(
14 | c("n", "b", "k"),
15 | rep(c(1, 2), each = 3))]
16 |
17 | # assure global metrics are
18 | # included across all panels
19 |
20 | ps[1:3] <- lapply(ps[1:3], \(.) {
21 | y <- .$data$metric
22 | y <- factor(y, .metrics_lab)
23 | .$data$metric <- y
24 | return(.)
25 | })
26 |
27 | # fix y-axis order across panels and add rectangles
28 | # highlighting gene-/cell-level & global summaries
29 |
30 | s <- 0.50 # border size
31 | d <- 0.00 # offset b/w boxes
32 |
33 | ps[1:3] <- lapply(ps[1:3], \(p)
34 | p + scale_y_discrete(limits = rev(.metrics_lab)) +
35 | geom_rect(
36 | xmin = 0.5 + d,
37 | xmax = length(unique(p$data$method)) + 0.5 - d,
38 | ymax = length(.metrics_lab) + 0.5 - d,
39 | ymin = length(.none_metrics) + length(.cell_metrics) + 0.5 + d,
40 | size = s, fill = NA, col = "red") +
41 | geom_rect(
42 | xmin = 0.5 + d,
43 | xmax = length(unique(p$data$method)) + 0.5 - d,
44 | ymin = length(.none_metrics) + 0.5 + d,
45 | ymax = length(.none_metrics) + length(.cell_metrics) + 0.5 - d,
46 | size = s, fill = NA, col = "blue") +
47 | geom_rect(
48 | xmin = 0.5 + d,
49 | xmax = length(unique(p$data$method)) + 0.5 - d,
50 | ymin = 0.5 - d,
51 | ymax = length(.none_metrics) + 0.5 - d,
52 | size = s, fill = NA, col = "green3"))
53 |
54 | # get x-axis (methods) ordering
55 | xo <- lapply(ps[1:3], \(p) ggplot_build(p)$layout$panel_scales_x[[1]]$limits)
56 |
57 | .metric_pairs <- unique(ps[[5]]$data$metrics)
58 | .metric_pairs <- .metric_pairs[c(3, 2, 1, 9, 8, 6, 5, 4, 7)]
59 |
60 | ps[4:6] <- lapply(seq_along(ps[4:6]), \(i) {
61 | p <- ps[4:6][[i]]
62 | p + scale_x_discrete(limits = xo[[i]]) +
63 | scale_y_discrete(limits = rev(.metric_pairs)) +
64 | geom_rect(
65 | xmin = 0.5 - d,
66 | xmax = length(unique(p$data$method)) + 0.5 - d,
67 | ymax = length(.metric_pairs) + 0.5 - d,
68 | ymin = 3.5 + d,
69 | size = s, fill = NA, col = "red") +
70 | geom_rect(
71 | xmin = 0.5 - d,
72 | xmax = length(unique(p$data$method)) + 0.5 - d,
73 | ymin = 0.5 + d,
74 | ymax = 3.5 - d,
75 | size = s, fill = NA, col = "blue")
76 | })
77 |
78 | # drop y-axis labels from all but left-most panels
79 | ps[-c(1, 4)] <- lapply(ps[-c(1, 4)], "+",
80 | theme(axis.text.y = element_blank()))
81 |
82 | # re-size relative to number of
83 | # rows (metrics) & columns (methods)
84 | hs <- c(
85 | length(unique(ps[[1]]$data$metric)),
86 | 1.5 + length(unique(ps[[5]]$data$metrics)))
87 | ws <- sapply(ps[1:3], \(.) nlevels(.$data$method))
88 |
89 | x <- c("gene-level" = "red", "cell-level" = "blue", "global" = "green3")
90 | ps[[1]] <- ps[[1]] +
91 | geom_point(
92 | inherit.aes = FALSE,
93 | data = data.frame(x),
94 | aes_string(
95 | ps[[1]]$data$method[1],
96 | ps[[1]]$data$metric[1],
97 | col = "x"), alpha = 0) +
98 | scale_color_manual("type of\nsummary", values = x) +
99 | guides(color = guide_legend(order = 1, override.aes = list(alpha = 1)))
100 |
101 | fig <- wrap_plots(ps,
102 | nrow = 2, heights = hs, widths = ws) +
103 | plot_layout(guides = "collect") +
104 | plot_annotation(tag_levels = "a") &
105 | coord_cartesian() &
106 | theme(
107 | axis.text.x = element_text(angle = 30),
108 | plot.margin = margin(l = 2, unit = "mm"),
109 | plot.tag = element_text(face = "bold", size = 9))
110 |
111 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")
112 |
--------------------------------------------------------------------------------
/code/08-fig_integration.R:
--------------------------------------------------------------------------------
1 | # val <- rep(c("cms", "ldf", "bcs"), each = 4)
2 | # args <- list(
3 | # uts = "code/utils-plotting.R",
4 | # pdf = "figs/integration.pdf",
5 | # rds = sprintf(c(
6 | # "plts/batch-boxplot_by_method_%s.rds",
7 | # "plts/batch-boxplot_dX_%s.rds",
8 | # "plts/batch-heatmap_by_method_%s.rds",
9 | # "plts/batch-correlations_%s.rds"), val))
10 |
11 | source(args$uts)
12 | ps <- lapply(args$rds, readRDS)
13 |
14 | # split plots by value
15 | pat <- ".*_(.*)\\.rds$"
16 | lys <- split(ps, gsub(pat, "\\1", args$rds))
17 |
18 | # fix methods order according to ref-sim
19 | # difference for boxplots & heatmaps
20 | o <- c("ref", ggplot_build(lys[[1]][[2]])$layout$panel_params[[1]]$x$breaks)
21 | lys <- lapply(lys, \(ps) {
22 | ps[[1]]$data$sim_method <- factor(ps[[1]]$data$sim_method, o)
23 | ps[[3]]$data$sim_method <- factor(ps[[3]]$data$sim_method, o)
24 | return(ps)
25 | })
26 |
27 | # generate separate figure for each
28 | lys <- lapply(lys, \(p) {
29 | p[[3]]$layers[[1]]$aes_params$colour <- NA
30 | p[[3]] <- p[[3]] + theme(
31 | axis.text.y = element_text(size = 3, angle = 60, hjust = 1),
32 | axis.text.x = element_text(size = 3, angle = 90, vjust = 0.5))
33 | p[[4]] <- p[[4]] + theme(
34 | legend.background = element_blank())
35 | p <- lapply(p, \(.) wrap_elements(full = .))
36 | ws <- c(3, 1)
37 | fig <- ((p[[1]] + p[[2]]) + plot_layout(widths = ws)) /
38 | ((p[[3]] + p[[4]]) + plot_layout(widths = ws)) +
39 | plot_annotation(tag_levels = "a") &
40 | theme(
41 | plot.margin = margin(),
42 | plot.tag = element_text(face = "bold", size = 9))
43 | })
44 |
45 | pdf(args$pdf, width = 17.6/2.54, height = 9.9/2.54)
46 | for (p in lys) print(p); dev.off()
47 |
--------------------------------------------------------------------------------
/code/08-fig_mds.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # pdf = "figs/mds.pdf",
3 | # uts = "code/utils-plotting.R",
4 | # rds = list.files("plts", "reftyp-mds.*ks\\.rds", full.names = TRUE))
5 |
6 | source(args$uts)
7 | ps <- lapply(args$rds, readRDS)
8 | ps[[1]]$guides$fill <- "none"
9 |
10 | fig <- wrap_plots(ps, ncol = 1) +
11 | plot_layout(guides = "collect") +
12 | plot_annotation(tag_levels = "a") &
13 | theme(
14 | plot.margin = margin(),
15 | legend.position = "bottom",
16 | plot.tag = element_text(size = 9, face = "bold"))
17 |
18 | ggsave(args$pdf, fig, width = 9, height = 17, units = "cm")
--------------------------------------------------------------------------------
/code/08-fig_memory.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # rds = list.files("plts", "mbs.*\\.rds", full.names = TRUE),
3 | # pdf = "figs/memory.pdf",
4 | # uts = "code/utils-plotting.R")
5 |
6 | source(args$uts)
7 |
8 | ps <- lapply(args$rds, readRDS)
9 |
10 | pat <- paste0("_", c("n", "b", "k"), ".rds")
11 | ps <- ps[sapply(pat, grep, args$rds)]
12 |
13 | fig <- wrap_plots(ps, ncol = 1) +
14 | plot_annotation(tag_levels = "a") &
15 | theme(
16 | plot.margin = margin(),
17 | panel.spacing = unit(1, "mm"),
18 | legend.title = element_blank(),
19 | plot.tag = element_text(size = 9, face = "bold"))
20 |
21 | ggsave(args$pdf, fig, width = 18, height = 16, units = "cm")
22 |
--------------------------------------------------------------------------------
/code/08-fig_runtimes.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # rds = list.files("plts", "rts.*\\.rds", full.names = TRUE),
3 | # pdf = "figs/runtimes.pdf",
4 | # uts = "code/utils-plotting.R")
5 |
6 | source(args$uts)
7 |
8 | ps <- lapply(args$rds, readRDS)
9 |
10 | pat <- paste0("_", c("n", "b", "k"), ".rds")
11 | ps <- ps[sapply(pat, grep, args$rds)]
12 |
13 | fig <- wrap_plots(ps, ncol = 1) +
14 | plot_annotation(tag_levels = "a") &
15 | theme(
16 | plot.margin = margin(),
17 | panel.spacing = unit(1, "mm"),
18 | legend.title = element_blank(),
19 | plot.tag = element_text(size = 9, face = "bold"))
20 |
21 | ggsave(args$pdf, fig, width = 18, height = 21, units = "cm")
22 |
--------------------------------------------------------------------------------
/code/08-fig_scalability.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # rds = list.files("plts", "(rts|mbs).*\\.rds", full.names = TRUE),
3 | # pdf = "figs/rts_vs_mbs.pdf",
4 | # uts = "code/utils-plotting.R")
5 |
6 | source(args$uts)
7 | ps <- lapply(args$rds, readRDS)
8 |
9 | pat <- ".*_([a-z])\\.rds"
10 | reftyp <- gsub(pat, "\\1", basename(args$rds))
11 | is <- split(seq_along(ps), reftyp)[c("n", "b", "k")]
12 |
13 | ps <- lapply(is, \(i) {
14 |
15 | nm <- c("rts", "mbs")
16 | for (j in seq_along(nm)) {
17 | k <- grep(nm[[j]], args$rds[i])
18 | df <- paste0("df_", nm[[j]])
19 | assign(df, ps[i][[k]]$data)
20 | }
21 |
22 | i <- c("method", "dim", "n", j <- c("t", "mbs"))
23 | #f <- \(.) factor(., sort(as.numeric(as.character(unique(.)))))
24 | f <- \(.) as.numeric(as.character(.))
25 | df_rts <- df_rts %>%
26 | filter(step == "overall") %>%
27 | mutate(n = f(n)) %>% ungroup() %>% select(any_of(i))
28 |
29 | df_mbs <- df_mbs %>%
30 | mutate(n = f(n)) %>% ungroup() %>% select(any_of(i))
31 |
32 | df <- full_join(
33 | df_rts, df_mbs,
34 | by = setdiff(i, j)) %>%
35 | filter(!is.na(t), !is.na(mbs))
36 | pal <- .methods_pal[levels(df$method)]
37 |
38 | p <- ggplot(df, aes(t, mbs, col = method, fill = method)) +
39 | scale_color_manual(values = pal) +
40 | scale_fill_manual(values = pal) +
41 | facet_grid(~ dim) +
42 | geom_point(alpha = 0.8) +
43 | geom_path(size = 0.2) +
44 | scale_x_continuous("runtime (s)", limits = c(0, NA), trans = "sqrt") +
45 | scale_y_continuous("memory usage (MBs)", limits = c(0, NA))
46 |
47 | .prettify(p) + theme(
48 | panel.grid.major = element_line(color = "grey", size = 0.2))
49 | })
50 |
51 | fig <- wrap_plots(ps, ncol = 1) +
52 | plot_annotation(tag_levels = "a") &
53 | theme(
54 | plot.margin = margin(),
55 | panel.spacing = unit(1, "mm"),
56 | legend.title = element_blank(),
57 | plot.tag = element_text(size = 9, face = "bold"))
58 |
59 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
60 |
--------------------------------------------------------------------------------
/code/08-fig_scatters.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # pdf = "figs/scatters.pdf",
4 | # rds = list.files("plts", "scatters.*\\.rds", full.names = TRUE))
5 |
6 | source(args$uts)
7 |
8 | ps <- lapply(args$rds, readRDS)
9 | ps[[1]]$facet$params$nrow <- 2
10 | ps[[2]]$facet$params$nrow <- 1
11 |
12 | fig <-
13 | wrap_plots(ps, ncol = 1, heights = c(2, 1)) +
14 | plot_annotation(tag_levels = "a") &
15 | scale_x_sqrt(breaks = c(0.2, 0.5, 1)) &
16 | theme(
17 | plot.margin = margin(r = 1, unit = "mm"),
18 | plot.tag = element_text(size = 9, face = "bold"))
19 |
20 | ggsave(args$pdf, fig, width = 20, height = 11, units = "cm")
21 |
--------------------------------------------------------------------------------
/code/08-fig_stat1d.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # pdf = "figs/stat1d.pdf",
4 | # rds = list.files("plts", "stat_1d_by_reftyp-boxplot.*\\.rds", full.names = TRUE))
5 |
6 | source(args$uts)
7 |
8 | ps <- lapply(c("n", "b", "k"), function(type) {
9 | pat <- sprintf(",%s,", type)
10 | fns <- grep(pat, args$rds, value = TRUE)
11 | lapply(rev(fns), readRDS) %>%
12 | wrap_plots(ncol = 1) +
13 | plot_layout(guides = "collect") +
14 | plot_annotation(tag_levels = "a") &
15 | theme(
16 | panel.spacing = unit(1, unit = "mm"),
17 | plot.margin = margin(t = 1, unit = "mm"),
18 | plot.tag = element_text(size = 9, face = "bold"))
19 | })
20 |
21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...)
22 | fig <- structure(ps, class = c("gglist", "ggplot"))
23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
24 |
--------------------------------------------------------------------------------
/code/08-fig_stat2d.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # pdf = "figs/stat2d.pdf",
4 | # rds = list.files("plts", "stat_2d_by_reftyp-boxplot.*\\.rds", full.names = TRUE))
5 |
6 | source(args$uts)
7 |
8 | ps <- lapply(c("n", "b", "k"), function(type) {
9 | pat <- sprintf(",%s,", type)
10 | fns <- grep(pat, args$rds, value = TRUE)
11 | lapply(rev(fns), readRDS) %>%
12 | wrap_plots(ncol = 1) +
13 | plot_layout(guides = "collect") +
14 | plot_annotation(tag_levels = "a") &
15 | theme(
16 | panel.spacing = unit(1, unit = "mm"),
17 | plot.margin = margin(t = 1, unit = "mm"),
18 | plot.tag = element_text(size = 9, face = "bold"))
19 | })
20 |
21 | print.gglist <- \(p, ...) plyr::l_ply(p, print, ...)
22 | fig <- structure(ps, class = c("gglist", "ggplot"))
23 | ggsave(args$pdf, fig, width = 16, height = 16, units = "cm")
--------------------------------------------------------------------------------
/code/08-fig_summaries.R:
--------------------------------------------------------------------------------
1 | # args <- list(
2 | # uts = "code/utils-plotting.R",
3 | # pdf = "figs/summaries.pdf",
4 | # rds = file.path("plts", c(
5 | # "qc_ref-correlations.rds",
6 | # "stat_1d_by_stat1d-correlations,ks.rds",
7 | # "stat_1d_by_stat1d-mds,ks.rds",
8 | # "stat_1d_by_stat1d-pca,ks.rds")))
9 |
10 | source(args$uts)
11 | ps <- lapply(args$rds, readRDS)
12 |
13 | ps[c(1, 2)] <- lapply(ps[c(1, 2)], "+", theme(
14 | axis.text.x = element_text(size = 3, angle = 30)))
15 |
16 | ps[[3]]$layers[[2]]$aes_params$size <- 1.25
17 | ps[[3]] <- ps[[3]] + theme(legend.position = "right")
18 | ps[[3]] <- wrap_elements(full = ps[[3]])
19 |
20 | ps[[4]][[1]] <- ps[[4]][[1]] +
21 | theme(legend.position = "none")
22 | ps[[4]][[1]]$coordinates$expand <- TRUE
23 | ps[[4]][[2]]$layers <- ps[[4]][[2]]$layers[-4]
24 | ps[[4]][[2]] <- ps[[4]][[2]] +
25 | theme(
26 | legend.position = c(0.95, 0.5),
27 | legend.justification = c(1, 0.5))
28 |
29 | ps[[4]] <- ps[[4]] & theme(plot.tag = element_blank())
30 | ps[[4]] <- wrap_elements(full = ps[[4]])
31 |
32 | fig <- (wrap_plots(ps[1:3], ncol = 1) |
33 | wrap_plots(ps[4], ncol = 1)) +
34 | plot_layout(widths = c(1, 2.5)) +
35 | plot_annotation(tag_levels = "a") &
36 | theme(
37 | plot.margin = margin(l = 1, t = 1, unit = "mm"),
38 | plot.tag = element_text(size = 9, face = "bold"))
39 |
40 | ggsave(args$pdf, fig, width = 16, height = 12, units = "cm")
41 |
--------------------------------------------------------------------------------
/code/09-write_fns.R:
--------------------------------------------------------------------------------
1 | pat <- paste0("^", wcs$pat)
2 | fns <- list.files("outs", pat, full.names = TRUE)
3 | writeLines(fns, args$txt)
--------------------------------------------------------------------------------
/code/09-write_obj.R:
--------------------------------------------------------------------------------
1 | source(args$fun)
2 | fns <- readLines(args$txt)
3 | res <- .read_res(fns)
4 | saveRDS(res, args$rds)
--------------------------------------------------------------------------------
/code/10-session_info.R:
--------------------------------------------------------------------------------
1 | x <- c(
2 | # GENERAL
3 | # Bioconductor
4 | "Biobase",
5 | "BiocParallel",
6 | "CellBench",
7 | "ExperimentHub",
8 | "GEOquery",
9 | "scater",
10 | "scran",
11 | "SingleCellExperiment",
12 | "TENxPBMCData",
13 | "variancePartition",
14 | "waddR",
15 | # CRAN
16 | "cluster",
17 | "dplyr",
18 | "emdist",
19 | "jsonlite",
20 | "ggplot2",
21 | "ggpubr",
22 | "ggrastr",
23 | "Matrix",
24 | "matrixStats",
25 | "patchwork",
26 | "Peacock.test",
27 | "RANN",
28 | "RColorBrewer",
29 | "Seurat",
30 | "SeuratData",
31 | "tidyr",
32 | "tidytext",
33 | # SIMULATION
34 | # Bioconductor
35 | "BASiCS",
36 | "muscat",
37 | "scDD",
38 | "splatter",
39 | "SPsimSeq",
40 | "zinbwave",
41 | # GitHub
42 | "JINJINT/ESCO",
43 | "suke18/POWSC",
44 | "bvieth/powsimR",
45 | "Vivianstats/scDesign",
46 | "JSB-UCLA/scDesign2",
47 | "YosefLab/SymSim",
48 | "statOmics/zingeR",
49 | # INTEGRATION
50 | # Bioconductor
51 | "batchelor",
52 | "CellMixS",
53 | "edgeR",
54 | "limma",
55 | "sva",
56 | # CRAN
57 | "harmony",
58 | # CLUSTERING
59 | # Bioconductor
60 | "ConsensusClusterPlus",
61 | "flowCore",
62 | "monocle",
63 | "SC3",
64 | # CRAN
65 | "clue",
66 | "Rtsne",
67 | # GitHub
68 | "VCCRI/cidr",
69 | "JustinaZ/pcaReduce"
70 | )
71 |
72 | # # TO INSTALL ALL DEPENDENCIES:
73 | # if (!require(BiocManager))
74 | # install.packages("BiocManager")
75 | # for (. in x)
76 | # if (!require(., character.only = TRUE))
77 | # BiocManager::install(., ask = FALSE, update = TRUE)
78 |
79 | # TO CAPTURE SESSIO INFO:
80 | for (. in x) {
81 | . <- gsub(".*/", "", .)
82 | suppressPackageStartupMessages(
83 | library(., character.only = TRUE))
84 | }
85 | si <- capture.output(sessionInfo())
86 | writeLines(si, args[[1]])
87 |
--------------------------------------------------------------------------------
/code/utils-clustering.R:
--------------------------------------------------------------------------------
1 | # Hungarian algorithm
2 | # (LM Weber, Aug '16)
3 |
4 | .hungarian <- function(clus_algorithm, clus_truth) {
5 |
6 | # number of detected clusters
7 | n_clus <- length(table(clus_algorithm))
8 |
9 | # remove unassigned cells (NA's in clus_truth)
10 | unassigned <- is.na(clus_truth)
11 | clus_algorithm <- clus_algorithm[!unassigned]
12 | clus_truth <- clus_truth[!unassigned]
13 | if (length(clus_algorithm) != length(clus_truth))
14 | warning("vector lengths are not equal")
15 |
16 | tbl_algorithm <- table(clus_algorithm)
17 | tbl_truth <- table(clus_truth)
18 |
19 | # detected clusters in rows, true populations in columns
20 | pr_mat <- re_mat <- F1_mat <- matrix(NA, nrow = length(tbl_algorithm), ncol = length(tbl_truth))
21 |
22 | for (i in 1:length(tbl_algorithm)) {
23 | for (j in 1:length(tbl_truth)) {
24 | i_int <- as.integer(names(tbl_algorithm))[i] # cluster number from algorithm
25 | j_int <- as.integer(names(tbl_truth))[j] # cluster number from true labels
26 |
27 | true_positives <- sum(clus_algorithm == i_int & clus_truth == j_int, na.rm = TRUE)
28 | detected <- sum(clus_algorithm == i_int, na.rm = TRUE)
29 | truth <- sum(clus_truth == j_int, na.rm = TRUE)
30 |
31 | # calculate precision, recall, and F1 score
32 | precision_ij <- true_positives / detected
33 | recall_ij <- true_positives / truth
34 | F1_ij <- 2 * (precision_ij * recall_ij) / (precision_ij + recall_ij)
35 |
36 | if (F1_ij == "NaN") F1_ij <- 0
37 |
38 | pr_mat[i, j] <- precision_ij
39 | re_mat[i, j] <- recall_ij
40 | F1_mat[i, j] <- F1_ij
41 | }
42 | }
43 |
44 | # put back cluster labels (note some row names may be missing due to removal of unassigned cells)
45 | rownames(pr_mat) <- rownames(re_mat) <- rownames(F1_mat) <- names(tbl_algorithm)
46 | colnames(pr_mat) <- colnames(re_mat) <- colnames(F1_mat) <- names(tbl_truth)
47 |
48 | # match labels using Hungarian algorithm applied to matrix of F1 scores (Hungarian
49 | # algorithm calculates an optimal one-to-one assignment)
50 |
51 | # use transpose matrix (Hungarian algorithm assumes n_rows <= n_cols)
52 | F1_mat_trans <- t(F1_mat)
53 |
54 | if (nrow(F1_mat_trans) <= ncol(F1_mat_trans)) {
55 | # if fewer (or equal no.) true populations than detected clusters, can match all true populations
56 | labels_matched <- clue::solve_LSAP(F1_mat_trans, maximum = TRUE)
57 | # use row and column names since some labels may have been removed due to unassigned cells
58 | labels_matched <- as.numeric(colnames(F1_mat_trans)[as.numeric(labels_matched)])
59 | names(labels_matched) <- rownames(F1_mat_trans)
60 |
61 | } else {
62 | # if fewer detected clusters than true populations, use transpose matrix and assign
63 | # NAs for true populations without any matching clusters
64 | labels_matched_flipped <- clue::solve_LSAP(F1_mat, maximum = TRUE)
65 | # use row and column names since some labels may have been removed due to unassigned cells
66 | labels_matched_flipped <- as.numeric(rownames(F1_mat_trans)[as.numeric(labels_matched_flipped)])
67 | names(labels_matched_flipped) <- rownames(F1_mat)
68 |
69 | labels_matched <- rep(NA, ncol(F1_mat))
70 | names(labels_matched) <- rownames(F1_mat_trans)
71 | labels_matched[as.character(labels_matched_flipped)] <- as.numeric(names(labels_matched_flipped))
72 | }
73 |
74 | # precision, recall, F1 score, and number of cells for each matched cluster
75 | pr <- re <- F1 <- n_cells_matched <- rep(NA, ncol(F1_mat))
76 | names(pr) <- names(re) <- names(F1) <- names(n_cells_matched) <- names(labels_matched)
77 |
78 | for (i in 1:ncol(F1_mat)) {
79 | # set to 0 if no matching cluster (too few detected clusters); use character names
80 | # for row and column indices in case subsampling completely removes some clusters
81 | pr[i] <- ifelse(is.na(labels_matched[i]), 0, pr_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
82 | re[i] <- ifelse(is.na(labels_matched[i]), 0, re_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
83 | F1[i] <- ifelse(is.na(labels_matched[i]), 0, F1_mat[as.character(labels_matched[i]), names(labels_matched)[i]])
84 |
85 | n_cells_matched[i] <- sum(clus_algorithm == labels_matched[i], na.rm = TRUE)
86 | }
87 |
88 | # means across populations
89 | mean_pr <- mean(pr)
90 | mean_re <- mean(re)
91 | mean_F1 <- mean(F1)
92 |
93 | return(list(n_clus = n_clus,
94 | pr = pr,
95 | re = re,
96 | F1 = F1,
97 | labels_matched = labels_matched,
98 | n_cells_matched = n_cells_matched,
99 | mean_pr = mean_pr,
100 | mean_re = mean_re,
101 | mean_F1 = mean_F1))
102 | }
--------------------------------------------------------------------------------
/code/utils-integration.R:
--------------------------------------------------------------------------------
1 | .cms_ldf <- \(df) df %>%
2 | group_by(refset, sim_method) %>%
3 | mutate(
4 | # center around 0
5 | cms = cms - 0.5,
6 | # scale to range 1
7 | ldf = ldf / diff(range(ldf)),
8 | # center around 0
9 | ldf = ldf - min(ldf) - 0.5)
10 |
11 | .bcs <- \(df, n = 1) {
12 | stopifnot(any(n == c(1, 2)))
13 | .avg <- \(df) summarise(df,
14 | across(c(cms, ldf), mean),
15 | .groups = "drop_last")
16 | df <- df %>%
17 | group_by(refset, sim_method, batch_method, batch) %>%
18 | # average across cells
19 | .avg()
20 | if (n == 2)
21 | # average across batches
22 | df <- df %>% .avg()
23 | # batch correction score
24 | df %>% mutate(bcs = abs(cms) + abs(ldf))
25 | }
26 |
27 | .batch_labs <- c(
28 | cms = "CMS*", bcs = "BCS",
29 | ldf = expression(Delta~"LDF*"))
--------------------------------------------------------------------------------
/code/utils-plotting.R:
--------------------------------------------------------------------------------
1 | suppressPackageStartupMessages({
2 | library(dplyr)
3 | library(ggplot2)
4 | library(ggpubr)
5 | library(ggrastr)
6 | library(ggrepel)
7 | library(magrittr)
8 | library(patchwork)
9 | library(RColorBrewer)
10 | library(tidyr)
11 | library(tidytext)
12 | })
13 |
14 | .read_res <- function(x) {
15 | df <- bind_rows(lapply(x, readRDS)) %>%
16 | replace_na(list(method = "ref")) %>%
17 | mutate(method = droplevels(factor(method, names(.methods_pal))))
18 | if (all(c("datset", "subset") %in% names(df)))
19 | df <- mutate(df, refset = paste(datset, subset, sep = ","))
20 | if ("id" %in% names(df))
21 | df <- mutate(df,
22 | group = relevel(factor(group), ref = "global"),
23 | id = case_when(group == "global" ~ "global", TRUE ~ id),
24 | id = relevel(factor(id), ref = "global"))
25 | if (all(c("refset", "group") %in% names(df)))
26 | df <- group_by(df, refset) %>%
27 | mutate(.group = as.character(group)) %>%
28 | mutate(reftyp = ifelse(any(.group == "batch"), "b",
29 | ifelse(any(.group == "cluster"), "k", "n")),
30 | reftyp = factor(reftyp, c("n", "b", "k"))) %>%
31 | select(-.group)
32 | if (sum(grepl("method", names(df))) > 1)
33 | df <- rename(df, sim_method = method)
34 | if (any(grepl("^metric", names(df))))
35 | df <- mutate(df, across(
36 | starts_with("metric"),
37 | ~droplevels(factor(.x,
38 | levels = names(.metrics_lab),
39 | labels = .metrics_lab))))
40 | return(df)
41 | }
42 |
43 | .filter_res <- function(df) df %>%
44 | mutate(
45 | .metric = names(.metrics_lab)[
46 | match(metric, .metrics_lab)],
47 | across(c(group, id), as.character)) %>%
48 | filter(
49 | # keep everything for type 'n'
50 | reftyp == "n" |
51 | # keep all groupings for global summaries,
52 | # gene-level (-correlations), cell-level
53 | # (-log-library size & cell detection frequency
54 | .metric %in% .none_metrics |
55 | .metric %in% setdiff(.gene_metrics, "gene_cor") |
56 | .metric %in% setdiff(.cell_metrics, c("cell_lls", "cell_frq")) |
57 | # for gene-gene correlation, keep global only
58 | (group == id & .metric == "gene_cor") |
59 | # for log-library size & cell detection
60 | # frequency, keep group-level results only
61 | (group != id & grepl("cell_(lls|frq)", .metric))) %>%
62 | ungroup() %>%
63 | select(-.metric) %>%
64 | mutate(across(c(metric, method), droplevels))
65 |
66 | # repeatedly average statistics, e.g.,
67 | # across groups, subsets, datsets, etc.
68 | .avg <- \(df, n) {
69 | .fun <- \(df) summarise_at(df, "stat", mean, na.rm = TRUE)
70 | res <- Reduce(\(df, foo) .fun(df), seq(n), init = df, accumulate = TRUE)
71 | ungroup(res[[n + 1]])
72 | }
73 |
74 | .prettify <- function(plt, thm = NULL, base_size = 6)
75 | {
76 | col <- !is.null(plt$labels$colour) && !is.numeric(plt$data[[plt$labels$colour]])
77 | fill <- !is.null(plt$labels$fill) && !is.numeric(plt$data[[plt$labels$fill]])
78 |
79 | plt <- plt + if (col && fill) {
80 | tmp <- if (plt$labels$colour != plt$labels$fill) {
81 | guide_legend(order = 1, override.aes = list(alpha = 1, size = 2, stroke = 0.5, shape = 21))
82 | } else "none"
83 | guides(
84 | col = tmp,
85 | fill = guide_legend(override.aes = list(alpha = 1, size = 2, col = NA, shape = 21)))
86 | } else if (col) { guides(
87 | col = guide_legend(override.aes = list(alpha = 1, size = 2, shape = 21)))
88 | } else if (fill) { guides(
89 | fill = guide_legend(override.aes = list(alpha = 1, size = 2)))
90 | }
91 | plt <- plt +
92 | theme_linedraw(base_size) + theme(
93 | panel.grid = element_blank(),
94 | legend.key.size = unit(0.5, "lines"),
95 | strip.background = element_blank(),
96 | strip.text = element_text(color = "black"))
97 | return(plt + thm)
98 | }
99 |
100 | # aesthetics ----
101 |
102 | .groups_pal <- c(global = "grey40", batch = "grey80", cluster = "grey80", group = "grey80")
103 |
104 | pat <- ".*-sim_data-(.*)\\.R"
105 | methods <- gsub(pat, "\\1", list.files("code", pat))
106 | .methods_pal <- colorRampPalette(brewer.pal(12, "Paired"))(length(methods))
107 | .methods_pal <- c(ref = "black", setNames(.methods_pal, methods))
108 |
109 | pat <- ".*calc_qc-(.*)\\.R"
110 | .metrics <- gsub(pat, "\\1", list.files("code", pat))
111 | .none_metrics <- c("gene_pve", "cell_cms", "cell_sw")
112 | .metrics <- setdiff(.metrics, .none_metrics)
113 | .gene_metrics <- grep("gene", .metrics, value = TRUE)
114 | .cell_metrics <- grep("cell", .metrics, value = TRUE)
115 |
116 | .metrics_lab <- c(
117 | gene_avg = "average of logCPM",
118 | gene_var = "variance of logCPM",
119 | gene_cv = "coefficient of variation",
120 | gene_frq = "gene detection frequency",
121 | gene_cor = "gene-to-gene correlation",
122 | cell_lls = "log-library size",
123 | cell_frq = "cell detection frequency",
124 | cell_cor = "cell-to-cell correlation",
125 | cell_ldf = "local density factor",
126 | cell_pcd = "cell-to-cell distance",
127 | cell_knn = "KNN occurences",
128 | gene_pve = "percent variance explained",
129 | cell_cms = "cell-specific mixing score",
130 | cell_sw = "silhouette width")
131 |
132 | n <- 3
133 | .metrics_pal <- c(
134 | setNames(hcl.colors(n*length(.gene_metrics), "Reds" )[seq(1, n*length(.gene_metrics), n)], .gene_metrics),
135 | setNames(hcl.colors(n*length(.cell_metrics), "Blues")[seq(1, n*length(.cell_metrics), n)], .cell_metrics),
136 | setNames(brewer.pal(1+length(.none_metrics), "Greens")[-1], .none_metrics))
137 | names(.metrics_pal) <- .metrics_lab
138 |
139 | pat <- ".*-stat_1d-(.*)\\.R"
140 | .stats1d <- gsub(pat, "\\1", list.files("code", pat))
141 | .stats1d_lab <- c(ks = "KS", ws = "W")
142 |
143 | pat <- ".*-stat_2d-(.*)\\.R"
144 | .stats2d <- gsub(pat, "\\1", list.files("code", pat))
145 | .stats2d_lab <- c(emd = "EMD", ks2 = "KS")
146 |
--------------------------------------------------------------------------------
/code/utils-summaries.R:
--------------------------------------------------------------------------------
1 | # split cell indices into list
2 | # - global = list w/ one element (all cells)
3 | # - batch = list w/ one element per batch (optional)
4 | # - cluster = list w/ one element per cluster (optional)
5 | .split_cells <- function(sce,
6 | i = c("global", "batch", "cluster"))
7 | {
8 | names(j) <- j <- intersect(i, names(colData(sce)))
9 | cs <- lapply(j, function(.) {
10 | ids <- droplevels(factor(sce[[.]]))
11 | split(seq(ncol(sce)), ids)
12 | })
13 | if ("global" %in% i)
14 | cs <- c(list(global = list(foo = seq(ncol(sce)))), cs)
15 | return(cs)
16 | }
17 |
18 | # split genes by group = "cluster","batch", "sample".
19 | # Then per group calculate the qc with the FUN function(which is a metric)
20 | # returns a dataframe with cols: group | id | metric_name
21 | .calc_qc <- function(sce, fun,
22 | n_genes = NULL, n_cells = NULL,
23 | groups = c("global", "cluster", "batch"))
24 | {
25 | suppressPackageStartupMessages({
26 | library(dplyr)
27 | library(purrr)
28 | })
29 | if (is.null(groups)) {
30 | groups <- eval(formals(.calc_qc)$groups)
31 | } else {
32 | groups <- match.arg(groups, several.ok = TRUE)
33 | }
34 | # split cells into groups
35 | idx <- .split_cells(sce, groups)
36 | # downsample to at most 'n_genes' in total
37 | if (!is.null(n_genes)) {
38 | n_genes <- min(n_genes, nrow(sce))
39 | gs <- sample(nrow(sce), n_genes)
40 | sce <- sce[gs, ]
41 | }
42 | # downsample to at most 'n_cells' per group
43 | if (!is.null(n_cells)) {
44 | idx <- map_depth(idx, -1, ~{
45 | n_cells <- min(n_cells, length(.x))
46 | sample(.x, n_cells)
47 | })
48 | }
49 | # compute QC metric per group
50 | res <- map_depth(idx, -1, ~{
51 | data.frame(
52 | row.names = NULL,
53 | value = fun(sce[, .x]))
54 | })
55 | # join into single table
56 | res <- map_depth(res, 1, bind_rows, .id = "id")
57 | res <- bind_rows(res, .id = "group")
58 | if (nrow(res) != 0) return(res)
59 | }
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | R: "R_LIBS_USER=~/libs/bioc313/ /usr/local/R/R-4.1.0/bin/R"
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/data/.gitkeep
--------------------------------------------------------------------------------
/figs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/figs/.gitkeep
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/logs/.gitkeep
--------------------------------------------------------------------------------
/meta/methods.json:
--------------------------------------------------------------------------------
1 | {
2 | "BASiCS": ["b"],
3 | "ESCO": ["n", "b", "k"],
4 | "hierarchicell": ["n", "b"],
5 | "muscat": ["n", "b", "k"],
6 | "POWSC": ["n", "k"],
7 | "powsimR": ["n"],
8 | "scDD": ["n"],
9 | "scDesign": ["n"],
10 | "scDesign2": ["n", "k"],
11 | "SCRIP": ["n", "b", "k"],
12 | "SPARSim": ["n", "b"],
13 | "SPsimSeq": ["n", "b"],
14 | "splatter": ["n"],
15 | "SymSim": ["n", "b"],
16 | "ZINB-WaVE": ["n", "b", "k"],
17 | "zingeR": ["n"]
18 | }
--------------------------------------------------------------------------------
/meta/runtimes.json:
--------------------------------------------------------------------------------
1 | {
2 | "panc8,inDrop1.beta": {
3 | "type": "n",
4 | "n_genes": ["4000", "2000", "1000", "500"],
5 | "n_cells": ["800", "400", "200", "100"]
6 | },
7 | "Mereu20,CD4T": {
8 | "type": "b",
9 | "n_genes": ["4000", "2000", "1000", "500"],
10 | "n_cells": ["2600", "1300", "650", "325"]
11 | },
12 | "Zheng17,foo": {
13 | "type": "k",
14 | "n_genes": ["3200", "1600", "800", "400"],
15 | "n_cells": ["2600", "1300", "650", "325"]
16 | }
17 | }
--------------------------------------------------------------------------------
/meta/subsets.json:
--------------------------------------------------------------------------------
1 | {
2 | "CellBench": {
3 | "H2228": {
4 | "type": "b",
5 | "n_cells": "400",
6 | "n_genes": "4000",
7 | "cluster": "H2228"
8 | },
9 | "CELSeq": {
10 | "type": "k",
11 | "n_cells": "400",
12 | "n_genes": "4000",
13 | "batch": "sc_celseq"
14 | }
15 | },
16 | "Ding20": {
17 | "10x.InhibNeuron": {
18 | "type": "n",
19 | "n_cells": null,
20 | "n_genes": "4000",
21 | "batch": "10x Chromium",
22 | "cluster": "Inhibitory neuron"
23 | },
24 | "ExcitNeuron": {
25 | "type": "b",
26 | "n_cells": "200",
27 | "n_genes": "4000",
28 | "cluster": "Excitatory neuron"
29 | },
30 | "DroNcSeq": {
31 | "type": "k",
32 | "n_cells": "200",
33 | "n_genes": "4000",
34 | "batch": "DroNc-seq"
35 | }
36 | },
37 | "Gierahn17" : {
38 | "foo": {
39 | "type": "n",
40 | "n_cells": null,
41 | "n_genes": "4000"
42 | }
43 | },
44 | "Kang18": {
45 | "1015": {
46 | "type": "k",
47 | "n_cells": "400",
48 | "n_genes": "4000",
49 | "batch": "1015"
50 | },
51 | "B": {
52 | "type": "n",
53 | "n_cells": null,
54 | "n_genes": "4000",
55 | "batch": "1015",
56 | "cluster": "B cells"
57 | },
58 | "NK": {
59 | "type": "n",
60 | "n_cells": null,
61 | "n_genes": "4000",
62 | "batch": "1015",
63 | "cluster": "NK cells"
64 | }
65 | },
66 | "Koh16": {
67 | "foo": {
68 | "type": "k",
69 | "n_cells": null,
70 | "n_genes": "4000"
71 | }
72 | },
73 | "MCA20.gland": {
74 | "T": {
75 | "type": "b",
76 | "n_cells": null,
77 | "n_genes": 4000,
78 | "cluster": "T cell_Cd8b1 high"
79 | }
80 | },
81 | "MCA20.lung": {
82 | "AT2": {
83 | "type": "b",
84 | "n_cells": null,
85 | "n_genes": 4000,
86 | "cluster": "AT2 Cell"
87 | }
88 | },
89 | "Mereu20":{
90 | "CD4T": {
91 | "type": "b",
92 | "n_cells": "200",
93 | "n_genes": "4000",
94 | "cluster": "CD4 T cells"
95 | },
96 | "ddSeq": {
97 | "type": "k",
98 | "n_cells": "400",
99 | "n_genes": "4000",
100 | "batch": "ddSEQ"
101 | }
102 | },
103 | "Oetjen18": {
104 | "R": {
105 | "type": "n",
106 | "n_cells": null,
107 | "n_genes": "4000",
108 | "batch": "R"
109 | },
110 | "foo": {
111 | "type": "b",
112 | "n_cells": "200",
113 | "n_genes": "4000"
114 | }
115 | },
116 | "panc8": {
117 | "inDrop1.beta": {
118 | "type": "n",
119 | "n_cells": null,
120 | "n_genes": "4000",
121 | "batch": "indrop1",
122 | "cluster": "beta"
123 | },
124 | "inDrop.ductal": {
125 | "type": "b",
126 | "n_cells": null,
127 | "n_genes": "4000",
128 | "batch": ["indrop1", "indrop2", "indrop3", "indrop4"],
129 | "cluster": "ductal"
130 | },
131 | "SmartSeq2": {
132 | "type": "k",
133 | "n_cells": "400",
134 | "n_genes": "4000",
135 | "batch": "smartseq2"
136 | }
137 | },
138 | "TabulaMuris": {
139 | "limb.MSCs": {
140 | "type": "n",
141 | "n_cells": null,
142 | "n_genes": "4000",
143 | "tissue": "Limb_Muscle",
144 | "cluster": "mesenchymal stem cell"
145 | },
146 | "spleen": {
147 | "type": "k",
148 | "n_cells": "400",
149 | "n_genes": "4000",
150 | "tissue": "Spleen"
151 | }
152 | },
153 | "Tung17": {
154 | "foo": {
155 | "type": "b",
156 | "n_cells": "400",
157 | "n_genes": "4000"
158 | },
159 | "NA19101": {
160 | "type": "n",
161 | "n_cells": null,
162 | "n_genes": "4000",
163 | "batch": "NA19101"
164 | }
165 | },
166 | "Zheng17": {
167 | "foo": {
168 | "type": "k",
169 | "n_cells": "400",
170 | "n_genes": "4000"
171 | },
172 | "HSCs": {
173 | "type": "n",
174 | "n_cells": null,
175 | "n_genes": "4000",
176 | "cluster": "HSCs CD34+"
177 | },
178 | "Monocytes": {
179 | "type": "n",
180 | "n_cells": "400",
181 | "n_genes": "4000",
182 | "cluster": "Monocytes CD14+"
183 | }
184 | }
185 | }
--------------------------------------------------------------------------------
/outs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/outs/.gitkeep
--------------------------------------------------------------------------------
/plts/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/plts/.gitkeep
--------------------------------------------------------------------------------
/schematic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelenaLC/simulation-comparison/f89e6650f8acae1c32b8fe27d644ec7c7be63051/schematic.png
--------------------------------------------------------------------------------
]