├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── R
├── KDE_test.R
├── biological_signal_prop.R
├── calculateGeneCellCorr.R
├── calculateMeanVarLibrary.R
├── countsim_eval_KS.R
├── evaluate_DE_prop.R
├── generate_DE_prop.R
├── parameter_estimation.R
└── plotting.R
├── README.md
├── SimBench.Rproj
├── inst
└── extdata
│ ├── real.rds
│ └── sim.rds
├── man
├── KDE_test.Rd
├── calculateFeatureCorrs.Rd
├── calculateMeanVarLibrary.Rd
├── calculateSampleCorrs.Rd
├── countsim_eval.Rd
├── draw_biosignal_plot.Rd
├── draw_parameter_plot.Rd
├── eval_parameter.Rd
├── eval_signal.Rd
├── evaluate_DE.Rd
├── figures
│ ├── vis_biosignal.jpg
│ └── vis_parameter.jpg
├── generate_DE.Rd
├── prepare_eval.Rd
├── process_data.Rd
└── summarise_score.Rd
└── readme.md
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^SimBench\.Rproj$
2 | ^\.Rproj\.user$
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: SimBench
2 | Title: SimBench: benchmarking simulation methods
3 | Version: 0.99.1
4 | Authors@R: c(
5 | person(given = "Yue",family = "Cao", email = "yue.cao@sydney.edu.au", role=c("aut", "cre")),
6 | person(given = "Pengyi",family = "Yang", email = "pengyi.yang@sydney.edu.au", role="aut" ),
7 | person(given = "Jean Yee Hwa",family = "Yang", email = "jean.yang@sydney.edu.au", role="aut" ))
8 | Description: The SimBench package is designed for benchmarking simulation methods on two key aspects: the accuracy of data property estimation and the ability to retain biological signals. It contains functions that compare, on these two aspects, the simulated data produced by a simulation method against the real data that was used as the reference input to that method.
9 | Imports: ks, dplyr, SingleCellExperiment, Seurat, scales, limma, DESeq2, rlist, ggplot2, ggpubr, ggthemes, methods
10 | License: GPL-3
11 | Encoding: UTF-8
12 | LazyData: true
13 | Roxygen: list(markdown = TRUE)
14 | RoxygenNote: 7.2.3
15 | Date: 2021-03-07
16 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(KDE_test)
4 | export(calculateFeatureCorrs)
5 | export(calculateMeanVarLibrary)
6 | export(calculateSampleCorrs)
7 | export(countsim_eval)
8 | export(draw_biosignal_plot)
9 | export(draw_parameter_plot)
10 | export(eval_parameter)
11 | export(eval_signal)
12 | export(evaluate_DE)
13 | export(generate_DE)
14 | export(prepare_eval)
15 | export(process_data)
16 | export(summarise_score)
17 | import(DESeq2)
18 | import(GO.db)
19 | import(Seurat)
20 | import(WGCNA)
21 | import(dplyr)
22 | import(edgeR)
23 | import(ggplot2)
24 | import(ggpubr)
25 | import(ggthemes)
26 | import(impute)
27 | import(ks)
28 | import(preprocessCore)
29 | importFrom(limma,eBayes)
30 | importFrom(limma,lmFit)
31 | importFrom(methods,new)
32 |
--------------------------------------------------------------------------------
/R/KDE_test.R:
--------------------------------------------------------------------------------
1 |
2 |
#' Perform KDE test
#'
#' Compares the distribution of one or more columns between the first two
#' distinct values found in `df$dataset`, using `ks::kde.test()`.
#'
#' @param df data frame holding the raw values; must contain a `dataset`
#'   column labelling which data set each row belongs to
#' @param column name(s) of the column(s) to compare; one name gives a
#'   univariate test, several names give a multivariate test
#' @param subsampleSize maximum number of rows used; larger inputs are
#'   randomly subsampled (without replacement) down to this size
#'
#' @return a one-row data frame with columns `kde_tstat`, `kde_zstat` and
#'   `kde_pvalue`
#' @import ks
#'
#' @export
KDE_test <- function(df, column, subsampleSize) {

  # Labels are taken before any filtering, in order of first appearance.
  labels <- unique(df$dataset)
  first_label <- labels[1]
  second_label <- labels[2]

  # Keep only rows that are complete in every column of interest.
  is_complete <- rowSums(is.na(df[, column, drop = FALSE])) == 0
  df <- df[is_complete, ]

  # Cap the amount of data handed to the KDE test.
  n_rows <- nrow(df)
  if (n_rows > subsampleSize) {
    df <- df[sample.int(n_rows, subsampleSize), ]
  }

  values <- df[, column]
  in_first <- df$dataset == first_label
  in_second <- df$dataset == second_label

  # kde.test wants a vector (univariate) or a matrix (multivariate).
  kde <- if (length(column) == 1) {
    kde.test(x1 = as.vector(values[in_first]),
             x2 = as.vector(values[in_second]))
  } else {
    kde.test(x1 = as.matrix(values[in_first, ]),
             x2 = as.matrix(values[in_second, ]))
  }

  data.frame(kde_tstat = kde$Tstat,
             kde_zstat = kde$zstat,
             kde_pvalue = kde$pvalue)
}
48 |
--------------------------------------------------------------------------------
/R/biological_signal_prop.R:
--------------------------------------------------------------------------------
1 |
2 |
#' Evaluate how well the simulated data retains biological signal
#'
#' Subsets both data sets with `process_data()` and then compares them with
#' `evaluate_DE()`.
#'
#' @param real real data
#' @param sim simulated data
#'
#' @return the data frame returned by `evaluate_DE()`: one row per feature
#'   type and data set, with columns `types`, `prop` and `sim`
#' @export
eval_signal <- function(real, sim) {

  # subset the data sets to the two largest cell types of the real data
  prepared <- process_data(real, sim)

  # process_data() names its outputs `real_final` / `simulated_final`;
  # spell the names out instead of relying on `$` partial matching
  evaluate_DE(prepared$real_final, prepared$simulated_final)
}
21 |
--------------------------------------------------------------------------------
/R/calculateGeneCellCorr.R:
--------------------------------------------------------------------------------
#' cellwise correlation
#'
#' For every element of `sim_list`, computes pairwise Spearman correlations
#' between cells on the (up to) 500 most variable genes of `x$dge$log2cpm`.
#'
#' @param sim_list a named list; every element must provide `$dge$log2cpm`
#'   (genes in rows, cells in columns)
#' @param maxNForCorr maximum number of cells used; data sets with more cells
#'   are randomly subsampled to this many
#' @param ncore number of cores for parallel computing
#'
#' @return A data frame with cellwise correlations for real and simulated data
#'   (columns `Correlation` and `dataset`)
#' @import WGCNA
#' @import GO.db
#' @import impute
#' @import preprocessCore
#' @import dplyr
#' @export
calculateSampleCorrs <- function(sim_list, maxNForCorr = 1000, ncore = 8) {

  # `parallel` is not imported by the package namespace, so qualify the call
  sampleCorrDF <- parallel::mclapply(sim_list, function(x) {

    # get the log2 CPM
    cpms <- as.matrix(x$dge$log2cpm)

    # get the top 500 most variable genes (fewer if the data set is smaller)
    var_genes <- apply(cpms, 1, var)
    maxgene <- min(500, nrow(cpms))
    select_var <- names(sort(var_genes, decreasing = TRUE))[seq_len(maxgene)]

    # drop = FALSE keeps a matrix even when a single gene is selected
    cpms <- cpms[select_var, , drop = FALSE]

    # cap the number of cells used for the correlation
    if (ncol(cpms) > maxNForCorr) {
      keep_cells <- sample(seq_len(ncol(cpms)), maxNForCorr, replace = FALSE)
      cpms <- cpms[, keep_cells, drop = FALSE]
    }

    # the outer loop is already parallel, so keep the correlation
    # single-threaded
    corrs <- WGCNA::cor(cpms, use = "pairwise.complete.obs",
                        method = "spearman", nThreads = 1)

    data.frame(Correlation = corrs[upper.tri(corrs)])

  }, mc.cores = ncore)

  ## Merge correlations from all data sets
  ns <- vapply(sampleCorrDF, nrow, 0)
  do.call(rbind, sampleCorrDF) %>%
    dplyr::mutate(dataset = rep(names(sampleCorrDF), ns))
}
49 |
50 |
#' genewise correlation
#'
#' For every element of `sim_list`, computes pairwise Spearman correlations
#' between genes on the (up to) 500 most variable genes of `x$dge$log2cpm`,
#' after dropping genes with zero variance.
#'
#' @param sim_list a named list; every element must provide `$dge$log2cpm`
#'   (genes in rows, cells in columns)
#' @param ncore number of cores for parallel computing
#'
#' @return A data frame with genewise correlations for real and simulated data
#'   (columns `Correlation` and `dataset`)
#' @import WGCNA
#' @import GO.db
#' @import impute
#' @import preprocessCore
#' @import dplyr
#' @export
calculateFeatureCorrs <- function(sim_list, ncore = 8) {

  # `parallel` is not imported by the package namespace, so qualify the call
  featureCorrDF <- parallel::mclapply(sim_list, function(x) {

    # get the log2 CPM
    cpms <- as.matrix(x$dge$log2cpm)

    # per-gene variance, computed once and reused for both the zero-variance
    # filter and the ranking (avoids the undeclared genefilter dependency)
    var_genes <- apply(cpms, 1, var)
    has_variance <- var_genes > 0
    cpms <- cpms[has_variance, , drop = FALSE]
    var_genes <- var_genes[has_variance]

    # subset to the most highly variable genes
    maxgene <- min(500, nrow(cpms))
    select_var <- names(sort(var_genes, decreasing = TRUE))[seq_len(maxgene)]
    cpms <- cpms[select_var, , drop = FALSE]

    # the outer loop is already parallel, so keep the correlation
    # single-threaded
    corrs <- WGCNA::cor(t(cpms), use = "pairwise.complete.obs",
                        method = "spearman", nThreads = 1)

    data.frame(Correlation = corrs[upper.tri(corrs)])

  }, mc.cores = ncore)

  ## Merge correlations from all data sets
  ns <- vapply(featureCorrDF, nrow, 0)
  do.call(rbind, featureCorrDF) %>%
    dplyr::mutate(dataset = rep(names(featureCorrDF), ns))
}
89 |
--------------------------------------------------------------------------------
/R/calculateMeanVarLibrary.R:
--------------------------------------------------------------------------------
#' Calculate mean variance and library size estimates
#'
#' Each element of `sim_list` must be either a `Seurat` object (treated as
#' normalised data, so no library size estimate is produced) or a
#' `DESeqDataSet` (treated as raw counts).
#'
#' @param sim_list A list containing real and simulated data
#' @param ncore number of cores for parallel computing
#'
#' @return a list with one entry per input; each entry is
#'   `list(dge = , dds = )` where `dge` stores `log2cpm`, `mean`, `dispersion`
#'   and `dispersion.scaled` (plus the edgeR `DGEList` contents for raw input)
#'   and `dds` stores the DESeq2 size-factor object (an empty list for Seurat
#'   input)
#' @import Seurat
#' @import edgeR
#' @import DESeq2
#' @export
calculateMeanVarLibrary <- function(sim_list, ncore = 8) {

  # `parallel` is not imported by the package namespace, so qualify the call
  parallel::mclapply(sim_list, function(ds) {

    if (inherits(ds, "Seurat")) {
      # a Seurat input means the simulated matrix is normalised,
      # hence a library size estimate cannot be obtained
      dge <- list()
      dds <- list()

      # try on the object as given; if that fails, normalise and retry
      temp_seurat <- tryCatch(
        FindVariableFeatures(ds, selection.method = "disp"),
        error = function(e) NULL
      )
      if (is.null(temp_seurat)) {
        ds <- NormalizeData(ds, scale.factor = 1e6)
        temp_seurat <- FindVariableFeatures(ds, selection.method = "disp")
      }

      hvf <- HVFInfo(temp_seurat)
      dge$log2cpm <- ds[["RNA"]]$data
      dge$mean <- hvf[, 1]
      dge$dispersion <- hvf[, 2]
      dge$dispersion.scaled <- hvf[, 3]

    } else if (inherits(ds, "DESeqDataSet")) {
      # a DESeqDataSet input means raw counts, so library size estimates
      # are available
      raw_counts <- DESeq2::counts(ds)

      dge <- edgeR::DGEList(counts = raw_counts)
      ## Calculate normalization factors
      dge <- edgeR::calcNormFactors(dge)

      # raw data needs CPM normalisation before mean / dispersion estimation
      temp_seurat <- CreateSeuratObject(counts = raw_counts)
      temp_seurat <- NormalizeData(temp_seurat, scale.factor = 1e6)
      # store the normalised values in the counts slot so that
      # FindVariableFeatures works on them
      temp_seurat <- SetAssayData(object = temp_seurat, slot = "counts",
                                  new.data = temp_seurat[["RNA"]]$data)

      # note that even though it "finds variable features"
      # it returns the mean and dispersion of all genes in the dataset
      temp_seurat <- FindVariableFeatures(temp_seurat, selection.method = "disp")
      hvf <- HVFInfo(temp_seurat)
      dge$mean <- hvf[, 1]
      dge$dispersion <- hvf[, 2]
      dge$dispersion.scaled <- hvf[, 3]
      dge$log2cpm <- temp_seurat[["RNA"]]$data

      ## --------------------------- DESeq2 -------------------------- ##
      ## Calculate size factors
      dds <- DESeq2::estimateSizeFactors(ds, type = "poscounts")

    } else {
      # previously this fell through to an obscure "object 'dge' not found"
      stop("each element of sim_list must be a Seurat or DESeqDataSet ",
           "object, not ", paste(class(ds), collapse = "/"))
    }

    # dge stores mean and variance, dds stores library size
    list(dge = dge, dds = dds)

  }, mc.cores = ncore)
}
70 |
--------------------------------------------------------------------------------
/R/countsim_eval_KS.R:
--------------------------------------------------------------------------------
1 |
2 |
#' Calculate KDE test statistic on each parameter
#'
#' The first two entries of `sim_list` are compared. Library-size based
#' parameters (`libsize`, `libsize_fraczero`, `tmm`, `efflibsize`) are only
#' evaluated when the second entry holds raw counts.
#'
#' @param sim_list A list containing real and simulated data
#' @param ncore number of cores for parallel computing
#' @param maxNForCorr maximum number of cells used for the cellwise
#'   correlation (default 500)
#' @param subsampleSize maximum number of values handed to each KDE test
#'   (default 10000)
#'
#' @return a list with `score` (the KDE test result of every evaluated
#'   parameter, up to 13) and `raw_value` (the data frames `sampleDF`,
#'   `featureDF`, `sampleCorrDF` and `featureCorrDF` the tests were run on)
#' @import Seurat
#' @import dplyr
#' @export
countsim_eval <- function(sim_list, ncore = 8, maxNForCorr = 500,
                          subsampleSize = 10000) {

  print("finding mean and variance ")
  obj <- calculateMeanVarLibrary(sim_list = sim_list, ncore = ncore)

  print("calculating correlation estimate")
  # cell-cell correlation: top 500 variable genes, at most maxNForCorr cells
  # gene-gene correlation: top 500 variable genes, across all cells
  print("cellwise correlation estimate")
  sampleCorrDF <- calculateSampleCorrs(sim_list = obj,
                                       maxNForCorr = maxNForCorr,
                                       ncore = ncore)

  print("genewise correlation estimate ")
  featureCorrDF <- calculateFeatureCorrs(sim_list = obj, ncore = ncore)

  # raw input carries a count matrix in the DGEList; normalised input does not
  has_counts <- function(x) !is.null(x$dge$counts)

  # row-bind per-data-set tables and label each row with its data set name
  stack_by_dataset <- function(pieces) {
    ns <- vapply(pieces, nrow, integer(1))
    do.call(rbind, pieces) %>%
      dplyr::mutate(dataset = rep(names(pieces), ns))
  }

  ## ---------------------- per-cell characteristics --------------------- ##
  sampleDF <- stack_by_dataset(lapply(obj, function(x) {
    if (has_counts(x)) {
      data.frame(
        Libsize = colSums(x$dge$counts),
        Fraczero = colMeans(x$dge$counts == 0),
        TMM = x$dge$samples$norm.factors,
        type = "raw"
      ) %>% dplyr::mutate(EffLibsize = Libsize * TMM)
    } else {
      # if normalised, libsize / TMM cannot be calculated: -100 placeholder
      data.frame(
        Libsize = -100,
        Fraczero = colMeans(x$dge$log2cpm == 0),
        TMM = -100,
        type = "normalized",
        EffLibsize = -100
      )
    }
  }))

  ## ---------------------- per-gene characteristics --------------------- ##
  featureDF <- stack_by_dataset(lapply(obj, function(x) {
    if (has_counts(x)) {
      data.frame(
        average_log2_cpm = x$dge$mean,
        variance_log2_cpm = x$dge$dispersion,
        variance_scaled_log2_cpm = x$dge$dispersion.scaled,
        Fraczero = rowMeans(x$dge$counts == 0),
        type = "raw"
      )
    } else {
      data.frame(
        Fraczero = rowMeans(x$dge$log2cpm == 0),
        average_log2_cpm = x$dge$mean,
        variance_log2_cpm = x$dge$dispersion,
        variance_scaled_log2_cpm = x$dge$dispersion.scaled,
        type = "normalized"
      )
    }
  }))

  # the count type of the second (simulated) data set decides which
  # parameters can be evaluated
  count_types <- vapply(
    obj,
    function(x) if (has_counts(x)) "raw" else "normalized",
    character(1)
  )
  type <- unname(count_types[2])

  run_kde <- function(df, column) {
    KDE_test(df = df, column = column, subsampleSize = subsampleSize)
  }

  # NOTE: the call order below is kept fixed because KDE_test may subsample
  # with the random number generator; reordering would change seeded results.
  tmm <- efflibsize <- libsize <- libsize_fraczero <- NULL

  # library size estimates can only be calculated for raw counts
  if (type == "raw") {
    libsize <- run_kde(sampleDF, "Libsize")
    libsize_fraczero <- run_kde(sampleDF, c("Libsize", "Fraczero"))
    tmm <- run_kde(sampleDF, "TMM")
    efflibsize <- run_kde(sampleDF, "EffLibsize")
  }

  # parameters that can be evaluated for raw and normalised data alike
  mean_variance <- run_kde(featureDF, c("average_log2_cpm", "variance_log2_cpm"))
  mean_fraczero <- run_kde(featureDF, c("average_log2_cpm", "Fraczero"))
  fraczerogene <- run_kde(featureDF, "Fraczero")
  average_log2_cpm <- run_kde(featureDF, "average_log2_cpm")
  variance_log2_cpm <- run_kde(featureDF, "variance_log2_cpm")
  variance_scaled_log2_cpm <- run_kde(featureDF, "variance_scaled_log2_cpm")
  fraczerocell <- run_kde(sampleDF, "Fraczero")
  samplecor <- run_kde(sampleCorrDF, "Correlation")
  featurecor <- run_kde(featureCorrDF, "Correlation")

  score <- list(
    # can only be evaluated on raw counts
    tmm = tmm,
    efflibsize = efflibsize,
    libsize = libsize,
    libsize_fraczero = libsize_fraczero,
    # can be evaluated using log2cpm
    average_log2_cpm = average_log2_cpm,
    variance_log2_cpm = variance_log2_cpm,
    variance_scaled_log2_cpm = variance_scaled_log2_cpm,
    samplecor = samplecor,
    featurecor = featurecor,
    fraczerogene = fraczerogene,
    fraczerocell = fraczerocell,
    mean_variance = mean_variance,
    mean_fraczero = mean_fraczero
  )
  # parameters that were not evaluated are left out of the result
  score <- score[!vapply(score, is.null, logical(1))]

  list(score = score,
       raw_value = list(sampleDF = sampleDF,
                        featureDF = featureDF,
                        sampleCorrDF = sampleCorrDF,
                        featureCorrDF = featureCorrDF))
}
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
/R/evaluate_DE_prop.R:
--------------------------------------------------------------------------------
1 |
2 |
#' Evaluate DE
#'
#' For each feature type ("DE", "DV", "DD", "DP", "BD") computes, via
#' `generate_DE()`, the proportion of genes flagged in the real and in the
#' simulated data. A feature type that fails for a data set is reported with a
#' warning and left out of the result instead of aborting the evaluation.
#'
#' @param real real data; must provide `$celltype`
#' @param simulated simulated data; must provide `$celltype`
#'
#' @return a data frame with columns `types` (feature type), `prop`
#'   (proportion of genes) and `sim` ("real" or "sim"), or `NULL` when every
#'   computation failed
#' @export
evaluate_DE <- function(real, simulated) {

  types <- c("DE", "DV", "DD", "DP", "BD")
  inputs <- list(real = real, sim = simulated)

  # one result row for a (feature type, data set) pair; NULL on failure
  prop_row <- function(thistype, label) {
    tryCatch({
      dat <- inputs[[label]]
      prop <- generate_DE(dat, dat$celltype, thistype)
      data.frame(types = thistype, prop = prop, sim = label)
    }, error = function(e) {
      warning("could not compute ", thistype, " for the ", label, " data: ",
              conditionMessage(e), call. = FALSE)
      NULL
    })
  }

  # keep the original row order: real then sim, for each type in turn
  rows <- unlist(
    lapply(types, function(thistype) {
      lapply(names(inputs), function(label) prop_row(thistype, label))
    }),
    recursive = FALSE
  )
  rows <- Filter(Negate(is.null), rows)

  do.call(rbind, rows)
}
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/R/generate_DE_prop.R:
--------------------------------------------------------------------------------
1 |
2 |
3 |
#' Process data
#'
#' Subsets both data sets to the two most abundant cell types of the real
#' data. `SingleCellExperiment` inputs are converted to normalised Seurat
#' objects, because the downstream DE functions work on Seurat objects.
#'
#' @param real real data; must provide `$celltype` and support column
#'   subsetting
#' @param simulated simulated data; must provide `$celltype` and support
#'   column subsetting
#'
#' @return a list with elements `real_final` and `simulated_final`
#' @export
process_data <- function(real, simulated) {

  total_celltypes <- unique(as.character(real$celltype))
  if (length(total_celltypes) < 2) {
    stop("not enough cell types")
  }

  # the two most abundant cell types of the real data
  abundance <- sort(table(real$celltype), decreasing = TRUE)
  celltype <- names(abundance)[1:2]

  # SingleCellExperiment -> normalised Seurat object, keeping the labels;
  # anything else is passed through untouched
  as_normalized_seurat <- function(dat) {
    if (!inherits(dat, "SingleCellExperiment")) {
      return(dat)
    }
    labels <- dat$celltype
    converted <- CreateSeuratObject(counts = counts(dat))
    converted$celltype <- labels
    NormalizeData(converted)
  }

  real_thiscelltype <- real[, real$celltype %in% celltype]
  sim_thiscelltype <- simulated[, simulated$celltype %in% celltype]

  list(real_final = as_normalized_seurat(real_thiscelltype),
       simulated_final = as_normalized_seurat(sim_thiscelltype))
}
47 |
48 |
49 |
#' Generate DE
#'
#' Computes the proportion of genes flagged by one type of between-cell-type
#' difference.
#'
#' @param exprsMat data object handed to the underlying test functions
#' @param trainClass cell type label of every cell
#' @param feature which test to run: "DV" (`doDV`), "DD" (`doDD`), "DP"
#'   (`doChisSquared`), "BD" (`doBI`); any other value, including the default
#'   "DE", runs `doLimma`. Only the first element is used.
#' @param pSig significance cut-off applied to the adjusted p-values
#'   (not used for "BD", which uses a fixed cut-off of 0.3 on the index)
#'
#' @return the number of flagged genes divided by `dim(exprsMat)[1]`
#' @export
generate_DE <- function(exprsMat,
                        trainClass,
                        feature = c("DE", "DV", "DD", "DP", "BD"),
                        pSig = 0.1
){

  # The default is a vector of all choices; comparing it with `==` inside
  # `if` is an error on R >= 4.2, so reduce it to its first element.
  feature <- feature[[1L]]

  n_flagged <- switch(
    feature,
    DV = sum(doDV(exprsMat, trainClass) < pSig),
    DD = sum(doDD(exprsMat, trainClass) < pSig),
    DP = sum(doChisSquared(exprsMat, trainClass) < pSig, na.rm = TRUE),
    BD = sum(doBI(exprsMat, trainClass) > 0.3, na.rm = TRUE),
    # fallback, as before: every other value means limma DE
    sum(doLimma(exprsMat, trainClass)$adj.P.Val < pSig)
  )

  n_flagged / dim(exprsMat)[1]
}
91 |
92 |
# Differential expression between two groups of cells with limma.
# Cells of the first factor level of `cellTypes` form group 1, all others
# group 0. Genes are tested only if expressed (> 0) in more than `exprs_pct`
# of the group-1 cells. Returns the limma topTable with per-group mean
# expression (meanExprs.1/.2) and fraction of expressing cells (meanPct.1/.2)
# appended; column 1 refers to group 0 and column 2 to group 1.
#' @importFrom limma eBayes lmFit
#' @importFrom methods new
doLimma <- function(exprsMat, cellTypes, exprs_pct = 0.05){

  cellTypes <- droplevels(as.factor(cellTypes))

  tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[1], 1, 0)
  design <- stats::model.matrix(~tmp_celltype)

  # Use the accessor rather than `@assays$RNA@data`: the slot does not exist
  # on Seurat v5 (Assay5) objects.
  norm_data <- Seurat::GetAssayData(exprsMat, assay = "RNA", layer = "data")

  meanExprs <- do.call(cbind, lapply(c(0, 1), function(i) {
    Matrix::rowMeans(norm_data[, tmp_celltype == i, drop = FALSE])
  }))

  meanPct <- do.call(cbind, lapply(c(0, 1), function(i) {
    Matrix::rowSums(norm_data[, tmp_celltype == i, drop = FALSE] > 0) /
      sum(tmp_celltype == i)
  }))

  keep <- meanPct[, 2] > exprs_pct

  y <- methods::new("EList")
  # drop = FALSE keeps a matrix even if a single gene passes the filter
  y$E <- norm_data[keep, , drop = FALSE]
  fit <- limma::lmFit(y, design = design)
  fit <- limma::eBayes(fit, trend = TRUE, robust = TRUE)
  tt <- limma::topTable(fit, n = Inf, adjust.method = "BH", coef = 2)

  # when limma reports gene names in an ID column, use them as row names
  if (!is.null(tt$ID)) {
    tt <- tt[!duplicated(tt$ID), ]
    rownames(tt) <- tt$ID
  }

  tt$meanExprs.1 <- meanExprs[rownames(tt), 1]
  tt$meanExprs.2 <- meanExprs[rownames(tt), 2]
  tt$meanPct.1 <- meanPct[rownames(tt), 1]
  tt$meanPct.2 <- meanPct[rownames(tt), 2]

  tt
}
138 |
139 |
# Differential variability between two groups of cells.
# Cells of the first factor level of `cellTypes` form group 1, all others
# group 0. Genes whose fraction of expressing cells is more than 0.05 higher
# in group 1 than in group 0 are tested with Bartlett's test. Returns the
# BH-adjusted p-values of the tested genes.
doDV <- function(exprsMat, cellTypes){

  cellTypes <- droplevels(as.factor(cellTypes))

  tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[1], 1, 0)

  # Use the accessor rather than `@assays$RNA@data`: the slot does not exist
  # on Seurat v5 (Assay5) objects.
  norm_data <- Seurat::GetAssayData(exprsMat, assay = "RNA", layer = "data")

  meanPct <- do.call(cbind, lapply(c(0, 1), function(i) {
    Matrix::rowSums(norm_data[, tmp_celltype == i, drop = FALSE] > 0) /
      sum(tmp_celltype == i)
  }))

  posNeg <- (meanPct[, 2] - meanPct[, 1]) > 0.05

  # nothing passes the filter: no gene to test
  if (!any(posNeg, na.rm = TRUE)) {
    return(numeric(0))
  }

  # drop = FALSE keeps a matrix even if a single gene passes the filter
  exprsMat_filt <- norm_data[posNeg, , drop = FALSE]
  groups <- as.factor(tmp_celltype)
  tt <- apply(exprsMat_filt, 1, function(x) {
    df <- data.frame(gene = x, cellTypes = groups)
    stats::bartlett.test(gene ~ cellTypes, df)$p.value
  })

  stats::p.adjust(tt, method = "BH")
}
171 |
172 |
173 |
174 |
# Differential distribution between two groups of cells.
# Cells of the first factor level of `cellTypes` form group 1, all others
# group 0. Genes whose fraction of expressing cells is more than 0.05 higher
# in group 1 than in group 0 are tested with a one-sided
# (alternative = "greater") Kolmogorov-Smirnov test of group 0 against
# group 1. Returns the BH-adjusted p-values of the tested genes.
doDD <- function(exprsMat, cellTypes){

  cellTypes <- droplevels(as.factor(cellTypes))

  tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[1], 1, 0)

  # Use the accessor rather than `@assays$RNA@data`: the slot does not exist
  # on Seurat v5 (Assay5) objects.
  norm_data <- Seurat::GetAssayData(exprsMat, assay = "RNA", layer = "data")

  meanPct <- do.call(cbind, lapply(c(0, 1), function(i) {
    Matrix::rowSums(norm_data[, tmp_celltype == i, drop = FALSE] > 0) /
      sum(tmp_celltype == i)
  }))

  posNeg <- (meanPct[, 2] - meanPct[, 1]) > 0.05

  # nothing passes the filter: no gene to test
  if (!any(posNeg, na.rm = TRUE)) {
    return(numeric(0))
  }

  # drop = FALSE keeps a matrix even if a single gene passes the filter
  exprsMat_filt <- norm_data[posNeg, , drop = FALSE]

  in_group0 <- tmp_celltype == 0
  in_group1 <- tmp_celltype == 1
  tt <- apply(exprsMat_filt, 1, function(x) {
    stats::ks.test(x[in_group0], x[in_group1], alternative = "greater")$p.value
  })

  stats::p.adjust(tt, method = "BH")
}
206 |
207 |
208 |
209 |
# Differential proportion of expressing cells between two groups.
# Cells of the first factor level of `cellTypes` form group 1, all others
# group 0. A gene counts as expressed in a cell when its value is above
# `threshold`. For every gene a chi-squared test is run on the 2 x 2 table of
# group against expressed / not expressed. Returns the BH-adjusted p-values.
doChisSquared <- function(exprsMat, cellTypes, threshold = 1){

  cellTypes <- droplevels(as.factor(cellTypes))

  tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[1], 1, 0)

  # Use the accessor rather than `@assays$RNA@data`: the slot does not exist
  # on Seurat v5 (Assay5) objects.
  norm_data <- Seurat::GetAssayData(exprsMat, assay = "RNA", layer = "data")

  # 1 where the gene is expressed above the threshold, 0 otherwise
  expressed <- ifelse(as.matrix(norm_data) > threshold, 1, 0)

  in_group0 <- tmp_celltype == 0
  in_group1 <- tmp_celltype == 1
  tt <- apply(expressed, 1, function(x) {
    # fixed factor levels guarantee both columns exist in every table
    tab <- rbind(
      table(factor(x[in_group0], levels = c(0, 1))),
      table(factor(x[in_group1], levels = c(0, 1)))
    )
    # chisq.test warns on small expected counts; the p-value is still used
    suppressWarnings(stats::chisq.test(tab)$p.value)
  })

  stats::p.adjust(tt, method = "BH")
}
238 |
239 |
# Bimodal index of every gene between two groups of cells.
# Cells of the first factor level of `cellTypes` form group 1, all others
# group 0. The index is |mean1 - mean0| / sqrt(p0 * var0 + p1 * var1), where
# p0 / p1 are the proportions of cells in each group. Returns the indices
# sorted in decreasing order.
doBI <- function(exprsMat, cellTypes){

  cellTypes <- droplevels(as.factor(cellTypes))

  tmp_celltype <- ifelse(cellTypes == levels(cellTypes)[1], 1, 0)

  # group proportions, in the order group 0, group 1
  # (plain numeric; the original name `pi` shadowed the base constant)
  group_prop <- as.numeric(table(tmp_celltype)) / length(tmp_celltype)

  # Use the accessor rather than `@assays$RNA@data`: the slot does not exist
  # on Seurat v5 (Assay5) objects.
  norm_data <- Seurat::GetAssayData(exprsMat, assay = "RNA", layer = "data")

  agg_mean <- do.call(cbind, lapply(c(0, 1), function(i) {
    Matrix::rowMeans(norm_data[, tmp_celltype == i, drop = FALSE])
  }))

  agg_sd2 <- do.call(cbind, lapply(c(0, 1), function(i) {
    apply(norm_data[, tmp_celltype == i, drop = FALSE], 1, stats::var)
  }))

  bi <- abs(agg_mean[, 2] - agg_mean[, 1]) /
    sqrt(group_prop[1] * agg_sd2[, 1] + group_prop[2] * agg_sd2[, 2])

  bi <- unlist(bi)
  names(bi) <- rownames(exprsMat)

  bi[order(bi, decreasing = TRUE)]
}
271 |
--------------------------------------------------------------------------------
/R/parameter_estimation.R:
--------------------------------------------------------------------------------
1 |
2 |
#' format the data for evaluation
#'
#' With `counttype = "normalized"` a `SingleCellExperiment` is converted to a
#' Seurat object (other inputs are stored unchanged). With
#' `counttype = "raw"` the counts are converted to an integer matrix and
#' wrapped in a `DESeqDataSet`. Any other `counttype` stores `thissim`
#' unchanged.
#'
#' @param sim_list a list for storing the real and simulated data
#' @param thissim the data to be put into the sim_list
#' @param name name of the data (eg, "real", "simulated")
#' @param celltype cell type that the data contains
#' @param counttype whether the data is unnormalised ("raw") or "normalized"
#' @import DESeq2
#' @return a sim_list filled with the formatted data
#' @export
prepare_eval <- function( sim_list, thissim , name , celltype, counttype ){

  if (counttype == "normalized") {
    # normalised values go into Seurat because DESeqDataSetFromMatrix
    # cannot handle counts with decimals

    if (inherits(thissim, "SingleCellExperiment")) {
      meta_data <- as.data.frame(colData(thissim))
      # drop the first metadata column; drop = FALSE keeps a data frame
      # even when only one column remains
      meta_data <- meta_data[, -1, drop = FALSE]
      thissim <- CreateSeuratObject(counts = counts(thissim),
                                    meta.data = meta_data)
      thissim$group <- celltype
      thissim$sample <- colnames(thissim)
    }

  } else if (counttype == "raw") {

    # raw counts go into a DESeqDataSet, which needs integer counts
    count <- as.matrix(counts(thissim))
    mode(count) <- "integer"

    cell_names <- colnames(thissim)
    thissim <- DESeqDataSetFromMatrix(
      countData = count,
      colData = data.frame(group = celltype,
                           sample = cell_names,
                           row.names = cell_names,
                           stringsAsFactors = FALSE),
      design = ~ 1
    )
  }

  sim_list[[name]] <- thissim
  sim_list
}
47 |
48 |
49 |
50 |
51 |
52 |
#' summarise the scores across multiple cell type according to the proportion of each cell type
#'
#' For every parameter, sums `kde_zstat * proportion` over the rows (one row
#' per cell type) of its table.
#'
#' @param intermediate the scores from multiple cell type: a named list with
#'   one data frame per parameter, each having the columns `kde_zstat` and
#'   `proportion`
#'
#' @return a named list with, per parameter, a one-row data frame holding the
#'   weighted sum in column `sum_kde`; `NULL` when `intermediate` is empty
#' @export
summarise_score <- function( intermediate ){

  allparameter <- names(intermediate)

  # nothing to summarise (e.g. every cell type was skipped upstream)
  if (length(allparameter) == 0) {
    return(NULL)
  }

  return_dataframe <- lapply(seq_along(allparameter), function(i) {
    thisparameter <- intermediate[[i]]
    weighted <- as.numeric(thisparameter$kde_zstat) * thisparameter$proportion
    data.frame(sum_kde = sum(weighted))
  })
  names(return_dataframe) <- allparameter

  return_dataframe
}
78 |
79 |
80 |
81 |
82 |
83 |
#' run evaluation on parameter estimation
#'
#' Evaluates every cell type of the real data separately (cell types with
#' fewer than 10 real cells are skipped) and then combines the per-cell-type
#' statistics, weighted by cell type proportion.
#'
#' @param real real data
#' @param sim simulated data
#' @param type whether the simulated data is raw (unnormalised raw count) or normalized
#' @param method name of the simulation method
#' @param ncore number of cores for parallel computing, forwarded to
#'   `countsim_eval()`
#'
#' @return distribution of parameters that are used to construct KDE test, KDE test statistics of each cell type, combined KDE test statistics
#'   (list elements `raw_value`, `stats_celltype` and `stats_overall`;
#'   `stats_overall` is `NULL` when no cell type could be evaluated)
#' @export
eval_parameter <- function(real , sim , type = "raw" , method = "samplemethod",
                           ncore = 8) {

  #-----------Prepare the dataset ------------------#

  celltype <- unique(as.character(real$celltype))

  eval_result_allcelltype <- NULL
  eval_result_raw <- list()

  #----------------- start evaluating --------------#

  # loop through each cell type
  for (thiscelltype in celltype) {
    sim_list <- list()

    celltypeproportion <- sum(real$celltype == thiscelltype) / length(real$celltype)
    print(paste0("evaluating cell type .. ", thiscelltype))

    # prepare the format; too small cell types are skipped
    data_real <- real[, real$celltype == thiscelltype]
    if (ncol(data_real) < 10) next
    sim_list <- prepare_eval(sim_list = sim_list,
                             thissim = data_real,
                             name = "real",
                             celltype = thiscelltype,
                             counttype = type)

    data_sim <- sim[, sim$celltype == thiscelltype]
    sim_list <- prepare_eval(sim_list = sim_list,
                             thissim = data_sim,
                             name = method,
                             celltype = thiscelltype,
                             counttype = type)

    # generate the evaluation result
    eval_result <- countsim_eval(sim_list, ncore = ncore)
    eval_result_raw[[thiscelltype]] <- eval_result

    # annotate every score table with the cell type and its proportion
    eval_result <- lapply(eval_result$score, function(x) {
      x <- cbind(x, celltype = thiscelltype)
      cbind(x, proportion = celltypeproportion)
    })

    # combine the result for each cell type
    if (is.null(eval_result_allcelltype)) {
      eval_result_allcelltype <- eval_result
    } else {
      eval_result_allcelltype <- mapply(FUN = rbind, eval_result_allcelltype,
                                        eval_result, SIMPLIFY = FALSE)
    }

  } # end the for loop for evaluating each individual cell type

  if (is.null(eval_result_allcelltype)) {
    # every cell type was skipped, so there is nothing to summarise
    warning("no cell type has at least 10 cells in the real data; ",
            "nothing was evaluated", call. = FALSE)
    summarise_celltype <- NULL
  } else {
    summarise_celltype <- summarise_score(eval_result_allcelltype)
  }

  list(raw_value = eval_result_raw,
       stats_celltype = eval_result_allcelltype,
       stats_overall = summarise_celltype)
}
165 |
166 | # save the result for the methods that evaluate based on each cell type
167 |
168 |
--------------------------------------------------------------------------------
/R/plotting.R:
--------------------------------------------------------------------------------
1 | # The key challenge is to compare across the evaluation criteria.
2 | # For this, show the distribution of the different metrics.
3 |
4 |
5 | # Show that, for the same dataset, the scores share a similar distribution, ie, they need to be comparable.
6 |
7 |
8 |
9 | #' Plot the distribution of each data property in the real and simulated data
10 | #'
11 | #' @param result a list holding the data frames `sampleDF`, `featureDF`,
12 | #' `sampleCorrDF` and `featureCorrDF`, each with a `dataset` column; in the
13 | #' README this is `parameter_result$raw_value[[celltype]]$raw_value`, taken
14 | #' from the output of `eval_parameter`
15 | #'
16 | #' @return a named list of 13 ggplot objects, one per plotted property
17 | #' @import ggplot2 ggpubr ggthemes
18 | #' @export
19 | draw_parameter_plot <- function( result ){
20 |
21 |   sampleDF <- result$sampleDF
22 |   featureDF <- result$featureDF
23 |   sampleCorrDF <- result$sampleCorrDF
24 |   featureCorrDF <- result$featureCorrDF
25 |
26 |   # layers shared by every panel: a two-colour palette for `dataset` and a
27 |   # framed theme without grid lines
28 |   # NOTE(review): two colours presumably assume exactly two datasets - confirm
29 |   dataset_cols <- c( "#184275", "#b3202c" )
30 |   shared <- list(
31 |     scale_fill_manual(values = dataset_cols),
32 |     scale_colour_manual(values = dataset_cols),
33 |     theme(text = element_text(size = 12),
34 |           axis.text.x = element_text(angle = 45, hjust = 1),
35 |           panel.grid.major = element_blank(),
36 |           panel.grid.minor = element_blank(),
37 |           panel.background = element_rect(colour = "black", size = 0.2, fill = NA)) )
38 |
39 |   # density of a single property, one curve per dataset
40 |   density_panel <- function(df, mapping, title, x_label) {
41 |     ggplot(df, mapping) +
42 |       geom_density(alpha = 0.7) +
43 |       xlab(x_label) +
44 |       ggtitle(title) + shared
45 |   }
46 |
47 |   # scatter of one property against another, one colour per dataset
48 |   scatter_panel <- function(df, mapping, title, x_label, y_label, point_alpha) {
49 |     ggplot(df, mapping) +
50 |       geom_point(size = 0.5, alpha = point_alpha) +
51 |       xlab(x_label) + ylab(y_label) +
52 |       ggtitle(title) + shared
53 |   }
54 |
55 |   # the panels are added in a fixed order, which is the order of the returned list
56 |   plot_list <- list()
57 |
58 |   plot_list$libsize <- density_panel(
59 |     sampleDF, aes(x = Libsize, group = dataset, fill = dataset, color = dataset),
60 |     "libsize", "library size")
61 |
62 |   # sampleDF holds one row per cell, so the fraction of zeros is per cell
63 |   plot_list$libsize_fraczero <- scatter_panel(
64 |     sampleDF, aes(x = Libsize, y = Fraczero, color = dataset),
65 |     "libsize_fraczero", "library size", "fraction zero per cell", 0.5)
66 |
67 |   plot_list$tmm <- density_panel(
68 |     sampleDF, aes(x = TMM, group = dataset, fill = dataset, color = dataset),
69 |     "TMM", "TMM")
70 |
71 |   plot_list$effectivelibsize <- density_panel(
72 |     sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset, color = dataset),
73 |     "effective library size", "effective library size")
74 |
75 |   plot_list$mean_variance <- scatter_panel(
76 |     featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm, color = dataset, fill = dataset),
77 |     "mean_variance", " mean expression ", " variance of gene expression ", 0.1)
78 |
79 |   plot_list$variance <- density_panel(
80 |     featureDF, aes(x = variance_log2_cpm, group = dataset, fill = dataset, color = dataset),
81 |     "variance", "variance log2 cpm")
82 |
83 |   plot_list$variance_scaled <- density_panel(
84 |     featureDF, aes(x = variance_scaled_log2_cpm, group = dataset, fill = dataset, color = dataset),
85 |     "scaled variance", "variance scaled log2 cpm")
86 |
87 |   plot_list$samplecor <- density_panel(
88 |     sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset, color = dataset),
89 |     "samplecor", "sample correlation")
90 |
91 |   plot_list$featurecor <- density_panel(
92 |     featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset, color = dataset),
93 |     "featurecor", "feature correlation")
94 |
95 |   plot_list$mean_fraczero <- scatter_panel(
96 |     featureDF, aes(x = average_log2_cpm, y = Fraczero, color = dataset),
97 |     "mean_fraczero", "mean expression", "fraction zero per gene", 0.1)
98 |
99 |   plot_list$fraczerogene <- density_panel(
100 |     featureDF, aes(x = Fraczero, group = dataset, fill = dataset, color = dataset),
101 |     "fraczerogene", "Fraction zeros per gene")
102 |
103 |   plot_list$fraczerocell <- density_panel(
104 |     sampleDF, aes(x = Fraczero, group = dataset, fill = dataset, color = dataset),
105 |     "fraczerocell", "Fraction zeros per cell")
106 |
107 |   plot_list$mean <- density_panel(
108 |     featureDF, aes(x = average_log2_cpm, group = dataset, fill = dataset, color = dataset),
109 |     "mean", "average log2 cpm")
110 |
111 |   return( plot_list )
112 |
113 | }
181 |
182 |
183 |
184 | #' Bar plot of the proportion of each type of biological signal
185 | #'
186 | #' @param result data frame with the columns `types`, `prop` and `sim`; in the
187 | #' README this is the output of `eval_signal`
188 | #'
189 | #' @return a ggplot object: dodged bars of `prop` for each of `types`, filled by `sim`
190 | #' @import ggplot2 ggpubr ggthemes
191 | #' @export
192 | draw_biosignal_plot <- function(result){
193 |
194 |   # fix the display order of the signal types along the x axis
195 |   signal_order <- c("DE" , "DV", "DD", "DP", "BD")
196 |   result$types <- factor(result$types , signal_order)
197 |
198 |   # framed panel without grid lines, applied on top of theme_minimal()
199 |   framed <- theme(panel.grid.major = element_blank(),
200 |                   panel.grid.minor = element_blank(),
201 |                   panel.background = element_rect(colour = "black", size=0.2, fill=NA))
202 |
203 |   bars <- ggplot(data= result , aes(x= types, y=prop, fill=sim)) +
204 |     geom_bar(stat="identity", position=position_dodge())
205 |
206 |   return(bars + scale_fill_manual(values=c("#b3202c" , "#184275" )) +
207 |            theme_minimal() + framed + ylim(0,1))
208 | }
209 |
210 |
211 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SimBench: benchmarking of single cell simulation methods
2 |
3 |
4 |
5 | The `SimBench` package is designed for benchmarking simulation methods based on two key aspects of accuracy of data properties estimation and ability to retain biological signals.
6 |
7 | In detail, `SimBench`:
8 | i) quantifies the distributional similarity between a simulated and a real scRNA-seq dataset using the KDE test (Kernel Density Based Global Two-Sample Comparison Test) across 13 gene-wise and cell-wise properties.
9 | ii) measures how similar the amount of biological signal is in a simulated and a real scRNA-seq dataset by comparing the proportions of DE, DV, DD, DP and BD genes.
10 |
11 |
12 | ## Installation
13 |
14 |
15 | You may need to install the following dependencies first:
16 |
17 | ```r
18 | ggthemes, ggpubr, ggplot2, dplyr, plyr, Seurat, SingleCellExperiment, edgeR, DESeq2, caret, ks
19 | ```
20 |
21 |
22 | `SimBench` can be then installed using `devtools`
23 |
24 | ```r
25 | library(devtools)
26 | devtools::install_github("SydneyBioX/SimBench")
27 | library(SimBench)
28 | library(parallel)
29 | library(DESeq2)
30 | library(Seurat)
31 | library(dplyr)
32 | ```
33 |
34 |
35 | # Getting started
36 |
37 |
38 | ## Example usage
39 |
40 |
41 | We provide a 'simulated' dataset (`sim.rds`) and a 'real' scRNA-seq dataset (`real.rds`) in this GitHub repository to illustrate the usage of our code.
42 |
43 |
44 |
45 |
46 | ### Load example data
47 |
48 | The files are provided in the `inst/extdata` folder of this GitHub repository.
49 |
50 | ```r
51 | path <- system.file("extdata", "real.rds", package="SimBench")
52 | real <- readRDS(path)
53 |
54 | path <- system.file("extdata", "sim.rds", package="SimBench")
55 | sim <- readRDS(path)
56 | ```
57 |
58 | Note that both the sim and the real dataset need to be `SingleCellExperiment` objects.
59 | If `celltype` is provided in the object, the comparison is made for each cell type separately and the results are then combined using a weighted sum (where the weight is the proportion of the cell type).
60 | If no `celltype` is provided, the comparison is made on the entire dataset.
61 |
62 |
63 |
64 | ### Parameter estimation
65 |
66 | The parameter estimation score can be obtained by :
67 |
68 | ```r
69 | parameter_result <- eval_parameter(real = real, sim = sim, type = "raw" , method = "samplemethod")
70 | ```
71 | The output contains 3 fields:
72 | `stats_overall` gives the overall KDE test statistics
73 | `stats_celltype` gives the KDE test statistics for each cell type
74 | `raw_value` gives the raw values used to perform the KDE test (eg, the mean expression of each gene)
75 |
76 |
77 | #### Visualise
78 |
79 | We can use the raw value to visualise the simulated dataset and real dataset over 13 parameters.
80 |
81 | ```r
82 | distribution_celltype <- parameter_result$raw_value$`B cell`$raw_value #this obtain the distribution of B cell type
83 | fig <- draw_parameter_plot(distribution_celltype)
84 | ggarrange( plotlist = fig , common.legend = T)
85 | ```
86 |
87 |
88 |
89 | ### Maintaining biological signatures
90 |
91 |
92 | Evaluation of biological signals can be obtained by
93 |
94 | ```r
95 | signal_result <- eval_signal( real = real, sim = sim )
96 | ```
97 |
98 | #### Visualise
99 |
100 | The proportion difference can be visualised using barplot.
101 |
102 | ```r
103 | draw_biosignal_plot(signal_result)
104 | ```
105 |
106 |
107 |
108 |
109 | # Reference
110 |
111 | Part of the code was inspired by and adapted from the R packages `countsimQC` and `scClassify`.
112 |
113 | > Soneson, C., & Robinson, M. D. (2018). Towards unified quality verification of synthetic count data with countsimQC. Bioinformatics, 34(4), 691-692.
114 | > Lin, Y., Cao, Y., Kim, H. J., Salim, A., Speed, T. P., Lin, D. M., ... & Yang, J. Y. H. (2020). scClassify: sample size estimation and multiscale classification of cells using single and multiple reference. Molecular systems biology, 16(6), e9389.
115 |
116 | Installation of `countsimQC` is however not required for running `SimBench`.
117 |
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/SimBench.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/inst/extdata/real.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SydneyBioX/SimBench/d009e5d33280134cf47a9f5c7f5df44bdf3f25ef/inst/extdata/real.rds
--------------------------------------------------------------------------------
/inst/extdata/sim.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SydneyBioX/SimBench/d009e5d33280134cf47a9f5c7f5df44bdf3f25ef/inst/extdata/sim.rds
--------------------------------------------------------------------------------
/man/KDE_test.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/KDE_test.R
3 | \name{KDE_test}
4 | \alias{KDE_test}
5 | \title{Perform KDE test}
6 | \usage{
7 | KDE_test(df, column, subsampleSize)
8 | }
9 | \arguments{
10 | \item{df}{dataframe containing the raw distribution for comparison}
11 |
12 | \item{column}{the column containing the parameter of interest}
13 |
14 | \item{subsampleSize}{maximum number of data points for comparison}
15 | }
16 | \value{
17 | KDE test statistic
18 | }
19 | \description{
20 | Perform KDE test
21 | }
22 |
--------------------------------------------------------------------------------
/man/calculateFeatureCorrs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculateGeneCellCorr.R
3 | \name{calculateFeatureCorrs}
4 | \alias{calculateFeatureCorrs}
5 | \title{genewise correlation}
6 | \usage{
7 | calculateFeatureCorrs(sim_list, ncore = 8)
8 | }
9 | \arguments{
10 | \item{sim_list}{a list containing the real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | A data frame with genewise correlations for real and simulated data
16 | }
17 | \description{
18 | genewise correlation
19 | }
20 |
--------------------------------------------------------------------------------
/man/calculateMeanVarLibrary.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculateMeanVarLibrary.R
3 | \name{calculateMeanVarLibrary}
4 | \alias{calculateMeanVarLibrary}
5 | \title{Calculate mean variance and library size estimates}
6 | \usage{
7 | calculateMeanVarLibrary(sim_list, ncore = 8)
8 | }
9 | \arguments{
10 | \item{sim_list}{A list containing real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | a list containing the mean , variance , library size estimates of the real and simulated data for downstream evaluation
16 | }
17 | \description{
18 | Calculate mean variance and library size estimates
19 | }
20 |
--------------------------------------------------------------------------------
/man/calculateSampleCorrs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculateGeneCellCorr.R
3 | \name{calculateSampleCorrs}
4 | \alias{calculateSampleCorrs}
5 | \title{cellwise correlation}
6 | \usage{
7 | calculateSampleCorrs(sim_list, maxNForCorr = 1000, ncore = 8)
8 | }
9 | \arguments{
10 | \item{sim_list}{a list containing the real and simulated data}
11 |
12 | \item{maxNForCorr}{number of cells selected to compute the cellwise correlation}
13 |
14 | \item{ncore}{number of cores for parallel computing}
15 | }
16 | \value{
17 | A data frame with cellwise correlations for real and simulated data
18 | }
19 | \description{
20 | cellwise correlation
21 | }
22 |
--------------------------------------------------------------------------------
/man/countsim_eval.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/countsim_eval_KS.R
3 | \name{countsim_eval}
4 | \alias{countsim_eval}
5 | \title{Calculate KDE test statistic on each parameter}
6 | \usage{
7 | countsim_eval(sim_list, ncore = 8)
8 | }
9 | \arguments{
10 | \item{sim_list}{A list containing real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | the KDE test statistic across 13 parameters, and the raw values that are used to calculate the KDE test statistics
16 | }
17 | \description{
18 | Calculate KDE test statistic on each parameter
19 | }
20 |
--------------------------------------------------------------------------------
/man/draw_biosignal_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plotting.R
3 | \name{draw_biosignal_plot}
4 | \alias{draw_biosignal_plot}
5 | \title{Plot biological plot}
6 | \usage{
7 | draw_biosignal_plot(result)
8 | }
9 | \arguments{
10 | \item{sim_list}{A list containing real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | the KDE test statistic across 13 parameters, and the raw values that are used to calculate the KDE test statistics
16 | }
17 | \description{
18 | Plot biological plot
19 | }
20 |
--------------------------------------------------------------------------------
/man/draw_parameter_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plotting.R
3 | \name{draw_parameter_plot}
4 | \alias{draw_parameter_plot}
5 | \title{Plot}
6 | \usage{
7 | draw_parameter_plot(result)
8 | }
9 | \arguments{
10 | \item{sim_list}{A list containing real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | the KDE test statistic across 13 parameters, and the raw values that are used to calculate the KDE test statistics
16 | }
17 | \description{
18 | Plot
19 | }
20 |
--------------------------------------------------------------------------------
/man/eval_parameter.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/parameter_estimation.R
3 | \name{eval_parameter}
4 | \alias{eval_parameter}
5 | \title{run evaluation on parameter estimation}
6 | \usage{
7 | eval_parameter(real, sim, type = "raw", method = "samplemethod")
8 | }
9 | \arguments{
10 | \item{real}{real data}
11 |
12 | \item{sim}{simulated data}
13 |
14 | \item{type}{whether the simulated data is raw (unnormalised raw count) or normalized}
15 |
16 | \item{method}{name of the simulation method}
17 | }
18 | \value{
19 | distribution of parameters that are used to construct KDE test, KDE test statistics of each cell type, combined KDE test statistics
20 | }
21 | \description{
22 | run evaluation on parameter estimation
23 | }
24 |
--------------------------------------------------------------------------------
/man/eval_signal.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/biological_signal_prop.R
3 | \name{eval_signal}
4 | \alias{eval_signal}
5 | \title{evaluate the similarities between the two set of DE genes}
6 | \usage{
7 | eval_signal(real, sim)
8 | }
9 | \arguments{
10 | \item{real}{real data}
11 |
12 | \item{simulated}{simulated data}
13 | }
14 | \value{
15 | confusion matrix, logFC of the set of genes from real and simiulated data
16 | }
17 | \description{
18 | evaluate the similarities between the two set of DE genes
19 | }
20 |
--------------------------------------------------------------------------------
/man/evaluate_DE.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/evaluate_DE_prop.R
3 | \name{evaluate_DE}
4 | \alias{evaluate_DE}
5 | \title{Evaluate DE}
6 | \usage{
7 | evaluate_DE(real, simulated)
8 | }
9 | \arguments{
10 | \item{real}{A list containing real and simulated data}
11 |
12 | \item{simulated}{number of cores for parallel computing}
13 | }
14 | \value{
15 | DE
16 | }
17 | \description{
18 | Evaluate DE
19 | }
20 |
--------------------------------------------------------------------------------
/man/figures/vis_biosignal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SydneyBioX/SimBench/d009e5d33280134cf47a9f5c7f5df44bdf3f25ef/man/figures/vis_biosignal.jpg
--------------------------------------------------------------------------------
/man/figures/vis_parameter.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SydneyBioX/SimBench/d009e5d33280134cf47a9f5c7f5df44bdf3f25ef/man/figures/vis_parameter.jpg
--------------------------------------------------------------------------------
/man/generate_DE.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/generate_DE_prop.R
3 | \name{generate_DE}
4 | \alias{generate_DE}
5 | \title{Generate DE}
6 | \usage{
7 | generate_DE(
8 | exprsMat,
9 | trainClass,
10 | feature = c("DE", "DV", "DD", "DP", "BD"),
11 | pSig = 0.1
12 | )
13 | }
14 | \arguments{
15 | \item{sim_list}{A list containing real and simulated data}
16 |
17 | \item{ncore}{number of cores for parallel computing}
18 | }
19 | \value{
20 | the KDE test statistic across 13 parameters, and the raw values that are used to calculate the KDE test statistics
21 | }
22 | \description{
23 | Generate DE
24 | }
25 |
--------------------------------------------------------------------------------
/man/prepare_eval.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/parameter_estimation.R
3 | \name{prepare_eval}
4 | \alias{prepare_eval}
5 | \title{format the data for evaluation}
6 | \usage{
7 | prepare_eval(sim_list, thissim, name, celltype, counttype)
8 | }
9 | \arguments{
10 | \item{sim_list}{a list for storing the real and simulated data}
11 |
12 | \item{thissim}{the data to be put into the sim_list}
13 |
14 | \item{name}{name of the data (eg, "real", "simulated")}
15 |
16 | \item{celltype}{cell type that the data contains}
17 |
18 | \item{counttype}{whether the data is unnormalised or normalised}
19 | }
20 | \value{
21 | a sim_list filled with the formatted data
22 | }
23 | \description{
24 | format the data for evaluation
25 | }
26 |
--------------------------------------------------------------------------------
/man/process_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/generate_DE_prop.R
3 | \name{process_data}
4 | \alias{process_data}
5 | \title{Process data}
6 | \usage{
7 | process_data(real, simulated)
8 | }
9 | \arguments{
10 | \item{sim_list}{A list containing real and simulated data}
11 |
12 | \item{ncore}{number of cores for parallel computing}
13 | }
14 | \value{
15 | the KDE test statistic across 13 parameters, and the raw values that are used to calculate the KDE test statistics
16 | }
17 | \description{
18 | Process data
19 | }
20 |
--------------------------------------------------------------------------------
/man/summarise_score.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/parameter_estimation.R
3 | \name{summarise_score}
4 | \alias{summarise_score}
5 | \title{summarise the scores across multiple cell type according to the proportion of each cell type}
6 | \usage{
7 | summarise_score(intermediate)
8 | }
9 | \arguments{
10 | \item{intermediate}{the scores from multiple cell type}
11 | }
12 | \value{
13 | a summarised score
14 | }
15 | \description{
16 | summarise the scores across multiple cell type according to the proportion of each cell type
17 | }
18 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # SimBench: benchmarking of single cell simulation methods
2 |
3 |
4 | The `SimBench` package is designed for benchmarking simulation methods based on two key aspects of accuracy of data properties estimation and ability to retain biological signals.
5 |
6 | In detail, `SimBench`:
7 | i) quantifies the distributional similarity between a simulated and a real scRNA-seq dataset using the KDE test (Kernel Density Based Global Two-Sample Comparison Test) across 13 gene-wise and cell-wise properties.
8 | ii) measures how similar the amount of biological signal is in a simulated and a real scRNA-seq dataset by comparing the proportions of DE, DV, DD, DP and BD genes.
9 |
10 |
11 | ## Installation
12 |
13 |
14 | You may need to install the following dependencies first:
15 |
16 | ```r
17 | ggthemes, ggpubr, ggplot2, dplyr, plyr, Seurat, SingleCellExperiment, edgeR, DESeq2, caret, ks
18 | ```
19 |
20 |
21 | `SimBench` can be then installed using `devtools`
22 |
23 | ```r
24 | library(devtools)
25 | devtools::install_github("SydneyBioX/SimBench")
26 | library(SimBench)
27 | ```
28 |
29 |
30 | # Getting started
31 |
32 |
33 | ## Example usage
34 |
35 |
36 | We provide a 'simulated' dataset (`sim.rds`) and a 'real' scRNA-seq dataset (`real.rds`) in this GitHub repository to illustrate the usage of our code.
37 |
38 |
39 |
40 |
41 | ### Load example data
42 |
43 | The files are provided in the `inst/extdata` folder of this GitHub repository.
44 |
45 | ```r
46 | path <- system.file("extdata", "real.rds", package="SimBench")
47 | real <- readRDS(path)
48 |
49 | path <- system.file("extdata", "sim.rds", package="SimBench")
50 | sim <- readRDS(path)
51 | ```
52 |
53 | Note that both the sim and the real dataset need to be `SingleCellExperiment` objects.
54 | If `celltype` is provided in the object, the comparison is made for each cell type separately and the results are then combined using a weighted sum (where the weight is the proportion of the cell type).
55 | If no `celltype` is provided, the comparison is made on the entire dataset.
56 |
57 |
58 |
59 | ### Parameter estimation
60 |
61 | The parameter estimation score can be obtained by :
62 |
63 | ```r
64 | parameter_result <- eval_parameter(real = real, sim = sim, type = "count" , method = "samplemethod")
65 | ```
66 | The output contains 3 fields:
67 | `stats_overall` gives the overall KDE test statistics
68 | `stats_celltype` gives the KDE test statistics for each cell type
69 | `raw_value` gives the raw values used to perform the KDE test (eg, the mean expression of each gene)
70 |
71 |
72 | #### Visualise
73 |
74 | We can use the raw value to visualise the simulated dataset and real dataset over 13 parameters.
75 |
76 | ```r
77 | distribution_celltype <- parameter_result$raw_value$`B cell`$raw_value #this obtain the distribution of B cell type
78 | fig <- draw_parameter_plot(distribution_celltype)
79 | ggarrange( plotlist = fig , common.legend = T)
80 | ```
81 |
82 |
83 |
84 | ### Maintaining biological signatures
85 |
86 |
87 | Evaluation of biological signals can be obtained by
88 |
89 | ```r
90 | signal_result <- eval_signal( real = real, sim = sim )
91 | ```
92 |
93 | #### Visualise
94 |
95 | The proportion difference can be visualised using barplot.
96 |
97 | ```r
98 | draw_biosignal_plot(signal_result)
99 | ```
100 |
101 |
102 |
103 |
104 | # Reference
105 |
106 | Part of the code was inspired by and adapted from the R packages `countsimQC` and `scClassify`.
107 |
108 | > Soneson, C., & Robinson, M. D. (2018). Towards unified quality verification of synthetic count data with countsimQC. Bioinformatics, 34(4), 691-692.
109 | > Lin, Y., Cao, Y., Kim, H. J., Salim, A., Speed, T. P., Lin, D. M., ... & Yang, J. Y. H. (2020). scClassify: sample size estimation and multiscale classification of cells using single and multiple reference. Molecular systems biology, 16(6), e9389.
110 |
111 | Installation of `countsimQC` is however not required for running `SimBench`.
112 |
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------