├── .DS_Store
├── ENSG_to_HGNC.rds
├── LICENSE.md
├── MASTER.R
├── README.md
├── SQUID
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
│ └── SQUID.R
├── README.md
├── SQUID.Rproj
├── SQUID_Toy_Example_Data.RData
└── man
│ └── SQUID.Rd
├── SURVIVAL
└── Survival_Analyses.R
├── Toy_example
├── .DS_Store
├── P.rds
├── T.rds
├── markers_TMM_FC1.5_Seuratwilcox.rds
├── phenoDataC_clusters_after_regrouping.txt
├── phenoDataC_clusters_wo_regrouping.txt
└── scC.rds
├── additional_data
└── Denisenko_crosstable_bulk_sc_sn_14092021.csv
├── helper_functions.R
├── human_lengths.rda
├── schematic_figures
├── .DS_Store
├── Scheme1_Github.jpg
├── Scheme1_Github.pdf
├── Scheme2_Github.jpg
└── Scheme2_Github.pdf
└── sessionInfo_macOS.txt
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/.DS_Store
--------------------------------------------------------------------------------
/ENSG_to_HGNC.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/ENSG_to_HGNC.rds
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # LICENSE
2 |
3 | This repository is distributed under the BSD 3-clause license.
4 |
5 | **IMPORTANT: methods used and included in this repository are provided as separate packages. Their respective license conditions apply**
6 |
7 |
8 | ```
9 | Copyright 2022 Francisco Avila Cobos
10 |
11 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
12 |
13 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
14 |
15 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
16 |
17 | * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20 | ```
--------------------------------------------------------------------------------
/MASTER.R:
--------------------------------------------------------------------------------
1 | suppressMessages(library(tidyverse))
2 | suppressMessages(library(data.table))
3 | suppressMessages(library(ggrastr))
4 | suppressMessages(library(viridis))
5 | suppressMessages(library(ggpointdensity))
6 | suppressMessages(library(egg))
7 | suppressMessages(library(Matrix))
8 | suppressMessages(library(matrixStats))
9 | suppressMessages(library(limma))
10 | suppressMessages(library(ggplot2))
11 | suppressMessages(library(RColorBrewer))
12 | suppressMessages(library(ggrepel))
13 | suppressMessages(library(ggpubr))
14 | suppressMessages(library(monocle3))
15 | suppressMessages(library(Seurat))
16 | suppressMessages(library(foreach))
17 | suppressMessages(library(R.utils))
18 |
19 | args <- R.utils::commandArgs(trailingOnly=TRUE)
20 |
21 | if(length(args) != 5){
22 |
23 | print("Please check that all required parameters are indicated or are correct")
24 | stop()
25 | }
26 |
27 | set.seed(4)
28 | options(digits = 3)
29 | options(future.globals.maxSize = 2000 * 1024^2) #https://satijalab.org/seurat/archive/v3.0/future_vignette.html
30 |
31 | ENSG_to_HGNC = readRDS("./ENSG_to_HGNC.rds")
32 |
33 | source('./helper_functions.R')
34 |
35 | ##############################################################################
36 | ## Input files and parameters:
37 |
38 | directory <- args[1]
39 | ref.normalization = args[2] # normalization choice for scC or C
40 | mix.normalization = args[3] # normalization choice for T
41 | method = args[4]
42 |
43 | cell.type.info.present <- args[5] #if "yes", no need to apply unsupervised clustering
44 |
45 | ##############################################################################
46 | ## Reading input into memory:
47 |
48 | setwd(directory)
49 |
50 | scC <- readRDS("./scC.rds") %>% as.matrix()
51 | T = readRDS("./T.rds") %>% as.matrix()
52 | phenoDataC = read.table("phenoDataC_clusters_wo_regrouping.txt", header = TRUE) # It must contain at least two columns: "cellID" & "SubjectName". "cellType" is optional (e.g. if cell.type.info.present = "no")
53 |
54 | #Unless proportions are provided from IHC, it would be computed directly from scRNA-seq/snRNA-seq
55 | if(!file.exists("./P.rds")){
56 | P = NULL
57 | } else {
58 | P = readRDS("./P.rds")
59 | }
60 |
61 | ##############################################################################
62 | ## PRE-PROCESSING AND QUALITY CONTROL
63 |
64 | # Filter to remove 1% of extreme values (top 0.5% and bottom 0.5%)
65 | filterCells <- function(filterParam){
66 | cellsToRemove <- which(filterParam < quantile(filterParam, .005) | filterParam > quantile(filterParam, .995))
67 | cellsToRemove
68 | }
69 |
70 | libSizes <- colSums(scC)
71 | gene_names <- rownames(scC)
72 |
73 | if(length(grep("ENSG000|ENSMUSG000",rownames(scC))) > 100){
74 |
75 | mtID.Ensembl <- ENSG_to_HGNC$ENSG_ID[grep("^MT-|_MT-", ENSG_to_HGNC$HGNC, ignore.case = TRUE)]
76 | rbID.Ensembl <- ENSG_to_HGNC$ENSG_ID[grep("^RPL|^RPS|_RPL|_RPS", ENSG_to_HGNC$HGNC, ignore.case = TRUE)]
77 | mtID <- gene_names %in% mtID.Ensembl
78 | rbID <- gene_names %in% rbID.Ensembl
79 |
80 | } else {
81 |
82 | mtID <- grepl("^MT-|_MT-", gene_names, ignore.case = TRUE)
83 | rbID <- grepl("^RPL|^RPS|_RPL|_RPS", gene_names, ignore.case = TRUE)
84 |
85 | }
86 |
87 | mtPercent <- colSums(scC[mtID, ])/libSizes
88 | rbPercent <- colSums(scC[rbID, ])/libSizes
89 |
90 | lapply(list(libSizes = libSizes, mtPercent = mtPercent, rbPercent = rbPercent), filterCells) %>%
91 | unlist() %>%
92 | unique() -> cellsToRemove
93 |
94 | if(length(cellsToRemove) != 0){
95 | scC <- scC[,-cellsToRemove]
96 | phenoDataC <- phenoDataC[-cellsToRemove,]
97 | }
98 |
99 | # Keep only "detectable" genes: min(round(0.01 * ncol(scC), 10) cells (regardless of the group) have a read/UMI count different from 0
100 | keep <- which(Matrix::rowSums(scC > 0) >= min(round(0.01 * ncol(scC)), 10)) #otherwise it removes many genes from Ellis dataset
101 | scC = scC[keep,]
102 |
103 | ##############################################################################
104 | ## Unsupervised clustering (Monocle3) if metadata information not provided
105 |
106 | if(cell.type.info.present == "no"){
107 |
108 | if(file.exists("phenoDataC_clusters_after_regrouping.txt")){
109 |
110 | phenoDataC = read.table("phenoDataC_clusters_after_regrouping.txt", header = TRUE)
111 |
112 | } else {
113 |
114 | cds <- monocle3::new_cell_data_set(scC)
115 | cds <- monocle3::preprocess_cds(cds, num_dim = 100, norm_method = "log", method = "PCA", scaling = TRUE)
116 | cds <- monocle3::reduce_dimension(cds, max_components = 2, umap.metric = "cosine", umap.fast_sgd = FALSE, preprocess_method = 'PCA')
117 | cds <- monocle3::cluster_cells(cds, k = 20, resolution = NULL, partition_qval = 0.05, num_iter = 1) #with resolution = NULL, the parameter is determined automatically
118 |
119 | Monocle3_metadata <- data.frame(cellID = names(cds@clusters$UMAP$clusters), cellType = paste("cluster", cds@clusters$UMAP$clusters, sep ="."))
120 | phenoDataC$clusterID <- Monocle3_metadata$cellType
121 |
122 | m1 <- monocle3::plot_cells(cds, color_cells_by = "cluster", group_label_size = 4)
123 |
124 | ggsave(paste("Monocle3_",basename(directory),".pdf",sep=""), gridExtra::grid.arrange(m1, nrow = 1), height = 6, width = 6, limitsize = FALSE)
125 | dev.off()
126 |
127 | ## Make C based on clusters from unsupervised clustering
128 | cellType <- phenoDataC$clusterID
129 | group = list()
130 |
131 | for(i in unique(cellType)){
132 | group[[i]] <- which(cellType %in% i)
133 | }
134 |
135 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x])) #C should be made with the mean (not sum) to agree with the way markers were selected
136 | C = do.call(cbind.data.frame, C)
137 |
138 | ###############
139 | # Regrouping if correlation > 0.95
140 |
141 | correlation.matrix = cor(C)
142 | annotation.regrouped <- regroup.cor(correlation.matrix, correlation.threshold = 0.95)
143 |
144 | if(nrow(annotation.regrouped) > 0){
145 |
146 | pdf(paste("Monocle3_cor_full_", basename(getwd()), "_log.ls.norm.pdf", sep = ""), width = max(7, ncol(C)/2), height = max(7, ncol(C)/2))
147 | pheatmap::pheatmap(correlation.matrix, annotation = annotation.regrouped, scale = "none", display_numbers = TRUE, number_format = "%.2f")
148 | dev.off()
149 |
150 | counter = 1
151 | for(elem in unique(annotation.regrouped$new.group)){
152 | phenoDataC$clusterID[phenoDataC$clusterID %in% rownames(annotation.regrouped)[annotation.regrouped$new.group == elem]] <- paste("regrouped.cluster", counter, sep = ".")
153 | counter = counter + 1
154 | }
155 |
156 | phenoDataC$cellType <- phenoDataC$clusterID
157 | # re-order phenoDataC in function of scC
158 | phenoDataC = phenoDataC[phenoDataC$cellID %in% colnames(scC),]
159 | phenoDataC <- phenoDataC[match(colnames(scC),phenoDataC$cellID),]
160 |
161 | }
162 |
163 | write.table(phenoDataC, file = "phenoDataC_clusters_after_regrouping.txt", row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
164 |
165 | ###############
166 | # re-making UMAP plots with new information:
167 |
168 | cds@colData$new.cluster = phenoDataC$clusterID[match(cds@colData$cell, phenoDataC$cellID)]
169 |
170 | m2 <- monocle3::plot_cells(cds, color_cells_by = "new.cluster", group_label_size = 4, show_trajectory_graph = FALSE)
171 |
172 | ggsave(paste("MONOCLE.UMAP_REGROUPED_",basename(directory),".pdf",sep=""), gridExtra::grid.arrange(m2, nrow = 1), height = 6, width = 6, limitsize = FALSE)
173 | dev.off()
174 |
175 | }
176 |
177 | } else {
178 |
179 | # re-order phenoDataC in function of scC
180 | phenoDataC = phenoDataC[phenoDataC$cellID %in% colnames(scC),]
181 | phenoDataC <- phenoDataC[match(colnames(scC),phenoDataC$cellID),]
182 |
183 | }
184 |
185 | ## Ensure that phenoDataC & scC have the same (& ordered) cells [for cases where users will provide a foreign "phenoDataC_clusters_after_regrouping.txt" as input]:
186 | common.cells = intersect(phenoDataC$cellID,colnames(scC))
187 | scC = scC[, common.cells]
188 | phenoDataC = phenoDataC[match(common.cells, phenoDataC$cellID),]
189 |
190 | # Create matrix C (made with the mean expression values for all cells across cell types)
191 | cellType <- phenoDataC$cellType
192 | group = list()
193 |
194 | for(i in unique(cellType)){
195 | group[[i]] <- which(cellType %in% i)
196 | }
197 |
198 | # Generating C (raw) from the mean (raw) expression values across all cells from each cell type
199 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x]))
200 | C = do.call(cbind.data.frame, C) %>% as.matrix()
201 |
202 | ##############################################################################
203 | ## Remove row duplicates
204 |
205 | C = collapsing(C, rownames(C))
206 | scC = collapsing(scC, rownames(scC))
207 | scC.raw = scC # needed to run marker selection + deconvolution
208 | T = collapsing(T, rownames(T))
209 |
210 | ##############################################################################
211 | ## Single-cell and bulk normalization
212 |
213 | C <- Scaling(matrix = C, option = ref.normalization)
214 | scC <- Scaling(matrix = scC.raw, option = ref.normalization, phenoDataC = phenoDataC)
215 |
216 | T <- Scaling(matrix = T, option = mix.normalization)
217 |
218 | ##############################################################################
219 | ## Seurat - FindAllMarkers (Wilcoxon) on TMM normalized scRNA-seq/snRNA-seq
220 | # MuSiC & DWLS: without markers
221 |
222 | if(!method %in% c("MuSiC", "DWLS", "SQUID")){
223 |
224 | if(! file.exists("./markers_TMM_FC1.5_Seuratwilcox.rds")){
225 |
226 | scC <- Scaling(matrix = scC.raw, option = "TMM", phenoDataC = phenoDataC)
227 |
228 | scC <- Seurat::CreateSeuratObject(counts = scC)
229 | scC$groups <- as.character(phenoDataC$clusterID)
230 | Idents(object = scC) <- as.character(phenoDataC$clusterID) #Needed. See: https://github.com/satijalab/seurat/issues/1482
231 |
232 | markers <- Seurat::FindAllMarkers(scC,
233 | logfc.threshold = log2(1.5),
234 | grouping.var = "groups",
235 | test.use = "wilcox")
236 |
237 | markers <- markers[,c("gene","avg_log2FC","cluster")]
238 | markers$AveExpr <- Inf
239 | colnames(markers) <- c("gene","log2FC","cell_type","AveExpr")
240 | markers$HGNC <- ENSG_to_HGNC$HGNC[match(markers$gene, ENSG_to_HGNC$ENSG_ID)]
241 | markers$ENSG <- ENSG_to_HGNC$ENSG[match(markers$gene, ENSG_to_HGNC$HGNC)]
242 | if(sum(is.na(markers$HGNC)) == nrow(markers)){markers$HGNC <- NULL; markers$HGNC <- rownames(markers)}
243 | if(sum(is.na(markers$ENSG)) == nrow(markers)){markers$ENSG <- NULL; markers$ENSG <- rownames(markers)}
244 |
245 | #if markers$gene is both ENSG and HGNC, split:
246 | if(length(grep("--",markers$gene)) == nrow(markers)){ markers$gene <- strsplit(markers$gene, "--") %>% lapply(., function(x) x[1]) %>% unlist()}
247 | if(length(grep("__",markers$gene)) == nrow(markers)){ markers$gene <- strsplit(markers$gene, "__") %>% lapply(., function(x) x[1]) %>% unlist()}
248 |
249 | ## if needed, put markers$gene in same gene format as rownames C & T:
250 | if(length(intersect(markers$gene, rownames(C))) == 0){
251 |
252 | if(length(grep("ENSG000|ENSMUSG000",markers$gene)) > 100){
253 |
254 | markers.gene <- ENSG_to_HGNC$HGNC[match(markers$gene, ENSG_to_HGNC$ENSG_ID)]
255 |
256 | } else {
257 |
258 | markers.gene <- ENSG_to_HGNC$ENSG_ID[match(markers$gene, ENSG_to_HGNC$HGNC)]
259 |
260 | }
261 |
262 | markers$gene <- markers.gene
263 |
264 | }
265 |
266 | no.dups <- which(table(markers$gene)[markers$gene] == 1) %>% as.numeric()
267 | dups <- which(table(markers$gene)[markers$gene] > 1) #keeps ALL occurrences in original table + original ordering
268 | dups <- sapply(unique(names(dups)), function(x) grep(x, markers$gene)[1]) %>% as.numeric() #keeps ONLY FIRST occurrence of duplicates (in original table) + original ordering
269 | markers <- markers[c(no.dups, dups),]
270 |
271 | saveRDS(markers, "./markers_TMM_FC1.5_Seuratwilcox.rds")
272 |
273 | } else {
274 |
275 | markers = readRDS("./markers_TMM_FC1.5_Seuratwilcox.rds")
276 |
277 | }
278 |
279 | } else {
280 |
281 | markers = ""
282 |
283 | }
284 |
285 | ##############################################################################
286 | ## Deconvolution
287 |
288 | STRING <- paste(basename(directory), method, ref.normalization, mix.normalization, sep = "_")
289 |
290 | output.filename <- paste(STRING, ".rds", sep = "")
291 | output_folder <- paste(directory, "OUTPUT_DECONVOLUTION/", sep = "/")
292 | ifelse(!dir.exists(output_folder), dir.create(output_folder), FALSE)
293 | output_name <- paste(output_folder, "RESULTS_", output.filename, sep = "")
294 |
295 | if(is.null(P)){
296 |
297 | P <- table(phenoDataC$SubjectName,phenoDataC$cellType) %>% t() %>% as.data.table() %>% data.table::dcast.data.table(., V1 ~ V2, fill = "N")
298 | P <- data.frame(P, check.names = FALSE)
299 | rownames(P) <- P$V1
300 | P$V1 <- NULL
301 | P <- P[,colnames(P) %in% colnames(T)]
302 | P <- apply(P, 2, function(x) x / sum(x))
303 |
304 | }
305 |
306 | ## Unify row names between T and C (after normalization!)
307 | perc.overlap <- (length(intersect(rownames(C),rownames(T))) * 100)/min(nrow(T),nrow(C))
308 | if(perc.overlap < 33.3){
309 |
310 | if(length(grep("ENSG000|ENSMUSG000",rownames(T))) > 100){
311 |
312 | rownames.T <- ENSG_to_HGNC$HGNC[match(rownames(T), ENSG_to_HGNC$ENSG_ID)]
313 | T = T[!is.na(rownames.T),]
314 | rownames.T = rownames.T[!is.na(rownames.T)]
315 |
316 | } else {
317 |
318 | rownames.T <- ENSG_to_HGNC$ENSG_ID[match(rownames(T), ENSG_to_HGNC$HGNC)]
319 | T = T[!is.na(rownames.T),]
320 | rownames.T = rownames.T[!is.na(rownames.T)]
321 |
322 | }
323 |
324 | T <- collapsing(T, rownames.T)
325 |
326 | }
327 |
328 | # To avoid issues with sample names that are fully integers
329 | if(length(grep("[A-Z|a-z]", colnames(T))) == 0){colnames(T) <- paste("X",colnames(T),sep="")}
330 | if(length(grep("[A-Z|a-z]", colnames(P))) == 0){colnames(P) <- paste("X",colnames(P),sep="")}
331 |
332 | ## Remove NA values (if any) after row.name interconversion
333 | if(sum(is.na(rownames(T))) != 0){T = T[!is.na(rownames(T)), ]}
334 |
335 | P = P[,gtools::mixedsort(colnames(P))]
336 | T = T[,gtools::mixedsort(colnames(T))]
337 |
338 | #To avoid MAST-DWLS to be repeated!
339 | if(method == "DWLS"){STRING <- paste("DWLS", ref.normalization, sep = "_")}
340 |
341 | if(method %in% c("CIBERSORT", "RLR", "FARDEEP", "nnls")){
342 |
343 | RESULTS = Deconvolution(T = T,
344 | C = C,
345 | method = method,
346 | phenoDataC = phenoDataC,
347 | P = P,
348 | STRING = STRING,
349 | marker_distrib = markers,
350 | refProfiles.var = NULL)
351 |
352 | } else {
353 |
354 | RESULTS = Deconvolution(T = T,
355 | C = scC,
356 | method = method,
357 | phenoDataC = phenoDataC,
358 | P = P,
359 | STRING = STRING,
360 | marker_distrib = markers,
361 | refProfiles.var = NULL)
362 |
363 | }
364 |
365 | RESULTS$dataset <- basename(directory)
366 | RESULTS$method <- method
367 | RESULTS$ref.normalization <- ref.normalization
368 | RESULTS$mix.normalization <- mix.normalization
369 |
370 | ann_text.global = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_fraction - expected_fraction)^2)) %>% round(.,3),
371 | Pearson = cor(observed_fraction, expected_fraction) %>% round(.,3))
372 |
373 | label = paste("Pearson = ", ann_text.global$Pearson, "\n RMSE = ", ann_text.global$RMSE, sep = "")
374 |
375 | scaleFUN <- function(x) sprintf("%.3f", x)
376 |
377 | P <- ggplot(RESULTS, aes(x = observed_fraction , y = expected_fraction, colour = cell_type)) +
378 | geom_point() +
379 | geom_abline(intercept = 0, slope = 1, lty = 2, col = "blue") +
380 | facet_grid( ~ method) +
381 | geom_text(vjust = "inward", hjust = "inward", data = ann_text.global, aes(x = 0.02, y = 1.04, label = label), size = 4, inherit.aes = FALSE)+
382 | ggtitle("all_normalizations_per_cell") +
383 | theme_bw() +
384 | xlim(0,1) +
385 | ylim(0,1.05) +
386 | xlab("observed fraction") +
387 | ylab("expected fraction") +
388 | ggtitle(STRING) +
389 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
390 | legend.position = "bottom")
391 |
392 | ggsave(P, filename = paste(output_folder, paste(STRING, ".pdf", sep = ""), sep = "/"), width = 6, height = 6)
393 | saveRDS(RESULTS, output_name)
394 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Source code (R statistical programming language, v4.0.1) to reproduce the results described in the article:
2 |
3 | > Francisco Avila Cobos, Mohammad Javad Najaf Panah, Jessica Epps, Xiaochen Long, Tsz-Kwong Man, Hua-Sheng Chiu, Elad Chomsky, Evgeny Kiner, Michael J Krueger, Diego di Bernardo, Luis Voloch, Jan Molenaar, Sander R. van Hooff, Frank Westermann, Selina Jansky, Michele L. Redell, Pieter Mestdagh, Pavel Sumazin **Effective methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes** *(bioRxiv; https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2)*
4 |
5 | SQUID as standalone R package
6 | ========
7 | Please go here to detailed instructions for its installation:
8 | > https://github.com/favilaco/deconv_matching_bulk_scnRNA/tree/master/SQUID
9 |
10 | DATASETS
11 | ========
12 | Here we provide an **example folder** (named "Toy_example"; see *"Folder requirements" & "Running the deconvolution"*) that can be directly used to test the framework. It contains an artificial single-cell RNA-seq dataset made of 5 artificial cell types where two of them are highly correlated (cell type 1 and 5); 1000 cells per cell type and 64 genes (of which 16 are marker genes, 4 per cell type and in different ranges of expression).
13 |
14 | The **other external datasets used in the manuscript** (together with the necessary metadata) can be downloaded from their respective sources:
15 |
16 | * Cell_mixtures: GEO GSE220608
17 | * Kidney: GEO GSE141115 (https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE141115&format=file) ; additional sample metadata information in "additional_data/Denisenko_crosstable_bulk_sc_sn_14092021".
18 | * AML: GEO GSE220608
19 | * Breast_cancer: GEO GSE176078 ("GSE176078_RAW.tar" from "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE176078&format=file")
20 | * NB_1: EGAS00001004349;
21 | * NB_2: EGAS00001006723; EGAS00001006823; GEO GSE218450
22 | * Synapse: syn8691134; syn23554292; syn23554293; syn23554294
23 | * Brain: GEO GSE67835 ("GSE67835_RAW.tar" from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE67835&format=file) & IHC proportions (re-scaled to sum-to-one) present here: https://github.com/ellispatrick/CortexCellDeconv/tree/master/CellTypeDeconvAnalysis/Data [IHC.astro.txt, IHC.endo.txt, IHC.microglia.txt, IHC.neuro.txt; IHC.oligo.txt]
24 |
25 |
26 | The following line is needed for fresh installations of Linux (Debian):
27 | `sudo apt-get install curl libcurl4-openssl-dev libssl-dev zlib1g-dev r-base-dev libxml2-dev`
28 |
29 |
30 | R 4.0.1: REQUIRED PACKAGES AND PACKAGE DEPENDENCIES:
31 | ===================================================
32 | Packages to be installed (alphabetically ordered) before running any deconvolution (**R >= 4.0.1**):
33 | ```
34 | AnnotationDbi
35 | Biobase
36 | CIBERSORT
37 | data.table
38 | devtools
39 | doMC
40 | dplyr
41 | DWLS
42 | edgeR
43 | egg
44 | foreach
45 | FARDEEP
46 | ggplot2
47 | ggpointdensity
48 | ggpubr
49 | ggrastr
50 | ggrepel
51 | gridExtra
52 | gtools
53 | limma
54 | MASS
55 | Matrix
56 | matrixStats
57 | monocle3
58 | MuSiC
59 | nnls
60 | org.Hs.eg.db
61 | org.Mm.eg.db
62 | parallel
63 | pheatmap
64 | RColorBrewer
65 | R.utils
66 | scater
67 | scran
68 | sctransform
69 | Seurat
70 | SingleCellExperiment
71 | tidyr
72 | tidyverse
73 | viridis
74 | ```
75 |
76 |
77 | References to other methods included in our benchmark:
78 | ======================================================
79 | While our work has a **BSD (3-clause)** license, you **may need** to obtain a license to use the individual normalization/deconvolution methods (e.g. CIBERSORT. The source code for CIBERSORT needs to be asked to the authors at https://cibersort.stanford.edu).
80 |
81 | | method | ref |
82 | |--------|----------|
83 | | nnls | Mullen, K. M. & van Stokkum, I. H. M. nnls: The Lawson-Hanson algorithm for non-negative least squares (NNLS). R package version 1.4. https://CRAN.R-project.org/package=nnls |
84 | | FARDEEP | Hao, Y., Yan, M., Lei, Y. L. & Xie, Y. Fast and Robust Deconvolution of Tumor Infiltrating Lymphocyte from Expression Profiles using Least Trimmed Squares. bioRxiv 358366 (2018) doi:10.1101/358366 |
85 | | MASS: Robust linear regression (RLR) | Ripley, B. et al. MASS: Support Functions and Datasets for Venables and Ripley’s MASS. (2002) |
86 | | CIBERSORT | Newman, A. M. et al. Robust enumeration of cell subsets from tissue expression profiles. Nat. Methods 12, 453–457 (2015) |
87 | |--------|----------|
88 | | DWLS | Tsoucas, D. et al. Accurate estimation of cell-type composition from gene expression data. Nat. Commun. 10, 1–9 (2019) |
89 | | Bisque | Jew, B. et al. Accurate estimation of cell composition in bulk expression through robust integration of single-cell information. Nat. Commun. 11, 1971 (2020) |
90 | | MuSiC | Wang, X., Park, J., Susztak, K., Zhang, N. R. & Li, M. Bulk tissue cell type deconvolution with multi-subject single-cell expression reference. Nat. Commun. 10, 380 (2019) |
91 | |--------|----------|
92 | | SCTransform / regularized negative binomial regression (RNBR) | Hafemeister, C. & Satija, R. Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression. Genome Biology (2019) doi:10.1186/s13059-019-1874-1 |
93 | |scran | L. Lun, A. T., Bach, K. & Marioni, J. C. Pooling across cells to normalize single-cell RNA sequencing data with many zero counts. Genome Biol. 17, 75 (2016) |
94 | | scater | McCarthy, D. J., Campbell, K. R., Lun, A. T. L. & Wills, Q. F. Scater: pre-processing, quality control, normalization and visualization of single-cell RNA-seq data in R. Bioinformatics 33, 1179–1186 (2017) |
95 | | Trimmed mean of M-values (TMM) | Robinson, M. D. & Oshlack, A. A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biol. 11, R25 (2010) |
96 | | Transcripts per million (TPM) | Li, B., Ruotti, V., Stewart, R. M., Thomson, J. A. & Dewey, C. N. RNA-Seq gene expression estimation with read mapping uncertainty. Bioinformatics 26, 493–500 (2010) |
97 | | LogNormalize | LogNormalize function (part of "Seurat"). R Documentation. https://www.rdocumentation.org/packages/Seurat/versions/3.1.1/topics/LogNormalize ; Butler, A., Hoffman, P., Smibert, P. et al. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat Biotechnol 36, 411–420 (2018) doi:10.1038/nbt.4096 |
98 |
99 |
100 | FOLDER REQUIREMENTS
101 | ===================
102 |
103 | a) Folder structure:
104 |
105 | **NOTE**: The gold standard matrix of proportions ("P") is either present or obtained as the sum of individual cells across each cell type.
106 |
107 | ```
108 | .
109 | ├── AML
110 | │ ├── scC.rds
111 | │ ├── T.rds
112 | │ ├── phenoDataC_clusters_wo_regrouping.txt
113 | ├── Brain
114 | │ ├── scC.rds
115 | │ ├── T.rds
116 | │ ├── P.rds #from IHC
117 | │ ├── phenoDataC_clusters_wo_regrouping.txt
118 | ├── Toy_example
119 | │ ├── scC.rds
120 | │ ├── T.rds
121 | │ ├── P.rds
122 | │ ├── phenoDataC_clusters_wo_regrouping.txt
123 | │ └── markers_TMM_FC1.5_Seuratwilcox.rds #(if not present will be created)
124 | ...
125 | │
126 | ├── helper_functions.R
127 | ├── MASTER.R
128 | └── CIBERSORT.R
129 | ```
130 |
131 | b) Minimally the following (tab-separated) columns being part of the metadata: "cellID", "cellType", "SubjectName".
132 |
133 | ```
134 | # For the "example" dataset, it should look like:
135 |
136 | cellID cellType SubjectName
137 | cell_1 cell_type_1 Mix_1
138 | cell_2 cell_type_1 Mix_2
139 | ...
140 | ```
141 |
142 |
143 | RUNNING THE DECONVOLUTION
144 | =========================
145 | The deconvolution problem is formulated as: T = C·P [see https://doi.org/10.1093/bioinformatics/bty019 and https://doi.org/10.1038/s41467-020-19015-1 for detailed information]
146 |
147 | Make the following choices (in order):
148 |
149 | ```
150 | i) specific dataset [from: "Toy_example","Cell_mixtures", "Kidney", "AML", "Breast_cancer", "NB_1", "NB_2", "Synapse", "Brain"]
151 | ii) normalization strategy for the reference matrix [bulk (C): "none", "LogNormalize", "TMM", "TPM"] ; [single-cell: "none", "LogNormalize", "TMM", "TPM", "SCTransform", "scran", "scater"]
152 | iii) normalization strategy for the mixture matrix (T)["none", "LogNormalize", "TMM", "TPM"]
153 | iv) deconvolution method [from: "CIBERSORT", "nnls", "FARDEEP", "RLR", "MuSiC", "MuSiC_with_markers", "Bisque", "Bisque_with_markers", "DWLS", "SQUID"]
154 | v) indicate whether cell-type labels are included ("yes") or whether they should be obtained by unsupervised clustering ("no")
155 | ```
156 |
157 | **NOTE: To systematically test the benefit of bulk transformation and deconvolution with SQUID, a leave-one-out cross-validation strategy can also be used: iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were used to predict the composition of the remaining sample based on its bulk RNA-Seq profile. By default, all available data is used at once, but this approach can be tried out by using "leave.one.out = TRUE" as a parameter of the function "transformation2" that is used inside SQUID (line 507 inside "helper_functions.R")**
158 |
159 |
160 | R example call
161 | ===============
162 |
163 | **MINIMUM REQUIREMENTS**: matching scnRNA-seq **("scC.rds")** and bulk RNA-seq **("T.rds")** samples together with the correspoinding metadata **("phenoDataC_clusters_wo_regrouping.txt")**.
164 |
165 | **OPTIONAL**: if users have gold standard proportion estimates from IHC or other techniques, they can include such information as **"P.rds"** (this will avoid re-computing the proportions directly from the scnRNA-seq data). If users wish to provide their own re-clustering or set of cell type labels, they can include this as a file named **"phenoDataC_clusters_after_regrouping.txt"** (see Toy_example/phenoDataC_clusters_after_regrouping.txt for an example).
166 |
167 | **NOTE**: Each single-cell RNA-seq dataset ("scC.rds") and mixture ("T.rds") should be **RAW integer MATRICES** containing genomic features as rows.
168 |
169 |
170 | ```
171 |
172 | Rscript MASTER.R ~/Downloads/deconv_matching_bulk_scnRNA/Toy_example TPM TPM nnls no
173 |
174 | ```
175 |
176 | In the "example" (toy) dataset cell types 1 and 5 were found to be very highly correlated and were first collapsed into a unique cluster (relabelled as "regrouped.cluster.1"):
177 |
178 | 
179 |
180 | Next, the deconvolution was run and the output was generated:
181 |
182 | 
183 |
184 |
185 | Survival analysis
186 | =================
187 | Please run script(s) inside folder "SURVIVAL"
188 |
--------------------------------------------------------------------------------
/SQUID/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: SQUID
2 | Type: Package
3 | Title: Single-cell RNA Quantity Informed Deconvolution (SQUID)
4 | Version: 0.1.0
5 | Year: 2023
6 | Author: Francisco Avila Cobos, Mohammad Javad Najaf Panah, ..., Pieter Mestdagh, Pavel Sumazin
7 | Maintainer: Francisco Avila Cobos and Mohammad Javad Najaf Panah
8 | Description: SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution approaches in predicting the composition of
9 | cell mixtures and tissue samples based on concurrent RNA-Seq and scnRNA-Seq profiles. To systematically test the benefit of bulk transformation and deconvolution
10 | with SQUID, a leave-one-out cross-validation strategy can also be used; iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were
11 | used to predict the composition of the remaining sample based on its bulk RNA-Seq profile.
12 | Data: SQUID_Toy_Example_Data.RData
13 | License: MIT
14 | Encoding: UTF-8
15 | LazyData: true
16 |
--------------------------------------------------------------------------------
/SQUID/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Mohammad Najafpanah
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/SQUID/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 |
--------------------------------------------------------------------------------
/SQUID/R/SQUID.R:
--------------------------------------------------------------------------------
1 | #' Single-cell RNA Quantity Informed Deconvolution (SQUID)!
2 | #'
3 | #' This is an executable R function named 'SQUID'
4 | #' which analyse cell-type composition predictions based on concurrent RNA-Seq and scRNA-Seq profiles.
5 | #'
6 | #' You can learn more about SQUID compare to some other approaches at:
7 | #'
8 | #' https://github.com/favilaco/deconv_matching_bulk_scnRNA
9 | #'
10 | #' Citation:
11 | #' Avila Cobos, F., Najaf Panah, M. J., Epps, J., Long, X., Man, T. K., Chiu, H. S., ..., Mestdagh, P & Sumazin, P. (2022). Effective
12 | #' methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes. bioRxiv, 2022-12.
13 | #'
14 | #' GitHub: https://github.com/favilaco/deconv_matching_bulk_scnRNA
15 | #'
16 | #' Licence: MIT License
17 | #'
18 | #' Copyright (c) [2023] [Francisco Avila Cobos, Mohammad Javad Najaf Panah, Pieter Mestdagh, Pavel Sumazin]
19 | #'
20 | #'
21 | #' Permission is hereby granted, free of charge, to any person obtaining a copy
22 | #' of this software and associated documentation files (the "Software"), to deal
23 | #' in the Software without restriction, including without limitation the rights
24 | #' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25 | #' copies of the Software, and to permit persons to whom the Software is
26 | #' furnished to do so, subject to the following conditions:
27 | #'
28 | #' The above copyright notice and this permission notice shall be included in
29 | #' all copies or substantial portions of the Software.
30 | #'
31 | #' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 | #' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 | #' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 | #' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 | #' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 | #' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
37 | #' THE SOFTWARE.
38 | #'
39 | #'
40 | #' For commercial use inquiries, please contact the corresponding authors [Pavel Sumazin at Pavel.Sumazin@bcm.edu; Pieter Mestdagh at pieter.mestdagh@ugent.be].
41 | #'
42 | #'
43 |
44 | # Load the example dataset:
45 | load("SQUID_Toy_Example_Data.RData", verbose = T)
46 |
47 | #' Single-cell RNA Quantity Informed Deconvolution (SQUID).
48 | #'
49 | #' SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution approaches in predicting the composition of
50 | #' cell mixtures and tissue samples.
51 | #'
52 | #' SQUID is an improved deconvolution method based on concurrent RNA-Seq and scRNA-Seq profiles.
53 | #'
54 | #' NOTE: To systematically test the benefit of bulk transformation and deconvolution with SQUID, a leave-one-out cross-validation strategy can also be used:
55 | #' iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were used to predict the composition of the remaining sample based
56 | #' on its bulk RNA-Seq profile.
57 | #'
58 | #' @param B A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be sample.ids.
59 | #' @param scC A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.ids.
60 | #' @param scMeta A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc.
61 | #' @param pB (Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType.
62 | #' @param P (Optional) A numeric matrix of expected cell fractions including the composition of cluster.id/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be sample.ids.
63 | #' @param LeaveOneOut A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE.
64 | #'
65 | #' @return A table including cellType, sample.id, observed_fraction (predicted cluster/cell-type fraction), expected_fraction (P) for each bulk sample.
66 | #'
67 | #' @export RESULTS Save the RESULTS.rds and RESULTS.csv files in your working directory.
68 | #'
69 | #' @usage RESULTS <- SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE)
70 | #'
71 | #'
72 | SQUID <- function(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE) {
73 | cat('
74 |
75 |
76 |
77 | ----------------------------------- Welcome To SQUID ---------------------------------------
78 |
79 | ######## ######## # # ######## #######
80 | # # # # # ## # #
81 | # # # # # ## # #
82 | ######## # ## # # # ## # #
83 | # # # # # # ## # #
84 | # # #### # # ## # #
85 | ######## ######## ######## ######## #######
86 |
87 |
88 | Execute an Effective method for bulk RNA-seq deconvolution using scnRNA-seq transcriptomes
89 |
90 | --------------------------------------------------------------------------------------------
91 |
92 |
93 |
94 | ')
95 | cat('SQUID just began the analysis ... \n\n')
96 |
97 | # Make pseudo-Bulk (pB) based on clusters from single-cell analysis, if not provided
98 | if(is.null(pB)){
99 | cellType <- scMeta$cellType # could be 'scMeta$cluster.id' as well.
100 | group = list()
101 | for(i in unique(cellType)){
102 | group[[i]] <- which(cellType %in% i)
103 | }
104 | pB = lapply(group,function(x) base::rowMeans(scC[,x, drop = FALSE]))
105 | pB = do.call(cbind.data.frame, pB)
106 | } else {
107 | pB=pB
108 | }
109 |
110 | # Make proportional table (P), if not provided
111 | if(is.null(P)){
112 | P <- table(scMeta$sample.id, scMeta$cellType) %>% t()
113 | P <- P[,gtools::mixedsort(colnames(P))]
114 | P <- P[,colnames(P) %in% colnames(B)]
115 | P <- apply(P, 2, function(x) x / sum(x))
116 | } else {
117 | P=P
118 | }
119 |
120 | # Unify row names between B and pB
121 | perc.overlap <- (length(intersect(rownames(pB),rownames(B))) * 60)/min(nrow(B),nrow(pB))
122 | if(perc.overlap < 33.3){
123 | if(length(grep("ENSG000|ENSMUSG000",rownames(B))) > 100){
124 | rownames.B <- ENSG_to_HGNC$HGNC[match(rownames(B), ENSG_to_HGNC$ENSG_ID)]
125 | B = B[!is.na(rownames.B),]
126 | rownames.B = rownames.B[!is.na(rownames.B)]
127 | } else {
128 | rownames.B <- ENSG_to_HGNC$ENSG_ID[match(rownames(B), ENSG_to_HGNC$HGNC)]
129 | B = B[!is.na(rownames.B),]
130 | rownames.B = rownames.B[!is.na(rownames.B)]
131 | }
132 | B <- collapsing(B, rownames.B)
133 | }
134 |
135 | # To avoid issues with sample.ids that are fully integers
136 | if(length(grep("[A-Z|a-z]", colnames(B))) == 0){colnames(B) <- paste("X",colnames(B),sep="")}
137 | if(length(grep("[A-Z|a-z]", colnames(P))) == 0){colnames(P) <- paste("X",colnames(P),sep="")}
138 |
139 | # Remove NA values (if any) after row.name interconversion
140 | if(sum(is.na(rownames(B))) != 0){B = B[!is.na(rownames(B)), ]}
141 |
142 | # Transform the bulk data
143 | if(LeaveOneOut) {
144 | cat("SQUID is transforming the Bulk samples using Leave-One-Out cross validation method ... ")
145 | } else {
146 | cat("SQUID is transforming the Bulk samples using all informations ... ")
147 | }
148 |
149 | Z = as.matrix(pB)
150 | Y = Z %*% P
151 | B.new = transformation2(X = B, Y = Y, leave.one.out = LeaveOneOut)
152 | B.new[!is.finite(B.new)] <- 0
153 |
154 | cat(' DONE!\n\n')
155 |
156 | # take common genes
157 | Genes <- intersect(rownames(Z),rownames(B.new))
158 | B.new <- as.matrix(B.new[Genes,])
159 | Z <- as.matrix(Z[Genes,])
160 |
161 | # solve Dampened WLS
162 | cat('SQUID is solving the dampened weighted least squares model for Bulk samples ... \n\n')
163 | RESULTS <- apply(B.new, 2, function(x){
164 | b = setNames(x, rownames(B.new))
165 | tr <- trimData(Z, b)
166 | RES <- t(solveDampenedWLS(S = Z, B = x))
167 | })
168 |
169 | rownames(RESULTS) <- colnames(Z)
170 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) # explicit non-negativity constraint
171 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) # explicit STO constraint
172 | RESULTS = RESULTS[gtools::mixedsort(rownames(RESULTS)), , drop = FALSE]
173 | RESULTS = suppressMessages(reshape2::melt(RESULTS))
174 | colnames(RESULTS) <- c("cellType","sample.id","observed_fraction")
175 | RESULTS$cellType = as.character(RESULTS$cellType)
176 | RESULTS$sample.id = as.character(RESULTS$sample.id)
177 | if(!is.null(P)){
178 | P = P[gtools::mixedsort(rownames(P)),,drop = FALSE] %>% data.frame(., check.names = FALSE)
179 | P$cellType = rownames(P)
180 | P = suppressMessages(reshape2::melt(P))
181 | colnames(P) <- c("cellType","sample.id","expected_fraction")
182 | P$cellType = as.character(P$cellType)
183 | P$sample.id = as.character(P$sample.id)
184 | RESULTS = merge(RESULTS, P, by = c("cellType", "sample.id"), all = TRUE)
185 | RESULTS[is.na(RESULTS)] <- 0
186 | RESULTS$expected_fraction <- round(RESULTS$expected_fraction, 3)
187 | RESULTS$observed_fraction <- round(RESULTS$observed_fraction, 3)
188 | }
189 | saveRDS(RESULTS, "RESULTS.rds")
190 | write.csv(RESULTS, "RESULTS.csv")
191 | cat('\nSQUID has saved the results table in .rds and .csv formats.\n\n')
192 | cat('Congrats! SQUID has finished the analyses!\n')
193 | return(RESULTS)
194 | } # The End
195 |
--------------------------------------------------------------------------------
/SQUID/README.md:
--------------------------------------------------------------------------------
1 | # SQUID: Single-cell RNA Quantity Informed Deconvolution
2 |
3 | # Overview
4 | SQUID is an R package for conducting (tumor) deconvolution analyses. SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution to predict the composition of cell mixtures and tissue samples based on the concurrent RNA-Seq and scnRNA-Seq profiles. SQUID harnesses the power of concurrent RNA-Seq and scnRNA-Seq profiling, outperforming other methods in predicting the composition of cell mixtures and tissue samples.
5 |
6 | # Installation
7 | First load the requirements:
8 | ```r
9 | library(devtools)
10 | devtools::source_url("https://github.com/favilaco/deconv_matching_bulk_scnRNA/blob/master/helper_functions.R?raw=TRUE")
11 | library(dplyr)
12 | ```
13 | Now, you can install the latest version of SQUID from GitHub using the devtools package:
14 | ```r
15 | install_github("favilaco/deconv_matching_bulk_scnRNA/SQUID")
16 | ```
17 | # Usage
18 | To use SQUID, load the package into your R session using the library function:
19 | ```r
20 | library(SQUID)
21 | ```
22 | You can then use the provided SQUID function to conduct the experimantal analyses. You need to provide the required inputs as described in the package documentation in details. SQUID package contains a toy example dataset including bulk and single-cell RNA-seq simulated count data which you can use to run the program and check how it works:
23 | ```r
24 | RESULTS <- SQUID(B = B, scC = scC , scMeta = scMeta, pB = NULL, P = NULL, LeaveOneOut = FALSE)
25 |
26 | #' @param B A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be sample.ids.
27 | #' @param scC A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.ids.
28 | #' @param scMeta A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc.
29 | #' @param pB (Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType.
30 | #' @param P (Optional) A numeric matrix of expected cell fractions including the composition of cluster.id/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be sample.ids.
31 | #' @param LeaveOneOut A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE.
32 | #'
33 | ```
34 | Notice: To improve the performance of SQUID prediction, you may need to set up your research-specific pipeline for the quality contol, normalization, imputation, and single-cell clustering procedures on your datasets prior to run SQUID. You are welcome to look at our preprint [paper](https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2) as an example.
35 |
36 | # License
37 | This package is licensed under the MIT license. See the LICENSE file for details.
38 |
39 | # Contact
40 | For questions or comments about SQUID, please contact the package maintainers at Francisco Avila Cobos and Mohammad Javad Najaf Panah . If you find a bug or have a feature request, please submit an issue on the GitHub repository.
41 |
42 | # Citation
43 | Francisco Avila Cobos, Mohammad Javad Najaf Panah, Jessica Epps, Xiaochen Long, Tsz-Kwong Man, Hua-Sheng Chiu, Elad Chomsky, Evgeny Kiner, Michael J Krueger, Diego di Bernardo, Luis Voloch, Jan Molenaar, Sander R. van Hooff, Frank Westermann, Selina Jansky, Michele L. Redell, Pieter Mestdagh, Pavel Sumazin Effective methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes (bioRxiv; https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2)
44 |
--------------------------------------------------------------------------------
/SQUID/SQUID.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 |
--------------------------------------------------------------------------------
/SQUID/SQUID_Toy_Example_Data.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/SQUID/SQUID_Toy_Example_Data.RData
--------------------------------------------------------------------------------
/SQUID/man/SQUID.Rd:
--------------------------------------------------------------------------------
1 | \name{SQUID}
2 |
3 | \alias{SQUID}
4 |
5 | \title{Single-cell RNA Quantity Informed Deconvolution (SQUID)}
6 |
7 | \description{SQUID predicts the composition of cell mixtures and tissue samples based on concurrent RNA-Seq and scnRNA-Seq profiles.}
8 |
9 | \usage{SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE)}
10 |
11 | \arguments{
12 | \item{B}{A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be samples.}
13 | \item{scC}{A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.id.}
14 | \item{scMeta}{A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc.}
15 | \item{pB}{(Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType.}
16 | \item{P}{ (Optional) A numeric matrix of expected cell fractions including the composition of clusters/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be samples.}
17 | \item{LeaveOneOut}{A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE. Note that the LeaveOneOut argument included here to give users an extra option to systematically test the benefit of bulk transformation and deconvolution accuracy by SQUID.}
18 | }
19 |
20 | \value{A matrix including cellType, sample.id, observed_fraction (predicted cluster/cell-type fraction), expected_fraction (P) for each bulk sample. By default, SQUID also saves the RESULTS.rds and RESULTS.csv files on the working directory.}
21 |
22 |
23 | \examples{
24 | RESULTS <- SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE)
25 | }
26 |
--------------------------------------------------------------------------------
/SURVIVAL/Survival_Analyses.R:
--------------------------------------------------------------------------------
1 | # Survival Analyses (AML dataset)
2 | # Bulk: pAML from TARGET (n=256)
3 | # SC: 6 pAML from BCM/TCH inclusing Diagnosis and relapes timepoints (n=12)
4 |
5 | # Load libraries:
6 | library(survival)
7 | library(survminer)
8 |
9 | ## Read TARGET patients clinical data:
10 | clinical.dt <- read.delim("./AML-TARGET-RNAseq-Diagn-No_21-Anno-091920.txt")
11 |
12 | ## Read the predicted proportional results (LogNormalize-LogNormalize normalization) generated by deconvolution methods:
13 | pSQUID <- read.csv("./TARGET_AML_LogNormalize_LogNormalize_pSQUID.csv")
14 |
15 | # Extract patients IDs
16 | TARGET.USI <- strsplit(pSQUID$X, "[.]")
17 | TARGET.USI <- matrix(unlist(TARGET.USI), ncol=5, byrow=TRUE)
18 | TARGET.USI <- as.data.frame(TARGET.USI)
19 | pSQUID <- cbind.data.frame(TARGET.USI, pSQUID)
20 | pSQUID <- pSQUID[,-c(1,2,4,5,6)]
21 | colnames(pSQUID)[1] <- "TARGET.USI"
22 |
23 | # Merge the cluster.prop variables to clinical data (Relapsed_died + non_Relapsed_Alive patients) n=181
24 | merged.tAML <- merge(clinical.dt, pJanus, by="TARGET.USI")
25 | merged.tAML.R.DoD <- merged.tAML[merged.tAML$First.Event=="Relapse" & merged.tAML$Vital.Status=="Dead",] # 93
26 | merged.tAML.C.Ali <- merged.tAML[merged.tAML$First.Event=="Censored" & merged.tAML$Vital.Status=="Alive",] # 88
27 | t.AML.R.C.DOD.ALI <- rbind(merged.tAML.R.DoD,merged.tAML.C.Ali)
28 |
29 | # Categorize the patients based on their median cut-off on cluster proportion (i. e. cluster 11):
30 | data=t.AML.R.C.DOD.ALI
31 | cluster="cluster.11"
32 | for(i in 1:(nrow(data))) {
33 | if(data[i,cluster] >= median(data[,cluster])) {
34 | data$cluster.11.pro[i] <- "high"
35 | } else {
36 | data$cluster.11.pro[i] <- "low"
37 | }
38 | }
39 |
40 | # Getting the status value
41 | status <- c()
42 | for(j in 1:nrow(data)) {
43 | if(data$Vital.Status[j]=="Dead") {
44 | status[j] <- 1
45 | } else {
46 | status[j] <- 0
47 | }
48 | }
49 |
50 | # Model overall survival
51 | sfit <- survfit(Surv(Overall.Survival.Time.in.Days, status)~cluster.11.pro, data=data)
52 | ggsurvplot(sfit, data = data, pval=T, risk.table = F, palette = c("red","blue"))
53 |
54 | # Using cox ph regression to fit the model
55 | cfit <- coxph(Surv(Overall.Survival.Time.in.Days, status)~cluster.11, data= data)
56 | cfit
57 |
58 | # Model event-free survival
59 | sfit0 <- survfit(Surv(Event.Free.Survival.Time.in.Days, status)~cluster.11.pro, data = data)
60 | ggsurvplot(sfit0, data = data, pval=T, risk.table = F, palette = c("red","blue"))
61 | # Using cox ph regression to fit the model
62 | cfit0 <- coxph(Surv(Event.Free.Survival.Time.in.Days, status)~cluster.11, data=data)
63 | cfit0
64 |
--------------------------------------------------------------------------------
/Toy_example/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/.DS_Store
--------------------------------------------------------------------------------
/Toy_example/P.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/P.rds
--------------------------------------------------------------------------------
/Toy_example/T.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/T.rds
--------------------------------------------------------------------------------
/Toy_example/markers_TMM_FC1.5_Seuratwilcox.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/markers_TMM_FC1.5_Seuratwilcox.rds
--------------------------------------------------------------------------------
/Toy_example/scC.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/scC.rds
--------------------------------------------------------------------------------
/additional_data/Denisenko_crosstable_bulk_sc_sn_14092021.csv:
--------------------------------------------------------------------------------
1 | BG17_BulkSeq_ColdDiss_Fresh;BG1,BG30,BG52;Mouse2_Right;"kidney Batch1_C57BL6J_Male_19weeks"
2 | BG18_BulkSeq_WarmDiss_Fresh;BG2,BG31,BG53;Mouse1_Right;"kidney Batch1_C57BL6J_Male_19weeks"
3 | BG19_BulkSeq_ColdDiss_Fresh;BG3,BG32,BG54;Mouse4_Left;"kidney Batch1_C57BL6J_Male_19weeks"
4 | BG20_BulkSeq_WarmDiss_Fresh;BG4,BG33,BG55;Mouse3_Left;"kidney Batch1_C57BL6J_Male_19weeks"
5 | BG21_BulkSeq_ColdDiss_Fresh;BG5,BG34,BG56;Mouse6_Left;"kidney Batch1_C57BL6J_Male_19weeks"
6 | BG22_BulkSeq_WarmDiss_Fresh;BG6,BG35,BG57;Mouse5_Right;"kidney Batch1_C57BL6J_Male_19weeks"
7 | LDK4_BulkSeq_ColdDiss_Fresh;LD01,MJ51;Mouse1_Right;"kidney Batch3_Balbc_Female_15weeks"
8 | LDK5_BulkSeq_ColdDiss_Fresh;LD02,MJ52;Mouse2_Right;"kidney Batch3_Balbc_Female_15weeks"
9 | LDK6_BulkSeq_ColdDiss_Fresh;LD03,MJ53;Mouse3_Left;"kidney Batch3_Balbc_Female_15weeks"
--------------------------------------------------------------------------------
/helper_functions.R:
--------------------------------------------------------------------------------
1 | regroup.cor <- function(correlation.matrix, correlation.threshold = 0.95){
2 |
3 | top.cor <- which(abs(correlation.matrix) >= correlation.threshold & row(correlation.matrix) < col(correlation.matrix), arr.ind = TRUE)
4 |
5 | if(nrow(top.cor) != 0){
6 |
7 | ## reconstruct names from positions
8 | high_cor <- matrix(colnames(correlation.matrix)[top.cor], ncol = 2)
9 |
10 | count = 0; groups = list()
11 |
12 | for(elem in unique(high_cor[,1])){
13 | count = count + 1
14 | items = high_cor[high_cor[,1] %in% elem,]
15 | if(!is.null(dim(items))){items <- items %>% melt() %>% .[3] %>% .$value %>% unique()}
16 | groups[[paste("group",count,sep="_")]] <- items
17 | }
18 |
19 | for(elem in unique(high_cor[,2])){
20 | count = count + 1
21 | items = high_cor[high_cor[,2] %in% elem,]
22 | if(!is.null(dim(items))){items <- items %>% melt() %>% .[3] %>% .$value %>% unique()}
23 | groups[[paste("group",count,sep="_")]] <- items
24 | }
25 |
26 | # If one element present in multiple groups, keep largest
27 | groups <- groups[sapply(groups, length) %>% order() %>% rev()] #re-order NEEDED to ensure biggest groups and no repetition
28 | new.groups <- list()
29 | used <- c()
30 | count = 0
31 | for(i in names(groups)){
32 | count = count + 1
33 | to.regroup <- sapply(groups, function(x) groups[[i]] %in% x) %>% colSums() != 0
34 | to.regroup <- names(to.regroup)[to.regroup]
35 | if(sum(to.regroup %in% used) == 0){
36 | new.groups[[paste("group",count,sep="_")]] <- groups[to.regroup] %>% unlist() %>% unique()
37 | used <- c(used, to.regroup)
38 | }
39 |
40 | }
41 |
42 | annotation = melt(new.groups)
43 | rownames(annotation) <- annotation$value
44 | annotation$value <- NULL
45 | colnames(annotation) <- "new.group"
46 |
47 | } else {
48 |
49 | annotation = data.frame(new.group = "")
50 |
51 | }
52 |
53 | return(annotation)
54 |
55 | }
56 |
57 |
58 |
59 | collapsing <- function(input, duplicated.names){
60 |
61 | if(sum(duplicated(duplicated.names)) != 0){
62 |
63 | exp.var <- apply(input, 1, mean) %>% melt()
64 | exp.var$gene <- duplicated.names
65 | exp.var$IQR <- apply(input, 1, IQR) %>% as.numeric()
66 | exp.var$row.number <- 1:nrow(exp.var)
67 | max.values = exp.var %>%
68 | group_by(gene) %>%
69 | summarise(across(c("value", "IQR"), ~ max(.x)))
70 | exp.var <- merge(max.values, exp.var, by = "gene")
71 | to.keep <- which(exp.var[,2] == exp.var[,4] & exp.var[,3] == exp.var[,5]) %>% sort()
72 |
73 | #if after this criteria, still duplicates, choose the first one (smallest row.number)!
74 | exp.var <- exp.var[to.keep,]
75 | min.row = exp.var %>%
76 | group_by(gene) %>%
77 | summarise(across(c("row.number"), ~ min(.x)))
78 | exp.var <- merge(min.row, exp.var, by = "gene")
79 | to.keep <- which(exp.var[,2] == exp.var[,7]) %>% sort()
80 | exp.var <- exp.var[to.keep,] %>% na.omit()
81 | input <- input[exp.var[,2], ]
82 |
83 | if(length(grep("ENS.*\\.[0-9]+", rownames(input))) > 1000){ # For datasets with ENSG names with format "ENSG00...003.14": keep genes with highest mean expression AND highest variability (first criterium insufficient, there are cases with same mean)
84 |
85 | rownames(input) <- strsplit(rownames(input),"\\.") %>% lapply(., function(x) x[1]) %>% unlist()
86 |
87 | } else if(length(grep("ENS.*\\__", rownames(input))) > 1000){ # For datasets with gene names as "ENSG00...__HGNC": remove the HGNC part
88 |
89 | rownames(input) <- strsplit(rownames(input),"__") %>% lapply(., function(x) x[1]) %>% unlist()
90 |
91 | } else {
92 |
93 | rownames(input) <- exp.var$gene
94 |
95 | }
96 |
97 | }
98 |
99 | return(input)
100 |
101 | }
102 |
103 |
104 |
105 | transformation2 <- function(X, Y, leave.one.out = TRUE) {
106 |
107 | # use the same genes for all input datasets
108 | Genes <- intersect(row.names(Y), row.names(X))
109 |
110 | X <- as.matrix(X[Genes,])
111 | Y <- as.matrix(Y[Genes,])
112 |
113 | if(leave.one.out){
114 |
115 | pred <- intersect(colnames(X), colnames(Y)) # matching samples with sc and bulk data
116 | X.new <- matrix(0, nrow=dim(X)[1], ncol=length(pred))
117 |
118 | for(j in 1:length(pred)){
119 |
120 | X.train <- as.matrix(X[,pred[-j]])
121 | X.test <- as.matrix(X[,pred[j]])
122 | Y.train <- as.matrix(Y[,pred])
123 |
124 | # track the mean and sd after leaving one out:
125 | X.mean <- rowMeans(X.train)
126 | X.sd <- apply(X.train,1,sd)
127 | Y.mean <- rowMeans(Y)
128 | Y.sd <- apply(Y,1,sd)
129 |
130 | for (i in 1:dim(X)[1]){
131 |
132 | # transforming over genes
133 | x <- X.test[i]
134 | sigma_j <- Y.sd[i]*sqrt((length(Y[i,])-1)/(length(Y[i,])+1))
135 | x.new <- (x-X.mean[i])/X.sd[i]
136 | x.new <- x.new*sigma_j + Y.mean[i]
137 |
138 | X.new[i,j] <- x.new
139 |
140 | }
141 |
142 | }
143 |
144 | } else {
145 |
146 | # transforming over genes
147 | X.new <- matrix(0, nrow=dim(X)[1], ncol=dim(X)[2])
148 |
149 | for (i in 1:dim(X)[1]){
150 |
151 | y <- Y[i,]
152 | x <- X[i,]
153 | l <- length(y)
154 | sigma_j <- sd(y)*sqrt((l-1)/(l+1))
155 | x.new <- (x-mean(x))/sd(x)
156 | x.new <- x.new*sigma_j + mean(y)
157 | X.new[i,] <- x.new
158 |
159 | }
160 |
161 | }
162 |
163 | # explicit non-negativity constraint:
164 | X.new = apply(X.new,2,function(x) ifelse(x < 0, 0, x))
165 |
166 | rownames(X.new) = rownames(X); colnames(X.new) = colnames(X)
167 |
168 | return(X.new)
169 |
170 | }
171 |
172 |
173 |
174 | bulkC.fromSC <- function(scC, phenoDataC){
175 |
176 | phenoDataC = phenoDataC[colnames(scC),]
177 | cellType <- phenoDataC$cellType
178 | group = list()
179 |
180 | for(i in unique(cellType)){
181 |
182 | group[[i]] <- which(cellType %in% i)
183 |
184 | }
185 |
186 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x]))
187 | C = do.call(cbind.data.frame, C)
188 |
189 | return(as.matrix(C))
190 |
191 | }
192 |
193 |
194 |
195 | Scaling <- function(matrix, option, phenoDataC=NULL){
196 |
197 | #Avoid Error: Input matrix x contains at least one null or NA-filled row.
198 | matrix = matrix[rowSums(matrix) != 0,]
199 | #Avoid error if all elements within a row are equal (e.g. all 0, or all a common value after log/sqrt/vst transformation)
200 | matrix = matrix[!apply(matrix, 1, function(x) var(x) == 0),]
201 | matrix = matrix[,colSums(matrix) != 0]
202 |
203 | if (option == "LogNormalize"){
204 |
205 | matrix = expm1(Seurat::LogNormalize(matrix, verbose = FALSE)) %>% as.matrix()
206 |
207 | } else if (option == "TMM"){# CPM counts coming from TMM-normalized library sizes; https://support.bioconductor.org/p/114798/
208 |
209 | if(!is.null(phenoDataC)){
210 |
211 | Celltype = as.character(phenoDataC$cellType[phenoDataC$cellID %in% colnames(matrix)])
212 |
213 | } else {
214 |
215 | Celltype = colnames(matrix)
216 |
217 | }
218 |
219 | matrix <- edgeR::DGEList(counts=matrix, group=Celltype)
220 | CtrlGenes <- grep("ERCC-",rownames(data))
221 |
222 | if(length(CtrlGenes) > 1){
223 |
224 | spikes <- data[CtrlGenes,]
225 | spikes <- edgeR::calcNormFactors(spikes, method = "TMM")
226 | matrix$samples$norm.factors <- spikes$samples$norm.factors
227 |
228 | } else {
229 |
230 | matrix <- edgeR::calcNormFactors(matrix, method = "TMM")
231 |
232 | }
233 |
234 | matrix <- edgeR::cpm(matrix)
235 |
236 | } else if (option == "TPM"){
237 |
238 | # MULTI-CORE VERSION: adapted from #devtools::source_url('https://raw.githubusercontent.com/dviraran/SingleR/master/R/HelperFunctions.R', sha1 = "df5560b4ebb28295349aadcbe5b0dc77d847fd9e")
239 |
240 | TPM <- function(counts,lengths=NULL){
241 |
242 | require(foreach)
243 | require(Matrix)
244 |
245 | if (is.null(lengths)) {
246 | data('gene_lengths')
247 | }
248 |
249 | A = intersect(rownames(counts),names(lengths))
250 | counts = as.matrix(counts[A,])
251 | lengths = lengths[A]
252 | rate = counts / lengths
253 |
254 | num_cores <- min(4, parallel::detectCores()) #slurm will automatically restrict this to parameter --cpus-per-task
255 | print(paste("num_cores = ", num_cores, sep = ""))
256 |
257 | doMC::registerDoMC(cores = num_cores)
258 | jump = 100
259 | tpm <- foreach::foreach(elem = seq(1,ncol(counts), jump), .combine='cbind.data.frame') %dopar% {
260 |
261 | lsizes <- colSums(rate[, elem:min(ncol(counts),(elem + jump - 1))])
262 | lo <- 1e6*rate[, elem:min(ncol(counts),(elem + jump - 1))]
263 | tpm <- lo %*% diag(1/lsizes)
264 | tpm
265 |
266 | }
267 |
268 | colnames(tpm) <- colnames(counts)
269 | return(tpm)
270 |
271 | }
272 |
273 | if(! file.exists("human_lengths.rda")){
274 | download.file("https://github.com/dviraran/SingleR/blob/master/data/human_lengths.rda?raw=true","./human_lengths.rda")
275 | }
276 | load("./human_lengths.rda")
277 |
278 | # Doesn't work with Ensembl IDs:
279 | if(length(grep("ENSG000",rownames(matrix))) > 50){
280 |
281 | suppressMessages(library("AnnotationDbi"))
282 | suppressMessages(library("org.Hs.eg.db"))
283 | temp = mapIds(org.Hs.eg.db,keys = names(human_lengths), column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first")
284 | names(human_lengths) = as.character(temp)
285 |
286 | }
287 |
288 | matrix = TPM(counts = matrix, lengths = human_lengths)
289 | rownames(matrix) = toupper(rownames(matrix))
290 |
291 | } else if (option == "TPM.murine"){
292 |
293 | # MULTI-CORE VERSION: adapted from #devtools::source_url('https://raw.githubusercontent.com/dviraran/SingleR/master/R/HelperFunctions.R', sha1 = "df5560b4ebb28295349aadcbe5b0dc77d847fd9e")
294 |
295 | TPM <- function(counts,lengths=NULL){
296 |
297 | require(foreach)
298 | require(Matrix)
299 |
300 | if (is.null(lengths)) {
301 | data('gene_lengths')
302 | }
303 |
304 |
305 | A = intersect(rownames(counts),names(lengths))
306 | counts = as.matrix(counts[A,])
307 | lengths = lengths[A]
308 | rate = counts / lengths
309 |
310 | num_cores <- min(4, parallel::detectCores()) #slurm will automatically restrict this to parameter --cpus-per-task
311 | print(paste("num_cores = ", num_cores, sep = ""))
312 |
313 | doMC::registerDoMC(cores = num_cores)
314 | jump = 100
315 | tpm <- foreach::foreach(elem = seq(1,ncol(counts), jump), .combine='cbind.data.frame') %dopar% {
316 |
317 | lsizes <- colSums(rate[, elem:min(ncol(counts),(elem + jump - 1))])
318 | lo <- 1e6*rate[, elem:min(ncol(counts),(elem + jump - 1))]
319 | tpm <- lo %*% diag(1/lsizes)
320 | tpm
321 |
322 | }
323 |
324 | colnames(tpm) <- colnames(counts)
325 | return(tpm)
326 |
327 | }
328 |
329 | if(! file.exists("mouse_lengths.rda")){
330 | download.file("https://github.com/dviraran/SingleR/blob/master/data/mouse_lengths.rda?raw=true","./mouse_lengths.rda")
331 | }
332 |
333 | load("./mouse_lengths.rda")
334 |
335 | ## Append version with ENSMUSG000 names
336 | mouse_lengths2 <- mouse_lengths
337 |
338 | # Doesn't work with Ensembl IDs:
339 | suppressMessages(library("AnnotationDbi"))
340 | suppressMessages(library("org.Mm.eg.db"))
341 | temp = mapIds(org.Mm.eg.db,keys = names(mouse_lengths), column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first")
342 | names(mouse_lengths) = as.character(temp)
343 |
344 | mouse_lengths = c(mouse_lengths,mouse_lengths2)
345 | matrix = TPM(counts = matrix, lengths = mouse_lengths)
346 |
347 | ####################################################################################
348 | ## scRNA-seq specific
349 |
350 | } else if (option == "SCTransform"){
351 |
352 | matrix = as(matrix, "dgCMatrix")
353 | matrix = sctransform::vst(matrix, return_corrected_umi = TRUE, show_progress = FALSE)$umi_corrected
354 |
355 | } else if (option == "scran"){
356 |
357 | sce = SingleCellExperiment::SingleCellExperiment(assays = list(counts=as.matrix(matrix)))
358 | sce = scran::computeSumFactors(sce, clusters=NULL)
359 | sce = scater::logNormCounts(sce, log = FALSE)
360 | matrix = normcounts(sce)
361 |
362 | } else if (option == "scater"){
363 |
364 | sce = SingleCellExperiment::SingleCellExperiment(assays = list(counts=as.matrix(matrix)))
365 | size_factors = scater::librarySizeFactors(sce)
366 | sce = scater::logNormCounts(sce, log = FALSE)
367 | matrix = normcounts(sce)
368 |
369 | }
370 |
371 | return(matrix)
372 |
373 | }
374 |
375 |
376 |
377 | Deconvolution <- function(T, C, method, phenoDataC, P = NULL, elem = NULL, STRING = NULL, marker_distrib, refProfiles.var){
378 |
379 | bulk_methods = c("CIBERSORT","nnls","FARDEEP","RLR")
380 | sc_methods = c("MuSiC","MuSiC_with_markers","DWLS","SQUID", "Bisque", "Bisque_with_markers")
381 |
382 | ########## Using marker information for bulk_methods
383 |
384 | if(method %in% bulk_methods){
385 |
386 | C = C[rownames(C) %in% unique(marker_distrib$gene), , drop = FALSE]
387 | T = T[rownames(T) %in% unique(marker_distrib$gene), , drop = FALSE]
388 | refProfiles.var = refProfiles.var[rownames(refProfiles.var) %in% unique(marker_distrib$gene), , drop = FALSE]
389 |
390 | } else { ### For scRNA-seq methods
391 |
392 | if(length(grep("[N-n]ame",colnames(phenoDataC))) > 0){
393 | sample_column = grep("[N-n]ame",colnames(phenoDataC))
394 | } else {
395 | sample_column = grep("[S-s]ample|[S-s]ubject",colnames(phenoDataC))
396 | }
397 |
398 | colnames(phenoDataC)[sample_column] = "SubjectName"
399 | rownames(phenoDataC) = phenoDataC$cellID
400 | # establish same order in (sc)C and phenoDataC:
401 | phenoDataC <- phenoDataC[match(colnames(C),phenoDataC$cellID),]
402 |
403 | if(method %in% c("MuSiC","MuSiC_with_markers", "Bisque", "Bisque_with_markers")){
404 |
405 | require(xbioc)
406 | C.eset <- Biobase::ExpressionSet(assayData = as.matrix(C), phenoData = Biobase::AnnotatedDataFrame(phenoDataC))
407 | T.eset <- Biobase::ExpressionSet(assayData = as.matrix(T))
408 |
409 | }
410 |
411 | }
412 |
413 | ########## MATRIX DIMENSION APPROPRIATENESS ##########
414 |
415 | keep = intersect(rownames(C),rownames(T))
416 | C = C[keep, , drop = FALSE]
417 | T = T[keep, , drop = FALSE]
418 |
419 | ###################################
420 |
421 | if(method == "CIBERSORT"){
422 |
423 | #source("./CIBERSORT.R")
424 | RESULTS = CIBERSORT(sig_matrix = C, mixture_file = T, QN = FALSE)
425 | RESULTS = t(RESULTS[,1:(ncol(RESULTS)-3),drop=FALSE])
426 |
427 | } else if (method == "nnls"){
428 |
429 | require(nnls)
430 | RESULTS = do.call(cbind.data.frame,lapply(apply(T,2,function(x) nnls::nnls(as.matrix(C),x)), function(y) y$x))
431 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint
432 | rownames(RESULTS) <- colnames(C)
433 |
434 | } else if (method == "FARDEEP"){
435 |
436 | require(FARDEEP)
437 | RESULTS = t(FARDEEP::fardeep(C, T, nn = TRUE, intercept = TRUE, permn = 10, QN = FALSE)$abs.beta)
438 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint
439 | # These 2 lines are similar as retrieving $relative.beta instead of abs.beta + re-scaling
440 |
441 | } else if (method == "RLR"){ #RLR = robust linear regression
442 |
443 | require(MASS)
444 | RESULTS = do.call(cbind.data.frame,lapply(apply(T,2,function(x) MASS::rlm(x ~ as.matrix(C), maxit=100)), function(y) y$coefficients[-1]))
445 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint
446 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint
447 | rownames(RESULTS) <- unlist(lapply(strsplit(rownames(RESULTS),")"),function(x) x[2]))
448 |
449 | ###################################
450 | ###################################
451 |
452 | } else if (method == "MuSiC"){
453 |
454 | require(MuSiC)
455 | RESULTS = t(MuSiC::music_prop(bulk.eset = T.eset, sc.eset = C.eset, clusters = 'cellType',
456 | markers = NULL, normalize = FALSE, samples = 'SubjectName',
457 | verbose = F)$Est.prop.weighted)
458 |
459 | } else if (method == "MuSiC_with_markers"){
460 |
461 | require(MuSiC)
462 | RESULTS = t(MuSiC::music_prop(bulk.eset = T.eset, sc.eset = C.eset, clusters = 'cellType',
463 | markers = unique(marker_distrib$gene), normalize = FALSE, samples = 'SubjectName',
464 | verbose = F)$Est.prop.weighted)
465 |
466 | } else if (method == "Bisque"){#By default, Bisque uses all genes for decomposition. However, you may supply a list of genes (such as marker genes) to be used with the markers parameter
467 |
468 | require(BisqueRNA)
469 | RESULTS <- BisqueRNA::ReferenceBasedDecomposition(T.eset, C.eset, markers = NULL, use.overlap = FALSE)$bulk.props
470 |
471 | } else if (method == "Bisque_with_markers"){#By default, Bisque uses all genes for decomposition. However, you may supply a list of genes (such as marker genes) to be used with the markers parameter
472 |
473 | require(BisqueRNA)
474 | RESULTS <- BisqueRNA::ReferenceBasedDecomposition(T.eset, C.eset, markers = marker_distrib$gene, use.overlap = FALSE)$bulk.props
475 |
476 | } else if (method == "DWLS"){
477 |
478 | require(DWLS)
479 | path = paste(getwd(),"/results_",STRING,sep="")
480 |
481 | if(! dir.exists(path)){ dir.create(path) } #to avoid repeating marker_selection step when removing cell types; Sig.RData automatically created
482 |
483 | if(!file.exists(paste(path,"Sig.RData",sep="/"))){
484 |
485 | Signature <- DWLS::buildSignatureMatrixMAST(scdata = C, id = as.character(phenoDataC$cellType), path = path, diff.cutoff = 0.5, pval.cutoff = 0.01)
486 |
487 | } else {#re-load signature and remove CT column + its correspondent markers
488 |
489 | load(paste(path,"Sig.RData",sep="/"))
490 | Signature <- Sig
491 |
492 | }
493 |
494 | Signature = as.matrix(Signature)
495 |
496 | RESULTS <- apply(T, 2, function(x){
497 | b = setNames(x, rownames(T))
498 | tr <- DWLS::trimData(Signature, b)
499 | RES <- t(DWLS::solveDampenedWLS(tr$sig, tr$bulk))
500 | })
501 |
502 | rownames(RESULTS) <- as.character(unique(phenoDataC$cellType))
503 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint
504 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint
505 | print(head(RESULTS))
506 |
507 | } else if (method == "SQUID"){
508 |
509 | # Transforming T :
510 | Z = bulkC.fromSC(scC = C, phenoDataC = phenoDataC)
511 | X = T
512 | P <- as.matrix(P[base::colnames(Z),,drop = FALSE])
513 | Y = Z %*% P
514 |
515 | X.new = transformation2(X = X, Y = Y, leave.one.out = FALSE)
516 | X.new[!is.finite(X.new)] <- 0
517 |
518 | # take common genes
519 | Genes <- intersect(rownames(Z),rownames(X.new))
520 |
521 | X.new <- as.matrix.Vector(X.new[Genes,])
522 | Z <- as.matrix.Vector(Z[Genes,])
523 |
524 | RESULTS <- apply(X.new, 2, function(x){
525 | RES <- t(DWLS::solveDampenedWLS(S = Z, B = x))
526 | })
527 |
528 | rownames(RESULTS) <- colnames(Z)
529 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint
530 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint
531 |
532 | }
533 |
534 | RESULTS = RESULTS[gtools::mixedsort(rownames(RESULTS)), , drop = FALSE]
535 | RESULTS = data.table::melt(RESULTS)
536 | colnames(RESULTS) <- c("cell_type","tissue","observed_fraction")
537 | RESULTS$cell_type = as.character(RESULTS$cell_type)
538 | RESULTS$tissue = as.character(RESULTS$tissue)
539 |
540 | if(!is.null(P)){
541 |
542 | P = P[gtools::mixedsort(rownames(P)),,drop = FALSE] %>% data.frame(., check.names = FALSE)
543 | P$cell_type = rownames(P)
544 | P = data.table::melt(P, id.vars="cell_type")
545 | colnames(P) <-c("cell_type","tissue","expected_fraction")
546 | P$cell_type = as.character(P$cell_type)
547 | P$tissue = as.character(P$tissue)
548 |
549 | RESULTS = merge(RESULTS, P, by = c("cell_type", "tissue"), all = TRUE)
550 | RESULTS[is.na(RESULTS)] <- 0
551 | RESULTS$expected_fraction <- round(RESULTS$expected_fraction, 3)
552 | RESULTS$observed_fraction <- round(RESULTS$observed_fraction, 3)
553 |
554 | }
555 |
556 | return(RESULTS)
557 |
558 | }
559 |
--------------------------------------------------------------------------------
/human_lengths.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/human_lengths.rda
--------------------------------------------------------------------------------
/schematic_figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/.DS_Store
--------------------------------------------------------------------------------
/schematic_figures/Scheme1_Github.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme1_Github.jpg
--------------------------------------------------------------------------------
/schematic_figures/Scheme1_Github.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme1_Github.pdf
--------------------------------------------------------------------------------
/schematic_figures/Scheme2_Github.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme2_Github.jpg
--------------------------------------------------------------------------------
/schematic_figures/Scheme2_Github.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme2_Github.pdf
--------------------------------------------------------------------------------
/sessionInfo_macOS.txt:
--------------------------------------------------------------------------------
1 | > sessionInfo()
2 | R version 4.0.1 (2020-06-06)
3 | Platform: x86_64-apple-darwin17.0 (64-bit)
4 | Running under: macOS High Sierra 10.13.6
5 |
6 | Matrix products: default
7 | BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
8 | LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
9 |
10 | locale:
11 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
12 |
13 | attached base packages:
14 | [1] parallel stats4 stats graphics grDevices utils datasets
15 | [8] methods base
16 |
17 | other attached packages:
18 | [1] viridis_0.6.1 viridisLite_0.4.0
19 | [3] forcats_0.5.1 stringr_1.4.0
20 | [5] purrr_0.3.4 readr_1.4.0
21 | [7] tibble_3.1.6 tidyverse_1.3.1
22 | [9] tidyr_1.1.4 sctransform_0.3.2
23 | [11] scran_1.16.0 scater_1.16.2
24 | [13] R.utils_2.10.1 R.oo_1.24.0
25 | [15] R.methodsS3_1.8.1 RColorBrewer_1.1-2
26 | [17] pheatmap_1.0.12 org.Mm.eg.db_3.11.4
27 | [19] org.Hs.eg.db_3.11.4 MuSiC_0.1.1
28 | [21] nnls_1.4 monocle3_1.0.0
29 | [23] Matrix_1.3-4 MASS_7.3-54
30 | [25] gtools_3.9.2 ggrepel_0.9.1
31 | [27] ggrastr_0.2.3 ggpubr_0.4.0
32 | [29] ggpointdensity_0.1.0 FARDEEP_1.0.1
33 | [31] egg_0.4.5 ggplot2_3.3.5
34 | [33] gridExtra_2.3 edgeR_3.30.3
35 | [35] limma_3.44.3 dplyr_1.0.7
36 | [37] doMC_1.3.7 iterators_1.0.13
37 | [39] foreach_1.5.1 devtools_2.4.2
38 | [41] usethis_2.0.1 data.table_1.14.0
39 | [43] AnnotationDbi_1.50.3 MAST_1.14.0
40 | [45] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2
41 | [47] DelayedArray_0.14.1 matrixStats_0.59.0
42 | [49] Biobase_2.50.0 GenomicRanges_1.40.0
43 | [51] GenomeInfoDb_1.24.2 IRanges_2.22.2
44 | [53] S4Vectors_0.26.1 BiocGenerics_0.36.1
45 | [55] varhandle_2.0.5 ROCR_1.0-11
46 | [57] SeuratObject_4.0.2 Seurat_4.0.3
47 | [59] e1071_1.7-9 reshape_0.8.8
48 | [61] quadprog_1.5-8 BisqueRNA_1.0.5
49 |
50 | loaded via a namespace (and not attached):
51 | [1] SparseM_1.81 scattermore_0.7
52 | [3] coda_0.19-4 bit64_4.0.5
53 | [5] irlba_2.3.3 rpart_4.1-15
54 | [7] RCurl_1.98-1.3 generics_0.1.0
55 | [9] callr_3.7.0 cowplot_1.1.1
56 | [11] RSQLite_2.2.7 RANN_2.6.1
57 | [13] proxy_0.4-26 future_1.21.0
58 | [15] bit_4.0.4 xml2_1.3.2
59 | [17] lubridate_1.7.10 spatstat.data_2.1-0
60 | [19] httpuv_1.6.5 assertthat_0.2.1
61 | [21] hms_1.1.0 promises_1.2.0.1
62 | [23] fansi_0.5.0 dbplyr_2.1.1
63 | [25] readxl_1.3.1 igraph_1.2.6
64 | [27] DBI_1.1.1 htmlwidgets_1.5.3
65 | [29] mcmc_0.9-7 spatstat.geom_2.2-0
66 | [31] ellipsis_0.3.2 backports_1.2.1
67 | [33] MCMCpack_1.5-0 deldir_0.2-10
68 | [35] vctrs_0.3.8 remotes_2.4.0
69 | [37] quantreg_5.86 abind_1.4-5
70 | [39] cachem_1.0.5 withr_2.4.2
71 | [41] prettyunits_1.1.1 goftest_1.2-2
72 | [43] cluster_2.1.2 lazyeval_0.2.2
73 | [45] crayon_1.4.1 pkgconfig_2.0.3
74 | [47] nlme_3.1-152 vipor_0.4.5
75 | [49] pkgload_1.2.1 rlang_0.4.11
76 | [51] globals_0.14.0 lifecycle_1.0.0
77 | [53] miniUI_0.1.1.1 MatrixModels_0.5-0
78 | [55] modelr_0.1.8 rsvd_1.0.5
79 | [57] cellranger_1.1.0 rprojroot_2.0.2
80 | [59] polyclip_1.10-0 lmtest_0.9-38
81 | [61] carData_3.0-4 zoo_1.8-9
82 | [63] reprex_2.0.0 beeswarm_0.4.0
83 | [65] ggridges_0.5.3 processx_3.5.2
84 | [67] png_0.1-7 bitops_1.0-7
85 | [69] KernSmooth_2.23-20 blob_1.2.1
86 | [71] DelayedMatrixStats_1.10.1 parallelly_1.26.0
87 | [73] rstatix_0.7.0 ggsignif_0.6.2
88 | [75] scales_1.1.1 memoise_2.0.0
89 | [77] magrittr_2.0.1 plyr_1.8.6
90 | [79] ica_1.0-2 zlibbioc_1.34.0
91 | [81] compiler_4.0.1 dqrng_0.3.0
92 | [83] fitdistrplus_1.1-5 cli_2.5.0
93 | [85] XVector_0.28.0 listenv_0.8.0
94 | [87] patchwork_1.1.1 pbapply_1.4-3
95 | [89] ps_1.6.0 mgcv_1.8-36
96 | [91] tidyselect_1.1.1 stringi_1.6.2
97 | [93] BiocSingular_1.4.0 locfit_1.5-9.4
98 | [95] grid_4.0.1 tools_4.0.1
99 | [97] future.apply_1.7.0 rio_0.5.27
100 | [99] rstudioapi_0.13 foreign_0.8-81
101 | [101] Rtsne_0.15 digest_0.6.27
102 | [103] shiny_1.6.0 Rcpp_1.0.8
103 | [105] car_3.0-11 broom_0.7.8
104 | [107] later_1.2.0 RcppAnnoy_0.0.18
105 | [109] httr_1.4.2 colorspace_2.0-2
106 | [111] rvest_1.0.0 fs_1.5.0
107 | [113] tensor_1.5 reticulate_1.20
108 | [115] splines_4.0.1 uwot_0.1.10
109 | [117] statmod_1.4.36 conquer_1.0.2
110 | [119] spatstat.utils_2.2-0 plotly_4.9.4.1
111 | [121] sessioninfo_1.1.1 xtable_1.8-4
112 | [123] jsonlite_1.7.2 testthat_3.0.3
113 | [125] R6_2.5.0 pillar_1.6.4
114 | [127] htmltools_0.5.1.1 mime_0.11
115 | [129] glue_1.4.2 fastmap_1.1.0
116 | [131] BiocParallel_1.22.0 BiocNeighbors_1.6.0
117 | [133] class_7.3-19 codetools_0.2-18
118 | [135] pkgbuild_1.2.0 utf8_1.2.1
119 | [137] lattice_0.20-44 spatstat.sparse_2.0-0
120 | [139] curl_4.3.2 ggbeeswarm_0.6.0
121 | [141] leiden_0.3.8 zip_2.2.0
122 | [143] openxlsx_4.2.4 survival_3.2-11
123 | [145] desc_1.3.0 munsell_0.5.0
124 | [147] GenomeInfoDbData_1.2.3 haven_2.4.1
125 | [149] reshape2_1.4.4 gtable_0.3.0
126 | [151] spatstat.core_2.2-0
127 |
--------------------------------------------------------------------------------