├── .DS_Store ├── ENSG_to_HGNC.rds ├── LICENSE.md ├── MASTER.R ├── README.md ├── SQUID ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R │ └── SQUID.R ├── README.md ├── SQUID.Rproj ├── SQUID_Toy_Example_Data.RData └── man │ └── SQUID.Rd ├── SURVIVAL └── Survival_Analyses.R ├── Toy_example ├── .DS_Store ├── P.rds ├── T.rds ├── markers_TMM_FC1.5_Seuratwilcox.rds ├── phenoDataC_clusters_after_regrouping.txt ├── phenoDataC_clusters_wo_regrouping.txt └── scC.rds ├── additional_data └── Denisenko_crosstable_bulk_sc_sn_14092021.csv ├── helper_functions.R ├── human_lengths.rda ├── schematic_figures ├── .DS_Store ├── Scheme1_Github.jpg ├── Scheme1_Github.pdf ├── Scheme2_Github.jpg └── Scheme2_Github.pdf └── sessionInfo_macOS.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/.DS_Store -------------------------------------------------------------------------------- /ENSG_to_HGNC.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/ENSG_to_HGNC.rds -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # LICENSE 2 | 3 | This repository is distributed under the BSD 3-clause license. 4 | 5 | **IMPORTANT: methods used and included in this repository are provided as separate packages. Their respective license conditions apply** 6 | 7 | 8 | ``` 9 | Copyright 2022 Francisco Avila Cobos 10 | 11 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | ``` -------------------------------------------------------------------------------- /MASTER.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(tidyverse)) 2 | suppressMessages(library(data.table)) 3 | suppressMessages(library(ggrastr)) 4 | suppressMessages(library(viridis)) 5 | suppressMessages(library(ggpointdensity)) 6 | suppressMessages(library(egg)) 7 | suppressMessages(library(Matrix)) 8 | suppressMessages(library(matrixStats)) 9 | suppressMessages(library(limma)) 10 | suppressMessages(library(ggplot2)) 11 | suppressMessages(library(RColorBrewer)) 12 | suppressMessages(library(ggrepel)) 13 | suppressMessages(library(ggpubr)) 14 | suppressMessages(library(monocle3)) 15 | suppressMessages(library(Seurat)) 16 | suppressMessages(library(foreach)) 17 | suppressMessages(library(R.utils)) 18 | 19 | args <- R.utils::commandArgs(trailingOnly=TRUE) 20 | 21 | if(length(args) != 5){ 22 | 23 | print("Please check that all required parameters are indicated or are correct") 24 | stop() 25 | } 26 | 27 | set.seed(4) 28 | options(digits = 3) 29 | options(future.globals.maxSize = 2000 * 1024^2) #https://satijalab.org/seurat/archive/v3.0/future_vignette.html 30 | 31 | ENSG_to_HGNC = readRDS("./ENSG_to_HGNC.rds") 32 | 33 | source('./helper_functions.R') 34 | 35 | ############################################################################## 36 | ## Input files and parameters: 37 | 38 | directory <- args[1] 39 | ref.normalization = args[2] # normalization choice for scC or C 40 | mix.normalization = args[3] # normalization choice for T 41 | method = args[4] 42 | 43 | cell.type.info.present <- args[5] #if "yes", no need to apply unsupervised clustering 44 | 45 | ############################################################################## 46 | ## Reading input into memory: 47 | 48 | setwd(directory) 49 | 50 | scC <- readRDS("./scC.rds") %>% as.matrix() 51 | T = readRDS("./T.rds") %>% as.matrix() 52 | phenoDataC = read.table("phenoDataC_clusters_wo_regrouping.txt", header = TRUE) # It must contain at least two columns: "cellID" & "SubjectName". "cellType" is optional (e.g. if cell.type.info.present = "no") 53 | 54 | #Unless proportions are provided from IHC, it would be computed directly from scRNA-seq/snRNA-seq 55 | if(!file.exists("./P.rds")){ 56 | P = NULL 57 | } else { 58 | P = readRDS("./P.rds") 59 | } 60 | 61 | ############################################################################## 62 | ## PRE-PROCESSING AND QUALITY CONTROL 63 | 64 | # Filter to remove 1% of extreme values (top 0.5% and bottom 0.5%) 65 | filterCells <- function(filterParam){ 66 | cellsToRemove <- which(filterParam < quantile(filterParam, .005) | filterParam > quantile(filterParam, .995)) 67 | cellsToRemove 68 | } 69 | 70 | libSizes <- colSums(scC) 71 | gene_names <- rownames(scC) 72 | 73 | if(length(grep("ENSG000|ENSMUSG000",rownames(scC))) > 100){ 74 | 75 | mtID.Ensembl <- ENSG_to_HGNC$ENSG_ID[grep("^MT-|_MT-", ENSG_to_HGNC$HGNC, ignore.case = TRUE)] 76 | rbID.Ensembl <- ENSG_to_HGNC$ENSG_ID[grep("^RPL|^RPS|_RPL|_RPS", ENSG_to_HGNC$HGNC, ignore.case = TRUE)] 77 | mtID <- gene_names %in% mtID.Ensembl 78 | rbID <- gene_names %in% rbID.Ensembl 79 | 80 | } else { 81 | 82 | mtID <- grepl("^MT-|_MT-", gene_names, ignore.case = TRUE) 83 | rbID <- grepl("^RPL|^RPS|_RPL|_RPS", gene_names, ignore.case = TRUE) 84 | 85 | } 86 | 87 | mtPercent <- colSums(scC[mtID, ])/libSizes 88 | rbPercent <- colSums(scC[rbID, ])/libSizes 89 | 90 | lapply(list(libSizes = libSizes, mtPercent = mtPercent, rbPercent = rbPercent), filterCells) %>% 91 | unlist() %>% 92 | unique() -> cellsToRemove 93 | 94 | if(length(cellsToRemove) != 0){ 95 | scC <- scC[,-cellsToRemove] 96 | phenoDataC <- phenoDataC[-cellsToRemove,] 97 | } 98 | 99 | # Keep only "detectable" genes: min(round(0.01 * ncol(scC), 10) cells (regardless of the group) have a read/UMI count different from 0 100 | keep <- which(Matrix::rowSums(scC > 0) >= min(round(0.01 * ncol(scC)), 10)) #otherwise it removes many genes from Ellis dataset 101 | scC = scC[keep,] 102 | 103 | ############################################################################## 104 | ## Unsupervised clustering (Monocle3) if metadata information not provided 105 | 106 | if(cell.type.info.present == "no"){ 107 | 108 | if(file.exists("phenoDataC_clusters_after_regrouping.txt")){ 109 | 110 | phenoDataC = read.table("phenoDataC_clusters_after_regrouping.txt", header = TRUE) 111 | 112 | } else { 113 | 114 | cds <- monocle3::new_cell_data_set(scC) 115 | cds <- monocle3::preprocess_cds(cds, num_dim = 100, norm_method = "log", method = "PCA", scaling = TRUE) 116 | cds <- monocle3::reduce_dimension(cds, max_components = 2, umap.metric = "cosine", umap.fast_sgd = FALSE, preprocess_method = 'PCA') 117 | cds <- monocle3::cluster_cells(cds, k = 20, resolution = NULL, partition_qval = 0.05, num_iter = 1) #with resolution = NULL, the parameter is determined automatically 118 | 119 | Monocle3_metadata <- data.frame(cellID = names(cds@clusters$UMAP$clusters), cellType = paste("cluster", cds@clusters$UMAP$clusters, sep =".")) 120 | phenoDataC$clusterID <- Monocle3_metadata$cellType 121 | 122 | m1 <- monocle3::plot_cells(cds, color_cells_by = "cluster", group_label_size = 4) 123 | 124 | ggsave(paste("Monocle3_",basename(directory),".pdf",sep=""), gridExtra::grid.arrange(m1, nrow = 1), height = 6, width = 6, limitsize = FALSE) 125 | dev.off() 126 | 127 | ## Make C based on clusters from unsupervised clustering 128 | cellType <- phenoDataC$clusterID 129 | group = list() 130 | 131 | for(i in unique(cellType)){ 132 | group[[i]] <- which(cellType %in% i) 133 | } 134 | 135 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x])) #C should be made with the mean (not sum) to agree with the way markers were selected 136 | C = do.call(cbind.data.frame, C) 137 | 138 | ############### 139 | # Regrouping if correlation > 0.95 140 | 141 | correlation.matrix = cor(C) 142 | annotation.regrouped <- regroup.cor(correlation.matrix, correlation.threshold = 0.95) 143 | 144 | if(nrow(annotation.regrouped) > 0){ 145 | 146 | pdf(paste("Monocle3_cor_full_", basename(getwd()), "_log.ls.norm.pdf", sep = ""), width = max(7, ncol(C)/2), height = max(7, ncol(C)/2)) 147 | pheatmap::pheatmap(correlation.matrix, annotation = annotation.regrouped, scale = "none", display_numbers = TRUE, number_format = "%.2f") 148 | dev.off() 149 | 150 | counter = 1 151 | for(elem in unique(annotation.regrouped$new.group)){ 152 | phenoDataC$clusterID[phenoDataC$clusterID %in% rownames(annotation.regrouped)[annotation.regrouped$new.group == elem]] <- paste("regrouped.cluster", counter, sep = ".") 153 | counter = counter + 1 154 | } 155 | 156 | phenoDataC$cellType <- phenoDataC$clusterID 157 | # re-order phenoDataC in function of scC 158 | phenoDataC = phenoDataC[phenoDataC$cellID %in% colnames(scC),] 159 | phenoDataC <- phenoDataC[match(colnames(scC),phenoDataC$cellID),] 160 | 161 | } 162 | 163 | write.table(phenoDataC, file = "phenoDataC_clusters_after_regrouping.txt", row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE) 164 | 165 | ############### 166 | # re-making UMAP plots with new information: 167 | 168 | cds@colData$new.cluster = phenoDataC$clusterID[match(cds@colData$cell, phenoDataC$cellID)] 169 | 170 | m2 <- monocle3::plot_cells(cds, color_cells_by = "new.cluster", group_label_size = 4, show_trajectory_graph = FALSE) 171 | 172 | ggsave(paste("MONOCLE.UMAP_REGROUPED_",basename(directory),".pdf",sep=""), gridExtra::grid.arrange(m2, nrow = 1), height = 6, width = 6, limitsize = FALSE) 173 | dev.off() 174 | 175 | } 176 | 177 | } else { 178 | 179 | # re-order phenoDataC in function of scC 180 | phenoDataC = phenoDataC[phenoDataC$cellID %in% colnames(scC),] 181 | phenoDataC <- phenoDataC[match(colnames(scC),phenoDataC$cellID),] 182 | 183 | } 184 | 185 | ## Ensure that phenoDataC & scC have the same (& ordered) cells [for cases where users will provide a foreign "phenoDataC_clusters_after_regrouping.txt" as input]: 186 | common.cells = intersect(phenoDataC$cellID,colnames(scC)) 187 | scC = scC[, common.cells] 188 | phenoDataC = phenoDataC[match(common.cells, phenoDataC$cellID),] 189 | 190 | # Create matrix C (made with the mean expression values for all cells across cell types) 191 | cellType <- phenoDataC$cellType 192 | group = list() 193 | 194 | for(i in unique(cellType)){ 195 | group[[i]] <- which(cellType %in% i) 196 | } 197 | 198 | # Generating C (raw) from the mean (raw) expression values across all cells from each cell type 199 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x])) 200 | C = do.call(cbind.data.frame, C) %>% as.matrix() 201 | 202 | ############################################################################## 203 | ## Remove row duplicates 204 | 205 | C = collapsing(C, rownames(C)) 206 | scC = collapsing(scC, rownames(scC)) 207 | scC.raw = scC # needed to run marker selection + deconvolution 208 | T = collapsing(T, rownames(T)) 209 | 210 | ############################################################################## 211 | ## Single-cell and bulk normalization 212 | 213 | C <- Scaling(matrix = C, option = ref.normalization) 214 | scC <- Scaling(matrix = scC.raw, option = ref.normalization, phenoDataC = phenoDataC) 215 | 216 | T <- Scaling(matrix = T, option = mix.normalization) 217 | 218 | ############################################################################## 219 | ## Seurat - FindAllMarkers (Wilcoxon) on TMM normalized scRNA-seq/snRNA-seq 220 | # MuSiC & DWLS: without markers 221 | 222 | if(!method %in% c("MuSiC", "DWLS", "SQUID")){ 223 | 224 | if(! file.exists("./markers_TMM_FC1.5_Seuratwilcox.rds")){ 225 | 226 | scC <- Scaling(matrix = scC.raw, option = "TMM", phenoDataC = phenoDataC) 227 | 228 | scC <- Seurat::CreateSeuratObject(counts = scC) 229 | scC$groups <- as.character(phenoDataC$clusterID) 230 | Idents(object = scC) <- as.character(phenoDataC$clusterID) #Needed. See: https://github.com/satijalab/seurat/issues/1482 231 | 232 | markers <- Seurat::FindAllMarkers(scC, 233 | logfc.threshold = log2(1.5), 234 | grouping.var = "groups", 235 | test.use = "wilcox") 236 | 237 | markers <- markers[,c("gene","avg_log2FC","cluster")] 238 | markers$AveExpr <- Inf 239 | colnames(markers) <- c("gene","log2FC","cell_type","AveExpr") 240 | markers$HGNC <- ENSG_to_HGNC$HGNC[match(markers$gene, ENSG_to_HGNC$ENSG_ID)] 241 | markers$ENSG <- ENSG_to_HGNC$ENSG[match(markers$gene, ENSG_to_HGNC$HGNC)] 242 | if(sum(is.na(markers$HGNC)) == nrow(markers)){markers$HGNC <- NULL; markers$HGNC <- rownames(markers)} 243 | if(sum(is.na(markers$ENSG)) == nrow(markers)){markers$ENSG <- NULL; markers$ENSG <- rownames(markers)} 244 | 245 | #if markers$gene is both ENSG and HGNC, split: 246 | if(length(grep("--",markers$gene)) == nrow(markers)){ markers$gene <- strsplit(markers$gene, "--") %>% lapply(., function(x) x[1]) %>% unlist()} 247 | if(length(grep("__",markers$gene)) == nrow(markers)){ markers$gene <- strsplit(markers$gene, "__") %>% lapply(., function(x) x[1]) %>% unlist()} 248 | 249 | ## if needed, put markers$gene in same gene format as rownames C & T: 250 | if(length(intersect(markers$gene, rownames(C))) == 0){ 251 | 252 | if(length(grep("ENSG000|ENSMUSG000",markers$gene)) > 100){ 253 | 254 | markers.gene <- ENSG_to_HGNC$HGNC[match(markers$gene, ENSG_to_HGNC$ENSG_ID)] 255 | 256 | } else { 257 | 258 | markers.gene <- ENSG_to_HGNC$ENSG_ID[match(markers$gene, ENSG_to_HGNC$HGNC)] 259 | 260 | } 261 | 262 | markers$gene <- markers.gene 263 | 264 | } 265 | 266 | no.dups <- which(table(markers$gene)[markers$gene] == 1) %>% as.numeric() 267 | dups <- which(table(markers$gene)[markers$gene] > 1) #keeps ALL occurrences in original table + original ordering 268 | dups <- sapply(unique(names(dups)), function(x) grep(x, markers$gene)[1]) %>% as.numeric() #keeps ONLY FIRST occurrence of duplicates (in original table) + original ordering 269 | markers <- markers[c(no.dups, dups),] 270 | 271 | saveRDS(markers, "./markers_TMM_FC1.5_Seuratwilcox.rds") 272 | 273 | } else { 274 | 275 | markers = readRDS("./markers_TMM_FC1.5_Seuratwilcox.rds") 276 | 277 | } 278 | 279 | } else { 280 | 281 | markers = "" 282 | 283 | } 284 | 285 | ############################################################################## 286 | ## Deconvolution 287 | 288 | STRING <- paste(basename(directory), method, ref.normalization, mix.normalization, sep = "_") 289 | 290 | output.filename <- paste(STRING, ".rds", sep = "") 291 | output_folder <- paste(directory, "OUTPUT_DECONVOLUTION/", sep = "/") 292 | ifelse(!dir.exists(output_folder), dir.create(output_folder), FALSE) 293 | output_name <- paste(output_folder, "RESULTS_", output.filename, sep = "") 294 | 295 | if(is.null(P)){ 296 | 297 | P <- table(phenoDataC$SubjectName,phenoDataC$cellType) %>% t() %>% as.data.table() %>% data.table::dcast.data.table(., V1 ~ V2, fill = "N") 298 | P <- data.frame(P, check.names = FALSE) 299 | rownames(P) <- P$V1 300 | P$V1 <- NULL 301 | P <- P[,colnames(P) %in% colnames(T)] 302 | P <- apply(P, 2, function(x) x / sum(x)) 303 | 304 | } 305 | 306 | ## Unify row names between T and C (after normalization!) 307 | perc.overlap <- (length(intersect(rownames(C),rownames(T))) * 100)/min(nrow(T),nrow(C)) 308 | if(perc.overlap < 33.3){ 309 | 310 | if(length(grep("ENSG000|ENSMUSG000",rownames(T))) > 100){ 311 | 312 | rownames.T <- ENSG_to_HGNC$HGNC[match(rownames(T), ENSG_to_HGNC$ENSG_ID)] 313 | T = T[!is.na(rownames.T),] 314 | rownames.T = rownames.T[!is.na(rownames.T)] 315 | 316 | } else { 317 | 318 | rownames.T <- ENSG_to_HGNC$ENSG_ID[match(rownames(T), ENSG_to_HGNC$HGNC)] 319 | T = T[!is.na(rownames.T),] 320 | rownames.T = rownames.T[!is.na(rownames.T)] 321 | 322 | } 323 | 324 | T <- collapsing(T, rownames.T) 325 | 326 | } 327 | 328 | # To avoid issues with sample names that are fully integers 329 | if(length(grep("[A-Z|a-z]", colnames(T))) == 0){colnames(T) <- paste("X",colnames(T),sep="")} 330 | if(length(grep("[A-Z|a-z]", colnames(P))) == 0){colnames(P) <- paste("X",colnames(P),sep="")} 331 | 332 | ## Remove NA values (if any) after row.name interconversion 333 | if(sum(is.na(rownames(T))) != 0){T = T[!is.na(rownames(T)), ]} 334 | 335 | P = P[,gtools::mixedsort(colnames(P))] 336 | T = T[,gtools::mixedsort(colnames(T))] 337 | 338 | #To avoid MAST-DWLS to be repeated! 339 | if(method == "DWLS"){STRING <- paste("DWLS", ref.normalization, sep = "_")} 340 | 341 | if(method %in% c("CIBERSORT", "RLR", "FARDEEP", "nnls")){ 342 | 343 | RESULTS = Deconvolution(T = T, 344 | C = C, 345 | method = method, 346 | phenoDataC = phenoDataC, 347 | P = P, 348 | STRING = STRING, 349 | marker_distrib = markers, 350 | refProfiles.var = NULL) 351 | 352 | } else { 353 | 354 | RESULTS = Deconvolution(T = T, 355 | C = scC, 356 | method = method, 357 | phenoDataC = phenoDataC, 358 | P = P, 359 | STRING = STRING, 360 | marker_distrib = markers, 361 | refProfiles.var = NULL) 362 | 363 | } 364 | 365 | RESULTS$dataset <- basename(directory) 366 | RESULTS$method <- method 367 | RESULTS$ref.normalization <- ref.normalization 368 | RESULTS$mix.normalization <- mix.normalization 369 | 370 | ann_text.global = RESULTS %>% dplyr::summarise(RMSE = sqrt(mean((observed_fraction - expected_fraction)^2)) %>% round(.,3), 371 | Pearson = cor(observed_fraction, expected_fraction) %>% round(.,3)) 372 | 373 | label = paste("Pearson = ", ann_text.global$Pearson, "\n RMSE = ", ann_text.global$RMSE, sep = "") 374 | 375 | scaleFUN <- function(x) sprintf("%.3f", x) 376 | 377 | P <- ggplot(RESULTS, aes(x = observed_fraction , y = expected_fraction, colour = cell_type)) + 378 | geom_point() + 379 | geom_abline(intercept = 0, slope = 1, lty = 2, col = "blue") + 380 | facet_grid( ~ method) + 381 | geom_text(vjust = "inward", hjust = "inward", data = ann_text.global, aes(x = 0.02, y = 1.04, label = label), size = 4, inherit.aes = FALSE)+ 382 | ggtitle("all_normalizations_per_cell") + 383 | theme_bw() + 384 | xlim(0,1) + 385 | ylim(0,1.05) + 386 | xlab("observed fraction") + 387 | ylab("expected fraction") + 388 | ggtitle(STRING) + 389 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 390 | legend.position = "bottom") 391 | 392 | ggsave(P, filename = paste(output_folder, paste(STRING, ".pdf", sep = ""), sep = "/"), width = 6, height = 6) 393 | saveRDS(RESULTS, output_name) 394 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Source code (R statistical programming language, v4.0.1) to reproduce the results described in the article: 2 | 3 | > Francisco Avila Cobos, Mohammad Javad Najaf Panah, Jessica Epps, Xiaochen Long, Tsz-Kwong Man, Hua-Sheng Chiu, Elad Chomsky, Evgeny Kiner, Michael J Krueger, Diego di Bernardo, Luis Voloch, Jan Molenaar, Sander R. van Hooff, Frank Westermann, Selina Jansky, Michele L. Redell, Pieter Mestdagh, Pavel Sumazin **Effective methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes** *(bioRxiv; https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2)* 4 | 5 | SQUID as standalone R package 6 | ======== 7 | Please go here to detailed instructions for its installation: 8 | > https://github.com/favilaco/deconv_matching_bulk_scnRNA/tree/master/SQUID 9 | 10 | DATASETS 11 | ======== 12 | Here we provide an **example folder** (named "Toy_example"; see *"Folder requirements" & "Running the deconvolution"*) that can be directly used to test the framework. It contains an artificial single-cell RNA-seq dataset made of 5 artificial cell types where two of them are highly correlated (cell type 1 and 5); 1000 cells per cell type and 64 genes (of which 16 are marker genes, 4 per cell type and in different ranges of expression). 13 | 14 | The **other external datasets used in the manuscript** (together with the necessary metadata) can be downloaded from their respective sources: 15 | 16 | * Cell_mixtures: GEO GSE220608 17 | * Kidney: GEO GSE141115 (https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE141115&format=file) ; additional sample metadata information in "additional_data/Denisenko_crosstable_bulk_sc_sn_14092021". 18 | * AML: GEO GSE220608 19 | * Breast_cancer: GEO GSE176078 ("GSE176078_RAW.tar" from "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE176078&format=file") 20 | * NB_1: EGAS00001004349; 21 | * NB_2: EGAS00001006723; EGAS00001006823; GEO GSE218450 22 | * Synapse: syn8691134; syn23554292; syn23554293; syn23554294 23 | * Brain: GEO GSE67835 ("GSE67835_RAW.tar" from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE67835&format=file) & IHC proportions (re-scaled to sum-to-one) present here: https://github.com/ellispatrick/CortexCellDeconv/tree/master/CellTypeDeconvAnalysis/Data [IHC.astro.txt, IHC.endo.txt, IHC.microglia.txt, IHC.neuro.txt; IHC.oligo.txt] 24 | 25 | 26 | The following line is needed for fresh installations of Linux (Debian): 27 | `sudo apt-get install curl libcurl4-openssl-dev libssl-dev zlib1g-dev r-base-dev libxml2-dev` 28 | 29 | 30 | R 4.0.1: REQUIRED PACKAGES AND PACKAGE DEPENDENCIES: 31 | =================================================== 32 | Packages to be installed (alphabetically ordered) before running any deconvolution (**R >= 4.0.1**): 33 | ``` 34 | AnnotationDbi 35 | Biobase 36 | CIBERSORT 37 | data.table 38 | devtools 39 | doMC 40 | dplyr 41 | DWLS 42 | edgeR 43 | egg 44 | foreach 45 | FARDEEP 46 | ggplot2 47 | ggpointdensity 48 | ggpubr 49 | ggrastr 50 | ggrepel 51 | gridExtra 52 | gtools 53 | limma 54 | MASS 55 | Matrix 56 | matrixStats 57 | monocle3 58 | MuSiC 59 | nnls 60 | org.Hs.eg.db 61 | org.Mm.eg.db 62 | parallel 63 | pheatmap 64 | RColorBrewer 65 | R.utils 66 | scater 67 | scran 68 | sctransform 69 | Seurat 70 | SingleCellExperiment 71 | tidyr 72 | tidyverse 73 | viridis 74 | ``` 75 | 76 | 77 | References to other methods included in our benchmark: 78 | ====================================================== 79 | While our work has a **BSD (3-clause)** license, you **may need** to obtain a license to use the individual normalization/deconvolution methods (e.g. CIBERSORT. The source code for CIBERSORT needs to be asked to the authors at https://cibersort.stanford.edu). 80 | 81 | | method | ref | 82 | |--------|----------| 83 | | nnls | Mullen, K. M. & van Stokkum, I. H. M. nnls: The Lawson-Hanson algorithm for non-negative least squares (NNLS). R package version 1.4. https://CRAN.R-project.org/package=nnls | 84 | | FARDEEP | Hao, Y., Yan, M., Lei, Y. L. & Xie, Y. Fast and Robust Deconvolution of Tumor Infiltrating Lymphocyte from Expression Profiles using Least Trimmed Squares. bioRxiv 358366 (2018) doi:10.1101/358366 | 85 | | MASS: Robust linear regression (RLR) | Ripley, B. et al. MASS: Support Functions and Datasets for Venables and Ripley’s MASS. (2002) | 86 | | CIBERSORT | Newman, A. M. et al. Robust enumeration of cell subsets from tissue expression profiles. Nat. Methods 12, 453–457 (2015) | 87 | |--------|----------| 88 | | DWLS | Tsoucas, D. et al. Accurate estimation of cell-type composition from gene expression data. Nat. Commun. 10, 1–9 (2019) | 89 | | Bisque | Jew, B. et al. Accurate estimation of cell composition in bulk expression through robust integration of single-cell information. Nat. Commun. 11, 1971 (2020) | 90 | | MuSiC | Wang, X., Park, J., Susztak, K., Zhang, N. R. & Li, M. Bulk tissue cell type deconvolution with multi-subject single-cell expression reference. Nat. Commun. 10, 380 (2019) | 91 | |--------|----------| 92 | | SCTransform / regularized negative binomial regression (RNBR) | Hafemeister, C. & Satija, R. Normalization and variance stabilization of single-cell RNA-seq data using regularized negative binomial regression. Genome Biology (2019) doi:10.1186/s13059-019-1874-1 | 93 | |scran | L. Lun, A. T., Bach, K. & Marioni, J. C. Pooling across cells to normalize single-cell RNA sequencing data with many zero counts. Genome Biol. 17, 75 (2016) | 94 | | scater | McCarthy, D. J., Campbell, K. R., Lun, A. T. L. & Wills, Q. F. Scater: pre-processing, quality control, normalization and visualization of single-cell RNA-seq data in R. Bioinformatics 33, 1179–1186 (2017) | 95 | | Trimmed mean of M-values (TMM) | Robinson, M. D. & Oshlack, A. A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biol. 11, R25 (2010) | 96 | | Transcripts per million (TPM) | Li, B., Ruotti, V., Stewart, R. M., Thomson, J. A. & Dewey, C. N. RNA-Seq gene expression estimation with read mapping uncertainty. Bioinformatics 26, 493–500 (2010) | 97 | | LogNormalize | LogNormalize function (part of "Seurat"). R Documentation. https://www.rdocumentation.org/packages/Seurat/versions/3.1.1/topics/LogNormalize ; Butler, A., Hoffman, P., Smibert, P. et al. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat Biotechnol 36, 411–420 (2018) doi:10.1038/nbt.4096 | 98 | 99 | 100 | FOLDER REQUIREMENTS 101 | =================== 102 | 103 | a) Folder structure: 104 | 105 | **NOTE**: The gold standard matrix of proportions ("P") is either present or obtained as the sum of individual cells across each cell type. 106 | 107 | ``` 108 | . 109 | ├── AML 110 | │   ├── scC.rds 111 | │   ├── T.rds 112 | │   ├── phenoDataC_clusters_wo_regrouping.txt 113 | ├── Brain 114 | │   ├── scC.rds 115 | │   ├── T.rds 116 | │   ├── P.rds #from IHC 117 | │   ├── phenoDataC_clusters_wo_regrouping.txt 118 | ├── Toy_example 119 | │   ├── scC.rds 120 | │   ├── T.rds 121 | │   ├── P.rds 122 | │   ├── phenoDataC_clusters_wo_regrouping.txt 123 | │   └── markers_TMM_FC1.5_Seuratwilcox.rds #(if not present will be created) 124 | ... 125 | │ 126 | ├── helper_functions.R 127 | ├── MASTER.R 128 | └── CIBERSORT.R 129 | ``` 130 | 131 | b) Minimally the following (tab-separated) columns being part of the metadata: "cellID", "cellType", "SubjectName". 132 | 133 | ``` 134 | # For the "example" dataset, it should look like: 135 | 136 | cellID cellType SubjectName 137 | cell_1 cell_type_1 Mix_1 138 | cell_2 cell_type_1 Mix_2 139 | ... 140 | ``` 141 | 142 | 143 | RUNNING THE DECONVOLUTION 144 | ========================= 145 | The deconvolution problem is formulated as: T = C·P [see https://doi.org/10.1093/bioinformatics/bty019 and https://doi.org/10.1038/s41467-020-19015-1 for detailed information] 146 | 147 | Make the following choices (in order): 148 | 149 | ``` 150 | i) specific dataset [from: "Toy_example","Cell_mixtures", "Kidney", "AML", "Breast_cancer", "NB_1", "NB_2", "Synapse", "Brain"] 151 | ii) normalization strategy for the reference matrix [bulk (C): "none", "LogNormalize", "TMM", "TPM"] ; [single-cell: "none", "LogNormalize", "TMM", "TPM", "SCTransform", "scran", "scater"] 152 | iii) normalization strategy for the mixture matrix (T)["none", "LogNormalize", "TMM", "TPM"] 153 | iv) deconvolution method [from: "CIBERSORT", "nnls", "FARDEEP", "RLR", "MuSiC", "MuSiC_with_markers", "Bisque", "Bisque_with_markers", "DWLS", "SQUID"] 154 | v) indicate whether cell-type labels are included ("yes") or whether they should be obtained by unsupervised clustering ("no") 155 | ``` 156 | 157 | **NOTE: To systematically test the benefit of bulk transformation and deconvolution with SQUID, a leave-one-out cross-validation strategy can also be used: iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were used to predict the composition of the remaining sample based on its bulk RNA-Seq profile. By default, all available data is used at once, but this approach can be tried out by using "leave.one.out = TRUE" as a parameter of the function "transformation2" that is used inside SQUID (line 507 inside "helper_functions.R")** 158 | 159 | 160 | R example call 161 | =============== 162 | 163 | **MINIMUM REQUIREMENTS**: matching scnRNA-seq **("scC.rds")** and bulk RNA-seq **("T.rds")** samples together with the correspoinding metadata **("phenoDataC_clusters_wo_regrouping.txt")**. 164 | 165 | **OPTIONAL**: if users have gold standard proportion estimates from IHC or other techniques, they can include such information as **"P.rds"** (this will avoid re-computing the proportions directly from the scnRNA-seq data). If users wish to provide their own re-clustering or set of cell type labels, they can include this as a file named **"phenoDataC_clusters_after_regrouping.txt"** (see Toy_example/phenoDataC_clusters_after_regrouping.txt for an example). 166 | 167 | **NOTE**: Each single-cell RNA-seq dataset ("scC.rds") and mixture ("T.rds") should be **RAW integer MATRICES** containing genomic features as rows. 168 | 169 | 170 | ``` 171 | 172 | Rscript MASTER.R ~/Downloads/deconv_matching_bulk_scnRNA/Toy_example TPM TPM nnls no 173 | 174 | ``` 175 | 176 | In the "example" (toy) dataset cell types 1 and 5 were found to be very highly correlated and were first collapsed into a unique cluster (relabelled as "regrouped.cluster.1"): 177 | 178 | ![plot](./schematic_figures/Scheme1_Github.jpg) 179 | 180 | Next, the deconvolution was run and the output was generated: 181 | 182 | ![plot](./schematic_figures/Scheme2_Github.jpg) 183 | 184 | 185 | Survival analysis 186 | ================= 187 | Please run script(s) inside folder "SURVIVAL" 188 | -------------------------------------------------------------------------------- /SQUID/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: SQUID 2 | Type: Package 3 | Title: Single-cell RNA Quantity Informed Deconvolution (SQUID) 4 | Version: 0.1.0 5 | Year: 2023 6 | Author: Francisco Avila Cobos, Mohammad Javad Najaf Panah, ..., Pieter Mestdagh, Pavel Sumazin 7 | Maintainer: Francisco Avila Cobos and Mohammad Javad Najaf Panah 8 | Description: SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution approaches in predicting the composition of 9 | cell mixtures and tissue samples based on concurrent RNA-Seq and scnRNA-Seq profiles. To systematically test the benefit of bulk transformation and deconvolution 10 | with SQUID, a leave-one-out cross-validation strategy can also be used; iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were 11 | used to predict the composition of the remaining sample based on its bulk RNA-Seq profile. 12 | Data: SQUID_Toy_Example_Data.RData 13 | License: MIT 14 | Encoding: UTF-8 15 | LazyData: true 16 | -------------------------------------------------------------------------------- /SQUID/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mohammad Najafpanah 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SQUID/NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /SQUID/R/SQUID.R: -------------------------------------------------------------------------------- 1 | #' Single-cell RNA Quantity Informed Deconvolution (SQUID)! 2 | #' 3 | #' This is an executable R function named 'SQUID' 4 | #' which analyse cell-type composition predictions based on concurrent RNA-Seq and scRNA-Seq profiles. 5 | #' 6 | #' You can learn more about SQUID compare to some other approaches at: 7 | #' 8 | #' https://github.com/favilaco/deconv_matching_bulk_scnRNA 9 | #' 10 | #' Citation: 11 | #' Avila Cobos, F., Najaf Panah, M. J., Epps, J., Long, X., Man, T. K., Chiu, H. S., ..., Mestdagh, P & Sumazin, P. (2022). Effective 12 | #' methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes. bioRxiv, 2022-12. 13 | #' 14 | #' GitHub: https://github.com/favilaco/deconv_matching_bulk_scnRNA 15 | #' 16 | #' Licence: MIT License 17 | #' 18 | #' Copyright (c) [2023] [Francisco Avila Cobos, Mohammad Javad Najaf Panah, Pieter Mestdagh, Pavel Sumazin] 19 | #' 20 | #' 21 | #' Permission is hereby granted, free of charge, to any person obtaining a copy 22 | #' of this software and associated documentation files (the "Software"), to deal 23 | #' in the Software without restriction, including without limitation the rights 24 | #' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 25 | #' copies of the Software, and to permit persons to whom the Software is 26 | #' furnished to do so, subject to the following conditions: 27 | #' 28 | #' The above copyright notice and this permission notice shall be included in 29 | #' all copies or substantial portions of the Software. 30 | #' 31 | #' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 | #' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | #' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 34 | #' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 35 | #' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 36 | #' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 37 | #' THE SOFTWARE. 38 | #' 39 | #' 40 | #' For commercial use inquiries, please contact the corresponding authors [Pavel Sumazin at Pavel.Sumazin@bcm.edu; Pieter Mestdagh at pieter.mestdagh@ugent.be]. 41 | #' 42 | #' 43 | 44 | # Load the example dataset: 45 | load("SQUID_Toy_Example_Data.RData", verbose = T) 46 | 47 | #' Single-cell RNA Quantity Informed Deconvolution (SQUID). 48 | #' 49 | #' SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution approaches in predicting the composition of 50 | #' cell mixtures and tissue samples. 51 | #' 52 | #' SQUID is an improved deconvolution method based on concurrent RNA-Seq and scRNA-Seq profiles. 53 | #' 54 | #' NOTE: To systematically test the benefit of bulk transformation and deconvolution with SQUID, a leave-one-out cross-validation strategy can also be used: 55 | #' iteratively, concurrent RNA-Seq and scnRNA-Seq profiles of all but one of the samples were used to predict the composition of the remaining sample based 56 | #' on its bulk RNA-Seq profile. 57 | #' 58 | #' @param B A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be sample.ids. 59 | #' @param scC A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.ids. 60 | #' @param scMeta A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc. 61 | #' @param pB (Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType. 62 | #' @param P (Optional) A numeric matrix of expected cell fractions including the composition of cluster.id/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be sample.ids. 63 | #' @param LeaveOneOut A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE. 64 | #' 65 | #' @return A table including cellType, sample.id, observed_fraction (predicted cluster/cell-type fraction), expected_fraction (P) for each bulk sample. 66 | #' 67 | #' @export RESULTS Save the RESULTS.rds and RESULTS.csv files in your working directory. 68 | #' 69 | #' @usage RESULTS <- SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE) 70 | #' 71 | #' 72 | SQUID <- function(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE) { 73 | cat(' 74 | 75 | 76 | 77 | ----------------------------------- Welcome To SQUID --------------------------------------- 78 | 79 | ######## ######## # # ######## ####### 80 | # # # # # ## # # 81 | # # # # # ## # # 82 | ######## # ## # # # ## # # 83 | # # # # # # ## # # 84 | # # #### # # ## # # 85 | ######## ######## ######## ######## ####### 86 | 87 | 88 | Execute an Effective method for bulk RNA-seq deconvolution using scnRNA-seq transcriptomes 89 | 90 | -------------------------------------------------------------------------------------------- 91 | 92 | 93 | 94 | ') 95 | cat('SQUID just began the analysis ... \n\n') 96 | 97 | # Make pseudo-Bulk (pB) based on clusters from single-cell analysis, if not provided 98 | if(is.null(pB)){ 99 | cellType <- scMeta$cellType # could be 'scMeta$cluster.id' as well. 100 | group = list() 101 | for(i in unique(cellType)){ 102 | group[[i]] <- which(cellType %in% i) 103 | } 104 | pB = lapply(group,function(x) base::rowMeans(scC[,x, drop = FALSE])) 105 | pB = do.call(cbind.data.frame, pB) 106 | } else { 107 | pB=pB 108 | } 109 | 110 | # Make proportional table (P), if not provided 111 | if(is.null(P)){ 112 | P <- table(scMeta$sample.id, scMeta$cellType) %>% t() 113 | P <- P[,gtools::mixedsort(colnames(P))] 114 | P <- P[,colnames(P) %in% colnames(B)] 115 | P <- apply(P, 2, function(x) x / sum(x)) 116 | } else { 117 | P=P 118 | } 119 | 120 | # Unify row names between B and pB 121 | perc.overlap <- (length(intersect(rownames(pB),rownames(B))) * 60)/min(nrow(B),nrow(pB)) 122 | if(perc.overlap < 33.3){ 123 | if(length(grep("ENSG000|ENSMUSG000",rownames(B))) > 100){ 124 | rownames.B <- ENSG_to_HGNC$HGNC[match(rownames(B), ENSG_to_HGNC$ENSG_ID)] 125 | B = B[!is.na(rownames.B),] 126 | rownames.B = rownames.B[!is.na(rownames.B)] 127 | } else { 128 | rownames.B <- ENSG_to_HGNC$ENSG_ID[match(rownames(B), ENSG_to_HGNC$HGNC)] 129 | B = B[!is.na(rownames.B),] 130 | rownames.B = rownames.B[!is.na(rownames.B)] 131 | } 132 | B <- collapsing(B, rownames.B) 133 | } 134 | 135 | # To avoid issues with sample.ids that are fully integers 136 | if(length(grep("[A-Z|a-z]", colnames(B))) == 0){colnames(B) <- paste("X",colnames(B),sep="")} 137 | if(length(grep("[A-Z|a-z]", colnames(P))) == 0){colnames(P) <- paste("X",colnames(P),sep="")} 138 | 139 | # Remove NA values (if any) after row.name interconversion 140 | if(sum(is.na(rownames(B))) != 0){B = B[!is.na(rownames(B)), ]} 141 | 142 | # Transform the bulk data 143 | if(LeaveOneOut) { 144 | cat("SQUID is transforming the Bulk samples using Leave-One-Out cross validation method ... ") 145 | } else { 146 | cat("SQUID is transforming the Bulk samples using all informations ... ") 147 | } 148 | 149 | Z = as.matrix(pB) 150 | Y = Z %*% P 151 | B.new = transformation2(X = B, Y = Y, leave.one.out = LeaveOneOut) 152 | B.new[!is.finite(B.new)] <- 0 153 | 154 | cat(' DONE!\n\n') 155 | 156 | # take common genes 157 | Genes <- intersect(rownames(Z),rownames(B.new)) 158 | B.new <- as.matrix(B.new[Genes,]) 159 | Z <- as.matrix(Z[Genes,]) 160 | 161 | # solve Dampened WLS 162 | cat('SQUID is solving the dampened weighted least squares model for Bulk samples ... \n\n') 163 | RESULTS <- apply(B.new, 2, function(x){ 164 | b = setNames(x, rownames(B.new)) 165 | tr <- trimData(Z, b) 166 | RES <- t(solveDampenedWLS(S = Z, B = x)) 167 | }) 168 | 169 | rownames(RESULTS) <- colnames(Z) 170 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) # explicit non-negativity constraint 171 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) # explicit STO constraint 172 | RESULTS = RESULTS[gtools::mixedsort(rownames(RESULTS)), , drop = FALSE] 173 | RESULTS = suppressMessages(reshape2::melt(RESULTS)) 174 | colnames(RESULTS) <- c("cellType","sample.id","observed_fraction") 175 | RESULTS$cellType = as.character(RESULTS$cellType) 176 | RESULTS$sample.id = as.character(RESULTS$sample.id) 177 | if(!is.null(P)){ 178 | P = P[gtools::mixedsort(rownames(P)),,drop = FALSE] %>% data.frame(., check.names = FALSE) 179 | P$cellType = rownames(P) 180 | P = suppressMessages(reshape2::melt(P)) 181 | colnames(P) <- c("cellType","sample.id","expected_fraction") 182 | P$cellType = as.character(P$cellType) 183 | P$sample.id = as.character(P$sample.id) 184 | RESULTS = merge(RESULTS, P, by = c("cellType", "sample.id"), all = TRUE) 185 | RESULTS[is.na(RESULTS)] <- 0 186 | RESULTS$expected_fraction <- round(RESULTS$expected_fraction, 3) 187 | RESULTS$observed_fraction <- round(RESULTS$observed_fraction, 3) 188 | } 189 | saveRDS(RESULTS, "RESULTS.rds") 190 | write.csv(RESULTS, "RESULTS.csv") 191 | cat('\nSQUID has saved the results table in .rds and .csv formats.\n\n') 192 | cat('Congrats! SQUID has finished the analyses!\n') 193 | return(RESULTS) 194 | } # The End 195 | -------------------------------------------------------------------------------- /SQUID/README.md: -------------------------------------------------------------------------------- 1 | # SQUID: Single-cell RNA Quantity Informed Deconvolution 2 | 3 | # Overview 4 | SQUID is an R package for conducting (tumor) deconvolution analyses. SQUID executes a combination of RNA-Seq transformation and dampened weighted least-squares deconvolution to predict the composition of cell mixtures and tissue samples based on the concurrent RNA-Seq and scnRNA-Seq profiles. SQUID harnesses the power of concurrent RNA-Seq and scnRNA-Seq profiling, outperforming other methods in predicting the composition of cell mixtures and tissue samples. 5 | 6 | # Installation 7 | First load the requirements: 8 | ```r 9 | library(devtools) 10 | devtools::source_url("https://github.com/favilaco/deconv_matching_bulk_scnRNA/blob/master/helper_functions.R?raw=TRUE") 11 | library(dplyr) 12 | ``` 13 | Now, you can install the latest version of SQUID from GitHub using the devtools package: 14 | ```r 15 | install_github("favilaco/deconv_matching_bulk_scnRNA/SQUID") 16 | ``` 17 | # Usage 18 | To use SQUID, load the package into your R session using the library function: 19 | ```r 20 | library(SQUID) 21 | ``` 22 | You can then use the provided SQUID function to conduct the experimantal analyses. You need to provide the required inputs as described in the package documentation in details. SQUID package contains a toy example dataset including bulk and single-cell RNA-seq simulated count data which you can use to run the program and check how it works: 23 | ```r 24 | RESULTS <- SQUID(B = B, scC = scC , scMeta = scMeta, pB = NULL, P = NULL, LeaveOneOut = FALSE) 25 | 26 | #' @param B A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be sample.ids. 27 | #' @param scC A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.ids. 28 | #' @param scMeta A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc. 29 | #' @param pB (Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType. 30 | #' @param P (Optional) A numeric matrix of expected cell fractions including the composition of cluster.id/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be sample.ids. 31 | #' @param LeaveOneOut A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE. 32 | #' 33 | ``` 34 | Notice: To improve the performance of SQUID prediction, you may need to set up your research-specific pipeline for the quality contol, normalization, imputation, and single-cell clustering procedures on your datasets prior to run SQUID. You are welcome to look at our preprint [paper](https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2) as an example. 35 | 36 | # License 37 | This package is licensed under the MIT license. See the LICENSE file for details. 38 | 39 | # Contact 40 | For questions or comments about SQUID, please contact the package maintainers at Francisco Avila Cobos and Mohammad Javad Najaf Panah . If you find a bug or have a feature request, please submit an issue on the GitHub repository. 41 | 42 | # Citation 43 | Francisco Avila Cobos, Mohammad Javad Najaf Panah, Jessica Epps, Xiaochen Long, Tsz-Kwong Man, Hua-Sheng Chiu, Elad Chomsky, Evgeny Kiner, Michael J Krueger, Diego di Bernardo, Luis Voloch, Jan Molenaar, Sander R. van Hooff, Frank Westermann, Selina Jansky, Michele L. Redell, Pieter Mestdagh, Pavel Sumazin Effective methods for bulk RNA-Seq deconvolution using scnRNA-Seq transcriptomes (bioRxiv; https://www.biorxiv.org/content/10.1101/2022.12.13.520241v2) 44 | -------------------------------------------------------------------------------- /SQUID/SQUID.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /SQUID/SQUID_Toy_Example_Data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/SQUID/SQUID_Toy_Example_Data.RData -------------------------------------------------------------------------------- /SQUID/man/SQUID.Rd: -------------------------------------------------------------------------------- 1 | \name{SQUID} 2 | 3 | \alias{SQUID} 4 | 5 | \title{Single-cell RNA Quantity Informed Deconvolution (SQUID)} 6 | 7 | \description{SQUID predicts the composition of cell mixtures and tissue samples based on concurrent RNA-Seq and scnRNA-Seq profiles.} 8 | 9 | \usage{SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE)} 10 | 11 | \arguments{ 12 | \item{B}{A bulk RNA-seq numeric matrix which could be either count or normalized values. Rows should be genes and columns should be samples.} 13 | \item{scC}{A single-cell numeric matrix which could be either count or normalized values. Rows should be genes and columns should be cell.id.} 14 | \item{scMeta}{A single-cell annotation data.frame which rows should be cell.ids and columns including cell.id, sample.id, cluster.id, cellType, etc.} 15 | \item{pB}{(Optional) A pseudo-bulk matrix generated based on cluster.id/cellType from single-cell analysis. Rows should be genes and columns should be cluster.id/cellType.} 16 | \item{P}{ (Optional) A numeric matrix of expected cell fractions including the composition of clusters/cellTypes per samples can be calculated from scRNA-seq analysis. Rows should be unique cluster.ids/cellTypes and columns should be samples.} 17 | \item{LeaveOneOut}{A logical variable which allow users run SQUID with/without a leave-one-out cross validation strategy. Default is FALSE. Note that the LeaveOneOut argument included here to give users an extra option to systematically test the benefit of bulk transformation and deconvolution accuracy by SQUID.} 18 | } 19 | 20 | \value{A matrix including cellType, sample.id, observed_fraction (predicted cluster/cell-type fraction), expected_fraction (P) for each bulk sample. By default, SQUID also saves the RESULTS.rds and RESULTS.csv files on the working directory.} 21 | 22 | 23 | \examples{ 24 | RESULTS <- SQUID(B=B, scC=scC , scMeta=scMeta, pB=NULL, P=NULL, LeaveOneOut=FALSE) 25 | } 26 | -------------------------------------------------------------------------------- /SURVIVAL/Survival_Analyses.R: -------------------------------------------------------------------------------- 1 | # Survival Analyses (AML dataset) 2 | # Bulk: pAML from TARGET (n=256) 3 | # SC: 6 pAML from BCM/TCH inclusing Diagnosis and relapes timepoints (n=12) 4 | 5 | # Load libraries: 6 | library(survival) 7 | library(survminer) 8 | 9 | ## Read TARGET patients clinical data: 10 | clinical.dt <- read.delim("./AML-TARGET-RNAseq-Diagn-No_21-Anno-091920.txt") 11 | 12 | ## Read the predicted proportional results (LogNormalize-LogNormalize normalization) generated by deconvolution methods: 13 | pSQUID <- read.csv("./TARGET_AML_LogNormalize_LogNormalize_pSQUID.csv") 14 | 15 | # Extract patients IDs 16 | TARGET.USI <- strsplit(pSQUID$X, "[.]") 17 | TARGET.USI <- matrix(unlist(TARGET.USI), ncol=5, byrow=TRUE) 18 | TARGET.USI <- as.data.frame(TARGET.USI) 19 | pSQUID <- cbind.data.frame(TARGET.USI, pSQUID) 20 | pSQUID <- pSQUID[,-c(1,2,4,5,6)] 21 | colnames(pSQUID)[1] <- "TARGET.USI" 22 | 23 | # Merge the cluster.prop variables to clinical data (Relapsed_died + non_Relapsed_Alive patients) n=181 24 | merged.tAML <- merge(clinical.dt, pJanus, by="TARGET.USI") 25 | merged.tAML.R.DoD <- merged.tAML[merged.tAML$First.Event=="Relapse" & merged.tAML$Vital.Status=="Dead",] # 93 26 | merged.tAML.C.Ali <- merged.tAML[merged.tAML$First.Event=="Censored" & merged.tAML$Vital.Status=="Alive",] # 88 27 | t.AML.R.C.DOD.ALI <- rbind(merged.tAML.R.DoD,merged.tAML.C.Ali) 28 | 29 | # Categorize the patients based on their median cut-off on cluster proportion (i. e. cluster 11): 30 | data=t.AML.R.C.DOD.ALI 31 | cluster="cluster.11" 32 | for(i in 1:(nrow(data))) { 33 | if(data[i,cluster] >= median(data[,cluster])) { 34 | data$cluster.11.pro[i] <- "high" 35 | } else { 36 | data$cluster.11.pro[i] <- "low" 37 | } 38 | } 39 | 40 | # Getting the status value 41 | status <- c() 42 | for(j in 1:nrow(data)) { 43 | if(data$Vital.Status[j]=="Dead") { 44 | status[j] <- 1 45 | } else { 46 | status[j] <- 0 47 | } 48 | } 49 | 50 | # Model overall survival 51 | sfit <- survfit(Surv(Overall.Survival.Time.in.Days, status)~cluster.11.pro, data=data) 52 | ggsurvplot(sfit, data = data, pval=T, risk.table = F, palette = c("red","blue")) 53 | 54 | # Using cox ph regression to fit the model 55 | cfit <- coxph(Surv(Overall.Survival.Time.in.Days, status)~cluster.11, data= data) 56 | cfit 57 | 58 | # Model event-free survival 59 | sfit0 <- survfit(Surv(Event.Free.Survival.Time.in.Days, status)~cluster.11.pro, data = data) 60 | ggsurvplot(sfit0, data = data, pval=T, risk.table = F, palette = c("red","blue")) 61 | # Using cox ph regression to fit the model 62 | cfit0 <- coxph(Surv(Event.Free.Survival.Time.in.Days, status)~cluster.11, data=data) 63 | cfit0 64 | -------------------------------------------------------------------------------- /Toy_example/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/.DS_Store -------------------------------------------------------------------------------- /Toy_example/P.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/P.rds -------------------------------------------------------------------------------- /Toy_example/T.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/T.rds -------------------------------------------------------------------------------- /Toy_example/markers_TMM_FC1.5_Seuratwilcox.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/markers_TMM_FC1.5_Seuratwilcox.rds -------------------------------------------------------------------------------- /Toy_example/scC.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/Toy_example/scC.rds -------------------------------------------------------------------------------- /additional_data/Denisenko_crosstable_bulk_sc_sn_14092021.csv: -------------------------------------------------------------------------------- 1 | BG17_BulkSeq_ColdDiss_Fresh;BG1,BG30,BG52;Mouse2_Right;"kidney Batch1_C57BL6J_Male_19weeks" 2 | BG18_BulkSeq_WarmDiss_Fresh;BG2,BG31,BG53;Mouse1_Right;"kidney Batch1_C57BL6J_Male_19weeks" 3 | BG19_BulkSeq_ColdDiss_Fresh;BG3,BG32,BG54;Mouse4_Left;"kidney Batch1_C57BL6J_Male_19weeks" 4 | BG20_BulkSeq_WarmDiss_Fresh;BG4,BG33,BG55;Mouse3_Left;"kidney Batch1_C57BL6J_Male_19weeks" 5 | BG21_BulkSeq_ColdDiss_Fresh;BG5,BG34,BG56;Mouse6_Left;"kidney Batch1_C57BL6J_Male_19weeks" 6 | BG22_BulkSeq_WarmDiss_Fresh;BG6,BG35,BG57;Mouse5_Right;"kidney Batch1_C57BL6J_Male_19weeks" 7 | LDK4_BulkSeq_ColdDiss_Fresh;LD01,MJ51;Mouse1_Right;"kidney Batch3_Balbc_Female_15weeks" 8 | LDK5_BulkSeq_ColdDiss_Fresh;LD02,MJ52;Mouse2_Right;"kidney Batch3_Balbc_Female_15weeks" 9 | LDK6_BulkSeq_ColdDiss_Fresh;LD03,MJ53;Mouse3_Left;"kidney Batch3_Balbc_Female_15weeks" -------------------------------------------------------------------------------- /helper_functions.R: -------------------------------------------------------------------------------- 1 | regroup.cor <- function(correlation.matrix, correlation.threshold = 0.95){ 2 | 3 | top.cor <- which(abs(correlation.matrix) >= correlation.threshold & row(correlation.matrix) < col(correlation.matrix), arr.ind = TRUE) 4 | 5 | if(nrow(top.cor) != 0){ 6 | 7 | ## reconstruct names from positions 8 | high_cor <- matrix(colnames(correlation.matrix)[top.cor], ncol = 2) 9 | 10 | count = 0; groups = list() 11 | 12 | for(elem in unique(high_cor[,1])){ 13 | count = count + 1 14 | items = high_cor[high_cor[,1] %in% elem,] 15 | if(!is.null(dim(items))){items <- items %>% melt() %>% .[3] %>% .$value %>% unique()} 16 | groups[[paste("group",count,sep="_")]] <- items 17 | } 18 | 19 | for(elem in unique(high_cor[,2])){ 20 | count = count + 1 21 | items = high_cor[high_cor[,2] %in% elem,] 22 | if(!is.null(dim(items))){items <- items %>% melt() %>% .[3] %>% .$value %>% unique()} 23 | groups[[paste("group",count,sep="_")]] <- items 24 | } 25 | 26 | # If one element present in multiple groups, keep largest 27 | groups <- groups[sapply(groups, length) %>% order() %>% rev()] #re-order NEEDED to ensure biggest groups and no repetition 28 | new.groups <- list() 29 | used <- c() 30 | count = 0 31 | for(i in names(groups)){ 32 | count = count + 1 33 | to.regroup <- sapply(groups, function(x) groups[[i]] %in% x) %>% colSums() != 0 34 | to.regroup <- names(to.regroup)[to.regroup] 35 | if(sum(to.regroup %in% used) == 0){ 36 | new.groups[[paste("group",count,sep="_")]] <- groups[to.regroup] %>% unlist() %>% unique() 37 | used <- c(used, to.regroup) 38 | } 39 | 40 | } 41 | 42 | annotation = melt(new.groups) 43 | rownames(annotation) <- annotation$value 44 | annotation$value <- NULL 45 | colnames(annotation) <- "new.group" 46 | 47 | } else { 48 | 49 | annotation = data.frame(new.group = "") 50 | 51 | } 52 | 53 | return(annotation) 54 | 55 | } 56 | 57 | 58 | 59 | collapsing <- function(input, duplicated.names){ 60 | 61 | if(sum(duplicated(duplicated.names)) != 0){ 62 | 63 | exp.var <- apply(input, 1, mean) %>% melt() 64 | exp.var$gene <- duplicated.names 65 | exp.var$IQR <- apply(input, 1, IQR) %>% as.numeric() 66 | exp.var$row.number <- 1:nrow(exp.var) 67 | max.values = exp.var %>% 68 | group_by(gene) %>% 69 | summarise(across(c("value", "IQR"), ~ max(.x))) 70 | exp.var <- merge(max.values, exp.var, by = "gene") 71 | to.keep <- which(exp.var[,2] == exp.var[,4] & exp.var[,3] == exp.var[,5]) %>% sort() 72 | 73 | #if after this criteria, still duplicates, choose the first one (smallest row.number)! 74 | exp.var <- exp.var[to.keep,] 75 | min.row = exp.var %>% 76 | group_by(gene) %>% 77 | summarise(across(c("row.number"), ~ min(.x))) 78 | exp.var <- merge(min.row, exp.var, by = "gene") 79 | to.keep <- which(exp.var[,2] == exp.var[,7]) %>% sort() 80 | exp.var <- exp.var[to.keep,] %>% na.omit() 81 | input <- input[exp.var[,2], ] 82 | 83 | if(length(grep("ENS.*\\.[0-9]+", rownames(input))) > 1000){ # For datasets with ENSG names with format "ENSG00...003.14": keep genes with highest mean expression AND highest variability (first criterium insufficient, there are cases with same mean) 84 | 85 | rownames(input) <- strsplit(rownames(input),"\\.") %>% lapply(., function(x) x[1]) %>% unlist() 86 | 87 | } else if(length(grep("ENS.*\\__", rownames(input))) > 1000){ # For datasets with gene names as "ENSG00...__HGNC": remove the HGNC part 88 | 89 | rownames(input) <- strsplit(rownames(input),"__") %>% lapply(., function(x) x[1]) %>% unlist() 90 | 91 | } else { 92 | 93 | rownames(input) <- exp.var$gene 94 | 95 | } 96 | 97 | } 98 | 99 | return(input) 100 | 101 | } 102 | 103 | 104 | 105 | transformation2 <- function(X, Y, leave.one.out = TRUE) { 106 | 107 | # use the same genes for all input datasets 108 | Genes <- intersect(row.names(Y), row.names(X)) 109 | 110 | X <- as.matrix(X[Genes,]) 111 | Y <- as.matrix(Y[Genes,]) 112 | 113 | if(leave.one.out){ 114 | 115 | pred <- intersect(colnames(X), colnames(Y)) # matching samples with sc and bulk data 116 | X.new <- matrix(0, nrow=dim(X)[1], ncol=length(pred)) 117 | 118 | for(j in 1:length(pred)){ 119 | 120 | X.train <- as.matrix(X[,pred[-j]]) 121 | X.test <- as.matrix(X[,pred[j]]) 122 | Y.train <- as.matrix(Y[,pred]) 123 | 124 | # track the mean and sd after leaving one out: 125 | X.mean <- rowMeans(X.train) 126 | X.sd <- apply(X.train,1,sd) 127 | Y.mean <- rowMeans(Y) 128 | Y.sd <- apply(Y,1,sd) 129 | 130 | for (i in 1:dim(X)[1]){ 131 | 132 | # transforming over genes 133 | x <- X.test[i] 134 | sigma_j <- Y.sd[i]*sqrt((length(Y[i,])-1)/(length(Y[i,])+1)) 135 | x.new <- (x-X.mean[i])/X.sd[i] 136 | x.new <- x.new*sigma_j + Y.mean[i] 137 | 138 | X.new[i,j] <- x.new 139 | 140 | } 141 | 142 | } 143 | 144 | } else { 145 | 146 | # transforming over genes 147 | X.new <- matrix(0, nrow=dim(X)[1], ncol=dim(X)[2]) 148 | 149 | for (i in 1:dim(X)[1]){ 150 | 151 | y <- Y[i,] 152 | x <- X[i,] 153 | l <- length(y) 154 | sigma_j <- sd(y)*sqrt((l-1)/(l+1)) 155 | x.new <- (x-mean(x))/sd(x) 156 | x.new <- x.new*sigma_j + mean(y) 157 | X.new[i,] <- x.new 158 | 159 | } 160 | 161 | } 162 | 163 | # explicit non-negativity constraint: 164 | X.new = apply(X.new,2,function(x) ifelse(x < 0, 0, x)) 165 | 166 | rownames(X.new) = rownames(X); colnames(X.new) = colnames(X) 167 | 168 | return(X.new) 169 | 170 | } 171 | 172 | 173 | 174 | bulkC.fromSC <- function(scC, phenoDataC){ 175 | 176 | phenoDataC = phenoDataC[colnames(scC),] 177 | cellType <- phenoDataC$cellType 178 | group = list() 179 | 180 | for(i in unique(cellType)){ 181 | 182 | group[[i]] <- which(cellType %in% i) 183 | 184 | } 185 | 186 | C = lapply(group,function(x) Matrix::rowMeans(scC[,x])) 187 | C = do.call(cbind.data.frame, C) 188 | 189 | return(as.matrix(C)) 190 | 191 | } 192 | 193 | 194 | 195 | Scaling <- function(matrix, option, phenoDataC=NULL){ 196 | 197 | #Avoid Error: Input matrix x contains at least one null or NA-filled row. 198 | matrix = matrix[rowSums(matrix) != 0,] 199 | #Avoid error if all elements within a row are equal (e.g. all 0, or all a common value after log/sqrt/vst transformation) 200 | matrix = matrix[!apply(matrix, 1, function(x) var(x) == 0),] 201 | matrix = matrix[,colSums(matrix) != 0] 202 | 203 | if (option == "LogNormalize"){ 204 | 205 | matrix = expm1(Seurat::LogNormalize(matrix, verbose = FALSE)) %>% as.matrix() 206 | 207 | } else if (option == "TMM"){# CPM counts coming from TMM-normalized library sizes; https://support.bioconductor.org/p/114798/ 208 | 209 | if(!is.null(phenoDataC)){ 210 | 211 | Celltype = as.character(phenoDataC$cellType[phenoDataC$cellID %in% colnames(matrix)]) 212 | 213 | } else { 214 | 215 | Celltype = colnames(matrix) 216 | 217 | } 218 | 219 | matrix <- edgeR::DGEList(counts=matrix, group=Celltype) 220 | CtrlGenes <- grep("ERCC-",rownames(data)) 221 | 222 | if(length(CtrlGenes) > 1){ 223 | 224 | spikes <- data[CtrlGenes,] 225 | spikes <- edgeR::calcNormFactors(spikes, method = "TMM") 226 | matrix$samples$norm.factors <- spikes$samples$norm.factors 227 | 228 | } else { 229 | 230 | matrix <- edgeR::calcNormFactors(matrix, method = "TMM") 231 | 232 | } 233 | 234 | matrix <- edgeR::cpm(matrix) 235 | 236 | } else if (option == "TPM"){ 237 | 238 | # MULTI-CORE VERSION: adapted from #devtools::source_url('https://raw.githubusercontent.com/dviraran/SingleR/master/R/HelperFunctions.R', sha1 = "df5560b4ebb28295349aadcbe5b0dc77d847fd9e") 239 | 240 | TPM <- function(counts,lengths=NULL){ 241 | 242 | require(foreach) 243 | require(Matrix) 244 | 245 | if (is.null(lengths)) { 246 | data('gene_lengths') 247 | } 248 | 249 | A = intersect(rownames(counts),names(lengths)) 250 | counts = as.matrix(counts[A,]) 251 | lengths = lengths[A] 252 | rate = counts / lengths 253 | 254 | num_cores <- min(4, parallel::detectCores()) #slurm will automatically restrict this to parameter --cpus-per-task 255 | print(paste("num_cores = ", num_cores, sep = "")) 256 | 257 | doMC::registerDoMC(cores = num_cores) 258 | jump = 100 259 | tpm <- foreach::foreach(elem = seq(1,ncol(counts), jump), .combine='cbind.data.frame') %dopar% { 260 | 261 | lsizes <- colSums(rate[, elem:min(ncol(counts),(elem + jump - 1))]) 262 | lo <- 1e6*rate[, elem:min(ncol(counts),(elem + jump - 1))] 263 | tpm <- lo %*% diag(1/lsizes) 264 | tpm 265 | 266 | } 267 | 268 | colnames(tpm) <- colnames(counts) 269 | return(tpm) 270 | 271 | } 272 | 273 | if(! file.exists("human_lengths.rda")){ 274 | download.file("https://github.com/dviraran/SingleR/blob/master/data/human_lengths.rda?raw=true","./human_lengths.rda") 275 | } 276 | load("./human_lengths.rda") 277 | 278 | # Doesn't work with Ensembl IDs: 279 | if(length(grep("ENSG000",rownames(matrix))) > 50){ 280 | 281 | suppressMessages(library("AnnotationDbi")) 282 | suppressMessages(library("org.Hs.eg.db")) 283 | temp = mapIds(org.Hs.eg.db,keys = names(human_lengths), column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first") 284 | names(human_lengths) = as.character(temp) 285 | 286 | } 287 | 288 | matrix = TPM(counts = matrix, lengths = human_lengths) 289 | rownames(matrix) = toupper(rownames(matrix)) 290 | 291 | } else if (option == "TPM.murine"){ 292 | 293 | # MULTI-CORE VERSION: adapted from #devtools::source_url('https://raw.githubusercontent.com/dviraran/SingleR/master/R/HelperFunctions.R', sha1 = "df5560b4ebb28295349aadcbe5b0dc77d847fd9e") 294 | 295 | TPM <- function(counts,lengths=NULL){ 296 | 297 | require(foreach) 298 | require(Matrix) 299 | 300 | if (is.null(lengths)) { 301 | data('gene_lengths') 302 | } 303 | 304 | 305 | A = intersect(rownames(counts),names(lengths)) 306 | counts = as.matrix(counts[A,]) 307 | lengths = lengths[A] 308 | rate = counts / lengths 309 | 310 | num_cores <- min(4, parallel::detectCores()) #slurm will automatically restrict this to parameter --cpus-per-task 311 | print(paste("num_cores = ", num_cores, sep = "")) 312 | 313 | doMC::registerDoMC(cores = num_cores) 314 | jump = 100 315 | tpm <- foreach::foreach(elem = seq(1,ncol(counts), jump), .combine='cbind.data.frame') %dopar% { 316 | 317 | lsizes <- colSums(rate[, elem:min(ncol(counts),(elem + jump - 1))]) 318 | lo <- 1e6*rate[, elem:min(ncol(counts),(elem + jump - 1))] 319 | tpm <- lo %*% diag(1/lsizes) 320 | tpm 321 | 322 | } 323 | 324 | colnames(tpm) <- colnames(counts) 325 | return(tpm) 326 | 327 | } 328 | 329 | if(! file.exists("mouse_lengths.rda")){ 330 | download.file("https://github.com/dviraran/SingleR/blob/master/data/mouse_lengths.rda?raw=true","./mouse_lengths.rda") 331 | } 332 | 333 | load("./mouse_lengths.rda") 334 | 335 | ## Append version with ENSMUSG000 names 336 | mouse_lengths2 <- mouse_lengths 337 | 338 | # Doesn't work with Ensembl IDs: 339 | suppressMessages(library("AnnotationDbi")) 340 | suppressMessages(library("org.Mm.eg.db")) 341 | temp = mapIds(org.Mm.eg.db,keys = names(mouse_lengths), column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first") 342 | names(mouse_lengths) = as.character(temp) 343 | 344 | mouse_lengths = c(mouse_lengths,mouse_lengths2) 345 | matrix = TPM(counts = matrix, lengths = mouse_lengths) 346 | 347 | #################################################################################### 348 | ## scRNA-seq specific 349 | 350 | } else if (option == "SCTransform"){ 351 | 352 | matrix = as(matrix, "dgCMatrix") 353 | matrix = sctransform::vst(matrix, return_corrected_umi = TRUE, show_progress = FALSE)$umi_corrected 354 | 355 | } else if (option == "scran"){ 356 | 357 | sce = SingleCellExperiment::SingleCellExperiment(assays = list(counts=as.matrix(matrix))) 358 | sce = scran::computeSumFactors(sce, clusters=NULL) 359 | sce = scater::logNormCounts(sce, log = FALSE) 360 | matrix = normcounts(sce) 361 | 362 | } else if (option == "scater"){ 363 | 364 | sce = SingleCellExperiment::SingleCellExperiment(assays = list(counts=as.matrix(matrix))) 365 | size_factors = scater::librarySizeFactors(sce) 366 | sce = scater::logNormCounts(sce, log = FALSE) 367 | matrix = normcounts(sce) 368 | 369 | } 370 | 371 | return(matrix) 372 | 373 | } 374 | 375 | 376 | 377 | Deconvolution <- function(T, C, method, phenoDataC, P = NULL, elem = NULL, STRING = NULL, marker_distrib, refProfiles.var){ 378 | 379 | bulk_methods = c("CIBERSORT","nnls","FARDEEP","RLR") 380 | sc_methods = c("MuSiC","MuSiC_with_markers","DWLS","SQUID", "Bisque", "Bisque_with_markers") 381 | 382 | ########## Using marker information for bulk_methods 383 | 384 | if(method %in% bulk_methods){ 385 | 386 | C = C[rownames(C) %in% unique(marker_distrib$gene), , drop = FALSE] 387 | T = T[rownames(T) %in% unique(marker_distrib$gene), , drop = FALSE] 388 | refProfiles.var = refProfiles.var[rownames(refProfiles.var) %in% unique(marker_distrib$gene), , drop = FALSE] 389 | 390 | } else { ### For scRNA-seq methods 391 | 392 | if(length(grep("[N-n]ame",colnames(phenoDataC))) > 0){ 393 | sample_column = grep("[N-n]ame",colnames(phenoDataC)) 394 | } else { 395 | sample_column = grep("[S-s]ample|[S-s]ubject",colnames(phenoDataC)) 396 | } 397 | 398 | colnames(phenoDataC)[sample_column] = "SubjectName" 399 | rownames(phenoDataC) = phenoDataC$cellID 400 | # establish same order in (sc)C and phenoDataC: 401 | phenoDataC <- phenoDataC[match(colnames(C),phenoDataC$cellID),] 402 | 403 | if(method %in% c("MuSiC","MuSiC_with_markers", "Bisque", "Bisque_with_markers")){ 404 | 405 | require(xbioc) 406 | C.eset <- Biobase::ExpressionSet(assayData = as.matrix(C), phenoData = Biobase::AnnotatedDataFrame(phenoDataC)) 407 | T.eset <- Biobase::ExpressionSet(assayData = as.matrix(T)) 408 | 409 | } 410 | 411 | } 412 | 413 | ########## MATRIX DIMENSION APPROPRIATENESS ########## 414 | 415 | keep = intersect(rownames(C),rownames(T)) 416 | C = C[keep, , drop = FALSE] 417 | T = T[keep, , drop = FALSE] 418 | 419 | ################################### 420 | 421 | if(method == "CIBERSORT"){ 422 | 423 | #source("./CIBERSORT.R") 424 | RESULTS = CIBERSORT(sig_matrix = C, mixture_file = T, QN = FALSE) 425 | RESULTS = t(RESULTS[,1:(ncol(RESULTS)-3),drop=FALSE]) 426 | 427 | } else if (method == "nnls"){ 428 | 429 | require(nnls) 430 | RESULTS = do.call(cbind.data.frame,lapply(apply(T,2,function(x) nnls::nnls(as.matrix(C),x)), function(y) y$x)) 431 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint 432 | rownames(RESULTS) <- colnames(C) 433 | 434 | } else if (method == "FARDEEP"){ 435 | 436 | require(FARDEEP) 437 | RESULTS = t(FARDEEP::fardeep(C, T, nn = TRUE, intercept = TRUE, permn = 10, QN = FALSE)$abs.beta) 438 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint 439 | # These 2 lines are similar as retrieving $relative.beta instead of abs.beta + re-scaling 440 | 441 | } else if (method == "RLR"){ #RLR = robust linear regression 442 | 443 | require(MASS) 444 | RESULTS = do.call(cbind.data.frame,lapply(apply(T,2,function(x) MASS::rlm(x ~ as.matrix(C), maxit=100)), function(y) y$coefficients[-1])) 445 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint 446 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint 447 | rownames(RESULTS) <- unlist(lapply(strsplit(rownames(RESULTS),")"),function(x) x[2])) 448 | 449 | ################################### 450 | ################################### 451 | 452 | } else if (method == "MuSiC"){ 453 | 454 | require(MuSiC) 455 | RESULTS = t(MuSiC::music_prop(bulk.eset = T.eset, sc.eset = C.eset, clusters = 'cellType', 456 | markers = NULL, normalize = FALSE, samples = 'SubjectName', 457 | verbose = F)$Est.prop.weighted) 458 | 459 | } else if (method == "MuSiC_with_markers"){ 460 | 461 | require(MuSiC) 462 | RESULTS = t(MuSiC::music_prop(bulk.eset = T.eset, sc.eset = C.eset, clusters = 'cellType', 463 | markers = unique(marker_distrib$gene), normalize = FALSE, samples = 'SubjectName', 464 | verbose = F)$Est.prop.weighted) 465 | 466 | } else if (method == "Bisque"){#By default, Bisque uses all genes for decomposition. However, you may supply a list of genes (such as marker genes) to be used with the markers parameter 467 | 468 | require(BisqueRNA) 469 | RESULTS <- BisqueRNA::ReferenceBasedDecomposition(T.eset, C.eset, markers = NULL, use.overlap = FALSE)$bulk.props 470 | 471 | } else if (method == "Bisque_with_markers"){#By default, Bisque uses all genes for decomposition. However, you may supply a list of genes (such as marker genes) to be used with the markers parameter 472 | 473 | require(BisqueRNA) 474 | RESULTS <- BisqueRNA::ReferenceBasedDecomposition(T.eset, C.eset, markers = marker_distrib$gene, use.overlap = FALSE)$bulk.props 475 | 476 | } else if (method == "DWLS"){ 477 | 478 | require(DWLS) 479 | path = paste(getwd(),"/results_",STRING,sep="") 480 | 481 | if(! dir.exists(path)){ dir.create(path) } #to avoid repeating marker_selection step when removing cell types; Sig.RData automatically created 482 | 483 | if(!file.exists(paste(path,"Sig.RData",sep="/"))){ 484 | 485 | Signature <- DWLS::buildSignatureMatrixMAST(scdata = C, id = as.character(phenoDataC$cellType), path = path, diff.cutoff = 0.5, pval.cutoff = 0.01) 486 | 487 | } else {#re-load signature and remove CT column + its correspondent markers 488 | 489 | load(paste(path,"Sig.RData",sep="/")) 490 | Signature <- Sig 491 | 492 | } 493 | 494 | Signature = as.matrix(Signature) 495 | 496 | RESULTS <- apply(T, 2, function(x){ 497 | b = setNames(x, rownames(T)) 498 | tr <- DWLS::trimData(Signature, b) 499 | RES <- t(DWLS::solveDampenedWLS(tr$sig, tr$bulk)) 500 | }) 501 | 502 | rownames(RESULTS) <- as.character(unique(phenoDataC$cellType)) 503 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint 504 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint 505 | print(head(RESULTS)) 506 | 507 | } else if (method == "SQUID"){ 508 | 509 | # Transforming T : 510 | Z = bulkC.fromSC(scC = C, phenoDataC = phenoDataC) 511 | X = T 512 | P <- as.matrix(P[base::colnames(Z),,drop = FALSE]) 513 | Y = Z %*% P 514 | 515 | X.new = transformation2(X = X, Y = Y, leave.one.out = FALSE) 516 | X.new[!is.finite(X.new)] <- 0 517 | 518 | # take common genes 519 | Genes <- intersect(rownames(Z),rownames(X.new)) 520 | 521 | X.new <- as.matrix.Vector(X.new[Genes,]) 522 | Z <- as.matrix.Vector(Z[Genes,]) 523 | 524 | RESULTS <- apply(X.new, 2, function(x){ 525 | RES <- t(DWLS::solveDampenedWLS(S = Z, B = x)) 526 | }) 527 | 528 | rownames(RESULTS) <- colnames(Z) 529 | RESULTS = apply(RESULTS,2,function(x) ifelse(x < 0, 0, x)) #explicit non-negativity constraint 530 | RESULTS = apply(RESULTS,2,function(x) x/sum(x)) #explicit STO constraint 531 | 532 | } 533 | 534 | RESULTS = RESULTS[gtools::mixedsort(rownames(RESULTS)), , drop = FALSE] 535 | RESULTS = data.table::melt(RESULTS) 536 | colnames(RESULTS) <- c("cell_type","tissue","observed_fraction") 537 | RESULTS$cell_type = as.character(RESULTS$cell_type) 538 | RESULTS$tissue = as.character(RESULTS$tissue) 539 | 540 | if(!is.null(P)){ 541 | 542 | P = P[gtools::mixedsort(rownames(P)),,drop = FALSE] %>% data.frame(., check.names = FALSE) 543 | P$cell_type = rownames(P) 544 | P = data.table::melt(P, id.vars="cell_type") 545 | colnames(P) <-c("cell_type","tissue","expected_fraction") 546 | P$cell_type = as.character(P$cell_type) 547 | P$tissue = as.character(P$tissue) 548 | 549 | RESULTS = merge(RESULTS, P, by = c("cell_type", "tissue"), all = TRUE) 550 | RESULTS[is.na(RESULTS)] <- 0 551 | RESULTS$expected_fraction <- round(RESULTS$expected_fraction, 3) 552 | RESULTS$observed_fraction <- round(RESULTS$observed_fraction, 3) 553 | 554 | } 555 | 556 | return(RESULTS) 557 | 558 | } 559 | -------------------------------------------------------------------------------- /human_lengths.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/human_lengths.rda -------------------------------------------------------------------------------- /schematic_figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/.DS_Store -------------------------------------------------------------------------------- /schematic_figures/Scheme1_Github.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme1_Github.jpg -------------------------------------------------------------------------------- /schematic_figures/Scheme1_Github.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme1_Github.pdf -------------------------------------------------------------------------------- /schematic_figures/Scheme2_Github.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme2_Github.jpg -------------------------------------------------------------------------------- /schematic_figures/Scheme2_Github.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/favilaco/deconv_matching_bulk_scnRNA/c353e025979a7a122078a324585d1980301bc5f7/schematic_figures/Scheme2_Github.pdf -------------------------------------------------------------------------------- /sessionInfo_macOS.txt: -------------------------------------------------------------------------------- 1 | > sessionInfo() 2 | R version 4.0.1 (2020-06-06) 3 | Platform: x86_64-apple-darwin17.0 (64-bit) 4 | Running under: macOS High Sierra 10.13.6 5 | 6 | Matrix products: default 7 | BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib 8 | LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib 9 | 10 | locale: 11 | [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 12 | 13 | attached base packages: 14 | [1] parallel stats4 stats graphics grDevices utils datasets 15 | [8] methods base 16 | 17 | other attached packages: 18 | [1] viridis_0.6.1 viridisLite_0.4.0 19 | [3] forcats_0.5.1 stringr_1.4.0 20 | [5] purrr_0.3.4 readr_1.4.0 21 | [7] tibble_3.1.6 tidyverse_1.3.1 22 | [9] tidyr_1.1.4 sctransform_0.3.2 23 | [11] scran_1.16.0 scater_1.16.2 24 | [13] R.utils_2.10.1 R.oo_1.24.0 25 | [15] R.methodsS3_1.8.1 RColorBrewer_1.1-2 26 | [17] pheatmap_1.0.12 org.Mm.eg.db_3.11.4 27 | [19] org.Hs.eg.db_3.11.4 MuSiC_0.1.1 28 | [21] nnls_1.4 monocle3_1.0.0 29 | [23] Matrix_1.3-4 MASS_7.3-54 30 | [25] gtools_3.9.2 ggrepel_0.9.1 31 | [27] ggrastr_0.2.3 ggpubr_0.4.0 32 | [29] ggpointdensity_0.1.0 FARDEEP_1.0.1 33 | [31] egg_0.4.5 ggplot2_3.3.5 34 | [33] gridExtra_2.3 edgeR_3.30.3 35 | [35] limma_3.44.3 dplyr_1.0.7 36 | [37] doMC_1.3.7 iterators_1.0.13 37 | [39] foreach_1.5.1 devtools_2.4.2 38 | [41] usethis_2.0.1 data.table_1.14.0 39 | [43] AnnotationDbi_1.50.3 MAST_1.14.0 40 | [45] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.2 41 | [47] DelayedArray_0.14.1 matrixStats_0.59.0 42 | [49] Biobase_2.50.0 GenomicRanges_1.40.0 43 | [51] GenomeInfoDb_1.24.2 IRanges_2.22.2 44 | [53] S4Vectors_0.26.1 BiocGenerics_0.36.1 45 | [55] varhandle_2.0.5 ROCR_1.0-11 46 | [57] SeuratObject_4.0.2 Seurat_4.0.3 47 | [59] e1071_1.7-9 reshape_0.8.8 48 | [61] quadprog_1.5-8 BisqueRNA_1.0.5 49 | 50 | loaded via a namespace (and not attached): 51 | [1] SparseM_1.81 scattermore_0.7 52 | [3] coda_0.19-4 bit64_4.0.5 53 | [5] irlba_2.3.3 rpart_4.1-15 54 | [7] RCurl_1.98-1.3 generics_0.1.0 55 | [9] callr_3.7.0 cowplot_1.1.1 56 | [11] RSQLite_2.2.7 RANN_2.6.1 57 | [13] proxy_0.4-26 future_1.21.0 58 | [15] bit_4.0.4 xml2_1.3.2 59 | [17] lubridate_1.7.10 spatstat.data_2.1-0 60 | [19] httpuv_1.6.5 assertthat_0.2.1 61 | [21] hms_1.1.0 promises_1.2.0.1 62 | [23] fansi_0.5.0 dbplyr_2.1.1 63 | [25] readxl_1.3.1 igraph_1.2.6 64 | [27] DBI_1.1.1 htmlwidgets_1.5.3 65 | [29] mcmc_0.9-7 spatstat.geom_2.2-0 66 | [31] ellipsis_0.3.2 backports_1.2.1 67 | [33] MCMCpack_1.5-0 deldir_0.2-10 68 | [35] vctrs_0.3.8 remotes_2.4.0 69 | [37] quantreg_5.86 abind_1.4-5 70 | [39] cachem_1.0.5 withr_2.4.2 71 | [41] prettyunits_1.1.1 goftest_1.2-2 72 | [43] cluster_2.1.2 lazyeval_0.2.2 73 | [45] crayon_1.4.1 pkgconfig_2.0.3 74 | [47] nlme_3.1-152 vipor_0.4.5 75 | [49] pkgload_1.2.1 rlang_0.4.11 76 | [51] globals_0.14.0 lifecycle_1.0.0 77 | [53] miniUI_0.1.1.1 MatrixModels_0.5-0 78 | [55] modelr_0.1.8 rsvd_1.0.5 79 | [57] cellranger_1.1.0 rprojroot_2.0.2 80 | [59] polyclip_1.10-0 lmtest_0.9-38 81 | [61] carData_3.0-4 zoo_1.8-9 82 | [63] reprex_2.0.0 beeswarm_0.4.0 83 | [65] ggridges_0.5.3 processx_3.5.2 84 | [67] png_0.1-7 bitops_1.0-7 85 | [69] KernSmooth_2.23-20 blob_1.2.1 86 | [71] DelayedMatrixStats_1.10.1 parallelly_1.26.0 87 | [73] rstatix_0.7.0 ggsignif_0.6.2 88 | [75] scales_1.1.1 memoise_2.0.0 89 | [77] magrittr_2.0.1 plyr_1.8.6 90 | [79] ica_1.0-2 zlibbioc_1.34.0 91 | [81] compiler_4.0.1 dqrng_0.3.0 92 | [83] fitdistrplus_1.1-5 cli_2.5.0 93 | [85] XVector_0.28.0 listenv_0.8.0 94 | [87] patchwork_1.1.1 pbapply_1.4-3 95 | [89] ps_1.6.0 mgcv_1.8-36 96 | [91] tidyselect_1.1.1 stringi_1.6.2 97 | [93] BiocSingular_1.4.0 locfit_1.5-9.4 98 | [95] grid_4.0.1 tools_4.0.1 99 | [97] future.apply_1.7.0 rio_0.5.27 100 | [99] rstudioapi_0.13 foreign_0.8-81 101 | [101] Rtsne_0.15 digest_0.6.27 102 | [103] shiny_1.6.0 Rcpp_1.0.8 103 | [105] car_3.0-11 broom_0.7.8 104 | [107] later_1.2.0 RcppAnnoy_0.0.18 105 | [109] httr_1.4.2 colorspace_2.0-2 106 | [111] rvest_1.0.0 fs_1.5.0 107 | [113] tensor_1.5 reticulate_1.20 108 | [115] splines_4.0.1 uwot_0.1.10 109 | [117] statmod_1.4.36 conquer_1.0.2 110 | [119] spatstat.utils_2.2-0 plotly_4.9.4.1 111 | [121] sessioninfo_1.1.1 xtable_1.8-4 112 | [123] jsonlite_1.7.2 testthat_3.0.3 113 | [125] R6_2.5.0 pillar_1.6.4 114 | [127] htmltools_0.5.1.1 mime_0.11 115 | [129] glue_1.4.2 fastmap_1.1.0 116 | [131] BiocParallel_1.22.0 BiocNeighbors_1.6.0 117 | [133] class_7.3-19 codetools_0.2-18 118 | [135] pkgbuild_1.2.0 utf8_1.2.1 119 | [137] lattice_0.20-44 spatstat.sparse_2.0-0 120 | [139] curl_4.3.2 ggbeeswarm_0.6.0 121 | [141] leiden_0.3.8 zip_2.2.0 122 | [143] openxlsx_4.2.4 survival_3.2-11 123 | [145] desc_1.3.0 munsell_0.5.0 124 | [147] GenomeInfoDbData_1.2.3 haven_2.4.1 125 | [149] reshape2_1.4.4 gtable_0.3.0 126 | [151] spatstat.core_2.2-0 127 | --------------------------------------------------------------------------------