├── .gitignore ├── 10x_dir_to_files.py ├── 10x_files_to_dir.py ├── CHETAH_referenceDataset.Rmd ├── CHETAH_referenceDataset.html ├── Convert_seurat_to_anndata.R ├── IMMUcan.Rproj ├── README.md ├── Rename_meta.data.Rmd ├── TME_markerGenes.xlsx ├── annotate.R ├── annotation_CHETAH.xlsx ├── cell_ontology.xlsx ├── check_seurat.R ├── create_seurat_fromCSV.R ├── data_example.json ├── scProcessor_1.R ├── scProcessor_1.sh ├── scProcessor_2.R ├── scProcessor_2.sh ├── scRNA_seq_database_summary_stat.Rmd ├── tidy_metadata.R ├── tidy_metadata.xlsx └── zip_checksum.R /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /10x_dir_to_files.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import glob 4 | import os 5 | import re 6 | import shutil 7 | import sys 8 | 9 | logging.basicConfig() 10 | log = logging.getLogger() 11 | log.setLevel(logging.DEBUG) 12 | 13 | directories_submitted_as_commandline_arguments = sys.argv[1:] 14 | for directory_to_act_on in directories_submitted_as_commandline_arguments: 15 | log.debug(f"Acting on directory {directory_to_act_on}...") 16 | assert(not "/" in directory_to_act_on) 17 | 18 | all_filenames = glob.glob(f"{directory_to_act_on}/*") 19 | log.debug(f"Found these files: {all_filenames}") 20 | 21 | dir_name = os.path.split(os.path.abspath(directory_to_act_on))[1] 22 | separator = "_" 23 | 24 | for filename in all_filenames: 25 | log.debug(f"Moving {filename}...") 26 | cropped_filename = filename[len(directory_to_act_on)+1:] 27 | shutil.move(filename, dir_name+separator+cropped_filename) 28 | 29 | os.rmdir(directory_to_act_on) 30 | -------------------------------------------------------------------------------- /10x_files_to_dir.py: 
-------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import glob 4 | import os 5 | import re 6 | import shutil 7 | 8 | logging.basicConfig() 9 | log = logging.getLogger() 10 | log.setLevel(logging.DEBUG) 11 | 12 | log.debug("Searching for matrix files...") 13 | 14 | all_matrix_filenames = glob.glob("*matrix.mtx") 15 | log.debug(f"Found these matrix files: {all_matrix_filenames}") 16 | 17 | for matrix_filename in all_matrix_filenames: 18 | m = re.search("^(.*)matrix.mtx$", matrix_filename) 19 | assert(m is not None) ## should find a hit 20 | 21 | hits = m.groups() 22 | assert(len(hits)==1) ## should be exactly one hit 23 | 24 | first_part = hits[0] 25 | 26 | log.debug(f"Creating directory {first_part}") 27 | os.makedirs(first_part) 28 | 29 | all_files_like_this = glob.glob(f"{first_part}*.*") 30 | log.debug(f"Will move the following files: {all_files_like_this}") 31 | 32 | for filename in all_files_like_this: 33 | cropped_filename = filename[len(first_part):] 34 | shutil.move(filename, first_part + "/" + cropped_filename) 35 | -------------------------------------------------------------------------------- /CHETAH_referenceDataset.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "training dataset CHETAH" 3 | output: html_document 4 | --- 5 | 6 | ```{r} 7 | library(Seurat) 8 | library(dplyr) 9 | library(ggplot2) 10 | library(SingleCellExperiment) 11 | load("CHETAH_TME_reference.Rdata") 12 | ``` 13 | 14 | ```{r} 15 | reference 16 | ``` 17 | 18 | ```{r} 19 | logcounts(reference) <- counts(reference) 20 | reference 21 | ``` 22 | 23 | ```{r} 24 | seurat <- as.Seurat(reference) 25 | seurat 26 | ``` 27 | 28 | ```{r} 29 | seurat <- seurat %>% 30 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 31 | ScaleData() %>% 32 | RunPCA(npcs = 50) %>% 33 | RunUMAP(dims = 1:30, a = .5, b = 1.2) %>% 34 | FindNeighbors(dims = 1:30) %>% 35 | 
FindClusters(resolution = 1.2) 36 | ``` 37 | 38 | ```{r fig.height=5, fig.width=10} 39 | #Idents(seurat) <- seurat$seurat_clusters 40 | DimPlot(seurat, label = TRUE) 41 | DimPlot(seurat, group.by = c("seurat_clusters", "celltypes"), label = TRUE) + NoLegend() 42 | ``` 43 | 44 | ```{r fig.height=12, fig.width=12} 45 | markers <- readxl::read_excel("TME_markerGenes.xlsx") 46 | DotPlot(seurat, features = unique(markers$gene), cluster.idents = TRUE) + coord_flip() + NoLegend() 47 | ``` 48 | 49 | ```{r} 50 | anno_clust <- readxl::read_excel("annotation_CHETAH.xlsx") 51 | anno_clust <- arrange(anno_clust, seurat_clusters) 52 | new.cluster.ids <- tolower(anno_clust$abbreviation) 53 | names(new.cluster.ids) <- levels(seurat) 54 | seurat <- RenameIdents(seurat, new.cluster.ids) 55 | seurat$abbreviation <- Idents(seurat) 56 | cell_ont <- readxl::read_excel("cell_ontology.xlsx") 57 | cell_ont$abbreviation <- tolower(cell_ont$abbreviation) 58 | seurat@meta.data <- seurat@meta.data %>% 59 | tibble::rownames_to_column("cell") %>% 60 | left_join(cell_ont, by = "abbreviation") %>% 61 | tibble::column_to_rownames("cell") 62 | Idents(seurat) <- seurat$cell_ontology 63 | ``` 64 | 65 | ```{r} 66 | seurat <- seurat[, !grepl("doublets", seurat@meta.data$abbreviation)] 67 | seurat$celltypes <- seurat$cell_ontology 68 | seurat 69 | ``` 70 | 71 | ```{r} 72 | DimPlot(seurat, label = TRUE) 73 | ``` 74 | 75 | ```{r} 76 | reference <- as.SingleCellExperiment(seurat) 77 | reference 78 | ``` 79 | 80 | ```{r} 81 | save(reference, file = "CHETAH_reference_updatedAnnotation.RData") 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /Convert_seurat_to_anndata.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | object_path = args[1] 4 | 5 | suppressPackageStartupMessages({ 6 | library(sceasy) 7 | library(reticulate) 8 | use_condaenv('sceasy') 9 | 
loompy <- reticulate::import('loompy') 10 | library(Seurat) 11 | library(SeuratDisk) 12 | }) 13 | 14 | seurat <- readRDS(object_path) 15 | #SaveH5Seurat(seurat, filename = "out/harmony.h5Seurat", overwrite = TRUE) 16 | #Convert("out/harmony.h5Seurat", dest = "h5ad", overwrite = TRUE) 17 | 18 | # Convert to h5ad with sceasy for immediate use with cellxgene 19 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile=gsub(".rds$", ".h5ad", object_path)) 20 | -------------------------------------------------------------------------------- /IMMUcan.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IMMUcan 2 | 3 | scProcessor is used for the processing of scRNAseq datasets in the IMMUcan scDB. It runs on R and is mostly based on the Seurat package. 
4 | 5 | - Quality control 6 | - Measure and correct batch effect (harmony) 7 | - Clustering optimization 8 | - Supervised annotation (CHETAH) 9 | - CNA calling (copyKat) 10 | - Cell ontology (ebi.ac.uk/ols/ontologies/cl) 11 | - Differential expression 12 | - Universal output files (sceasy) 13 | 14 | 15 | ## Install instructions 16 | 17 | - Follow install instructions for sceasy (https://github.com/cellgeni/sceasy) 18 | - Get CHETAH_reference_updatedAnnotation.RData from IMMUcan teams channel 19 | - Install following R packages 20 | ``` 21 | install.packages(c("Seurat", "tidyverse", "readxl", "patchwork", "devtools", "data.table", "BiocManager", "remotes", "openxlsx", "pheatmap", "plyr", "DescTools", "future", "jsonlite")) 22 | BiocManager::install(c("CHETAH", "SingleCellExperiment")) 23 | devtools::install_github("mahmoudibrahim/genesorteR") 24 | devtools::install_github("immunogenomics/harmony") 25 | devtools::install_github("navinlabcode/copykat") 26 | remotes::install_github("mojaveazure/seurat-disk") 27 | ``` 28 | 29 | ## Before starting 30 | 31 | Change the paths to files provided in the script 32 | - cellMarker_path = PATH to TME_markerGenes.xlsx 33 | - chetahClassifier_path = PATH to CHETAH_reference_updatedAnnotation.RData 34 | - cellOntology_path = PATH to cell_ontology.xlsx 35 | 36 | ## Run scProcessor 37 | The core of scProcessor are three processing scripts. 38 | 39 | ### 1. check_seurat.R: check seurat object and estimate batch 40 | 41 | - It takes a Seurat object as input (in the future this will be extended to other file formats) 42 | - This step is optional, if data.json is filled in you can immediately run scProcessor_1 43 | 1. Check validity of seurat object 44 | 2. Estimate batch variable 45 | 3. 
Return QC plots (in temp) 46 | 47 | ``` 48 | Rscript check_seurat.R [SEURAT] [BATCH] 49 | ``` 50 | 51 | - [SEURAT]: path to seurat object (if there is only one .rds file in the directory, it is found automatically) 52 | - [BATCH]: only necessary when you already know your batch variable 53 | 54 | ### 2. data.json 55 | 56 | - scProcessor works without arguments to the Rscripts, therefore it needs an input file that specifies these variables. This is automatically generated by check_seurat and has to be reviewed to make sure scProcessor_1 processes the data how you want. 57 | - Here is an overview of the data.json (NA in a json is indicated as null) 58 | - **object_path**: full path where seurat object is stored 59 | - **batch**: e.g. *patient* 60 | - **norm**: boolean indicating if data is already normalized e.g. *false* 61 | - **QC_feature_min**: threshold for minimal number of detected genes per cell e.g. *250* 62 | - **QC_mt_max**: threshold for maximal percentage of mitochondrial reads per cell e.g. *20* 63 | - **pca_dims**: number of PCA dimensions to take for further processing e.g. *30* 64 | - **features_var**: number of highly variable features to take for further processing e.g. *2000* 65 | - **nSample**: number of cells to take for intense computing steps and for cellxgene.h5ad at the end e.g. *10000* 66 | - **cluster_resolution**: a sequence of different cluster resolutions, scProcessor will select the optimal resolution e.g. *0.5, 1, 1.5* 67 | - **malignant**: boolean indicating if malignant cell prediction is necessary e.g. *TRUE* 68 | - **normal_cells**: cell type taken as normal cells to increase confidence of malignant cell prediction e.g. *null (standard Macrophages are taken), false (no normal cells taken)* 69 | - **annotation**: columns in meta.data that contain annotation information 70 | - **metadata**: other important columns contained in the meta.data slot e.g. *biopsy, sample_id, treatment ...* 71 | 72 | ### 3.
scProcessor_1: the main processing script 73 | 74 | 1. QC 75 | 2. Batch integration and clustering 76 | 3. Supervised classification and CNA calling 77 | 4. Create marker gene plots 78 | 5. Save summary statistics in misc 79 | 80 | ### 4. Annotate clusters 81 | 82 | - Check plots in temp/plots: 83 | - marker gene plots 84 | - dotplot 85 | - In out/annotation.xlsx, fill in cell types as defined in the abbreviation column of cell_ontology.xlsx 86 | 87 | 88 | ### 5. scProcessor_2: link to cell ontology and create all output files 89 | 90 | 1. Links cell ontology 91 | 2. Differential expression 92 | 3. Creates output files for SIB scRNAseq interface 93 | - AverageExpression matrices and DE_results per annotation level 94 | - geneIndex.tsv 95 | - Metadata.tsv 96 | - cellCount.tsv 97 | - harmony.rds 98 | - cellxgene.h5ad 99 | 100 | ### 6. Create checksum file to send to SIB 101 | 102 | On the terminal: 103 | ``` 104 | zip -r AML_UNB_SW_GSE116256.zip AML_UNB_SW_GSE116256 105 | md5sum AML_UNB_SW_GSE116256.zip 106 | mv AML_UNB_SW_GSE116256.zip AML_UNB_SW_GSE116256_-_###PASTE_MD5SUM_OUTPUT_HERE###.zip 107 | ``` 108 | 109 | Log in to SIB through sftp and transfer the zip file 110 | -------------------------------------------------------------------------------- /Rename_meta.data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Rename_meta.data" 3 | output: html_document 4 | --- 5 | 6 | ```{r} 7 | setwd("~/s3result/CRC_UNB_10X_GSE144735/") 8 | library(dplyr) 9 | library(jsonlite) 10 | seurat <- readRDS("out/harmony.rds") 11 | glimpse(seurat@meta.data) 12 | ``` 13 | 14 | ```{r} 15 | seurat@meta.data <- plyr::rename(seurat@meta.data, c( 16 | "" = "" 17 | )) 18 | 19 | #also change these columns in data.json if necessary!!!
20 | saveRDS(seurat, "out/harmony.rds") 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /TME_markerGenes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/TME_markerGenes.xlsx -------------------------------------------------------------------------------- /annotate.R: -------------------------------------------------------------------------------- 1 | library(readxl) 2 | library(editData) 3 | library(openxlsx) 4 | 5 | #Load annotation.xlsx 6 | if (file.exists("out/annotation_copy.xlsx")) { 7 | print("Reading annotation_copy.xlsx") 8 | annotation <- read_excel("out/annotation_copy.xlsx") 9 | } else { 10 | annotation <- read_excel("out/annotation.xlsx") 11 | } 12 | 13 | # Use DE for checking top genes of some clusters 14 | DE_genes <- read.csv("temp/DE_genes.csv", row.names = 1) 15 | 16 | #Check top 10 genes of a certain cluster 17 | clust = 30 18 | head(DE_genes[DE_genes$cluster == clust, ], 10) 19 | 20 | # Change annotation 21 | annotation <- editData(annotation) 22 | 23 | write.xlsx(annotation, "out/annotation.xlsx") 24 | if(file.exists("out/annotation_copy.xlsx")) {file.remove("out/annotation_copy.xlsx")} -------------------------------------------------------------------------------- /annotation_CHETAH.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/annotation_CHETAH.xlsx -------------------------------------------------------------------------------- /cell_ontology.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/cell_ontology.xlsx 
-------------------------------------------------------------------------------- /check_seurat.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | seurat_obj = args[1] #path of seurat object 4 | batch_var = args[2] #batch variable if known 5 | verbose = FALSE 6 | tidy_metadata_path <- "/home/jordi_camps/IMMUcan/tidy_metadata.xlsx" 7 | 8 | dir <- getwd() 9 | setwd(dir) 10 | print(dir) 11 | if (!dir.exists("temp")) {dir.create("temp")} 12 | if (!dir.exists("temp/QC")) {dir.create("temp/QC")} 13 | if (!dir.exists("out")) {dir.create("out")} 14 | if (!dir.exists("out/plots")) {dir.create("out/plots")} 15 | 16 | suppressPackageStartupMessages({ 17 | library(Seurat) 18 | library(ggplot2) 19 | library(patchwork) 20 | library(Matrix) 21 | library(dplyr) 22 | library(readxl) 23 | library(DescTools) 24 | library(tidyr) 25 | library(tibble) 26 | library(jsonlite) 27 | library(harmony) 28 | library(kBET) 29 | library(scater) 30 | }) 31 | 32 | print("STEP 1: CHECKING SEURAT OBJECT") 33 | 34 | if (is.na(seurat_obj)) { 35 | seurat_obj <- normalizePath(list.files(pattern = ".rds$")) 36 | if (length(seurat_obj) != 1) { 37 | stop("Specify seurat object in arguments") 38 | } 39 | } 40 | seurat <- readRDS(seurat_obj) 41 | 42 | # Load data.json or create standard 43 | if (file.exists("out/data.json")) { 44 | data <- fromJSON("out/data.json") 45 | } else { 46 | data <- list() 47 | data$object_path = seurat_obj 48 | data$batch = batch_var 49 | data$QC_feature_min = 250 #Minimal features threshold 50 | data$QC_mt_max = 20 #Maximum mitochondrial content threshold 51 | data$pca_dims = 30 #Amount of PCA dimensions to use 52 | data$features_var = 2000 #Amount of variable features to select 53 | data$nSample = 10000 54 | data$cluster_resolution = seq(from = 0.4, to = 4, by = 0.1) 55 | data$malignant = TRUE 56 | data$normal_cells = NA 57 | } 58 | 59 | print(paste0("nCell = ", ncol(seurat), " 
/ ", "nGene = ", nrow(seurat))) 60 | 61 | if (sum(colnames(seurat) == rownames(seurat@meta.data)) == ncol(seurat)) { 62 | print("Cell IDs linked correctly") 63 | } else { 64 | stop("Cell IDs linked uncorrectly") 65 | } 66 | 67 | gapdh <- grepl("GAPDH|Gapdh", rownames(seurat)) 68 | if (sum(grepl("\\.", seurat[["RNA"]]@counts[gapdh, ])) == 0) { 69 | data$norm <- FALSE 70 | print("Raw counts supplied") 71 | } else if (any(seurat[["RNA"]]@counts[gapdh, ] > 100)) { 72 | data$norm <- FALSE 73 | print("Normalized counts supplied. Be careful with further interpretation") 74 | } else { 75 | data$norm <- TRUE 76 | print("Logcounts supplied, no normalization will be done. Be careful with further interpretation") 77 | } 78 | 79 | seurat[["percent_mt"]] <- PercentageFeatureSet(seurat, pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-") 80 | seurat[["RNA"]]@counts[1:5,1:5] 81 | dplyr::glimpse(seurat@meta.data) 82 | 83 | # Clean metadata columns 84 | names <- tolower(colnames(seurat@meta.data)) 85 | names <- gsub("\\.", "_", names) 86 | meta_cols <- read_excel(tidy_metadata_path) 87 | if (any(names %in% meta_cols$col_names)) { 88 | change_cols <- colnames(seurat@meta.data)[colnames(seurat@meta.data) %in% meta_cols$col_names] 89 | for (i in change_cols) { 90 | hit_1 <- grepl(i, names) 91 | hit_2 <- meta_cols$col_names %in% i 92 | print(paste0("changing ", i, " to ", meta_cols$general[hit_2])) 93 | colnames(seurat@meta.data)[hit_1] <- meta_cols$general[hit_2] 94 | } 95 | } else { 96 | print("No meta.data columns to tidy up") 97 | } 98 | data$annotation = c("seurat_clusters","annotation_CHETAH","annotation_major","annotation_immune","annotation_minor", colnames(seurat@meta.data)[grepl("authors_annotation|Authors_annotation", colnames(seurat@meta.data))]) 99 | saveRDS(seurat, seurat_obj) 100 | 101 | # Batch 102 | print("STEP 2a: ESTIMATING BATCH VARIABLES") 103 | 104 | ## Remove bad quality cells 105 | seurat <- CreateSeuratObject(counts = seurat[["RNA"]]@counts, meta.data = 
seurat@meta.data, min.cells = 10, min.features = 200) 106 | seurat <- subset(seurat, subset = nFeature_RNA > data$QC_feature_min & percent_mt < data$QC_mt_max) 107 | 108 | ## Subsample datasets larger than 50k cells to 10% of the cells 109 | if (ncol(seurat) > 50000) { 110 | subset_size <- 0.1 #subsample to 10% of the data 111 | subset_id <- sample.int(n = ncol(seurat), size = floor(subset_size * ncol(seurat)), replace=FALSE) 112 | seurat <- seurat[, subset_id] 113 | } 114 | 115 | ## Select potential batch columns from meta.data 116 | seurat@meta.data <- seurat@meta.data[, sapply(sapply(seurat@meta.data, unique), length) != 1, drop = FALSE] #Remove all columns that have only one variable 117 | #seurat@meta.data <- seurat@meta.data[, sapply(sapply(seurat@meta.data, unique), length) != nrow(seurat@meta.data), drop = FALSE] #Remove columns with only unique values 118 | meta <- seurat@meta.data[, sapply(seurat@meta.data, class) %in% c("character", "factor"), drop = FALSE] #Select all columns that are factor or character; drop = FALSE keeps a data.frame when only one column qualifies 119 | meta <- meta[,!grepl("Cluster|cluster|author|Author|Annotation|annotation|Cell_type|cell_type|cell|Cell|barcode|Barcode", colnames(meta)), drop = FALSE] #Exclude cluster/annotation/cell-id like columns; drop = FALSE so colnames(meta) and apply(meta, 2, ...) below still work with a single remaining column 120 | if (!"metadata" %in% names(data)) {data$metadata = colnames(meta)} #save metadata columns to data.json 121 | if (length(data$batch) > 1) { 122 | batch <- data$batch 123 | } else if (is.na(data$batch)) { 124 | temp <- meta[, apply(meta, 2, function(x) !any(is.na(x))), drop = FALSE] #Remove all columns with NAs 125 | batch <- colnames(temp) 126 | } else { 127 | batch <- data$batch 128 | } 129 | print("Possible batches:") 130 | print(paste0(batch)) 131 | 132 | ## Create nearest neighbour graph 133 | if (data$norm == FALSE) { 134 | seurat <- Seurat::NormalizeData(seurat, verbose = verbose) 135 | } else {seurat[["RNA"]]@data <- seurat[["RNA"]]@counts} 136 | 137 | seurat <- seurat %>% 138 | FindVariableFeatures(selection.method = "vst", nfeatures = data$features_var, verbose = verbose) %>% 139 | ScaleData(verbose = verbose) %>% 140 |
RunPCA(pc.genes = seurat@var.genes, npcs = data$pca_dims+20, verbose = verbose) %>% 141 | RunUMAP(dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose) %>% 142 | FindNeighbors(dims = 1:2, k.param = 30, reduction = "umap", verbose = verbose) 143 | 144 | p_lbw <- ElbowPlot(seurat, ndims = data$pca_dims+20) + geom_vline(xintercept = data$pca_dims, color = "red") + ylab("STDEV PCA") + theme(axis.title.x = element_blank()) 145 | 146 | ## Compute variance explained 147 | #print("Compute explained variance") 148 | #temp <- seurat@meta.data[, apply(seurat@meta.data, 2, function(x) !any(is.na(x))), drop = FALSE] #Remove all columns with NAs 149 | #sce <- SingleCellExperiment(assays = list(logcounts = seurat[["RNA"]]@data), colData = temp) 150 | #vars_exp <- getVarianceExplained(x = sce, variables = colnames(temp)) 151 | #p_var <- plotExplanatoryVariables(vars_exp) 152 | #ggsave(plot = p_var, filename = paste0("temp/QC/Batch_variance_explained.png")) 153 | 154 | #print("Compute kBET score") 155 | ## kBET 156 | #kbet <- list() 157 | #mtx <- t(as.matrix(seurat[["RNA"]]@data)) 158 | #for (b in batch) { 159 | # btch <- seurat@meta.data[, b] 160 | # kbet.estimate <- kBET(df = mtx, batch = btch, plot = FALSE) 161 | # kbet[[b]] <- 1 - kbet.estimate$stats$kBET.observed 162 | #} 163 | ##Plot kbet scores 164 | #kbet <- do.call(cbind, kbet) 165 | #p_kbet <- kbet %>% 166 | # as.data.frame() %>% 167 | # gather("var", "Acceptance_rate") %>% 168 | # ggplot(aes(x = var, y = Acceptance_rate)) + 169 | # geom_boxplot() 170 | #ggsave(plot = p_kbet, filename = "temp/QC/batch_kBET.png") 171 | 172 | ## Compute the percentage of batch in cell neighbors 173 | print("Compute batch entropy") 174 | batch_entropy <- list() 175 | for (b in batch) { 176 | neighbors <- list() 177 | for (i in as.factor(unique(seurat@meta.data[, b]))) { 178 | temp <- rownames(seurat@meta.data[seurat@meta.data[ , b] == i, ]) 179 | neighbors[[i]] <- rowSums(as.matrix(seurat@graphs$RNA_nn[, temp]))/30 180 | } 181 | 
neighbors <- as.data.frame(neighbors) 182 | ## Compute entropy per cell 183 | optimum <- table(seurat@meta.data[, b]) / ncol(seurat) 184 | batch_entropy[[b]] <- apply(neighbors, 1, Entropy) 185 | batch_entropy[[b]] <- batch_entropy[[b]] / Entropy(optimum) 186 | } 187 | batch_entropy <- do.call(cbind, batch_entropy) 188 | 189 | ## Save batch variables with entropy < 2 190 | #batch_var <- list() 191 | #for (i in colnames(batch_entropy)) { 192 | # if (median(as.numeric(batch_entropy[, i])) < 2) { 193 | # batch_var[[i]] <- median(as.numeric(batch_entropy[, i])) 194 | # } 195 | #} 196 | #batch_var <- batch_var[!duplicated(batch_var)] 197 | #batches <- paste(names(batch_var), sep = ", ") 198 | #print(paste0("Possible batch(es): ", batches)) 199 | 200 | # Run harmony 201 | 202 | if (length(batch) >= 1) { #Only run harmony when at least one candidate batch column exists (same check as in STEP 3) 203 | print("STEP 2b: RUN HARMONY") 204 | batch_harmony <- list() 205 | 206 | for (i in batch) { 207 | p0 <- AugmentPlot(DimPlot(seurat, reduction = "umap", group.by = i, pt.size = .1) + NoLegend() + ggtitle("Before harmony")) 208 | p1 <- AugmentPlot(DimPlot(object = seurat, reduction = "pca", group.by = i, pt.size = .1) + NoLegend()) 209 | seurat_corrected <- seurat %>% 210 | RunHarmony(i, plot_convergence = FALSE, verbose = verbose) %>% 211 | RunUMAP(reduction = "harmony", dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose) %>% 212 | FindNeighbors(dims = 1:2, k.param = 30, reduction = "umap", verbose = verbose) 213 | 214 | p3 <- AugmentPlot(DimPlot(object = seurat_corrected, reduction = "harmony", group.by = i, pt.size = .1) + NoLegend()) 215 | p2 <- AugmentPlot(DimPlot(seurat_corrected, reduction = "umap", group.by = i, pt.size = .1) + NoLegend() + ggtitle("After harmony")) 216 | p <- (p0 | p2) / (p1 | p3) 217 | ggsave(plot = p, filename = paste0("temp/QC/Harmony_", i, ".png")) 218 | 219 | ## Compute the percentage of batch in cell neighbors 220 | neighbors <- list() 221 | for (j in as.factor(unique(seurat_corrected@meta.data[, i]))) { 222 | temp <-
rownames(seurat_corrected@meta.data[seurat@meta.data[ , i] == j, ]) 223 | neighbors[[j]] <- rowSums(as.matrix(seurat_corrected@graphs$RNA_nn[, temp]))/30 224 | } 225 | neighbors <- as.data.frame(neighbors) 226 | ## Compute entropy per cell 227 | optimum <- table(seurat@meta.data[, i]) / ncol(seurat) 228 | batch_harmony[[i]] <- apply(neighbors, 1, Entropy) 229 | batch_harmony[[i]] <- batch_harmony[[i]] / Entropy(optimum) 230 | } 231 | batch_harmony <- do.call(cbind, batch_harmony) 232 | ## Plot entropy over all batches 233 | #batch_entropy$harmony <- "Before" 234 | batch_entropy <- batch_entropy %>% 235 | as.data.frame() %>% 236 | mutate(harmony = "Before") %>% 237 | tibble::rownames_to_column("cell") %>% 238 | gather("batch", "entropy", -cell, -harmony) 239 | 240 | batch_harmony <- batch_harmony %>% 241 | as.data.frame() %>% 242 | mutate(harmony = "After") %>% 243 | tibble::rownames_to_column("cell") %>% 244 | gather("batch", "entropy", -cell, -harmony) 245 | 246 | batch_entropy <- rbind(batch_entropy, batch_harmony) 247 | batch_entropy$harmony <- factor(batch_entropy$harmony, levels = c("Before", "After")) 248 | 249 | p <- batch_entropy %>% 250 | ggplot(aes(y = as.numeric(entropy), x = batch, col = harmony)) + 251 | geom_boxplot() + 252 | scale_y_continuous("Entropy") + 253 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) 254 | ggsave(plot = p, filename = "temp/QC/batch_entropy.png", width = 10, height = 10) 255 | 256 | } #else { 257 | ## Plot entropy over all batches 258 | #batch_entropy <- as.data.frame(batch_entropy) 259 | #p <- batch_entropy %>% 260 | # tibble::rownames_to_column("cell") %>% 261 | # gather("batch", "entropy", -cell) %>% 262 | # ggplot(aes(y = as.numeric(entropy), x = batch)) + 263 | # geom_boxplot() + 264 | # scale_y_continuous("Entropy") + 265 | # theme(axis.text.x = element_text(angle = 45, hjust = 1)) 266 | #ggsave(plot = p, filename = "temp/QC/batch_entropy.png", width = 10, height = 10) 267 | #} 268 | 269 | # QC 270 | 
print("STEP 3: CREATE QC PLOTS") 271 | if (length(batch) >= 1) { 272 | for (i in batch) { 273 | p2 <- DimPlot(seurat, reduction = "umap", pt.size = .1, group.by = i, label = TRUE) + NoLegend() 274 | p3 <- AugmentPlot(VlnPlot(seurat, features = "nFeature_RNA", pt.size = 0.1, group.by = i, log = TRUE)) + 275 | NoLegend() + 276 | scale_y_log10("Genes", expand = c(0,0)) + 277 | geom_hline(yintercept = data$QC_feature_min, color = "red") + 278 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) 279 | p4 <- AugmentPlot(VlnPlot(seurat, features = "nCount_RNA", pt.size = 0.1, group.by = i, log = TRUE)) + 280 | NoLegend() + 281 | scale_y_log10("Counts", expand = c(0,0)) + 282 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) 283 | if ("percent_mt" %in% colnames(seurat@meta.data)) { 284 | p5 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1, group.by = i)) + 285 | NoLegend() + 286 | geom_hline(yintercept = data$QC_mt_max, color = "red") + 287 | scale_y_continuous("Mito", expand = c(0,0)) + 288 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text()) 289 | } else { 290 | seurat@meta.data$percent_mt <- 0 291 | p5 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1, group.by = i)) + 292 | NoLegend() + 293 | geom_hline(yintercept = data$QC_mt_max, color = "red") + 294 | scale_y_continuous("Mito", expand = c(0,0)) + 295 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text()) 296 | } 297 | if ("CD3D" %in% rownames(seurat)) { 298 | p6 <- AugmentPlot(FeaturePlot(seurat, features = "CD3D", pt.size = .1)) + 299 | theme(axis.title.x = element_blank(), axis.title.y = element_text()) 300 | } else if ("CD68" %in% rownames(seurat)) { 301 | p6 
<- AugmentPlot(FeaturePlot(seurat, features = "CD68", pt.size = .1)) + 302 | theme(axis.title.x = element_blank(), axis.title.y = element_text()) 303 | } else if ("CLDN5" %in% rownames(seurat)) { 304 | p6 <- AugmentPlot(FeaturePlot(seurat, features = "CLDN5", pt.size = .1)) + 305 | theme(axis.title.x = element_blank(), axis.title.y = element_text()) 306 | } else { 307 | p6 <- AugmentPlot(DimPlot(seurat, group.by = i)) 308 | } 309 | p <- (p_lbw + p2) / (p3 + p5) / (p4 + p6) 310 | ggsave(plot = p, filename = paste0("temp/QC/QC_", i, ".png")) 311 | } 312 | } else { 313 | print("No batches found in metadata!") 314 | p1 <- AugmentPlot(VlnPlot(seurat, features = "nFeature_RNA", pt.size = 0.1, log = TRUE)) + 315 | NoLegend() + 316 | scale_y_log10("Genes", expand = c(0,0)) + 317 | geom_hline(yintercept = data$QC_feature_min, color = "red") + 318 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) 319 | p2 <- AugmentPlot(VlnPlot(seurat, features = "nCount_RNA", pt.size = 0.1, log = TRUE)) + 320 | NoLegend() + 321 | scale_y_log10("Counts", expand = c(0,0)) + 322 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) 323 | p3 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1)) + 324 | NoLegend() + 325 | geom_hline(yintercept = data$QC_mt_max, color = "red") + 326 | scale_y_continuous("Mito", expand = c(0,0)) + 327 | theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text()) 328 | p <- p_lbw / p1 / p2 / p3 329 | ggsave(plot = p, filename = "temp/QC/QC.png") 330 | } 331 | 332 | ## Save data.json 333 | if (length(batch) == 1) { 334 | data$batch = batch #Store the single detected batch column rather than a hard-coded column name 335 | } else { 336 | data$batch = FALSE 337 | } 338 | data <- toJSON(data) 339 | 340 | if (!file.exists("out/data.json")) { 341 | write(data,
"out/data.json") 342 | } else { 343 | print("data.json already exists, writing to copy") 344 | write(data, "out/data_copy.json") 345 | } 346 | 347 | -------------------------------------------------------------------------------- /create_seurat_fromCSV.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(tidyverse) 3 | library(tidyr) 4 | 5 | count_file <- "" 6 | meta_file <- "" 7 | patient_file <- "" 8 | 9 | print("Read count matrix") 10 | counts_test <- read.csv(count_file, header = TRUE, row.names = 1, sep = " ", nrows = 6) #Read try 11 | counts_test[,1:6] 12 | counts <- read.csv(count_file, header = TRUE, row.names = 1, sep = " ") 13 | 14 | print("Metadata") 15 | meta <- read.csv(meta_file, header = TRUE, sep = "\t", row.names = 1) 16 | print(head(meta)) 17 | 18 | print("Patient info") 19 | patient <- read.csv(patient_file, header = TRUE, sep = "\t", row.names = 1) 20 | 21 | if (ncol(counts) != sum(colnames(counts) == rownames(meta))) {stop("colnames counts not equal to rownames meta.data")} 22 | 23 | print("Create Seurat object") 24 | seurat <- CreateSeuratObject(counts = counts, meta.data = meta) 25 | saveRDS(seurat, "raw.rds") -------------------------------------------------------------------------------- /data_example.json: -------------------------------------------------------------------------------- 1 | {"object_path":["raw.rds"],"batch":["orig.ident"],"QC_feature_min":[250],"QC_mt_max":[20],"pca_dims":[30],"features_var":[2000],"nSample":[10000],"cluster_resolution":[1],"malignant":[true],"normal_cells":[null],"annotation":["seurat_clusters","annotation_CHETAH","annotation_major","annotation_immune","annotation_minor","annotation_authors"],"norm":[true],"metadata":["orig.ident","cell_id","biopsy_id"]} 2 | -------------------------------------------------------------------------------- /scProcessor_1.R: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env Rscript
# scProcessor_1.R
# First processing step. Reads the run settings from out/data.json (written by
# check_seurat.R), then:
#   1a. QC filtering, normalisation, PCA and UMAP
#   1b. optional Harmony batch integration
#   2.  clustering, with automatic choice of resolution when several are given
#   3a. supervised annotation with CHETAH
#   3b. optional copy number calling with copyKAT
#   4.  marker gene plots
#   5.  summary statistics
#   6.  saving temp/harmony.rds and the updated out/data.json
args <- commandArgs(trailingOnly = TRUE) # not read anywhere below; settings come from data.json
data_path <- "out/data.json" # run settings stored by check_seurat.R
cellMarker_path <- "/home/jordi_camps/IMMUcan/TME_markerGenes.xlsx"
chetahClassifier_path <- "/home/jordi_camps/IMMUcan/CHETAH_reference_updatedAnnotation.RData"
uGene_clust <- 20 # a cluster needs at least this many unique significant markers to count as distinct
verbose <- FALSE
for (d in c("temp", "temp/annotation", "out", "out/plots")) {
  if (!dir.exists(d)) {dir.create(d)}
}

# Load packages and set environment
suppressPackageStartupMessages({
  library(Seurat)
  library(SingleCellExperiment)
  library(CHETAH)
  library(harmony)
  library(ggplot2)
  library(patchwork)
  library(Matrix)
  library(dplyr)
  library(openxlsx)
  library(pheatmap)
  library(DescTools)
  library(copykat)
  library(future)
  library(jsonlite)
})

suppressWarnings(RNGkind(sample.kind = "Rounding"))
set.seed(111)
options(future.globals.maxSize = 150000*1024^2)
plan("multisession", workers = 4)

# Report working directory and load settings
print(getwd())
if (!file.exists(data_path)) {stop("first run check_seurat.R")}
data <- fromJSON(data_path)

# Column name Seurat gives to the clustering at a given resolution
res_col <- function(res) paste0("RNA_snn_res.", res)

# Recreate seurat object from raw counts and metadata only

seurat <- readRDS(data$object_path)
seurat <- CreateSeuratObject(counts = seurat[["RNA"]]@counts, meta.data = seurat@meta.data, min.cells = 10, min.features = 200)
if (length(data$batch) > 1) {stop("More than one batch specified, select the correct batch")}
if (!"cluster_resolution" %in% names(data)) {data$cluster_resolution <- seq(from = 0.4, to = 4, by = 0.1)}

# QC

print("STEP 1a: QC")
cells_before_QC <- ncol(seurat)
# vapply keeps one count per column; nested sapply could simplify to a matrix
# when all columns have the same number of unique values
n_unique <- vapply(seurat@meta.data, function(x) length(unique(x)), integer(1))
bad_columns <- colnames(seurat@meta.data)[n_unique == 1]
bad_cols <- paste(bad_columns, collapse = ", ") # collapse (not sep) joins the names into one string
print(paste0("Removing columns with only one value: ", bad_cols))
seurat@meta.data <- seurat@meta.data[, !colnames(seurat@meta.data) %in% bad_columns, drop = FALSE]
#colnames(seurat@meta.data) <- gsub("[[:space:]]|\\/", "_", colnames(seurat@meta.data)) #Clean column names from special characters
seurat[["percent_mt"]] <- PercentageFeatureSet(seurat, pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-")
# Drop any pre-existing metadata column that is identical to the new percent_mt
cols <- colnames(seurat@meta.data)[!colnames(seurat@meta.data) %in% "percent_mt"]
for (i in seq_along(cols)) {
  if (ncol(seurat) == sum(seurat[[cols[i], drop = TRUE]] == seurat$percent_mt, na.rm = TRUE)) {
    print(paste0("Found duplicate mito column, removing ", cols[i]))
    # match on the column NAME; matching on the loop index never removed anything
    seurat@meta.data <- seurat@meta.data[, !colnames(seurat@meta.data) %in% cols[i], drop = FALSE]
  }
}
seurat <- subset(seurat, subset = nFeature_RNA > data$QC_feature_min & percent_mt < data$QC_mt_max)

# copykat can only run on matrix of max 50,000 cells. The indices are drawn
# AFTER QC filtering so that they always refer to cells of the filtered object.
if (ncol(seurat) > 50000) {subsamples <- sample(ncol(seurat), 50000, replace = FALSE)}

if (data$norm == FALSE) {
  seurat <- Seurat::NormalizeData(seurat, verbose = verbose)
} else {
  # counts are already normalised: use them as the data slot
  seurat[["RNA"]]@data <- seurat[["RNA"]]@counts
}
# The Seurat v2 argument pc.genes is dropped; RunPCA is left on its default feature selection
seurat <- suppressWarnings(seurat %>%
  FindVariableFeatures(selection.method = "vst", nfeatures = data$features_var, verbose = verbose) %>%
  ScaleData(verbose = verbose) %>%
  RunPCA(npcs = data$pca_dims+20, verbose = verbose) %>%
  RunUMAP(dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose))

# Harmony

use_batch <- data$batch != FALSE # data$batch is either a metadata column name or FALSE
if (use_batch) {
  print("STEP 1b: INTEGRATING BATCH")
  p0 <- AugmentPlot(DimPlot(seurat, reduction = "umap", group.by = data$batch, pt.size = .1) +
    NoLegend() +
    ggtitle("Before harmony"))
  p1 <- AugmentPlot(DimPlot(object = seurat, reduction = "pca", pt.size = .1, group.by = data$batch) + NoLegend())
  p2 <- AugmentPlot(VlnPlot(object = seurat, features = "PC_1", group.by = data$batch, pt.size = .1) + NoLegend() + theme(plot.title = element_blank()))

  seurat <- suppressWarnings(seurat %>%
    RunHarmony(data$batch, plot_convergence = FALSE, verbose = verbose))

  p3 <- AugmentPlot(DimPlot(object = seurat, reduction = "harmony", pt.size = .1, group.by = data$batch) + NoLegend())
  p4 <- AugmentPlot(VlnPlot(object = seurat, features = "harmony_1", group.by = data$batch, pt.size = .1) + NoLegend() + theme(plot.title = element_blank()))
}

# Dimensionality reduction and clustering
print("STEP 2: CLUSTERING")

# Same steps with and without batch correction; only the input reduction differs
cluster_reduction <- if (use_batch) "harmony" else "pca"
seurat <- seurat %>%
  RunUMAP(reduction = cluster_reduction, dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose) %>%
  RunTSNE(reduction = cluster_reduction, dims = 1:data$pca_dims, check_duplicates = FALSE) %>%
  FindNeighbors(reduction = cluster_reduction, dims = 1:data$pca_dims, verbose = verbose) %>%
  FindClusters(resolution = data$cluster_resolution, verbose = verbose)
if (use_batch) {
  p5 <- AugmentPlot(DimPlot(seurat, reduction = "umap", group.by = data$batch, pt.size = .1) +
    NoLegend() +
    ggtitle("After harmony"))
  p <- (p0 | p5) / (p1 | p3) / (p2 | p4)
  ggsave(plot = p, filename = "out/plots/Harmony.png")
}

# Choose one resolution when several were clustered: walk up the resolutions and
# stop at the first one where newly split clusters lack enough unique markers.
if (length(data$cluster_resolution) > 1) {
  print("Defining optimal cluster resolution")
  if (exists("subsamples")) {
    seurat_sampled <- seurat[, subsamples]
    # fall back to all cells if the subsample leaves a cluster with fewer than 3 cells
    if (any(table(seurat_sampled[[res_col(tail(data$cluster_resolution, n = 1)), drop = TRUE]]) < 3)) {
      seurat_sampled <- seurat
    }
  } else {
    seurat_sampled <- seurat
  }
  clusters <- seurat_sampled@meta.data[, grepl("RNA_snn_res.", colnames(seurat_sampled@meta.data))]
  clusters <- apply(clusters, 2, as.numeric)
  # keep only the first resolution for each distinct highest cluster label
  resolutions <- data$cluster_resolution[!duplicated(apply(clusters, 2, max))]
  n_res <- length(resolutions)
  for (i in seq_along(resolutions)) {
    Idents(seurat_sampled) <- seurat_sampled[[res_col(resolutions[i]), drop = TRUE]]
    #print(paste0("Checking cluster resolution ", resolutions[i]))
    if (i == n_res) {
      # Last candidate reached without a stop. Checked first so that a single
      # remaining resolution also gets its clusters and DE_genes.csv written.
      print(paste0("Optimal cluster resolution: ", resolutions[i], " is max defined, consider increasing resolution range"))
      # make seurat_clusters match the resolution that is reported and stored
      seurat$seurat_clusters <- seurat[[res_col(resolutions[i]), drop = TRUE]]
      data$cluster_resolution <- resolutions[[i]]
      seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
      write.csv(seurat.markers, file = "temp/DE_genes.csv")
      break
    } else if (i == 1) {
      # Baseline: clusters at the lowest resolution that lack enough unique markers
      seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
      seurat.markers.unique <- seurat.markers[!duplicated(seurat.markers$gene) & seurat.markers$p_val_adj < 0.05, ]
      clust_num <- nlevels(seurat.markers$cluster)
      clust_unique <- sum(table(seurat.markers.unique$cluster) >= uGene_clust)
      diff1 <- clust_num - clust_unique
    } else {
      prev_col <- res_col(resolutions[i-1])
      cur_col <- res_col(resolutions[i])
      # rows: previous clusters, columns: current clusters, values: fraction of the previous cluster
      temp <- table(seurat_sampled[[prev_col, drop = TRUE]], seurat_sampled[[cur_col, drop = TRUE]])
      temp2 <- t(apply(temp, 1, function(x) x / sum(x)))
      temp3 <- apply(temp2, 2, function(x) x < .9 & x > 0)
      # current clusters that hold a partial (<90%) share of exactly one previous cluster
      clust_test <- levels(seurat_sampled[[cur_col, drop = TRUE]])[colSums(temp3) == 1]
      seurat.markers <- list()
      for (cl in clust_test) {
        seurat.markers[[cl]] <- FindMarkers(seurat_sampled, ident.1 = cl, ident.2 = NULL, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
      }
      # rbind gives row names "<cluster>.<gene>"; extra = "merge" splits at the
      # first dot only, so gene symbols that contain dots are kept whole
      seurat.markers <- do.call(rbind, seurat.markers) %>%
        as.data.frame() %>%
        tibble::rownames_to_column("row") %>%
        tidyr::separate(row, c("cluster", "gene"), remove = FALSE, sep = "\\.", extra = "merge") %>%
        tibble::column_to_rownames("row")
      seurat.markers.unique <- seurat.markers[!duplicated(seurat.markers$gene) & seurat.markers$p_val_adj < 0.05, ]
      clust_unique <- sum(table(seurat.markers.unique$cluster) >= uGene_clust)
      diff2 <- length(clust_test) - clust_unique
      if (diff2 > diff1) {
        # more poorly supported clusters than at baseline: keep the previous resolution
        print(paste0("Optimal cluster resolution: ", resolutions[i-1]))
        seurat$seurat_clusters <- seurat[[prev_col, drop = TRUE]]
        data$cluster_resolution <- resolutions[[i-1]]
        Idents(seurat_sampled) <- seurat_sampled[[prev_col, drop = TRUE]]
        seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
        write.csv(seurat.markers, file = "temp/DE_genes.csv")
        break
      }
    }
  }
}
seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res.", colnames(seurat@meta.data)), drop = FALSE]
Idents(seurat) <- seurat$seurat_clusters #Set seurat_clusters to Idents

# Supervised annotation

print("STEP 3a: SUPERVISED ANNOTATION")
load(chetahClassifier_path) # presumably provides the `reference` object used below -- confirm
# The UMAP embedding is stored under the name "TSNE" in the SingleCellExperiment
input <- SingleCellExperiment(assays = list(counts = seurat[["RNA"]]@data),
                              reducedDims = SimpleList(TSNE = seurat@reductions$umap@cell.embeddings))
input <- CHETAHclassifier(input = input, ref_cells = reference, n_genes = 500, thresh = 0.05)
p1 <- PlotCHETAH(input, return = TRUE)
#nodes <- c("Node1" = "Immune", "Node2" = "Immune", "Node3" = "Lymphoid", "Node4" = "Lymphoid", "Node5" = "NKT", "Node6" = "T", "Node7" = "T", "Node8" = "Myeloid", "Node9" = "Macro/DC", "Node10"= "Stromal", "Node11" = "Stromal")
#input$celltype_CHETAH <- plyr::revalue(input$celltype_CHETAH, replace = nodes[names(nodes) %in% input$celltype_CHETAH])
seurat@meta.data$annotation_CHETAH <- input$celltype_CHETAH
ggsave(plot = p1, filename = "out/plots/CHETAH_classification.pdf", height = 6, width = 12)

##CHETAH recommendation: most frequent CHETAH label per cluster and its fraction
fraction_chetah <- seurat@meta.data %>%
  group_by(seurat_clusters, annotation_CHETAH) %>%
  tally(name = "nCells_CHETAH") %>%
  mutate(fraction_CHETAH = round(nCells_CHETAH/sum(nCells_CHETAH), digits = 2)) %>%
  select(-nCells_CHETAH) %>%
  arrange(desc(fraction_CHETAH), .by_group = TRUE) %>%
  slice_head(n = 1)

# copyKat

if (data$malignant == TRUE) {
  print("STEP 3b: CALLING COPY NUMBER ABBERATIONS")
  if (exists("subsamples")) {
    seurat_sampled <- seurat[, subsamples]
  } else {
    seurat_sampled <- seurat
  }
  counts <- as.matrix(seurat_sampled[["RNA"]]@counts)

  # normal_cells setting: NA -> Macrophages, FALSE -> no reference cells,
  # otherwise one or more CHETAH labels. Tested so that a vector of labels
  # does not end up as a multi-element condition in if().
  normal_spec <- data$normal_cells
  if (is.null(normal_spec) || all(is.na(normal_spec))) {
    norm_cell_names <- colnames(seurat_sampled)[seurat_sampled$annotation_CHETAH %in% c("Macrophage")]
    print("Running copykat with Macrophages as normal cells")
  } else if (length(normal_spec) == 1 && normal_spec == FALSE) {
    norm_cell_names <- ""
    print("Running copykat without normal cells")
  } else {
    norm_cell_names <- colnames(seurat_sampled)[seurat_sampled$annotation_CHETAH %in% c(normal_spec)]
    print(paste0("Running copykat with ", paste(normal_spec, collapse = ", "), " as normal cells"))
  }
  copykat.test <- copykat(rawmat=counts, id.type="S", ngene.chr=5, win.size=25, KS.cut=0.15, distance="euclidean", norm.cell.names=norm_cell_names, n.cores=4)

  # Attach the per-cell prediction; cells outside the subsample get NA
  pred.test <- data.frame(copykat.test$prediction)
  pred.test <- pred.test[, "copykat.pred", drop = FALSE]
  seurat@meta.data <- seurat@meta.data %>%
    tibble::rownames_to_column("cell") %>%
    left_join(tibble::rownames_to_column(pred.test, "cell"), by = "cell") %>%
    tibble::column_to_rownames("cell")

  p1 <- DimPlot(seurat, group.by = "copykat.pred")
  p2 <- DimPlot(seurat, group.by = "seurat_clusters", label = TRUE) + NoLegend()
  # Exactly one version of the figure is written; previously the EPCAM version
  # was always overwritten by the two-panel plot
  if ("EPCAM" %in% rownames(seurat)) {
    p3 <- FeaturePlot(seurat, features = "EPCAM")
    p <- p1 + p2 + p3
    ggsave(plot = p, filename = "out/plots/copyKat_umap.pdf", height = 5, width = 15)
  } else {
    p <- p1 + p2
    ggsave(plot = p, filename = "out/plots/copyKat_umap.pdf", height = 5, width = 10)
  }

  ##copykat recommendation: most frequent copykat call per cluster
  fraction_copykat <- seurat@meta.data %>%
    group_by(seurat_clusters, copykat.pred) %>%
    tally(name = "nCells_copykat") %>%
    filter(is.na(copykat.pred) == FALSE) %>%
    mutate(fraction_copykat = round(nCells_copykat/sum(nCells_copykat), digits = 2)) %>%
    arrange(desc(fraction_copykat), .by_group = TRUE) %>%
    slice_head(n = 1) %>%
    select(-nCells_copykat, -fraction_copykat)

  annotation <- inner_join(fraction_chetah, fraction_copykat, by = "seurat_clusters")
} else {
  annotation <- fraction_chetah
}

# Suggested abbreviation: the CHETAH label when at least 80% of the cluster
# agrees, overridden by "mal" for clusters called aneuploid by copykat
annotation <- ungroup(annotation)
confident <- annotation$fraction_CHETAH >= .8
annotation$abbreviation <- as.character("")
annotation$abbreviation[confident] <- as.character(annotation$annotation_CHETAH[confident])
if ("copykat.pred" %in% colnames(annotation)) {
  annotation$abbreviation[annotation$copykat.pred %in% "aneuploid"] <- "mal"
}

##Create annotation.xlsx
if (!file.exists("out/annotation.xlsx")) {
  write.xlsx(x = annotation, "out/annotation.xlsx")
} else {
  print("Not overwriting annotation.xlsx, saving as copy")
  write.xlsx(x = annotation, "out/annotation_copy.xlsx")
}

# Plot cell markers

print("STEP 4: CREATING MARKER GENE PLOTS")
cell.markers <- readxl::read_excel(cellMarker_path)
# One gene set per cell type, restricted to genes present in the object
markers <- list()
for (i in as.character(na.omit(unique(cell.markers$cell_type)))) {
  temp <- rownames(seurat)[rownames(seurat) %in% na.omit(cell.markers[cell.markers$cell_type == i, "gene", drop = TRUE])]
  if (length(temp) > 0) {
    markers[[i]] <- temp
  }
}

#Idents(seurat) <- seurat$seurat_clusters #set seurat_clusters as idents
temp <- AddModuleScore(seurat, features = markers)
p <- DotPlot(temp, features = colnames(temp@meta.data)[grepl("Cluster[[:digit:]]", colnames(temp@meta.data))], group.by = "seurat_clusters", cluster.idents = TRUE) + scale_x_discrete(labels = names(markers)) + theme(axis.text.y = element_text(size = 8)) + RotatedAxis()
ggsave(plot = p, filename = "temp/annotation/Dotplot_seuratClusters_geneModules.png", dpi = 300, height = 12, width = 12)

p0 <- DotPlot(seurat, features = unique(cell.markers$gene), group.by = "seurat_clusters", cluster.idents = TRUE) + theme(axis.text.y = element_text(size = 8)) + coord_flip()
ggsave(plot = p0, filename = "temp/annotation/Dotplot_seuratClusters_genes.png", dpi = 300, height = 12, width = 12)

# One overview figure per marker category: cluster UMAP, feature plots, dot plot
p1 <- AugmentPlot(DimPlot(seurat, label = TRUE, label.size = 8))
cell.markers <- cell.markers[cell.markers$gene %in% rownames(seurat), ]
layout <- "
ACC
BBB
BBB
"
for (type in unique(cell.markers$category)) {
  type_markers <- unique(cell.markers[cell.markers$category == type, ]$gene)
  if (length(type_markers) >= 6) {
    p2 <- FeaturePlot(seurat, features = type_markers, pt.size = .1, ncol = 6)
  } else {
    p2 <- FeaturePlot(seurat, features = type_markers, pt.size = .1)
  }
  p3 <- DotPlot(seurat, features = type_markers, group.by = "seurat_clusters", cluster.idents = TRUE) + theme(axis.text.y = element_text(size = 8)) + coord_flip() + NoLegend()
  p <- p1 + p2 + p3 + plot_layout(design = layout)
  ggsave(plot = p, filename = paste0("temp/annotation/", type, ".png"), height = 20, width = 30, dpi = 300)
}

# Fraction of each cluster assigned to each CHETAH label
temp <- table(seurat$seurat_clusters, seurat$annotation_CHETAH)
temp <- apply(temp, 1, function(x) x / sum(x))
pheatmap::pheatmap(temp, filename = "temp/annotation/cluster_comparison.pdf")

# Summary statistics

print("STEP 5: CREATING SUMMARY STATISTICS")
harmony_summary <- data.frame(
  "Input_file" = data$object_path,
  "Batch" = data$batch,
  "QC_features_min" = data$QC_feature_min,
  "QC_mito_max" = data$QC_mt_max,
  "Variable_features" = data$features_var,
  "PCA_dimensions" = data$pca_dims,
  "Amount_genes" = nrow(seurat),
  "Genes_detected_per_cell" = median(seurat@meta.data$nFeature_RNA),
  "Cells_before_QC" = cells_before_QC,
  "Cells_after_QC" = ncol(seurat),
  "Cluster_resolution" = data$cluster_resolution
)
seurat@misc <- list(harmony_summary)

# Save RDS and the updated settings (cluster_resolution now holds the chosen value)

print("STEP 6: SAVING RESULTS")
saveRDS(seurat, paste0("temp/harmony.rds"))
data <- toJSON(data)
write(data, "out/data.json")
print("ALL DONE")
#!/usr/bin/env Rscript
# scProcessor_2.R
# Second processing step. Takes temp/harmony.rds (from scProcessor_1.R) and the
# curated out/annotation.xlsx, then:
#   1. links the cluster abbreviations to the cell ontology table
#   2. calculates marker genes per annotation level
#   3. calculates a gene entropy ranking per annotation level
#   4. exports the object, average expression, metadata and h5ad files
#   5. zips the out/ folder and adds an md5 checksum to the archive name
args <- commandArgs(trailingOnly = TRUE) # not read anywhere below
object_path <- "temp/harmony.rds" #harmony.rds file
if (file.exists("out/annotation.xlsx")) {
  annotationFile_path <- "out/annotation.xlsx" #path to annotation file
} else {
  annotationFile_path <- "out/annotation.xls" #path to annotation file
}
cellOntology_path <- "/home/jordi_camps/IMMUcan/cell_ontology.xlsx"
verbose <- FALSE
for (d in c("temp", "temp/plots", "out", "out/plots")) {
  if (!dir.exists(d)) {dir.create(d)}
}

dir <- getwd() # also used for the archive name in step 5
print(dir)
if (!file.exists(object_path)) {
  stop("first run scProcessor_1.R")
}
suppressPackageStartupMessages({
  library(sceasy)
  library(reticulate)
  use_condaenv('sceasy')
  loompy <- reticulate::import('loompy')
  library(Seurat)
  library(SeuratDisk)
  library(ggplot2)
  library(patchwork)
  library(Matrix)
  library(dplyr)
  library(genesorteR)
  library(data.table)
  library(future)
  library(jsonlite)
})
suppressWarnings(RNGkind(sample.kind = "Rounding"))
set.seed(111)
options(future.globals.maxSize = 150000*1024^2)
# "multiprocess" is deprecated in the future package; use the same backend as scProcessor_1.R
plan("multisession", workers = 12)
seurat <- readRDS(object_path)
data <- fromJSON("out/data.json")
# Cell indices for the down-sampled export; only defined when the object is larger than nSample
n_sample <- data$nSample
if (!is.null(n_sample) && !is.na(n_sample) && ncol(seurat) > n_sample) {
  subsamples <- sample(ncol(seurat), n_sample, replace = FALSE)
}

#makeReference, takes a Seurat Object and name of meta data column that contains the clusters. Returns a ranking of genes.
# seuratObj: Seurat object with an RNA assay
# groupBy:   name of the meta data column holding the grouping
# Returns the gene Shannon index from genesorteR::getMarkers, sorted in increasing order.
makeReference = function(seuratObj, groupBy) {
  groupBy = which(colnames(seuratObj@meta.data) == groupBy)
  gs = sortGenes(seuratObj@assays$RNA@counts, factor(seuratObj@meta.data[,groupBy]), binarizeMethod = "naive", cores = 12)
  pp = getPValues(gs, numPerm = 5, cores = 1)
  # TRUE for genes with an adjusted p-value below 0.1 in at least one group
  pp = apply(pp$adjpval, 1, function(x) any(x < 0.1))
  mm = getMarkers(gs)
  ref = mm$gene_shannon_index
  # NOTE(review): ref[[2]] is a single element, so max() is a no-op here and genes
  # without a significant group get the value of the second gene. Possibly max(ref)
  # was intended -- confirm before changing.
  ref[!pp] = max(ref[[2]])
  return(sort(ref, decreasing = FALSE))
}

# Annotate

print("STEP 1: LINKING CELL ONTOLOGY")
anno_clust <- readxl::read_excel(annotationFile_path)
new.cluster.ids <- tolower(anno_clust$abbreviation)
# Map abbreviations to clusters by the cluster id stored in the sheet, so that a
# re-sorted annotation file cannot shift labels; fall back to row order only
# when the sheet has no seurat_clusters column.
if ("seurat_clusters" %in% colnames(anno_clust)) {
  names(new.cluster.ids) <- as.character(anno_clust$seurat_clusters)
} else {
  if (length(new.cluster.ids) != length(levels(seurat))) {
    stop("Number of rows in annotation file does not match the number of clusters")
  }
  names(new.cluster.ids) <- levels(seurat)
}
seurat <- RenameIdents(seurat, new.cluster.ids)
seurat$abbreviation <- Idents(seurat)
cell_ont <- readxl::read_excel(cellOntology_path)
cell_ont$abbreviation <- tolower(cell_ont$abbreviation)
# Add the ontology columns to every cell via its abbreviation
seurat@meta.data <- seurat@meta.data %>%
  tibble::rownames_to_column("cell") %>%
  left_join(cell_ont, by = "abbreviation") %>%
  tibble::column_to_rownames("cell")
if (any(is.na(seurat$cell_ontology)) == TRUE) {
  # show which abbreviations have no match in the ontology table
  print(distinct(seurat@meta.data[is.na(seurat$cell_ontology), c("abbreviation", "cell_ontology")]))
  stop("NOT ALL CELL TYPE ABBREVIATIONS FIT")
}
Idents(seurat) <- seurat$seurat_clusters

# Remove annotations with 10 cells or fewer (set to NA)
for (i in data$annotation) {
  temp <- names(table(seurat@meta.data[[i]]))[table(seurat@meta.data[[i]]) <= 10]
  seurat@meta.data[seurat@meta.data[[i]] %in% temp, i] <- NA
}

# Plotting

# Annotation columns: feature plot for numeric columns, otherwise a labelled
# UMAP (Tableau 20 palette when there are at most 20 groups)
temp <- colnames(seurat@meta.data)[tolower(colnames(seurat@meta.data)) %in% tolower(data$annotation)]
for (i in temp) {
  if (is.numeric(seurat@meta.data[[i]]) == TRUE) {
    p <- FeaturePlot(seurat, features = i, reduction = "umap")
  } else if (length(unique(seurat@meta.data[[i]])) <= 20) {
    p <- DimPlot(seurat, reduction = "umap", pt.size = 1, group.by = i, label = TRUE) + ggthemes::scale_color_tableau(palette = "Tableau 20")
  } else {
    p <- DimPlot(seurat, reduction = "umap", pt.size = 1, group.by = i, label = TRUE)
  }
  ggsave(plot = p, filename = paste0("out/plots/", i, ".png"), dpi = 300, width = 10, height = 10)
}

# Metadata columns: composition of every cell ontology class
temp <- colnames(seurat@meta.data)[tolower(colnames(seurat@meta.data)) %in% tolower(data$metadata)]
for (i in temp) {
  p <- ggplot(seurat@meta.data, aes_string(x = "cell_ontology", fill = i)) + geom_bar(position = "fill") + RotatedAxis()
  ggsave(plot = p, filename = paste0("out/plots/", i, ".png"), dpi = 300, width = 10, height = 10)
}

# DE
print("STEP 2: CALCULATING MARKER GENES")
#if (exists("subsamples")) {
#  seurat_sampled <- seurat[, subsamples]
#} else {
  seurat_sampled <- seurat
#}
annoCounts <- list()
for (i in data$annotation) {
  Idents(seurat_sampled) <- seurat_sampled[[i, drop = TRUE]]
  seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
  write.table(seurat.markers, paste0("out/DE_", i, ".tsv"), sep = "\t")
  temp <- table(seurat[[i]])
  annoCounts[[i]] <- data.frame(annotation = i, cell_type = temp)
}
# Written once after the loop; the file content is the same as rewriting it every iteration
write.csv(data.table::rbindlist(annoCounts), file = "out/cell_count.csv", row.names = FALSE)

# Gene entropy ranking
print("STEP 3: CALCULATING GENE ENTROPY RANKING")
geneIndex <- list()
for (i in data$annotation) {
  # a ranking needs at least two groups
  if (length(unique(seurat_sampled@meta.data[[i]])) > 1) {
    geneIndex[[i]] <- makeReference(seuratObj = seurat_sampled, groupBy = i)
  }
}
# NOTE(review): every vector is sorted by its own values, and cbind aligns by
# position, so row names come from the first annotation only -- confirm that
# rows are meant to be ranks rather than genes.
geneIndex <- do.call(cbind, geneIndex)
write.table(geneIndex, "out/gene_index.tsv", row.names = TRUE, sep = "\t")


# Export
print("STEP 4: SAVING RESULTS")
#Seurat
saveRDS(seurat, "out/harmony.rds")
#SaveH5Seurat(seurat, filename = "out/harmony.h5Seurat", overwrite = TRUE)
#Convert("out/harmony.h5Seurat", dest = "h5ad", overwrite = TRUE)

# Export average gene expression per group for each annotation level
#write.csv(x = seurat[["RNA"]]@data, file = "out/normCounts.csv", row.names = TRUE)
for (level in c("major", "immune", "minor", "CHETAH")) {
  Idents(seurat) <- seurat[[paste0("annotation_", level), drop = TRUE]]
  temp <- AverageExpression(seurat, assays = "RNA")
  write.table(x = temp$RNA, file = paste0("out/avgExpr_", level, ".tsv"), row.names = TRUE, sep = "\t")
}

Idents(seurat) <- seurat$annotation_minor
# Remove all columns that have only one value
n_unique <- vapply(seurat@meta.data, function(x) length(unique(x)), integer(1))
seurat@meta.data <- seurat@meta.data[, n_unique != 1, drop = FALSE]
# Remove helper columns that should not be exported
seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res|abbreviation|cell_id|cell.id", colnames(seurat@meta.data)), drop = FALSE]

# Convert to h5ad with sceasy for immediate use with cellxgene
sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene.h5ad")

# Export metadata with umap coordinates
write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata.tsv", row.names = TRUE, sep = "\t")

# Subsample object to data$nSample cells (output files keep the "_10k" suffix)
if (exists("subsamples")) {seurat <- seurat[, subsamples]}

# Export metadata with umap coordinates
write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata_10k.tsv", row.names = TRUE, sep = "\t")

# Convert to h5ad with sceasy for immediate use with cellxgene
sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene_10k.h5ad")

#zip and checksum
print("STEP 5: ZIP AND CHECKSUM")
# Copy out/ into a folder named after the working directory, zip it, put the
# md5 checksum into the archive name, move the archive one level up and clean up
folder_name <- tail(unlist(strsplit(dir, "/")), n=1)
dir.create(folder_name)
out_files <- paste0("out/", list.files("out/"))
file.copy(out_files, folder_name, recursive = TRUE)
zip_name <- paste0(folder_name, ".zip")
zip(zip_name, folder_name)
checksum <- tools::md5sum(zip_name)
final_name <- paste0(folder_name, "_-_", checksum, ".zip")
file.rename(zip_name, final_name)
file.copy(final_name, "../")
unlink(final_name)
unlink(folder_name, recursive = TRUE)
--- 2 | title: "Global stats IMMUcan database" 3 | output: html_document 4 | --- 5 | 6 | ```{r, setup} 7 | knitr::opts_knit$set(root.dir = "/Users/jordicamps/OneDrive - Bayer/IMMUcan/WP7/D1 - scRNAseq database/") 8 | ``` 9 | 10 | 11 | ```{r, root.dir = TRUE} 12 | library(dplyr) 13 | library(readxl) 14 | library(ggplot2) 15 | library(patchwork) 16 | library(stringr) 17 | library(tidyr) 18 | options(scipen = 999) 19 | ``` 20 | 21 | ```{r} 22 | theme_jc_vert <- theme(panel.border = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_text(colour = "black")) 23 | theme_jc_hor <- theme(panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_text(colour = "black")) 24 | ``` 25 | 26 | ```{r} 27 | ScaleDiscretePositionFunc <- ggproto( 28 | "ScaleDiscretePositionReversed", ScaleDiscretePosition, 29 | get_limits = function(self) { 30 | if (self$is_empty()) { 31 | c(0, 1) 32 | } else if (is.null(self$limits)) { 33 | self$range$range 34 | } else if (is.function(self$limits)) { 35 | self$limits(self$range$range) 36 | } else { 37 | integer(0) 38 | } 39 | } 40 | ) 41 | 42 | scale_x_discrete2 <- function(..., expand = waiver(), position = "bottom") { 43 | sc <- discrete_scale(c("x", "xmin", "xmax", "xend"), "position_d", identity, ..., 44 | expand = expand, guide = "none", position = position, super = ScaleDiscretePositionFunc) 45 | 46 | sc$range_c <- ggplot2:::continuous_range() 47 | sc 48 | } 49 | ``` 50 | 51 | ```{r} 52 | df <- read_excel("15052020_scRNAseq_database.xlsx", n_max = 75) 53 | df 54 | ``` 55 | 56 | ```{r} 57 | df$`Cancer localization` <- tolower(df$`Cancer localization`) 58 | df$`Library construction` <- tolower(df$`Library construction`) 59 | df$cells_tenfive <- df$`Cell amount` / 100000 60 | ``` 61 | 62 | ```{r 
fig.height=4, fig.width=5}
# Lollipop base plot per library-construction technology: stem height is the
# summed patient count, point colour the summed cells_tenfive, and point size
# the number of rows of df in the group (labelled "Datasets" further down).
tech <- df %>%
  group_by(`Library construction`) %>%
  summarise(
    n_patient = sum(`Number of Patients`, na.rm = TRUE),
    n_cells = sum(cells_tenfive, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = reorder(`Library construction`, -n_patient), y = n_patient)) +
  geom_segment(aes(xend = `Library construction`, yend = 0)) +
  geom_point(aes(colour = n_cells, size = n)) +
  scale_size_continuous(range = c(2, 8))
```

```{r}
# Flipped variant of the technology plot, legend on the right.
# theme_jc_vert is not defined in this part of the document.
tech_vert <- tech +
  coord_flip() +
  scale_y_continuous("Patients") +
  scale_x_discrete("Technology", limits = rev) +
  scale_color_continuous(type = "viridis") +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(color = guide_colorbar(barwidth = 1, barheight = 3, ticks = FALSE)) +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  theme_jc_vert +
  ggtitle("Technology")
tech_vert
```

```{r}
# Unflipped variant: y axis fixed to 0-400 patients, legend at the bottom,
# x labels rotated by 45 degrees.
tech_hor <- tech +
  scale_y_continuous("Patients", limits = c(0, 400)) +
  scale_x_discrete("Technology") +
  scale_color_continuous(type = "viridis") +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(
    color = guide_colorbar(
      barwidth = 0.5,
      barheight = 3,
      ticks = FALSE,
      direction = "vertical",
      title.position = "left"
    ),
    size = guide_legend(ncol = 1)
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8)
  ) +
  theme_jc_hor +
  ggtitle("Technology")
tech_hor
```


```{r fig.height=3, fig.width=5}
# Same lollipop base plot, grouped by treatment type.
treat <- df %>%
  group_by(`Treatment type`) %>%
  #tally(name = "Count") %>%
  summarise(
    n_patient = sum(`Number of Patients`, na.rm = TRUE),
    n_cells = sum(cells_tenfive, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = reorder(`Treatment type`, -n_patient), y = n_patient)) +
  geom_segment(aes(xend = `Treatment type`, yend = 0)) +
  geom_point(aes(colour = n_cells, size = n)) +
  scale_size_continuous(range = c(2, 8))
```

```{r}
# Flipped variant of the treatment plot, legend on the right.
treat_vert <- treat +
  coord_flip() +
  scale_y_continuous("Patients") +
  scale_x_discrete("Treatment type", limits = rev) +
  scale_color_continuous(type = "viridis") +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(color = guide_colorbar(barwidth = 1, barheight = 3, ticks = FALSE)) +
  theme_bw() +
  theme(
    legend.position = "right",
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle("Treatment") +
  theme_jc_vert
treat_vert
```

```{r}
# Unflipped variant of the treatment plot, legend at the bottom.
treat_hor <- treat +
  scale_y_continuous("Patients", limits = c(0, 400)) +
  scale_x_discrete("Treatment type") +
  scale_color_continuous(type = "viridis") +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(
    color = guide_colorbar(
      barwidth = 0.5,
      barheight = 3,
      ticks = FALSE,
      direction = "vertical",
      title.position = "left"
    ),
    size = guide_legend(ncol = 1)
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 8)
  ) +
  ggtitle("Treatment") +
  theme_jc_hor
treat_hor
```


```{r}
# Same lollipop base plot, grouped by cancer type abbreviation.
cancer <- df %>%
  group_by(`Cancer type abbreviation`) %>%
  summarise(
    n_patient = sum(`Number of Patients`, na.rm = TRUE),
    n_cells = sum(cells_tenfive, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = reorder(`Cancer type abbreviation`, -n_patient), y = n_patient)) +
  geom_segment(aes(xend = `Cancer type abbreviation`, yend = 0)) +
  geom_point(aes(colour = n_cells, size = n)) +
  scale_size_continuous(range = c(2, 8))
```

```{r}
# Flipped variant of the cancer-type plot. Unlike the technology and treatment
# plots, the local theme() call comes after theme_jc_vert here, so its
# settings take precedence over those of theme_jc_vert.
cancer_vert <- cancer +
  coord_flip() +
  scale_y_continuous("Patients") +
  scale_x_discrete("Cancer type", limits = rev) +
  viridis::scale_color_viridis(option = "viridis") +
  #scale_size(breaks = c(3, 6, 9)) +
  theme_bw() +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) +
  theme_jc_vert +
  theme(
    axis.title.y = element_blank(),
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  ) +
  ggtitle("Cancer type")
cancer_vert
```

```{r fig.height=4.5, fig.width=9}
# Unflipped variant of the cancer-type plot, written to a PDF. ggsave() is
# called without `plot =`, so it saves the most recently created/shown plot.
# NOTE(review): both axis titles are blanked here, which hides the "Patients"
# title that the other horizontal plots keep - confirm this is intended.
cancer_hor <- cancer +
  scale_y_continuous("Patients") +
  scale_x_discrete("Cancer type") +
  viridis::scale_color_viridis(option = "viridis") +
  #scale_size(breaks = c(3, 6, 9)) +
  theme_bw() +
  labs(size = "Datasets", color = "Cells per\n100 000") +
  guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) +
  theme_jc_hor +
  theme(
    axis.title.y = element_blank(),
    legend.position = "right",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )
  #ggtitle("Cancer type")
cancer_hor
ggsave("plot_cancer_horizontal.pdf", dpi = 300)
```


```{r fig.height=4, fig.width=8}
# Same lollipop base plot, grouped by enriched cell type.
cell_type <- df %>%
  group_by(`Enrichment cell types`) %>%
  summarise(
    n_patient = sum(`Number of Patients`, na.rm = TRUE),
    n_cells = sum(cells_tenfive, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(x = reorder(`Enrichment cell types`, -n_patient), y = n_patient)) +
  geom_segment(aes(xend = `Enrichment cell types`, yend = 0)) +
  geom_point(aes(colour = n_cells, size = n)) +
  scale_size_continuous(range = c(2, 8))
```

```{r}
# Flipped variant of the cell-type plot.
cell_type_vert <-
cell_type + 201 | coord_flip() + 202 | scale_y_continuous("Patients") + 203 | scale_x_discrete("Cell types", limits = rev) + 204 | scale_color_continuous(type = "viridis") + 205 | labs(size="Datasets", color="Cells per\n100 000") + 206 | guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) + 207 | theme_bw() + 208 | theme(axis.title.y = element_blank(), plot.title = element_text(hjust = 0.5)) + 209 | theme_jc_vert + 210 | ggtitle("Cell type enrichment") 211 | cell_type_vert 212 | ``` 213 | 214 | ```{r} 215 | cell_type_hor <- cell_type + 216 | scale_y_continuous("Patients", limits = c(0, 400)) + 217 | scale_x_discrete("Cell types") + 218 | scale_color_continuous(type = "viridis") + 219 | labs(size="Datasets", color="Cells") + 220 | guides(color = guide_colorbar(barwidth = .5, barheight = 3, ticks = FALSE, direction = "vertical", title.position = "left"), 221 | size = guide_legend(ncol = 1)) + 222 | theme_bw() + 223 | theme(axis.title.x = element_blank(), plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom", legend.title = element_text(size = 8), legend.text = element_text(size = 8)) + 224 | theme_jc_hor + 225 | ggtitle("Cell type enrichment") 226 | cell_type_hor 227 | ``` 228 | 229 | ## Plots patchwork 230 | ```{r} 231 | layout <- " 232 | AABBDD 233 | AABBDD 234 | AACCDD 235 | " 236 | ``` 237 | 238 | ```{r fig.height=6, fig.width=12} 239 | cancer_vert + cell_type_vert + treat_vert + tech_vert + 240 | plot_layout(design = layout) 241 | ggsave("TME_table_quant.pdf", dpi = 300) 242 | ``` 243 | 244 | #Plots Jasna 245 | ```{r} 246 | layout <- " 247 | AABCC 248 | " 249 | ``` 250 | 251 | ```{r fig.height=5, fig.width=10} 252 | cell_type_hor + treat_hor + tech_hor + 253 | plot_layout(design = layout) 254 | ggsave("plot_cellType_treatment_tech_horizontal.pdf", dpi = 300) 255 | ``` 256 | 257 | 258 | 259 | ```{r} 260 | sum(df$`Number of Patients`, na.rm = TRUE) 261 | sum(df$`Cell amount`, 
na.rm = TRUE) 262 | ``` 263 | 264 | -------------------------------------------------------------------------------- /tidy_metadata.R: -------------------------------------------------------------------------------- 1 | dir <- getwd() 2 | setwd(dir) 3 | print(dir) 4 | 5 | suppressPackageStartupMessages({ 6 | library(sceasy) 7 | library(reticulate) 8 | use_condaenv('sceasy') 9 | loompy <- reticulate::import('loompy') 10 | library(Seurat) 11 | library(tidyverse) 12 | library(readxl) 13 | library(jsonlite) 14 | }) 15 | 16 | tidy_metadata_path <- "/home/jordi_camps/IMMUcan/tidy_metadata.xlsx" 17 | seurat_obj <- "out/harmony.rds" 18 | data <- fromJSON("out/data.json") 19 | 20 | meta_cols <- read_excel(tidy_metadata_path) 21 | seurat <- readRDS(seurat_obj) 22 | if (!is.na(data$nSample) & ncol(seurat) > data$nSample) {subsamples <- sample(ncol(seurat), data$nSample, replace = FALSE)} 23 | #colnames(seurat@meta.data) <- tolower(colnames(seurat@meta.data)) 24 | glimpse(seurat@meta.data) 25 | 26 | #Clean metadata column names 27 | names <- tolower(colnames(seurat@meta.data)) 28 | names <- gsub("\\.", "_", names) 29 | 30 | if (any(meta_cols$col_names %in% names)) { 31 | change_cols <- names[names %in% meta_cols$col_names] 32 | for (i in change_cols) { 33 | clean = FALSE 34 | hit_1 <- grepl(i, names) 35 | hit_2 <- meta_cols$col_names %in% i 36 | print(paste0("changing ", i, " to ", meta_cols$general[hit_2])) 37 | colnames(seurat@meta.data)[hit_1] <- meta_cols$general[hit_2] 38 | } 39 | } else { 40 | clean = TRUE 41 | print("No meta.data columns to tidy up") 42 | } 43 | 44 | print("Updating data.json") 45 | names <- tolower(data$metadata) 46 | names <- gsub("\\.", "_", names) 47 | if (isFALSE(clean)) { 48 | change_cols <- names[names %in% meta_cols$col_names] 49 | for (i in change_cols) { 50 | hit_1 <- grepl(i, names) 51 | hit_2 <- meta_cols$col_names %in% i 52 | data$metadata[hit_1] <- meta_cols$general[hit_2] 53 | } 54 | names <- tolower(data$annotation) 55 | names <- 
gsub("\\.", "_", names) 56 | change_cols <- names[names %in% meta_cols$col_names] 57 | for (i in change_cols) { 58 | hit_1 <- grepl(i, names) 59 | hit_2 <- meta_cols$col_names %in% i 60 | data$annotation[hit_1] <- meta_cols$general[hit_2] 61 | } 62 | data <- toJSON(data) 63 | write(data, "out/data.json") 64 | } 65 | 66 | print("Saving objects") 67 | Idents(seurat) <- seurat$annotation_minor 68 | saveRDS(seurat, "out/harmony.rds") 69 | seurat@meta.data <- seurat@meta.data[, sapply(sapply(seurat@meta.data, unique), length) != 1, drop = FALSE] #Remove all columns that have only one variable 70 | seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res|abbreviation|cell_id|cell.id|cell_id", colnames(seurat@meta.data))] 71 | 72 | # Convert to h5ad with sceasy for immediate use with cellxgene 73 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene.h5ad") 74 | 75 | # Export metadata with umap coordinates 76 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata.tsv", row.names = TRUE, sep = "\t") 77 | 78 | #Subsample object to 10k cells 79 | if (exists("subsamples")) {seurat <- seurat[, subsamples]} 80 | 81 | # Export metadata with umap coordinates 82 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata_10k.tsv", row.names = TRUE, sep = "\t") 83 | 84 | # Convert to h5ad with sceasy for immediate use with cellxgene 85 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene_10k.h5ad") 86 | 87 | #zip and checksum 88 | print("STEP 5: ZIP AND CHECKSUM") 89 | folder_name <- tail(unlist(strsplit(dir, "/")), n=1) 90 | dir.create(folder_name) 91 | out_files <- paste0("out/", list.files("out/")) 92 | file.copy(out_files, folder_name, recursive = TRUE) 93 | zip(paste0(folder_name, ".zip"), folder_name) 94 | checksum <- tools::md5sum(paste0(folder_name, ".zip")) 95 | file.rename(paste0(folder_name, ".zip"), 
paste0(folder_name, "_-_", checksum, ".zip")) 96 | file.copy(paste0(folder_name, "_-_", checksum, ".zip"), "../") 97 | unlink(paste0(folder_name, "_-_", checksum, ".zip")) 98 | unlink(folder_name, recursive = TRUE) -------------------------------------------------------------------------------- /tidy_metadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/tidy_metadata.xlsx -------------------------------------------------------------------------------- /zip_checksum.R: -------------------------------------------------------------------------------- 1 | dir <- getwd() 2 | setwd(dir) 3 | print(dir) 4 | 5 | # Create zip and checksum from out folder 6 | folder_name <- tail(unlist(strsplit(dir, "/")), n=1) 7 | dir.create(folder_name) 8 | out_files <- paste0("out/", list.files("out/")) 9 | file.copy(out_files, folder_name, recursive = TRUE) 10 | zip(paste0(folder_name, ".zip"), folder_name) 11 | checksum <- tools::md5sum(paste0(folder_name, ".zip")) 12 | file.rename(paste0(folder_name, ".zip"), paste0(folder_name, "_-_", checksum, ".zip")) 13 | file.copy(paste0(folder_name, "_-_", checksum, ".zip"), "../") 14 | unlink(paste0(folder_name, "_-_", checksum, ".zip")) 15 | unlink(folder_name, recursive = TRUE) --------------------------------------------------------------------------------