├── .gitignore
├── 10x_dir_to_files.py
├── 10x_files_to_dir.py
├── CHETAH_referenceDataset.Rmd
├── CHETAH_referenceDataset.html
├── Convert_seurat_to_anndata.R
├── IMMUcan.Rproj
├── README.md
├── Rename_meta.data.Rmd
├── TME_markerGenes.xlsx
├── annotate.R
├── annotation_CHETAH.xlsx
├── cell_ontology.xlsx
├── check_seurat.R
├── create_seurat_fromCSV.R
├── data_example.json
├── scProcessor_1.R
├── scProcessor_1.sh
├── scProcessor_2.R
├── scProcessor_2.sh
├── scRNA_seq_database_summary_stat.Rmd
├── tidy_metadata.R
├── tidy_metadata.xlsx
└── zip_checksum.R


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/10x_dir_to_files.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import logging
 3 | import glob 
 4 | import os 
 5 | import re 
 6 | import shutil
 7 | import sys
 8 | 
 9 | logging.basicConfig()
10 | log = logging.getLogger()
11 | log.setLevel(logging.DEBUG)
12 | 
13 | directories_submitted_as_commandline_arguments = sys.argv[1:]
14 | for directory_to_act_on in directories_submitted_as_commandline_arguments:
15 |     log.debug(f"Acting on directory {directory_to_act_on}...")
16 |     assert(not "/" in directory_to_act_on)
17 | 
18 |     all_filenames = glob.glob(f"{directory_to_act_on}/*")
19 |     log.debug(f"Found these files: {all_filenames}")
20 | 
21 |     dir_name = os.path.split(os.path.abspath(directory_to_act_on))[1]
22 |     separator = "_"
23 | 
24 |     for filename in all_filenames:
25 |         log.debug(f"Moving {filename}...")
26 |         cropped_filename = filename[len(directory_to_act_on)+1:]
27 |         shutil.move(filename, dir_name+separator+cropped_filename)
28 | 
29 |     os.rmdir(directory_to_act_on)
30 | 


--------------------------------------------------------------------------------
/10x_files_to_dir.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import logging
 3 | import glob 
 4 | import os 
 5 | import re 
 6 | import shutil
 7 | 
 8 | logging.basicConfig()
 9 | log = logging.getLogger()
10 | log.setLevel(logging.DEBUG)
11 | 
12 | log.debug("Searching for matrix files...")
13 | 
14 | all_matrix_filenames = glob.glob("*matrix.mtx")
15 | log.debug(f"Found these matrix files: {all_matrix_filenames}")
16 | 
17 | for matrix_filename in all_matrix_filenames:
18 |     m = re.search("^(.*)matrix.mtx$", matrix_filename)
19 |     assert(m is not None) ## should find a hit
20 | 
21 |     hits = m.groups()
22 |     assert(len(hits)==1) ## should be exactly one hit
23 | 
24 |     first_part = hits[0]
25 | 
26 |     log.debug(f"Creating directory {first_part}")
27 |     os.makedirs(first_part)
28 | 
29 |     all_files_like_this = glob.glob(f"{first_part}*.*")
30 |     log.debug(f"Will move the following files: {all_files_like_this}")
31 | 
32 |     for filename in all_files_like_this:
33 |         cropped_filename = filename[len(first_part):]
34 |         shutil.move(filename, first_part + "/" + cropped_filename)
35 | 


--------------------------------------------------------------------------------
/CHETAH_referenceDataset.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "training dataset CHETAH"
 3 | output: html_document
 4 | ---
 5 | 
 6 | ```{r}
 7 | library(Seurat)
 8 | library(dplyr)
 9 | library(ggplot2)
10 | library(SingleCellExperiment)
11 | load("CHETAH_TME_reference.Rdata") # the chunks below use an object named `reference`; presumably it is restored by this load() - TODO confirm
12 | ```
13 | 
14 | ```{r}
15 | reference
16 | ```
17 | 
18 | ```{r}
19 | logcounts(reference) <- counts(reference) # copy the counts assay into logcounts as-is (no transformation is applied here)
20 | reference
21 | ```
22 | 
23 | ```{r}
24 | seurat <- as.Seurat(reference)
25 | seurat
26 | ```
27 | 
28 | ```{r}
29 | seurat <- seurat %>% 
30 |   FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
31 |   ScaleData() %>% 
32 |   RunPCA(npcs = 50) %>%
33 |   RunUMAP(dims = 1:30, a = .5, b = 1.2) %>%
34 |   FindNeighbors(dims = 1:30) %>% 
35 |   FindClusters(resolution = 1.2)
36 | ```
37 | 
38 | ```{r fig.height=5, fig.width=10}
39 | #Idents(seurat) <- seurat$seurat_clusters
40 | DimPlot(seurat, label = TRUE)
41 | DimPlot(seurat, group.by = c("seurat_clusters", "celltypes"), label = TRUE) + NoLegend()
42 | ```
43 | 
44 | ```{r fig.height=12, fig.width=12}
45 | markers <- readxl::read_excel("TME_markerGenes.xlsx")
46 | DotPlot(seurat, features = unique(markers$gene), cluster.idents = TRUE) + coord_flip() + NoLegend()
47 | ```
48 | 
49 | ```{r}
50 | anno_clust <- readxl::read_excel("annotation_CHETAH.xlsx")
51 | anno_clust <- arrange(anno_clust, seurat_clusters) # sort rows by cluster; assumes one row per identity level, in the order of levels(seurat) - TODO confirm
52 | new.cluster.ids <- tolower(anno_clust$abbreviation)
53 | names(new.cluster.ids) <- levels(seurat) # old identity -> new (lower-cased) abbreviation, paired by position
54 | seurat <- RenameIdents(seurat, new.cluster.ids)
55 | seurat$abbreviation <- Idents(seurat)
56 | cell_ont <- readxl::read_excel("cell_ontology.xlsx")
57 | cell_ont$abbreviation <- tolower(cell_ont$abbreviation) # lower-case the join key to match the lower-cased abbreviations above
58 | seurat@meta.data <- seurat@meta.data %>%
59 |   tibble::rownames_to_column("cell") %>% # keep cell names in a column across the join
60 |   left_join(cell_ont, by = "abbreviation") %>%
61 |   tibble::column_to_rownames("cell") # restore cell names as row names
62 | Idents(seurat) <- seurat$cell_ontology
63 | ```
64 | 
65 | ```{r}
66 | seurat <- seurat[, !grepl("doublets", seurat@meta.data$abbreviation)] # drop cells whose abbreviation contains "doublets"
67 | seurat$celltypes <- seurat$cell_ontology # overwrite celltypes with the linked cell_ontology labels
68 | seurat
69 | ```
70 | 
71 | ```{r}
72 | DimPlot(seurat, label = TRUE)
73 | ```
74 | 
75 | ```{r}
76 | reference <- as.SingleCellExperiment(seurat)
77 | reference
78 | ```
79 | 
80 | ```{r}
81 | save(reference, file = "CHETAH_reference_updatedAnnotation.RData") # file name matches the reference file the README asks users to obtain
82 | ```
83 | 
84 | 


--------------------------------------------------------------------------------
/Convert_seurat_to_anndata.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Convert a Seurat object stored as .rds into an .h5ad (AnnData) file written
 3 | # next to the input. Usage: Rscript Convert_seurat_to_anndata.R <object.rds>
 4 | args = commandArgs(trailingOnly=TRUE)
 5 | object_path = args[1]
 6 | if (is.na(object_path)) {
 7 |   stop("Specify the path to the seurat object (.rds) as first argument")
 8 | }
 9 | 
10 | suppressPackageStartupMessages({
11 |   library(sceasy)
12 |   library(reticulate)
13 |   use_condaenv('sceasy')
14 |   loompy <- reticulate::import('loompy')
15 |   library(Seurat)
16 |   library(SeuratDisk)
17 | })
18 | 
19 | seurat <- readRDS(object_path)
20 | #SaveH5Seurat(seurat, filename = "out/harmony.h5Seurat", overwrite = TRUE)
21 | #Convert("out/harmony.h5Seurat", dest = "h5ad", overwrite = TRUE)
22 | 
23 | # Derive the output name by swapping the extension; the dot is escaped so only
24 | # a literal ".rds" suffix is replaced
25 | out_path <- sub("\\.rds$", ".h5ad", object_path)
26 | # If the suffix did not match, the output name would equal the input name and
27 | # the conversion would overwrite the input; append the extension instead
28 | if (identical(out_path, object_path)) {
29 |   out_path <- paste0(object_path, ".h5ad")
30 | }
31 | 
32 | # Convert to h5ad with sceasy for immediate use with cellxgene
33 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile=out_path)
34 | 


--------------------------------------------------------------------------------
/IMMUcan.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # IMMUcan
  2 | 
  3 | scProcessor is used for the processing of scRNAseq datasets in the IMMUcan scDB. It runs on R and is mostly based on the Seurat package.
  4 | 
  5 | - Quality control
  6 | - Measure and correct batch effect (harmony)
  7 | - Clustering optimization
  8 | - Supervised annotation (CHETAH)
  9 | - CNA calling (copyKat)
 10 | - Cell ontology (ebi.ac.uk/ols/ontologies/cl)
 11 | - Differential expression
 12 | - Universal output files (sceasy)
 13 | 
 14 | 
 15 | ## Install instructions
 16 | 
 17 | - Follow install instructions for sceasy (https://github.com/cellgeni/sceasy)
 18 | - Get CHETAH_reference_updatedAnnotation.RData from IMMUcan teams channel
 19 | - Install following R packages
 20 | ```
 21 | install.packages(c("Seurat", "tidyverse", "readxl", "patchwork", "devtools", "data.table", "BiocManager", "remotes", "openxlsx", "pheatmap", "plyr", "DescTools", "future", "jsonlite"))
 22 | BiocManager::install(c("CHETAH", "SingleCellExperiment"))
 23 | devtools::install_github("mahmoudibrahim/genesorteR") 
 24 | devtools::install_github("immunogenomics/harmony")
 25 | devtools::install_github("navinlabcode/copykat")
 26 | remotes::install_github("mojaveazure/seurat-disk")
 27 | ```
 28 | 
 29 | ## Before starting
 30 | 
 31 | Change the paths to files provided in the script
 32 | - cellMarker_path = PATH to TME_markerGenes.xlsx
 33 | - chetahClassifier_path = PATH to CHETAH_reference_updatedAnnotation.RData
 34 | - cellOntology_path = PATH to cell_ontology.xlsx
 35 | 
 36 | ## Run scProcessor
  37 | The core of scProcessor consists of three processing scripts.
 38 | 
 39 | ### 1. check_seurat.R: check seurat object and estimate batch
 40 | 
 41 | - It takes a Seurat object as input (in the future this will be extended to other file formats)
 42 | - This step is optional, if data.json is filled in you can immediately run scProcessor_1
 43 |   1. Check validity of seurat object
 44 |   2. Estimate batch variable
 45 |   3. Return QC plots (in temp)
 46 | 
 47 | ``` 
 48 | Rscript check_seurat.R [SEURAT] [BATCH]
 49 | ```
 50 | 
  51 | - [SEURAT]: path to the seurat object (optional: if the working directory contains exactly one .rds file, it is found automatically)
 52 | - [BATCH]: only necessary when you already know your batch variable
 53 | 
 54 | ### 2. data.json
 55 | 
 56 | - scProcessor works without arguments to the Rscripts, therefore it needs an input file that specifies these variables. This is automatically generated by check_seurat and has to be reviewed to make sure scProcessor_1 processes the data how you want.
 57 | - Here is an overview of the data.json (NA in a json is indicated as null)
 58 |   - **object_path**: full path where seurat object is stored
 59 |   - **batch**: e.g. *patient*
 60 |   - **norm**: boolean indicating if data is already normalized e.g. *false*
 61 |   - **QC_feature_min**: threshold for minimal number of detected genes per cell e.g. *250*
 62 |   - **QC_mt_max**: threshold for maximal percentage of mitochondrial reads per cell e.g. *20*
 63 |   - **pca_dims**: number of PCA dimensions to take for further processing e.g. *30*
 64 |   - **features_var**: number of highly variable features to take for further processing e.g. *2000*
 65 |   - **nSample**: number of cells to take for intense computing steps and for cellxgene.h5ad at the end e.g. *10000*
 66 |   - **cluster_resolution**: a sequence of different cluster resolutions, scProcessor will select the most optimal resolution e.g. *0.5, 1, 1.5*
  67 |   - **malignant**: boolean indicating if malignant cell prediction is necessary e.g. *TRUE*
  68 |   - **normal_cells**: cell type taken as normal cells to increase confidence of malignant cell prediction e.g. *null (standard Macrophages are taken), false (no normal cells taken)*
  69 |   - **annotation**: columns in meta.data that contain annotation information
 70 |   - **metadata**: other important columns contained in the meta.data slot e.g. *biopsy, sample_id, treatment ...*
 71 | 
 72 | ### 3. scProcessor_1: the main processing script
 73 | 
 74 |   1. QC
 75 |   2. Batch integration and clustering
 76 |   3. Supervised classification and CNA calling
 77 |   4. Create marker gene plots
 78 |   5. Save summary statistics in misc
 79 | 
 80 | ### 4. Annotate clusters
 81 | 
 82 | - Check plots in temp/plots:
 83 |   - marker gene plots
 84 |   - dotplot
 85 | - In out/annotation.xlsx, fill in cell types as defined in the abbreviation column of cell_ontology.xlsx
 86 | 
 87 | 
 88 | ### 5. scProcessor_2: link to cell ontology and create all output files
 89 | 
 90 |   1. Links cell ontology
 91 |   2. Differential expression
 92 |   3. Creates output files for SIB scRNAseq interface
 93 |       - AverageExpression matrices and DE_results per annotation level
 94 |       - geneIndex.tsv
 95 |       - Metadata.tsv
 96 |       - cellCount.tsv
 97 |       - harmony.rds
 98 |       - cellxgene.h5ad
 99 | 
100 | ### 6. Create checksum file to send to SIB
101 | 
102 | on the terminal
103 | ```
104 | zip -r AML_UNB_SW_GSE116256.zip AML_UNB_SW_GSE116256
105 | md5sum AML_UNB_SW_GSE116256.zip
 106 | mv AML_UNB_SW_GSE116256.zip AML_UNB_SW_GSE116256_-_###PASTE_MD5SUM_OUTPUT_HERE###.zip
107 | ```
108 | 
109 | Login to SIB through sftp and transfer
110 | 


--------------------------------------------------------------------------------
/Rename_meta.data.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Rename_meta.data"
 3 | output: html_document
 4 | ---
 5 | 
 6 | ```{r}
 7 | setwd("~/s3result/CRC_UNB_10X_GSE144735/") # NOTE(review): dataset-specific path; point this at the dataset being edited
 8 | library(dplyr)
 9 | library(jsonlite)
10 | seurat <- readRDS("out/harmony.rds")
11 | glimpse(seurat@meta.data)
12 | ```
13 | 
14 | ```{r}
15 | seurat@meta.data <- plyr::rename(seurat@meta.data, c(
16 |   "" = "" # placeholder: fill in "old_name" = "new_name" pairs
17 | ))
18 | 
19 | #also change these columns in data.json if necessary!!!
20 | saveRDS(seurat, "out/harmony.rds") # overwrites the file that was read above
21 | ```
22 | 
23 | 


--------------------------------------------------------------------------------
/TME_markerGenes.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/TME_markerGenes.xlsx


--------------------------------------------------------------------------------
/annotate.R:
--------------------------------------------------------------------------------
 1 | library(readxl)
 2 | library(editData)
 3 | library(openxlsx)
 4 | 
 5 | # Read the annotation table; annotation_copy.xlsx takes precedence when it exists
 6 | annotation_path <- "out/annotation.xlsx"
 7 | if (file.exists("out/annotation_copy.xlsx")) {
 8 |   print("Reading annotation_copy.xlsx")
 9 |   annotation_path <- "out/annotation_copy.xlsx"
10 | }
11 | annotation <- read_excel(annotation_path)
12 | 
13 | # DE results, used to look up genes of individual clusters
14 | DE_genes <- read.csv("temp/DE_genes.csv", row.names = 1)
15 | 
16 | # Show the first 10 rows for one cluster (set `clust` to the cluster of interest)
17 | clust <- 30
18 | cluster_rows <- DE_genes$cluster == clust
19 | head(DE_genes[cluster_rows, ], n = 10)
20 | 
21 | # Edit the table interactively, then write it back
22 | annotation <- editData(annotation)
23 | write.xlsx(annotation, "out/annotation.xlsx")
24 | 
25 | # Remove the copy, if any, once the edited table has been written
26 | if (file.exists("out/annotation_copy.xlsx")) {
27 |   file.remove("out/annotation_copy.xlsx")
28 | }


--------------------------------------------------------------------------------
/annotation_CHETAH.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/annotation_CHETAH.xlsx


--------------------------------------------------------------------------------
/cell_ontology.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/cell_ontology.xlsx


--------------------------------------------------------------------------------
/check_seurat.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | seurat_obj = args[1] #path of seurat object (NA when omitted; STEP 1 then looks for a single .rds file in the working directory)
  4 | batch_var = args[2] #batch variable if known (NA when omitted)
  5 | verbose = FALSE #forwarded as `verbose` to the Seurat and harmony calls below
  6 | tidy_metadata_path <- "/home/jordi_camps/IMMUcan/tidy_metadata.xlsx" #NOTE(review): machine-specific absolute path, adjust per installation
  7 | 
  8 | dir <- getwd()
  9 | setwd(dir) #no-op: dir is already the working directory
 10 | print(dir)
 11 | if (!dir.exists("temp")) {dir.create("temp")} #output folders are created relative to the working directory
 12 | if (!dir.exists("temp/QC")) {dir.create("temp/QC")}
 13 | if (!dir.exists("out")) {dir.create("out")}
 14 | if (!dir.exists("out/plots")) {dir.create("out/plots")}
 15 | 
 16 | suppressPackageStartupMessages({
 17 |   library(Seurat)
 18 |   library(ggplot2)
 19 |   library(patchwork)
 20 |   library(Matrix)
 21 |   library(dplyr)
 22 |   library(readxl)
 23 |   library(DescTools)
 24 |   library(tidyr)
 25 |   library(tibble)
 26 |   library(jsonlite)
 27 |   library(harmony)
 28 |   library(kBET)
 29 |   library(scater)
 30 | })
 31 | 
 32 | print("STEP 1: CHECKING SEURAT OBJECT")
 33 | # No path given: fall back to the single .rds file in the working directory
 34 | if (is.na(seurat_obj)) {
 35 |   seurat_obj <- normalizePath(list.files(pattern = "\\.rds$")) #dot escaped so only a literal ".rds" extension matches
 36 |   if (length(seurat_obj) != 1) {
 37 |     stop("Specify seurat object in arguments")
 38 |   }
 39 | }
 40 | seurat <- readRDS(seurat_obj)
 41 | 
 42 | # Load data.json or create standard (an existing out/data.json takes precedence over the defaults and over batch_var)
 43 | if (file.exists("out/data.json")) {
 44 |   data <- fromJSON("out/data.json")
 45 | } else {
 46 |   data <- list()
 47 |   data$object_path = seurat_obj
 48 |   data$batch = batch_var
 49 |   data$QC_feature_min = 250 #Minimal features threshold
 50 |   data$QC_mt_max = 20 #Maximum mitochondrial content threshold
 51 |   data$pca_dims = 30 #Amount of PCA dimensions to use
 52 |   data$features_var = 2000 #Amount of variable features to select
 53 |   data$nSample = 10000
 54 |   data$cluster_resolution = seq(from = 0.4, to = 4, by = 0.1)
 55 |   data$malignant = TRUE
 56 |   data$normal_cells = NA
 57 | }
 58 | 
 59 | print(paste0("nCell = ", ncol(seurat), " / ", "nGene = ", nrow(seurat)))
 60 | # Every cell name must equal the meta.data row name at the same position
 61 | if (sum(colnames(seurat) == rownames(seurat@meta.data)) == ncol(seurat)) {
 62 |   print("Cell IDs linked correctly")
 63 | } else {
 64 |   stop("Cell IDs linked uncorrectly")
 65 | }
 66 | 
 67 | gapdh <- grepl("GAPDH|Gapdh", rownames(seurat)) #rows whose gene name contains GAPDH/Gapdh, used as a probe for the value scale
 68 | if (sum(grepl("\\.", seurat[["RNA"]]@counts[gapdh, ])) == 0) { #no probed value contains a decimal point
 69 |   data$norm <- FALSE
 70 |   print("Raw counts supplied")
 71 | } else if (any(seurat[["RNA"]]@counts[gapdh, ] > 100)) { #decimals present and at least one probed value above 100
 72 |   data$norm <- FALSE #NOTE(review): norm stays FALSE although the message below reports normalized counts - confirm this is intended
 73 |   print("Normalized counts supplied. Be careful with further interpretation")
 74 | } else {
 75 |   data$norm <- TRUE #the batch-estimation step below then skips NormalizeData and copies counts into the data slot
 76 |   print("Logcounts supplied, no normalization will be done. Be careful with further interpretation")
 77 | }
 78 | 
 79 | seurat[["percent_mt"]] <- PercentageFeatureSet(seurat, pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-") #features whose name starts with Mt/MT/mt followed by "." or "-"
 80 | seurat[["RNA"]]@counts[1:5,1:5] #preview a 5x5 corner of the count matrix
 81 | dplyr::glimpse(seurat@meta.data)
 82 | 
 83 | # Clean metadata columns: rename columns listed in tidy_metadata.xlsx (col_names) to their general name
 84 | names <- tolower(colnames(seurat@meta.data))
 85 | names <- gsub("\\.", "_", names) #lower-cased names with "." replaced by "_"
 86 | meta_cols <- read_excel(tidy_metadata_path)
 87 | if (any(names %in% meta_cols$col_names)) { #NOTE(review): tested on the normalized names, while change_cols below uses the unmodified column names
 88 |   change_cols <- colnames(seurat@meta.data)[colnames(seurat@meta.data) %in% meta_cols$col_names]
 89 |   for (i in change_cols) {
 90 |     hit_1 <- grepl(i, names) #NOTE(review): i is used as a regex, so partial matches can flag more than one column
 91 |     hit_2 <- meta_cols$col_names %in% i
 92 |     print(paste0("changing ", i, " to ", meta_cols$general[hit_2]))
 93 |     colnames(seurat@meta.data)[hit_1] <- meta_cols$general[hit_2]
 94 |   }
 95 | } else {
 96 |   print("No meta.data columns to tidy up")
 97 | }
 98 | data$annotation = c("seurat_clusters","annotation_CHETAH","annotation_major","annotation_immune","annotation_minor", colnames(seurat@meta.data)[grepl("authors_annotation|Authors_annotation", colnames(seurat@meta.data))])
 99 | saveRDS(seurat, seurat_obj) #overwrites the input file with the renamed columns and the added percent_mt
100 | 
101 | # Batch
102 | print("STEP 2a: ESTIMATING BATCH VARIABLES")
103 | 
104 | ## Remove bad quality cells
105 | seurat <- CreateSeuratObject(counts = seurat[["RNA"]]@counts, meta.data = seurat@meta.data, min.cells = 10, min.features = 200)
106 | seurat <- subset(seurat, subset = nFeature_RNA > data$QC_feature_min & percent_mt < data$QC_mt_max)
107 | 
108 | ## Subsample datasets larger than 50k cells
109 | if (ncol(seurat) > 50000) {
110 |   subset_size <- 0.1 #subsample to 10% of the data
111 |   subset_id <- sample.int(n = ncol(seurat), size = floor(subset_size * ncol(seurat)), replace=FALSE)
112 |   seurat <- seurat[, subset_id]
113 | }
114 | 
115 | ## Select potential batch columns from meta.data
116 | seurat@meta.data <- seurat@meta.data[, vapply(seurat@meta.data, function(x) length(unique(x)), integer(1)) != 1, drop = FALSE] #Remove all columns that have only one variable; counted per column so the result cannot be simplified into a matrix
117 | #seurat@meta.data <- seurat@meta.data[, sapply(sapply(seurat@meta.data, unique), length) != nrow(seurat@meta.data), drop = FALSE] #Remove columns with only unique values
118 | meta <- seurat@meta.data[, vapply(seurat@meta.data, function(x) is.character(x) || is.factor(x), logical(1)), drop = FALSE] #Select all columns that are factor or character; drop = FALSE keeps a data frame when one column is left
119 | meta <- meta[,!grepl("Cluster|cluster|author|Author|Annotation|annotation|Cell_type|cell_type|cell|Cell|barcode|Barcode", colnames(meta)), drop = FALSE]
120 | if (!"metadata" %in% names(data)) {data$metadata = colnames(meta)} #save metadata columns to data.json
121 | if (length(data$batch) > 1) {
122 |   batch <- data$batch
123 | } else if (is.na(data$batch)) {
124 |   temp <- meta[, apply(meta, 2, function(x) !any(is.na(x))), drop = FALSE] #Remove all columns with NAs
125 |   batch <- colnames(temp)
126 | } else {
127 |   batch <- data$batch
128 | }
129 | print("Possible batches:")
130 | print(paste0(batch))
131 | 
132 | ## Create nearest neighbour graph
133 | if (data$norm == FALSE) {
134 |   seurat <- Seurat::NormalizeData(seurat, verbose = verbose)
135 | } else {seurat[["RNA"]]@data <- seurat[["RNA"]]@counts} #flagged as already normalized: use the supplied values as the data slot
136 | 
137 | seurat <- seurat %>% 
138 |   FindVariableFeatures(selection.method = "vst", nfeatures = data$features_var, verbose = verbose) %>% 
139 |   ScaleData(verbose = verbose) %>% 
140 |   RunPCA(pc.genes = seurat@var.genes, npcs = data$pca_dims+20, verbose = verbose) %>% #NOTE(review): pc.genes / @var.genes look like older Seurat API names - confirm they resolve under the installed version
141 |   RunUMAP(dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose) %>%
142 |   FindNeighbors(dims = 1:2, k.param = 30, reduction = "umap", verbose = verbose) #neighbours are searched on the 2 UMAP dimensions with k = 30
143 | 
144 | p_lbw <- ElbowPlot(seurat, ndims = data$pca_dims+20) + geom_vline(xintercept = data$pca_dims, color = "red") + ylab("STDEV PCA") + theme(axis.title.x = element_blank())
145 | 
146 | ## Compute variance explained
147 | #print("Compute explained variance")
148 | #temp <- seurat@meta.data[, apply(seurat@meta.data, 2, function(x) !any(is.na(x))), drop = FALSE] #Remove all columns with NAs
149 | #sce <- SingleCellExperiment(assays = list(logcounts = seurat[["RNA"]]@data), colData = temp)
150 | #vars_exp <- getVarianceExplained(x = sce, variables = colnames(temp))
151 | #p_var <- plotExplanatoryVariables(vars_exp)
152 | #ggsave(plot = p_var, filename = paste0("temp/QC/Batch_variance_explained.png"))
153 | 
154 | #print("Compute kBET score")
155 | ## kBET
156 | #kbet <- list()
157 | #mtx <- t(as.matrix(seurat[["RNA"]]@data))
158 | #for (b in batch) {
159 | #  btch <- seurat@meta.data[, b]
160 | #  kbet.estimate <- kBET(df = mtx, batch = btch, plot = FALSE)
161 | #  kbet[[b]] <- 1 - kbet.estimate$stats$kBET.observed
162 | #}
163 | ##Plot kbet scores
164 | #kbet <- do.call(cbind, kbet)
165 | #p_kbet <- kbet %>%
166 | #  as.data.frame() %>%
167 | #  gather("var", "Acceptance_rate") %>%
168 | #  ggplot(aes(x = var, y = Acceptance_rate)) +
169 | #  geom_boxplot()
170 | #ggsave(plot = p_kbet, filename = "temp/QC/batch_kBET.png")
171 | 
172 | ## Compute the percentage of batch in cell neighbors
173 | print("Compute batch entropy")
174 | batch_entropy <- list()
175 | for (b in batch) {
176 |   neighbors <- list()
177 |   for (i in as.factor(unique(seurat@meta.data[, b]))) {
178 |     temp <- rownames(seurat@meta.data[seurat@meta.data[ , b] == i, ]) #cells belonging to level i of variable b
179 |     neighbors[[i]] <- rowSums(as.matrix(seurat@graphs$RNA_nn[, temp]))/30 #30 matches k.param in FindNeighbors above: per cell, the share of its neighbours that belong to level i
180 |   }
181 |   neighbors <- as.data.frame(neighbors)
182 |   ## Compute entropy per cell
183 |   optimum <- table(seurat@meta.data[, b]) / ncol(seurat) #overall proportion of each level, used as the reference for scaling
184 |   batch_entropy[[b]] <- apply(neighbors, 1, Entropy) #Entropy() is presumably DescTools::Entropy (DescTools is loaded above)
185 |   batch_entropy[[b]] <- batch_entropy[[b]] / Entropy(optimum)
186 | }
187 | batch_entropy <- do.call(cbind, batch_entropy) #one column per batch variable
188 | 
189 | ## Save batch variables with entropy < 2
190 | #batch_var <- list()
191 | #for (i in colnames(batch_entropy)) {
192 | #  if (median(as.numeric(batch_entropy[, i])) < 2) {
193 | #    batch_var[[i]] <- median(as.numeric(batch_entropy[, i]))
194 | #  }
195 | #}
196 | #batch_var <- batch_var[!duplicated(batch_var)]
197 | #batches <- paste(names(batch_var), sep = ", ")
198 | #print(paste0("Possible batch(es): ", batches))
199 | 
200 | # Run harmony once per candidate batch variable and compare the neighbourhood entropy before and after
201 | 
202 | if (length(batch) >= 1) {
203 |   print("STEP 2b: RUN HARMONY")
204 |   batch_harmony <- list()
205 |   
206 |   for (i in batch) {
207 |     p0 <- AugmentPlot(DimPlot(seurat, reduction = "umap", group.by = i, pt.size = .1) + NoLegend() + ggtitle("Before harmony"))
208 |     p1 <- AugmentPlot(DimPlot(object = seurat, reduction = "pca", group.by = i, pt.size = .1) + NoLegend())
209 |     seurat_corrected <- seurat %>%
210 |       RunHarmony(i, plot_convergence = FALSE, verbose = verbose) %>%
211 |       RunUMAP(reduction = "harmony", dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose) %>%
212 |       FindNeighbors(dims = 1:2, k.param = 30, reduction = "umap", verbose = verbose)
213 |     
214 |     p3 <- AugmentPlot(DimPlot(object = seurat_corrected, reduction = "harmony", group.by = i, pt.size = .1) + NoLegend())
215 |     p2 <- AugmentPlot(DimPlot(seurat_corrected, reduction = "umap", group.by = i, pt.size = .1) + NoLegend() + ggtitle("After harmony"))
216 |     p <- (p0 | p2) / (p1 | p3) #top row: UMAP before | after, bottom row: PCA | harmony embedding
217 |     ggsave(plot = p, filename = paste0("temp/QC/Harmony_", i, ".png"))
218 |     
219 |     ## Compute the percentage of batch in cell neighbors
220 |     neighbors <- list()
221 |     for (j in as.factor(unique(seurat_corrected@meta.data[, i]))) {
222 |       temp <- rownames(seurat_corrected@meta.data[seurat_corrected@meta.data[ , i] == j, ]) #index the corrected object with its own meta.data
223 |       neighbors[[j]] <- rowSums(as.matrix(seurat_corrected@graphs$RNA_nn[, temp]))/30 #30 matches k.param in FindNeighbors above
224 |     }
225 |     neighbors <- as.data.frame(neighbors)
226 |     ## Compute entropy per cell
227 |     optimum <- table(seurat_corrected@meta.data[, i]) / ncol(seurat_corrected)
228 |     batch_harmony[[i]] <- apply(neighbors, 1, Entropy)
229 |     batch_harmony[[i]] <- batch_harmony[[i]] / Entropy(optimum)
230 |   }
231 |   batch_harmony <- do.call(cbind, batch_harmony)
232 |   ## Plot entropy over all batches
233 |   #batch_entropy$harmony <- "Before"
234 |   batch_entropy <- batch_entropy %>%
235 |     as.data.frame() %>%
236 |     mutate(harmony = "Before") %>%
237 |     tibble::rownames_to_column("cell") %>%
238 |     gather("batch", "entropy", -cell, -harmony)
239 |   
240 |   batch_harmony <- batch_harmony %>%
241 |     as.data.frame() %>%
242 |     mutate(harmony = "After") %>%
243 |     tibble::rownames_to_column("cell") %>%
244 |     gather("batch", "entropy", -cell, -harmony)
245 |   
246 |   batch_entropy <- rbind(batch_entropy, batch_harmony)
247 |   batch_entropy$harmony <- factor(batch_entropy$harmony, levels = c("Before", "After")) #fix the order so "Before" is drawn first
248 |   
249 |   p <- batch_entropy %>% 
250 |     ggplot(aes(y = as.numeric(entropy), x = batch, col = harmony)) +
251 |     geom_boxplot() +
252 |     scale_y_continuous("Entropy") +
253 |     theme(axis.text.x = element_text(angle = 45, hjust = 1))
254 |   ggsave(plot = p, filename = "temp/QC/batch_entropy.png", width = 10, height = 10)
255 |   
256 | } #else {
257 | ## Plot entropy over all batches
258 | #batch_entropy <- as.data.frame(batch_entropy)
259 | #p <- batch_entropy %>% 
260 | #  tibble::rownames_to_column("cell") %>%
261 | #  gather("batch", "entropy", -cell) %>%
262 | #  ggplot(aes(y = as.numeric(entropy), x = batch)) +
263 | #  geom_boxplot() +
264 | #  scale_y_continuous("Entropy") +
265 | #  theme(axis.text.x = element_text(angle = 45, hjust = 1))
266 | #ggsave(plot = p, filename = "temp/QC/batch_entropy.png", width = 10, height = 10)
267 | #}
268 | 
269 | # QC: one summary figure per candidate batch variable, or a single figure when there is none
270 | print("STEP 3: CREATE QC PLOTS")
271 | if (length(batch) >= 1) {
272 |   for (i in batch) {
273 |     p2 <- DimPlot(seurat, reduction = "umap", pt.size = .1, group.by = i, label = TRUE) + NoLegend()
274 |     p3 <- AugmentPlot(VlnPlot(seurat, features = "nFeature_RNA", pt.size = 0.1, group.by = i, log = TRUE)) + 
275 |       NoLegend() +
276 |       scale_y_log10("Genes", expand = c(0,0)) + 
277 |       geom_hline(yintercept = data$QC_feature_min, color = "red") + 
278 |       theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
279 |     p4 <- AugmentPlot(VlnPlot(seurat, features = "nCount_RNA", pt.size = 0.1, group.by = i, log = TRUE)) + 
280 |       NoLegend() + 
281 |       scale_y_log10("Counts", expand = c(0,0)) + 
282 |       theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
283 |     if ("percent_mt" %in% colnames(seurat@meta.data)) { #percent_mt may have been dropped above as a single-valued column
284 |       p5 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1, group.by = i)) + 
285 |         NoLegend() +
286 |         geom_hline(yintercept = data$QC_mt_max, color = "red") + 
287 |         scale_y_continuous("Mito", expand = c(0,0)) +
288 |         theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text())
289 |     } else {
290 |       seurat@meta.data$percent_mt <- 0 #add a zero column so the same plot can still be drawn
291 |       p5 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1, group.by = i)) + 
292 |         NoLegend() +
293 |         geom_hline(yintercept = data$QC_mt_max, color = "red") + 
294 |         scale_y_continuous("Mito", expand = c(0,0)) +
295 |         theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text())
296 |     }
297 |     if ("CD3D" %in% rownames(seurat)) { #feature plot of the first gene present: CD3D, else CD68, else CLDN5
298 |       p6 <- AugmentPlot(FeaturePlot(seurat, features = "CD3D", pt.size = .1)) +
299 |         theme(axis.title.x = element_blank(), axis.title.y = element_text())
300 |     } else if ("CD68" %in% rownames(seurat)) {
301 |       p6 <- AugmentPlot(FeaturePlot(seurat, features = "CD68", pt.size = .1)) +
302 |         theme(axis.title.x = element_blank(), axis.title.y = element_text())
303 |     } else if ("CLDN5" %in% rownames(seurat)) {
304 |       p6 <- AugmentPlot(FeaturePlot(seurat, features = "CLDN5", pt.size = .1)) +
305 |         theme(axis.title.x = element_blank(), axis.title.y = element_text())
306 |     } else {
307 |       p6 <- AugmentPlot(DimPlot(seurat, group.by = i)) #none of the three genes present: fall back to a plot grouped by the batch variable
308 |     }
309 |     p <- (p_lbw + p2) / (p3 + p5) / (p4 + p6) #rows: elbow + UMAP, genes + mito, counts + marker
310 |     ggsave(plot = p, filename = paste0("temp/QC/QC_", i, ".png"))
311 |   }
312 | } else {
313 |   print("No batches found in metadata!")
314 |   p1 <- AugmentPlot(VlnPlot(seurat, features = "nFeature_RNA", pt.size = 0.1, log = TRUE)) + 
315 |     NoLegend() +
316 |     scale_y_log10("Genes", expand = c(0,0)) + 
317 |     geom_hline(yintercept = data$QC_feature_min, color = "red") + 
318 |     theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
319 |   p2 <- AugmentPlot(VlnPlot(seurat, features = "nCount_RNA", pt.size = 0.1, log = TRUE)) + 
320 |     NoLegend() + 
321 |     scale_y_log10("Counts", expand = c(0,0)) + 
322 |     theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
323 |   p3 <- AugmentPlot(VlnPlot(seurat, features = "percent_mt", pt.size = 0.1)) + #NOTE(review): unlike the per-batch branch, this branch does not check that percent_mt is still present
324 |     NoLegend() +
325 |     geom_hline(yintercept = data$QC_mt_max, color = "red") + 
326 |     scale_y_continuous("Mito", expand = c(0,0)) +
327 |     theme(axis.title.x = element_blank(), plot.title = element_blank(), axis.title.y = element_text())
328 |   p <- p_lbw /  p1 / p2 / p3
329 |   ggsave(plot = p, filename = "temp/QC/QC.png")
330 | }
331 | 
332 | ## Save data.json: the batch variable is stored only when exactly one candidate remains, otherwise FALSE
333 | if (length(batch) == 1) {
334 |   data$batch = batch #store the variable that was detected or supplied rather than a fixed column name
335 | } else {
336 |   data$batch = FALSE
337 | }
338 | data <- toJSON(data)
339 | 
340 | if (!file.exists("out/data.json")) {
341 |   write(data, "out/data.json")
342 | } else {
343 |   print("data.json already exists, writing to copy") #an existing data.json is never overwritten
344 |   write(data, "out/data_copy.json")
345 | }
346 | 
347 | 


--------------------------------------------------------------------------------
/create_seurat_fromCSV.R:
--------------------------------------------------------------------------------
 1 | library(Seurat)
 2 | library(tidyverse)
 3 | library(tidyr)
 4 | 
 5 | count_file <- "" # path to the space-separated count matrix; empty here, set before running
 6 | meta_file <- "" # path to the tab-separated cell metadata; empty here, set before running
 7 | patient_file <- "" # path to the tab-separated patient table; empty here, set before running
 8 | 
 9 | print("Read count matrix")
10 | counts_test <- read.csv(count_file, header = TRUE, row.names = 1, sep = " ", nrows = 6) #Read try
11 | counts_test[,1:6]
12 | counts <- read.csv(count_file, header = TRUE, row.names = 1, sep = " ")
13 | 
14 | print("Metadata")
15 | meta <- read.csv(meta_file, header = TRUE, sep = "\t", row.names = 1)
16 | print(head(meta))
17 | 
18 | print("Patient info")
19 | patient <- read.csv(patient_file, header = TRUE, sep = "\t", row.names = 1) # NOTE(review): patient is not used further in this script
20 | # identical() requires the same names in the same order and also fails cleanly when the lengths differ, where `==` would recycle
21 | if (!identical(colnames(counts), rownames(meta))) {stop("colnames counts not equal to rownames meta.data")}
22 | 
23 | print("Create Seurat object")
24 | seurat <- CreateSeuratObject(counts = counts, meta.data = meta)
25 | saveRDS(seurat, "raw.rds")


--------------------------------------------------------------------------------
/data_example.json:
--------------------------------------------------------------------------------
1 | {"object_path":["raw.rds"],"batch":["orig.ident"],"QC_feature_min":[250],"QC_mt_max":[20],"pca_dims":[30],"features_var":[2000],"nSample":[10000],"cluster_resolution":[1],"malignant":[true],"normal_cells":[null],"annotation":["seurat_clusters","annotation_CHETAH","annotation_major","annotation_immune","annotation_minor","annotation_authors"],"norm":[true],"metadata":["orig.ident","cell_id","biopsy_id"]}
2 | 


--------------------------------------------------------------------------------
/scProcessor_1.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE) # NOTE(review): args is never read again in this script
  3 | data = "out/data.json" #If data is already normalized or not, stored by check_seurat.R; replaced by the parsed JSON further down
  4 | cellMarker_path = "/home/jordi_camps/IMMUcan/TME_markerGenes.xlsx" # marker gene sheet read for the STEP 4 plots
  5 | chetahClassifier_path = "/home/jordi_camps/IMMUcan/CHETAH_reference_updatedAnnotation.RData" # RData file loaded before the CHETAH classification
  6 | uGene_clust = 20 # number of unique significant marker genes a cluster must reach in the resolution search
  7 | verbose = FALSE # forwarded as verbose= to the Seurat and harmony calls
  8 | if (!dir.exists("temp")) {dir.create("temp")} # create the working and output folders when missing
  9 | if (!dir.exists("temp/annotation")) {dir.create("temp/annotation")}
 10 | if (!dir.exists("out")) {dir.create("out")}
 11 | if (!dir.exists("out/plots")) {dir.create("out/plots")}
 12 | 
 13 | # Load packages and set environment
 14 | suppressPackageStartupMessages({
 15 |   library(Seurat)
 16 |   library(SingleCellExperiment)
 17 |   library(CHETAH)
 18 |   library(harmony)
 19 |   library(ggplot2)
 20 |   library(patchwork)
 21 |   library(Matrix)
 22 |   library(dplyr)
 23 |   library(openxlsx)
 24 |   library(pheatmap)
 25 |   library(DescTools)
 26 |   library(copykat)
 27 |   library(future)
 28 |   library(jsonlite)
 29 | })
 30 | 
 31 | suppressWarnings(RNGkind(sample.kind = "Rounding")) # select the "Rounding" sampler; warnings from this call are suppressed
 32 | set.seed(111)
 33 | options(future.globals.maxSize= 150000*1024^2) # raise the future package's limit on exported globals (value in bytes)
 34 | plan("multisession", workers = 4)
 35 | 
 36 | # Make and set directories
 37 | dir <- getwd()
 38 | print(dir)
 39 | setwd(dir) # no-op: dir is already the working directory
 40 | if (!file.exists("out/data.json")) {stop("first run check_seurat.R")}
 41 | data <- fromJSON("out/data.json") # settings list used throughout (object_path, batch, QC_*, pca_dims, ...)
 42 | 
 43 | # Recreate seurat object
 44 | # Rebuild from the stored counts and metadata, applying the min.cells / min.features filters
 45 | seurat <- readRDS(data$object_path)
 46 | seurat <- CreateSeuratObject(counts = seurat[["RNA"]]@counts, meta.data = seurat@meta.data, min.cells = 10, min.features = 200)
 47 | if (length(data$batch) > 1) {stop("More than one batch specified, select the correct batch")}
 48 | if (!"cluster_resolution" %in% names(data)) {data$cluster_resolution = seq(from = 0.4, to = 4, by = 0.1)} # default grid searched when no resolution is given
 49 | if (ncol(seurat) > 50000) {subsamples <- sample(ncol(seurat), 50000, replace = FALSE)} #copykat can only run on matrix of max 50,000 cells; NOTE(review): indices are drawn before the QC subset below
 50 | 
 51 | # QC
 52 | print("STEP 1a: QC")
 53 | cells_before_QC <- ncol(seurat)
 54 | bad_columns <- colnames(seurat@meta.data)[sapply(seurat@meta.data, function(x) length(unique(x))) == 1] #one count per column; a nested sapply() can simplify to a matrix
 55 | bad_cols <- paste(bad_columns, collapse = ", ") #collapse joins the names into a single string for the message
 56 | print(paste0("Removing columns with only one value: ", bad_cols))
 57 | seurat@meta.data <- seurat@meta.data[, !colnames(seurat@meta.data) %in% bad_columns, drop = FALSE] #Remove all columns that have only one variable
 58 | seurat[["percent_mt"]] <- PercentageFeatureSet(seurat, pattern = "^Mt\\.|^MT\\.|^mt\\.|^Mt-|^MT-|^mt-")
 59 | cols <- colnames(seurat@meta.data)[!colnames(seurat@meta.data) %in% "percent_mt"]
 60 | for (i in seq_along(cols)) {
 61 |   if (ncol(seurat) == sum(seurat[[cols[i], drop = TRUE]] == seurat$percent_mt, na.rm = TRUE)) {
 62 |     print(paste0("Found duplicate mito column, removing ", cols[i]))
 63 |     seurat@meta.data <- seurat@meta.data[, !colnames(seurat@meta.data) %in% cols[i], drop = FALSE] #drop by column name, not by loop index
 64 |     }
 65 |   }
 66 | seurat <- subset(seurat, subset = nFeature_RNA > data$QC_feature_min & percent_mt < data$QC_mt_max)
 67 | if (exists("subsamples")) {rm(subsamples)} #indices sampled before QC refer to the unfiltered object
 68 | if (ncol(seurat) > 50000) {subsamples <- sample(ncol(seurat), 50000, replace = FALSE)} #redraw on the filtered cells (copykat limit of 50,000 cells)
 69 | if (data$norm == FALSE) {
 70 |   seurat <- Seurat::NormalizeData(seurat, verbose = verbose)
 71 | } else {seurat[["RNA"]]@data <- seurat[["RNA"]]@counts}
 72 | seurat <- suppressWarnings(seurat %>% 
 73 |   FindVariableFeatures(selection.method = "vst", nfeatures = data$features_var, verbose = verbose) %>% 
 74 |   ScaleData(verbose = verbose) %>% 
 75 |   RunPCA(pc.genes = seurat@var.genes, npcs = data$pca_dims+20, verbose = verbose) %>%
 76 |   RunUMAP(dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose))
 77 | 
 78 | # Harmony
 79 | # Runs only when a batch column is configured; p0-p4 are kept for the comparison figure built after clustering
 80 | if (data$batch != FALSE) {
 81 |   print("STEP 1b: INTEGRATING BATCH")
 82 |   p0 <- AugmentPlot(DimPlot(seurat, reduction = "umap", group.by = data$batch, pt.size = .1) + 
 83 |                       NoLegend() + 
 84 |                       ggtitle("Before harmony"))
 85 |   p1 <- AugmentPlot(DimPlot(object = seurat, reduction = "pca", pt.size = .1, group.by = data$batch) + NoLegend())
 86 |   p2 <- AugmentPlot(VlnPlot(object = seurat, features = "PC_1", group.by = data$batch, pt.size = .1) + NoLegend() + theme(plot.title = element_blank()))
 87 |   # Add the "harmony" reduction, integrating over the batch column
 88 |   seurat <- suppressWarnings(seurat %>% 
 89 |     RunHarmony(data$batch, plot_convergence = FALSE, verbose = verbose))
 90 |   # Same two views as p1/p2, now on the harmony reduction
 91 |   p3 <- AugmentPlot(DimPlot(object = seurat, reduction = "harmony", pt.size = .1, group.by = data$batch) + NoLegend())
 92 |   p4 <- AugmentPlot(VlnPlot(object = seurat, features = "harmony_1", group.by = data$batch, pt.size = .1) + NoLegend() + theme(plot.title = element_blank()))
 93 | }
 94 | 
 95 | # Dimensionality reduction and clustering
 96 | print("STEP 2: CLUSTERING")
 97 | # Embed and cluster on the harmony reduction when a batch is set, on the PCA otherwise
 98 | use_harmony <- data$batch != FALSE
 99 | embedding <- if (use_harmony) "harmony" else "pca"
100 | seurat <- RunUMAP(seurat, reduction = embedding, dims = 1:data$pca_dims, a = .5, b = 1.2, verbose = verbose)
101 | seurat <- RunTSNE(seurat, reduction = embedding, dims = 1:data$pca_dims, check_duplicates = FALSE)
102 | seurat <- FindNeighbors(seurat, reduction = embedding, dims = 1:data$pca_dims, verbose = verbose)
103 | seurat <- FindClusters(seurat, resolution = data$cluster_resolution, verbose = verbose)
104 | 
105 | if (use_harmony) {
106 |   # Left column: before harmony (p0, p1, p2); right column: after harmony (p5, p3, p4)
107 |   p5 <- AugmentPlot(
108 |     DimPlot(seurat, reduction = "umap", group.by = data$batch, pt.size = .1) +
109 |       NoLegend() +
110 |       ggtitle("After harmony")
111 |   )
112 |   p <- (p0 | p5) / (p1 | p3) / (p2 | p4)
113 |   ggsave(plot = p, filename = "out/plots/Harmony.png")
114 | }
115 | 
116 | 
117 | if (length(data$cluster_resolution) > 1) { # several candidate resolutions: walk them in order and stop at the first one that over-splits
118 | print("Defining optimal cluster resolution")
119 |   if (exists("subsamples")) {
120 |     seurat_sampled <- seurat[, subsamples]
121 |     if (any(table(seurat_sampled[[paste0("RNA_snn_res.", tail(data$cluster_resolution, n=1))]]) < 3)) { # a cluster of the last resolution has fewer than 3 sampled cells
122 |       seurat_sampled <- seurat # fall back to all cells
123 |     }
124 |   } else {
125 |     seurat_sampled <- seurat
126 |   }
127 |   clusters <- seurat_sampled@meta.data[, grepl("RNA_snn_res.", colnames(seurat_sampled@meta.data))]
128 |   clusters <- apply(clusters, 2, as.numeric)
129 |   data$cluster_resolution <- data$cluster_resolution[!duplicated(apply(clusters, 2, max))] # keep the first resolution per distinct maximum cluster id; assumes the columns are ordered like data$cluster_resolution
130 |   for (i in seq_along(data$cluster_resolution)) {
131 |     Idents(seurat_sampled) <- seurat_sampled[[paste0("RNA_snn_res.", data$cluster_resolution[i])]]
132 |     #print(paste0("Checking cluster resolution ", data$cluster_resolution[i]))
133 |     if (i == 1) { # baseline from the first retained resolution
134 |       seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
135 |       seurat.markers.unique <- seurat.markers[!duplicated(seurat.markers$gene) & seurat.markers$p_val_adj < 0.05, ] # first occurrence of each gene, adjusted p < 0.05
136 |       clust_num <- nlevels(seurat.markers$cluster)
137 |       clust_unique <- sum(table(seurat.markers.unique$cluster) >= uGene_clust) # clusters with at least uGene_clust such genes
138 |       diff1 <- clust_num - clust_unique # clusters falling short of that threshold
139 |     } else if (i == length(data$cluster_resolution)) { # reached the last candidate without stopping earlier
140 |       print(paste0("Optimal cluster resolution: ", data$cluster_resolution[i], " is max defined, consider increasing resolution range"))
141 |       data$cluster_resolution <- data$cluster_resolution[[i]]
142 |       Idents(seurat) <- seurat$seurat_clusters
143 |       seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
144 |       write.csv(seurat.markers, file = "temp/DE_genes.csv")
145 |       break
146 |     } else {
147 |       temp <- table(seurat_sampled[[paste0("RNA_snn_res.", data$cluster_resolution[i-1]), drop = TRUE]], seurat_sampled[[paste0("RNA_snn_res.", data$cluster_resolution[i]), drop = TRUE]]) # previous clusters (rows) x current clusters (columns)
148 |       temp2 <- t(apply(temp, 1, function(x) x / sum(x))) # fraction of each previous cluster landing in each current cluster
149 |       temp3 <- apply(temp2, 2, function(x) x < .9 & x > 0) # TRUE where the fraction lies strictly between 0 and 0.9
150 |       clust_test <- levels(seurat_sampled[[paste0("RNA_snn_res.", data$cluster_resolution[i]), drop = TRUE]])[colSums(temp3) == 1] # current clusters with exactly one such entry
151 |       seurat.markers <- list()
152 |       for (c in clust_test) {
153 |         seurat.markers[[c]] <- FindMarkers(seurat_sampled, ident.1 = c, ident.2 = NULL, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
154 |       }
155 |       seurat.markers <- do.call(rbind, seurat.markers) %>% # row names become "<cluster>.<gene>"
156 |         as.data.frame() %>%
157 |         tibble::rownames_to_column("row") %>%
158 |         tidyr::separate(row, c("cluster", "gene"), remove = FALSE, sep = "\\.") %>% # NOTE(review): gene names containing "." would be split further - confirm
159 |         tibble::column_to_rownames("row")
160 |       seurat.markers.unique <- seurat.markers[!duplicated(seurat.markers$gene) & seurat.markers$p_val_adj < 0.05, ]
161 |       clust_unique <- sum(table(seurat.markers.unique$cluster) >= uGene_clust)
162 |       diff2 <- length(clust_test) - clust_unique # tested clusters falling short of uGene_clust unique genes
163 |       if (diff2 > diff1) { # worse than the baseline: settle on the previous resolution
164 |         print(paste0("Optimal cluster resolution: ", data$cluster_resolution[i-1]))
165 |         seurat$seurat_clusters <- seurat[[paste0("RNA_snn_res.", data$cluster_resolution[i-1])]]
166 |         data$cluster_resolution <- data$cluster_resolution[[i-1]]
167 |         Idents(seurat_sampled) <- seurat_sampled[[paste0("RNA_snn_res.", data$cluster_resolution)]]
168 |         seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
169 |         write.csv(seurat.markers, file = "temp/DE_genes.csv")
170 |         break
171 |       }
172 |     }
173 |   }
174 | }
175 | seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res.", colnames(seurat@meta.data))] # drop the per-resolution columns; seurat_clusters is kept
176 | Idents(seurat) <- seurat$seurat_clusters #Set seurat_clusters to Idents
177 | 
178 | # Supervised annotation
179 | # Classify every cell with CHETAH against the loaded reference and store the label as annotation_CHETAH
180 | print("STEP 3a: SUPERVISED ANNOTATION")
181 | load(chetahClassifier_path) # presumably provides the `reference` object used below - confirm against the RData file
182 | input <- SingleCellExperiment(assays = list(counts = seurat[["RNA"]]@data),
183 |                               reducedDims = SimpleList(TSNE = seurat@reductions$umap@cell.embeddings)) # UMAP coordinates stored under the name TSNE
184 | input <- CHETAHclassifier(input = input, ref_cells = reference, n_genes = 500, thresh = 0.05)
185 | p1 <- PlotCHETAH(input, return = TRUE) 
186 | #nodes <- c("Node1" = "Immune", "Node2" = "Immune", "Node3" = "Lymphoid", "Node4" = "Lymphoid", "Node5" = "NKT", "Node6" = "T", "Node7" = "T", "Node8" = "Myeloid", "Node9" = "Macro/DC", "Node10"= "Stromal", "Node11" = "Stromal")
187 | #input$celltype_CHETAH <- plyr::revalue(input$celltype_CHETAH, replace = nodes[names(nodes) %in% input$celltype_CHETAH])
188 | seurat@meta.data$annotation_CHETAH <- input$celltype_CHETAH
189 | ggsave(plot = p1, filename = "out/plots/CHETAH_classification.pdf", height = 6, width = 12)
190 | 
191 | ##CHETAH recommendation
192 | fraction_chetah <- seurat@meta.data %>% # per cluster: the most frequent CHETAH label and its rounded fraction
193 |   group_by(seurat_clusters, annotation_CHETAH) %>%
194 |   tally(name = "nCells_CHETAH") %>%
195 |   mutate(fraction_CHETAH = round(nCells_CHETAH/sum(nCells_CHETAH), digits = 2)) %>%
196 |   select(-nCells_CHETAH) %>%
197 |   arrange(desc(fraction_CHETAH), .by_group = TRUE) %>%
198 |   slice_head(n = 1)
199 | 
200 | # copyKat
201 | # Optional step: call aneuploid cells with copykat and combine the result with the CHETAH suggestion per cluster
202 | if (data$malignant == TRUE) {
203 |   print("STEP 3b: CALLING COPY NUMBER ABBERATIONS")
204 |   if (exists("subsamples")) {
205 |     seurat_sampled <- seurat[, subsamples]
206 |   } else {
207 |     seurat_sampled <- seurat
208 |   }
209 |   counts <- as.matrix(seurat_sampled[["RNA"]]@counts)
210 |   if (is.na(data$normal_cells)) { #no reference given: use the cells CHETAH labelled as Macrophage
211 |     normal_cells <- rownames(seurat_sampled@meta.data[seurat_sampled$annotation_CHETAH %in% c("Macrophage"), ])
212 |     print("Running copykat with Macrophages as normal cells")
213 |     copykat.test <- copykat(rawmat=counts, id.type="S", ngene.chr=5, win.size=25, KS.cut=0.15, distance="euclidean", norm.cell.names=normal_cells, n.cores=4)
214 |   } else if (data$normal_cells == FALSE) { #explicitly no reference cells
215 |     print("Running copykat without normal cells")
216 |     copykat.test <- copykat(rawmat=counts, id.type="S", ngene.chr=5, win.size=25, KS.cut=0.15, distance="euclidean", norm.cell.names="", n.cores=4)
217 |   } else { #use the CHETAH label(s) named in data$normal_cells
218 |     normal_cells <- rownames(seurat_sampled@meta.data[seurat_sampled$annotation_CHETAH %in% c(data$normal_cells), ])
219 |     print(paste0("Running copykat with ", data$normal_cells, " as normal cells"))
220 |     copykat.test <- copykat(rawmat=counts, id.type="S", ngene.chr=5, win.size=25, KS.cut=0.15, distance="euclidean", norm.cell.names=normal_cells, n.cores=4)
221 |   }
222 |   pred.test <- data.frame(copykat.test$prediction)
223 |   pred.test <- pred.test[, "copykat.pred", drop = FALSE]
224 |   seurat@meta.data <- seurat@meta.data %>% #attach the prediction by cell name; cells without a prediction get NA
225 |     tibble::rownames_to_column("cell") %>%
226 |     left_join(tibble::rownames_to_column(pred.test, "cell"), by = "cell") %>%
227 |     tibble::column_to_rownames("cell")
228 |   
229 |   p1 <- DimPlot(seurat, group.by = "copykat.pred")
230 |   p2 <- DimPlot(seurat, group.by = "seurat_clusters", label = TRUE) + NoLegend()
231 |   if ("EPCAM" %in% rownames(seurat)) {
232 |     p3 <- FeaturePlot(seurat, features = "EPCAM")
233 |     p <- p1 + p2 + p3
234 |     ggsave(plot = p, filename = "out/plots/copyKat_umap.pdf", height = 5, width = 15)
235 |   } else { #two-panel figure only when EPCAM is absent, so the three-panel file is not overwritten
236 |     p <- p1 + p2
237 |     ggsave(plot = p, filename = "out/plots/copyKat_umap.pdf", height = 5, width = 10)
238 |   }
239 |   ##copykat recommendation
240 |   fraction_copykat <- seurat@meta.data %>% #per cluster: the most frequent non-missing copykat call
241 |     group_by(seurat_clusters, copykat.pred) %>%
242 |     tally(name = "nCells_copykat") %>%
243 |     filter(is.na(copykat.pred) == FALSE) %>%
244 |     mutate(fraction_copykat = round(nCells_copykat/sum(nCells_copykat), digits = 2)) %>%
245 |     arrange(desc(fraction_copykat), .by_group = TRUE) %>%
246 |     slice_head(n = 1) %>%
247 |     select(-nCells_copykat, -fraction_copykat)
248 |   
249 |   annotation <- inner_join(fraction_chetah, fraction_copykat, by = "seurat_clusters")
250 |   annotation$abbreviation <- as.character("")
251 |   annotation[annotation$fraction_CHETAH >= .8, "abbreviation"] <- annotation[annotation$fraction_CHETAH >= .8, "annotation_CHETAH"] #suggest the CHETAH label when it covers at least 80% of the cluster
252 |   annotation[annotation$copykat.pred == "aneuploid", "abbreviation"] <- "mal" #an aneuploid call overrides the CHETAH suggestion
253 | } else {
254 |   annotation <- fraction_chetah
255 |   annotation$abbreviation <- as.character("")
256 |   annotation[annotation$fraction_CHETAH >= .8, "abbreviation"] <- annotation[annotation$fraction_CHETAH >= .8, "annotation_CHETAH"]
257 | }
258 | 
259 | ##Write the per-cluster suggestions to annotation.xlsx; an existing file is kept and a copy is written instead
260 | annotation_file <- "out/annotation.xlsx"
261 | if (file.exists(annotation_file)) {
262 |   print("Not overwriting annotation.xlsx, saving as copy")
263 |   annotation_file <- "out/annotation_copy.xlsx"
264 | }
265 | write.xlsx(x = annotation, annotation_file)
266 | 
267 | # Plot cell markers
268 | # Dot plots and feature plots of the marker sheet, written to temp/annotation to support manual annotation
269 | print("STEP 4: CREATING MARKER GENE PLOTS")
270 | cell.markers <- readxl::read_excel(cellMarker_path) # columns used below: cell_type, gene, category
271 | markers <- list()
272 | for (i in as.character(na.omit(unique(cell.markers$cell_type)))) { # per cell type: its marker genes that are present in the object
273 |   temp <- rownames(seurat)[rownames(seurat) %in% na.omit(cell.markers[cell.markers$cell_type == i, "gene", drop = TRUE])]
274 |   if (length(temp) > 0) {
275 |     markers[[i]] <- temp
276 |   }
277 | }
278 | 
279 | #Idents(seurat) <- seurat$seurat_clusters #set seurat_clusters as idents
280 | temp <- AddModuleScore(seurat, features = markers) # scores go into a temporary copy; its "Cluster<n>" meta data columns are plotted next
281 | p <- DotPlot(temp, features = colnames(temp@meta.data)[grepl("Cluster[[:digit:]]", colnames(temp@meta.data))], group.by = "seurat_clusters", cluster.idents = TRUE) + scale_x_discrete(labels = names(markers)) + theme(axis.text.y = element_text(size = 8)) + RotatedAxis()
282 | ggsave(plot = p, filename = "temp/annotation/Dotplot_seuratClusters_geneModules.png", dpi = 300, height = 12, width = 12)
283 | 
284 | p0 <- DotPlot(seurat, features = unique(cell.markers$gene), group.by = "seurat_clusters", cluster.idents = TRUE) + theme(axis.text.y = element_text(size = 8)) + coord_flip() # uses the full gene list; the filter to genes in the object happens below
285 | ggsave(plot = p0, filename = "temp/annotation/Dotplot_seuratClusters_genes.png", dpi = 300, height = 12, width = 12)
286 | 
287 | p1 <- AugmentPlot(DimPlot(seurat, label = TRUE, label.size = 8))
288 | cell.markers <- cell.markers[cell.markers$gene %in% rownames(seurat), ]
289 | for (type in unique(cell.markers$category)) { # one combined figure per marker category
290 |   markers <- unique(cell.markers[cell.markers$category == type, ]$gene)
291 |   if (length(markers) >= 6) {p2 <- FeaturePlot(seurat, features = markers, pt.size = .1, ncol = 6)} else {p2 <- FeaturePlot(seurat, features = markers, pt.size = .1)}
292 |   p3 <- DotPlot(seurat, features = markers, group.by = "seurat_clusters", cluster.idents = TRUE) + theme(axis.text.y = element_text(size = 8)) + coord_flip() + NoLegend()
293 |   layout <- "
294 |   ACC
295 |   BBB
296 |   BBB
297 |   "
298 |   p <- p1 + p2 + p3 + plot_layout(design = layout) # areas A, B, C are filled by p1, p2, p3 in that order
299 |   ggsave(plot = p, filename = paste0("temp/annotation/", type, ".png"), height = 20, width = 30, dpi = 300)
300 | }
301 | 
302 | temp <- table(seurat$seurat_clusters, seurat$annotation_CHETAH)
303 | temp <- apply(temp, 1, function(x) x / sum(x)) # per-cluster fractions; apply() returns CHETAH labels as rows and clusters as columns
304 | pheatmap::pheatmap(temp, filename = "temp/annotation/cluster_comparison.pdf")
305 | 
306 | # Summary statistics
307 | # One-row table with the settings and cell/gene counts of this run, stored in the object's misc slot
308 | print("STEP 5: CREATING SUMMARY STATISTICS")
309 | harmony_summary <- data.frame(
310 |   Input_file = data$object_path,
311 |   Batch = data$batch,
312 |   QC_features_min = data$QC_feature_min,
313 |   QC_mito_max = data$QC_mt_max,
314 |   Variable_features = data$features_var,
315 |   PCA_dimensions = data$pca_dims,
316 |   Amount_genes = nrow(seurat),
317 |   Genes_detected_per_cell = median(seurat@meta.data$nFeature_RNA),
318 |   Cells_before_QC = cells_before_QC,
319 |   Cells_after_QC = ncol(seurat),
320 |   Cluster_resolution = data$cluster_resolution
321 | )
322 | seurat@misc <- list(harmony_summary)
323 | 
324 | # Save the processed object and write the updated settings back to data.json
325 | 
326 | print("STEP 6: SAVING RESULTS")
327 | saveRDS(seurat, "temp/harmony.rds")
328 | data <- toJSON(data)
329 | write(data, "out/data.json")
330 | print("ALL DONE")
331 | 


--------------------------------------------------------------------------------
/scProcessor_1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | SCRIPT='/gpfs01/home/glanl/scripts/IMMUcan/scProcessor_1.R'
 3 | BATCH='none' #Fill in
 4 | GENES='250' #Adapt
 5 | MITO='15' #Adapt
 6 | PCA='30' #Adapt
 7 | # NOTE(review): scProcessor_1.R in this repository stores commandArgs() in `args` but takes its settings from out/data.json; confirm the four values above are still used
 8 | ml R/3.6.3-foss-2016b
 9 | # Launch the R script through srun, passing the settings above as positional arguments
10 | srun --mem=100G --time=4:00:00 --cpus-per-task=5 --partition=bigmem Rscript ${SCRIPT} ${BATCH} ${GENES} ${MITO} ${PCA}
11 | 


--------------------------------------------------------------------------------
/scProcessor_2.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE) # NOTE(review): args is never read again in this script
  3 | object_path = "temp/harmony.rds" #harmony.rds file
  4 | if (file.exists("out/annotation.xlsx")) {
  5 |   annotationFile_path = "out/annotation.xlsx" #path to annotation file
  6 | } else {annotationFile_path = "out/annotation.xls"} #fallback path when no .xlsx annotation file exists
  7 | cellOntology_path = "/home/jordi_camps/IMMUcan/cell_ontology.xlsx" # sheet joined to the cells by abbreviation in STEP 1
  8 | verbose = FALSE # forwarded as verbose= to FindAllMarkers
  9 | if (!dir.exists("temp")) {dir.create("temp")} # create the working and output folders when missing
 10 | if (!dir.exists("temp/plots")) {dir.create("temp/plots")}
 11 | if (!dir.exists("out")) {dir.create("out")}
 12 | if (!dir.exists("out/plots")) {dir.create("out/plots")}
 13 | 
 14 | dir <- getwd() # also used at the end to name the zip archive
 15 | setwd(dir) # no-op: dir is already the working directory
 16 | print(dir)
 17 | if (!file.exists("temp/harmony.rds")) {
 18 |   stop("first run scProcessor_1.R")
 19 | }
 20 | suppressPackageStartupMessages({
 21 | library(sceasy)
 22 | library(reticulate)
 23 | use_condaenv('sceasy')
 24 | loompy <- reticulate::import('loompy')
 25 | library(Seurat)
 26 | library(SeuratDisk)
 27 | library(ggplot2)
 28 | library(patchwork)
 29 | library(Matrix)
 30 | library(dplyr)
 31 | library(genesorteR)
 32 | library(data.table)
 33 | library(future)
 34 | library(jsonlite)
 35 | })
 36 | suppressWarnings(RNGkind(sample.kind = "Rounding")) #select the "Rounding" sampler; warnings from this call are suppressed
 37 | set.seed(111)
 38 | options(future.globals.maxSize= 150000*1024^2) #raise the future package's limit on exported globals (value in bytes)
 39 | plan("multisession", workers = 12) #"multiprocess" is deprecated in the future package; multisession matches scProcessor_1.R
 40 | seurat <- readRDS(object_path)
 41 | data <- fromJSON("out/data.json")
 42 | if (isTRUE(ncol(seurat) > data$nSample)) {subsamples <- sample(ncol(seurat), data$nSample, replace = FALSE)} #isTRUE() also covers a missing or NA nSample; the subset is exported as the "10k" files
 43 | 
 44 | #makeReference: seuratObj is a Seurat object, groupBy the name of the meta data column holding the clusters. Returns the gene Shannon index values sorted ascending.
 45 | makeReference = function(seuratObj, groupBy) {
 46 |   clusters = factor(seuratObj@meta.data[, which(colnames(seuratObj@meta.data) == groupBy)])
 47 |   sorted = sortGenes(seuratObj@assays$RNA@counts, clusters, binarizeMethod = "naive", cores = 12)
 48 |   perm = getPValues(sorted, numPerm = 5, cores = 1)
 49 |   significant = apply(perm$adjpval, 1, function(p) any(p < 0.1)) #genes with an adjusted p-value below 0.1 in at least one cluster
 50 |   shannon = getMarkers(sorted)$gene_shannon_index
 51 |   #NOTE(review): shannon[[2]] is a single element, so max() returns it unchanged; confirm max(shannon) was not intended
 52 |   shannon[!significant] = max(shannon[[2]])
 53 |   return(sort(shannon, decreasing = FALSE))
 54 | }
 55 | 
 56 | # Annotate
 57 | # Rename clusters with the curated abbreviations, then attach the cell ontology columns to every cell
 58 | print("STEP 1: LINKING CELL ONTOLOGY")
 59 | anno_clust <- readxl::read_excel(annotationFile_path)
 60 | #Pair each abbreviation with its cluster by id when the sheet has a seurat_clusters column; row order is only the fallback
 61 | new.cluster.ids <- tolower(anno_clust$abbreviation)
 62 | names(new.cluster.ids) <- if ("seurat_clusters" %in% colnames(anno_clust)) as.character(anno_clust$seurat_clusters) else levels(seurat)
 63 | seurat <- RenameIdents(seurat, new.cluster.ids)
 64 | seurat$abbreviation <- Idents(seurat)
 65 | cell_ont <- readxl::read_excel(cellOntology_path)
 66 | cell_ont$abbreviation <- tolower(cell_ont$abbreviation) #both sides are lower-cased so the join is case-insensitive
 67 | seurat@meta.data <- seurat@meta.data %>%
 68 |   tibble::rownames_to_column("cell") %>%
 69 |   left_join(cell_ont, by = "abbreviation") %>%
 70 |   tibble::column_to_rownames("cell")
 71 | if (any(is.na(seurat$cell_ontology)) == TRUE) { #an abbreviation without a match in the ontology sheet stops the run
 72 |   print(distinct(seurat@meta.data[is.na(seurat$cell_ontology), c("abbreviation", "cell_ontology")]))
 73 |   stop("NOT ALL CELL TYPE ABBREVIATIONS FIT")
 74 | } 
 75 | Idents(seurat) <- seurat$seurat_clusters
 76 | 
 77 | # Set annotation labels carried by 10 cells or fewer to NA
 78 | for (i in data$annotation) {
 79 |   temp <- names(table(seurat@meta.data[[i]]))[table(seurat@meta.data[[i]]) <= 10] # labels of column i with a cell count of at most 10
 80 |   seurat@meta.data[seurat@meta.data[[i]] %in% temp, i] <- NA
 81 | }
 82 | 
 83 | # Plotting
 84 | # One UMAP per annotation column: a feature plot for numeric columns, a labelled group plot otherwise
 85 | temp <- colnames(seurat@meta.data)[tolower(colnames(seurat@meta.data)) %in% tolower(data$annotation)]
 86 | for (column in temp) {
 87 |   values <- seurat@meta.data[[column]]
 88 |   if (is.numeric(values)) {
 89 |     p <- FeaturePlot(seurat, features = column, reduction = "umap")
 90 |   } else {
 91 |     p <- DimPlot(seurat, reduction = "umap", pt.size = 1, group.by = column, label = TRUE)
 92 |     # The "Tableau 20" colours are only applied when the column has at most 20 distinct values
 93 |     if (length(unique(values)) <= 20) {p <- p + ggthemes::scale_color_tableau(palette = "Tableau 20")}
 94 |   }
 95 |   ggsave(plot = p, filename = paste0("out/plots/", column, ".png"), dpi = 300, width = 10, height = 10)
 96 | }
 97 | 
 98 | # One filled bar chart per metadata column: its composition within each cell_ontology class
 99 | temp <- colnames(seurat@meta.data)[tolower(colnames(seurat@meta.data)) %in% tolower(data$metadata)]
100 | for (column in temp) {
101 |   p <- ggplot(seurat@meta.data, aes_string(x = "cell_ontology", fill = column)) + geom_bar(position = "fill") + RotatedAxis()
102 |   ggsave(plot = p, filename = paste0("out/plots/", column, ".png"), dpi = 300, width = 10, height = 10)
103 | }
104 | 
105 | # DE
106 | print("STEP 2: CALCULATING MARKER GENES")
107 | #if (exists("subsamples")) {
108 | #  seurat_sampled <- seurat[, subsamples]
109 | #} else {
110 |   seurat_sampled <- seurat # subsampling is commented out, so all cells are used
111 | #}
112 | annoCounts <- list()
113 | for (i in data$annotation) { # per annotation column: marker genes and cell counts per label
114 |   Idents(seurat_sampled) <- seurat_sampled[[i, drop = TRUE]]
115 |   seurat.markers <- FindAllMarkers(seurat_sampled, only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, verbose = verbose)
116 |   write.table(seurat.markers, paste0("out/DE_", i, ".tsv"), sep = "\t")
117 |   temp <- table(seurat[[i]])
118 |   annoCounts[[i]] <- data.frame(annotation = i, cell_type = temp)
119 |   write.csv(data.table::rbindlist(annoCounts), file = "out/cell_count.csv", row.names = FALSE) # rewritten on every iteration with the counts gathered so far
120 | }
121 | 
122 | # Gene entropy ranking
123 | print("STEP 3: CALCULATING GENE ENTROPY RANKING")
124 | geneIndex <- list()
125 | for (i in data$annotation) {
126 |   if (length(unique(seurat_sampled@meta.data[[i]])) > 1) { # columns with a single distinct value are skipped
127 |     geneIndex[[i]] <- makeReference(seuratObj = seurat_sampled, groupBy = i)
128 |   }
129 | }
130 | geneIndex <- do.call(cbind, geneIndex) # NOTE(review): each makeReference() result is sorted separately and cbind() combines by position - confirm the rows line up per gene
131 | write.table(geneIndex, "out/gene_index.tsv", row.names = TRUE, sep = "\t")
132 | 
133 | 
134 | # Export
135 | print("STEP 4: SAVING RESULTS")
136 | #Seurat
137 | saveRDS(seurat, "out/harmony.rds")
138 | #SaveH5Seurat(seurat, filename = "out/harmony.h5Seurat", overwrite = TRUE)
139 | #Convert("out/harmony.h5Seurat", dest = "h5ad", overwrite = TRUE)
140 | 
141 | # Export average gene expression over cluster
142 | #write.csv(x = seurat[["RNA"]]@data, file = "out/normCounts.csv", row.names = TRUE)
143 | # One table per annotation level: names give the file suffix, values the meta data column
144 | avg_levels <- c(
145 |   major = "annotation_major",
146 |   immune = "annotation_immune",
147 |   minor = "annotation_minor",
148 |   CHETAH = "annotation_CHETAH"
149 | )
150 | for (suffix in names(avg_levels)) {
151 |   Idents(seurat) <- seurat[[avg_levels[[suffix]], drop = TRUE]]
152 |   temp <- AverageExpression(seurat, assays = "RNA")
153 |   write.table(x = temp$RNA, file = paste0("out/avgExpr_", suffix, ".tsv"), row.names = TRUE, sep = "\t")
154 | }
155 | 
156 | Idents(seurat) <- seurat$annotation_minor
157 | seurat@meta.data <- seurat@meta.data[, sapply(seurat@meta.data, function(x) length(unique(x))) != 1, drop = FALSE] #Remove all columns that have only one variable; one count per column avoids sapply() simplifying a nested result
158 | seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res|abbreviation|cell_id|cell.id", colnames(seurat@meta.data)), drop = FALSE] #drop helper columns; drop = FALSE keeps a data frame even if one column is left
159 | 
160 | # Convert to h5ad with sceasy for immediate use with cellxgene
161 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene.h5ad")
162 | 
163 | # Export metadata with umap coordinates
164 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata.tsv", row.names = TRUE, sep = "\t")
165 | 
166 | #Subsample object to the cells drawn at start-up (data$nSample); skipped when no subsample was drawn
167 | if (exists("subsamples")) {seurat <- seurat[, subsamples]}
168 | 
169 | # Export metadata with umap coordinates for the (possibly subsampled) object
170 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata_10k.tsv", row.names = TRUE, sep = "\t")
171 | 
172 | # Convert to h5ad with sceasy for immediate use with cellxgene
173 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene_10k.h5ad")
174 | 
175 | #zip and checksum
176 | print("STEP 5: ZIP AND CHECKSUM")
177 | folder_name <- tail(unlist(strsplit(dir, "/")), n=1) # last path component of the working directory
178 | dir.create(folder_name) # temporary staging folder inside the working directory
179 | out_files <- paste0("out/", list.files("out/"))
180 | file.copy(out_files, folder_name, recursive = TRUE)
181 | zip(paste0(folder_name, ".zip"), folder_name)
182 | checksum <- tools::md5sum(paste0(folder_name, ".zip"))
183 | file.rename(paste0(folder_name, ".zip"), paste0(folder_name, "_-_", checksum, ".zip")) # embed the md5 sum in the archive name
184 | file.copy(paste0(folder_name, "_-_", checksum, ".zip"), "../") # deliver the archive to the parent directory
185 | unlink(paste0(folder_name, "_-_", checksum, ".zip")) # then remove the local archive and the staging folder
186 | unlink(folder_name, recursive = TRUE)
187 | 


--------------------------------------------------------------------------------
/scProcessor_2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | SCRIPT='/gpfs01/home/glanl/scripts/IMMUcan/scProcessor_2.R'
 3 | # NOTE(review): the `module use` line below ends in `$`, which looks like a path list cut off when it was copied - confirm the full list
 4 | module use /gpfs01/sw/eb-2019/modules/all /gpfs01/sw/eb-rh7/modules/all /gpfs01$
 5 | ml Anaconda3/2020.02.lua
 6 | source activate	sceasy
 7 | ml R/3.6.3-foss-2016b
 8 | # Launch the R script through srun; no arguments are passed
 9 | srun --mem=100G --time=6:00:00 --cpus-per-task=5 --partition=bigmem Rscript ${SCRIPT}
10 | 
11 | 


--------------------------------------------------------------------------------
/scRNA_seq_database_summary_stat.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Global stats IMMUcan database"
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r, setup}
  7 | knitr::opts_knit$set(root.dir = "/Users/jordicamps/OneDrive - Bayer/IMMUcan/WP7/D1 - scRNAseq database/")
  8 | ```
  9 | 
 10 | 
 11 | ```{r, root.dir = TRUE}
 12 | library(dplyr)
 13 | library(readxl)
 14 | library(ggplot2)
 15 | library(patchwork)
 16 | library(stringr)
 17 | library(tidyr)
 18 | options(scipen = 999)
 19 | ```
 20 | 
 21 | ```{r}
 22 | theme_jc_vert <- theme(panel.border = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_text(colour = "black"))
 23 | theme_jc_hor <- theme(panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), axis.text = element_text(colour = "black"))
 24 | ```
 25 | 
 26 | ```{r}
 27 | ScaleDiscretePositionFunc <- ggproto( # ScaleDiscretePosition subclass that overrides get_limits
 28 |   "ScaleDiscretePositionReversed", ScaleDiscretePosition,
 29 |   get_limits = function(self) {
 30 |     if (self$is_empty()) {
 31 |       c(0, 1) # empty scale
 32 |     } else if (is.null(self$limits)) {
 33 |       self$range$range # no limits set: use the trained range
 34 |     } else if (is.function(self$limits)) {
 35 |       self$limits(self$range$range) # limits given as a function: apply it to the trained range
 36 |     } else {
 37 |       integer(0) # limits given as fixed values: an empty vector is returned
 38 |     }
 39 |   }
 40 | )
 41 | # Discrete x scale built on the class above; it passes `identity` as the palette argument and sets guide = "none"
 42 | scale_x_discrete2 <- function(..., expand = waiver(), position = "bottom") {
 43 |   sc <- discrete_scale(c("x", "xmin", "xmax", "xend"), "position_d", identity, ...,
 44 |                        expand = expand, guide = "none", position = position, super = ScaleDiscretePositionFunc)
 45 |   
 46 |   sc$range_c <- ggplot2:::continuous_range() # uses a non-exported ggplot2 function
 47 |   sc
 48 | }
 49 | ```
 50 | 
 51 | ```{r}
 52 | df <- read_excel("15052020_scRNAseq_database.xlsx", n_max = 75)  # n_max = 75: at most 75 data rows of the sheet are read
 53 | df
 54 | ```
 55 | 
 56 | ```{r}
 57 | df$`Cancer localization` <- tolower(df$`Cancer localization`)  # lower-case the text column
 58 | df$`Library construction` <- tolower(df$`Library construction`)  # lower-cased before it is used as the grouping key of the technology plot
 59 | df$cells_tenfive <- df$`Cell amount` / 100000  # cell count expressed in units of 100 000 (legend title "Cells per 100 000")
 60 | ```
 61 | 
 62 | ```{r fig.height=4, fig.width=5}
 63 | tech <- df %>%  # base plot for the technology panels: one stem and one dot per library-construction method
 64 |   group_by(`Library construction`) %>%
 65 |   summarise(n_patient = sum(`Number of Patients`, na.rm = TRUE), n_cells = sum(cells_tenfive, na.rm = TRUE), n = n()) %>%
 66 |   ggplot(aes(x = reorder(`Library construction`, -n_patient), y = n_patient)) +
 67 |   geom_segment(aes(xend = `Library construction`, yend = 0)) +  # stem from the patient total down to 0
 68 |   geom_point(aes(colour = n_cells, size = n)) +  # colour = summed cells_tenfive, size = number of rows in the group
 69 |   scale_size_continuous(range = c(2, 8))
 70 | ```
 71 | 
 72 | ```{r}
 73 | tech_vert <- tech +  # flipped variant of the technology plot
 74 |   coord_flip() +  # swap axes: technologies on the vertical axis, patients on the horizontal axis
 75 |   scale_y_continuous("Patients") + 
 76 |   scale_x_discrete("Technology", limits = rev) +  # limits = rev: reverse the category order
 77 |   scale_color_continuous(type = "viridis") +
 78 |   labs(size="Datasets", color="Cells per\n100 000") +
 79 |   guides(color = guide_colorbar(barwidth = 1, barheight = 3, ticks = FALSE)) +
 80 |   theme_bw() +  # complete theme first; the theme() and theme_jc_vert calls after it override parts of it
 81 |   theme(legend.position = "right", axis.title.y = element_blank(), plot.title = element_text(hjust=0.5)) +
 82 |   theme_jc_vert +
 83 |   ggtitle("Technology")
 84 | tech_vert
 85 | ```
 86 | 
 87 | ```{r}
 88 | tech_hor <- tech +  # unflipped variant of the technology plot, legend below the panel
 89 |   scale_y_continuous("Patients", limits = c(0, 400)) +  # NOTE(review): a group with more than 400 patients would fall outside these limits -- confirm against the data
 90 |   scale_x_discrete("Technology") +
 91 |   scale_color_continuous(type = "viridis") +
 92 |   labs(size="Datasets", color="Cells per\n100 000") +
 93 |   guides(color = guide_colorbar(barwidth = .5, barheight = 3, ticks = FALSE, direction = "vertical", title.position = "left"),
 94 |          size = guide_legend(ncol = 1)) +
 95 |   theme_bw() +  # complete theme first; the theme() and theme_jc_hor calls after it override parts of it
 96 |   theme(legend.position = "bottom", axis.title.x = element_blank(), plot.title = element_text(hjust=0.5), axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(size = 8), legend.text = element_text(size = 8)) +
 97 |   theme_jc_hor +
 98 |   ggtitle("Technology")
 99 | tech_hor
100 | ```
101 | 
102 | 
103 | ```{r fig.height=3, fig.width=5}
104 | treat <- df %>%  # base plot for the treatment panels: one stem and one dot per treatment type
105 |   group_by(`Treatment type`) %>%
106 |   # per group: patient total, summed cells_tenfive and number of rows
107 |   summarise(n_patient = sum(`Number of Patients`, na.rm = TRUE), n_cells = sum(cells_tenfive, na.rm = TRUE), n = n()) %>%
108 |   ggplot(aes(x = reorder(`Treatment type`, -n_patient), y = n_patient)) +
109 |   geom_segment(aes(xend = `Treatment type`, yend = 0)) +  # stem from the patient total down to 0
110 |   geom_point(aes(colour = n_cells, size = n)) +
111 |   scale_size_continuous(range = c(2, 8))
112 | ```
113 | 
114 | ```{r}
115 | treat_vert <- treat +  # flipped variant of the treatment plot
116 |   coord_flip() +  # swap axes: treatment types on the vertical axis, patients on the horizontal axis
117 |   scale_y_continuous("Patients") + 
118 |   scale_x_discrete("Treatment type", limits = rev) +  # limits = rev: reverse the category order
119 |   scale_color_continuous(type = "viridis") +
120 |   labs(size="Datasets", color="Cells per\n100 000") +
121 |   guides(color = guide_colorbar(barwidth = 1, barheight = 3, ticks = FALSE)) +
122 |   theme_bw() +  # complete theme first; the theme() and theme_jc_vert calls after it override parts of it
123 |   theme(legend.position = "right", axis.title.y = element_blank(), plot.title = element_text(hjust = 0.5)) +
124 |   ggtitle("Treatment") +
125 |   theme_jc_vert
126 | treat_vert
127 | ```
128 | 
129 | ```{r}
130 | treat_hor <- treat +  # unflipped variant of the treatment plot, legend below the panel
131 |   scale_y_continuous("Patients", limits = c(0, 400)) +  # NOTE(review): a group with more than 400 patients would fall outside these limits -- confirm against the data
132 |   scale_x_discrete("Treatment type") +
133 |   scale_color_continuous(type = "viridis") +
134 |   labs(size="Datasets", color="Cells per\n100 000") +
135 |   guides(color = guide_colorbar(barwidth = .5, barheight = 3, ticks = FALSE, direction = "vertical", title.position = "left"),
136 |          size = guide_legend(ncol = 1)) +
137 |   theme_bw() +  # complete theme first; the theme() and theme_jc_hor calls after it override parts of it
138 |   theme(legend.position = "bottom", axis.title.x = element_blank(), plot.title = element_text(hjust = 0.5),
139 |         axis.text.x = element_text(angle = 45, hjust = 1), legend.title = element_text(size = 8), legend.text = element_text(size = 8)) +
140 |   ggtitle("Treatment") +
141 |   theme_jc_hor
142 | treat_hor
143 | ```
144 | 
145 | 
146 | ```{r}
147 | cancer <- df %>%  # base plot for the cancer-type panels: one stem and one dot per cancer type abbreviation
148 |   group_by(`Cancer type abbreviation`) %>%
149 |   summarise(n_patient = sum(`Number of Patients`, na.rm = TRUE), n_cells = sum(cells_tenfive, na.rm = TRUE), n = n()) %>%
150 |   ggplot(aes(x = reorder(`Cancer type abbreviation`, -n_patient), y = n_patient)) +
151 |   geom_segment(aes(xend = `Cancer type abbreviation`, yend = 0)) +  # stem from the patient total down to 0
152 |   geom_point(aes(colour = n_cells, size = n)) +  # colour = summed cells_tenfive, size = number of rows in the group
153 |   scale_size_continuous(range = c(2, 8))
154 | ```
155 | 
156 | ```{r}
157 | cancer_vert <- cancer +  # flipped variant of the cancer-type plot
158 |   coord_flip() +  # swap axes: cancer types on the vertical axis, patients on the horizontal axis
159 |   scale_y_continuous("Patients") +
160 |   scale_x_discrete("Cancer type", limits = rev) +  # limits = rev: reverse the category order
161 |   viridis::scale_color_viridis(option = "viridis") +  # called via viridis:: (the package is not attached in this document); the technology/treatment/cell-type plots use scale_color_continuous(type = "viridis") instead
162 |   #scale_size(breaks = c(3, 6, 9)) +
163 |   theme_bw() +  # complete theme first; the theme_jc_vert and theme() calls after it override parts of it
164 |   labs(size="Datasets", color="Cells per\n100 000") +
165 |   guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) +
166 |   theme_jc_vert +
167 |   theme(axis.title.y = element_blank(), legend.position = "right", plot.title = element_text(hjust = 0.5)) +
168 |   ggtitle("Cancer type")
169 | cancer_vert
170 | ```
171 | 
172 | ```{r fig.height=4.5, fig.width=9}
173 | cancer_hor <- cancer +  # unflipped variant of the cancer-type plot; both axis titles are blanked below
174 |   scale_y_continuous("Patients") +
175 |   scale_x_discrete("Cancer type") +
176 |   viridis::scale_color_viridis(option = "viridis") +
177 |   #scale_size(breaks = c(3, 6, 9)) +
178 |   theme_bw() +  # complete theme first; the theme_jc_hor and theme() calls after it override parts of it
179 |   labs(size="Datasets", color="Cells per\n100 000") +
180 |   guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) +
181 |   theme_jc_hor +
182 |   theme(axis.title.y = element_blank(), legend.position = "right", plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())
183 |   #ggtitle("Cancer type")
184 | cancer_hor
185 | ggsave("plot_cancer_horizontal.pdf", plot = cancer_hor, dpi = 300)  # pass the plot explicitly instead of relying on last_plot()
186 | ```
187 | 
188 | 
189 | ```{r fig.height=4, fig.width=8}
190 | cell_type <- df %>%  # base plot for the cell-type panels: one stem and one dot per enrichment cell type
191 |   group_by(`Enrichment cell types`) %>%
192 |   summarise(n_patient = sum(`Number of Patients`, na.rm = TRUE), n_cells = sum(cells_tenfive, na.rm = TRUE), n = n()) %>%
193 |   ggplot(aes(x = reorder(`Enrichment cell types`, -n_patient), y = n_patient)) +
194 |   geom_segment(aes(xend = `Enrichment cell types`, yend = 0)) +  # stem from the patient total down to 0
195 |   geom_point(aes(colour = n_cells, size = n)) +  # colour = summed cells_tenfive, size = number of rows in the group
196 |   scale_size_continuous(range = c(2, 8))
197 | ```
198 | 
199 | ```{r}
200 | cell_type_vert <- cell_type +  # flipped variant of the cell-type plot
201 |   coord_flip() +  # swap axes: cell types on the vertical axis, patients on the horizontal axis
202 |   scale_y_continuous("Patients") + 
203 |   scale_x_discrete("Cell types", limits = rev) +  # limits = rev: reverse the category order
204 |   scale_color_continuous(type = "viridis") +
205 |   labs(size="Datasets", color="Cells per\n100 000") +
206 |   guides(color = guide_colorbar(barwidth = 1, barheight = 5, ticks = FALSE)) +
207 |   theme_bw() +  # complete theme first; the theme() and theme_jc_vert calls after it override parts of it
208 |   theme(axis.title.y = element_blank(), plot.title = element_text(hjust = 0.5)) +
209 |   theme_jc_vert +
210 |   ggtitle("Cell type enrichment")
211 | cell_type_vert
212 | ```
213 | 
214 | ```{r}
215 | cell_type_hor <- cell_type +  # unflipped variant of the cell-type plot, legend below the panel
216 |   scale_y_continuous("Patients", limits = c(0, 400)) +  # NOTE(review): a group with more than 400 patients would fall outside these limits -- confirm against the data
217 |   scale_x_discrete("Cell types") +
218 |   scale_color_continuous(type = "viridis") +
219 |   labs(size="Datasets", color="Cells per\n100 000") +  # colour shows cells_tenfive (cells / 100 000), so the title carries the unit like the other plots
220 |   guides(color = guide_colorbar(barwidth = .5, barheight = 3, ticks = FALSE, direction = "vertical", title.position = "left"),
221 |          size = guide_legend(ncol = 1)) +
222 |   theme_bw() +  # complete theme first; the theme() and theme_jc_hor calls after it override parts of it
223 |   theme(axis.title.x = element_blank(), plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom", legend.title = element_text(size = 8), legend.text = element_text(size = 8)) +
224 |   theme_jc_hor +
225 |   ggtitle("Cell type enrichment")
226 | cell_type_hor
227 | ```
228 | 
229 | ## Plots patchwork
230 | ```{r}
231 | layout <- "
232 | AABBDD
233 | AABBDD
234 | AACCDD
235 | "
236 | ```
237 | 
238 | ```{r fig.height=6, fig.width=12}
239 | cancer_vert + cell_type_vert + treat_vert + tech_vert +  # presumably mapped to layout areas A, B, C, D in this order -- verify against the patchwork design docs
240 |   plot_layout(design = layout)
241 | ggsave("TME_table_quant.pdf", dpi = 300)  # no plot= argument: ggsave() writes the last plot displayed, presumably the patchwork above
242 | ```
243 | 
244 | ## Plots Jasna
245 | ```{r}
246 | layout <- "
247 | AABCC
248 | "
249 | ```
250 | 
251 | ```{r fig.height=5, fig.width=10}
252 | cell_type_hor + treat_hor + tech_hor +  # presumably mapped to layout areas A, B, C in this order -- verify against the patchwork design docs
253 |   plot_layout(design = layout)
254 | ggsave("plot_cellType_treatment_tech_horizontal.pdf", dpi = 300)  # no plot= argument: ggsave() writes the last plot displayed, presumably the patchwork above
255 | ```
256 | 
257 | 
258 | 
259 | ```{r}
260 | sum(df$`Number of Patients`, na.rm = TRUE)  # total patients over all rows of df, ignoring NA
261 | sum(df$`Cell amount`, na.rm = TRUE)  # total cells over all rows of df, ignoring NA
262 | ```
263 | 
264 | 


--------------------------------------------------------------------------------
/tidy_metadata.R:
--------------------------------------------------------------------------------
 1 | dir <- getwd()  # the script works relative to the directory it is started from; its last path component names the zip at the end
 2 | setwd(dir)  # no-op: dir is already the working directory
 3 | print(dir)
 4 | 
 5 | suppressPackageStartupMessages({
 6 |   library(sceasy)
 7 |   library(reticulate)
 8 |   use_condaenv('sceasy')
 9 |   loompy <- reticulate::import('loompy')
10 |   library(Seurat)
11 |   library(tidyverse)
12 |   library(readxl)
13 |   library(jsonlite)
14 | })
15 | 
16 | tidy_metadata_path <- "/home/jordi_camps/IMMUcan/tidy_metadata.xlsx"  # lookup table; its col_names and general columns are used below
17 | seurat_obj <- "out/harmony.rds"
18 | data <- fromJSON("out/data.json")
19 | 
20 | meta_cols <- read_excel(tidy_metadata_path)
21 | seurat <- readRDS(seurat_obj)
22 | # Draw subsample column indices only when nSample is set and smaller than ncol(seurat); isTRUE() also covers a missing (NULL) or NA nSample, which made the bare if() fail
23 | if (isTRUE(ncol(seurat) > data$nSample)) {subsamples <- sample(ncol(seurat), data$nSample, replace = FALSE)}
24 | glimpse(seurat@meta.data)
25 | 
26 | # Rename meta.data columns whose normalised name (lower case, "." -> "_") is listed in meta_cols$col_names to meta_cols$general
27 | names <- tolower(colnames(seurat@meta.data))
28 | names <- gsub("\\.", "_", names)
29 | 
30 | if (any(meta_cols$col_names %in% names)) {
31 |   clean <- FALSE  # at least one column is renamed, so data.json is rewritten further down
32 |   change_cols <- names[names %in% meta_cols$col_names]
33 |   for (i in change_cols) {
34 |     hit_1 <- names == i  # exact match: grepl(i, names) treated i as a regex and also hit columns that merely contain it
35 |     hit_2 <- meta_cols$col_names %in% i
36 |     print(paste0("changing ", i, " to ", meta_cols$general[hit_2]))
37 |     colnames(seurat@meta.data)[hit_1] <- meta_cols$general[hit_2]
38 |   }
39 | } else {
40 |   clean <- TRUE
41 |   print("No meta.data columns to tidy up")
42 | }
43 | 
44 | print("Updating data.json")
45 | names <- tolower(data$metadata)  # same normalisation as for the meta.data column names
46 | names <- gsub("\\.", "_", names)
47 | if (isFALSE(clean)) {  # only rewrite data.json when meta.data columns were renamed
48 |   change_cols <- names[names %in% meta_cols$col_names]
49 |   for (i in change_cols) {
50 |     hit_1 <- names == i  # exact match instead of grepl(): a substring hit must not rename an unrelated entry
51 |     hit_2 <- meta_cols$col_names %in% i
52 |     data$metadata[hit_1] <- meta_cols$general[hit_2]
53 |   }
54 |   names <- tolower(data$annotation)
55 |   names <- gsub("\\.", "_", names)
56 |   change_cols <- names[names %in% meta_cols$col_names]
57 |   for (i in change_cols) {
58 |     hit_1 <- names == i  # exact match, as above
59 |     hit_2 <- meta_cols$col_names %in% i
60 |     data$annotation[hit_1] <- meta_cols$general[hit_2]
61 |   }
62 |   data <- toJSON(data)
63 |   write(data, "out/data.json")
64 | }
65 | 
66 | print("Saving objects")
67 | Idents(seurat) <- seurat$annotation_minor
68 | saveRDS(seurat, "out/harmony.rds")  # saved before pruning, so the .rds keeps every meta.data column
69 | seurat@meta.data <- seurat@meta.data[, vapply(seurat@meta.data, function(x) length(unique(x)), integer(1)) != 1, drop = FALSE] # Remove all columns that hold a single unique value; vapply avoids the matrix simplification of the nested sapply
70 | seurat@meta.data <- seurat@meta.data[, !grepl("RNA_snn_res|abbreviation|cell_id|cell.id|cell_id", colnames(seurat@meta.data)), drop = FALSE]
71 | 
72 | # Convert the full object to h5ad with sceasy for immediate use with cellxgene
73 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene.h5ad")
74 | 
75 | # Export the remaining metadata columns together with the umap coordinates (full object)
76 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata.tsv", row.names = TRUE, sep = "\t")
77 | 
78 | # Subsample the object if `subsamples` exists; its size comes from data$nSample, the "10k" in the file names below is fixed
79 | if (exists("subsamples")) {seurat <- seurat[, subsamples]}
80 | 
81 | # Export metadata with umap coordinates again, now for the (possibly subsampled) object
82 | write.table(x = cbind(seurat@meta.data, seurat@reductions$umap@cell.embeddings), file = "out/metadata_10k.tsv", row.names = TRUE, sep = "\t")
83 | 
84 | # Convert the (possibly subsampled) object to h5ad with sceasy for immediate use with cellxgene
85 | sceasy::convertFormat(seurat, from="seurat", to="anndata", outFile= "out/cellxgene_10k.h5ad")
86 | 
87 | # Zip the contents of out/ into <folder>_-_<md5>.zip and copy it to the parent directory; <folder> is the working directory's name
88 | print("STEP 5: ZIP AND CHECKSUM")
89 | folder_name <- basename(dir)
90 | dir.create(folder_name)  # staging folder that becomes the top-level directory inside the archive
91 | file.copy(list.files("out", full.names = TRUE), folder_name, recursive = TRUE)
92 | zip_file <- paste0(folder_name, ".zip")
93 | zip(zip_file, folder_name)
94 | final_zip <- paste0(folder_name, "_-_", tools::md5sum(zip_file), ".zip")  # embed the md5 checksum in the file name
95 | file.rename(zip_file, final_zip)
96 | copied <- file.copy(final_zip, "../")  # FALSE when the copy fails, e.g. because the target already exists
97 | if (copied) unlink(final_zip) else warning("Could not copy ", final_zip, " to ../ - keeping the local archive")
98 | unlink(folder_name, recursive = TRUE)


--------------------------------------------------------------------------------
/tidy_metadata.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ImmucanWP7/immucan-scdb/51b8ca98c255823a23213f0744b3779640ef75ea/tidy_metadata.xlsx


--------------------------------------------------------------------------------
/zip_checksum.R:
--------------------------------------------------------------------------------
 1 | dir <- getwd()
 2 | print(dir)
 3 | 
 4 | # Bundle the contents of out/ into <folder>_-_<md5>.zip and copy the archive to the parent directory.
 5 | # <folder> is the name of the current working directory.
 6 | folder_name <- basename(dir)
 7 | dir.create(folder_name)  # staging folder that becomes the top-level directory inside the archive
 8 | file.copy(list.files("out", full.names = TRUE), folder_name, recursive = TRUE)
 9 | zip_file <- paste0(folder_name, ".zip")
10 | zip(zip_file, folder_name)
11 | final_zip <- paste0(folder_name, "_-_", tools::md5sum(zip_file), ".zip")  # embed the md5 checksum in the file name
12 | file.rename(zip_file, final_zip)
13 | # Delete the local archive only after the copy succeeded; file.copy() returns FALSE e.g. when the target already exists
14 | if (file.copy(final_zip, "../")) unlink(final_zip) else warning("Could not copy ", final_zip, " to ../ - keeping the local archive")
15 | unlink(folder_name, recursive = TRUE)


--------------------------------------------------------------------------------