├── Benchmarking-Integration-Methods ├── 1-run_harmony.R ├── 2-run_scanorama.py ├── 3-compute_lisi.R ├── 4-plot_lisi.R ├── 5-cluster_harmony.R ├── 6-cluster_scanorama.py └── 7-plot_umaps.R ├── Integration ├── 10_filter_annotate.R ├── 1_process_gene_names_objects.R ├── 2_find_batch_specific_genes.R ├── 3_normalize_each_dataset.R ├── 4_find_integration_features.R ├── 5_scale_each_dataset.R ├── 6_find_integration_anchors.R ├── 7_integration.R ├── 8_process_integrated_obj.R ├── 9_cluster_atlas.R └── README.md ├── Patient stratification ├── 1_create_proportions_dataset.R ├── 2_cluster_patients.R ├── 3_random_forest.R └── README.md ├── Projections ├── README.md ├── atlas_projection.Rmd ├── cancer_subtype_palette.rds └── cell_type_palette.rds ├── README.md ├── Random Forest ├── 1-calculate_cell_type_signatures.R ├── 2-create_test_sets_for_cross_validation.R ├── 3-run_random_forest.R ├── 4-evaluate_performance.R ├── 5-plot_figures.R ├── README.md └── data │ └── supp_table_2.xlsx ├── ST-breast ├── 1-10x_breast_QC.Rmd ├── 2-10x_breast_GO_enrichment.Rmd ├── 3-10x_breast_immune_reference.Rmd ├── 4-10x_breast_stratification.Rmd ├── RCTD-10x_breast_immune_reference.Rmd ├── RCTD-SPOTlight_comparison.Rmd └── README.md ├── ST-oropharyngeal ├── 1-australia_oroph_processing.Rmd ├── 2-australia_oroph_biological.Rmd ├── 3-australia_oroph_deconv.Rmd ├── 4-australia_oro_stratification.Rmd ├── 6-australia_oro_digitalFISH.Rmd ├── RCTD-SPOTlight_comparison.Rmd ├── RCTD-australia_oroph_deconv.Rmd ├── README.md ├── make_oroph_deconv.cmd └── spotlight_deconv_oroph_job.R ├── misc ├── SPOTlight_deconvolution_job.R ├── atlas.png ├── col_df_scrpt.R └── paths.R └── utils ├── bin.r ├── process_gene_names.R └── spatial_plot_spaniel.R /Benchmarking-Integration-Methods/1-run_harmony.R: -------------------------------------------------------------------------------- 1 | # This script computes harmony for the TICA atlas and saves the resulting 2 | # batch-corrected PCs and Seurat object 3 | 4 | 5 | # Load packages 6 | library(Seurat) 7 | library(tidyverse) 8 | library(harmony) 9 | library(SeuratWrappers) 10 | 11 | 12 | # Load data 13 | tica <- readRDS("data/TICAtlas.rds") 14 | 15 | 16 | # Save Seurat-corrected PCA coordinates 17 | saveRDS(tica@reductions$pca@cell.embeddings, "results/Seurat_v3_corrected_pca_coordinates.rds") 18 | 19 | 20 | # Run Harmony 21 | DefaultAssay(tica) <- "RNA" 22 | tica <- tica %>% 23 | FindVariableFeatures(nfeatures = 2000, verbose = FALSE, selection.method = "vst") %>% 24 | ScaleData() %>% 25 | RunPCA() 26 | print(tica@reductions) 27 | tica <- RunHarmony(tica, dims = 1:50, reduction = "pca", group.by.vars = "source") 28 | tica <- RunUMAP(tica, dims = 1:50, reduction = "harmony") 29 | tica <- FindNeighbors(tica, dims = 1:50, reduction = "harmony") 30 | 31 | 32 | # Save 33 | saveRDS(tica@reductions$pca@cell.embeddings, "results/uncorrected_pca_coordinates.rds") 34 | write_csv(as.data.frame(tica@reductions$pca@cell.embeddings), "results/uncorrected_pca_coordinates.csv", col_names = TRUE) 35 | saveRDS(tica@reductions$harmony@cell.embeddings, "results/harmony_corrected_pca_coordinates.rds") 36 | write_csv(rownames_to_column(tica@meta.data, var = "cell_barcode"), "results/tica_metadata.csv", col_names = TRUE) 37 | saveRDS(tica, "results/tica_with_harmony.rds") 38 | 39 | 40 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/2-run_scanorama.py: -------------------------------------------------------------------------------- 1 | # This script runs scanorama to obtain a batch-corrected dimensionality reduction matrix. 2 | # We will follow the pipeline described in https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_03_integration.html 3 | 4 | # Import modules 5 | import numpy as np 6 | import pandas as pd 7 | import scanpy as sc 8 | import anndata 9 | import scanorama 10 | 11 | 12 | # Read data 13 | path_to_data = "data/TICAtlas.h5ad" 14 | tica = sc.read_h5ad(path_to_data) 15 | 16 | 17 | # Find highly variable genes 18 | sc.pp.highly_variable_genes(tica, min_mean = 0.0125, max_mean = 3, min_disp = 0.5, batch_key = 'source') 19 | var_genes = tica.var.highly_variable 20 | var_genes = list(var_genes.index[var_genes]) 21 | 22 | 23 | # Run Scanorama 24 | batches = tica.obs['source'].cat.categories.tolist() 25 | alldata = {} 26 | for batch in batches: 27 | alldata[batch] = tica[tica.obs['source'] == batch, var_genes] 28 | tica_list = list(alldata.values()) 29 | scanorama.integrate_scanpy(tica_list, dimred = 50) 30 | tica_list[0].obsm['X_scanorama'].shape 31 | scanorama_int = [ad.obsm['X_scanorama'] for ad in tica_list] 32 | cell_barcodes = [] 33 | for ad in tica_list: 34 | cell_barcodes.extend(ad.obs_names) 35 | tica = tica[cell_barcodes, :] 36 | all_s = np.concatenate(scanorama_int) 37 | print(all_s.shape) 38 | tica.obsm["Scanorama"] = all_s 39 | #cell_barcodes = [] 40 | #for ad in tica_list: 41 | # cell_barcodes.extend(ad.obs_names) 42 | #tica = tica[cell_barcodes, :] 43 | 44 | 45 | # Run UMAP 46 | sc.pp.neighbors(tica, n_pcs = 50, use_rep = "Scanorama") 47 | sc.tl.umap(tica) 48 | 49 | 50 | # Save 51 | tica.__dict__['_raw'].__dict__['_var'] = tica.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'}) 52 | tica.write_h5ad("results/TICAtlas_scanorama3.h5ad") 53 | dimred = tica.obsm["Scanorama"] 54 | dimred_df = pd.DataFrame(dimred) 55 | dimred_df.columns = ["Scanorama_{}".format(str(x + 1)) for x in dimred_df.columns] 56 | dimred_df.insert(0, "cell_barcode", tica.obs_names, True) 57 | dimred_df.to_csv("results/Scanorama_corrected_pca_coordinates3.csv", index = False) 58 | umap_df = pd.DataFrame(tica.obsm["X_umap"]) 59 | umap_df.columns = ["UMAP1", "UMAP2"] 60 | umap_df.insert(0, "cell_barcode", tica.obs_names, True) 61 | umap_df.to_csv("results/Scanorama_corrected_umap_coordinates3.csv", index = False) 62 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/3-compute_lisi.R: -------------------------------------------------------------------------------- 1 | # This script computes the Local Inverse Simpson Index for the following cases: 2 | # 1. Unintegrated PCA 3 | # 2. Seurat v3-corrected PCA 4 | # 3. Harmony-corrected PCA 5 | # 4. Scanorama-corrected PCA 6 | 7 | 8 | # Load packages 9 | library(tidyverse) 10 | library(lisi) 11 | 12 | 13 | # Load data 14 | tica_metadata <- read_csv("results/tica_metadata.csv", col_names = TRUE) 15 | uncorrected_coords <- readRDS("results/uncorrected_pca_coordinates.rds") 16 | seurat_coords <- readRDS("results/Seurat_v3_corrected_pca_coordinates.rds") 17 | harmony_coords <- readRDS("results/harmony_corrected_pca_coordinates.rds") 18 | scanorama_coords <- read_csv("results/Scanorama_corrected_pca_coordinates3.csv") 19 | selected_cols <- str_subset(colnames(scanorama_coords), "^Scanorama_") 20 | scanorama_coords_mat <- scanorama_coords[, selected_cols] 21 | scanorama_coords_mat <- as.matrix(scanorama_coords_mat) 22 | rownames(scanorama_coords_mat) <- scanorama_coords$cell_barcode 23 | scanorama_coords_mat <- scanorama_coords_mat[rownames(harmony_coords), ] 24 | 25 | 26 | # Define technical and biological variables 27 | meta_data <- data.frame(source = tica_metadata$source) 28 | rownames(meta_data) <- tica_metadata$cell_barcode 29 | if (all(rownames(meta_data) == rownames(uncorrected_coords))) { 30 | print("row names are equal") 31 | } else { 32 | warning("row names are not equal!") 33 | } 34 | 35 | 36 | # Compute LISI 37 | dim_red_mats <- list( 38 | uncorrected_coords, 39 | seurat_coords, 40 | harmony_coords, 41 | scanorama_coords_mat 42 | ) 43 | names(dim_red_mats) <- c("uncorrected", "Seurat v3", "Harmony", 44 | "Scanorama") 45 | lisi_scores <- purrr::map(dim_red_mats, function(mat) { 46 | scores <- compute_lisi( 47 | X = mat, 48 | meta_data = meta_data, 49 | label_colnames = "source" 50 | ) 51 | }) 52 | lisi_scores <- bind_rows(lisi_scores, .id = "correction") 53 | 54 | 55 | # Save 56 | saveRDS(lisi_scores, "results/lisi_scores.rds") 57 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/4-plot_lisi.R: -------------------------------------------------------------------------------- 1 | # This script plots the Local Inverse Simpson Index (LISI) 2 | 3 | 4 | # Load packages 5 | library(tidyverse) 6 | library(ggridges) 7 | 8 | 9 | # Load data frame 10 | lisi_df <- readRDS("results/lisi_scores.rds") 11 | 12 | 13 | # Plot LISI 14 | sorted_corrections <- c("uncorrected", "Seurat v3", "Harmony", "Scanorama") 15 | palette <- c("#999999", "#92e7df", "#612c63", "#e5624f") 16 | lisi_scores_gg <- lisi_df %>% 17 | mutate(correction = factor(correction, levels = rev(sorted_corrections))) %>% 18 | ggplot(aes(source, correction, fill = correction)) + 19 | geom_density_ridges( 20 | quantile_lines = TRUE, 21 | quantile_fun = median 22 | ) + 23 | # geom_violin() + 24 | scale_fill_manual(values = palette) + 25 | labs(x = "iLISI", y = "") + 26 | theme_classic() + 27 | theme( 28 | legend.position = "none", 29 | axis.title.x = element_text(color = "black", size = 13), 30 | axis.text.y = element_text(color = "black", size = 12), 31 | axis.text.x = element_text(size = 11) 32 | ) 33 | 34 | 35 | # Save 36 | ggsave( 37 | filename = "results/plots/iLISI_TICAtlas.pdf", 38 | plot = lisi_scores_gg, 39 | width = 14, 40 | height = 8, 41 | units = "cm" 42 | ) 43 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/5-cluster_harmony.R: -------------------------------------------------------------------------------- 1 | # This script clusters the cells in the TICAtlas at varying resolutions using 2 | # the Harmony-corrected PC as features. 3 | 4 | 5 | # Load packages 6 | library(tidyverse) 7 | library(Seurat) 8 | 9 | 10 | # Load Seurat object 11 | tica <- readRDS("results/tica_with_harmony.rds") 12 | 13 | 14 | # Cluster 15 | tica <- FindClusters(tica, resolution = c(0.01, 0.05, 0.075, 0.1, 0.2)) 16 | 17 | 18 | # Save UMAP coordinates and clusters 19 | harmony_df <- data.frame( 20 | cell_barcode = colnames(tica), 21 | UMAP1 = tica@reductions$umap@cell.embeddings[, "UMAP_1"], 22 | UMAP2 = tica@reductions$umap@cell.embeddings[, "UMAP_2"], 23 | cell_type = tica$cell_type, 24 | cancer_subtype = tica$subtype, 25 | cluster_res_0.01 = tica$RNA_snn_res.0.01, 26 | cluster_res_0.05 = tica$RNA_snn_res.0.05, 27 | cluster_res_0.075 = tica$RNA_snn_res.0.075, 28 | cluster_res_0.1 = tica$RNA_snn_res.0.1, 29 | cluster_res_0.2 = tica$RNA_snn_res.0.2 30 | ) 31 | saveRDS(harmony_df, "results/umap_harmony_with_clusters.rds") 32 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/6-cluster_scanorama.py: -------------------------------------------------------------------------------- 1 | # This script clusters the cells in the TICAtlas at varying resolutions using 2 | # the Scanorama-corrected PC as features. 3 | 4 | 5 | # Load packages 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | import anndata 10 | 11 | 12 | # Read data 13 | path_to_data = "results/TICAtlas_scanorama3.h5ad" 14 | tica = sc.read_h5ad(path_to_data) 15 | 16 | 17 | 18 | # Cluster at varying resolutions 19 | for res in [0.25, 0.5, 0.75, 1]: 20 | sc.tl.leiden(tica, resolution = res) 21 | tica.obs["leiden_res{}".format(res)] = tica.obs["leiden"] 22 | 23 | 24 | 25 | # Save UMAP coords and clusters to visualize in a downstream script 26 | scanorama_df = { 27 | "cell_barcode": tica.obs.index, 28 | "UMAP1": tica.obsm["X_umap"][:, 0], 29 | "UMAP2": tica.obsm["X_umap"][:, 1], 30 | "cell_type": tica.obs["cell_type"], 31 | "cancer_subtype": tica.obs["subtype"], 32 | "leiden_res0.25": tica.obs["leiden_res0.25"], 33 | "leiden_res0.5": tica.obs["leiden_res0.5"], 34 | "leiden_res0.75": tica.obs["leiden_res0.75"], 35 | "leiden_res1": tica.obs["leiden_res1"] 36 | } 37 | scanorama_df = pd.DataFrame(scanorama_df) 38 | scanorama_df.to_csv("results/umap_scanorama_with_clusters.csv", index = False) 39 | -------------------------------------------------------------------------------- /Benchmarking-Integration-Methods/7-plot_umaps.R: -------------------------------------------------------------------------------- 1 | # This script plots the UMAP for Harmony and Scanorama integration of the 2 | # TICAtlas 3 | 4 | 5 | # Load packages 6 | library(Seurat) 7 | library(tidyverse) 8 | 9 | 10 | # Read data 11 | umap_harmony_df <- readRDS("results/umap_harmony_with_clusters.rds") 12 | umap_scanorama_df <- read_csv("results/umap_scanorama_with_clusters.csv") 13 | umap_scanorama_df <- as.data.frame(umap_scanorama_df) 14 | rownames(umap_scanorama_df) <- umap_scanorama_df$cell_barcode 15 | umap_scanorama_df <- umap_scanorama_df[rownames(umap_harmony_df), ] 16 | all(rownames(umap_harmony_df) == rownames(umap_scanorama_df)) 17 | umap_scanorama_df$cell_type <- umap_harmony_df$cell_type 18 | 19 | 20 | # UMAP colored by cell type 21 | cell_type_palette <- readRDS("data/cell_type_palette.rds") 22 | dfs <- list(harmony = umap_harmony_df, scanorama = umap_scanorama_df) 23 | umaps_cell_type <- purrr::map(dfs, function(df) { 24 | p <- df %>% 25 | ggplot(aes(UMAP1, UMAP2, color = cell_type)) + 26 | geom_point(size = 0.1) + 27 | scale_color_manual(values = cell_type_palette) + 28 | theme_classic() + 29 | theme( 30 | legend.position = "none", 31 | axis.title = element_blank(), 32 | axis.text = element_blank(), 33 | axis.ticks = element_blank(), 34 | axis.line = element_blank() 35 | ) 36 | p 37 | }) 38 | 39 | 40 | # UMAP colored by cancer subtype 41 | cancer_subtype_palette <- readRDS("data/cancer_subtype_palette.rds") 42 | cancer_subtype_palette <- c(cancer_subtype_palette, CM = "#ee6a50") 43 | umaps_cancer_subtype <- purrr::map(dfs, function(df) { 44 | p <- df %>% 45 | ggplot(aes(UMAP1, UMAP2, color = cancer_subtype)) + 46 | geom_point(size = 0.1) + 47 | scale_color_manual(values = cancer_subtype_palette) + 48 | theme_classic() + 49 | theme( 50 | legend.position = "none", 51 | axis.title = element_blank(), 52 | axis.text = element_blank(), 53 | axis.ticks = element_blank(), 54 | axis.line = element_blank() 55 | ) 56 | p 57 | }) 58 | 59 | 60 | # UMAPs colored by cluster 61 | # Harmony 62 | cluster_vars <- str_subset(colnames(dfs$harmony), "cluster_res_") 63 | umap_cluster_harmony <- purrr::map(cluster_vars, function(x) { 64 | p <- umap_harmony_df %>% 65 | ggplot(aes_string("UMAP1", "UMAP2", color = x)) + 66 | geom_point(size = 0.1) + 67 | ggtitle(str_remove(x, "cluster_res_")) + 68 | theme_classic() + 69 | theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold")) 70 | p 71 | }) 72 | 73 | fct_cells_cell_type <- umap_harmony_df %>% 74 | select("cell_type", "cluster_res_0.05") %>% 75 | group_by(cell_type, cluster_res_0.05) %>% 76 | summarize(n_cells = n()) %>% 77 | ungroup() %>% 78 | group_by(cluster_res_0.05) %>% 79 | mutate(n_cells_total = sum(n_cells), fct_cells = n_cells / n_cells_total) 80 | 81 | fct_cells_cell_type_gg <- fct_cells_cell_type %>% 82 | ggplot(aes(cluster_res_0.05, fct_cells, fill = cell_type)) + 83 | geom_col() + 84 | scale_fill_manual(values = cell_type_palette) + 85 | labs(x = "Cluster (Harmony)", y = "", fill = "") + 86 | theme_classic() 87 | 88 | 89 | # Scanorama 90 | 91 | 92 | # Save 93 | for (x in names(dfs)) { 94 | # Save umap colored by cell type 95 | path_save_umap1 <- str_c( 96 | "results/plots/TICAtlas_umap_colored_cell_type_", 97 | x, 98 | ".pdf", 99 | sep = "" 100 | ) 101 | ggsave( 102 | path_save_umap1, 103 | plot = umaps_cell_type[[x]], 104 | width = 9, 105 | height = 9, 106 | units = "cm" 107 | ) 108 | 109 | 110 | # Save umap colored by cell type 111 | path_save_umap2 <- str_c( 112 | "results/plots/TICAtlas_umap_colored_cancer_subtype_", 113 | x, 114 | ".pdf", 115 | sep = "" 116 | ) 117 | ggsave( 118 | path_save_umap2, 119 | plot = umaps_cancer_subtype[[x]], 120 | width = 9, 121 | height = 9, 122 | units = "cm" 123 | ) 124 | } 125 | 126 | 127 | -------------------------------------------------------------------------------- /Integration/10_filter_annotate.R: -------------------------------------------------------------------------------- 1 | # filter out non-immune clusters (clusters 15, 16, 20 and 34) and rename clusters (annotate them) 2 | 3 | library(Seurat) 4 | library(tidyverse) 5 | # library(future) 6 | # plan("multicore") 7 | # options(future.globals.maxSize = Inf) 8 | 9 | atlas <- readRDS("output/integrated_clustered.rds") 10 | 11 | # remove non-immune cells 12 | atlas <- subset(atlas, seurat_clusters %in% c("15", "16", "20", "34"), invert = TRUE) 13 | 14 | atlas$seurat_clusters <- as.factor(as.numeric(as.character(atlas$seurat_clusters))) 15 | Idents(atlas) <- "seurat_clusters" 16 | levels(atlas) 17 | 18 | new_ids <- c("CD4 memory stem cells", "CD8 exhausted", "CD8 cytotoxic", "CD4 memory stem cells", "CD4 resident effector memory", "T regs", "B cells", "TAMs C1QC", "NK", "CD4 effector memory RA", 19 | "Tumor infiltrating Monocytes", "CD4 activated", "Proliferative T cells", "CD4 transitional memory", "CD4 effector memory RA", "DC2 CD1C+", "CD8 IFN activated", "CD8 activated T cells", 20 | "Plasma B cells", "Macrophages SPP1", "Plasma B cells", "DC4 CD1C-", "T helper/Th17", "pDC", "DC3 LAMP3", "Macrophages proliferative", "Macrophages CXCL10", 21 | "Mast cells", "DC1", "Proliferative B cells", "Plasma blasts" 22 | ) 23 | 24 | names(new_ids) <- levels(atlas) 25 | 26 | atlas <- RenameIdents(atlas, new_ids) 27 | 28 | atlas$new_annot <- Idents(atlas) 29 | 30 | ggsave(plot = DimPlot(atlas, cols = as.vector(pals::polychrome())), filename = "atlas_plot.png", width = 10) 31 | 32 | saveRDS(atlas, "output/integrated_renamed_filtered.rds") -------------------------------------------------------------------------------- /Integration/1_process_gene_names_objects.R: -------------------------------------------------------------------------------- 1 | # run iteratively for each dataset that will be integrated 2 | # uses the "process_gene_names.R" function to correct gene names 3 | # and saves each dataset in a new file 4 | # takes as arguments each dataset path and its abbreviated name 5 | 6 | source("utils/process_gene_names.R") 7 | 8 | args = commandArgs(TRUE) 9 | data = args[1] 10 | name = args[2] 11 | 12 | print(name) 13 | 14 | data <- readRDS(data) 15 | 16 | # make sure we only have the RNA assay 17 | DefaultAssay(data) <- "RNA" 18 | if ( length(Assays(data)) > 1) {data[["SCT"]] <- NULL} 19 | 20 | print(data) 21 | 22 | processed <- process_gene_names(data) 23 | 24 | print(processed) 25 | 26 | saveRDS(processed, paste0("output/", name, "_filtered_genes.rds")) 27 | -------------------------------------------------------------------------------- /Integration/2_find_batch_specific_genes.R: -------------------------------------------------------------------------------- 1 | # to improve the integration, we limit the genes used to compute the anchors later on 2 | # we will remove the top genes that are specific for each batch 3 | # in our case the batch is in the "source" variable 4 | 5 | library(Seurat) 6 | library(tidyverse) 7 | 8 | source("code/functions/load_objects.R") 9 | 10 | obj_list <- load_objects(directory="output/", 11 | extension="_filtered_genes.rds") 12 | 13 | merged <- merge(obj_list[[1]], 14 | obj_list[2:length(obj_list)]) 15 | 16 | # to keep the consistency we are going to rename the cells 17 | 18 | merged <- RenameCells(merged, 19 | new.names =paste("cell", 1:ncol(merged), 20 | sep = "_")) 21 | 22 | merged <- merged %>% 23 | NormalizeData() %>% 24 | FindVariableFeatures(nfeatures = 5000) %>% 25 | ScaleData() 26 | 27 | # set the batches (source) as idents 28 | Idents(merged) <- "source" 29 | 30 | # get markers for each batch, limting cells per ident to reduce computing time 31 | markers <- FindAllMarkers(merged, 32 | verbose = TRUE, 33 | only.pos = TRUE, 34 | max.cells.per.ident = 5000) 35 | 36 | # filter by adjusted p_value 37 | markers <- filter(markers, 38 | p_val_adj < 0.01) 39 | 40 | saveRDS(markers, 41 | file = "output/markers/batch_specific_markers.rds") 42 | -------------------------------------------------------------------------------- /Integration/3_normalize_each_dataset.R: -------------------------------------------------------------------------------- 1 | # process each dataset, preparing them for integration 2 | # this scripts takes as arguments the data path for the dataset 3 | # and the abbreviated name 4 | # it normalizes and finds HVG of the dataset 5 | 6 | library(Seurat) 7 | library(tidyverse) 8 | 9 | args <- commandArgs(TRUE) 10 | data <- args[1] 11 | name <- args[2] 12 | 13 | print(name) 14 | 15 | # load dataset 16 | data <- readRDS(data) 17 | 18 | print(data) 19 | 20 | # normalize and identify variable features 21 | 22 | data <- data %>% 23 | NormalizeData() %>% 24 | FindVariableFeatures(nfeatures = 5000) 25 | 26 | print("done!") 27 | 28 | saveRDS(data, paste0("output/", name, "_normalized.rds")) 29 | -------------------------------------------------------------------------------- /Integration/4_find_integration_features.R: -------------------------------------------------------------------------------- 1 | # scale each dataset, compute its PCs and find the integration features 2 | 3 | library(Seurat) 4 | library(tidyverse) 5 | 6 | source("code/functions/load_objects.R") 7 | 8 | obj_list <- load_objects(directory="output/", 9 | extension="_normalized.rds") 10 | 11 | # select features that are repeatedly variable across datasets for integration 12 | # run PCA on each # dataset using these features 13 | features <- SelectIntegrationFeatures(obj_list, 14 | nfeatures = 5000) 15 | print(head(features)) 16 | 17 | # but remove dataset specific 18 | batch_markers <- readRDS("output/markers/batch_specific_markers.rds") 19 | batch_markers <-batch_markers %>% 20 | group_by(cluster) %>% 21 | top_n(n = 150, wt = avg_log2FC) 22 | 23 | table(batch_markers$gene %in% features) 24 | 25 | features <- setdiff(features, batch_markers$gene) 26 | print(length(features)) 27 | 28 | # save selected features 29 | saveRDS(features, "output/integration_features.rds") -------------------------------------------------------------------------------- /Integration/5_scale_each_dataset.R: -------------------------------------------------------------------------------- 1 | # scale each dataset and compute its PCs 2 | # using the selected integration features 3 | 4 | library(Seurat) 5 | library(tidyverse) 6 | 7 | args = commandArgs(TRUE) 8 | data = args[1] 9 | name = args[2] 10 | 11 | print(name) 12 | 13 | # load dataset 14 | 15 | data <- readRDS(data) 16 | 17 | # load selected integration features 18 | 19 | features <- readRDS("output/integration_features.rds") 20 | print(length(features)) 21 | 22 | 23 | # normalize and identify variable features 24 | 25 | data <- data %>% 26 | ScaleData(features = features) %>% 27 | RunPCA(features = features) 28 | 29 | print("done!") 30 | 31 | saveRDS(data, paste0("output/", name, "_scaled.rds")) -------------------------------------------------------------------------------- /Integration/6_find_integration_anchors.R: -------------------------------------------------------------------------------- 1 | # find the anchors to perform the integration 2 | 3 | library(Seurat) 4 | library(tidyverse) 5 | 6 | source("code/functions/load_objects.R") 7 | 8 | obj_list <- load_objects( 9 | directory = "output/", 10 | extension = "_scaled.rds" 11 | ) 12 | 13 | # load selected integration features 14 | 15 | features <- readRDS("output/integration_features.rds") 16 | 17 | anchors <- FindIntegrationAnchors(object.list = obj_list, 18 | normalization.method = "LogNormalize", 19 | anchor.features = features, 20 | reduction = "rpca") 21 | 22 | # save anchors 23 | saveRDS(anchors, "output/anchors.rds") 24 | -------------------------------------------------------------------------------- /Integration/7_integration.R: -------------------------------------------------------------------------------- 1 | # integrate the datasets 2 | 3 | library(Seurat) 4 | 5 | # load anchors 6 | anchors <- readRDS("output/anchors.rds") 7 | 8 | integrated <- IntegrateData(anchorset = anchors) 9 | 10 | rm(anchors) 11 | gc() 12 | 13 | # save integrated dataset 14 | saveRDS(integrated, "output/integrated.rds") -------------------------------------------------------------------------------- /Integration/8_process_integrated_obj.R: -------------------------------------------------------------------------------- 1 | # integrated analysis on all cells! 2 | 3 | library(Seurat) 4 | library(tidyverse) 5 | # library(future) 6 | # plan("multiprocess") 7 | # options(future.globals.maxSize = Inf) 8 | 9 | atlas <- readRDS("output/integrated.rds") 10 | 11 | DefaultAssay(atlas) <- "integrated" 12 | 13 | atlas 14 | 15 | # run the standard workflow on the integrated assay 16 | 17 | atlas <- atlas %>% 18 | ScaleData() %>% 19 | RunPCA(npcs = 30) %>% 20 | RunUMAP(reduction = "pca", dims = 1:30) %>% 21 | FindNeighbors(reduction = "pca", dims = 1:30) 22 | 23 | saveRDS(atlas, "output/integrated_processed.rds") -------------------------------------------------------------------------------- /Integration/9_cluster_atlas.R: -------------------------------------------------------------------------------- 1 | # cluster atlas with optimal resolution obtained from the random forest oob 2 | 3 | library(Seurat) 4 | library(tidyverse) 5 | # library(future) 6 | # plan("multiprocess") 7 | # options(future.globals.maxSize = Inf) 8 | 9 | atlas <- readRDS("output/integrated_processed.rds") 10 | 11 | atlas 12 | 13 | atlas <- FindClusters(atlas, 14 | resolution = 1.2) 15 | 16 | saveRDS(atlas, "output/integrated_clustered.rds") 17 | 18 | p1 <- DimPlot(atlas, 19 | reduction = "umap", 20 | group.by = "source", 21 | label = TRUE, 22 | repel = TRUE) 23 | 24 | p2 <- DimPlot(atlas, 25 | reduction = "umap", 26 | label = TRUE, 27 | repel = TRUE) 28 | 29 | plots <- p1 + p2 30 | 31 | ggsave(plot = plots, 32 | filename = "output/cluster_plots.png", 33 | dpi = 300, 34 | width = 11) 35 | 36 | p3 <- DimPlot(atlas, 37 | reduction = "umap", 38 | split.by = "source", 39 | ncol = 3) 40 | 41 | ggsave(plot = p3, 42 | filename = "output/cluster_plots_split.png", 43 | dpi = 300, 44 | height = 12, 45 | width = 12) 46 | 47 | table(atlas$seurat_clusters, atlas$source) -------------------------------------------------------------------------------- /Integration/README.md: -------------------------------------------------------------------------------- 1 | # Single-Cell Tumor Immune Atlas project: Dataset Integration 2 | 3 | In this folder you can find all the files related to the creation of the atlas: all the scripts leading to the integration of the datasets and the downstream processing of the atlas (clustering, annotation, etc.). 4 | 5 | The input to this pipeline (not included in this repository) are the datasets listed in Table 1 of the paper (https://www.biorxiv.org/content/10.1101/2020.10.26.354829v1) as seurat objects. 6 | 7 | ## Package versions 8 | Required packages for the code in this folder and versions used: 9 | * [Tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/vignettes/paper.html) 10 | * [Seurat 3.2.2](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867419305598%3Fshowall%3Dtrue) 11 | -------------------------------------------------------------------------------- /Patient stratification/1_create_proportions_dataset.R: -------------------------------------------------------------------------------- 1 | # this script creates the cell type proportion per 2 | # cluster dataset that we will use for the clustering 3 | 4 | # load libraries 5 | library(tidyverse) 6 | library(Seurat) 7 | 8 | # create proportion table per patient 9 | atlas <- readRDS("../output/integrated_renamed_filtered.rds") 10 | data <- table(atlas$patient, atlas$new_annot) 11 | data <- as.data.frame.matrix(data) 12 | data$total <- rowSums(data) 13 | 14 | # we filter out patients with less than 500 total 15 | # cells as this could introduce bias 16 | data <- filter(data, total > 500) 17 | 18 | for (patient in rownames(data)) { 19 | # iterate over the cell types 20 | for (cell in colnames(data)[1:(ncol(data)-1)]) { 21 | # calculate patient's cell type percentage 22 | data[patient, cell] <- round(data[patient, cell] / data[patient, "total"] * 100, 3) 23 | } 24 | } 25 | 26 | data <- data[, 1:(ncol(data)-1)] 27 | 28 | # get the metadata we are interested in 29 | meta <- atlas@meta.data[, c("source", "patient", "subtype")] 30 | 31 | # unify metadata to one row per patient 32 | meta <- unique(meta) 33 | rownames(meta) <- meta$patient 34 | 35 | data <- merge(meta, data, by = "row.names") 36 | data$Row.names <- NULL 37 | 38 | # remove datasets with no patient information 39 | # (they are labeled as having only one patient) 40 | data <- filter(data, !source %in% c("lung1", "lung2")) 41 | 42 | saveRDS(data, ".../output/atlas_proportion_dataset.rds") 43 | -------------------------------------------------------------------------------- /Patient stratification/2_cluster_patients.R: -------------------------------------------------------------------------------- 1 | # this script calculates the 6 patient clusters 2 | # using the hierarchical k-means algorithm 3 | 4 | ## load packages 5 | library(tidyverse) 6 | library(magrittr) 7 | library(ggsci) 8 | library(pals) 9 | library(RColorBrewer) 10 | library(patchwork) 11 | library(clValid) 12 | library(factoextra) 13 | library(NbClust) 14 | library(GGally) 15 | library(dendextend) 16 | library(Rtsne) 17 | library(ComplexHeatmap) 18 | library(circlize) 19 | library(kohonen) 20 | library(mclust) 21 | 22 | df <- readRDS("output/atlas_proportion_dataset.rds") 23 | tica_pal <- readRDS("output/tica_palettes.rds") 24 | 25 | df$subtype[df$source=="liver2"] <- "HCC" 26 | 27 | df <- mutate(df, 28 | `B cells` = `Activated B cells` + `Memory B cells` + `Naive B cells` + `Unswitched Memory B cells`, 29 | ) 30 | 31 | df[, c("Activated B cells", "Memory B cells", "Naive B cells", "Unswitched Memory B cells")] <- NULL 32 | 33 | # optimal number of clusters (from 1 to 20) 34 | clust_num <- NbClust( 35 | data = df[, 4:ncol(df)], distance = "euclidean", min.nc = 2, 36 | max.nc = 20, method = "kmeans", index = "alllong" 37 | ) 38 | fviz_nbclust(clust_num) 39 | 40 | # HIERACHICAL K-MEANS 41 | hkmeans_cluster <- hkmeans(x = df[, 4:(ncol(df))], hc.metric = "euclidean", hc.method = "ward.D2", k = 6) 42 | fviz_cluster(object = hkmeans_cluster, pallete = "jco", repel = TRUE) 43 | df$cluster_kmeans_k6 <- hkmeans_cluster$cluster 44 | 45 | # visualize as heatmap 46 | df <- df[order(df$cluster_kmeans_k6), ] 47 | mat <- as.matrix(df[, 4:(ncol(df) - 1)]) 48 | rownames(mat) <- NULL 49 | 50 | row_ha <- rowAnnotation( 51 | cluster = df$cluster_kmeans_k6, 52 | col = list(cluster = c("1" = "#1F77B4FF", "2" = "#FF7F0EFF", "3" = "#2CA02CFF", "4" = "#D62728FF", "5" = "#9467BDFF", "6" = "#8C564BFF")) 53 | ) 54 | Heatmap(mat, 55 | name = "composition (%)", 56 | cluster_rows = F, cluster_columns = FALSE, 57 | left_annotation = row_ha, column_names_max_height = unit(10, "cm"), 58 | col = colorRampPalette(brewer.pal(8, "Blues"))(50) 59 | ) 60 | 61 | # visualize cancer types per cluster 62 | ggplot(df, aes(factor(cluster_kmeans_k6))) + 63 | geom_bar(aes(fill = factor(subtype)), position = "fill") + 64 | coord_flip() + 65 | theme_classic() + 66 | labs(x = "Cluster", fill = "Cancer type", y = "Proportion of patients") + 67 | scale_fill_manual(values = tica_pal$cancer) + 68 | theme(text = element_text(size = 30))#, legend.position = "none") 69 | 70 | ggplot(df, aes(factor(cluster_kmeans_k6))) + 71 | geom_bar(aes(fill = factor(subtype))) + 72 | coord_flip() + 73 | theme_classic() + 74 | labs(x = "Cluster", fill = "Cancer type", y = "Number of patients") + 75 | scale_fill_manual(values = tica_pal$cancer) + 76 | theme(text = element_text(size = 30)) 77 | 78 | # calculate PCA 79 | res_pca <- prcomp(df[, 4:(ncol(df) - 1)], scale = F) 80 | fviz_eig(res_pca) 81 | res_ind <- get_pca_ind(res_pca) 82 | 83 | # Contributions of variables to PC1 and PC2 84 | p1 <- fviz_contrib(res_pca, choice = "var", axes = 1, top = 10, xtickslab.rt = 0) + 85 | theme(text = element_text(size = 35), title = element_text(size = 25), axis.text.x = element_text(vjust = 0.5, hjust = 1, angle = 90)) + 86 | labs(title = "PC1") 87 | p2 <- fviz_contrib(res_pca, choice = "var", axes = 2, top = 10, xtickslab.rt = 0) + 88 | theme(text = element_text(size = 35), title = element_text(size = 25), axis.text.x = element_text(vjust = 0.5, hjust = 1, angle = 90)) + 89 | labs(title = "PC2") 90 | p1 + p2 91 | 92 | # PC contributions table 93 | var <- get_pca_var(res_pca) 94 | ft <- as.data.frame(var$contrib[, 1:2]) 95 | ft$Variable <- rownames(ft) 96 | colnames(ft) <- c("PC1 (%)", "PC2 (%)", "Cell type") 97 | ft <- ft[, c("Cell type", "PC1 (%)", "PC2 (%)")] 98 | ft <- arrange(ft, desc(`PC1 (%)`)) 99 | ft %>% 100 | flextable::flextable() %>% 101 | flextable::theme_vanilla() %>% 102 | flextable::autofit() 103 | 104 | # visualization: TSNE PLOT 105 | tsne <- Rtsne(df[, 4:(ncol(df) - 1)], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500) 106 | 107 | df$tsne_x <- tsne$Y[, 1] 108 | df$tsne_y <- tsne$Y[, 2] 109 | 110 | p1 <- ggplot(df, aes(tsne_x, tsne_y, color = as.factor(cluster_kmeans_k6))) + 111 | geom_point(size = 6) + 112 | labs(color = "cluster", x = "tSNE_1", y = "tSNE_2") + 113 | scale_color_d3() + 114 | theme_classic() + 115 | theme( 116 | text = element_text(size = 30), 117 | legend.position = "none" 118 | ) + 119 | theme( 120 | axis.line = element_blank(), 121 | axis.text.x = element_blank(), 122 | axis.text.y = element_blank(), 123 | axis.ticks = element_blank(), 124 | axis.title.x = element_blank(), 125 | axis.title.y = element_blank(), 126 | legend.position = "none", 127 | panel.background = element_blank(), 128 | panel.border = element_blank(), 129 | panel.grid.major = element_blank(), 130 | panel.grid.minor = element_blank(), 131 | plot.background = element_blank(), 132 | plot.title = element_text(hjust = 0.5) 133 | ) + 134 | ggtitle("Cluster") 135 | 136 | p2 <- ggplot(df, aes(tsne_x, tsne_y, color = as.factor(subtype))) + 137 | geom_point(size = 6) + 138 | labs(color = "cluster", x = "tSNE_1", y = "tSNE_2") + 139 | theme_classic() + 140 | theme( 141 | text = element_text(size = 30), 142 | legend.position = "none" 143 | ) + 144 | theme( 145 | axis.line = element_blank(), 146 | axis.text.x = element_blank(), 147 | axis.text.y = element_blank(), 148 | axis.ticks = element_blank(), 149 | axis.title.x = element_blank(), 150 | axis.title.y = element_blank(), 151 | legend.position = "right", 152 | panel.background = element_blank(), 153 | panel.border = element_blank(), 154 | panel.grid.major = element_blank(), 155 | panel.grid.minor = element_blank(), 156 | plot.background = element_blank(), 157 | plot.title = element_text(hjust = 0.5) 158 | ) + 159 | ggtitle("Cancer type") + 160 | scale_color_manual(values = tica_pal$cancer) 161 | 162 | p1 + p2 163 | 164 | saveRDS(df, "output/atlas_proportion_dataset_clustered.rds") 165 | -------------------------------------------------------------------------------- /Patient stratification/3_random_forest.R: -------------------------------------------------------------------------------- 1 | # Atlas clustering dataset 2 | # create random forest classificator 3 | 4 | # load libraries 5 | library(caTools) 6 | library(rpart) 7 | library(rpart.plot) 8 | library(randomForest) 9 | library(rattle) 10 | library(caret) 11 | library(patchwork) 12 | library(tidyverse) 13 | library(ComplexHeatmap) 14 | library(circlize) 15 | library(scales) 16 | library(RColorBrewer) 17 | library(pals) 18 | library(ggsci) 19 | 20 | atlas <- readRDS("atlas_proportion_dataset_clustered.rds") 21 | atlas$cluster <- as.factor(atlas$cluster_kmeans_k6) 22 | atlas$cluster_kmeans_k6 <- NULL 23 | atlas$tsne_y <- NULL 24 | atlas$tsne_x <- NULL 25 | colnames(atlas) <- make.names(colnames(atlas)) 26 | 27 | # first we split the dataset into training and test 28 | split <- sample.split(atlas$cluster, SplitRatio = 0.75) 29 | training_set <- subset(atlas[, c(11:ncol(atlas))], split == TRUE) 30 | test_set <- subset(atlas[, c(11:ncol(atlas))], split == FALSE) 31 | 32 | # then train the RF model 33 | rf <- randomForest(cluster ~ ., ntree = 1000, data = as.data.frame.data.frame(training_set), importance = TRUE, proximity = TRUE) 34 | plot(rf) 35 | 36 | # fit model on test subset 37 | predicted_cluster <- predict(rf, test_set[1:25]) 38 | # evaluate with the confusion matrix 39 | confusionMatrix( 40 | data = predicted_cluster, 41 | reference = test_set$cluster 42 | ) 43 | 44 | # Variable Importance 45 | varImpPlot(rf, type = 1, sort = TRUE, n.var = 15, main = "Top 15 - Variable Importance") 46 | varImpPlot(rf, type = 2, sort = TRUE, n.var = 15, main = "Top 15 - Variable Importance") 47 | 48 | df <- as.data.frame(rf$importance[, c("MeanDecreaseAccuracy", "MeanDecreaseGini")]) 49 | df$cell_type <- rownames(df) 50 | df$cell_type <- gsub("\\.", " ", df$cell_type) 51 | colnames(df) <- c("Mean Decrease Accuracy", "Mean Decrease Gini", "cell_type") 52 | df <- pivot_longer(df, cols = c("Mean Decrease Accuracy", "Mean Decrease Gini"), names_to = "measure") 53 | df <- df[order(-df$value), ] 54 | df <- df[df$cell_type %in% df$cell_type[1:15], ] 55 | df$cell_type <- as.factor(df$cell_type) 56 | df$cell_type <- factor(df$cell_type, levels = unique(df$cell_type[order(df$value)])) 57 | 58 | ggplot(df, aes(x = cell_type, y = value)) + 59 | geom_point(col = "tomato2", size = 5) + # Draw points 60 | coord_flip() + 61 | facet_grid(. ~ measure, scales = "free", labeller = label_wrap_gen(width = 10)) + 62 | theme_minimal() + 63 | labs(x = "", y = "") + 64 | theme(text = element_text(size = 25)) + 65 | theme(panel.background = element_rect(fill = NA, color = "black")) 66 | -------------------------------------------------------------------------------- /Patient stratification/README.md: -------------------------------------------------------------------------------- 1 | # Single-Cell Tumor Immune Atlas project: Patient Stratification 2 | 3 | These are the scripts necessary for the creation of the patients cell-proportion dataset, for the hierarchical k-means clustering of the patients and to evaluate the clustering with a random forest model trained on the clustering output. 4 | 5 | ## Package versions 6 | Required packages for the code in this folder and versions used: 7 | 8 | * [Tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/vignettes/paper.html) 9 | * [Seurat 3.2.2](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867419305598%3Fshowall%3Dtrue) 10 | * [MClust 5.4.6](https://doi.org/10.32614/RJ-2016-021) 11 | * [dendextend 1.14](10.1093/bioinformatics/btv428) 12 | * [magrittr 1.5](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) 13 | * [ggsci 2.9](https://cran.r-project.org/web/packages/ggsci/vignettes/ggsci.html) 14 | * [pals 1.6](https://kwstat.github.io/pals/) 15 | * [RColorBrewer 1.1](https://cran.r-project.org/web/packages/RColorBrewer/index.html) 16 | * [patchwork 1.1.0](https://github.com/thomasp85/patchwork) 17 | * [clValid 0.5](http://dx.doi.org/10.18637/jss.v025.i04) 18 | * [factoextra 1.0.7](https://cran.r-project.org/package=factoextra) 19 | * [NbClust 3.0](http://dx.doi.org/10.18637/jss.v061.i06) 20 | * [GGally 2.0.0](https://ggobi.github.io/ggally/) 21 | * [Rtsne 0.15](https://github.com/jkrijthe/Rtsne) 22 | * [ComplexHeatmap 2.4.3](http://bioconductor.org/packages/release/bioc/html/ComplexHeatmap.html) 23 | * [circlize 0.4.10](https://doi.org/10.1093/bioinformatics/btu393) 24 | * [kohonen 3.0.10](https://cran.r-project.org/web/packages/kohonen/A) 25 | * [caTools 1.18](https://cran.r-project.org/web/packages/caTools/index.html) 26 | * [rpart 4.1](https://cran.r-project.org/web/packages/rpart/) 27 | * [rpart.plot 3.0.9](https://cran.r-project.org/web/packages/rpart.plot/) 28 | * [randomForest 4.6](https://cran.r-project.org/web/packages/randomForest/) 29 | * [rattle 5.4.0](https://cran.r-project.org/web/packages/rattle/) 30 | * [caret 6.0](http://topepo.github.io/caret/index.html) 31 | * [scales 1.1.1](https://cran.r-project.org/web/packages/scales/index.html) 32 | -------------------------------------------------------------------------------- /Projections/README.md: -------------------------------------------------------------------------------- 1 | # Projection of different datasets on the atlas 2 | 3 | To demonstrate the power of our atlas, we predicted the cell types on datasets from different cancer types and varying experimental designs. We make use of FindTransferAnchors utility provided by Seurat [(Stuart et al, 2019)](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for transfering cell types on TICA to the query objects. 4 | 5 | This script showcases an example for projecting external datsets (query) onto the atlas. 6 | 7 | ## Dependencies 8 | 9 | * [R 3.6.0](https://cran.r-project.org/) 10 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html) 11 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html) 12 | * [magritts 1.5.0](https://cloud.r-project.org/package=magrittr) 13 | * [pals 1.6.0](https://kwstat.github.io/pals/) 14 | * [flextable 0.5.10](https://davidgohel.github.io/flextable/) 15 | * [ComplexHeatmap 2.4.3](https://github.com/jokergoo/ComplexHeatmap) 16 | * [patchwork 1.0.0](https://patchwork.data-imaginist.com/) 17 | * [matchSCore2 0.1.0](https://github.com/elimereu/matchSCore2) 18 | * [RColorBrewer 1.1.2](https://cloud.r-project.org/package=RColorBrewer) 19 | 20 | 21 | ## Data 22 | 23 | * Atlas Seurat object: download it as specified in the publication ([Zenodo](https://zenodo.org/record/4036020#.X5hFT4hKiHt)) 24 | 25 | * Query samples: two human uveal melanoma cancers, one human ovarian cancer, one human uveal melanoma liver metastasis, one human non-small cell lung cancer brain metastasis (including TCR) and two mice colorectal cancers (one full and one only T cells and TCR) 26 | 27 | [Available for download on GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158803). -------------------------------------------------------------------------------- /Projections/atlas_projection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Projecting data on the Single-Cell Tumor Immune Atlas" 3 | date: "`r format(Sys.time(), '%b %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ## Projecting data on the Single-Cell Tumor Immune Atlas 10 | 11 | This is an example to project external datasets (query) onto the atlas. The following code generates the projection of cell types from the atlas on a brain metastasis sample from a patient with melanoma. For this sample we have available both scRNA-seq and TCR sequencing to check the clonality associated on the cell types. 12 | 13 | WARNING: due to huge size this script should be run on a computational cluster 14 | 15 | 16 | ### Load all necesary packages 17 | 18 | ```{r packages, echo=FALSE, message=FALSE, warning=FALSE} 19 | library(tidyverse) 20 | library(Seurat) 21 | library(magrittr) 22 | library(pals) 23 | library(flextable) 24 | library(ComplexHeatmap) 25 | library(patchwork) 26 | library(matchSCore2) 27 | library(RColorBrewer) 28 | ``` 29 | 30 | ### Load a predefined color palette for each cell type 31 | 32 | ```{r palette, echo=FALSE} 33 | # this is the color palette, with names 34 | values <- readRDS(file="~/cell_type_palette.rds") 35 | ``` 36 | 37 | ### Load the atlas. Calculate 5000 VariableFeatures 38 | 39 | ```{r load atlas} 40 | atlas <- readRDS("downsampled_atlas.rds") # load downsampled atlas (1000 cells per cell type) 41 | atlas <- subset(atlas, new_cell_types != "Mast cells") 42 | Idents(atlas) <- "new_cell_types" 43 | atlas <- FindVariableFeatures(atlas, assay = "RNA", nfeatures = 5000) # 5000 Var.Feat for the RNA assay 44 | ``` 45 | 46 | ### params$file contains the path to the Seurat object preprocessed with the TCR info on it. Basic preprocessing, normalization, scaling and dimensionality reduction was applied following Seurat's vignette (https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html). 47 | 48 | ```{r load query, echo=FALSE} 49 | query <- readRDS(params$file) # load query dataset (BrainMet P12) 50 | Idents(query) <- query$clusters 51 | levels(query$clusters) <- c(1:length(unique(query$clusters))) # rename clusters from 1 to n 52 | ``` 53 | 54 | ### We use FindTransferAnchors utility provided by Seurat (Stuart et al, 2019) for transfering the cell types from the atlas to the query object 55 | 56 | ```{r find anchors, echo=FALSE} 57 | anchors <- FindTransferAnchors(reference = atlas, # find anchors between query and atlas 58 | query = query, 59 | dims = 1:30, 60 | normalization.method = "LogNormalize", 61 | reference.assay = "RNA", 62 | query.assay = "RNA") 63 | predictions <- TransferData(anchorset = anchors, refdata = atlas$new_cell_types, dims = 1:30) # transfer new_cell_types annot 64 | query <- AddMetaData(query, metadata = predictions) # add to object metdata 65 | ``` 66 | 67 | ### Show the results of the projection as a table 68 | 69 | ```{r table, results = "markup", echo=FALSE} 70 | tab <- as.data.frame.matrix(table(query$predicted.id, Idents(query))) # create table 71 | tab$annotated_cell_type <- rownames(tab) 72 | 73 | flextable(tab) %>% 74 | set_header_labels(annotated_cell_type = " ") %>% 75 | autofit() %>% 76 | theme_vanilla() %>% 77 | set_caption(paste0(toString(dim(query)[2]), " cells total")) %>% # add number of total cells at the top 78 | align(align = "center", part = "all") 79 | ``` 80 | 81 | ### Plot the predictions and their associated clonal size 82 | 83 | ```{r plots, results = "markup", fig.height=10, fig.width=12, echo=FALSE} 84 | # plot umap colored by original cell type, predicted cell type and clusters 85 | query$cell_type <- Idents(query) 86 | p1 <- DimPlot(query, group.by = "cell_type", pt.size = 1) + 87 | NoAxes() 88 | p1 + ggtitle("Predicted celltype") 89 | 90 | query$umap1 <- query@reductions$umap@cell.embeddings[,1] 91 | query$umap2 <- query@reductions$umap@cell.embeddings[,2] 92 | aux_df <- as.data.frame(cbind(query$umap1,query$umap2,Idents(query),query$Clono_size)) 93 | colnames(aux_df) <- c("umap1","umap2","clusters","Clono_size") 94 | aux_df$umap1 <- as.numeric(as.character(aux_df$umap1)) 95 | aux_df$umap2 <- as.numeric(as.character(aux_df$umap2)) 96 | aux_df$clusters <- as.character(aux_df$clusters) 97 | 98 | p2 <- ggplot(aux_df, aes(x=umap1, y=umap2)) + geom_point(aes(color=Clono_size),size = 1.4) + ggtitle("Clono_size") + 99 | scale_colour_gradient(low = "grey", high = "darkgreen") + 100 | theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) + 101 | NoAxes() 102 | p2 103 | ``` 104 | 105 | -------------------------------------------------------------------------------- /Projections/cancer_subtype_palette.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Projections/cancer_subtype_palette.rds -------------------------------------------------------------------------------- /Projections/cell_type_palette.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Projections/cell_type_palette.rds -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single-Cell Tumor Immune Atlas project 2 | Code repository for the Single-Cell Tumor Immune Atlas project. 3 | 4 | Our paper is out now on [Genome Research](https://singlecellgenomics-cnag-crg.shinyapps.io/TICA/). 5 | 6 | ![](misc/atlas.png) 7 | 8 | We present a single cell immune atlas of the tumor microenvironment composed by over 500k cells, 217 patients,13 cancer types, for: 9 | 10 | 11 | * Patient stratification 12 | * Annotation 13 | * Spatial mapping 14 | 15 | You can inspect our dataset and query your own easily, by using our ShinyApp [here](https://singlecellgenomics-cnag-crg.shinyapps.io/TICA/). 16 | 17 | 18 | ## Abstract 19 | 20 | The tumor immune microenvironment is a main contributor to cancer progression and a promising therapeutic target for oncology. However, immune microenvironments vary profoundly between patients and biomarkers for prognosis and treatment response lack precision. A comprehensive compendium of tumor immune cells is required to pinpoint predictive cellular states and their spatial localization. We generated a single-cell tumor immune atlas, jointly analyzing >500,000 cells from 217 patients and 13 cancer types, providing the basis for a patient stratification based on immune cell compositions. Projecting immune cells from external tumors onto the atlas facilitated an automated cell annotation system for a harmonized interpretation. To enable in situ mapping of immune populations for digital pathology, we developed, benchmarked and applied SPOTlight, a computational tool that identified striking spatial immune cell patterns in tumor sections. We expect the atlas, together with our versatile toolbox for precision oncology, to advance currently applied stratification approaches for prognosis and immuno-therapy. 21 | 22 | 23 | ## Code implementation 24 | 25 | Folders and content: 26 | 27 | * `Integration`: scripts necessary to create the atlas 28 | * `Patient stratification`: clustering of the patients and assessment by random forest 29 | * `Projections`: projecting external data onto the atlas 30 | * `Random Forest`: assessment of the clustering robustness 31 | * `ST-breast`: analysis of the spatial transcriptomics breast data 32 | * `ST-oropharyngeal`: analysis of the spatial transcriptomics oropharyngeal data 33 | 34 | See the `README.md` files in each directory for a full description. 35 | 36 | ## Getting the data 37 | The Single-Cell Tumor Immune Atlas dataset is open access on [Zenodo](https://zenodo.org/record/4263972). 38 | 39 | ## Getting the code 40 | 41 | You can download a copy of all the files in this repository by cloning the 42 | [git](https://git-scm.com/) repository: 43 | 44 | git clone https://github.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas.git 45 | 46 | or [download a zip archive](https://github.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/archive/main.zip). 47 | 48 | ## Dependencies 49 | 50 | The dependencies for each part of the analysis are listed in each of the folders. 51 | -------------------------------------------------------------------------------- /Random Forest/1-calculate_cell_type_signatures.R: -------------------------------------------------------------------------------- 1 | # This script computes 25 cell type-specific signatures for each cell, based on the 2 | # markers provided in the Supplementary Table 2. It also calculates signatures 3 | # defined with random genes that will help us assess the significance of the classifier. 4 | # Finally, it saves the dataframe that will be the input for the random forest. 5 | 6 | 7 | # Load packages 8 | library(Seurat) 9 | library(readxl) 10 | library(tidyverse) 11 | 12 | 13 | # Load data 14 | tica <- readRDS("path_to_TICA_seurat_object") 15 | path_to_excel <- "data/supp_table_2.xlsx" 16 | sheets <- excel_sheets(path_to_excel) 17 | signatures_dfs <- purrr::map(sheets, ~ read_excel(path_to_excel, sheet = .x)) 18 | names(signatures_dfs) <- str_replace_all(sheets, " ", "_") 19 | signatures_tica <- purrr::map(signatures_dfs, "gene") 20 | 21 | 22 | # Create dataframe to run the random forest (signatures) 23 | DefaultAssay(tica) <- "integrated" 24 | tica@assays$RNA <- NULL 25 | set.seed(123) 26 | for (signature in names(signatures_tica)) { 27 | print(signature) 28 | start_time <- Sys.time() 29 | features <- signatures_tica[[signature]] 30 | features <- features[features %in% rownames(tica)] 31 | tica <- AddModuleScore( 32 | tica, 33 | features = list(features), 34 | name = str_c(signature, "signature", sep = "_") 35 | ) 36 | random_features <- sample( 37 | rownames(tica), 38 | size = length(features), 39 | replace = FALSE 40 | ) 41 | tica <- AddModuleScore( 42 | tica, 43 | features = list(random_features), 44 | name = str_c(signature, "random", sep = "_") 45 | ) 46 | end_time <- Sys.time() 47 | print(end_time - start_time) 48 | print(head(tica@meta.data)) 49 | } 50 | keep_cols_sign <- c( 51 | str_subset(colnames(tica@meta.data), "signature"), 52 | str_subset(colnames(tica@meta.data), "random"), 53 | "new_cell_types" 54 | ) 55 | tica_df_signatures <- tica@meta.data[, keep_cols_sign] 56 | tica_df_signatures$barcode <- colnames(tica) 57 | 58 | 59 | # Save 60 | dir.create("tmp", showWarnings = FALSE) 61 | saveRDS(tica_df_signatures, "tmp/tica_input_dataframe_signatures_random_forest.rds") 62 | -------------------------------------------------------------------------------- /Random Forest/2-create_test_sets_for_cross_validation.R: -------------------------------------------------------------------------------- 1 | # This script creates the test sets for the 5-fold cross-validation. 2 | # We ensure that each cell type is represented in each test set. Thus, we take 3 | # 20% of cells for each cell type in each fold. Each cell in the dataset is found 4 | # in 1 test set and 4 training sets. 5 | 6 | 7 | # Load packages 8 | library(tidyverse) 9 | 10 | 11 | # Load data 12 | tica_df_signatures <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds") 13 | 14 | 15 | # Create test sets in each fold 16 | cells <- tica_df_signatures$barcode 17 | cell_types <- levels(tica_df_signatures$new_cell_types) 18 | test_cells_list <- list( 19 | fold1 = c(), 20 | fold2 = c(), 21 | fold3 = c(), 22 | fold4 = c(), 23 | fold5 = c() 24 | ) 25 | for (cell_type in cell_types) { 26 | print(cell_type) 27 | cells <- tica_df_signatures[tica_df_signatures$new_cell_types == cell_type, "barcode"] 28 | steps_cross_valid <- round( 29 | seq( 30 | from = 1, 31 | to = length(cells), 32 | by = (length(cells) / 5) 33 | ), 34 | 0 35 | ) 36 | test_cells_list[["fold1"]] <- c( 37 | test_cells_list[["fold1"]], 38 | cells[steps_cross_valid[1]:steps_cross_valid[2]] 39 | ) 40 | test_cells_list[["fold2"]] <- c( 41 | test_cells_list[["fold2"]], 42 | cells[(steps_cross_valid[2] + 1):steps_cross_valid[3]] 43 | ) 44 | test_cells_list[["fold3"]] <- c( 45 | test_cells_list[["fold3"]], 46 | cells[(steps_cross_valid[3] + 1):steps_cross_valid[4]] 47 | ) 48 | test_cells_list[["fold4"]] <- c( 49 | test_cells_list[["fold4"]], 50 | cells[(steps_cross_valid[4] + 1):steps_cross_valid[5]] 51 | ) 52 | test_cells_list[["fold5"]] <- c( 53 | test_cells_list[["fold5"]], 54 | cells[(steps_cross_valid[5] + 1):length(cells)] 55 | ) 56 | } 57 | saveRDS(test_cells_list, "tmp/test_set_per_fold.rds") 58 | 59 | -------------------------------------------------------------------------------- /Random Forest/3-run_random_forest.R: -------------------------------------------------------------------------------- 1 | # This script trains and tests a random forest classifier on a specific fold 2 | # of the cross validation specified in previous scripts. It saves both the trained model and 3 | # the predicted labels. 4 | start_time <- Sys.time() 5 | 6 | 7 | # Load packages 8 | library(Seurat) 9 | library(tidyverse) 10 | library(randomForest) 11 | library(doParallel) 12 | library(foreach) 13 | library(readxl) 14 | 15 | 16 | # Load data 17 | tica_df <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds") 18 | test_cells_list <- readRDS("tmp/test_set_per_fold.rds") 19 | print("Data loaded successfully!") 20 | 21 | 22 | # Separate datframe in real and random signatures 23 | signatures_cols <- c( 24 | str_subset(colnames(tica_df), "signature"), 25 | "new_cell_types" 26 | ) 27 | random_cols <- c( 28 | str_subset(colnames(tica_df), "random"), 29 | "new_cell_types" 30 | ) 31 | tica_df_signatures <- tica_df[, signatures_cols] 32 | tica_df_random <- tica_df[, random_cols] 33 | print("Separated real and random signatures") 34 | 35 | 36 | # Process command-line arguments and subset to correct fold 37 | args <- commandArgs(trailingOnly = TRUE) 38 | fold <- args[[1]] 39 | test_cells <- test_cells_list[[fold]] 40 | 41 | 42 | # Divide cells in training and test sets 43 | test_df_signatures <- tica_df_signatures[test_cells, ] 44 | test_df_random <- tica_df_random[test_cells, ] 45 | training_cells <- rownames(tica_df)[!(rownames(tica_df) %in% test_cells)] 46 | training_df_signatures <- tica_df_signatures[training_cells, ] 47 | training_df_random <- tica_df_random[training_cells, ] 48 | print("Divided training and test sets") 49 | 50 | 51 | # Train random forest 52 | # ## Set parallelization backend 53 | total_trees <- 600 54 | parallel_trees <- 100 55 | ncpu <- round(total_trees / parallel_trees, 0) 56 | doParallel::registerDoParallel(ncpu) 57 | 58 | 59 | ######################################################################## 60 | ############################## SIGNATURES ############################## 61 | ######################################################################## 62 | 63 | # Run n (ncpu) forest of ntrees (parallel_trees) each: signatures 64 | rf_mod_signatures <- foreach::foreach(ntree = rep(parallel_trees, ncpu), 65 | # Indicating we want to use combine from the randomForest to join all the parallelized trees 66 | .combine = randomForest::combine, 67 | # Indicate we need the randomForest within the ForeEach loop + dopar indicating parallelization 68 | .packages="randomForest") %dopar% { 69 | ## train RF model 70 | model <- randomForest::randomForest( 71 | new_cell_types ~ ., 72 | data = training_df_signatures, 73 | type = "classification", 74 | # Keep importance in the RF object 75 | importance = FALSE, 76 | # How many trees to build 77 | ntree = ntree, 78 | # Return update every 10 trees 79 | verbose = 10 80 | ) 81 | return(model) 82 | } 83 | print("Finshed random forest signatures!") 84 | 85 | 86 | # Test random forest 87 | predicted_probs_signatures <- predict( 88 | object = rf_mod_signatures, 89 | newdata = test_df_signatures, 90 | type = "prob" 91 | ) 92 | predicted_class_signatures <- predict( 93 | object = rf_mod_signatures, 94 | newdata = test_df_signatures, 95 | type = "class" 96 | ) 97 | 98 | 99 | # Save model and predictions 100 | saveRDS(rf_mod_signatures, str_c("tmp/random_forest_model_signatures", "_", fold, ".rds", sep = "")) 101 | saveRDS(predicted_probs_signatures, str_c("tmp/random_forest_predicted_probs_signatures", "_", fold, ".rds")) 102 | saveRDS(predicted_class_signatures, str_c("tmp/random_forest_predicted_class_signatures", "_", fold, ".rds")) 103 | print("Saved results signatures!") 104 | 105 | 106 | ######################################################################## 107 | ############################## RANDOM ################################## 108 | ######################################################################## 109 | 110 | # Run n (ncpu) forest of ntrees (parallel_trees) each: signatures 111 | rf_mod_random <- foreach::foreach(ntree = rep(parallel_trees, ncpu), 112 | # Indicating we want to use combine from the randomForest to join all the parallelized trees 113 | .combine = randomForest::combine, 114 | # Indicate we need the randomForest within the ForeEach loop + dopar indicating parallelization 115 | .packages="randomForest") %dopar% { 116 | ## train RF model 117 | model <- randomForest::randomForest( 118 | new_cell_types ~ ., 119 | data = training_df_random, 120 | type = "classification", 121 | # Keep importance in the RF object 122 | importance = FALSE, 123 | # How many trees to build 124 | ntree = ntree, 125 | # Return update every 10 trees 126 | verbose = 10 127 | ) 128 | return(model) 129 | } 130 | print("Finshed random forest with random signatures!") 131 | 132 | 133 | # Test random forest 134 | predicted_probs_random <- predict( 135 | object = rf_mod_random, 136 | newdata = test_df_random, 137 | type = "prob" 138 | ) 139 | predicted_class_random <- predict( 140 | object = rf_mod_random, 141 | newdata = test_df_random, 142 | type = "class" 143 | ) 144 | 145 | 146 | # Save model and predictions 147 | saveRDS(rf_mod_random, str_c("tmp/random_forest_model_random", "_", fold, ".rds", sep = "")) 148 | saveRDS(predicted_probs_random, str_c("tmp/random_forest_predicted_probs_random", "_", fold, ".rds")) 149 | saveRDS(predicted_class_random, str_c("tmp/random_forest_predicted_class_random", "_", fold, ".rds")) 150 | 151 | print("Saved results random signatures!") 152 | 153 | end_time <- Sys.time() 154 | total_time <- end_time - start_time 155 | print(str_c("Job took ", total_time, " to compute", sep = "")) 156 | -------------------------------------------------------------------------------- /Random Forest/4-evaluate_performance.R: -------------------------------------------------------------------------------- 1 | # This script evaluates the performance of the random forest classifiers for all folds 2 | 3 | 4 | # Load packages 5 | library(caret) 6 | library(e1071) 7 | library(tidyverse) 8 | 9 | 10 | # Load data 11 | test_sets_cross_valid <- readRDS("tmp/test_set_per_fold.rds") 12 | tica_df <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds") 13 | folds <- str_c("fold", 1:5, sep = "") 14 | pred_class_sign_l <- purrr::map(folds, function(fold) { 15 | path_to_preds1 <- "tmp/random_forest_predicted_class_signatures_" 16 | path_to_preds <- str_c(path_to_preds1, fold, ".rds", sep = "") 17 | pred_class <- readRDS(path_to_preds) 18 | pred_class 19 | }) 20 | names(pred_class_sign_l) <- folds 21 | pred_class_rand_l <- purrr::map(folds, function(fold) { 22 | path_to_preds1 <- "tmp/random_forest_predicted_class_random_" 23 | path_to_preds <- str_c(path_to_preds1, fold, ".rds", sep = "") 24 | pred_class <- readRDS(path_to_preds) 25 | pred_class 26 | }) 27 | names(pred_class_rand_l) <- folds 28 | 29 | pred_probs_sign_l <- purrr::map(folds, function(fold) { 30 | path_to_probs1 <- "tmp/random_forest_predicted_probs_signatures_" 31 | path_to_probs <- str_c(path_to_probs1, fold, ".rds", sep = "") 32 | pred_probs <- readRDS(path_to_probs) 33 | pred_probs 34 | }) 35 | names(pred_probs_sign_l) <- folds 36 | pred_probs_rand_l <- purrr::map(folds, function(fold) { 37 | path_to_probs1 <- "tmp/random_forest_predicted_probs_random_" 38 | path_to_probs <- str_c(path_to_probs1, fold, ".rds", sep = "") 39 | pred_probs <- readRDS(path_to_probs) 40 | pred_probs 41 | }) 42 | names(pred_probs_rand_l) <- folds 43 | 44 | 45 | # Measure accuracy for each fold 46 | accuracy_dfs <- purrr::map(folds, function(fold) { 47 | test_df <- tica_df[test_sets_cross_valid[[fold]], ] 48 | comparison_rownames_sign <- all(rownames(test_df) == names(pred_class_sign_l[[fold]])) 49 | comparison_rownames_rand <- all(rownames(test_df) == names(pred_class_rand_l[[fold]])) 50 | if (comparison_rownames_sign & comparison_rownames_rand) { 51 | conf_mat_sign <- caret::confusionMatrix( 52 | test_df$new_cell_types, 53 | pred_class_sign_l[[fold]] 54 | ) 55 | conf_mat_rand <- caret::confusionMatrix( 56 | test_df$new_cell_types, 57 | pred_class_rand_l[[fold]] 58 | ) 59 | df <- data.frame( 60 | accuracy = c(conf_mat_sign$overall["Accuracy"], conf_mat_rand$overall["Accuracy"]), 61 | kappa = c(conf_mat_sign$overall["Kappa"], conf_mat_rand$overall["Kappa"]), 62 | type = c("signatures", "random") 63 | ) 64 | df 65 | 66 | } else { 67 | "Rownames do not match" 68 | } 69 | }) 70 | names(accuracy_dfs) <- folds 71 | accuracy_df <- bind_rows(accuracy_dfs, .id = "fold") 72 | saveRDS(accuracy_df, "tmp/accuracy_random_forest_dataframe.rds") 73 | 74 | 75 | # Calculate confusion matrix of probabilities for each fold 76 | conf_mat_probs_l <- purrr::map(folds, function(fold) { 77 | test_df <- tica_df[test_sets_cross_valid[[fold]], ] 78 | if (all(rownames(test_df) == names(pred_probs_sign_l[[fold]]))) { 79 | cell_types <- levels(test_df$new_cell_types) 80 | average_probs_l <- purrr::map(cell_types, function(cell_type) { 81 | indices_cell_type <- rownames(test_df[test_df$new_cell_types == cell_type, ]) 82 | mat <- pred_probs_sign_l[[fold]][indices_cell_type, ] 83 | average_prob <- colMeans(mat) 84 | average_prob 85 | }) 86 | names(average_probs_l) <- cell_types 87 | average_probs_mat <- average_probs_l %>% 88 | bind_rows() %>% 89 | as.matrix() 90 | rownames(average_probs_mat) <- names(average_probs_l) 91 | average_probs_mat 92 | } else { 93 | "Rownames do not match" 94 | } 95 | }) 96 | names(conf_mat_probs_l) <- folds 97 | saveRDS(conf_mat_probs_l, "tmp/confusion_matrix_probabilities_list.rds") 98 | 99 | -------------------------------------------------------------------------------- /Random Forest/5-plot_figures.R: -------------------------------------------------------------------------------- 1 | # This script plots the results of the random forest 2 | 3 | 4 | # Load packages 5 | library(ggpubr) 6 | library(pheatmap) 7 | library(tidyverse) 8 | 9 | 10 | # Load data 11 | accuracy_df <- readRDS("tmp/accuracy_random_forest_dataframe.rds") 12 | conf_mat_probs_l <- readRDS("tmp/confusion_matrix_probabilities_list.rds") 13 | 14 | 15 | # Plot accuracy 16 | accuracy_gg <- accuracy_df %>% 17 | ggplot(aes(type, accuracy, col = type)) + 18 | geom_jitter() + 19 | geom_boxplot() + 20 | scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) + 21 | scale_color_manual(values = c("gray50", "limegreen")) + 22 | labs(x = "", y = "Accuracy") + 23 | theme_classic() + 24 | theme(legend.position = "none", 25 | axis.title.y = element_text(size = 13), 26 | axis.text.x = element_text(size = 11, color = "black")) 27 | 28 | kappa_gg <- accuracy_df %>% 29 | ggplot(aes(type, kappa, col = type)) + 30 | geom_jitter() + 31 | geom_boxplot() + 32 | scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) + 33 | scale_color_manual(values = c("gray50", "limegreen")) + 34 | labs(x = "", y = "Kappa") + 35 | theme_classic() + 36 | theme(legend.position = "none", 37 | axis.title.y = element_text(size = 13), 38 | axis.text.x = element_text(size = 11, color = "black")) 39 | 40 | arranged <- ggarrange(plotlist = list(accuracy_gg, kappa_gg), ncol = 2) 41 | ggsave( 42 | plot = arranged, 43 | filename = "results/accuracy_kappa_random_forest.pdf", 44 | width = 17.5, 45 | height = 8, 46 | units = "cm" 47 | ) 48 | 49 | 50 | # Plot heatmap confusion matrix 51 | colors_function <- colorRampPalette(colors = c("white", "red")) 52 | colors <- colors_function(100) 53 | heatmap <- pheatmap( 54 | conf_mat_probs_l$fold2, 55 | cluster_cols = FALSE, 56 | cluster_rows = FALSE, 57 | scale = "none", 58 | color = colors, 59 | angle_col = 315, 60 | legend = FALSE, 61 | fontsize_row = 9, 62 | fontsize_col = 9 63 | ) 64 | save_pheatmap_pdf <- function(x, filename, width = 17.5 * 0.394, height = 15 * 0.394) { 65 | stopifnot(!missing(x)) 66 | stopifnot(!missing(filename)) 67 | pdf(filename, width = width, height = height) 68 | grid::grid.newpage() 69 | grid::grid.draw(x$gtable) 70 | dev.off() 71 | } 72 | save_pheatmap_pdf(heatmap, "results/confusion_matrix_probabilities.pdf") 73 | 74 | -------------------------------------------------------------------------------- /Random Forest/README.md: -------------------------------------------------------------------------------- 1 | # Validation of TICA clusters using a random forest classifier 2 | 3 | 4 | ## Dependencies 5 | 6 | * [R 3.6.0](https://cran.r-project.org/) 7 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html) 8 | * [readxl 1.3.1](https://cran.r-project.org/web/packages/readxl/index.html) 9 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html) 10 | * [randomForest 4.6.14](https://cran.r-project.org/web/packages/randomForest/index.html) 11 | * [doParallel 1.0.15](https://cran.r-project.org/web/packages/doParallel/index.html) 12 | * [foreach 1.5.0](https://cran.r-project.org/web/packages/foreach/index.html) 13 | * [caret 6.0.86](https://cran.r-project.org/web/packages/caret/index.html) 14 | * [e1071 1.7.3](https://cran.r-project.org/web/packages/e1071/index.html) 15 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html) 16 | * [pheatmap 1.0.12](https://cran.r-project.org/web/packages/pheatmap/index.html) 17 | 18 | 19 | ## Data 20 | 21 | * TICA Seurat object: download it as specified in the publication 22 | * Supplementary table 2: excel file with the markers for each cell type. For simplicity, we provide it in the ./data/ folder. 23 | 24 | 25 | ## Pipeline 26 | 27 | The scripts should be run in the following order: 28 | 29 | 30 | 1. 1-calculate_cell_type_signatures.R: calculates 25 cell type-specific signatures + 25 random signatures for each cell in the dataset. 31 | * Input: 32 | * data/supp_table_2.xlsx: cell type specific markers (Supplementary Table 2 in the paper) 33 | * TICA Seurat object (path should be changed before running the script) 34 | * Output: 35 | * tmp/tica_input_dataframe_signatures_random_forest.rds: data frame with cells as observations, signatures as feautures and cell type as response variable. 36 | 2. 2-create_test_sets_for_cross_validation.R 37 | * Input: tmp/tica_input_dataframe_signatures_random_forest.rds 38 | * Output: tmp/test_set_per_fold.rds: list of 5 character vectors containing the barcodes of the cells that belong to each fold of the cross-validation. 39 | 3. 3-run_random_forest.R: train and test 2 RF (cell type-specific + random signatures). 40 | * Arguments: this script should have one argument specifying the fold to run the RF (ie "fold1"). The purpose of this is to parallelize the execution of the RF across folds. 41 | * Input: 42 | * tmp/tica_input_dataframe_signatures_random_forest.rds 43 | * tmp/test_set_per_fold.rds 44 | * Output: 45 | * Cell type-specific: 46 | * tmp/random_forest_model_signatures_fold{1-5}.rds (RF models) 47 | * tmp/random_forest_predicted_probs_signatures_fold{1-5}.rds (RF probabilities) 48 | * tmp/random_forest_predicted_class_signatures_fold{1-5}.rds (RF predicted classes) 49 | * Random signatures: 50 | * tmp/random_forest_model_random_fold{1-5}.rds 51 | * tmp/random_forest_predicted_probs_random_fold{1-5}.rds 52 | * tmp/random_forest_predicted_class_random_fold{1-5}.rds 53 | 4. 4-evaluate_performance.R 54 | * Input: all outputs from previous script 55 | * Output: 56 | * tmp/confusion_matrix_probabilities_list.rds: list with 5 confusion matrices (one per fold), where each CF is the probability that a cell from a cell type X is assigned to cell type Y. 57 | * tmp/accuracy_random_forest_dataframe.rds: accuracy and kappa statistics for each fold. 58 | 5. 5-plot_figures.R: 59 | * Input: 60 | * tmp/accuracy_random_forest_dataframe.rds 61 | * tmp/confusion_matrix_probabilities_list.rds 62 | * Output: 63 | * results/accuracy_kappa_random_forest.pdf (panels a and b) 64 | * results/confusion_matrix_probabilities.pdf (panel c) -------------------------------------------------------------------------------- /Random Forest/data/supp_table_2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Random Forest/data/supp_table_2.xlsx -------------------------------------------------------------------------------- /ST-breast/1-10x_breast_QC.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "1-10x Breast cancer samples" 3 | author: "Marc Elosua-Bayes" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | BiocStyle::html_document: 7 | toc: yes 8 | toc_float: yes 9 | number_sections: yes 10 | df_print: paged 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 15 | ``` 16 | 17 | ## Introduction 18 | 19 | In this R markdown document we are going to look at the QC of breast carcinoma datasets publicly put out by 10X genomics [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/). Specifications of the tissue are the following: 20 | 21 | 10x Genomics obtained fresh frozen Invasive Ductal Carcinoma breast tissue from BioIVT Asterand. 22 | The tissue was AJCC/UICC Stage Group IIA, ER positive, PR negative, Her2 positive and annotated with: 23 | 24 | * Ductal carcinoma in situ 25 | * Lobular carcinoma in situ 26 | * Invasive Carcinoma 27 | 28 | ## Libraries 29 | ```{r} 30 | library(Seurat) 31 | library(tidyverse) 32 | library(Matrix) 33 | # source("utils/spatial_plot_spaniel.R") 34 | ``` 35 | 36 | ## Paths 37 | ```{r} 38 | source(here::here("misc/paths.R")) 39 | source(here::here("utils/bin.R")) 40 | 41 | "{an_breast_10x}/{robj_dir}" %>% 42 | glue::glue() %>% 43 | here::here() %>% 44 | dir.create( 45 | path = ., 46 | showWarnings = FALSE, 47 | recursive = TRUE) 48 | 49 | "{an_breast_10x}/{plt_dir}" %>% 50 | glue::glue() %>% 51 | here::here() %>% 52 | dir.create( 53 | path = ., 54 | showWarnings = FALSE, 55 | recursive = TRUE) 56 | ``` 57 | 58 | ## Load data 59 | Within this project the data is located at data/breast_visium/section_1 and data/breast_visium/section_2 respectively 60 | ```{r} 61 | breast_1 <- Seurat::Load10X_Spatial(data.dir = here::here("data/breast_visium/section_1"), 62 | filename = "filtered_feature_bc_matrix.h5", 63 | slice = "breast_1") 64 | breast_1[["slice"]] <- "breast_1" 65 | 66 | breast_2 <- Seurat::Load10X_Spatial(data.dir = here::here("data/breast_visium/section_2"), 67 | filename = "filtered_feature_bc_matrix.h5", 68 | slice = "breast_2") 69 | breast_2[["slice"]] <- "breast_2" 70 | ``` 71 | 72 | The first slice has `r nrow(breast_1)` and `r ncol(breast_1)` samples, the second one in turn has `r nrow(breast_2)` and `r ncol(breast_2)` spots 73 | 74 | ### Remove empty genes 75 | We start by removing those genes that aren't expressed in any of the spots overlaying the tissue 76 | ```{r} 77 | table(rowSums(as.matrix(breast_1@assays$Spatial@counts)) == 0) 78 | 79 | keep_genes1 <- rowSums(as.matrix(breast_1@assays$Spatial@counts)) != 0 80 | breast_1 <- breast_1[keep_genes1, ] 81 | 82 | table(rowSums(as.matrix(breast_2@assays$Spatial@counts)) == 0) 83 | 84 | keep_genes2 <- rowSums(as.matrix(breast_2@assays$Spatial@counts)) != 0 85 | breast_2 <- breast_2[keep_genes2, ] 86 | ``` 87 | 88 | In order to work with multiple slices in the same Seurat object, we provide the merge function. 89 | ```{r} 90 | breast_merged <- merge(breast_1, breast_2) 91 | ``` 92 | 93 | ## Process data 94 | ```{r} 95 | breast_merged <- Seurat::SCTransform(object = breast_merged, 96 | assay = "Spatial") 97 | # breast_merged <- Seurat::FindVariableFeatures(breast_merged) 98 | ``` 99 | 100 | This then enables joint dimensional reduction and clustering on the underlying RNA expression data. 101 | ```{r} 102 | # Seurat::DefaultAssay(breast_merged) <- "SCT" 103 | # Seurat::VariableFeatures(breast_merged) <- c(Seurat::VariableFeatures(breast_1), 104 | # Seurat::VariableFeatures(breast_2)) 105 | breast_merged <- Seurat::RunPCA(breast_merged, 106 | verbose = FALSE) %>% 107 | Seurat::FindNeighbors(., dims = 1:30, verbose = FALSE) %>% 108 | Seurat::FindClusters(., 109 | verbose = FALSE, 110 | resolution = c(0.1, 0.25, 0.5, 0.8, 1)) %>% 111 | Seurat::RunUMAP(., dims = 1:30, verbose = FALSE) 112 | ``` 113 | 114 | Add mitochondrial and ribosomal % 115 | ```{r} 116 | # Collect all genes coded on the mitochondrial genome 117 | breast_merged[["percent.mito"]] <- Seurat::PercentageFeatureSet( 118 | object = breast_merged, 119 | pattern = "^MT-") 120 | summary(breast_merged[["percent.mito"]]) 121 | 122 | # Collect all genes coding for ribosomal proteins 123 | breast_merged[["percent.ribo"]] <- Seurat::PercentageFeatureSet( 124 | object = breast_merged, 125 | pattern = "^RPL|^RPS") 126 | summary(breast_merged[["percent.ribo"]]) 127 | ``` 128 | 129 | ## QC Analysis 130 | 131 | ### Basic features 132 | #### Number of genes 133 | We start by plotting some basic features that will help us visualize and define filtering options. 134 | We start by plotting the number of genes per spot, *complexity*, to assess if there are empty spots or huge disparity. 135 | ```{r fig.height=8, fig.width=12} 136 | p1 <- ggplot2::ggplot() + 137 | ggplot2::geom_histogram(data = breast_merged[[]], 138 | ggplot2::aes(nFeature_Spatial), 139 | fill = "red", 140 | alpha = 0.7, 141 | color = "red", 142 | bins = 50) + 143 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 144 | ggplot2::ggtitle("Unique genes per spot") + 145 | ggplot2::labs(x = "Number of Detected Genes", 146 | y = "Number of Spots") + 147 | ggpubr::theme_pubr() 148 | 149 | p1 150 | ``` 151 | 152 | After looking at the distribution we are also going to look at how these spots look on the tissue 153 | ```{r fig.height=12, fig.width=12} 154 | Seurat::SpatialFeaturePlot( 155 | object = breast_merged, 156 | features = "nFeature_Spatial") & 157 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) 158 | ``` 159 | 160 | #### Number of reads 161 | Next we want to look at the number of reads captured per spot, this should correlate with spot complexity and will allow us to see regions with higher transcriptional activity. 162 | ```{r fig.height=8, fig.width=12} 163 | p2 <- ggplot2::ggplot() + 164 | ggplot2::geom_histogram(data = breast_merged[[]], 165 | ggplot2::aes(nCount_Spatial), 166 | fill = "red", 167 | alpha = 0.7, 168 | color = "red", 169 | bins = 50) + 170 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 171 | ggplot2::ggtitle("Total counts per spots") + 172 | ggplot2::labs(x = "Library Size (total UMI)", 173 | y = "Number of Spots") + 174 | ggpubr::theme_pubr() 175 | 176 | p2 177 | ``` 178 | 179 | 180 | ```{r fig.height=12, fig.width=12} 181 | Seurat::SpatialFeaturePlot( 182 | object = breast_merged, 183 | features = "nCount_Spatial") & 184 | ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18)) 185 | ``` 186 | 187 | #### Gene counts 188 | Another characteristic we want to look at is how many counts per gene there are since we want to remove lowly expressed genes which aren't giving information but potentially introducing noise. 189 | ```{r fig.height=8, fig.width=12} 190 | count_mtrx <- Seurat::GetAssayData(object = breast_merged, 191 | slot = "counts", 192 | assay = "Spatial") 193 | 194 | gene_attr <- lapply(c("slice_1", "slice_2"), function(id) { 195 | if (id == "slice_1") tail_id <- "_1" else tail_id <- "_2" 196 | 197 | mask <- stringr::str_detect( 198 | string = colnames(count_mtrx), 199 | pattern = tail_id) 200 | 201 | gene_attr <- data.frame( 202 | nUMI = Matrix::rowSums(count_mtrx[, mask]), 203 | nSpots = Matrix::rowSums(count_mtrx[, mask] > 0), 204 | slice = id) 205 | }) %>% 206 | dplyr::bind_rows() 207 | 208 | p3 <- ggplot2::ggplot() + 209 | ggplot2::geom_histogram(data = gene_attr, 210 | ggplot2::aes(nUMI), 211 | fill = "red", 212 | alpha = 0.7, 213 | color = "red", 214 | bins = 50) + 215 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 216 | ggplot2::scale_x_log10() + 217 | ggplot2::ggtitle("Total counts per gene (log10 scale)") + 218 | ggpubr::theme_pubr() 219 | p3 220 | ``` 221 | 222 | #### Gene ubiquitousness 223 | We also look at on how many spots each gene is detected, we see there are a few genes expressed in almost all the spots while and a majority of genes detected in few spots. 224 | ```{r fig.height=8, fig.width=12} 225 | p4 <- ggplot2::ggplot() + 226 | ggplot2::geom_histogram(data = gene_attr, 227 | ggplot2::aes(nSpots), 228 | fill = "red", 229 | alpha = 0.7, 230 | color = "red", 231 | bins = 50) + 232 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 233 | ggplot2::ggtitle("Total spots per gene") + 234 | ggpubr::theme_pubr() 235 | 236 | p4 237 | ``` 238 | 239 | #### Mitochondrial % 240 | Next we take a look at the mitochondrial %; This metric can help us get a first glimpse of metabolic activity and/or necrotic regions - 10X explains [here](https://kb.10xgenomics.com/hc/en-us/articles/360001086611-Why-do-I-see-a-high-level-of-mitochondrial-gene-expression-) 241 | ```{r fig.height=8, fig.width=12} 242 | p5 <- ggplot2::ggplot() + 243 | ggplot2::geom_histogram(data = breast_merged[[]], 244 | ggplot2::aes(percent.mito), 245 | fill = "red", 246 | alpha = 0.7, 247 | color = "red", 248 | bins = 50) + 249 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 250 | ggplot2::ggtitle("Mitochondrial % per spot") + 251 | ggplot2::labs(x = "Mitochondrial % ", 252 | y = "Number of Spots") + 253 | ggpubr::theme_pubr() 254 | p5 255 | ``` 256 | 257 | ```{r fig.height=12, fig.width=12} 258 | Seurat::SpatialFeaturePlot( 259 | object = breast_merged, 260 | features = "percent.mito") & 261 | ggplot2::theme(plot.title = element_text(hjust = 0.5, size = 18)) 262 | ``` 263 | 264 | #### Ribosomal % 265 | Lastly we look at the ribosomal % which gives us insight into which regions are the most transcriptomically active when looked side by side with the number of detected genes. 266 | ```{r fig.height=8, fig.width=12} 267 | p6 <- ggplot2::ggplot() + 268 | ggplot2::geom_histogram(data = breast_merged[[]], 269 | ggplot2::aes(percent.ribo), 270 | fill = "red", 271 | alpha = 0.7, 272 | color = "red", 273 | bins = 50) + 274 | ggplot2::facet_wrap(. ~ slice, scales = "free") + 275 | ggplot2::ggtitle("Ribosomal % per spot") + 276 | ggplot2::labs(x = "Ribosomal % ", 277 | y = "Number of Spots") + 278 | ggpubr::theme_pubr() 279 | p6 280 | ``` 281 | 282 | ```{r fig.height=12, fig.width=12} 283 | Seurat::SpatialFeaturePlot( 284 | object = breast_merged, 285 | features = "percent.ribo") + 286 | ggplot2::theme(plot.title = element_text(hjust = 0.5, size = 18)) 287 | ``` 288 | 289 | ### Feature covariation 290 | Next we look at how these features covariate. 291 | ```{r fig.height=15, fig.width=15} 292 | plt_covar_ls <- qc_covar_plots(se = breast_merged, 293 | nfeat = "nFeature_Spatial", 294 | ncount = "nCount_Spatial", 295 | slot = "counts", 296 | assay = "Spatial", 297 | percent.mito = "percent.mito", 298 | percent.ribo = "percent.ribo", 299 | facet = "slice") 300 | 301 | cowplot::plot_grid( 302 | plotlist = plt_covar_ls, 303 | ncol = 2, 304 | align = "hv", 305 | axis = "trbl") 306 | ``` 307 | 308 | ## Save RDS 309 | Save the object to use downstream. 310 | ```{r} 311 | lapply(c("breast_1", "breast_2"), function(id) { 312 | 313 | breast_sub <- breast_merged[, breast_merged$slice == id] 314 | 315 | # Remove other images 316 | breast_sub@images <- breast_sub@images[Seurat::Images(breast_sub) == id] 317 | 318 | "{an_breast_10x}/{robj_dir}/qc_se_{id}.rds" %>% 319 | glue::glue() %>% 320 | here::here() %>% 321 | saveRDS( 322 | object = breast_sub, 323 | file = .) 324 | }) 325 | ``` 326 | 327 | ## Session Info 328 | ```{r} 329 | sessionInfo() 330 | ``` 331 | -------------------------------------------------------------------------------- /ST-breast/2-10x_breast_GO_enrichment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "2-10x breast GO enrichment" 3 | author: "Marc Elosua-Bayes" 4 | date: "6/12/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## Introduction 13 | 14 | In this R markdown document we are going to carry out Gene marker and Gene Ontology enrichment analysis. 15 | 16 | ## Libraries 17 | ```{r} 18 | library(Seurat) 19 | library(Spaniel) 20 | library(tidyverse) 21 | library(Matrix) 22 | library(SPOTlight) 23 | source("utils/bin.r") 24 | source("utils/spatial_plot_spaniel.R") 25 | ``` 26 | 27 | ## Paths 28 | ```{r} 29 | source("misc/paths.R") 30 | dir.create(path = sprintf("%s/%s", an_breast_10x, robj_dir), 31 | showWarnings = FALSE, 32 | recursive = TRUE) 33 | 34 | dir.create(path = sprintf("%s/%s", an_breast_10x, plt_dir), 35 | showWarnings = FALSE, 36 | recursive = TRUE) 37 | 38 | ``` 39 | 40 | ## Load data 41 | Data loaded here comes from 1-10x_breast_QC.Rmd and 3-10x_breast_immune_reference.Rmd 42 | ```{r} 43 | breast_merged <- readRDS(file = sprintf("%s/%s/breast_merged_processed.RDS", an_breast_10x, robj_dir)) 44 | decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x.RDS", an_breast_10x, robj_dir)) 45 | ``` 46 | 47 | ## Analysis 48 | Start by getting the marker genes for the clusters 49 | ```{r} 50 | Seurat::Idents(breast_merged) <- breast_merged$SCT_snn_res.0.1 51 | st_markers <- Seurat::FindAllMarkers(object = breast_merged, 52 | assay = "SCT", 53 | slot = "data", 54 | only.pos = TRUE) 55 | st_markers %>% count(cluster) 56 | saveRDS(object = st_markers, 57 | file = sprintf("%s/%s/markers_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir)) 58 | ``` 59 | 60 | Filter top genes 61 | ```{r} 62 | pull_col <- function(.x) { 63 | return(.x %>% pull(var = "gene")) 64 | } 65 | 66 | marker_ls <- st_markers %>% 67 | dplyr::filter(p_val_adj < 0.01) %>% 68 | dplyr::group_by(cluster) %>% 69 | dplyr::group_split() %>% 70 | purrr::map(.f = pull_col) 71 | 72 | ``` 73 | 74 | ### GO term enrichment analysis 75 | ```{r eval = FALSE} 76 | library(org.Hs.eg.db) 77 | library(GOstats) 78 | 79 | univ <- stringr::str_split(string = rownames(breast_merged@assays$Spatial@counts), 80 | pattern = " ", 81 | simplify = TRUE)[, 1] 82 | 83 | DE_df1 <- lapply(seq_len(length(marker_ls)), function(i) { 84 | # gene_set <- convert_symb_entrez(gene_vec = stringr::str_to_upper(markers_ln1_ls[[i]])) 85 | # gene_universe <- convert_symb_entrez(gene_vec = stringr::str_to_upper(rownames(ln_1))) 86 | print(i) 87 | go_clust <- gene_enrichment_GO(gene_de = stringr::str_to_upper(marker_ls[[i]]), 88 | gene_universe = univ) %>% 89 | summary(.) %>% 90 | dplyr::mutate(cluster = i) 91 | 92 | return(go_clust) 93 | }) %>% 94 | dplyr::bind_rows() 95 | 96 | saveRDS(object = DE_df1, file = sprintf("%s/%s/go_enrichment_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir)) 97 | ``` 98 | 99 | 100 | ```{r} 101 | DE_df1 <- readRDS(file = sprintf("%s/%s/go_enrichment_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir)) 102 | ``` 103 | 104 | #### Visualization 105 | For visualization purposses we are going to check GO terms at varying levels of specificities. 106 | 107 | ```{r} 108 | DE_specific <- DE_df1 %>% 109 | dplyr::filter(Size <= 50 & Pvalue < 0.01 & Count >= 3) 110 | 111 | DE_mid <- DE_df1 %>% 112 | dplyr::filter(Size <= 300 & Size >= 30 & Pvalue < 0.01 & Count >= 3) 113 | 114 | DE_general <- DE_df1 %>% 115 | dplyr::filter(Size > 300 & Pvalue < 0.01 & Count >= 3) 116 | ``` 117 | 118 | ```{r} 119 | GO_visualization <- function(ds) { 120 | plt_ls <- lapply(unique(ds$cluster), function(i) { 121 | tmp_plt <- ds %>% 122 | dplyr::filter(cluster == i) %>% 123 | dplyr::arrange(desc(OddsRatio)) %>% 124 | head(20) %>% 125 | ggplot(.) + 126 | geom_point(aes(x = OddsRatio, 127 | y = reorder(Term, OddsRatio), 128 | size = -Pvalue, 129 | color = Pvalue)) + 130 | labs(title = sprintf("Cluster %s", i - 1)) + 131 | scale_color_gradient(low = "green", 132 | high = "red") + 133 | theme_classic() 134 | return(tmp_plt) 135 | }) 136 | return(plt_ls) 137 | } 138 | ``` 139 | 140 | Specific 141 | ```{r} 142 | GO_visualization(ds = DE_specific) %>% 143 | ggpubr::ggexport(plotlist = ., 144 | filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_specific.pdf", 145 | an_breast_10x, plt_dir), 146 | width = 12) 147 | ``` 148 | 149 | Middle 150 | ```{r} 151 | GO_visualization(ds = DE_mid) %>% 152 | ggpubr::ggexport(plotlist = ., 153 | filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_mid.pdf", 154 | an_breast_10x, plt_dir), 155 | width = 12) 156 | ``` 157 | 158 | General 159 | ```{r} 160 | GO_visualization(ds = DE_general) %>% 161 | ggpubr::ggexport(plotlist = ., 162 | filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_general.pdf", 163 | an_breast_10x, plt_dir), 164 | width = 12) 165 | ``` 166 | 167 | Look at estrogen, progesterone and HER2 receptor genes 168 | ```{r} 169 | Seurat::SpatialFeaturePlot(object = breast_merged, 170 | features = c("ESR1", # Estrogen receptor 171 | "PGR", # Progesterone receptor 172 | "ERBB2"), # HER2 positive 173 | alpha = c(0, 1)) 174 | 175 | 176 | Seurat::SpatialDimPlot(object = breast_merged, 177 | group.by = "seurat_clusters") 178 | 179 | ``` 180 | 181 | -------------------------------------------------------------------------------- /ST-breast/3-10x_breast_immune_reference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "Default!" 15 | title: "`r sprintf('3-10x_breast_immune_reference %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 20 | message = FALSE, warning = FALSE) 21 | options(width = 1200) 22 | ``` 23 | 24 | ## Introduction 25 | In this R markdown document we will map immune cells onto the breast cancer tissue. 26 | 27 | ```{r} 28 | library(Seurat) 29 | library(tidyverse) 30 | library(Matrix) 31 | library(SPOTlight) 32 | source(here::here("utils/bin.r")) 33 | ``` 34 | 35 | ## Paths 36 | ```{r} 37 | source(here::here("misc/paths.R")) 38 | 39 | "{an_breast_10x}/{robj_dir}" %>% 40 | glue::glue() %>% 41 | here::here() %>% 42 | dir.create( 43 | path = , 44 | showWarnings = FALSE, 45 | recursive = TRUE) 46 | 47 | "{an_breast_10x}/{plt_dir}" %>% 48 | glue::glue() %>% 49 | here::here() %>% 50 | dir.create( 51 | path = , 52 | showWarnings = FALSE, 53 | recursive = TRUE) 54 | ``` 55 | 56 | ## Parameters 57 | ```{r} 58 | set.seed(1243) 59 | 60 | sample_id <- params$sample_id 61 | # sample_id <- "breast_1" 62 | # sample_id <- "breast_2" 63 | clust_vr <- params$clust_vr 64 | # clust_vr <- "lv1_annot" 65 | # clust_vr <- "lv2_annot" 66 | 67 | trn <- "melanoma" 68 | cl_n <- 100 69 | hvg <- 3000 70 | ntop <- NULL 71 | transf <- "uv" 72 | method <- "nsNMF" 73 | min_cont <- 0 74 | 75 | if (is.null(ntop)) { 76 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}") 77 | } else { 78 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}") 79 | } 80 | ``` 81 | 82 | Define cell types of interest depending on the annotation level 83 | ```{r} 84 | if (clust_vr == "lv2_annot") { 85 | ct_interest <- c("Plasma.B.cells", "T.regs", "T.helper.Th17", 86 | "Proliferation", "CD8.exhausted", 87 | "CD8.cytotoxic", "NK", "Macrophages.SPP1", "TAMs.C1QC") 88 | } else if (clust_vr == "lv1_annot") { 89 | ct_interest <- c("Plasma B cells", "T cells regulatory", "T helper cells", 90 | "Proliferation", "CD8 pre-exhausted", 91 | "CD8 terminally exhausted", "NK", "Macrophages SPP1", "TAMs C1QC") 92 | } 93 | ``` 94 | 95 | 96 | ## Load data 97 | Breast cancer data loaded here comes from 1-10x_breast_QC.Rmd 98 | ```{r} 99 | ### Spatial breast cancer 100 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>% 101 | glue::glue() %>% 102 | here::here() %>% 103 | readRDS(file = .) 104 | ``` 105 | 106 | Create a name/color dataframe 107 | ```{r} 108 | # source(here::here("misc/col_df_scrpt.R")) 109 | # source(here::here("misc/col_df_scrpt2.R")) 110 | 111 | # col_vec <- readRDS(here::here("data/immune_reference/complete_cell_type_palette.rds")) 112 | # col_df <- data.frame(col_vec) %>% 113 | # tibble::rownames_to_column("plt_name") %>% 114 | # dplyr::mutate(ct_name = stringr::str_replace_all( 115 | # string = plt_name, 116 | # pattern = "[[:punct:]]|[[:blank:]]", 117 | # replacement = ".")) 118 | # prloif_df <- data.frame(plt_name = "Proliferation", col_vec = "Brown", ct_name = "Proliferation") 119 | # col_df <- rbind(col_df, prloif_df) 120 | # col_df <- col_df %>% 121 | # dplyr::mutate( 122 | # # plt_name = dplyr::if_else(plt_name == "Th17 cells", "CD4 effector memory", plt_name), 123 | # # ct_name = dplyr::if_else(ct_name == "Th17.cells", "CD4.effector.memory", ct_name), 124 | # plt_name = dplyr::if_else(plt_name == "TAMs M2", "TAMs C1QC", plt_name) 125 | # ) 126 | # saveRDS(object = col_df, file = here::here("misc/col_df.rds")) 127 | col_df <- readRDS(here::here("misc/col_df.rds")) 128 | ``` 129 | 130 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset. 131 | ```{r eval = FALSE} 132 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>% 133 | glue::glue() %>% 134 | here::here() 135 | 136 | if (file.exists(ica_melanoma2_path)) { 137 | 138 | ica_sub <- readRDS(file = ica_melanoma2_path) 139 | } else { 140 | # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds") 141 | ica_se <- "/scratch/devel/pnieto/TIL_Atlas/atlas_final_dataset/new_annot/TICAtlas.rds" %>% 142 | here::here() %>% 143 | readRDS(.) 144 | ### Immune reference atlas 145 | ica_sub <- subset(ica_se, subset = source == "melanoma2") 146 | rm(ica_se) 147 | 148 | saveRDS(object = ica_sub, file = ica_melanoma2_path) 149 | } 150 | 151 | table(ica_sub@meta.data$lv1_annot) 152 | table(ica_sub@meta.data$lv2_annot) 153 | ``` 154 | 155 | Change variable names to remove non-standard characters 156 | ```{r eval = FALSE} 157 | ica_sub[["specific_cell_type_mod"]] <- stringr::str_replace_all( 158 | string = as.character(ica_sub@meta.data[, clust_vr]), 159 | pattern = "[[:punct:]]|[[:blank:]]", 160 | replacement = ".") 161 | 162 | prolif_vec <- c("T.cells.proliferative", "Macrophages.and.monocytes.proliferative", 163 | "B.cells.proliferative", "Macrophages.proliferative") 164 | 165 | ica_sub[["specific_cell_type_mod"]] <- 166 | dplyr::if_else(ica_sub@meta.data[, "specific_cell_type_mod"] %in% prolif_vec, 167 | "Proliferation", ica_sub@meta.data[, "specific_cell_type_mod"]) 168 | 169 | table(ica_sub@meta.data$specific_cell_type_mod) 170 | ``` 171 | 172 | Immune cell marker genes 173 | ```{r eval = FALSE} 174 | ica_sub <- Seurat::SCTransform(object = ica_sub, assay = "RNA") 175 | 176 | Seurat::Idents(ica_sub) <- as.character(ica_sub@meta.data[, "specific_cell_type_mod"]) 177 | 178 | ica_markers <- Seurat::FindAllMarkers(object = ica_sub, 179 | assay = "SCT", 180 | slot = "data", 181 | only.pos = TRUE, 182 | logfc.threshold = 0, 183 | min.pct = 0, 184 | max.cells.per.ident = 500) 185 | 186 | 187 | "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>% 188 | glue::glue()%>% 189 | here::here( ) %>% 190 | saveRDS( 191 | object = ica_markers, 192 | file = .) 193 | ``` 194 | 195 | Load marker genes per cluster 196 | ```{r eval = FALSE} 197 | ica_markers <- "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>% 198 | glue::glue()%>% 199 | here::here() %>% 200 | readRDS(file = .) 201 | 202 | # ica_markers <- readRDS(file = here::here("data/immune_reference/ica_markers_melanoma2.rds")) 203 | 204 | # if (clust_vr == "lv1_annot") { 205 | # Remove MAST cells 206 | ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells") 207 | # } else if (clust_vr == "lv2_annot") { 208 | # Remove MAST cells 209 | # ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells") 210 | 211 | # } 212 | ``` 213 | 214 | Look at the proliferation markers 215 | ```{r eval = FALSE} 216 | ica_markers %>% 217 | dplyr::filter(cluster == "Proliferation") %>% 218 | head(20) 219 | ``` 220 | 221 | ## Deconvolution 222 | Run deconvolution. 223 | ```{r message = FALSE, eval = FALSE} 224 | # Remove Mast cells 225 | ica_sub <- subset(ica_sub, subset = specific_cell_type_mod != "Mast.cells") 226 | # Create Seurat object with filtered gene matrix after removing Mast cells 227 | ica_sub <- Seurat::CreateSeuratObject( 228 | counts = ica_sub@assays$RNA@counts[ 229 | sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ], 230 | meta.data = ica_sub@meta.data) 231 | 232 | # Remove empty spatial genes 233 | # keep_genes <- rowSums(as.matrix(se_obj@assays$Spatial@counts)) != 0 234 | # table(rowSums(ica_sub@assays$RNA@counts) != 0) 235 | 236 | # Run deconvolution 237 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution( 238 | se_sc = ica_sub, 239 | counts_spatial = se_obj@assays$Spatial@counts, 240 | clust_vr = "specific_cell_type_mod", 241 | cluster_markers = ica_markers, 242 | cl_n = cl_n, 243 | hvg = hvg, 244 | ntop = ntop, 245 | transf = transf, 246 | method = method, 247 | min_cont = min_cont, 248 | assay = "RNA", 249 | slot = "counts") 250 | 251 | # saveRDS(object = decon_mtrx_ls, 252 | # file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds", 253 | # an_breast_10x, robj_dir, spotlight_id)) 254 | # 255 | # saveRDS(object = decon_mtrx_ls, 256 | # file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds", 257 | # an_breast_10x, robj_dir, spotlight_id)) 258 | 259 | "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 260 | glue::glue() %>% 261 | here::here() %>% 262 | saveRDS(object = decon_mtrx_ls, file = .) 263 | ``` 264 | 265 | Add deconvolution matrix to Seurat object metadata 266 | ```{r} 267 | decon_mtrx_ls <- "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 268 | glue::glue() %>% 269 | here::here() %>% 270 | readRDS(file = .) 271 | 272 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s.rds", 273 | # an_breast_10x, robj_dir, spotlight_id)) 274 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_pdac_atlas.rds", an_breast_10x, robj_dir)) 275 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds", 276 | # an_breast_10x, robj_dir, spotlight_id)) 277 | # 278 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds", 279 | # an_breast_10x, robj_dir, spotlight_id)) 280 | 281 | decon_mtrx <- decon_mtrx_ls[[2]] 282 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"] 283 | 284 | # Set as 0 cell types predicted to be under 3 % of the spot 285 | decon_mtrx[decon_mtrx < 0.03] <- 0 286 | ``` 287 | 288 | Change names to original ones and reorder according to Paula 289 | ```{r} 290 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>% 291 | dplyr::left_join(col_df, by = "ct_name") %>% 292 | dplyr::pull(plt_name) 293 | 294 | colnames(decon_mtrx) <- new_colnames 295 | ``` 296 | 297 | Add deconvolution matrix to Seurat object metadata 298 | ```{r} 299 | se_obj@meta.data <- cbind(se_obj@meta.data, round(decon_mtrx, 3)) 300 | ``` 301 | 302 | ### Visualization 303 | Before we start we take a look at the topic profiles 304 | ```{r} 305 | # Extract coefficient matrix 306 | h <- NMF::coef(decon_mtrx_ls[[1]][[1]]) 307 | 308 | # Extract coefficient labels 309 | train_labs <- data.frame(ct_name = decon_mtrx_ls[[1]][[2]]) %>% 310 | dplyr::left_join(col_df, by = "ct_name") %>% 311 | dplyr::pull(plt_name) 312 | 313 | # rownames(h) <- paste("Topic", 1:nrow(h), sep = " ") 314 | profile_plt <- SPOTlight::dot_plot_profiles_fun( 315 | h = h, 316 | train_cell_clust = train_labs) 317 | 318 | plt_2 <- profile_plt[[2]] + 319 | ggplot2::scale_x_discrete(limits = unique(train_labs)) + 320 | ggplot2::scale_y_discrete(labels = glue::glue("Topic {1:nrow(h)}")) + 321 | ggplot2::labs(x = "", y = "") + 322 | ggplot2::theme( 323 | axis.title = ggplot2::element_text(size = 15), 324 | axis.text.x = ggplot2::element_text(hjust = 1) 325 | ) 326 | profile_plt[[1]] 327 | 328 | "{an_breast_10x}/{plt_dir}/all_ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>% 329 | glue::glue() %>% 330 | here::here() %>% 331 | cowplot::save_plot( 332 | filename = ., 333 | plot = profile_plt[[1]], 334 | base_height = 30, 335 | base_width = 30) 336 | 337 | "{an_breast_10x}/{plt_dir}/ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>% 338 | glue::glue() %>% 339 | here::here() %>% 340 | cowplot::save_plot( 341 | filename = ., 342 | plot = plt_2 + 343 | ggplot2::theme(axis.title = ggplot2::element_text(size = 18)), 344 | base_height = 15, 345 | base_width = 15) 346 | ``` 347 | 348 | Lastly we can take a look at which genes are the most important for each topic and get an insight into which genes are driving them. 349 | ```{r} 350 | basis_spotlight <- data.frame(NMF::basis(decon_mtrx_ls[[1]][[1]])) 351 | 352 | # colnames(basis_spotlight) <- unique(stringr::str_wrap(decon_mtrx_ls[[1]][[2]], width = 30)) 353 | colnames(basis_spotlight) <- glue::glue("Topic-{1:length(unique(decon_mtrx_ls[[1]][[2]]))}") 354 | 355 | basis_spotlight %>% 356 | round(., 5) %>% 357 | DT::datatable(., filter = "top") 358 | ``` 359 | 360 | We will start by looking at the location of each cell type 361 | ```{r} 362 | # https://stackoverflow.com/questions/38722202/how-do-i-change-the-number-of-decimal-places-on-axis-labels-in-ggplot2 363 | # Our transformation function to ensure 2 decimals in the legend 364 | scaleFUN <- function(x) sprintf("%.2f", x) 365 | 366 | ct_all <- colnames(decon_mtrx) 367 | ct_plt_ls <- lapply(ct_all, function(ct) { 368 | tmp_plt <- Seurat::SpatialPlot( 369 | object = se_obj, 370 | features = ct, 371 | alpha = c(0, 1), 372 | # Remove background image 373 | image.alpha = 0) 374 | 375 | if (sum(se_obj@meta.data[, ct]) == 0) { 376 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0)) 377 | } else { 378 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0, 1)) 379 | } 380 | 381 | tmp_plt <- tmp_plt + 382 | ggplot2::theme( 383 | legend.title = ggplot2::element_blank() 384 | ) + 385 | ggplot2::labs(title = ct) + 386 | ggplot2::scale_fill_gradientn( 387 | colors = grDevices::heat.colors(10, rev = TRUE), 388 | # Same number of breaks for all plots 389 | breaks = seq(min(se_obj@meta.data[, ct]), 390 | max(se_obj@meta.data[, ct]), 391 | length.out = 4), 392 | # 2 decimals in the legend 393 | labels = scaleFUN 394 | # limits = c(0, 1) 395 | ) 396 | return(tmp_plt) 397 | }) 398 | 399 | ct_grid <- cowplot::plot_grid( 400 | plotlist = ct_plt_ls, 401 | axis = "trbl", 402 | align = "hv", 403 | nrow = 5, 404 | ncol = 5) 405 | 406 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_immune_reference_arrangement_{clust_vr}.pdf" %>% 407 | glue::glue() %>% 408 | here::here() %>% 409 | cowplot::save_plot( 410 | filename = ., 411 | plot = ct_grid, 412 | base_height = 25, 413 | base_width = 25) 414 | ``` 415 | 416 | Next we will only plot the location of the cell types of interest for the main plot 417 | ```{r eval = FALSE} 418 | # i <- unique(se_obj$slice)[1] 419 | # ct <- cell_types[[1]] 420 | ct_plt_ls <- lapply(ct_interest, function(ct) { 421 | print(ct) 422 | tmp_plt <- Seurat::SpatialPlot( 423 | object = se_obj, 424 | features = ct, 425 | alpha = c(0, 1), 426 | # Remove background image 427 | image.alpha = 0) + 428 | theme(legend.title = ggplot2::element_blank()) 429 | 430 | if (sum(se_obj@meta.data[, ct]) == 0) { 431 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0)) 432 | } else { 433 | tmp_plt <- tmp_plt + 434 | ggplot2::scale_fill_gradientn( 435 | colors = grDevices::heat.colors(10, rev = TRUE), 436 | # Same number of breaks for all plots 437 | breaks = seq(min(se_obj@meta.data[, ct]), 438 | max(se_obj@meta.data[, ct]), 439 | length.out = 4), 440 | # 2 decimals in the legend 441 | labels = scaleFUN 442 | # limits = c(0, 1) 443 | ) 444 | } 445 | 446 | tmp_plt <- tmp_plt + 447 | ggplot2::labs(title = ct) + 448 | ggplot2::theme( 449 | plot.title = ggplot2::element_text(hjust = 0.5, size = 20, face = "bold"), 450 | legend.title = ggplot2::element_blank()) 451 | 452 | return(tmp_plt) 453 | }) 454 | 455 | interest_grid <- cowplot::plot_grid(plotlist = ct_plt_ls, 456 | axis = "trbl", 457 | align = "hv", 458 | nrow = 3, 459 | ncol = 3) 460 | 461 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_immune_reference_arrangement_interest_{clust_vr}.pdf" %>% 462 | glue::glue() %>% 463 | here::here() %>% 464 | cowplot::save_plot( 465 | filename = ., 466 | plot = interest_grid, 467 | base_height = 18, 468 | base_width = 15) 469 | ``` 470 | 471 | Then we will plot the spatial-scatterpie for both slices to get a gist of what is going on 472 | ```{r eval = FALSE} 473 | spsct_plt1 <- SPOTlight::spatial_scatterpie( 474 | se_obj = se_obj, 475 | cell_types_all = cell_types, 476 | img_path = here::here("data/breast_visium/section_1/spatial/tissue_lowres_image.pdf"), 477 | pie_scale = 0.4, 478 | slice = sample_id) 479 | 480 | if (clust_vr == "lv1_annot") { 481 | spsct_plt1 <- spsct_plt1 + 482 | ggplot2::scale_fill_manual( 483 | values = col_df[col_df$plt_name %in% cell_types, "ct_col"], 484 | breaks = cell_types) + 485 | ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1)) 486 | } 487 | 488 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_spatial_scatterpie_{clust_vr}.pdf" %>% 489 | glue::glue() %>% 490 | here::here() %>% 491 | cowplot::save_plot( 492 | filename = ., 493 | plot = spsct_plt1, 494 | base_width = 12, 495 | base_height = 9) 496 | ``` 497 | 498 | We will also plot the scatterpie without the slice image underneath 499 | ```{r} 500 | sct_plt1 <- SPOTlight::scatterpie_plot(se_obj = se_obj, 501 | cell_types_all = ct_all, 502 | pie_scale = 0.7, 503 | slice = sample_id) + 504 | ggplot2::scale_fill_manual( 505 | values = col_df[col_df$plt_name %in% ct_all, "col_df"], 506 | breaks = colnames(decon_mtrx)) + 507 | ggplot2::coord_fixed(ratio = 1) + 508 | ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1)) 509 | 510 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_scatterpie_{clust_vr}.pdf" %>% 511 | glue::glue() %>% 512 | here::here() %>% 513 | cowplot::save_plot( 514 | filename = ., 515 | plot = sct_plt1, 516 | base_width = 12, 517 | base_height = 9) 518 | ``` 519 | 520 | We can also take a loot at the spatial scatterpie by looking at cell types which are not present throughout the entire tissue. 521 | ```{r} 522 | # Subset cell types from metadata 523 | metadata_subset <- se_obj@meta.data[, ct_all] 524 | 525 | # Create masks 526 | keep_0.9 <- colSums(se_obj@meta.data[, ct_all] > 0) < 0.75 * ncol(se_obj) 527 | keep_g0 <- colSums(se_obj@meta.data[, ct_all] > 0) > 0 528 | 529 | # Select cell types fullfiling the conditions 530 | ct_var <- colnames(decon_mtrx)[keep_0.9 & keep_g0] 531 | 532 | col_df <- col_df %>% dplyr::arrange(plt_name) 533 | sct_plt_int <- SPOTlight::scatterpie_plot(se_obj = se_obj, 534 | cell_types_all = ct_var, 535 | pie_scale = 0.4) + 536 | ggplot2::scale_fill_manual( 537 | values = col_df[col_df$plt_name %in% ct_var, "col_vec"], 538 | breaks = col_df[col_df$plt_name %in% ct_var, "plt_name"]) + 539 | ggplot2::labs(fill = "") + 540 | ggplot2::coord_fixed(ratio = 1) + 541 | ggplot2::theme(legend.position = "top") + 542 | ggplot2::guides(fill = ggplot2::guide_legend(nrow = 3)) 543 | 544 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_scatterpie_interest_{clust_vr}.pdf" %>% 545 | glue::glue() %>% 546 | here::here() %>% 547 | cowplot::save_plot( 548 | filename = ., 549 | plot = sct_plt_int, 550 | base_width = 12, 551 | base_height = 9) 552 | ``` 553 | 554 | ### Gene - Cell correlation 555 | Check assumptions 556 | ```{r eval = FALSE} 557 | lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) { 558 | # Shapiro-Wilk normality test for mpg 559 | print(shapiro.test(se_obj@meta.data[, ct])) 560 | ggpubr::ggqqplot(se_obj@meta.data[, ct]) 561 | 562 | }) 563 | 564 | lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) { 565 | # Shapiro-Wilk normality test for mpg 566 | print(shapiro.test(se_obj@assays$SCT@data[gene, ])) 567 | ggpubr::ggqqplot(se_obj@assays$SCT@data[gene, ]) 568 | 569 | }) 570 | ``` 571 | 572 | Next we want to check if the fibrotic marker TGFB1 with Treg and Cytotoxic CD8 T cells" 573 | ```{r eval = FALSE} 574 | library(ggpubr) 575 | 576 | metadata <- dplyr::bind_cols( 577 | se_obj@meta.data, 578 | data.frame(t(se_obj@assays$SCT@data[c("ENTPD1", "NT5E", "TGFB1"), ]))) 579 | 580 | cor_plts <- lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) { 581 | tmp_ls <- lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) { 582 | 583 | tmp_plt <- ggpubr::ggscatter(data = metadata, 584 | x = gene, 585 | y = ct, 586 | add = "reg.line") + 587 | ggpubr::stat_cor(method = "pearson") 588 | return(tmp_plt) 589 | }) 590 | cowplot::plot_grid(plotlist = tmp_ls, ncol = 1) 591 | }) 592 | 593 | tmp_arr <- cowplot::plot_grid(plotlist = cor_plts, 594 | nrow = 1, 595 | ncol = 2) 596 | tmp_arr 597 | ``` 598 | 599 | ## Session Info 600 | ```{r} 601 | sessionInfo() 602 | ``` 603 | -------------------------------------------------------------------------------- /ST-breast/RCTD-10x_breast_immune_reference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "lv1_annot" 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 20 | message = FALSE, warning = FALSE) 21 | options(width = 1200) 22 | ``` 23 | 24 | ## Introduction 25 | In this R markdo9wn document we will map immune cells onto the breast cancer tissue using the RCTD tool. 26 | Robust Cell Type Deconvolution is developed by Dylan M. Cable from Rafa Irizarry's lab 27 | 28 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD). 29 | 30 | 31 | ```{r} 32 | library(Seurat) 33 | library(tidyverse) 34 | library(Matrix) 35 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE) 36 | library(RCTD) 37 | source(here::here("utils/bin.r")) 38 | ``` 39 | 40 | ## Paths 41 | ```{r} 42 | source(here::here("misc/paths.R")) 43 | 44 | "{an_breast_10x}/{robj_dir}" %>% 45 | glue::glue() %>% 46 | here::here() %>% 47 | dir.create( 48 | path = , 49 | showWarnings = FALSE, 50 | recursive = TRUE) 51 | 52 | "{an_breast_10x}/{plt_dir}" %>% 53 | glue::glue() %>% 54 | here::here() %>% 55 | dir.create( 56 | path = , 57 | showWarnings = FALSE, 58 | recursive = TRUE) 59 | ``` 60 | 61 | ## Parameters 62 | ```{r} 63 | set.seed(1243) 64 | 65 | sample_id <- params$sample_id 66 | # sample_id <- "breast_2" 67 | clust_vr <- params$clust_vr 68 | clust_vr <- "lv1_annot" 69 | # clust_vr <- "lv2_annot" 70 | 71 | trn <- "melanoma" 72 | cl_n <- 100 73 | hvg <- 3000 74 | ntop <- NULL 75 | transf <- "uv" 76 | method <- "nsNMF" 77 | min_cont <- 0 78 | 79 | if (is.null(ntop)) { 80 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}") 81 | } else { 82 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}") 83 | } 84 | ``` 85 | 86 | RCTD paths 87 | ```{r} 88 | refdir <- "{an_breast_10x}/{robj_dir}/RCTD_data/reference" %>% 89 | glue::glue() %>% 90 | here::here() 91 | 92 | dir.create(path = refdir, showWarnings = FALSE, recursive = TRUE) 93 | 94 | stdir <- "{an_breast_10x}/{robj_dir}/{sample_id}/RCTD_data/spatial" %>% 95 | glue::glue() %>% 96 | here::here() 97 | 98 | dir.create(path = stdir, showWarnings = FALSE, recursive = TRUE) 99 | ``` 100 | 101 | Define cell types of interest depending on the annotation level 102 | ```{r} 103 | if (clust_vr == "new_annot") { 104 | ct_interest <- c("Plasma.B.cells", "T.regs", "T.helper.Th17", 105 | "Proliferation", "CD8.exhausted", 106 | "CD8.cytotoxic", "NK", "Macrophages.SPP1", "TAMs.C1QC") 107 | } else if (clust_vr == "old_cell_types") { 108 | ct_interest <- c("Plasma B cells", "Regulatory T cells", "T helper cells", 109 | "Proliferation", "Pre-exhausted CD8 T cells", 110 | "Cytotoxic CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs") 111 | } 112 | ``` 113 | 114 | ## Load data 115 | Breast cancer data loaded here comes from 1-10x_breast_QC.Rmd 116 | ```{r eval = FALSE} 117 | ### Spatial breast cancer 118 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>% 119 | glue::glue() %>% 120 | here::here() %>% 121 | readRDS(file = .) 122 | ``` 123 | 124 | 125 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset. 126 | ```{r eval = FALSE} 127 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>% 128 | glue::glue() %>% 129 | here::here() 130 | 131 | if (file.exists(ica_melanoma2_path)) { 132 | 133 | ica_sub <- readRDS(file = ica_melanoma2_path) 134 | } else { 135 | # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds") 136 | ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>% 137 | here::here() %>% 138 | readRDS(.) 139 | ### Immune reference atlas 140 | ica_sub <- subset(ica_se, subset = source == "melanoma2") 141 | rm(ica_se) 142 | 143 | saveRDS(object = ica_sub, file = ica_melanoma2_path) 144 | } 145 | 146 | table(ica_sub@meta.data$lv1_annot) 147 | table(ica_sub@meta.data$lv2_annot) 148 | ``` 149 | 150 | Remove MAST cells from the training set 151 | ```{r eval = FALSE} 152 | ica_sub <- subset(ica_sub, subset = lv1_annot != "Mast cells") 153 | # Create Seurat object with filtered gene matrix after removing Mast cells 154 | ica_sub <- Seurat::CreateSeuratObject( 155 | counts = ica_sub@assays$RNA@counts[ 156 | sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ], 157 | meta.data = ica_sub@meta.data) 158 | ``` 159 | 160 | Join proliferatiing cell types to get a proliferation signature 161 | ```{r eval = FALSE} 162 | prolif_vec <- c("T cells proliferative", "Macrophages and monocytes proliferative", 163 | "B cells proliferative", "Macrophages proliferative") 164 | 165 | ica_sub[["cell_type_mod"]] <- dplyr::if_else( 166 | ica_sub@meta.data[, clust_vr] %in% prolif_vec, 167 | "Proliferation", as.character(ica_sub@meta.data[, clust_vr])) 168 | 169 | ``` 170 | 171 | ## Prepare data for RCTD 172 | ### Reference scRNAseq 173 | We are going to follow the vignette steps. 174 | 175 | In order to run RCTD, the first step is to process the single cell reference. Create a folder in ‘data/Reference’ e.g. ‘data/Reference/Vignette’ containing the following three files: 176 | 1. meta_data.csv: a CSV file (with 3 columns, with headers "barcode", "cluster", and "nUMI") containing the numeric cluster assignment for each cell.
177 | 2. cell_type_dict.csv: a CSV file (with 2 columns, with headers "Cluster" and "Name") containing the mapping between numeric cluster ID and cluster name. If you want a cluster to be filtered out of the single cell reference, you can leave the cluster name blank. The cell types must not contain the character ‘/’ or ‘-’.
178 | 3. dge.csv: a Digital Gene Expression (DGE) (barcodes by gene counts) CSV file in the standard 10x format.
179 | We use the dgeToSeurat function: 180 | 181 | ```{r eval = FALSE} 182 | scRCTD_structure <- function(sc_obj, clust_vr) { 183 | 184 | sc_obj[["Name"]] = sc_obj@meta.data[, clust_vr] 185 | 186 | # Cell type dictionary between cluster and cell-type 187 | ct <- unique(as.character(sc_obj@meta.data[, clust_vr])) 188 | df_ct <- data.frame("Cluster" = 1:length(ct), 189 | "Name" = ct) 190 | 191 | # 192 | metadata <- sc_obj@meta.data %>% 193 | # Rownames to columns must be before left join since after it the rownames are erased 194 | tibble::rownames_to_column("barcode") %>% 195 | dplyr::left_join(df_ct, by = c("Name" = "Name")) %>% 196 | # Change names to “barcode”, “cluster”, “nUMI” 197 | mutate( 198 | cluster = Cluster, 199 | nUMI = nCount_RNA 200 | ) %>% 201 | dplyr::select(barcode, cluster, nUMI) 202 | 203 | expr_mtrx <- sc_obj@assays$RNA@counts 204 | 205 | return(list("meta_data" = metadata, 206 | "cell_type_dict" = df_ct, 207 | "dge" = expr_mtrx)) 208 | } 209 | ``` 210 | 211 | Save data to reference directory 212 | ```{r eval = FALSE} 213 | sc_ls <- scRCTD_structure(sc_obj = ica_sub, clust_vr = "cell_type_mod") 214 | 215 | "{refdir}/meta_data.csv" %>% 216 | glue::glue() %>% 217 | here::here() %>% 218 | readr::write_csv(x = sc_ls[[1]], file = .) 219 | 220 | "{refdir}/cell_type_dict.csv" %>% 221 | glue::glue() %>% 222 | here::here() %>% 223 | readr::write_csv(x = sc_ls[[2]], path = .) 224 | 225 | dge_path <- "{refdir}/dge.csv" %>% 226 | glue::glue() %>% 227 | here::here() 228 | 229 | sc_ls[[3]] %>% 230 | data.frame() %>% 231 | tibble::rownames_to_column("gene") %>% 232 | readr::write_csv(x = ., 233 | path = dge_path, 234 | col_names = TRUE) 235 | 236 | rm(list = ls("ica_sub", "sc_ls")) 237 | ``` 238 | 239 | **Of note, here go to analysis/tool_benchmarking/RCTD_data/reference/dge.csv and remove the column name gene by "". If you don't it will detect the column as a gene and RCTD::dgeToSeurat won't be able to load the data.** 240 | 241 | ### Reference ST 242 | Next, put the SpatialRNA data in your ‘data/SpatialRNA’ directory (here ‘data/SpatialRNA/Vignette’). This needs to contain: 243 | 1. BeadLocationsForR.csv: a CSV file (with 3 columns, with headers “barcodes”, “xcoord”, and “ycoord”) containing the spatial locations of the pixels.
244 | 2. MappedDGEForR.csv: a DGE (gene counts by barcodes) CSV file. Represents raw counts at each pixel.
245 | 246 | ```{r eval = FALSE} 247 | dgef_path <- "{stdir}/MappedDGEForR.csv" %>% 248 | glue::glue() %>% 249 | here::here() 250 | 251 | se_obj@assays$Spatial@counts %>% 252 | data.frame() %>% 253 | tibble::rownames_to_column("gene") %>% 254 | dplyr::select(gene, everything()) %>% 255 | readr::write_csv(x = ., 256 | file = dgef_path, 257 | col_names = TRUE) 258 | 259 | coord_path <- "{stdir}/BeadLocationsForR.csv" %>% 260 | glue::glue() %>% 261 | here::here() 262 | 263 | # Extract spatial coordinates 264 | se_obj@images[[sample_id]]@coordinates %>% 265 | dplyr::select(imagecol, imagerow) %>% 266 | tibble::rownames_to_column("barcodes") %>% 267 | dplyr::rename(xcoord = imagecol, ycoord = imagerow) %>% 268 | readr::write_csv( 269 | x = ., 270 | file = coord_path) 271 | 272 | ``` 273 | 274 | ## RCTD deconvolution 275 | ### Read data in RCTD 276 | Read data in RCTD format 277 | ```{r} 278 | reference <- RCTD::dgeToSeurat(refdir = refdir) 279 | puck <- RCTD::read.SpatialRNA(datadir = stdir) 280 | ``` 281 | 282 | ### Creating and running RCTD 283 | We are now ready to create an RCTD object using the create.RCTD function. We simply need to pass in the SpatialRNA and scRNA-seq objects. There are several configuration options that can be set with this function: 284 | ```{r} 285 | myRCTD <- RCTD::create.RCTD(spatialRNA = puck, 286 | reference = reference, 287 | max_cores = 1, 288 | CELL_MIN = 18) 289 | ``` 290 | 291 | Now, we are ready to run RCTD, using the run.RCTD function. This function is equivalent to sequentially running the functions fitBulk, choose_sigma_c, and fitPixels. The doublet_mode argument sets whether RCTD will be run in ‘doublet mode’ (at most 1-2 cell types per pixel) or ‘full mode’ (no restrictions on number of cell types). 292 | ```{r} 293 | myRCTD <- RCTD::run.RCTD(RCTD = myRCTD, 294 | doublet_mode = FALSE) 295 | ``` 296 | 297 | Save RCTD resulting 298 | ```{r} 299 | "{an_breast_10x}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>% 300 | glue::glue() %>% 301 | here::here() %>% 302 | saveRDS(object = myRCTD, file = .) 303 | ``` 304 | 305 | 306 | ## Session Info 307 | ```{r} 308 | sessionInfo() 309 | ``` 310 | 311 | -------------------------------------------------------------------------------- /ST-breast/RCTD-SPOTlight_comparison.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "lv1_annot" 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 20 | message = FALSE, warning = FALSE) 21 | options(width = 1200) 22 | ``` 23 | 24 | ## Introduction 25 | In this R markdown document we will compare the deconvolution obtained from SPOTlight and RCTD. 26 | 27 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD). 28 | 29 | 30 | ```{r} 31 | library(Seurat) 32 | library(tidyverse) 33 | library(Matrix) 34 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE) 35 | library(RCTD) 36 | source(here::here("utils/bin.r")) 37 | ``` 38 | 39 | ## Paths 40 | ```{r} 41 | source(here::here("misc/paths.R")) 42 | 43 | "{an_breast_10x}/{robj_dir}" %>% 44 | glue::glue() %>% 45 | here::here() %>% 46 | dir.create( 47 | path = , 48 | showWarnings = FALSE, 49 | recursive = TRUE) 50 | 51 | "{an_breast_10x}/{plt_dir}" %>% 52 | glue::glue() %>% 53 | here::here() %>% 54 | dir.create( 55 | path = , 56 | showWarnings = FALSE, 57 | recursive = TRUE) 58 | ``` 59 | 60 | ## Parameters 61 | ```{r} 62 | set.seed(1243) 63 | 64 | sample_id <- params$sample_id 65 | # sample_id <- "breast_2" 66 | # clust_vr <- params$clust_vr 67 | clust_vr <- "lv1_annot" 68 | # clust_vr <- "lv2_annot" 69 | 70 | trn <- "melanoma" 71 | cl_n <- 100 72 | hvg <- 3000 73 | ntop <- NULL 74 | transf <- "uv" 75 | method <- "nsNMF" 76 | min_cont <- 0 77 | 78 | if (is.null(ntop)) { 79 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}") 80 | } else { 81 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}") 82 | } 83 | ``` 84 | 85 | Create a name/color dataframe 86 | ```{r} 87 | # source(here::here("misc/col_df_scrpt.R")) 88 | # source(here::here("misc/col_df_scrpt2.R")) 89 | col_df <- readRDS(here::here("misc/col_df.rds")) 90 | ``` 91 | 92 | ## Load data 93 | Load spatial object 94 | ```{r} 95 | ### Spatial breast cancer 96 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>% 97 | glue::glue() %>% 98 | here::here() %>% 99 | readRDS(file = .) 100 | ``` 101 | 102 | SPOTlight deconvolution 103 | ```{r} 104 | decon_mtrx_ls <- "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 105 | glue::glue() %>% 106 | here::here() %>% 107 | readRDS(file = .) 108 | ``` 109 | 110 | RCTD deconvolution 111 | ```{r} 112 | myRCTD <- "{an_breast_10x}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>% 113 | glue::glue() %>% 114 | here::here() %>% 115 | readRDS(file = .) 116 | ``` 117 | 118 | ## Analysis 119 | 120 | ### Extract deconvolutions 121 | 122 | SPOTlight extraction and renaming 123 | ```{r} 124 | decon_mtrx <- decon_mtrx_ls[[2]] 125 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"] 126 | 127 | # Set as 0 cell types predicted to be under 5 % of the spot 128 | # decon_mtrx[decon_mtrx < 0.05] <- 0 129 | 130 | # Change names to original ones and reorder according to Paula 131 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>% 132 | dplyr::left_join(col_df, by = "ct_name") %>% 133 | dplyr::pull(plt_name) 134 | 135 | colnames(decon_mtrx) <- new_colnames 136 | 137 | 138 | colnames(decon_mtrx) <- glue::glue("SPOTlight-{colnames(decon_mtrx)}") 139 | rownames(decon_mtrx) <- colnames(se_obj) 140 | ``` 141 | 142 | RCTD extraction 143 | ```{r} 144 | RCTD_mtrx <- as.matrix(myRCTD@results[[1]]) 145 | colnames(RCTD_mtrx) <- glue::glue("RCTD-{colnames(RCTD_mtrx)}") 146 | # RCTD_mtrx[RCTD_mtrx < 0.05] <- 0 147 | 148 | # Normalize RCTD matrix to 1 149 | RCTD_mtrx <- RCTD_mtrx / rowSums(RCTD_mtrx) 150 | ``` 151 | 152 | Add deconvolution matrices to Seurat object 153 | ```{r} 154 | rownames(RCTD_mtrx) <- stringr::str_replace( 155 | string = rownames(RCTD_mtrx), 156 | pattern = "\\.", 157 | replacement = "-") 158 | 159 | barcodes <- purrr::reduce(list(rownames(se_obj@meta.data), 160 | rownames(RCTD_mtrx), 161 | rownames(decon_mtrx)), 162 | dplyr::intersect) 163 | 164 | RCTD_mtrx <- RCTD_mtrx[barcodes, ] 165 | decon_mtrx <- decon_mtrx[barcodes, ] 166 | 167 | se_obj@meta.data <- cbind(se_obj@meta.data[barcodes, ], 168 | RCTD_mtrx[barcodes, ], 169 | decon_mtrx[barcodes, ]) 170 | ``` 171 | 172 | ### Deconvolution comparison 173 | Here we are going to compare RCTD and SPOTlight deconvolutions by correlating the cell types 174 | ```{r fig.width=20, fig.height=20} 175 | cor_plots <- lapply(colnames(decon_mtrx), function(i) { 176 | print(i) 177 | i_spot <- i 178 | i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2) 179 | i_rctd <- glue::glue("RCTD-{i_sub[2]}") 180 | 181 | dat <- data.frame( 182 | SPOTlight = decon_mtrx[, i_spot], 183 | RCTD = RCTD_mtrx[, i_rctd], 184 | check.names = FALSE) 185 | 186 | print(cor.test(decon_mtrx[, i_spot], 187 | RCTD_mtrx[, i_rctd])) 188 | 189 | ggpubr::ggscatter(dat, x = "SPOTlight", y = "RCTD", 190 | color = "black", 191 | add = "reg.line", # Add regressin line 192 | add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line 193 | cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor 194 | cor.coeff.args = list(method = "pearson", label.sep = "\n"), 195 | title = i_sub[2]) 196 | }) 197 | 198 | 199 | plt_grid <- cowplot::plot_grid(plotlist = cor_plots,align = "hv", axis = "trbl") 200 | 201 | "{an_breast_10x}/{plt_dir}/cor_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>% 202 | glue::glue() %>% 203 | here::here() %>% 204 | cowplot::save_plot( 205 | filename = ., 206 | plot = plt_grid, 207 | base_height = 20, 208 | base_width = 20) 209 | ``` 210 | 211 | ### Visualize cell types 212 | Next we want to visualize the cell types side by side 213 | ```{r} 214 | tissue_plots <- lapply(colnames(decon_mtrx), function(i) { 215 | print(i) 216 | i_spot <- i 217 | i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2) 218 | i_rctd <- glue::glue("RCTD-{i_sub[2]}") 219 | 220 | Seurat::SpatialPlot( 221 | object = se_obj, 222 | features = c(i_spot, i_rctd), 223 | alpha = c(0, 1), 224 | image.alpha = 1) 225 | }) 226 | 227 | 228 | tissue_grid <- cowplot::plot_grid(plotlist = tissue_plots, 229 | align = "hv", 230 | axis = "trbl") 231 | 232 | "{an_breast_10x}/{plt_dir}/tissue_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>% 233 | glue::glue() %>% 234 | here::here() %>% 235 | cowplot::save_plot( 236 | filename = ., 237 | plot = tissue_grid, 238 | base_height = 25, 239 | base_width = 35) 240 | ``` 241 | 242 | ## Session Info 243 | ```{r} 244 | sessionInfo() 245 | ``` 246 | -------------------------------------------------------------------------------- /ST-breast/README.md: -------------------------------------------------------------------------------- 1 | # Breast Invasive Ductal Carcinoma 2 | 3 | ## Data description 4 | This data corresponds to 2 serial breast carcinoma sections and comes from the 10x Genomics website with the following definition: 5 | 6 | *10x Genomics obtained fresh frozen Invasive Ductal Carcinoma breast tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides.* 7 | 8 | *The tissue was AJCC/UICC Stage Group IIA, ER positive, PR negative, Her2 positive and annotated with:* 9 | - *Ductal carcinoma in situ* 10 | - *Lobular carcinoma in situ* 11 | - *Invasive Carcinoma* 12 | 13 | ## Data availability 14 | This data is freely available to download from the 10X Genomics website reference datasets and can be downloaded directly from their website - Slice 1 is available [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/1.0.0/V1_Breast_Cancer_Block_A_Section_1) and Slice 2 [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/1.0.0/V1_Breast_Cancer_Block_A_Section_2). 15 | In this case we are using the data mapped using spaceranger 1.0.0. 16 | 17 | ## Code 18 | Scripts *1-10x_breast_QC.Rmd* and *2-10x_breast_GO_enrichment.Rmd* are in charge of preprocessing the data and mapping the TCA immune cell states onto the tissue. 19 | Scripts *3-10x_breast_immune_reference.Rmd* and *4-10x_breast_stratification.Rmd*, in turn, plot the Figure panels of Figure 6 and Supplementary Figure 13-17. 20 | 21 | ## Dependencies 22 | * [R 3.6.0](https://cran.r-project.org/) 23 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html) 24 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html) 25 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html) 26 | * [SPOTlight 1.0.0](https://github.com/MarcElosua/SPOTlight) 27 | * [Matrix 1.2.18](https://cran.r-project.org/web/packages/Matrix/index.html) 28 | * [svglite 1.2.3.2](https://cran.r-project.org/web/packages/svglite/index.html) 29 | * [ggcorrplot 0.1.3](https://cran.r-project.org/web/packages/ggcorrplot/index.html) 30 | * [cowplot 1.1.0](https://cran.r-project.org/web/packages/cowplot/index.html) 31 | 32 | -------------------------------------------------------------------------------- /ST-oropharyngeal/1-australia_oroph_processing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "1- Australia oropharingeal cancer preprocessing" 3 | author: "Marc Elosua-Bayes" 4 | date: '`r format(Sys.Date(), "%B %d, %Y")`' 5 | output: 6 | BiocStyle::html_document: 7 | toc: yes 8 | toc_float: yes 9 | number_sections: yes 10 | df_print: paged 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 15 | message = FALSE, warning = FALSE) 16 | options(width = 1200) 17 | ``` 18 | 19 | ## Introduction 20 | In this Rmarkdown document we will load and preprocess samples 161429, 161430, 161431, and 161432. 21 | These samples are human oropharyngeal cancer and were provided to use by Joseph Powell from the Garvand Institute of Medical Research. 22 | 23 | Here is the methodology used to obtain and process this samples up to this point: 24 | 25 | All patients provided informed consent for the collection of human specimens and data. This was approved by the St Vincent’s Hospital Research Office (2019/PID04335) in accordance with the National Health and Medical Research Council’s National Statement of Ethical Conduct in Human Research. Patients undergoing surgical resection for a locally advanced oropharyngeal cancer were recruited to the study. After surgical removal, the anatomical pathologist dissected a sample of both the primary and nodal metastasis. Samples were tumour banked in accordance with our ethically approved protocol. 26 | 27 | **Sample storage** 28 | Within 30 minutes of collection, tumour samples were tumour banked. Samples were cut into 1mm x 1mm chunks with a scalpel blade. For Visium, a tissue chunk was snap frozen in OCT. After freezing, samples were moved to liquid nitrogen for long term storage. 29 | 30 | Frozen tissue samples were processed using the Visium Spatial Gene Expression slide and reagent kit (10X Genomics, US) following the manufacturer’s instruction. Briefly, 10 μm sections were placed into the capture areas of the Visium slide. Tissue morphology was assessed with H&E staining and imaging using a Leica DM6000 microscope equipped with a 20x lens (Leica, DE). The imaged sections were then permeabilized for 12 minutes using the supplied reagents. The permeabilization condition was previously optimised using the Visium Spatial Tissue Optimisation slide and reagent kit (10X Genomics, US). After permeabilization, cDNA libraries were prepared, checked for quality and sequenced on a NovaSeq 6000 platform (Illumina, US). Around 300 million pair-ended reads were obtained for each tissue section. Read 1, i7 index and Read 2 were sequenced with 28, 8 and 98 cycles respectively. Reads were demultiplexed and mapped to the reference genome GRCh38 using the Space Ranger Software v1.0.0 (10X Genomics). 31 | 32 | 33 | ## Libraries 34 | ```{r} 35 | library(Seurat) 36 | library(dplyr) 37 | library(cowplot) 38 | source(here::here("misc/paths.R")) 39 | source(here::here("utils/bin.R")) 40 | source(here::here("utils/bin.r")) 41 | ``` 42 | 43 | ## Create directory structure 44 | ```{r} 45 | "{an_oro}/{robj_dir}" %>% 46 | glue::glue() %>% 47 | here::here() %>% 48 | dir.create( 49 | path = , 50 | showWarnings = FALSE, 51 | recursive = TRUE) 52 | 53 | "{an_oro}/{plt_dir}" %>% 54 | glue::glue() %>% 55 | here::here() %>% 56 | dir.create( 57 | path = , 58 | showWarnings = FALSE, 59 | recursive = TRUE) 60 | ``` 61 | 62 | ## Parameters 63 | Set common parameters for the entire document. 64 | ```{r} 65 | id_ls <- c("161429", "161430", "161431", "161432") 66 | ``` 67 | 68 | ## Load data 69 | We are going to start by loading the data 70 | ```{r} 71 | st_ls <- lapply(id_ls, function(id) { 72 | print(id) 73 | tmp_se <- Seurat::Load10X_Spatial(data.dir = here::here(sprintf("data/australia_hn/%s/", id)), 74 | filename = sprintf("%s_filtered_feature_bc_matrix.h5", id), 75 | slice = id) 76 | return(tmp_se) 77 | }) 78 | names(st_ls) <- id_ls 79 | ``` 80 | 81 | ## QC 82 | 1st thing we do is remove empty gene 83 | ```{r} 84 | for (id in names(st_ls)) { 85 | table(rowSums(as.matrix(st_ls[[id]]@assays$Spatial@counts)) == 0) 86 | 87 | keep_genes <- rowSums(as.matrix(st_ls[[id]]@assays$Spatial@counts)) != 0 88 | st_ls[[id]] <- st_ls[[id]][keep_genes, ] 89 | } 90 | ``` 91 | 92 | Then we look at the number of reads, genes, and mitochondrial per spot to check how this tissue looks. 93 | First we add the mitochondrial and ribosomal percentage within each spot. 94 | ```{r} 95 | for (id in names(st_ls)) { 96 | 97 | # store mitochondrial percentage in object meta data 98 | st_ls[[id]] <- Seurat::PercentageFeatureSet(st_ls[[id]], 99 | pattern = "^MT-", 100 | col.name = "percent.mt") 101 | 102 | st_ls[[id]] <- Seurat::PercentageFeatureSet(st_ls[[id]], 103 | pattern = "^RPS|^RPS", 104 | col.name = "percent.rp") 105 | } 106 | ``` 107 | 108 | ### Visualize QC metrics 109 | #### Descriptive histograms 110 | ```{r} 111 | lapply(names(st_ls), function(id) { 112 | print(id) 113 | hist_qc <- QC_seurat_hist(se = st_ls[[id]], 114 | assay = "Spatial", 115 | slot = "counts", 116 | nfeat = "nFeature_Spatial", 117 | ncount = "nCount_Spatial", 118 | pctmt = "percent.mt", 119 | pctrp = "percent.rp") 120 | 121 | "{an_oro}/{plt_dir}/QC_histogram_{id}.pdf" %>% 122 | glue::glue() %>% 123 | here::here() %>% 124 | cowplot::save_plot( 125 | filename = ., 126 | plot = hist_qc, 127 | base_height = 9, 128 | base_width = 16) 129 | return(hist_qc) 130 | }) 131 | ``` 132 | 133 | When we look at all the histograms we appreciate there is a peak on 0 *genes per spot* and *total counts per spot*, these we need to look at in more detail since they could be empty spots which we would discard. 134 | 135 | #### Spatial visualization 136 | Next we want to visualize the spots on the tissue to assess their spatial behaviour. We are mainly interested to see where the empty spots map. 137 | ```{r} 138 | lapply(names(st_ls), function(id) { 139 | print(id) 140 | sp_qc <- Seurat::SpatialFeaturePlot(object = st_ls[[id]], 141 | features = c("nFeature_Spatial", 142 | "nCount_Spatial", 143 | "percent.mt", 144 | "percent.rp")) 145 | 146 | "{an_oro}/{plt_dir}/QC_spatial_{id}.pdf" %>% 147 | glue::glue() %>% 148 | here::here() %>% 149 | cowplot::save_plot( 150 | filename = ., 151 | plot = sp_qc, 152 | base_height = 12, 153 | base_width = 12) 154 | 155 | return(sp_qc) 156 | }) 157 | ``` 158 | 159 | We see that on all tissues there are spots with 0 or near 0 counts per spot. We will remove them by setting a thresholds since these are low quality spots that would distort the analysis. 160 | 161 | We will subset and remove those spots with < 500 counts. 162 | ```{r} 163 | for (id in names(st_ls)) { 164 | print(id) 165 | st_ls[[id]] <- st_ls[[id]][, st_ls[[id]]$nCount_Spatial > 500] 166 | } 167 | ``` 168 | 169 | Take a new look at the spatial plots to assess how we removed those spots 170 | ```{r} 171 | lapply(names(st_ls), function(id) { 172 | print(id) 173 | sp_qc <- Seurat::SpatialFeaturePlot(object = st_ls[[id]], 174 | features = c("nFeature_Spatial", 175 | "nCount_Spatial", 176 | "percent.mt", 177 | "percent.rp")) 178 | 179 | "{an_oro}/{plt_dir}/QC_spatial_subset_{id}.pdf" %>% 180 | glue::glue() %>% 181 | here::here() %>% 182 | cowplot::save_plot( 183 | filename = ., 184 | plot = sp_qc, 185 | base_height = 12, 186 | base_width = 12) 187 | 188 | return(sp_qc) 189 | }) 190 | ``` 191 | 192 | 161430 and 161431 appear to have regions with very low reads so we'll keep an eye on these 2 slices specially in the downstream analysis. 193 | 194 | ## Normalization 195 | Once we've subset the data to remove low quality spots we will scale, normalize and cluster the data. 196 | ```{r} 197 | for (id in names(st_ls)) { 198 | print(id) 199 | st_ls[[id]] <- Seurat::SCTransform(object = st_ls[[id]], 200 | assay = "Spatial") 201 | 202 | # Dimensionality reduction and clustering 203 | st_ls[[id]] <- Seurat::RunPCA(st_ls[[id]], assay = "SCT", verbose = TRUE) 204 | st_ls[[id]] <- Seurat::FindNeighbors(st_ls[[id]], reduction = "pca", dims = 1:30) 205 | st_ls[[id]] <- Seurat::FindClusters(st_ls[[id]], verbose = TRUE, resolution = c(0.09, 0.1, 0.25, 0.5, 0.75, 1)) 206 | st_ls[[id]] <- Seurat::RunUMAP(st_ls[[id]], reduction = "pca", dims = 1:30) 207 | } 208 | ``` 209 | 210 | Save processed dataset 211 | ```{r} 212 | for (id in names(st_ls)) { 213 | "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{id}.RDS" %>% 214 | glue::glue() %>% 215 | here::here() %>% 216 | saveRDS( 217 | object = st_ls[[id]], 218 | file = .) 219 | } 220 | ``` 221 | 222 | ### Cluster visualization 223 | Plot cluster on the spatial context at different resolutions. 224 | ```{r} 225 | lapply(names(st_ls), function(id) { 226 | print(id) 227 | 228 | sp_clust <- Seurat::SpatialDimPlot(object = st_ls[[id]], 229 | group.by = c("SCT_snn_res.0.1", 230 | "SCT_snn_res.0.25")) + 231 | geom_point(color = NA) 232 | 233 | "{an_oro}/{plt_dir}/Cluster_spatial_{id}.pdf" %>% 234 | glue::glue() %>% 235 | here::here() %>% 236 | cowplot::save_plot( 237 | filename = ., 238 | plot = sp_clust, 239 | base_height = 9, 240 | base_width = 15) 241 | 242 | return(sp_clust) 243 | }) 244 | ``` 245 | 246 | 247 | ## Session Info 248 | ```{r} 249 | sessionInfo() 250 | ``` 251 | -------------------------------------------------------------------------------- /ST-oropharyngeal/2-australia_oroph_biological.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | title: "`r sprintf('2-Australia Oropharyngeal Biological processing %s', {params$sample_id})`" 15 | --- 16 | ```{r setup, include=FALSE} 17 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) 18 | knitr::opts_knit$set(root.dir = "../..") 19 | ``` 20 | 21 | ## Introduction 22 | In this Rmarkdown document we are going to do some downstream analysis to try to elucidate what the clustered regions represent. 23 | We are going to use the data obtained from script *1-australis_oroph_processing.Rmd*. 24 | 25 | ## Libraries 26 | ```{r} 27 | library(Seurat) 28 | library(dplyr) 29 | library(ggplot2) 30 | library(cowplot) 31 | library(purrr) 32 | library(DT) 33 | library(org.Hs.eg.db) 34 | library(GOstats) 35 | library(progeny) 36 | 37 | source(here::here("misc/paths.R")) 38 | source(here::here("utils/bin.r")) 39 | source(here::here("utils/bin.R")) 40 | ``` 41 | 42 | ## Parameters 43 | Set common parameters for the entire document. 44 | ```{r} 45 | id_ls <- c("161429", "161430", "161431", "161432") 46 | sample <- params$sample_id 47 | sample <- "161429" 48 | ``` 49 | 50 | ## Load data 51 | We are going to start by loading the data 52 | ```{r} 53 | sp_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample}.RDS" %>% 54 | glue::glue() %>% 55 | here::here() %>% 56 | readRDS(file = .) 57 | ``` 58 | 59 | ## Go enrichment 60 | ### Marker genes 61 | Iterate over the spatial seurat objects, set Identities to the lowest resolution and save the marker genes. 62 | ```{r} 63 | Seurat::Idents(sp_obj) <- sp_obj@meta.data[, "SCT_snn_res.0.1"] 64 | markers_all <- Seurat::FindAllMarkers(object = sp_obj, 65 | assay = "SCT", 66 | slot = "data", 67 | only.pos = TRUE) 68 | 69 | "{an_oro}/{robj_dir}/markers_{sample}_res.0.1.rds" %>% 70 | glue::glue() %>% 71 | here::here() %>% 72 | saveRDS( 73 | object = tmp_markers, 74 | file = .) 75 | ``` 76 | 77 | Interactive table to look at marker genes 78 | ```{r} 79 | DT::datatable(data = markers_all, 80 | filter = "top", 81 | options = list(pageLength = 20, autoWidth = TRUE)) 82 | ``` 83 | 84 | 85 | ```{r echo = FALSE} 86 | # Extracting the markers from each cluster 87 | # Function to pull the gene variable 88 | pull_col <- function(.x) { 89 | return(.x %>% pull(var = "gene")) 90 | } 91 | 92 | unique_markers <- lapply(marker_ls, function(marker_df) { 93 | markers_tmp_ls <- marker_df %>% 94 | dplyr::filter(p_val < 0.01) %>% 95 | dplyr::group_by(cluster) %>% 96 | dplyr::group_split() %>% 97 | purrr::map(.f = pull_col) 98 | 99 | # Next we find unique marker genes for each clusters 100 | unique_hn1_ls <- lapply(seq_len(length(markers_tmp_ls)), function(i) { 101 | markers_tmp_ls[[i]][! markers_tmp_ls[[i]] %in% unique(unlist(markers_tmp_ls[-i]))] 102 | }) 103 | return(unique_hn1_ls) 104 | 105 | }) 106 | 107 | names(unique_markers) <- names(marker_ls) 108 | ``` 109 | 110 | ### Cluster annotation 111 | #### GO analysis 112 | For ALL the slices 113 | ```{r} 114 | DE_df <- lapply(names(marker_ls), function(nm) { 115 | # print(nm) 116 | # Extract objects of interest 117 | markers_st <- marker_ls[[nm]] 118 | sp_obj <- st_ls[[nm]] 119 | 120 | tmp_df <- lapply(unique(markers_st$cluster), function(clust) { 121 | # Subset cluster of interest 122 | gene_de <- markers_st %>% 123 | dplyr::filter(cluster == all_of(clust)) %>% 124 | dplyr::pull(gene) %>% 125 | stringr::str_to_upper(.) 126 | 127 | go_clust <- gene_enrichment_GO( 128 | gene_de = gene_de, 129 | gene_universe = stringr::str_to_upper(rownames(sp_obj))) 130 | 131 | tmp <- summary(go_clust) %>% 132 | dplyr::filter(Size <= 300 & Size >=33 & Pvalue < 0.01 & Count >= 3) %>% 133 | dplyr::mutate( 134 | sample = nm, 135 | cluster = clust) 136 | 137 | return(tmp) 138 | }) %>% 139 | dplyr::bind_rows() 140 | 141 | return(tmp_df) 142 | }) %>% dplyr::bind_rows() 143 | 144 | # Save DE gene analysis 145 | "{an_oro}/{robj_dir}/go_enrichment_all_res.0.1_{sample}.rds" %>% 146 | glue::glue() %>% 147 | here::here() %>% 148 | saveRDS( 149 | object = DE_df, 150 | file = .) 151 | ``` 152 | 153 | Load enrichment RDS files 154 | ```{r} 155 | # Save DE gene analysis 156 | DE_df <- "{an_oro}/{robj_dir}/go_enrichment_all_res.0.1_{sample}.rds" %>% 157 | glue::glue() %>% 158 | here::here() %>% 159 | readRDS(file = .) 160 | ``` 161 | 162 | ##### Visualization 163 | For all the slices 164 | ```{r} 165 | plt_ls <- lapply(unique(as.character(tmp_de$cluster)), function(i) { 166 | # print(i) 167 | tmp_plt <- DE_df %>% 168 | dplyr::filter(cluster == i) %>% 169 | dplyr::arrange(desc(OddsRatio)) %>% 170 | head(25) %>% 171 | ggplot(.) + 172 | geom_point(aes(x = OddsRatio, 173 | y = reorder(Term, OddsRatio), 174 | size = -Pvalue, 175 | color = Pvalue)) + 176 | scale_color_gradient(low = "green", 177 | high = "red") + 178 | labs(title = glue::glue("Sample: {sample}; Cluster: {i}"), 179 | y = "") 180 | # cowplot::save_plot(filename = sprintf("%s/%s/go_enrich_hn1_clust-%s.pdf", 181 | # an_aussie, plt_dir, i), 182 | # plot = tmp_plt) 183 | }) 184 | 185 | "{an_oro}/{plt_dir}/go_enrich_res.0.1_{sample}.pdf" %>% 186 | glue::glue() %>% 187 | here::here() %>% 188 | ggpubr::ggexport( 189 | plotlist = plt_ls, 190 | filename = ., 191 | width = 10, 192 | height = 7) 193 | ``` 194 | 195 | ### PROGENy 196 | PROGENy (Pathway RespOnsive GENes) estimates the activity of relevant signaling pathways based on consensus gene signatures obtained from perturbation experiments, in other words, the footprint of the pathway on gene expression 197 | We compute PROGENy pathway activity scores on the scRNA-seq data, and we then characterice the different clusters based on these scores. 198 | ```{r} 199 | ## We compute the Progeny activity scores and add them to our Seurat object 200 | ## as a new assay called Progeny. 201 | sp_obj <- progeny::progeny( 202 | sp_obj, 203 | scale = FALSE, 204 | organism = "Human", 205 | top = 500, 206 | perm = 1, 207 | assay_name = "Spatial", 208 | return_assay = TRUE) 209 | 210 | ## We can now directly apply Seurat functions in our Progeny scores. 211 | ## For instance, we scale the pathway activity scores. 212 | sp_obj <- Seurat::ScaleData(sp_obj, assay = "progeny") 213 | ``` 214 | 215 | Next we create a data frame with the specification of the spot id that belong to each cluster to match with the Progeny scores. 216 | ```{r} 217 | ## We transform Progeny scores into a data frame to better handling the results 218 | progeny_scores_df <- 219 | as.data.frame(t(GetAssayData(sp_obj, slot = "scale.data", 220 | assay = "progeny"))) %>% 221 | tibble::rownames_to_column("Cell") %>% 222 | tidyr::gather(Pathway, Activity, -Cell) 223 | 224 | Seurat::Idents(sp_obj) <- sp_obj@meta.data[, "SCT_snn_res.0.1"] 225 | ## We create a data frame with the specification of the spot id that belong to 226 | ## each cluster to match with the Progeny scores. 227 | CellsClusters <- data.frame(Cell = names(Idents(sp_obj)), 228 | CellType = as.character(Idents(sp_obj)), 229 | stringsAsFactors = FALSE) 230 | 231 | ## We match Progeny scores with the cell clusters. 232 | progeny_scores_df <- inner_join(progeny_scores_df, CellsClusters, by = "Cell") 233 | 234 | ## We summarize the Progeny scores by cellpopulation 235 | summarized_progeny_scores <- progeny_scores_df %>% 236 | group_by(Pathway, CellType) %>% 237 | summarise(avg = mean(Activity), std = sd(Activity)) 238 | ``` 239 | 240 | Lastly we want to visualize the pathways scores in a heatmap 241 | ```{r} 242 | ## We prepare the data for the plot 243 | summarized_progeny_scores_df <- summarized_progeny_scores %>% 244 | dplyr::select(-std) %>% 245 | tidyr::spread(Pathway, avg) %>% 246 | data.frame(row.names = 1, check.names = FALSE, stringsAsFactors = FALSE) 247 | 248 | paletteLength = 100 249 | myColor = colorRampPalette(c("Darkblue", "white","red"))(paletteLength) 250 | 251 | progenyBreaks <- c(seq(min(summarized_progeny_scores_df), 0, 252 | length.out = ceiling(paletteLength / 2) + 1), 253 | seq(max(summarized_progeny_scores_df) / paletteLength, 254 | max(summarized_progeny_scores_df), 255 | length.out = floor(paletteLength / 2))) 256 | 257 | "{an_oro}/{plt_dir}/progeny_hmap_{sample}.png" %>% 258 | glue::glue() %>% 259 | here::here() %>% 260 | pheatmap::pheatmap( 261 | mat = t(summarized_progeny_scores_df[, -1]), 262 | fontsize = 14, 263 | fontsize_row = 10, 264 | color = myColor, 265 | breaks = progenyBreaks, 266 | main = "PROGENy (500)", 267 | angle_col = 45, 268 | treeheight_col = 0, 269 | border_color = NA, 270 | filename = .) 271 | 272 | progeny_hmap <- pheatmap::pheatmap(mat = t(summarized_progeny_scores_df[, -1]), 273 | fontsize = 14, 274 | fontsize_row = 10, 275 | color = myColor, 276 | breaks = progenyBreaks, 277 | main = "PROGENy (500)", 278 | angle_col = 45, 279 | treeheight_col = 0, 280 | border_color = NA) 281 | ``` 282 | 283 | ## Session Info 284 | ```{r} 285 | sessionInfo() 286 | ``` 287 | 288 | -------------------------------------------------------------------------------- /ST-oropharyngeal/3-australia_oroph_deconv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "Default!" 15 | title: "`r sprintf('3- Australia oropharyngeal cancer immune deconvolution %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE) 20 | ``` 21 | 22 | ## Introduction 23 | In this Rmarkdown I am going to map immune cells from TICA on to the spatial tissue using [SPOTlight](https://github.com/MarcElosua/SPOTlight). 24 | 25 | ## Libraries 26 | ```{r} 27 | library(Seurat) 28 | library(dplyr) 29 | library(ggplot2) 30 | library(purrr) 31 | library(SPOTlight) 32 | source(here::here("misc/paths.R")) 33 | source(here::here("utils/bin.r")) 34 | ``` 35 | 36 | ## Parameters 37 | ```{r} 38 | set.seed(1243) 39 | 40 | sample_id <- params$sample_id 41 | # sample_id <- "161429" 42 | # sample_id <- "161430" 43 | clust_vr <- params$clust_vr 44 | # clust_vr <- "lv1_annot" 45 | # clust_vr <- "lv2_annot" 46 | 47 | trn <- "melanoma" 48 | cl_n <- 100 49 | hvg <- 3000 50 | ntop <- NULL 51 | transf <- "uv" 52 | method <- "nsNMF" 53 | min_cont <- 0 54 | 55 | if (is.null(ntop)) { 56 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s", 57 | trn, cl_n, hvg, transf, method, min_cont) 58 | } else { 59 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s", 60 | trn, cl_n, hvg, ntop, transf, method, min_cont) 61 | } 62 | ``` 63 | 64 | Define cell types of interest depending on the annotation level 65 | ```{r} 66 | if (sample_id == "161430") { 67 | ct_interest <- c("Plasma B cells", "T cells regulatory", 68 | "CD8 pre-exhausted", "Macrophages SPP1", "TAMs C1QC", 69 | "CD8 cytotoxic", "Proliferation", "CD4 naive-memory") 70 | } else if (sample_id == "161429") { 71 | ct_interest <- c("B cells", "Plasma B cells", "T cells regulatory", 72 | "CD8 terminally exhausted", "Macrophages SPP1", "TAMs C1QC", 73 | "cDC", "Proliferation") 74 | } 75 | ``` 76 | 77 | ## Load data 78 | We are going to start by loading the data from the previosu script *1-australis_oroph_processing.Rmd*. 79 | 80 | ```{r} 81 | ### Spatial oropharyngeal cancer 82 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>% 83 | glue::glue() %>% 84 | here::here() %>% 85 | readRDS(file = .) 86 | ``` 87 | 88 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset. 89 | ```{r} 90 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>% 91 | glue::glue() %>% 92 | here::here() 93 | 94 | if (file.exists(ica_melanoma2_path)) { 95 | 96 | ica_sub <- readRDS(file = ica_melanoma2_path) 97 | } else { 98 | # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds") 99 | ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>% 100 | here::here() %>% 101 | readRDS(.) 102 | ### Immune reference atlas 103 | ica_sub <- subset(ica_se, subset = source == "melanoma2") 104 | rm(ica_se) 105 | 106 | saveRDS(object = ica_sub, file = ica_melanoma2_path) 107 | } 108 | 109 | table(ica_sub@meta.data$lv1_annot) 110 | table(ica_sub@meta.data$lv2_annot) 111 | ``` 112 | 113 | Create a name/color dataframe 114 | ```{r} 115 | # source(here::here("misc/col_df_scrpt.R")) 116 | # source(here::here("misc/col_df_scrpt2.R")) 117 | 118 | col_df <- readRDS(here::here("misc/col_df.rds")) 119 | ``` 120 | 121 | Change variable names to remove non-standard characters 122 | ```{r} 123 | ica_sub[["specific_cell_type_mod"]] <- stringr::str_replace_all( 124 | string = as.character(ica_sub@meta.data[, clust_vr]), 125 | pattern = "[[:punct:]]|[[:blank:]]", 126 | replacement = ".") 127 | 128 | prolif_vec <- c("T.cells.proliferative", "Macrophages.and.monocytes.proliferative", 129 | "B.cells.proliferative", "Macrophages.proliferative") 130 | 131 | ica_sub[["specific_cell_type_mod"]] <- 132 | dplyr::if_else(ica_sub@meta.data[, "specific_cell_type_mod"] %in% prolif_vec, 133 | "Proliferation", ica_sub@meta.data[, "specific_cell_type_mod"]) 134 | 135 | table(ica_sub@meta.data$specific_cell_type_mod) 136 | ``` 137 | 138 | Immune cell marker genes 139 | ```{r eval = FALSE} 140 | ica_sub <- Seurat::SCTransform(object = ica_sub, assay = "RNA") 141 | 142 | Seurat::Idents(ica_sub) <- as.character(ica_sub@meta.data[, "specific_cell_type_mod"]) 143 | 144 | ica_markers <- Seurat::FindAllMarkers(object = ica_sub, 145 | assay = "SCT", 146 | slot = "data", 147 | only.pos = TRUE, 148 | logfc.threshold = 0, 149 | min.pct = 0, 150 | max.cells.per.ident = 500) 151 | 152 | 153 | "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>% 154 | glue::glue()%>% 155 | here::here( ) %>% 156 | saveRDS( 157 | object = ica_markers, 158 | file = .) 159 | ``` 160 | 161 | Load marker genes per cluster 162 | ```{r} 163 | ica_markers <- "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>% 164 | glue::glue()%>% 165 | here::here() %>% 166 | readRDS(file = .) 167 | 168 | ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells") 169 | ``` 170 | 171 | Look at the proliferation markers 172 | ```{r} 173 | ica_markers %>% 174 | dplyr::filter(cluster == "Prolifeºration") %>% 175 | head(20) 176 | ``` 177 | 178 | ## Deconvolution 179 | ```{r message = FALSE} 180 | # Remove Mast cells 181 | ica_sub <- subset(ica_sub, subset = specific_cell_type_mod != "Mast.cells") 182 | # Create Seurat object with filtered gene matrix after removing Mast cells 183 | ica_sub <- Seurat::CreateSeuratObject( 184 | counts = ica_sub@assays$RNA@counts[ 185 | sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ], 186 | meta.data = ica_sub@meta.data) 187 | 188 | # Remove empty spatial genes 189 | # keep_genes <- rowSums(as.matrix(se_obj@assays$Spatial@counts)) != 0 190 | # table(rowSums(ica_sub@assays$RNA@counts) != 0) 191 | 192 | # Run deconvolution 193 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution( 194 | se_sc = ica_sub, 195 | counts_spatial = se_obj@assays$Spatial@counts, 196 | clust_vr = "specific_cell_type_mod", 197 | cluster_markers = ica_markers, 198 | cl_n = cl_n, 199 | hvg = hvg, 200 | ntop = ntop, 201 | transf = transf, 202 | method = method, 203 | min_cont = min_cont, 204 | assay = "RNA", 205 | slot = "counts") 206 | 207 | # saveRDS(object = decon_mtrx_ls, 208 | # file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds", 209 | # an_oro, robj_dir, spotlight_id)) 210 | # 211 | # saveRDS(object = decon_mtrx_ls, 212 | # file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds", 213 | # an_oro, robj_dir, spotlight_id)) 214 | 215 | "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 216 | glue::glue() %>% 217 | here::here() %>% 218 | saveRDS(object = decon_mtrx_ls, file = .) 219 | ``` 220 | 221 | Add deconvolution matrix to Seurat object metadata 222 | ```{r} 223 | decon_mtrx_ls <- "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 224 | glue::glue() %>% 225 | here::here() %>% 226 | readRDS(file = .) 227 | 228 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s.rds", 229 | # an_oro, robj_dir, spotlight_id)) 230 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_pdac_atlas.rds", an_oro, robj_dir)) 231 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds", 232 | # an_oro, robj_dir, spotlight_id)) 233 | # 234 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds", 235 | # an_oro, robj_dir, spotlight_id)) 236 | 237 | decon_mtrx <- decon_mtrx_ls[[2]] 238 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"] 239 | 240 | # Set as 0 cell types predicted to be under 3 % of the spot 241 | decon_mtrx[decon_mtrx < 0.03] <- 0 242 | ``` 243 | 244 | Change names to original ones and reorder according to Paula 245 | ```{r} 246 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>% 247 | dplyr::left_join(col_df, by = "ct_name") %>% 248 | dplyr::pull(plt_name) 249 | 250 | colnames(decon_mtrx) <- new_colnames 251 | ``` 252 | 253 | Add deconvolution matrix to Seurat object metadata 254 | ```{r} 255 | se_obj@meta.data <- cbind(se_obj@meta.data, round(decon_mtrx, 3)) 256 | ``` 257 | 258 | ### Visualization 259 | Before we start we take a look at the topic profiles 260 | ```{r} 261 | # Extract coefficient matrix 262 | h <- NMF::coef(decon_mtrx_ls[[1]][[1]]) 263 | 264 | # Extract coefficient labels 265 | train_labs <- data.frame(ct_name = decon_mtrx_ls[[1]][[2]]) %>% 266 | dplyr::left_join(col_df, by = "ct_name") %>% 267 | dplyr::pull(plt_name) 268 | 269 | # rownames(h) <- paste("Topic", 1:nrow(h), sep = " ") 270 | profile_plt <- SPOTlight::dot_plot_profiles_fun( 271 | h = h, 272 | train_cell_clust = train_labs) 273 | 274 | profile_plt[[1]] 275 | 276 | "{an_oro}/{plt_dir}/all_ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>% 277 | glue::glue() %>% 278 | here::here() %>% 279 | cowplot::save_plot( 280 | filename = ., 281 | plot = profile_plt[[1]], 282 | base_height = 30, 283 | base_width = 30) 284 | 285 | h_df <- data.frame(t(h)) 286 | 287 | # Fix column names after converting to dataframe 288 | colnames(h_df) <- gsub(".", " ", colnames(h_df), fixed = TRUE) 289 | 290 | # Get proportions for each row 291 | h_ds <- round(h_df/rowSums(h_df), 4) 292 | h_ds[, "clust_vr"] <- train_labs 293 | 294 | 295 | ct_topic_profiles <- h_ds %>% 296 | dplyr::group_by(clust_vr) %>% 297 | dplyr::summarise_all(list(median)) %>% 298 | tibble::remove_rownames(.) %>% 299 | tibble::column_to_rownames("clust_vr") 300 | 301 | ct_topic_profiles <- ct_topic_profiles / rowSums(ct_topic_profiles) 302 | # In case a row is all 0 303 | ct_topic_profiles[is.na(ct_topic_profiles)] <- 0 304 | 305 | cell_type_plt <- round(ct_topic_profiles, 2) %>% 306 | tibble::rownames_to_column('Cell type') %>% 307 | tidyr::pivot_longer(cols = -`Cell type`, names_to = "Topics") %>% 308 | dplyr::mutate( 309 | value_txt = dplyr::if_else(value > 0.1, round(value, 2), NULL), 310 | Topics = factor(x = Topics, 311 | levels = stringr::str_sort(colnames(ct_topic_profiles), 312 | numeric = TRUE)), 313 | topics_num = stringr::str_replace_all( 314 | string = Topics, 315 | pattern = "X", 316 | replacement = ""), 317 | topics_num = factor(topics_num, levels = as.character(1:length(unique(ct_topic_profiles)))) 318 | 319 | ) %>% 320 | ggplot2::ggplot(ggplot2::aes(x = `Cell type`, y = topics_num)) + 321 | ggplot2::geom_point(ggplot2::aes(size = value, colour = value)) + 322 | ggplot2::scale_color_continuous(low = "grey", high = "#59b371") + 323 | ggplot2::theme_classic() + 324 | ggplot2::labs(title = "NMF: Topic profiles by cell type") + 325 | ggplot2::theme( 326 | plot.title = ggplot2::element_text(hjust = 0.5, size = 20), 327 | axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5), 328 | axis.text = ggplot2::element_text(size = 15)) + 329 | ggplot2::scale_size(range = c(0, 10)) + 330 | ggplot2::guides(colour = ggplot2::guide_legend("Proportion"), 331 | size = ggplot2::guide_legend("Proportion")) + 332 | ggplot2::labs( 333 | title = glue::glue("NMF: Topic profiles by cell type"), 334 | x = "", 335 | y = "Topics") + 336 | ggplot2::scale_x_discrete( 337 | limits = unique(stringr::str_wrap(train_labs, width = 30))) + 338 | # ggplot2::scale_y_discrete(breaks = c(1:length(unique(train_labs)))) 339 | ggplot2::theme( 340 | axis.text = ggplot2::element_text(size = 22), 341 | axis.text.x = ggplot2::element_text(hjust = 1, vjust = 0.5), 342 | legend.title = ggplot2::element_text(size = 20, face = "bold"), 343 | legend.text = ggplot2::element_text(size = 18), 344 | axis.title.y = ggplot2::element_text(size = 26), 345 | # plot.title = element_blank() 346 | plot.title = ggplot2::element_text(size = 30)) 347 | 348 | cell_type_plt 349 | 350 | "{an_oro}/{plt_dir}/ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>% 351 | glue::glue() %>% 352 | here::here() %>% 353 | cowplot::save_plot( 354 | filename = ., 355 | plot = cell_type_plt, 356 | base_height = 10, 357 | base_width = 10) 358 | ``` 359 | 360 | Lastly we can take a look at which genes are the most important for each topic and get an insight into which genes are driving them. 361 | ```{r} 362 | basis_spotlight <- data.frame(NMF::basis(decon_mtrx_ls[[1]][[1]])) 363 | 364 | colnames(basis_spotlight) <- unique(stringr::str_wrap(decon_mtrx_ls[[1]][[2]], width = 30)) 365 | 366 | basis_spotlight %>% 367 | round(., 5) %>% 368 | DT::datatable(., filter = "top") 369 | ``` 370 | 371 | We will start by looking at the location of each cell type 372 | ```{r} 373 | # https://stackoverflow.com/questions/38722202/how-do-i-change-the-number-of-decimal-places-on-axis-labels-in-ggplot2 374 | # Our transformation function to ensure 2 decimals in the legend 375 | scaleFUN <- function(x) sprintf("%.2f", x) 376 | 377 | ct_all <- colnames(decon_mtrx) 378 | ct_plt_ls <- lapply(ct_all, function(ct) { 379 | tmp_plt <- Seurat::SpatialFeaturePlot(object = se_obj, 380 | features = ct, 381 | alpha = c(0, 1)) 382 | 383 | if (sum(se_obj@meta.data[, ct]) == 0) { 384 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0)) 385 | } else { 386 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0, 1)) 387 | } 388 | 389 | tmp_plt <- tmp_plt + 390 | ggplot2::theme( 391 | legend.title = ggplot2::element_blank() 392 | ) + 393 | ggplot2::labs(title = ct) + 394 | ggplot2::scale_fill_gradientn( 395 | colors = grDevices::heat.colors(10, rev = TRUE), 396 | # Same number of breaks for all plots 397 | breaks = seq(min(se_obj@meta.data[, ct]), 398 | max(se_obj@meta.data[, ct]), 399 | length.out = 4), 400 | # 2 decimals in the legend 401 | labels = scaleFUN 402 | # limits = c(0, 1) 403 | ) 404 | return(tmp_plt) 405 | }) 406 | 407 | ct_grid <- cowplot::plot_grid( 408 | plotlist = ct_plt_ls, 409 | axis = "trbl", 410 | align = "hv", 411 | nrow = 5, 412 | ncol = 5) 413 | 414 | "{an_oro}/{plt_dir}/{sample_id}_immune_reference_arrangement_{clust_vr}.pdf" %>% 415 | glue::glue() %>% 416 | here::here() %>% 417 | cowplot::save_plot( 418 | filename = ., 419 | plot = ct_grid, 420 | base_height = 25, 421 | base_width = 25) 422 | ``` 423 | 424 | Next we will only plot the location of the cell types of interest for the main plot 425 | ```{r} 426 | # i <- unique(se_obj$slice)[1] 427 | # ct <- cell_types[[1]] 428 | ct_plt_ls <- lapply(ct_interest, function(ct) { 429 | print(ct) 430 | tmp_plt <- Seurat::SpatialPlot( 431 | object = se_obj, 432 | features = ct, 433 | alpha = c(0, 1), 434 | # Remove background image 435 | image.alpha = 0, 436 | crop = FALSE, 437 | pt.size.factor = 1 438 | ) + 439 | theme( 440 | legend.title = ggplot2::element_blank(), 441 | legend.key.size = unit(1, 'cm')) 442 | 443 | if (sum(se_obj@meta.data[, ct]) == 0) { 444 | tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0)) 445 | } else { 446 | tmp_plt <- tmp_plt + 447 | ggplot2::scale_fill_gradientn( 448 | colors = grDevices::heat.colors(10, rev = TRUE), 449 | # Same number of breaks for all plots with 2 decimals 450 | breaks = round(seq(min(se_obj@meta.data[, ct]), 451 | max(se_obj@meta.data[, ct]), 452 | length.out = 3), 2), 453 | # 2 decimals in the legend 454 | labels = function(x) sapply(x, FUN = function(i) format(x = i, nsmall = 0)) 455 | # labels = scaleFUN 456 | # limits = c(0, 1) 457 | ) 458 | } 459 | 460 | tmp_plt <- tmp_plt + 461 | ggplot2::labs(title = ct) + 462 | ggplot2::theme( 463 | plot.title = ggplot2::element_text(hjust = 0.5, size = 30, face = "bold"), 464 | legend.title = ggplot2::element_blank(), 465 | legend.text = ggplot2::element_text(size = 20)) 466 | 467 | return(tmp_plt) 468 | }) 469 | 470 | interest_grid <- cowplot::plot_grid(plotlist = ct_plt_ls, 471 | axis = "trbl", 472 | align = "hv", 473 | nrow = 4, 474 | ncol = 2) 475 | 476 | "{an_oro}/{plt_dir}/{sample_id}_immune_reference_arrangement_interest_{clust_vr}.pdf" %>% 477 | glue::glue() %>% 478 | here::here() %>% 479 | cowplot::save_plot( 480 | filename = ., 481 | plot = interest_grid, 482 | base_height = 25, 483 | base_width = 15) 484 | ``` 485 | 486 | Then we will plot the spatial-scatterpie for both slices to get a gist of what is going on 487 | ```{r eval = FALSE} 488 | spsct_plt1 <- SPOTlight::spatial_scatterpie( 489 | se_obj = se_obj, 490 | cell_types_all = cell_types, 491 | img_path = here::here("data/breast_visium/section_1/spatial/tissue_lowres_image.pdf"), 492 | pie_scale = 0.4, 493 | slice = sample_id) + 494 | ggplot2::scale_fill_manual( 495 | values = col_df[col_df$plt_name %in% cell_types, "col_df"], 496 | breaks = cell_types) + 497 | ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1)) 498 | 499 | "{an_oro}/{plt_dir}/{sample_id}_spatial_scatterpie_{clust_vr}.pdf" %>% 500 | glue::glue() %>% 501 | here::here() %>% 502 | cowplot::save_plot( 503 | filename = ., 504 | plot = spsct_plt1, 505 | base_width = 12, 506 | base_height = 9) 507 | ``` 508 | 509 | We will also plot the scatterpie without the slice image underneath 510 | ```{r} 511 | sct_plt1 <- SPOTlight::scatterpie_plot(se_obj = se_obj, 512 | cell_types_all = ct_all, 513 | pie_scale = 0.7, 514 | slice = sample_id) + 515 | ggplot2::scale_fill_manual( 516 | values = col_df[col_df$plt_name %in% ct_all, "col_df"], 517 | breaks = colnames(decon_mtrx)) + 518 | ggplot2::coord_fixed(ratio = 1) + 519 | ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1)) 520 | 521 | "{an_oro}/{plt_dir}/{sample_id}_scatterpie_{clust_vr}.pdf" %>% 522 | glue::glue() %>% 523 | here::here() %>% 524 | cowplot::save_plot( 525 | filename = ., 526 | plot = sct_plt1, 527 | base_width = 12, 528 | base_height = 9) 529 | ``` 530 | 531 | We can also take a loot at the spatial scatterpie by looking at cell types which are not present throughout the entire tissue. 532 | ```{r} 533 | # Subset cell types from metadata 534 | metadata_subset <- se_obj@meta.data[, ct_all] 535 | 536 | # Create masks 537 | keep_0.75 <- colSums(se_obj@meta.data[, ct_all] > 0) < 0.75 * ncol(se_obj) 538 | keep_g0 <- colSums(se_obj@meta.data[, ct_all] > 0) > 0 539 | 540 | # Select cell types fullfiling the conditions 541 | ct_var <- c(colnames(decon_mtrx)[keep_0.75 & keep_g0]) 542 | 543 | # Get color order 544 | col_vec <- data.frame(plt_name = ct_var) %>% 545 | dplyr::left_join(col_df, by = "plt_name") %>% 546 | dplyr::pull(col_df) 547 | 548 | sct_plt_int <- SPOTlight::scatterpie_plot( 549 | se_obj = se_obj, 550 | cell_types_all = ct_var, 551 | pie_scale = 0.4) + 552 | ggplot2::scale_fill_manual( 553 | values = col_vec, 554 | breaks = ct_var) + 555 | ggplot2::labs(fill = "") + 556 | ggplot2::coord_fixed(ratio = 1) + 557 | ggplot2::theme(legend.position = "top", 558 | legend.text = ggplot2::element_text(size = 12)) + 559 | ggplot2::guides(fill = ggplot2::guide_legend(nrow = 3)) 560 | 561 | "{an_oro}/{plt_dir}/{sample_id}_scatterpie_interest_{clust_vr}.pdf" %>% 562 | glue::glue() %>% 563 | here::here() %>% 564 | cowplot::save_plot( 565 | filename = ., 566 | plot = sct_plt_int, 567 | base_width = 12, 568 | base_height = 9) 569 | ``` 570 | 571 | ### Gene - Cell correlation 572 | Check assumptions 573 | ```{r eval = FALSE} 574 | 575 | lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) { 576 | # Shapiro-Wilk normality test for mpg 577 | print(shapiro.test(se_obj@meta.data[, ct])) 578 | ggpubr::ggqqplot(se_obj@meta.data[, ct]) 579 | 580 | }) 581 | 582 | lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) { 583 | # Shapiro-Wilk normality test for mpg 584 | print(shapiro.test(se_obj@assays$SCT@data[gene, ])) 585 | ggpubr::ggqqplot(se_obj@assays$SCT@data[gene, ]) 586 | 587 | }) 588 | ``` 589 | 590 | Next we want to check if the fibrotic marker TGFB1 with Treg and Cytotoxic CD8 T cells" 591 | ```{r eval = FALSE} 592 | library(ggpubr) 593 | 594 | metadata <- dplyr::bind_cols( 595 | se_obj@meta.data, 596 | data.frame(t(se_obj@assays$SCT@data[c("ENTPD1", "NT5E", "TGFB1"), ]))) 597 | 598 | cor_plts <- lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) { 599 | tmp_ls <- lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) { 600 | 601 | tmp_plt <- ggpubr::ggscatter(data = metadata, 602 | x = gene, 603 | y = ct, 604 | add = "reg.line") + 605 | ggpubr::stat_cor(method = "pearson") 606 | return(tmp_plt) 607 | }) 608 | cowplot::plot_grid(plotlist = tmp_ls, ncol = 1) 609 | }) 610 | 611 | tmp_arr <- cowplot::plot_grid(plotlist = cor_plts, 612 | nrow = 1, 613 | ncol = 2) 614 | tmp_arr 615 | ``` 616 | 617 | ## Session Info 618 | ```{r} 619 | sessionInfo() 620 | ``` 621 | -------------------------------------------------------------------------------- /ST-oropharyngeal/RCTD-SPOTlight_comparison.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "lv1_annot" 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 20 | message = FALSE, warning = FALSE) 21 | options(width = 1200) 22 | ``` 23 | 24 | ## Introduction 25 | In this R markdown document we will compare the deconvolution obtained from SPOTlight and RCTD. 26 | 27 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD). 28 | 29 | 30 | ```{r} 31 | library(Seurat) 32 | library(tidyverse) 33 | library(Matrix) 34 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE) 35 | library(RCTD) 36 | source(here::here("utils/bin.r")) 37 | ``` 38 | 39 | ## Paths 40 | ```{r} 41 | source(here::here("misc/paths.R")) 42 | 43 | "{an_oro}/{robj_dir}" %>% 44 | glue::glue() %>% 45 | here::here() %>% 46 | dir.create( 47 | path = , 48 | showWarnings = FALSE, 49 | recursive = TRUE) 50 | 51 | "{an_oro}/{plt_dir}" %>% 52 | glue::glue() %>% 53 | here::here() %>% 54 | dir.create( 55 | path = , 56 | showWarnings = FALSE, 57 | recursive = TRUE) 58 | ``` 59 | 60 | ## Parameters 61 | ```{r} 62 | set.seed(1243) 63 | 64 | sample_id <- params$sample_id 65 | # sample_id <- "161429" 66 | clust_vr <- params$clust_vr 67 | # clust_vr <- "new_annot" 68 | # clust_vr <- "old_cell_types" 69 | 70 | trn <- "melanoma" 71 | cl_n <- 100 72 | hvg <- 3000 73 | ntop <- NULL 74 | transf <- "uv" 75 | method <- "nsNMF" 76 | min_cont <- 0 77 | 78 | if (is.null(ntop)) { 79 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}") 80 | } else { 81 | spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}") 82 | } 83 | ``` 84 | 85 | Create a name/color dataframe 86 | ```{r} 87 | # source(here::here("misc/col_df_scrpt.R")) 88 | # source(here::here("misc/col_df_scrpt2.R")) 89 | col_df <- readRDS(here::here("misc/col_df.rds")) 90 | ``` 91 | 92 | ## Load data 93 | Load spatial object 94 | ```{r} 95 | ### Spatial breast cancer 96 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>% 97 | glue::glue() %>% 98 | here::here() %>% 99 | readRDS(file = .) 100 | ``` 101 | 102 | SPOTlight deconvolution 103 | ```{r} 104 | decon_mtrx_ls <- "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>% 105 | glue::glue() %>% 106 | here::here() %>% 107 | readRDS(file = .) 108 | ``` 109 | 110 | RCTD deconvolution 111 | ```{r} 112 | myRCTD <- "{an_oro}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>% 113 | glue::glue() %>% 114 | here::here() %>% 115 | readRDS(file = .) 116 | ``` 117 | 118 | ## Analysis 119 | 120 | ### Extract deconvolutions 121 | 122 | SPOTlight extraction and renaming 123 | ```{r} 124 | decon_mtrx <- decon_mtrx_ls[[2]] 125 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"] 126 | 127 | # Set as 0 cell types predicted to be under 5 % of the spot 128 | # decon_mtrx[decon_mtrx < 0.05] <- 0 129 | 130 | # Change names to original ones and reorder according to Paula 131 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>% 132 | dplyr::left_join(col_df, by = "ct_name") %>% 133 | dplyr::pull(plt_name) 134 | 135 | colnames(decon_mtrx) <- new_colnames 136 | colnames(decon_mtrx) <- glue::glue("SPOTlight-{colnames(decon_mtrx)}") 137 | rownames(decon_mtrx) <- colnames(se_obj) 138 | ``` 139 | 140 | RCTD extraction 141 | ```{r} 142 | RCTD_mtrx <- as.matrix(myRCTD@results[[1]]) 143 | colnames(RCTD_mtrx) <- glue::glue("RCTD-{colnames(RCTD_mtrx)}") 144 | # RCTD_mtrx[RCTD_mtrx < 0.05] <- 0 145 | 146 | # Normalize RCTD matrix to 1 147 | RCTD_mtrx <- RCTD_mtrx / rowSums(RCTD_mtrx) 148 | ``` 149 | 150 | Add deconvolution matrices to Seurat object 151 | ```{r} 152 | rownames(RCTD_mtrx) <- stringr::str_replace( 153 | string = rownames(RCTD_mtrx), 154 | pattern = "\\.", 155 | replacement = "-") 156 | 157 | barcodes <- purrr::reduce(list(rownames(se_obj@meta.data), 158 | rownames(RCTD_mtrx), 159 | rownames(decon_mtrx)), 160 | dplyr::intersect) 161 | 162 | RCTD_mtrx <- RCTD_mtrx[barcodes, ] 163 | decon_mtrx <- decon_mtrx[barcodes, ] 164 | 165 | se_obj@meta.data <- cbind(se_obj@meta.data[barcodes, ], 166 | RCTD_mtrx[barcodes, ], 167 | decon_mtrx[barcodes, ]) 168 | 169 | 170 | ``` 171 | 172 | ### Deconvolution comparison 173 | Here we are going to compare RCTD and SPOTlight deconvolutions by correlating the cell types 174 | ```{r fig.width=20, fig.height=20} 175 | cor_plots <- lapply(colnames(decon_mtrx), function(i) { 176 | print(i) 177 | i_spot <- i 178 | i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2) 179 | i_rctd <- glue::glue("RCTD-{i_sub[2]}") 180 | 181 | dat <- data.frame( 182 | SPOTlight = decon_mtrx[, i_spot], 183 | RCTD = RCTD_mtrx[, i_rctd], 184 | check.names = FALSE) 185 | 186 | print(cor.test(decon_mtrx[, i_spot], 187 | RCTD_mtrx[, i_rctd])) 188 | 189 | ggpubr::ggscatter(dat, x = "SPOTlight", y = "RCTD", 190 | color = "black", 191 | add = "reg.line", # Add regressin line 192 | add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line 193 | cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor 194 | cor.coeff.args = list(method = "pearson", label.sep = "\n"), 195 | title = i_sub[2]) 196 | }) 197 | 198 | 199 | plt_grid <- cowplot::plot_grid(plotlist = cor_plots,align = "hv", axis = "trbl") 200 | 201 | "{an_oro}/{plt_dir}/cor_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>% 202 | glue::glue() %>% 203 | here::here() %>% 204 | cowplot::save_plot( 205 | filename = ., 206 | plot = plt_grid, 207 | base_height = 20, 208 | base_width = 20) 209 | ``` 210 | 211 | ### Visualize cell types 212 | Next we want to visualize the cell types side by side 213 | ```{r} 214 | tissue_plots <- lapply(colnames(decon_mtrx), function(i) { 215 | print(i) 216 | i_spot <- i 217 | i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2) 218 | i_rctd <- glue::glue("RCTD-{i_sub[2]}") 219 | 220 | Seurat::SpatialPlot( 221 | object = se_obj, 222 | features = c(i_spot, i_rctd), 223 | alpha = c(0, 1), 224 | image.alpha = 1) 225 | }) 226 | 227 | 228 | tissue_grid <- cowplot::plot_grid(plotlist = tissue_plots, 229 | align = "hv", 230 | axis = "trbl") 231 | 232 | "{an_oro}/{plt_dir}/tissue_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>% 233 | glue::glue() %>% 234 | here::here() %>% 235 | cowplot::save_plot( 236 | filename = ., 237 | plot = tissue_grid, 238 | base_height = 25, 239 | base_width = 35) 240 | ``` 241 | 242 | ## Session Info 243 | ```{r} 244 | sessionInfo() 245 | ``` 246 | -------------------------------------------------------------------------------- /ST-oropharyngeal/RCTD-australia_oroph_deconv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | author: "Marc Elosua-Bayes" 3 | date: "`r format(Sys.time(), '%d %B, %Y')`" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | number_sections: yes 9 | df_print: paged 10 | editor_options: 11 | chunk_output_type: console 12 | params: 13 | sample_id: "Default!" 14 | clust_vr: "lv1_annot" 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`" 16 | --- 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 20 | message = FALSE, warning = FALSE) 21 | options(width = 1200) 22 | ``` 23 | 24 | ## Introduction 25 | In this R markdo9wn document we will map immune cells onto the Oropharyngeal cancer tissue using the RCTD tool. 26 | Robust Cell Type Deconvolution is developed by Dylan M. Cable from Rafa Irizarry's lab 27 | 28 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD). 29 | 30 | 31 | ```{r} 32 | library(Seurat) 33 | library(tidyverse) 34 | library(Matrix) 35 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE) 36 | library(RCTD) 37 | source(here::here("utils/bin.r")) 38 | ``` 39 | 40 | ## Paths 41 | ```{r} 42 | source(here::here("misc/paths.R")) 43 | 44 | "{an_breast_10x}/{robj_dir}" %>% 45 | glue::glue() %>% 46 | here::here() %>% 47 | dir.create( 48 | path = , 49 | showWarnings = FALSE, 50 | recursive = TRUE) 51 | 52 | "{an_breast_10x}/{plt_dir}" %>% 53 | glue::glue() %>% 54 | here::here() %>% 55 | dir.create( 56 | path = , 57 | showWarnings = FALSE, 58 | recursive = TRUE) 59 | ``` 60 | 61 | ## Parameters 62 | ```{r} 63 | set.seed(1243) 64 | 65 | sample_id <- params$sample_id 66 | # sample_id <- "161429" 67 | # sample_id <- "161430" 68 | # clust_vr <- params$clust_vr 69 | clust_vr <- "lv1_annot" 70 | # clust_vr <- "lv2_annot" 71 | ``` 72 | 73 | RCTD paths 74 | ```{r} 75 | refdir <- "{an_breast_10x}/{robj_dir}/RCTD_data/reference" %>% 76 | glue::glue() %>% 77 | here::here() 78 | 79 | dir.create(path = refdir, showWarnings = FALSE, recursive = TRUE) 80 | 81 | stdir <- "{an_oro}/{robj_dir}/{sample_id}/RCTD_data/spatial" %>% 82 | glue::glue() %>% 83 | here::here() 84 | 85 | dir.create(path = stdir, showWarnings = FALSE, recursive = TRUE) 86 | ``` 87 | 88 | ## Load data 89 | 90 | We are going to start by loading the data from the previosu script *1-australis_oroph_processing.Rmd*. 91 | ```{r eval = FALSE} 92 | ### Spatial oropharyngeal cancer 93 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>% 94 | glue::glue() %>% 95 | here::here() %>% 96 | readRDS(file = .) 97 | ``` 98 | 99 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset. 100 | ```{r eval = FALSE} 101 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>% 102 | glue::glue() %>% 103 | here::here() 104 | 105 | if (file.exists(ica_melanoma2_path)) { 106 | 107 | ica_sub <- readRDS(file = ica_melanoma2_path) 108 | } else { 109 | # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds") 110 | ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>% 111 | here::here() %>% 112 | readRDS(.) 113 | ### Immune reference atlas 114 | ica_sub <- subset(ica_se, subset = source == "melanoma2") 115 | rm(ica_se) 116 | 117 | saveRDS(object = ica_sub, file = ica_melanoma2_path) 118 | } 119 | 120 | table(ica_sub@meta.data$lv1_annot) 121 | table(ica_sub@meta.data$lv2_annot) 122 | ``` 123 | 124 | Remove MAST cells from the training set 125 | ```{r eval = FALSE} 126 | ica_sub <- subset(ica_sub, subset = lv1_annot != "Mast cells") 127 | # Create Seurat object with filtered gene matrix after removing Mast cells 128 | ica_sub <- Seurat::CreateSeuratObject( 129 | counts = ica_sub@assays$RNA@counts[ 130 | sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ], 131 | meta.data = ica_sub@meta.data) 132 | ``` 133 | 134 | Join proliferatiing cell types to get a proliferation signature 135 | ```{r eval = FALSE} 136 | prolif_vec <- c("T cells proliferative", "Macrophages and monocytes proliferative", 137 | "B cells proliferative", "Macrophages proliferative") 138 | 139 | ica_sub[["cell_type_mod"]] <- dplyr::if_else( 140 | ica_sub@meta.data[, clust_vr] %in% prolif_vec, 141 | "Proliferation", as.character(ica_sub@meta.data[, clust_vr])) 142 | 143 | ``` 144 | 145 | ## Prepare data for RCTD 146 | ### Reference scRNAseq 147 | We are going to follow the vignette steps. 148 | 149 | In order to run RCTD, the first step is to process the single cell reference. Create a folder in ‘data/Reference’ e.g. ‘data/Reference/Vignette’ containing the following three files: 150 | 1. meta_data.csv: a CSV file (with 3 columns, with headers "barcode", "cluster", and "nUMI") containing the numeric cluster assignment for each cell.
151 | 2. cell_type_dict.csv: a CSV file (with 2 columns, with headers "Cluster" and "Name") containing the mapping between numeric cluster ID and cluster name. If you want a cluster to be filtered out of the single cell reference, you can leave the cluster name blank. The cell types must not contain the character ‘/’ or ‘-’.
152 | 3. dge.csv: a Digital Gene Expression (DGE) (barcodes by gene counts) CSV file in the standard 10x format.
153 | We use the dgeToSeurat function: 154 | 155 | ```{r eval = FALSE} 156 | scRCTD_structure <- function(sc_obj, clust_vr) { 157 | 158 | sc_obj[["Name"]] = sc_obj@meta.data[, clust_vr] 159 | 160 | # Cell type dictionary between cluster and cell-type 161 | ct <- unique(as.character(sc_obj@meta.data[, clust_vr])) 162 | df_ct <- data.frame("Cluster" = 1:length(ct), 163 | "Name" = ct) 164 | 165 | # 166 | metadata <- sc_obj@meta.data %>% 167 | # Rownames to columns must be before left join since after it the rownames are erased 168 | tibble::rownames_to_column("barcode") %>% 169 | dplyr::left_join(df_ct, by = c("Name" = "Name")) %>% 170 | # Change names to “barcode”, “cluster”, “nUMI” 171 | mutate( 172 | cluster = Cluster, 173 | nUMI = nCount_RNA 174 | ) %>% 175 | dplyr::select(barcode, cluster, nUMI) 176 | 177 | expr_mtrx <- sc_obj@assays$RNA@counts 178 | 179 | return(list("meta_data" = metadata, 180 | "cell_type_dict" = df_ct, 181 | "dge" = expr_mtrx)) 182 | } 183 | ``` 184 | 185 | Save data to reference directory 186 | ```{r eval = FALSE} 187 | sc_ls <- scRCTD_structure(sc_obj = ica_sub, clust_vr = "cell_type_mod") 188 | 189 | "{refdir}/meta_data.csv" %>% 190 | glue::glue() %>% 191 | here::here() %>% 192 | readr::write_csv(x = sc_ls[[1]], file = .) 193 | 194 | "{refdir}/cell_type_dict.csv" %>% 195 | glue::glue() %>% 196 | here::here() %>% 197 | readr::write_csv(x = sc_ls[[2]], file = .) 198 | 199 | dge_path <- "{refdir}/dge.csv" %>% 200 | glue::glue() %>% 201 | here::here() 202 | 203 | sc_ls[[3]] %>% 204 | data.frame() %>% 205 | tibble::rownames_to_column("gene") %>% 206 | readr::write_csv(x = ., 207 | file = dge_path, 208 | col_names = TRUE) 209 | 210 | ``` 211 | 212 | **Of note, here go to analysis/tool_benchmarking/RCTD_data/reference/dge.csv and remove the column name gene by "". If you don't it will detect the column as a gene and RCTD::dgeToSeurat won't be able to load the data.** 213 | 214 | ### Reference ST 215 | Next, put the SpatialRNA data in your ‘data/SpatialRNA’ directory (here ‘data/SpatialRNA/Vignette’). This needs to contain: 216 | 1. BeadLocationsForR.csv: a CSV file (with 3 columns, with headers “barcodes”, “xcoord”, and “ycoord”) containing the spatial locations of the pixels.
217 | 2. MappedDGEForR.csv: a DGE (gene counts by barcodes) CSV file. Represents raw counts at each pixel.
218 | 219 | ```{r eval = FALSE} 220 | dgef_path <- "{stdir}/MappedDGEForR.csv" %>% 221 | glue::glue() %>% 222 | here::here() 223 | 224 | se_obj@assays$Spatial@counts %>% 225 | data.frame() %>% 226 | tibble::rownames_to_column("gene") %>% 227 | dplyr::select(gene, everything()) %>% 228 | readr::write_csv(x = ., 229 | file = dgef_path, 230 | col_names = TRUE) 231 | 232 | coord_path <- "{stdir}/BeadLocationsForR.csv" %>% 233 | glue::glue() %>% 234 | here::here() 235 | 236 | # Extract spatial coordinates 237 | image_id <- names(se_obj@images) 238 | se_obj@images[[image_id]]@coordinates %>% 239 | dplyr::select(imagecol, imagerow) %>% 240 | tibble::rownames_to_column("barcodes") %>% 241 | dplyr::rename(xcoord = imagecol, ycoord = imagerow) %>% 242 | readr::write_csv( 243 | x = ., 244 | file = coord_path) 245 | 246 | ``` 247 | 248 | ## RCTD deconvolution 249 | ### Read data in RCTD 250 | Read data in RCTD format 251 | ```{r} 252 | reference <- RCTD::dgeToSeurat(refdir = refdir) 253 | puck <- RCTD::read.SpatialRNA(datadir = stdir) 254 | ``` 255 | 256 | ### Creating and running RCTD 257 | We are now ready to create an RCTD object using the create.RCTD function. We simply need to pass in the SpatialRNA and scRNA-seq objects. There are several configuration options that can be set with this function: 258 | ```{r} 259 | myRCTD <- RCTD::create.RCTD(spatialRNA = puck, 260 | reference = reference, 261 | max_cores = 1, 262 | CELL_MIN = 18) 263 | ``` 264 | 265 | Now, we are ready to run RCTD, using the run.RCTD function. This function is equivalent to sequentially running the functions fitBulk, choose_sigma_c, and fitPixels. The doublet_mode argument sets whether RCTD will be run in ‘doublet mode’ (at most 1-2 cell types per pixel) or ‘full mode’ (no restrictions on number of cell types). 266 | ```{r} 267 | myRCTD <- RCTD::run.RCTD(RCTD = myRCTD, 268 | doublet_mode = FALSE) 269 | ``` 270 | 271 | Save RCTD resulting 272 | ```{r} 273 | "{an_oro}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>% 274 | glue::glue() %>% 275 | here::here() %>% 276 | saveRDS(object = myRCTD, file = .) 277 | ``` 278 | 279 | 280 | ## Session Info 281 | ```{r} 282 | sessionInfo() 283 | ``` 284 | 285 | -------------------------------------------------------------------------------- /ST-oropharyngeal/README.md: -------------------------------------------------------------------------------- 1 | # Oropharyngeal 2 | 3 | ## Data description 4 | **Data acquisition** 5 | All patients provided informed consent for the collection of human specimens and data. 6 | This was approved by the St Vincent’s Hospital Research Office (2019/PID04335) in accordance with the National Health and Medical Research Council’s National Statement of Ethical Conduct in Human Research. Patients undergoing surgical resection for a locally advanced oropharyngeal cancer were recruited to the study. After surgical removal, the anatomical pathologist dissected a sample of both the primary and nodal metastasis. 7 | Samples were tumour banked in accordance with our ethically approved protocol. 8 | 9 | **Sample storage** 10 | Within 30 minutes of collection, tumour samples were tumour banked. Samples were cut into 1mm x 1mm chunks with a scalpel blade. 11 | For Visium, a tissue chunk was snap frozen in OCT. After freezing, samples were moved to liquid nitrogen for long term storage. 12 | 13 | **Visium Spatial Gene Expression** 14 | Frozen tissue samples were processed using the Visium Spatial Gene Expression slide and reagent kit (10X Genomics, US) following the manufacturer’s instruction. 15 | Briefly, 10 μm sections were placed into the capture areas of the Visium slide. 16 | Tissue morphology was assessed with H&E staining and imaging using a Leica DM6000 microscope equipped with a 20x lens (Leica, DE). 17 | The imaged sections were then permeabilized for 12 minutes using the supplied reagents. 18 | The permeabilization condition was previously optimised using the Visium Spatial Tissue Optimisation slide and reagent kit (10X Genomics, US). 19 | After permeabilization, cDNA libraries were prepared, checked for quality and sequenced on a NovaSeq 6000 platform (Illumina, US). 20 | Around 300 million pair-ended reads were obtained for each tissue section. Read 1, i7 index and Read 2 were sequenced with 28, 8 and 98 cycles respectively. 21 | 22 | ## Data availability 23 | Pending to get approval to post to GEO. 24 | 25 | ## Code 26 | Scripts *1-_australia_oroph_processing.Rmd*, *2-australia_oroph_biological.Rmd*, and *3-australia_oroph_deconv.Rmd* are in charge of preprocessing and mapping 27 | the TICA immune cell states to the tissue slices. *4-australia_oro_srtatification.Rmd* and *5-australia_oro_plots.Rmd*, in turn, are in charge of making the plots for Figure 5 and Supplementary Figures 8-12. 28 | 29 | ## Dependencies 30 | * [R 3.6.0](https://cran.r-project.org/) 31 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html) 32 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html) 33 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html) 34 | * [SPOTlight 1.0.0](https://github.com/MarcElosua/SPOTlight) 35 | * [Matrix 1.2.18](https://cran.r-project.org/web/packages/Matrix/index.html) 36 | * [svglite 1.2.3.2](https://cran.r-project.org/web/packages/svglite/index.html) 37 | * [ggcorrplot 0.1.3](https://cran.r-project.org/web/packages/ggcorrplot/index.html) 38 | * [cowplot 1.1.0](https://cran.r-project.org/web/packages/cowplot/index.html) 39 | 40 | -------------------------------------------------------------------------------- /ST-oropharyngeal/make_oroph_deconv.cmd: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | pipe_dir='/scratch/devel/melosua/phd/projects/TICA/misc/jobs/pipeline_dir' 4 | 5 | pipe_fn="$pipe_dir/pipe_spotlight_oroph.pipe" 6 | rm "$pipe_fn" 7 | touch "$pipe_fn" 8 | 9 | trn='melanoma' 10 | for spatial in 161429 161430 161431 161432 11 | do 12 | printf "$spatial.decon\t$spatial\t-\t.\tn\t8:00:00\t1\t12\t.\tmodule purge; module load gsl/1.9_64 gsl/2.4 gcc/6.3.0 gmp/6.1.2 R/3.6.0 hdf5/1.10.1; analysis/aussie_oro/spotlight_deconv_oroph_job.R $trn $spatial\n" >> "$pipe_fn" 13 | done 14 | /home/devel/melosua/bin/cnag_pipeline.pl "$pipe_fn" 15 | -------------------------------------------------------------------------------- /ST-oropharyngeal/spotlight_deconv_oroph_job.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Train Prostate model 4 | library(SPOTlight) 5 | library(NMF) 6 | library(Seurat) 7 | library(dplyr) 8 | 9 | # Set parameters 10 | # trn <- "melanoma" 11 | cl_n <- 100 12 | hvg <- 3000 13 | ntop <- NULL 14 | transf <- "uv" 15 | method <- "nsNMF" 16 | min_cont <- 0 17 | clust_vr <- "new_cell_types" 18 | 19 | args <- commandArgs(trailingOnly=TRUE) 20 | # trn <- "melanoma" 21 | # spatial <- "161432" 22 | trn <- args[[1]] 23 | spatial <- args[[2]] 24 | 25 | if (is.null(ntop)) { 26 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s", 27 | trn, cl_n, hvg, transf, method, min_cont) 28 | } else { 29 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s", 30 | trn, cl_n, hvg, ntop, transf, method, min_cont) 31 | } 32 | 33 | source("misc/paths.R") 34 | source("utils/bin.r") 35 | source("utils/spatial_plot_spaniel.R") 36 | 37 | # Load data 38 | ### Spatial data 39 | st_ls <- readRDS(file = sprintf("%s/%s/processed_st_ls_oropharyngeal.RDS", 40 | an_oro, robj_dir)) 41 | 42 | st_se <- st_ls[[spatial]] 43 | 44 | ### Immune reference atlas 45 | if (trn == "full") { 46 | ica_se <- readRDS("data/immune_reference/atlas_250_specific_cell_type.rds") 47 | } else if (trn == "breast") { 48 | ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_breast.rds") 49 | } else if (trn == "melanoma") { 50 | ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_melanoma.rds") 51 | } else if (trn == "pancreas") { 52 | ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_pancreas.rds") 53 | } 54 | 55 | # ica_se <- ica_se[, ica_se$new_cell_types != "MAST"] 56 | ica_se[["specific_cell_type_mod"]] <- gsub(pattern = "[[:punct:]]|[[:blank:]]", ".", 57 | x = as.character(ica_se@meta.data[, clust_vr]), 58 | perl = TRUE) 59 | 60 | ### Markers TICA 61 | if (trn == "full") { 62 | ica_markers <- readRDS(file = "data/immune_reference/ica_markers_all.RDS") 63 | } else if (trn == "breast") { 64 | ica_markers <- readRDS(file = "data/immune_reference/ica_markers_breast.RDS") 65 | } else if (trn == "melanoma") { 66 | ica_markers <- readRDS(file = "data/immune_reference/ica_markers_melanoma.RDS") 67 | } else if (trn == "pancreas") { 68 | ica_markers <- readRDS(file = "data/immune_reference/ica_markers_melanoma.RDS") 69 | } 70 | 71 | # ica_markers <- ica_markers %>% filter(cluster != "MAST") 72 | 73 | deconv_ls <- SPOTlight::spotlight_deconvolution(se_sc = ica_se, 74 | counts_spatial = st_se@assays$Spatial@counts, 75 | clust_vr = "specific_cell_type_mod", 76 | cluster_markers = ica_markers, 77 | cl_n = cl_n, 78 | hvg = hvg, 79 | ntop = ntop, 80 | transf = transf, 81 | method = method, 82 | min_cont = min_cont, 83 | assay = "RNA", 84 | slot = "counts") 85 | 86 | saveRDS(object = deconv_ls, 87 | file = sprintf("%s/%s/spotlight_deconv_ls_%s_%s.RDS", 88 | an_oro, robj_dir, spatial, spotlight_id)) 89 | -------------------------------------------------------------------------------- /misc/SPOTlight_deconvolution_job.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ## Load libraries 4 | library(SPOTlight) 5 | library(dplyr) 6 | 7 | ## Set parameters 8 | source("misc/paths.R") 9 | dir.create(path = sprintf("%s/%s", an_breast_10x, robj_dir), 10 | showWarnings = FALSE, 11 | recursive = TRUE) 12 | 13 | dir.create(path = sprintf("%s/%s", an_breast_10x, plt_dir), 14 | showWarnings = FALSE, 15 | recursive = TRUE) 16 | 17 | cl_n <- 100 18 | hvg <- 3000 19 | ntop <- NULL 20 | transf <- "uv" 21 | method <- "nsNMF" 22 | min_cont <- 0 23 | clust_vr <- "new_cell_types" 24 | 25 | ### Variable passed from command line to determine the SC and spatial dataset 26 | args <- commandArgs(trailingOnly=TRUE) 27 | # trn <- "melanoma" 28 | # spatial <- "breast" 29 | trn <- args[[1]] 30 | spatial <- args[[2]] 31 | 32 | 33 | 34 | if (is.null(ntop)) { 35 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s", 36 | trn, cl_n, hvg, transf, method, min_cont) 37 | } else { 38 | spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s", 39 | trn, cl_n, hvg, ntop, transf, method, min_cont) 40 | } 41 | 42 | 43 | ## Load data 44 | ### Reference scRNAseq dataset 45 | if (trn == "full") { 46 | print("Loading full 250 TICA") 47 | ica_se <- readRDS(file = "data/immune_reference/ica_se_full_processed.RDS") 48 | } else { 49 | print(sprintf("Loading: data/immune_reference/ica_se_%s_processed.RDS",trn)) 50 | ica_se <- readRDS(file = sprintf("data/immune_reference/ica_se_%s_processed.RDS", 51 | trn)) 52 | } 53 | 54 | ### Cell type markers obtained from Seruat::FindAllMarkers 55 | if (trn == "full") { 56 | print("Loading full TICA markers") 57 | ica_markers <- readRDS(file = "data/immune_reference/ica_markers_all.RDS") 58 | } else { 59 | print(sprintf("Loading: data/immune_reference/ica_markers_%s.RDS", 60 | trn)) 61 | ica_markers <- readRDS(file = sprintf("data/immune_reference/ica_markers_%s.RDS", 62 | trn)) 63 | } 64 | 65 | ### Spatial data 66 | if (spatial == "breast") { 67 | print("Loading breast spatial") 68 | spatial_se <- readRDS(file = sprintf("%s/%s/breast_merged_processed.RDS", 69 | an_breast_10x, robj_dir)) 70 | } else if (spatial == "hn") { 71 | print("Loading hn spatial") 72 | spatial_se <- readRDS(file = sprintf("%s/%s/hn1_processed.RDS", 73 | an_aussie, robj_dir)) 74 | } else if (spatial == "pdac") { 75 | print("Loading pdac spatial") 76 | 77 | } else if (spatial == "prostate") { 78 | print("Loading prostate spatial") 79 | 80 | } 81 | 82 | # Run deconvolution 83 | ## Here we set transf = RAW because when we carried out SCTransform on the SC and spatial dataset we already performed the normalization. 84 | ## That is why we point directly to the SCT assay and data slot 85 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution( 86 | se_sc = ica_se, 87 | counts_spatial = spatial_se@assays$SCT@data, 88 | clust_vr = "specific_cell_type_mod", 89 | cluster_markers = ica_markers, 90 | cl_n = cl_n, 91 | hvg = hvg, 92 | ntop = ntop, 93 | transf = "raw", # Set to raw 94 | method = method, 95 | min_cont = min_cont, 96 | assay = "SCT", 97 | slot = "data") 98 | 99 | saveRDS(object = decon_mtrx_ls, 100 | file = sprintf("%s/%s/decon_mtrx_TICA_%s_%s.RDS", 101 | an_breast_10x, robj_dir, spatial, spotlight_id)) 102 | 103 | 104 | -------------------------------------------------------------------------------- /misc/atlas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/misc/atlas.png -------------------------------------------------------------------------------- /misc/col_df_scrpt.R: -------------------------------------------------------------------------------- 1 | # library(RColorBrewer) 2 | library(pals) 3 | library(dplyr) 4 | # n <- 60 5 | # qual_col_pals <- RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == 'qual',] 6 | # Load color palette 7 | col_pal_v <- readRDS(file = "data/immune_reference/cell_type_palette.rds") 8 | 9 | # Save as df 10 | col_df <- data.frame(ct_col = col_pal_v, stringsAsFactors = FALSE) %>% 11 | tibble::rownames_to_column("plt_name") %>% 12 | dplyr::mutate( 13 | ct_name = gsub(pattern = "[[:punct:]]|[[:blank:]]", ".", 14 | x = plt_name, 15 | perl = TRUE), 16 | plt_name = dplyr::if_else(plt_name == "Central memory CD4 T cells", 17 | "Transitional Memory CD4 T cells", 18 | plt_name), 19 | ct_name = dplyr::case_when( 20 | plt_name == "Recently activated CD4 T cells" ~ "Effector.precursor.CD4.T.cells", 21 | plt_name == "B cells" ~ "B.cell", 22 | plt_name == "Proliferative B cells" ~ "Proliferative.B.cell", 23 | plt_name == "Naive-memory CD4 T cells" ~ "Effector.memory.CD4.T.cells", 24 | plt_name == "Plasma B cells" ~ "Plasma.B", 25 | plt_name == "Proinflamatory TAMs" ~ "Proinflamatory.TAMs.and.neutrophils", 26 | # plt_name == "Central memory CD4 T cells" ~ "Transitional.Memory.CD4.T.cells", 27 | plt_name == "Mast cells" ~ "MAST", 28 | TRUE ~ ct_name) 29 | ) %>% 30 | dplyr::filter(plt_name != "Unknown") 31 | 32 | # col_vector <- as.vector(pals::polychrome()) 33 | paula_order_old <- c("B cell", "Proliferative B cell", "Plasma B", "Naive T cells", "Regulatory T cells", "T helper cells", "Th17 cells", "Proliferative T cells", "Effector precursor CD4 T cells", "Effector memory CD4 T cells", "Central memory CD4 T cells", "Pre-exhausted CD8 T cells", "Cytotoxic CD8 T cells", "Effector memory CD8 T cells", "Terminally exhausted CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs", "Proinflamatory TAMs and neutrophils", "Proliferative monocytes and macrophages", "Monocytes", "cDC", "pDC", "mDC", "MAST") 34 | paula_order <- c("B cells", "Proliferative B cells", "Plasma B cells", "Naive T cells", "Regulatory T cells", "T helper cells", "Th17 cells", "Proliferative T cells", "Recently activated CD4 T cells", "Naive-memory CD4 T cells", "Transitional Memory CD4 T cells", "Pre-exhausted CD8 T cells", "Cytotoxic CD8 T cells", "Effector memory CD8 T cells", "Terminally exhausted CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs", "Proinflamatory TAMs", "Proliferative monocytes and macrophages", "Monocytes", "cDC", "pDC", "mDC", "Mast cells") 35 | 36 | # col_df <- data.frame(ct_name = gsub(pattern = "[[:punct:]]|[[:blank:]]", ".", 37 | # x = paula_order_old, 38 | # perl = TRUE), 39 | # plt_name = paula_order_old, 40 | # ct_col = col_vector[1:length(paula_order_old)], 41 | # stringsAsFactors = FALSE) 42 | # 43 | # # Change label names 44 | # col_df <- col_df %>% 45 | # dplyr::mutate(plt_name = dplyr::if_else(plt_name == "Effector precursor CD4 T cells", 46 | # "Recently activated CD4 T cells", plt_name)) 47 | -------------------------------------------------------------------------------- /misc/paths.R: -------------------------------------------------------------------------------- 1 | # Variables 2 | ver <- "2020-06-04" 3 | 4 | ## PATHS and common variables 5 | version_dir <- sprintf("%s",ver) 6 | dir.create(path = version_dir, 7 | showWarnings = F, 8 | recursive = T) 9 | 10 | ## Create directory for plots 11 | plt_dir <- sprintf("%s/plots_%s", ver, ver) 12 | # dir.create(path = plt_dir, 13 | # showWarnings = FALSE, 14 | # recursive = TRUE) 15 | 16 | ## Create directory for RDS objects 17 | robj_dir <- sprintf("%s/R_objects_%s", ver, ver) 18 | # dir.create(path = robj_dir, 19 | # showWarnings = FALSE, 20 | # recursive = TRUE) 21 | 22 | ## Paths to all the folders 23 | an <- "analysis" 24 | an_aussie <- sprintf("%s/aussie_ln", an) 25 | an_oro <- sprintf("%s/aussie_oro", an) 26 | an_melanoma <- sprintf("%s/melanoma", an) 27 | an_prostate <- sprintf("%s/prostate", an) 28 | an_breast_10x <- sprintf("%s/breast_10x", an) 29 | an_pdac <- sprintf("%s/pdac_st", an) 30 | an_epid <- sprintf("%s/epid_27", an) 31 | -------------------------------------------------------------------------------- /utils/process_gene_names.R: -------------------------------------------------------------------------------- 1 | # ideally, this should be done before creating the Seurat object, 2 | # but because we used already processed data (not raw fastqs), in most cases this wasn't possible 3 | # this function filters dataset genes by removing genes expressed in few cells 4 | # and replacing non accepted genes with their accepted version 5 | 6 | library(Seurat) 7 | library(tidyverse) 8 | 9 | process_gene_names <- function(data, n_cells = NULL, remove_ensembl = TRUE) { 10 | 11 | # 1 - remove genes expressed in less than n cells 12 | # (default assay should be defined beforehand) 13 | # not recommended in most cases, as it can remove too many genes 14 | 15 | if (is.numeric(n_cells)) { 16 | genes_keep <- c() 17 | 18 | for (gene in rownames(data)) { 19 | # print(gene) 20 | if (sum(GetAssayData(object = data, slot = "counts")[gene, ] > n_cells)) { 21 | genes_keep <- append(genes_keep, gene) 22 | } 23 | } 24 | 25 | # keep those genes from the seurat object 26 | 27 | data <- subset(data, features = genes_keep) 28 | } 29 | 30 | 31 | # 2- convert ENSEMBL gene names to HGNC SYMBOL 32 | 33 | # check if there are ENSMBL genes, otherwise omit 34 | 35 | if (grepl(rownames(data), pattern = "^ENSG")) { 36 | require(org.Hs.eg.db) 37 | 38 | symbols_match <- mapIds(org.Hs.eg.db, keys = rownames(data), keytype = "ENSEMBL", column = "SYMBOL", multiVals = "first") 39 | 40 | symbols <- c() 41 | for (i in 1:length(symbols_match)) { 42 | correct_symbol <- if_else(is.na(symbols_match[i]), names(symbols_match)[i], symbols_match[i]) 43 | symbols <- append(symbols, correct_symbol) 44 | } 45 | } else { 46 | symbols <- rownames(data) 47 | } 48 | 49 | 50 | # 3 - make sure the naming is homogeneous (synonym symbols, etc.) 51 | 52 | library(HGNChelper) 53 | 54 | symbols <- checkGeneSymbols(symbols, 55 | unmapped.as.na = FALSE, # if there is no mapping, return original gene name 56 | # map = getCurrentHumanMap(), # downloads latest version of aliases (needs internet connection) 57 | species = "human" 58 | ) 59 | 60 | print(length(symbols$Suggested.Symbol)) 61 | print(length(unique(symbols$Suggested.Symbol))) 62 | 63 | symbols <- make.unique(symbols$Suggested.Symbol) # keep only suggested symbols for renaming 64 | 65 | 66 | # rename 67 | 68 | if (nrow(data) == length(symbols)) { 69 | if (!all(is.na(data[[data@active.assay]]@counts))) { 70 | rownames(data[[data@active.assay]]@counts) <- symbols 71 | } 72 | if (!all(is.na(data[[data@active.assay]]@data))) { 73 | rownames(data[[data@active.assay]]@data) <- symbols 74 | } 75 | } else { 76 | "Different number of genes. Cannot rename" 77 | } 78 | 79 | 80 | # if after this step we still have (ENSG...) genes they probably are novel transcripts or other not-so-relevant genes, thus could be deleted 81 | if (remove_ensembl == TRUE) { 82 | print(paste( 83 | grep(rownames(data), pattern = "^ENSG", value = TRUE), 84 | "will be removed", 85 | "\\n", 86 | sep = " " 87 | )) 88 | data <- subset(data, features = grep(rownames(data), pattern = "^ENSG", value = TRUE, invert = TRUE)) 89 | } 90 | 91 | # finally make sure the meta.features are correctly named 92 | data[["RNA"]]@meta.features <- data.frame(row.names = rownames(data[["RNA"]])) 93 | return(data) 94 | } 95 | -------------------------------------------------------------------------------- /utils/spatial_plot_spaniel.R: -------------------------------------------------------------------------------- 1 | plot_spaniel <- function(data_df, 2 | grob, 3 | x, 4 | y, 5 | point_colour, 6 | point_size, 7 | point_alpha){ 8 | 9 | # Inverse Y to flip the coordinates 10 | # data_df$y_inv <- 36 - data_df$y 11 | # data_df$y_inv <- data_df$y 12 | 13 | data_df[, point_size] <- if_else(data_df[, point_size] == 0, NA_real_, data_df[, point_size]) 14 | 15 | tmp_plt <- ggplot2::ggplot(data_df, 16 | ggplot2::aes_string(x, y, 17 | color = point_colour, 18 | size = point_size, 19 | alpha = point_alpha)) + 20 | ggplot2::xlim(1, 36) + 21 | ggplot2::ylim(1, 36) + 22 | # Layer 1 - Plot image 23 | ggplot2::annotation_custom(grob, 24 | xmin = 1, 25 | xmax = 36, 26 | ymin = 1, 27 | ymax = 36) + 28 | # Layer 2 - Plot points 29 | geom_point() + 30 | # Layer 3 - Join legends all with same name 31 | labs(color = "Proportion", 32 | size = "Proportion" 33 | # alpha = "Proportion" 34 | ) + 35 | # Coordinates fixed so x and y keep the proportions 36 | coord_fixed(1) + 37 | # Tune colour parameters 38 | ggplot2::scale_size_continuous(range=c(0, 3), limits = c(0, 1)) + 39 | ggplot2::scale_color_gradientn( 40 | colours = heat.colors(10, rev = TRUE), 41 | limits = c(0, 1)) + 42 | ggplot2::scale_alpha(range = c(0, 1), limits = c(0, 1)) + 43 | # Join legends into one 44 | ggplot2::guides(color = ggplot2::guide_legend(), 45 | size = ggplot2::guide_legend(), 46 | alpha = ggplot2::guide_legend() 47 | ) 48 | 49 | return(tmp_plt) 50 | } 51 | 52 | 53 | ################################################################################ 54 | ################################################################################ 55 | ############################Join Spatial plots################################## 56 | ################################################################################ 57 | ################################################################################ 58 | 59 | plt_theme <- function(plt, rm_leg) { 60 | tmp_plt <- plt + 61 | scale_fill_gradientn( 62 | colours = heat.colors(10, rev = TRUE), 63 | limits = c(0, 1)) + 64 | scale_alpha(range = c(0, 1)) + 65 | labs(title = "", 66 | fill = "Proportion") + 67 | theme(plot.margin = unit(c(0, 0, 0, 0), "cm"), 68 | plot.title = element_text(hjust = 0.5)) 69 | 70 | if (rm_leg) { 71 | tmp_plt + theme(legend.position = "none") 72 | } else { 73 | tmp_plt + theme(legend.position = "right") 74 | } 75 | } 76 | 77 | draw_title <- function(feat, 78 | title_size = 20, 79 | title_face = "bold", 80 | title_margin_t = 0, 81 | title_margin_r = 0, 82 | title_margin_b = 0, 83 | title_margin_l = 10) { 84 | title <- cowplot::ggdraw() + 85 | cowplot::draw_label( 86 | feat, 87 | fontface = 'bold', 88 | x = 0, 89 | hjust = 0 90 | ) + 91 | ggplot2::theme( 92 | # add margin on the left of the drawing canvas, 93 | # so title is aligned with left edge of first plot 94 | plot.margin = margin(0, 0, 0, 7) 95 | ) 96 | return(title) 97 | } 98 | 99 | 100 | join_seurat_spatial <- function(se_obj, 101 | plt, 102 | grp, 103 | feat, 104 | title_size = 20, 105 | title_face = "bold", 106 | title_margin_t = 0, 107 | title_margin_r = 0, 108 | title_margin_b = 0, 109 | title_margin_l = 10) { 110 | 111 | # Determine which groups are 0 112 | metadata_ds <- se_obj@meta.data 113 | slices <- unique(metadata_ds[, grp]) 114 | grp1 <- sum(metadata_ds[metadata_ds$slice == slices[[1]], feat]) 115 | grp2 <- sum(metadata_ds[metadata_ds$slice == slices[[2]], feat]) 116 | 117 | # Process individual pannels 118 | plt_1 <- suppressMessages(plt_theme(plt = plt[[1]], rm_leg = TRUE)) 119 | ### Remove dots if 0s everywhere 120 | if (grp1 == 0) { 121 | plt_1 <- suppressMessages(plt_1 + scale_alpha(range = c(0,0))) 122 | } 123 | 124 | plt_2 <- suppressMessages(plt_theme(plt = plt[[2]], rm_leg = FALSE)) 125 | ### Remove dots if 0s everywhere 126 | if (grp2 == 0) { 127 | plt_2 <- suppressMessages(plt_2 + scale_alpha(range = c(0,0))) 128 | } 129 | 130 | # Recombine plt 1 and 2 131 | tmp_plt <- cowplot::plot_grid(plt_1, 132 | plt_2, 133 | align = "hv", 134 | axis = "trbl", 135 | nrow = 1) 136 | 137 | # Add title to plot composition 138 | title <- draw_title(feat = feat, 139 | title_size = title_size, 140 | title_face = title_face, 141 | title_margin_t = title_margin_t, 142 | title_margin_r = title_margin_r, 143 | title_margin_b = title_margin_b, 144 | title_margin_l = title_margin_l) 145 | 146 | plt_arr <- cowplot::plot_grid( 147 | title, tmp_plt, 148 | ncol = 1, 149 | # rel_heights values control vertical title margins 150 | rel_heights = c(0.1, 1) 151 | ) 152 | 153 | return(plt_arr) 154 | 155 | } 156 | 157 | --------------------------------------------------------------------------------