├── Benchmarking-Integration-Methods
    ├── 1-run_harmony.R
    ├── 2-run_scanorama.py
    ├── 3-compute_lisi.R
    ├── 4-plot_lisi.R
    ├── 5-cluster_harmony.R
    ├── 6-cluster_scanorama.py
    └── 7-plot_umaps.R
├── Integration
    ├── 10_filter_annotate.R
    ├── 1_process_gene_names_objects.R
    ├── 2_find_batch_specific_genes.R
    ├── 3_normalize_each_dataset.R
    ├── 4_find_integration_features.R
    ├── 5_scale_each_dataset.R
    ├── 6_find_integration_anchors.R
    ├── 7_integration.R
    ├── 8_process_integrated_obj.R
    ├── 9_cluster_atlas.R
    └── README.md
├── Patient stratification
    ├── 1_create_proportions_dataset.R
    ├── 2_cluster_patients.R
    ├── 3_random_forest.R
    └── README.md
├── Projections
    ├── README.md
    ├── atlas_projection.Rmd
    ├── cancer_subtype_palette.rds
    └── cell_type_palette.rds
├── README.md
├── Random Forest
    ├── 1-calculate_cell_type_signatures.R
    ├── 2-create_test_sets_for_cross_validation.R
    ├── 3-run_random_forest.R
    ├── 4-evaluate_performance.R
    ├── 5-plot_figures.R
    ├── README.md
    └── data
    │   └── supp_table_2.xlsx
├── ST-breast
    ├── 1-10x_breast_QC.Rmd
    ├── 2-10x_breast_GO_enrichment.Rmd
    ├── 3-10x_breast_immune_reference.Rmd
    ├── 4-10x_breast_stratification.Rmd
    ├── RCTD-10x_breast_immune_reference.Rmd
    ├── RCTD-SPOTlight_comparison.Rmd
    └── README.md
├── ST-oropharyngeal
    ├── 1-australia_oroph_processing.Rmd
    ├── 2-australia_oroph_biological.Rmd
    ├── 3-australia_oroph_deconv.Rmd
    ├── 4-australia_oro_stratification.Rmd
    ├── 6-australia_oro_digitalFISH.Rmd
    ├── RCTD-SPOTlight_comparison.Rmd
    ├── RCTD-australia_oroph_deconv.Rmd
    ├── README.md
    ├── make_oroph_deconv.cmd
    └── spotlight_deconv_oroph_job.R
├── misc
    ├── SPOTlight_deconvolution_job.R
    ├── atlas.png
    ├── col_df_scrpt.R
    └── paths.R
└── utils
    ├── bin.r
    ├── process_gene_names.R
    └── spatial_plot_spaniel.R


/Benchmarking-Integration-Methods/1-run_harmony.R:
--------------------------------------------------------------------------------
 1 | # This script computes harmony for the TICA atlas and saves the resulting
 2 | # batch-corrected PCs and Seurat object
 3 | 
 4 | 
 5 | # Load packages
 6 | library(Seurat)
 7 | library(tidyverse)
 8 | library(harmony)
 9 | library(SeuratWrappers)
10 | 
11 | 
12 | # Load data
13 | tica <- readRDS("data/TICAtlas.rds")
14 | 
15 | 
16 | # Save Seurat-corrected PCA coordinates
17 | saveRDS(tica@reductions$pca@cell.embeddings, "results/Seurat_v3_corrected_pca_coordinates.rds")
18 | 
19 | 
20 | # Run Harmony
21 | DefaultAssay(tica) <- "RNA"
22 | tica <- tica %>%
23 |   FindVariableFeatures(nfeatures = 2000, verbose = FALSE, selection.method = "vst") %>%
24 |   ScaleData() %>%
25 |   RunPCA()
26 | print(tica@reductions)
27 | tica <- RunHarmony(tica, dims = 1:50, reduction = "pca", group.by.vars = "source")
28 | tica <- RunUMAP(tica, dims = 1:50, reduction = "harmony")
29 | tica <- FindNeighbors(tica, dims = 1:50, reduction = "harmony")
30 | 
31 | 
32 | # Save
33 | saveRDS(tica@reductions$pca@cell.embeddings, "results/uncorrected_pca_coordinates.rds")
34 | write_csv(as.data.frame(tica@reductions$pca@cell.embeddings), "results/uncorrected_pca_coordinates.csv", col_names = TRUE)
35 | saveRDS(tica@reductions$harmony@cell.embeddings, "results/harmony_corrected_pca_coordinates.rds")
36 | write_csv(rownames_to_column(tica@meta.data, var = "cell_barcode"), "results/tica_metadata.csv", col_names = TRUE)
37 | saveRDS(tica, "results/tica_with_harmony.rds")
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/2-run_scanorama.py:
--------------------------------------------------------------------------------
 1 | # This script runs scanorama to obtain a batch-corrected dimensionality reduction matrix.
 2 | # We will follow the pipeline described in https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/scanpy/scanpy_03_integration.html
 3 | 
 4 | # Import modules
 5 | import numpy as np
 6 | import pandas as pd
 7 | import scanpy as sc
 8 | import anndata
 9 | import scanorama
10 | 
11 | 
12 | # Read data
13 | path_to_data = "data/TICAtlas.h5ad"
14 | tica = sc.read_h5ad(path_to_data)
15 | 
16 | 
17 | # Find highly variable genes
18 | sc.pp.highly_variable_genes(tica, min_mean = 0.0125, max_mean = 3, min_disp = 0.5, batch_key = 'source')
19 | var_genes = tica.var.highly_variable
20 | var_genes = list(var_genes.index[var_genes])
21 | 
22 | 
23 | # Run Scanorama
24 | batches = tica.obs['source'].cat.categories.tolist()
25 | alldata = {}
26 | for batch in batches:
27 |     alldata[batch] = tica[tica.obs['source'] == batch, var_genes]
28 | tica_list = list(alldata.values())
29 | scanorama.integrate_scanpy(tica_list, dimred = 50)
30 | tica_list[0].obsm['X_scanorama'].shape
31 | scanorama_int = [ad.obsm['X_scanorama'] for ad in tica_list]
32 | cell_barcodes = []
33 | for ad in tica_list:
34 |     cell_barcodes.extend(ad.obs_names)
35 | tica = tica[cell_barcodes, :]
36 | all_s = np.concatenate(scanorama_int)
37 | print(all_s.shape)
38 | tica.obsm["Scanorama"] = all_s
39 | #cell_barcodes = []
40 | #for ad in tica_list:
41 | #    cell_barcodes.extend(ad.obs_names)
42 | #tica = tica[cell_barcodes, :]
43 | 
44 | 
45 | # Run UMAP
46 | sc.pp.neighbors(tica, n_pcs = 50, use_rep = "Scanorama")
47 | sc.tl.umap(tica)
48 | 
49 | 
50 | # Save
51 | tica.__dict__['_raw'].__dict__['_var'] = tica.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
52 | tica.write_h5ad("results/TICAtlas_scanorama3.h5ad")
53 | dimred = tica.obsm["Scanorama"]
54 | dimred_df = pd.DataFrame(dimred)
55 | dimred_df.columns = ["Scanorama_{}".format(str(x + 1)) for x in dimred_df.columns]
56 | dimred_df.insert(0, "cell_barcode", tica.obs_names, True)
57 | dimred_df.to_csv("results/Scanorama_corrected_pca_coordinates3.csv", index = False)
58 | umap_df = pd.DataFrame(tica.obsm["X_umap"])
59 | umap_df.columns = ["UMAP1", "UMAP2"]
60 | umap_df.insert(0, "cell_barcode", tica.obs_names, True)
61 | umap_df.to_csv("results/Scanorama_corrected_umap_coordinates3.csv", index = False)
62 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/3-compute_lisi.R:
--------------------------------------------------------------------------------
 1 | # This script computes the Local Inverse Simpson Index for the following cases:
 2 | # 1. Unintegrated PCA
 3 | # 2. Seurat v3-corrected PCA
 4 | # 3. Harmony-corrected PCA
 5 | # 4. Scanorama-corrected PCA
 6 | 
 7 | 
 8 | # Load packages
 9 | library(tidyverse)
10 | library(lisi)
11 | 
12 | 
13 | # Load data
14 | tica_metadata <- read_csv("results/tica_metadata.csv", col_names = TRUE)
15 | uncorrected_coords <- readRDS("results/uncorrected_pca_coordinates.rds")
16 | seurat_coords <- readRDS("results/Seurat_v3_corrected_pca_coordinates.rds")
17 | harmony_coords <- readRDS("results/harmony_corrected_pca_coordinates.rds")
18 | scanorama_coords <- read_csv("results/Scanorama_corrected_pca_coordinates3.csv")
19 | selected_cols <- str_subset(colnames(scanorama_coords), "^Scanorama_")
20 | scanorama_coords_mat <- scanorama_coords[, selected_cols]
21 | scanorama_coords_mat <- as.matrix(scanorama_coords_mat)
22 | rownames(scanorama_coords_mat) <- scanorama_coords$cell_barcode
23 | scanorama_coords_mat <- scanorama_coords_mat[rownames(harmony_coords), ]
24 | 
25 | 
26 | # Define technical and biological variables
27 | meta_data <- data.frame(source = tica_metadata$source)
28 | rownames(meta_data) <- tica_metadata$cell_barcode
29 | if (all(rownames(meta_data) == rownames(uncorrected_coords))) {
30 |   print("row names are equal")
31 | } else {
32 |   warning("row names are not equal!")
33 | }
34 | 
35 | 
36 | # Compute LISI
37 | dim_red_mats <- list(
38 |   uncorrected_coords,
39 |   seurat_coords,
40 |   harmony_coords,
41 |   scanorama_coords_mat
42 | )
43 | names(dim_red_mats) <- c("uncorrected", "Seurat v3", "Harmony",
44 |                          "Scanorama")
45 | lisi_scores <- purrr::map(dim_red_mats, function(mat) {
46 |   scores <- compute_lisi(
47 |     X = mat,
48 |     meta_data = meta_data,
49 |     label_colnames = "source"
50 |   )
51 | })
52 | lisi_scores <- bind_rows(lisi_scores, .id = "correction")
53 | 
54 | 
55 | # Save
56 | saveRDS(lisi_scores, "results/lisi_scores.rds")
57 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/4-plot_lisi.R:
--------------------------------------------------------------------------------
 1 | # This script plots the Local Inverse Simpson Index (LISI)
 2 | 
 3 | 
 4 | # Load packages
 5 | library(tidyverse)
 6 | library(ggridges)
 7 | 
 8 | 
 9 | # Load data frame
10 | lisi_df <- readRDS("results/lisi_scores.rds")
11 | 
12 | 
13 | # Plot LISI
14 | sorted_corrections <- c("uncorrected", "Seurat v3", "Harmony", "Scanorama")
15 | palette <- c("#999999", "#92e7df", "#612c63", "#e5624f")
16 | lisi_scores_gg <- lisi_df %>%
17 |   mutate(correction = factor(correction, levels = rev(sorted_corrections))) %>%
18 |   ggplot(aes(source, correction, fill = correction)) +
19 |     geom_density_ridges(
20 |       quantile_lines = TRUE,
21 |       quantile_fun = median
22 |     ) +
23 |     # geom_violin() +
24 |     scale_fill_manual(values = palette) +
25 |     labs(x = "iLISI", y = "") +
26 |     theme_classic() +
27 |     theme(
28 |       legend.position = "none",
29 |       axis.title.x = element_text(color = "black", size = 13),
30 |       axis.text.y = element_text(color = "black", size = 12),
31 |       axis.text.x = element_text(size = 11)
32 |     )
33 | 
34 | 
35 | # Save
36 | ggsave(
37 |   filename = "results/plots/iLISI_TICAtlas.pdf",
38 |   plot = lisi_scores_gg,
39 |   width = 14,
40 |   height = 8,
41 |   units = "cm"
42 | )
43 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/5-cluster_harmony.R:
--------------------------------------------------------------------------------
 1 | # This script clusters the cells in the TICAtlas at varying resolutions using
 2 | # the Harmony-corrected PC as features.
 3 | 
 4 | 
 5 | # Load packages
 6 | library(tidyverse)
 7 | library(Seurat)
 8 | 
 9 | 
10 | # Load Seurat object
11 | tica <- readRDS("results/tica_with_harmony.rds")
12 | 
13 | 
14 | # Cluster
15 | tica <- FindClusters(tica, resolution = c(0.01, 0.05, 0.075, 0.1, 0.2))
16 | 
17 | 
18 | # Save UMAP coordinates and clusters
19 | harmony_df <- data.frame(
20 |   cell_barcode = colnames(tica),
21 |   UMAP1 = tica@reductions$umap@cell.embeddings[, "UMAP_1"],
22 |   UMAP2 = tica@reductions$umap@cell.embeddings[, "UMAP_2"],
23 |   cell_type = tica$cell_type,
24 |   cancer_subtype = tica$subtype,
25 |   cluster_res_0.01 = tica$RNA_snn_res.0.01,
26 |   cluster_res_0.05 = tica$RNA_snn_res.0.05,
27 |   cluster_res_0.075 = tica$RNA_snn_res.0.075,
28 |   cluster_res_0.1 = tica$RNA_snn_res.0.1,
29 |   cluster_res_0.2 = tica$RNA_snn_res.0.2
30 | )
31 | saveRDS(harmony_df, "results/umap_harmony_with_clusters.rds")
32 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/6-cluster_scanorama.py:
--------------------------------------------------------------------------------
 1 | # This script clusters the cells in the TICAtlas at varying resolutions using
 2 | # the Scanorama-corrected PC as features.
 3 | 
 4 | 
 5 | # Load packages
 6 | import numpy as np
 7 | import pandas as pd
 8 | import scanpy as sc
 9 | import anndata
10 | 
11 | 
12 | # Read data
13 | path_to_data = "results/TICAtlas_scanorama3.h5ad"
14 | tica = sc.read_h5ad(path_to_data)
15 | 
16 | 
17 | 
18 | # Cluster at varying resolutions
19 | for res in [0.25, 0.5, 0.75, 1]:
20 | 	sc.tl.leiden(tica, resolution = res)
21 | 	tica.obs["leiden_res{}".format(res)] = tica.obs["leiden"]
22 | 
23 | 
24 | 
25 | # Save UMAP coords and clusters to visualize in a downstream script
26 | scanorama_df = {
27 | 	"cell_barcode": tica.obs.index,
28 | 	"UMAP1": tica.obsm["X_umap"][:, 0],
29 | 	"UMAP2": tica.obsm["X_umap"][:, 1],
30 | 	"cell_type": tica.obs["cell_type"],
31 | 	"cancer_subtype": tica.obs["subtype"],
32 | 	"leiden_res0.25": tica.obs["leiden_res0.25"],
33 | 	"leiden_res0.5": tica.obs["leiden_res0.5"],
34 | 	"leiden_res0.75": tica.obs["leiden_res0.75"],
35 | 	"leiden_res1": tica.obs["leiden_res1"]
36 | }
37 | scanorama_df = pd.DataFrame(scanorama_df)
38 | scanorama_df.to_csv("results/umap_scanorama_with_clusters.csv", index = False)
39 | 


--------------------------------------------------------------------------------
/Benchmarking-Integration-Methods/7-plot_umaps.R:
--------------------------------------------------------------------------------
  1 | # This script plots the UMAP for Harmony and Scanorama integration of the
  2 | # TICAtlas
  3 | 
  4 | 
  5 | # Load packages
  6 | library(Seurat)
  7 | library(tidyverse)
  8 | 
  9 | 
 10 | # Read data
 11 | umap_harmony_df <- readRDS("results/umap_harmony_with_clusters.rds")
 12 | umap_scanorama_df <- read_csv("results/umap_scanorama_with_clusters.csv")
 13 | umap_scanorama_df <- as.data.frame(umap_scanorama_df)
 14 | rownames(umap_scanorama_df) <- umap_scanorama_df$cell_barcode
 15 | umap_scanorama_df <- umap_scanorama_df[rownames(umap_harmony_df), ]
 16 | all(rownames(umap_harmony_df) == rownames(umap_scanorama_df))
 17 | umap_scanorama_df$cell_type <- umap_harmony_df$cell_type
 18 | 
 19 | 
 20 | # UMAP colored by cell type
 21 | cell_type_palette <- readRDS("data/cell_type_palette.rds")
 22 | dfs <- list(harmony = umap_harmony_df, scanorama = umap_scanorama_df)
 23 | umaps_cell_type <- purrr::map(dfs, function(df) {
 24 |   p <- df %>%
 25 |     ggplot(aes(UMAP1, UMAP2, color = cell_type)) +
 26 |       geom_point(size = 0.1) +
 27 |       scale_color_manual(values = cell_type_palette) +
 28 |       theme_classic() +
 29 |       theme(
 30 |         legend.position = "none",
 31 |         axis.title = element_blank(),
 32 |         axis.text = element_blank(),
 33 |         axis.ticks = element_blank(),
 34 |         axis.line = element_blank()
 35 |       )
 36 |   p
 37 | })
 38 | 
 39 | 
 40 | # UMAP colored by cancer subtype
 41 | cancer_subtype_palette <- readRDS("data/cancer_subtype_palette.rds")
 42 | cancer_subtype_palette <- c(cancer_subtype_palette, CM = "#ee6a50")
 43 | umaps_cancer_subtype <- purrr::map(dfs, function(df) {
 44 |   p <- df %>%
 45 |     ggplot(aes(UMAP1, UMAP2, color = cancer_subtype)) +
 46 |       geom_point(size = 0.1) +
 47 |       scale_color_manual(values = cancer_subtype_palette) +
 48 |       theme_classic() +
 49 |       theme(
 50 |         legend.position = "none",
 51 |         axis.title = element_blank(),
 52 |         axis.text = element_blank(),
 53 |         axis.ticks = element_blank(),
 54 |         axis.line = element_blank()
 55 |       )
 56 |   p
 57 | })
 58 | 
 59 | 
 60 | # UMAPs colored by cluster
 61 | # Harmony
 62 | cluster_vars <- str_subset(colnames(dfs$harmony), "cluster_res_")
 63 | umap_cluster_harmony <- purrr::map(cluster_vars, function(x) {
 64 |   p <- umap_harmony_df %>%
 65 |     ggplot(aes_string("UMAP1", "UMAP2", color = x)) +
 66 |       geom_point(size = 0.1) +
 67 |       ggtitle(str_remove(x, "cluster_res_")) +
 68 |       theme_classic() +
 69 |       theme(plot.title = element_text(hjust = 0.5, size = 13, face = "bold"))
 70 |   p
 71 | })
 72 | 
 73 | fct_cells_cell_type <- umap_harmony_df %>%
 74 |   select("cell_type", "cluster_res_0.05") %>%
 75 |   group_by(cell_type, cluster_res_0.05) %>%
 76 |   summarize(n_cells = n()) %>%
 77 |   ungroup() %>% 
 78 |   group_by(cluster_res_0.05) %>%
 79 |   mutate(n_cells_total = sum(n_cells), fct_cells = n_cells / n_cells_total)
 80 | 
 81 | fct_cells_cell_type_gg <- fct_cells_cell_type %>%
 82 |   ggplot(aes(cluster_res_0.05, fct_cells, fill = cell_type)) +
 83 |   geom_col() +
 84 |   scale_fill_manual(values = cell_type_palette) +
 85 |   labs(x = "Cluster (Harmony)", y = "", fill = "") +
 86 |   theme_classic()
 87 | 
 88 | 
 89 | # Scanorama
 90 | 
 91 | 
 92 | # Save
 93 | for (x in names(dfs)) {
 94 |   # Save umap colored by cell type
 95 |   path_save_umap1 <- str_c(
 96 |     "results/plots/TICAtlas_umap_colored_cell_type_",
 97 |     x,
 98 |     ".pdf",
 99 |     sep = ""
100 |   )
101 |   ggsave(
102 |     path_save_umap1,
103 |     plot = umaps_cell_type[[x]],
104 |     width = 9,
105 |     height = 9,
106 |     units = "cm"
107 |   )
108 |   
109 |   
110 |   # Save umap colored by cell type
111 |   path_save_umap2 <- str_c(
112 |     "results/plots/TICAtlas_umap_colored_cancer_subtype_",
113 |     x,
114 |     ".pdf",
115 |     sep = ""
116 |   )
117 |   ggsave(
118 |     path_save_umap2,
119 |     plot = umaps_cancer_subtype[[x]],
120 |     width = 9,
121 |     height = 9,
122 |     units = "cm"
123 |   )
124 | }
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/Integration/10_filter_annotate.R:
--------------------------------------------------------------------------------
 1 | # filter out non-immune clusters (clusters 15, 16, 20 and 34) and rename clusters (annotate them)
 2 | 
 3 | library(Seurat)
 4 | library(tidyverse)
 5 | # library(future)
 6 | # plan("multicore")
 7 | # options(future.globals.maxSize = Inf)
 8 | 
 9 | atlas <- readRDS("output/integrated_clustered.rds")
10 | 
11 | # remove non-immune cells
12 | atlas <- subset(atlas, seurat_clusters %in% c("15", "16", "20", "34"), invert = TRUE)
13 | 
14 | atlas$seurat_clusters <- as.factor(as.numeric(as.character(atlas$seurat_clusters)))
15 | Idents(atlas) <- "seurat_clusters"
16 | levels(atlas)
17 | 
18 | new_ids <- c("CD4 memory stem cells", "CD8 exhausted", "CD8 cytotoxic", "CD4 memory stem cells", "CD4 resident effector memory", "T regs", "B cells", "TAMs C1QC", "NK", "CD4 effector memory RA",
19 |              "Tumor infiltrating Monocytes", "CD4 activated", "Proliferative T cells", "CD4 transitional memory", "CD4 effector memory RA", "DC2 CD1C+", "CD8 IFN activated", "CD8 activated T cells",
20 |              "Plasma B cells", "Macrophages SPP1", "Plasma B cells", "DC4 CD1C-", "T helper/Th17", "pDC", "DC3 LAMP3", "Macrophages proliferative", "Macrophages CXCL10",
21 |              "Mast cells", "DC1", "Proliferative B cells", "Plasma blasts"
22 |              )
23 | 
24 | names(new_ids) <- levels(atlas)
25 | 
26 | atlas <- RenameIdents(atlas, new_ids)
27 | 
28 | atlas$new_annot <- Idents(atlas)
29 | 
30 | ggsave(plot = DimPlot(atlas, cols = as.vector(pals::polychrome())), filename = "atlas_plot.png", width = 10)
31 | 
32 | saveRDS(atlas, "output/integrated_renamed_filtered.rds")


--------------------------------------------------------------------------------
/Integration/1_process_gene_names_objects.R:
--------------------------------------------------------------------------------
 1 | # run iteratively for each dataset that will be integrated
 2 | # uses the "process_gene_names.R" function to correct gene names
 3 | # and saves each dataset in a new file
 4 | # takes as arguments each dataset path and its abbreviated name
 5 | 
 6 | source("utils/process_gene_names.R")
 7 | 
 8 | args = commandArgs(TRUE)
 9 | data = args[1]
10 | name = args[2]
11 | 
12 | print(name)
13 | 
14 | data <- readRDS(data)
15 | 
16 | # make sure we only have the RNA assay
17 | DefaultAssay(data) <- "RNA"
18 | if ( length(Assays(data)) > 1) {data[["SCT"]] <- NULL}
19 | 
20 | print(data)
21 | 
22 | processed <- process_gene_names(data)
23 | 
24 | print(processed)
25 | 
26 | saveRDS(processed, paste0("output/", name, "_filtered_genes.rds"))
27 | 


--------------------------------------------------------------------------------
/Integration/2_find_batch_specific_genes.R:
--------------------------------------------------------------------------------
 1 | # to improve the integration, we limit the genes used to compute the anchors later on
 2 | # we will remove the top genes that are specific for each batch
 3 | # in our case the batch is in the "source" variable
 4 | 
 5 | library(Seurat)
 6 | library(tidyverse)
 7 | 
 8 | source("code/functions/load_objects.R")
 9 | 
10 | obj_list <- load_objects(directory="output/",
11 |                          extension="_filtered_genes.rds")
12 | 
13 | merged <- merge(obj_list[[1]],
14 |                 obj_list[2:length(obj_list)])
15 | 
16 | # to keep the consistency we are going to rename the cells
17 | 
18 | merged <- RenameCells(merged,
19 |                       new.names =paste("cell", 1:ncol(merged),
20 |                                        sep = "_"))
21 | 
22 | merged <- merged %>%
23 |   NormalizeData() %>%
24 |   FindVariableFeatures(nfeatures = 5000) %>%
25 |   ScaleData()
26 | 
27 | # set the batches (source) as idents
28 | Idents(merged) <- "source"
29 | 
30 | # get markers for each batch, limting cells per ident to reduce computing time
31 | markers <- FindAllMarkers(merged,
32 |                           verbose = TRUE,
33 |                           only.pos = TRUE,
34 |                           max.cells.per.ident = 5000)
35 | 
36 | # filter by adjusted p_value
37 | markers <- filter(markers, 
38 |                   p_val_adj < 0.01)
39 | 
40 | saveRDS(markers,
41 |         file = "output/markers/batch_specific_markers.rds")
42 | 


--------------------------------------------------------------------------------
/Integration/3_normalize_each_dataset.R:
--------------------------------------------------------------------------------
 1 | # process each dataset, preparing them for integration
 2 | # this scripts takes as arguments the data path for the dataset
 3 | # and the abbreviated name
 4 | # it normalizes and finds HVG of the dataset
 5 | 
 6 | library(Seurat)
 7 | library(tidyverse)
 8 | 
 9 | args <- commandArgs(TRUE)
10 | data <- args[1]
11 | name <- args[2]
12 | 
13 | print(name)
14 | 
15 | # load dataset
16 | data <- readRDS(data)
17 | 
18 | print(data)
19 | 
20 | # normalize and identify variable features
21 | 
22 | data <- data %>%
23 |   NormalizeData() %>%
24 |   FindVariableFeatures(nfeatures = 5000)
25 | 
26 | print("done!")
27 | 
28 | saveRDS(data, paste0("output/", name, "_normalized.rds"))
29 | 


--------------------------------------------------------------------------------
/Integration/4_find_integration_features.R:
--------------------------------------------------------------------------------
 1 | # scale each dataset, compute its PCs and find the integration features
 2 | 
 3 | library(Seurat)
 4 | library(tidyverse)
 5 | 
 6 | source("code/functions/load_objects.R")
 7 | 
 8 | obj_list <- load_objects(directory="output/",
 9 |                          extension="_normalized.rds")
10 | 
11 | # select features that are repeatedly variable across datasets for integration 
12 | # run PCA on each # dataset using these features
13 | features <- SelectIntegrationFeatures(obj_list, 
14 |                                       nfeatures = 5000)
15 | print(head(features))
16 | 
17 | # but remove dataset specific
18 | batch_markers <- readRDS("output/markers/batch_specific_markers.rds")
19 | batch_markers <-batch_markers %>% 
20 |   group_by(cluster) %>% 
21 |   top_n(n = 150, wt = avg_log2FC)
22 | 
23 | table(batch_markers$gene %in% features)
24 | 
25 | features <- setdiff(features, batch_markers$gene)
26 | print(length(features))
27 | 
28 | # save selected features
29 | saveRDS(features, "output/integration_features.rds")


--------------------------------------------------------------------------------
/Integration/5_scale_each_dataset.R:
--------------------------------------------------------------------------------
 1 | # scale each dataset and compute its PCs
 2 | # using the selected integration features
 3 | 
 4 | library(Seurat)
 5 | library(tidyverse)
 6 | 
 7 | args = commandArgs(TRUE)
 8 | data = args[1]
 9 | name = args[2]
10 | 
11 | print(name)
12 | 
13 | # load dataset
14 | 
15 | data <- readRDS(data)
16 | 
17 | # load selected integration features
18 | 
19 | features <- readRDS("output/integration_features.rds")
20 | print(length(features))
21 | 
22 | 
23 | # normalize and identify variable features 
24 | 
25 | data <- data %>% 
26 |   ScaleData(features = features) %>% 
27 |   RunPCA(features = features)
28 | 
29 | print("done!")
30 | 
31 | saveRDS(data, paste0("output/", name, "_scaled.rds"))


--------------------------------------------------------------------------------
/Integration/6_find_integration_anchors.R:
--------------------------------------------------------------------------------
 1 | # find the anchors to perform the integration
 2 | 
 3 | library(Seurat)
 4 | library(tidyverse)
 5 | 
 6 | source("code/functions/load_objects.R")
 7 | 
 8 | obj_list <- load_objects(
 9 |   directory = "output/",
10 |   extension = "_scaled.rds"
11 | )
12 | 
13 | # load selected integration features
14 | 
15 | features <- readRDS("output/integration_features.rds")
16 | 
17 | anchors <- FindIntegrationAnchors(object.list = obj_list, 
18 |                                   normalization.method = "LogNormalize",
19 |                                   anchor.features = features, 
20 |                                   reduction = "rpca")
21 | 
22 | # save anchors
23 | saveRDS(anchors, "output/anchors.rds")
24 | 


--------------------------------------------------------------------------------
/Integration/7_integration.R:
--------------------------------------------------------------------------------
 1 | # integrate the datasets
 2 | 
 3 | library(Seurat)
 4 | 
 5 | # load anchors
 6 | anchors <- readRDS("output/anchors.rds")
 7 | 
 8 | integrated <- IntegrateData(anchorset = anchors)
 9 | 
10 | rm(anchors)
11 | gc()
12 | 
13 | # save integrated dataset
14 | saveRDS(integrated, "output/integrated.rds")


--------------------------------------------------------------------------------
/Integration/8_process_integrated_obj.R:
--------------------------------------------------------------------------------
 1 | # integrated analysis on all cells!
 2 |   
 3 | library(Seurat)
 4 | library(tidyverse)
 5 | # library(future)
 6 | # plan("multiprocess")
 7 | # options(future.globals.maxSize = Inf)
 8 | 
 9 | atlas <- readRDS("output/integrated.rds")
10 | 
11 | DefaultAssay(atlas) <- "integrated"
12 | 
13 | atlas
14 |   
15 | # run the standard workflow on the integrated assay
16 | 
17 | atlas <- atlas %>% 
18 |   ScaleData() %>% 
19 |   RunPCA(npcs = 30) %>% 
20 |   RunUMAP(reduction = "pca", dims = 1:30) %>% 
21 |   FindNeighbors(reduction = "pca", dims = 1:30)
22 | 
23 | saveRDS(atlas, "output/integrated_processed.rds")


--------------------------------------------------------------------------------
/Integration/9_cluster_atlas.R:
--------------------------------------------------------------------------------
 1 | # cluster atlas with optimal resolution obtained from the random forest oob
 2 | 
 3 | library(Seurat)
 4 | library(tidyverse)
 5 | # library(future)
 6 | # plan("multiprocess")
 7 | # options(future.globals.maxSize = Inf)
 8 | 
 9 | atlas <- readRDS("output/integrated_processed.rds")
10 | 
11 | atlas
12 | 
13 | atlas <- FindClusters(atlas, 
14 |                       resolution = 1.2)
15 | 
16 | saveRDS(atlas, "output/integrated_clustered.rds")
17 | 
18 | p1 <- DimPlot(atlas, 
19 |               reduction = "umap", 
20 |               group.by = "source", 
21 |               label = TRUE, 
22 |               repel = TRUE)
23 | 
24 | p2 <- DimPlot(atlas, 
25 |               reduction = "umap", 
26 |               label = TRUE, 
27 |               repel = TRUE)
28 | 
29 | plots <- p1 + p2
30 | 
31 | ggsave(plot = plots, 
32 |        filename = "output/cluster_plots.png",
33 |        dpi = 300,
34 |        width = 11)
35 | 
36 | p3 <- DimPlot(atlas, 
37 |               reduction = "umap", 
38 |               split.by = "source",
39 |               ncol = 3)
40 | 
41 | ggsave(plot = p3, 
42 |        filename = "output/cluster_plots_split.png",
43 |        dpi = 300,
44 |        height = 12,
45 |        width = 12)
46 | 
47 | table(atlas$seurat_clusters, atlas$source)


--------------------------------------------------------------------------------
/Integration/README.md:
--------------------------------------------------------------------------------
 1 | # Single-Cell Tumor Immune Atlas project: Dataset Integration
 2 | 
 3 | In this folder you can find all the files related to the creation of the atlas: all the scripts leading to the integration of the datasets and the downstream processing of the atlas (clustering, annotation, etc.).
 4 | 
 5 | The input to this pipeline (not included in this repository) are the datasets listed in Table 1 of the paper (https://www.biorxiv.org/content/10.1101/2020.10.26.354829v1) as seurat objects.
 6 | 
 7 | ## Package versions
 8 | Required packages for the code in this folder and versions used:
 9 | * [Tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/vignettes/paper.html)
10 | * [Seurat 3.2.2](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867419305598%3Fshowall%3Dtrue)
11 | 


--------------------------------------------------------------------------------
/Patient stratification/1_create_proportions_dataset.R:
--------------------------------------------------------------------------------
 1 | # this script creates the cell type proportion per
 2 | # cluster dataset that we will use for the clustering
 3 | 
 4 | # load libraries
 5 | library(tidyverse)
 6 | library(Seurat)
 7 | 
 8 | # create proportion table per patient
 9 | atlas <- readRDS("../output/integrated_renamed_filtered.rds")
10 | data <- table(atlas$patient, atlas$new_annot)
11 | data <- as.data.frame.matrix(data)
12 | data$total <- rowSums(data)
13 | 
14 | # we filter out patients with less than 500 total
15 | # cells as this could introduce bias
16 | data <- filter(data, total > 500)
17 | 
18 | for (patient in rownames(data)) {
19 |   # iterate over the cell types
20 |   for (cell in colnames(data)[1:(ncol(data)-1)]) {
21 |     # calculate patient's cell type percentage
22 |     data[patient, cell] <- round(data[patient, cell] / data[patient, "total"] * 100, 3)
23 |   }
24 | }
25 | 
26 | data <- data[, 1:(ncol(data)-1)]
27 | 
28 | # get the metadata we are interested in
29 | meta <- atlas@meta.data[, c("source", "patient", "subtype")]
30 | 
31 | # unify metadata to one row per patient
32 | meta <- unique(meta)
33 | rownames(meta) <- meta$patient
34 | 
35 | data <- merge(meta, data, by = "row.names")
36 | data$Row.names <- NULL
37 | 
38 | # remove datasets with no patient information
39 | # (they are labeled as having only one patient)
40 | data <- filter(data, !source %in% c("lung1", "lung2"))
41 | 
42 | saveRDS(data, ".../output/atlas_proportion_dataset.rds")
43 | 


--------------------------------------------------------------------------------
/Patient stratification/2_cluster_patients.R:
--------------------------------------------------------------------------------
  1 | # this script calculates the 6 patient clusters
  2 | # using the hierarchical k-means algorithm
  3 | 
  4 | ## load packages
  5 | library(tidyverse)
  6 | library(magrittr)
  7 | library(ggsci)
  8 | library(pals)
  9 | library(RColorBrewer)
 10 | library(patchwork)
 11 | library(clValid)
 12 | library(factoextra)
 13 | library(NbClust)
 14 | library(GGally)
 15 | library(dendextend)
 16 | library(Rtsne)
 17 | library(ComplexHeatmap)
 18 | library(circlize)
 19 | library(kohonen)
 20 | library(mclust)
 21 | 
 22 | df <- readRDS("output/atlas_proportion_dataset.rds")
 23 | tica_pal <- readRDS("output/tica_palettes.rds")
 24 | 
 25 | df$subtype[df$source=="liver2"] <- "HCC"
 26 | 
 27 | df <- mutate(df,
 28 |              `B cells` = `Activated B cells` + `Memory B cells` + `Naive B cells` + `Unswitched Memory B cells`,
 29 |               )
 30 | 
 31 | df[, c("Activated B cells", "Memory B cells", "Naive B cells", "Unswitched Memory B cells")] <- NULL
 32 | 
 33 | # optimal number of clusters (from 1 to 20)
 34 | clust_num <- NbClust(
 35 |   data = df[, 4:ncol(df)], distance = "euclidean", min.nc = 2,
 36 |   max.nc = 20, method = "kmeans", index = "alllong"
 37 | )
 38 | fviz_nbclust(clust_num)
 39 | 
 40 | # HIERACHICAL K-MEANS
 41 | hkmeans_cluster <- hkmeans(x = df[, 4:(ncol(df))], hc.metric = "euclidean", hc.method = "ward.D2", k = 6)
 42 | fviz_cluster(object = hkmeans_cluster, pallete = "jco", repel = TRUE)
 43 | df$cluster_kmeans_k6 <- hkmeans_cluster$cluster
 44 | 
 45 | # visualize as heatmap
 46 | df <- df[order(df$cluster_kmeans_k6), ]
 47 | mat <- as.matrix(df[, 4:(ncol(df) - 1)])
 48 | rownames(mat) <- NULL
 49 | 
 50 | row_ha <- rowAnnotation(
 51 |   cluster = df$cluster_kmeans_k6,
 52 |   col = list(cluster = c("1" = "#1F77B4FF", "2" = "#FF7F0EFF", "3" = "#2CA02CFF", "4" = "#D62728FF", "5" = "#9467BDFF", "6" = "#8C564BFF"))
 53 | )
 54 | Heatmap(mat,
 55 |         name = "composition (%)",
 56 |         cluster_rows = F, cluster_columns = FALSE,
 57 |         left_annotation = row_ha, column_names_max_height = unit(10, "cm"),
 58 |         col = colorRampPalette(brewer.pal(8, "Blues"))(50)
 59 | )
 60 | 
 61 | # visualize cancer types per cluster
 62 | ggplot(df, aes(factor(cluster_kmeans_k6))) +
 63 |   geom_bar(aes(fill = factor(subtype)), position = "fill") +
 64 |   coord_flip() +
 65 |   theme_classic() +
 66 |   labs(x = "Cluster", fill = "Cancer type", y = "Proportion of patients") +
 67 |   scale_fill_manual(values = tica_pal$cancer) +
 68 |   theme(text = element_text(size = 30))#, legend.position = "none")
 69 | 
 70 | ggplot(df, aes(factor(cluster_kmeans_k6))) +
 71 |   geom_bar(aes(fill = factor(subtype))) +
 72 |   coord_flip() +
 73 |   theme_classic() +
 74 |   labs(x = "Cluster", fill = "Cancer type", y = "Number of patients") +
 75 |   scale_fill_manual(values = tica_pal$cancer) +
 76 |   theme(text = element_text(size = 30))
 77 | 
 78 | # calculate PCA
 79 | res_pca <- prcomp(df[, 4:(ncol(df) - 1)], scale = F)
 80 | fviz_eig(res_pca)
 81 | res_ind <- get_pca_ind(res_pca)
 82 | 
 83 | # Contributions of variables to PC1 and PC2
 84 | p1 <- fviz_contrib(res_pca, choice = "var", axes = 1, top = 10, xtickslab.rt = 0) +
 85 |   theme(text = element_text(size = 35), title = element_text(size = 25), axis.text.x = element_text(vjust = 0.5, hjust = 1, angle = 90)) +
 86 |   labs(title = "PC1")
 87 | p2 <- fviz_contrib(res_pca, choice = "var", axes = 2, top = 10, xtickslab.rt = 0) +
 88 |   theme(text = element_text(size = 35), title = element_text(size = 25), axis.text.x = element_text(vjust = 0.5, hjust = 1, angle = 90)) +
 89 |   labs(title = "PC2")
 90 | p1 + p2
 91 | 
 92 | # PC contributions table
 93 | var <- get_pca_var(res_pca)
 94 | ft <- as.data.frame(var$contrib[, 1:2])
 95 | ft$Variable <- rownames(ft)
 96 | colnames(ft) <- c("PC1 (%)", "PC2 (%)", "Cell type")
 97 | ft <- ft[, c("Cell type", "PC1 (%)", "PC2 (%)")]
 98 | ft <- arrange(ft, desc(`PC1 (%)`))
 99 | ft %>%
100 |   flextable::flextable() %>%
101 |   flextable::theme_vanilla() %>%
102 |   flextable::autofit()
103 | 
104 | # visualization: TSNE PLOT
105 | tsne <- Rtsne(df[, 4:(ncol(df) - 1)], dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)
106 | 
107 | df$tsne_x <- tsne$Y[, 1]
108 | df$tsne_y <- tsne$Y[, 2]
109 | 
110 | p1 <- ggplot(df, aes(tsne_x, tsne_y, color = as.factor(cluster_kmeans_k6))) +
111 |   geom_point(size = 6) +
112 |   labs(color = "cluster", x = "tSNE_1", y = "tSNE_2") +
113 |   scale_color_d3() +
114 |   theme_classic() +
115 |   theme(
116 |     text = element_text(size = 30),
117 |     legend.position = "none"
118 |   ) +
119 |   theme(
120 |     axis.line = element_blank(),
121 |     axis.text.x = element_blank(),
122 |     axis.text.y = element_blank(),
123 |     axis.ticks = element_blank(),
124 |     axis.title.x = element_blank(),
125 |     axis.title.y = element_blank(),
126 |     legend.position = "none",
127 |     panel.background = element_blank(),
128 |     panel.border = element_blank(),
129 |     panel.grid.major = element_blank(),
130 |     panel.grid.minor = element_blank(),
131 |     plot.background = element_blank(),
132 |     plot.title = element_text(hjust = 0.5)
133 |   ) +
134 |   ggtitle("Cluster")
135 | 
136 | p2 <- ggplot(df, aes(tsne_x, tsne_y, color = as.factor(subtype))) +
137 |   geom_point(size = 6) +
138 |   labs(color = "cluster", x = "tSNE_1", y = "tSNE_2") +
139 |   theme_classic() +
140 |   theme(
141 |     text = element_text(size = 30),
142 |     legend.position = "none"
143 |   ) +
144 |   theme(
145 |     axis.line = element_blank(),
146 |     axis.text.x = element_blank(),
147 |     axis.text.y = element_blank(),
148 |     axis.ticks = element_blank(),
149 |     axis.title.x = element_blank(),
150 |     axis.title.y = element_blank(),
151 |     legend.position = "right",
152 |     panel.background = element_blank(),
153 |     panel.border = element_blank(),
154 |     panel.grid.major = element_blank(),
155 |     panel.grid.minor = element_blank(),
156 |     plot.background = element_blank(),
157 |     plot.title = element_text(hjust = 0.5)
158 |   ) +
159 |   ggtitle("Cancer type") +
160 |   scale_color_manual(values = tica_pal$cancer)
161 | 
162 | p1 + p2
163 | 
164 | saveRDS(df, "output/atlas_proportion_dataset_clustered.rds")
165 | 


--------------------------------------------------------------------------------
/Patient stratification/3_random_forest.R:
--------------------------------------------------------------------------------
 1 | # Atlas clustering dataset
 2 | # create random forest classificator
 3 | 
 4 | # load libraries
 5 | library(caTools)
 6 | library(rpart)
 7 | library(rpart.plot)
 8 | library(randomForest)
 9 | library(rattle)
10 | library(caret)
11 | library(patchwork)
12 | library(tidyverse)
13 | library(ComplexHeatmap)
14 | library(circlize)
15 | library(scales)
16 | library(RColorBrewer)
17 | library(pals)
18 | library(ggsci)
19 | 
20 | atlas <- readRDS("atlas_proportion_dataset_clustered.rds")
21 | atlas$cluster <- as.factor(atlas$cluster_kmeans_k6)
22 | atlas$cluster_kmeans_k6 <- NULL
23 | atlas$tsne_y <- NULL
24 | atlas$tsne_x <- NULL
25 | colnames(atlas) <- make.names(colnames(atlas))
26 | 
27 | # first we split the dataset into training and test
28 | split <- sample.split(atlas$cluster, SplitRatio = 0.75)
29 | training_set <- subset(atlas[, c(11:ncol(atlas))], split == TRUE)
30 | test_set <- subset(atlas[, c(11:ncol(atlas))], split == FALSE)
31 | 
32 | # then train the RF model
33 | rf <- randomForest(cluster ~ ., ntree = 1000, data = as.data.frame.data.frame(training_set), importance = TRUE, proximity = TRUE)
34 | plot(rf)
35 | 
36 | # fit model on test subset
37 | predicted_cluster <- predict(rf, test_set[1:25])
38 | # evaluate with the confusion matrix
39 | confusionMatrix(
40 |   data = predicted_cluster,
41 |   reference = test_set$cluster
42 | )
43 | 
44 | # Variable Importance
45 | varImpPlot(rf, type = 1, sort = TRUE, n.var = 15, main = "Top 15 - Variable Importance")
46 | varImpPlot(rf, type = 2, sort = TRUE, n.var = 15, main = "Top 15 - Variable Importance")
47 | 
48 | df <- as.data.frame(rf$importance[, c("MeanDecreaseAccuracy", "MeanDecreaseGini")])
49 | df$cell_type <- rownames(df)
50 | df$cell_type <- gsub("\\.", " ", df$cell_type)
51 | colnames(df) <- c("Mean Decrease Accuracy", "Mean Decrease Gini", "cell_type")
52 | df <- pivot_longer(df, cols = c("Mean Decrease Accuracy", "Mean Decrease Gini"), names_to = "measure")
53 | df <- df[order(-df$value), ]
54 | df <- df[df$cell_type %in% df$cell_type[1:15], ]
55 | df$cell_type <- as.factor(df$cell_type)
56 | df$cell_type <- factor(df$cell_type, levels = unique(df$cell_type[order(df$value)]))
57 | 
58 | ggplot(df, aes(x = cell_type, y = value)) +
59 |   geom_point(col = "tomato2", size = 5) + # Draw points
60 |   coord_flip() +
61 |   facet_grid(. ~ measure, scales = "free", labeller = label_wrap_gen(width = 10)) +
62 |   theme_minimal() +
63 |   labs(x = "", y = "") +
64 |   theme(text = element_text(size = 25)) +
65 |   theme(panel.background = element_rect(fill = NA, color = "black"))
66 | 


--------------------------------------------------------------------------------
/Patient stratification/README.md:
--------------------------------------------------------------------------------
 1 | # Single-Cell Tumor Immune Atlas project: Patient Stratification
 2 | 
 3 | These are the scripts necessary for the creation of the patients cell-proportion dataset, for the hierarchical k-means clustering of the patients and to evaluate the clustering with a random forest model trained on the clustering output.
 4 | 
 5 | ## Package versions
 6 | Required packages for the code in this folder and versions used:
 7 | 
 8 | * [Tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/vignettes/paper.html)
 9 | * [Seurat 3.2.2](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0092867419305598%3Fshowall%3Dtrue)
10 | * [MClust 5.4.6](https://doi.org/10.32614/RJ-2016-021)
11 | * [dendextend 1.14](10.1093/bioinformatics/btv428)
12 | * [magrittr 1.5](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html)
13 | * [ggsci 2.9](https://cran.r-project.org/web/packages/ggsci/vignettes/ggsci.html)
14 | * [pals 1.6](https://kwstat.github.io/pals/)
15 | * [RColorBrewer 1.1](https://cran.r-project.org/web/packages/RColorBrewer/index.html)
16 | * [patchwork 1.1.0](https://github.com/thomasp85/patchwork)
17 | * [clValid 0.5](http://dx.doi.org/10.18637/jss.v025.i04)
18 | * [factoextra 1.0.7](https://cran.r-project.org/package=factoextra)
19 | * [NbClust 3.0](http://dx.doi.org/10.18637/jss.v061.i06)
20 | * [GGally 2.0.0](https://ggobi.github.io/ggally/)
21 | * [Rtsne 0.15](https://github.com/jkrijthe/Rtsne)
22 | * [ComplexHeatmap 2.4.3](http://bioconductor.org/packages/release/bioc/html/ComplexHeatmap.html)
23 | * [circlize 0.4.10](https://doi.org/10.1093/bioinformatics/btu393)
24 | * [kohonen 3.0.10](https://cran.r-project.org/web/packages/kohonen/A)
25 | * [caTools 1.18](https://cran.r-project.org/web/packages/caTools/index.html)
26 | * [rpart 4.1](https://cran.r-project.org/web/packages/rpart/)
27 | * [rpart.plot 3.0.9](https://cran.r-project.org/web/packages/rpart.plot/)
28 | * [randomForest 4.6](https://cran.r-project.org/web/packages/randomForest/)
29 | * [rattle 5.4.0](https://cran.r-project.org/web/packages/rattle/)
30 | * [caret 6.0](http://topepo.github.io/caret/index.html)
31 | * [scales 1.1.1](https://cran.r-project.org/web/packages/scales/index.html)
32 | 


--------------------------------------------------------------------------------
/Projections/README.md:
--------------------------------------------------------------------------------
 1 | # Projection of different datasets on the atlas
 2 | 
 3 | To demonstrate the power of our atlas, we predicted the cell types on datasets from different cancer types and varying experimental designs. We make use of FindTransferAnchors utility provided by Seurat [(Stuart et al, 2019)](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for transfering cell types on TICA to the query objects. 
 4 | 
 5 | This script showcases an example for projecting external datsets (query) onto the atlas.
 6 | 
 7 | ## Dependencies
 8 | 
 9 | * [R 3.6.0](https://cran.r-project.org/)
10 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html)
11 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html)
12 | * [magritts 1.5.0](https://cloud.r-project.org/package=magrittr)
13 | * [pals 1.6.0](https://kwstat.github.io/pals/)
14 | * [flextable 0.5.10](https://davidgohel.github.io/flextable/)
15 | * [ComplexHeatmap 2.4.3](https://github.com/jokergoo/ComplexHeatmap)
16 | * [patchwork 1.0.0](https://patchwork.data-imaginist.com/)
17 | * [matchSCore2 0.1.0](https://github.com/elimereu/matchSCore2)
18 | * [RColorBrewer 1.1.2](https://cloud.r-project.org/package=RColorBrewer)
19 | 
20 | 
21 | ## Data
22 | 
23 | * Atlas Seurat object: download it as specified in the publication ([Zenodo](https://zenodo.org/record/4036020#.X5hFT4hKiHt))
24 | 
25 | * Query samples: two human uveal melanoma cancers, one human ovarian cancer, one human uveal melanoma liver metastasis, one human non-small cell lung cancer brain metastasis (including TCR) and two mice colorectal cancers (one full and one only T cells and TCR) 
26 | 
27 | [Available for download on GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158803).


--------------------------------------------------------------------------------
/Projections/atlas_projection.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Projecting data on the Single-Cell Tumor Immune Atlas"
  3 | date: "`r format(Sys.time(), '%b %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 | ---
  8 | 
  9 | ## Projecting data on the Single-Cell Tumor Immune Atlas
 10 | 
 11 | This is an example to project external datasets (query) onto the atlas. The following code generates the projection of cell types from the atlas on a brain metastasis sample from a patient with melanoma. For this sample we have available both scRNA-seq and TCR sequencing to check the clonality associated on the cell types.
 12 | 
 13 | WARNING: due to huge size this script should be run on a computational cluster
 14 | 
 15 | 
 16 | ### Load all necesary packages
 17 | 
 18 | ```{r packages, echo=FALSE, message=FALSE, warning=FALSE}
 19 | library(tidyverse)
 20 | library(Seurat)
 21 | library(magrittr)
 22 | library(pals)
 23 | library(flextable)
 24 | library(ComplexHeatmap)
 25 | library(patchwork)
 26 | library(matchSCore2)
 27 | library(RColorBrewer)
 28 | ```
 29 | 
 30 | ### Load a predefined color palette for each cell type
 31 | 
 32 | ```{r palette, echo=FALSE}
 33 | # this is the color palette, with names
 34 | values <- readRDS(file="~/cell_type_palette.rds")
 35 | ```
 36 | 
 37 | ### Load the atlas. Calculate 5000 VariableFeatures
 38 | 
 39 | ```{r load atlas}
 40 | atlas <- readRDS("downsampled_atlas.rds") # load downsampled atlas (1000 cells per cell type)
 41 | atlas <- subset(atlas, new_cell_types != "Mast cells") 
 42 | Idents(atlas) <- "new_cell_types"
 43 | atlas <- FindVariableFeatures(atlas, assay = "RNA", nfeatures = 5000) # 5000 Var.Feat for the RNA assay
 44 | ```
 45 | 
 46 | ### params$file contains the path to the Seurat object preprocessed with the TCR info on it. Basic preprocessing, normalization, scaling and dimensionality reduction was applied following Seurat's vignette (https://satijalab.org/seurat/v3.2/pbmc3k_tutorial.html).
 47 | 
 48 | ```{r load query, echo=FALSE}
 49 | query <- readRDS(params$file) # load query dataset (BrainMet P12)
 50 | Idents(query) <- query$clusters
 51 | levels(query$clusters) <- c(1:length(unique(query$clusters))) # rename clusters from 1 to n
 52 | ```
 53 | 
 54 | ### We use FindTransferAnchors utility provided by Seurat (Stuart et al, 2019) for transfering the cell types from the atlas to the query object
 55 | 
 56 | ```{r find anchors, echo=FALSE} 
 57 | anchors <- FindTransferAnchors(reference = atlas, # find anchors between query and atlas
 58 |                                query = query, 
 59 |                                dims = 1:30, 
 60 |                                normalization.method = "LogNormalize",
 61 |                                reference.assay = "RNA", 
 62 |                                query.assay = "RNA")
 63 | predictions <- TransferData(anchorset = anchors, refdata = atlas$new_cell_types, dims = 1:30) # transfer new_cell_types annot
 64 | query <- AddMetaData(query, metadata = predictions) # add to object metdata
 65 | ``` 
 66 | 
 67 | ### Show the results of the projection as a table
 68 | 
 69 | ```{r table, results = "markup", echo=FALSE}
 70 | tab <- as.data.frame.matrix(table(query$predicted.id, Idents(query))) # create table
 71 | tab$annotated_cell_type <- rownames(tab)
 72 | 
 73 | flextable(tab) %>% 
 74 |   set_header_labels(annotated_cell_type = " ") %>% 
 75 |   autofit() %>% 
 76 |   theme_vanilla() %>% 
 77 |   set_caption(paste0(toString(dim(query)[2]), " cells total"))  %>% # add number of total cells at the top
 78 |   align(align = "center", part = "all")
 79 | ```
 80 | 
 81 | ### Plot the predictions and their associated clonal size
 82 | 
 83 | ```{r plots, results = "markup", fig.height=10, fig.width=12, echo=FALSE}
 84 | # plot umap colored by original cell type, predicted cell type and clusters
 85 | query$cell_type <- Idents(query)
 86 | p1 <- DimPlot(query, group.by = "cell_type", pt.size = 1) +
 87 |   NoAxes()
 88 | p1 + ggtitle("Predicted celltype") 
 89 | 
 90 | query$umap1 <- query@reductions$umap@cell.embeddings[,1]
 91 | query$umap2 <- query@reductions$umap@cell.embeddings[,2]
 92 | aux_df <- as.data.frame(cbind(query$umap1,query$umap2,Idents(query),query$Clono_size))
 93 | colnames(aux_df) <- c("umap1","umap2","clusters","Clono_size")
 94 | aux_df$umap1 <- as.numeric(as.character(aux_df$umap1))
 95 | aux_df$umap2 <- as.numeric(as.character(aux_df$umap2))
 96 | aux_df$clusters <- as.character(aux_df$clusters)
 97 | 
 98 | p2 <- ggplot(aux_df, aes(x=umap1, y=umap2)) + geom_point(aes(color=Clono_size),size = 1.4) + ggtitle("Clono_size") +
 99 |   scale_colour_gradient(low = "grey", high = "darkgreen") +
100 |   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(colour = "black")) +
101 |   NoAxes()
102 | p2
103 | ```
104 | 
105 | 


--------------------------------------------------------------------------------
/Projections/cancer_subtype_palette.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Projections/cancer_subtype_palette.rds


--------------------------------------------------------------------------------
/Projections/cell_type_palette.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Projections/cell_type_palette.rds


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Single-Cell Tumor Immune Atlas project
 2 | Code repository for the Single-Cell Tumor Immune Atlas project.
 3 | 
 4 | Our paper is out now on [Genome Research](https://singlecellgenomics-cnag-crg.shinyapps.io/TICA/).
 5 | 
 6 | ![](misc/atlas.png)
 7 | 
 8 | We present a single cell immune atlas of the tumor microenvironment composed by over 500k cells, 217 patients,13 cancer types, for:
 9 | 
10 |  
11 | * Patient stratification 
12 | * Annotation 
13 | * Spatial mapping
14 | 
15 | You can inspect our dataset and query your own easily, by using our ShinyApp [here](https://singlecellgenomics-cnag-crg.shinyapps.io/TICA/).
16 | 
17 | 
18 | ## Abstract
19 | 
20 | The tumor immune microenvironment is a main contributor to cancer progression and a promising therapeutic target for oncology. However, immune microenvironments vary profoundly between patients and biomarkers for prognosis and treatment response lack precision. A comprehensive compendium of tumor immune cells is required to pinpoint predictive cellular states and their spatial localization. We generated a single-cell tumor immune atlas, jointly analyzing >500,000 cells from 217 patients and 13 cancer types, providing the basis for a patient stratification based on immune cell compositions. Projecting immune cells from external tumors onto the atlas facilitated an automated cell annotation system for a harmonized interpretation. To enable in situ mapping of immune populations for digital pathology, we developed, benchmarked and applied SPOTlight, a computational tool that identified striking spatial immune cell patterns in tumor sections. We expect the atlas, together with our versatile toolbox for precision oncology, to advance currently applied stratification approaches for prognosis and immuno-therapy.
21 | 
22 | 
23 | ## Code implementation
24 | 
25 | Folders and content:
26 | 
27 | * `Integration`: scripts necessary to create the atlas
28 | * `Patient stratification`: clustering of the patients and assessment by random forest
29 | * `Projections`: projecting external data onto the atlas
30 | * `Random Forest`: assessment of the clustering robustness
31 | * `ST-breast`: analysis of the spatial transcriptomics breast data
32 | * `ST-oropharyngeal`: analysis of the spatial transcriptomics oropharyngeal data
33 | 
34 | See the `README.md` files in each directory for a full description.
35 | 
36 | ## Getting the data
37 | The Single-Cell Tumor Immune Atlas dataset is open access on [Zenodo](https://zenodo.org/record/4263972).
38 | 
39 | ## Getting the code
40 | 
41 | You can download a copy of all the files in this repository by cloning the
42 | [git](https://git-scm.com/) repository:
43 | 
44 |     git clone https://github.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas.git
45 | 
46 | or [download a zip archive](https://github.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/archive/main.zip).
47 | 
48 | ## Dependencies
49 | 
50 | The dependencies for each part of the analysis are listed in each of the folders.
51 | 


--------------------------------------------------------------------------------
/Random Forest/1-calculate_cell_type_signatures.R:
--------------------------------------------------------------------------------
 1 | # This script computes 25 cell type-specific signatures for each cell, based on the
 2 | # markers provided in the Supplementary Table 2. It also calculates signatures
 3 | # defined with random genes that will help us assess the significance of the classifier.
 4 | # Finally, it saves the dataframe that will be the input for the random forest.
 5 | 
 6 | 
 7 | # Load packages
 8 | library(Seurat)
 9 | library(readxl)
10 | library(tidyverse)
11 | 
12 | 
13 | # Load data
14 | tica <- readRDS("path_to_TICA_seurat_object")
15 | path_to_excel <- "data/supp_table_2.xlsx"
16 | sheets <- excel_sheets(path_to_excel)
17 | signatures_dfs <- purrr::map(sheets, ~ read_excel(path_to_excel, sheet = .x))
18 | names(signatures_dfs) <- str_replace_all(sheets, " ", "_")
19 | signatures_tica <- purrr::map(signatures_dfs, "gene")
20 | 
21 | 
22 | # Create dataframe to run the random forest (signatures)
23 | DefaultAssay(tica) <- "integrated"
24 | tica@assays$RNA <- NULL
25 | set.seed(123)
26 | for (signature in names(signatures_tica)) {
27 |   print(signature)
28 |   start_time <- Sys.time()
29 |   features <- signatures_tica[[signature]]
30 |   features <- features[features %in% rownames(tica)]
31 |   tica <- AddModuleScore(
32 |     tica,
33 |     features = list(features),
34 |     name = str_c(signature, "signature", sep = "_")
35 |   )
36 |   random_features <- sample(
37 |     rownames(tica),
38 |     size = length(features),
39 |     replace = FALSE
40 |   )
41 |   tica <- AddModuleScore(
42 |     tica,
43 |     features = list(random_features),
44 |     name = str_c(signature, "random", sep = "_")
45 |   )
46 |   end_time <- Sys.time()
47 |   print(end_time - start_time)
48 |   print(head(tica@meta.data))
49 | }
50 | keep_cols_sign <- c(
51 |   str_subset(colnames(tica@meta.data), "signature"),
52 |   str_subset(colnames(tica@meta.data), "random"),
53 |   "new_cell_types"
54 | )
55 | tica_df_signatures <- tica@meta.data[, keep_cols_sign]
56 | tica_df_signatures$barcode <- colnames(tica)
57 | 
58 | 
59 | # Save
60 | dir.create("tmp", showWarnings = FALSE)
61 | saveRDS(tica_df_signatures, "tmp/tica_input_dataframe_signatures_random_forest.rds")
62 | 


--------------------------------------------------------------------------------
/Random Forest/2-create_test_sets_for_cross_validation.R:
--------------------------------------------------------------------------------
 1 | # This script creates the test sets for the 5-fold cross-validation.
 2 | # We ensure that each cell type is represented in each test set. Thus, we take
 3 | # 20% of cells for each cell type in each fold. Each cell in the dataset is found
 4 | # in 1 test set and 4 training sets.
 5 | 
 6 | 
 7 | # Load packages
 8 | library(tidyverse)
 9 | 
10 | 
11 | # Load data
12 | tica_df_signatures <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds")
13 | 
14 | 
15 | # Create test sets in each fold
16 | cells <- tica_df_signatures$barcode
17 | cell_types <- levels(tica_df_signatures$new_cell_types)
18 | test_cells_list <- list(
19 |   fold1 = c(),
20 |   fold2 = c(),
21 |   fold3 = c(),
22 |   fold4 = c(),
23 |   fold5 = c()
24 | )
25 | for (cell_type in cell_types) {
26 |   print(cell_type)
27 |   cells <- tica_df_signatures[tica_df_signatures$new_cell_types == cell_type, "barcode"]
28 |   steps_cross_valid <- round(
29 |     seq(
30 |       from = 1,
31 |       to = length(cells),
32 |       by = (length(cells) / 5)
33 |     ),
34 |     0
35 |   )
36 |   test_cells_list[["fold1"]] <- c(
37 |     test_cells_list[["fold1"]],
38 |     cells[steps_cross_valid[1]:steps_cross_valid[2]]
39 |   )
40 |   test_cells_list[["fold2"]] <- c(
41 |     test_cells_list[["fold2"]],
42 |     cells[(steps_cross_valid[2] + 1):steps_cross_valid[3]]
43 |   )
44 |   test_cells_list[["fold3"]] <- c(
45 |     test_cells_list[["fold3"]],
46 |     cells[(steps_cross_valid[3] + 1):steps_cross_valid[4]]
47 |   )
48 |   test_cells_list[["fold4"]] <- c(
49 |     test_cells_list[["fold4"]],
50 |     cells[(steps_cross_valid[4] + 1):steps_cross_valid[5]]
51 |   )
52 |   test_cells_list[["fold5"]] <- c(
53 |     test_cells_list[["fold5"]],
54 |     cells[(steps_cross_valid[5] + 1):length(cells)]
55 |   )
56 | }
57 | saveRDS(test_cells_list, "tmp/test_set_per_fold.rds")
58 | 
59 | 


--------------------------------------------------------------------------------
/Random Forest/3-run_random_forest.R:
--------------------------------------------------------------------------------
  1 | # This script trains and tests a random forest classifier on a specific fold
  2 | # of the cross validation specified in previous scripts. It saves both the trained model and
  3 | # the predicted labels.
  4 | start_time <- Sys.time()
  5 | 
  6 | 
  7 | # Load packages
  8 | library(Seurat)
  9 | library(tidyverse)
 10 | library(randomForest)
 11 | library(doParallel)
 12 | library(foreach)
 13 | library(readxl)
 14 | 
 15 | 
 16 | # Load data
 17 | tica_df <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds")
 18 | test_cells_list <- readRDS("tmp/test_set_per_fold.rds")
 19 | print("Data loaded successfully!")
 20 | 
 21 | 
 22 | # Separate datframe in real and random signatures
 23 | signatures_cols <- c(
 24 |   str_subset(colnames(tica_df), "signature"),
 25 |   "new_cell_types"
 26 | )
 27 | random_cols <- c(
 28 |   str_subset(colnames(tica_df), "random"),
 29 |   "new_cell_types"
 30 | )
 31 | tica_df_signatures <- tica_df[, signatures_cols]
 32 | tica_df_random <- tica_df[, random_cols]
 33 | print("Separated real and random signatures")
 34 | 
 35 |   
 36 | # Process command-line arguments and subset to correct fold
 37 | args <- commandArgs(trailingOnly = TRUE)
 38 | fold <- args[[1]]
 39 | test_cells <- test_cells_list[[fold]]
 40 | 
 41 | 
 42 | # Divide cells in training and test sets
 43 | test_df_signatures <- tica_df_signatures[test_cells, ]
 44 | test_df_random <- tica_df_random[test_cells, ]
 45 | training_cells <- rownames(tica_df)[!(rownames(tica_df) %in% test_cells)]
 46 | training_df_signatures <- tica_df_signatures[training_cells, ]
 47 | training_df_random <- tica_df_random[training_cells, ]
 48 | print("Divided training and test sets")
 49 | 
 50 | 
 51 | # Train random forest
 52 | # ## Set parallelization backend
 53 | total_trees <- 600
 54 | parallel_trees <- 100
 55 | ncpu <- round(total_trees / parallel_trees, 0)
 56 | doParallel::registerDoParallel(ncpu)
 57 | 
 58 | 
 59 | ########################################################################
 60 | ############################## SIGNATURES ##############################
 61 | ########################################################################
 62 | 
 63 | # Run n (ncpu) forest of ntrees (parallel_trees) each: signatures
 64 | rf_mod_signatures <- foreach::foreach(ntree = rep(parallel_trees, ncpu),
 65 |                            # Indicating we want to use combine from the randomForest to join all the parallelized trees
 66 |                            .combine = randomForest::combine,
 67 |                            # Indicate we need the randomForest within the ForeEach loop + dopar indicating parallelization
 68 |                            .packages="randomForest") %dopar% {
 69 |                              ## train RF model
 70 |                              model <- randomForest::randomForest(
 71 |                                new_cell_types ~ .,
 72 |                                data = training_df_signatures,
 73 |                                type = "classification",
 74 |                                # Keep importance in the RF object
 75 |                                importance = FALSE,
 76 |                                # How many trees to build
 77 |                                ntree = ntree,
 78 |                                # Return update every 10 trees
 79 |                                verbose = 10
 80 |                              )
 81 |                              return(model)
 82 |                            }
 83 | print("Finshed random forest signatures!")
 84 | 
 85 | 
 86 | # Test random forest
 87 | predicted_probs_signatures <- predict(
 88 |   object = rf_mod_signatures,
 89 |   newdata = test_df_signatures,
 90 |   type = "prob"
 91 | )
 92 | predicted_class_signatures <- predict(
 93 |   object = rf_mod_signatures,
 94 |   newdata = test_df_signatures,
 95 |   type = "class"
 96 | )
 97 | 
 98 | 
 99 | # Save model and predictions
100 | saveRDS(rf_mod_signatures, str_c("tmp/random_forest_model_signatures", "_", fold, ".rds", sep = ""))
101 | saveRDS(predicted_probs_signatures, str_c("tmp/random_forest_predicted_probs_signatures", "_", fold, ".rds"))
102 | saveRDS(predicted_class_signatures, str_c("tmp/random_forest_predicted_class_signatures", "_", fold, ".rds"))
103 | print("Saved results signatures!")
104 | 
105 | 
106 | ########################################################################
107 | ############################## RANDOM ##################################
108 | ########################################################################
109 | 
110 | # Run n (ncpu) forest of ntrees (parallel_trees) each: signatures
111 | rf_mod_random <- foreach::foreach(ntree = rep(parallel_trees, ncpu),
112 |                                       # Indicating we want to use combine from the randomForest to join all the parallelized trees
113 |                                       .combine = randomForest::combine,
114 |                                       # Indicate we need the randomForest within the ForeEach loop + dopar indicating parallelization
115 |                                       .packages="randomForest") %dopar% {
116 |                                         ## train RF model
117 |                                         model <- randomForest::randomForest(
118 |                                           new_cell_types ~ .,
119 |                                           data = training_df_random,
120 |                                           type = "classification",
121 |                                           # Keep importance in the RF object
122 |                                           importance = FALSE,
123 |                                           # How many trees to build
124 |                                           ntree = ntree,
125 |                                           # Return update every 10 trees
126 |                                           verbose = 10
127 |                                         )
128 |                                         return(model)
129 |                                       }
130 | print("Finshed random forest with random signatures!")
131 | 
132 | 
133 | # Test random forest
134 | predicted_probs_random <- predict(
135 |   object = rf_mod_random,
136 |   newdata = test_df_random,
137 |   type = "prob"
138 | )
139 | predicted_class_random <- predict(
140 |   object = rf_mod_random,
141 |   newdata = test_df_random,
142 |   type = "class"
143 | )
144 | 
145 | 
146 | # Save model and predictions
147 | saveRDS(rf_mod_random, str_c("tmp/random_forest_model_random", "_", fold, ".rds", sep = ""))
148 | saveRDS(predicted_probs_random, str_c("tmp/random_forest_predicted_probs_random", "_", fold, ".rds"))
149 | saveRDS(predicted_class_random, str_c("tmp/random_forest_predicted_class_random", "_", fold, ".rds"))
150 | 
151 | print("Saved results random signatures!")
152 | 
153 | end_time <- Sys.time()
154 | total_time <- end_time - start_time
155 | print(str_c("Job took ", total_time, " to compute", sep = ""))
156 | 


--------------------------------------------------------------------------------
/Random Forest/4-evaluate_performance.R:
--------------------------------------------------------------------------------
 1 | # This script evaluates the performance of the random forest classifiers for all folds
 2 | 
 3 | 
 4 | # Load packages
 5 | library(caret)
 6 | library(e1071)
 7 | library(tidyverse)
 8 | 
 9 | 
10 | # Load data
11 | test_sets_cross_valid <- readRDS("tmp/test_set_per_fold.rds")
12 | tica_df <- readRDS("tmp/tica_input_dataframe_signatures_random_forest.rds")
13 | folds <- str_c("fold", 1:5, sep = "")
14 | pred_class_sign_l <- purrr::map(folds, function(fold) {
15 |   path_to_preds1 <- "tmp/random_forest_predicted_class_signatures_"
16 |   path_to_preds <- str_c(path_to_preds1, fold, ".rds", sep = "")
17 |   pred_class <- readRDS(path_to_preds)
18 |   pred_class
19 | })
20 | names(pred_class_sign_l) <- folds
21 | pred_class_rand_l <- purrr::map(folds, function(fold) {
22 |   path_to_preds1 <- "tmp/random_forest_predicted_class_random_"
23 |   path_to_preds <- str_c(path_to_preds1, fold, ".rds", sep = "")
24 |   pred_class <- readRDS(path_to_preds)
25 |   pred_class
26 | })
27 | names(pred_class_rand_l) <- folds
28 | 
29 | pred_probs_sign_l <- purrr::map(folds, function(fold) {
30 |   path_to_probs1 <- "tmp/random_forest_predicted_probs_signatures_"
31 |   path_to_probs <- str_c(path_to_probs1, fold, ".rds", sep = "")
32 |   pred_probs <- readRDS(path_to_probs)
33 |   pred_probs
34 | })
35 | names(pred_probs_sign_l) <- folds
36 | pred_probs_rand_l <- purrr::map(folds, function(fold) {
37 |   path_to_probs1 <- "tmp/random_forest_predicted_probs_random_"
38 |   path_to_probs <- str_c(path_to_probs1, fold, ".rds", sep = "")
39 |   pred_probs <- readRDS(path_to_probs)
40 |   pred_probs
41 | })
42 | names(pred_probs_rand_l) <- folds
43 | 
44 | 
45 | # Measure accuracy for each fold
46 | accuracy_dfs <- purrr::map(folds, function(fold) {
47 |   test_df <- tica_df[test_sets_cross_valid[[fold]], ]
48 |   comparison_rownames_sign <- all(rownames(test_df) == names(pred_class_sign_l[[fold]]))
49 |   comparison_rownames_rand <- all(rownames(test_df) == names(pred_class_rand_l[[fold]]))
50 |   if (comparison_rownames_sign & comparison_rownames_rand) {
51 |     conf_mat_sign <- caret::confusionMatrix(
52 |       test_df$new_cell_types,
53 |       pred_class_sign_l[[fold]]
54 |     )
55 |     conf_mat_rand <- caret::confusionMatrix(
56 |       test_df$new_cell_types,
57 |       pred_class_rand_l[[fold]]
58 |     )
59 |     df <- data.frame(
60 |       accuracy = c(conf_mat_sign$overall["Accuracy"], conf_mat_rand$overall["Accuracy"]),
61 |       kappa = c(conf_mat_sign$overall["Kappa"], conf_mat_rand$overall["Kappa"]),
62 |       type = c("signatures", "random")
63 |     )
64 |     df
65 | 
66 |   } else {
67 |     "Rownames do not match"
68 |   }
69 | })
70 | names(accuracy_dfs) <- folds
71 | accuracy_df <- bind_rows(accuracy_dfs, .id = "fold")
72 | saveRDS(accuracy_df, "tmp/accuracy_random_forest_dataframe.rds")
73 | 
74 | 
75 | # Calculate confusion matrix of probabilities for each fold
76 | conf_mat_probs_l <- purrr::map(folds, function(fold) {
77 |   test_df <- tica_df[test_sets_cross_valid[[fold]], ]
78 |   if (all(rownames(test_df) == names(pred_probs_sign_l[[fold]]))) {
79 |     cell_types <- levels(test_df$new_cell_types)
80 |     average_probs_l <- purrr::map(cell_types, function(cell_type) {
81 |       indices_cell_type <- rownames(test_df[test_df$new_cell_types == cell_type, ])
82 |       mat <- pred_probs_sign_l[[fold]][indices_cell_type, ]
83 |       average_prob <- colMeans(mat)
84 |       average_prob
85 |     })
86 |     names(average_probs_l) <- cell_types
87 |     average_probs_mat <- average_probs_l %>%
88 |       bind_rows() %>%
89 |       as.matrix()
90 |     rownames(average_probs_mat) <- names(average_probs_l)
91 |     average_probs_mat
92 |   } else {
93 |     "Rownames do not match"
94 |   }
95 | })
96 | names(conf_mat_probs_l) <- folds
97 | saveRDS(conf_mat_probs_l, "tmp/confusion_matrix_probabilities_list.rds")
98 | 
99 | 


--------------------------------------------------------------------------------
/Random Forest/5-plot_figures.R:
--------------------------------------------------------------------------------
 1 | # This script plots the results of the random forest
 2 | 
 3 | 
 4 | # Load packages
 5 | library(ggpubr)
 6 | library(pheatmap)
 7 | library(tidyverse)
 8 | 
 9 | 
10 | # Load data
11 | accuracy_df <- readRDS("tmp/accuracy_random_forest_dataframe.rds")
12 | conf_mat_probs_l <- readRDS("tmp/confusion_matrix_probabilities_list.rds")
13 | 
14 | 
15 | # Plot accuracy
16 | accuracy_gg <- accuracy_df %>%
17 |   ggplot(aes(type, accuracy, col = type)) +
18 |   geom_jitter() +
19 |   geom_boxplot() +
20 |   scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
21 |   scale_color_manual(values = c("gray50", "limegreen")) +
22 |   labs(x = "", y = "Accuracy") +
23 |   theme_classic() +
24 |   theme(legend.position = "none",
25 |         axis.title.y = element_text(size = 13),
26 |         axis.text.x = element_text(size = 11, color = "black"))
27 | 
28 | kappa_gg <- accuracy_df %>%
29 |   ggplot(aes(type, kappa, col = type)) +
30 |   geom_jitter() +
31 |   geom_boxplot() +
32 |   scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.25, 0.5, 0.75, 1)) +
33 |   scale_color_manual(values = c("gray50", "limegreen")) +
34 |   labs(x = "", y = "Kappa") +
35 |   theme_classic() +
36 |   theme(legend.position = "none",
37 |         axis.title.y = element_text(size = 13),
38 |         axis.text.x = element_text(size = 11, color = "black"))
39 | 
40 | arranged <- ggarrange(plotlist = list(accuracy_gg, kappa_gg), ncol = 2)
41 | ggsave(
42 |   plot = arranged,
43 |   filename = "results/accuracy_kappa_random_forest.pdf",
44 |   width = 17.5,
45 |   height = 8,
46 |   units = "cm"
47 | )
48 | 
49 | 
50 | # Plot heatmap confusion matrix
51 | colors_function <- colorRampPalette(colors = c("white", "red"))
52 | colors <- colors_function(100)
53 | heatmap <- pheatmap(
54 |   conf_mat_probs_l$fold2,
55 |   cluster_cols = FALSE,
56 |   cluster_rows = FALSE,
57 |   scale = "none",
58 |   color = colors,
59 |   angle_col = 315,
60 |   legend = FALSE,
61 |   fontsize_row = 9,
62 |   fontsize_col = 9
63 | )
64 | save_pheatmap_pdf <- function(x, filename, width = 17.5 * 0.394, height = 15 * 0.394) {
65 |   stopifnot(!missing(x))
66 |   stopifnot(!missing(filename))
67 |   pdf(filename, width = width, height = height)
68 |   grid::grid.newpage()
69 |   grid::grid.draw(x$gtable)
70 |   dev.off()
71 | }
72 | save_pheatmap_pdf(heatmap, "results/confusion_matrix_probabilities.pdf")
73 | 
74 | 


--------------------------------------------------------------------------------
/Random Forest/README.md:
--------------------------------------------------------------------------------
 1 | # Validation of TICA clusters using a random forest classifier
 2 | 
 3 | 
 4 | ## Dependencies
 5 | 
 6 | * [R 3.6.0](https://cran.r-project.org/)
 7 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html)
 8 | * [readxl 1.3.1](https://cran.r-project.org/web/packages/readxl/index.html)
 9 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html)
10 | * [randomForest 4.6.14](https://cran.r-project.org/web/packages/randomForest/index.html)
11 | * [doParallel 1.0.15](https://cran.r-project.org/web/packages/doParallel/index.html)
12 | * [foreach 1.5.0](https://cran.r-project.org/web/packages/foreach/index.html)
13 | * [caret 6.0.86](https://cran.r-project.org/web/packages/caret/index.html)
14 | * [e1071 1.7.3](https://cran.r-project.org/web/packages/e1071/index.html)
15 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html)
16 | * [pheatmap 1.0.12](https://cran.r-project.org/web/packages/pheatmap/index.html)
17 | 
18 | 
19 | ## Data
20 | 
21 | * TICA Seurat object: download it as specified in the publication
22 | * Supplementary table 2: excel file with the markers for each cell type. For simplicity, we provide it in the ./data/ folder.
23 | 
24 | 
25 | ## Pipeline
26 | 
27 | The scripts should be run in the following order:
28 | 
29 | 
30 | 1. 1-calculate_cell_type_signatures.R: calculates 25 cell type-specific signatures + 25 random signatures for each cell in the dataset.
31 |   * Input:
32 |     * data/supp_table_2.xlsx: cell type specific markers (Supplementary Table 2 in the paper)
33 |     * TICA Seurat object (path should be changed before running the script)
34 |   * Output:
35 |     * tmp/tica_input_dataframe_signatures_random_forest.rds: data frame with cells as observations, signatures as feautures and cell type as response variable.
36 | 2. 2-create_test_sets_for_cross_validation.R
37 |   * Input: tmp/tica_input_dataframe_signatures_random_forest.rds
38 |   * Output: tmp/test_set_per_fold.rds: list of 5 character vectors containing the barcodes of the cells that belong to each fold of the cross-validation.
39 | 3. 3-run_random_forest.R: train and test 2 RF (cell type-specific + random signatures).
40 |   * Arguments: this script should have one argument specifying the fold to run the RF (ie "fold1"). The purpose of this is to parallelize the execution of the RF across folds.
41 |   * Input:
42 |     * tmp/tica_input_dataframe_signatures_random_forest.rds
43 |     * tmp/test_set_per_fold.rds
44 |   * Output: 
45 |     * Cell type-specific:
46 |       * tmp/random_forest_model_signatures_fold{1-5}.rds (RF models)
47 |       * tmp/random_forest_predicted_probs_signatures_fold{1-5}.rds (RF probabilities)
48 |       * tmp/random_forest_predicted_class_signatures_fold{1-5}.rds (RF predicted classes)
49 |     * Random signatures:
50 |       * tmp/random_forest_model_random_fold{1-5}.rds
51 |       * tmp/random_forest_predicted_probs_random_fold{1-5}.rds
52 |       * tmp/random_forest_predicted_class_random_fold{1-5}.rds
53 | 4. 4-evaluate_performance.R
54 |   * Input: all outputs from previous script
55 |   * Output:
56 |   	* tmp/confusion_matrix_probabilities_list.rds: list with 5 confusion matrices (one per fold), where each CF is the probability that a cell from a cell type X is assigned to cell type Y.
57 |     * tmp/accuracy_random_forest_dataframe.rds: accuracy and kappa statistics for each fold.
58 | 5. 5-plot_figures.R: 
59 |   * Input:
60 |     * tmp/accuracy_random_forest_dataframe.rds
61 |     * tmp/confusion_matrix_probabilities_list.rds
62 |   * Output: 
63 |     * results/accuracy_kappa_random_forest.pdf (panels a and b)
64 |     * results/confusion_matrix_probabilities.pdf (panel c)


--------------------------------------------------------------------------------
/Random Forest/data/supp_table_2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/Random Forest/data/supp_table_2.xlsx


--------------------------------------------------------------------------------
/ST-breast/1-10x_breast_QC.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "1-10x Breast cancer samples"
  3 | author: "Marc Elosua-Bayes"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   BiocStyle::html_document:
  7 |     toc: yes
  8 |     toc_float: yes
  9 |     number_sections: yes
 10 |     df_print: paged
 11 | ---
 12 | 
 13 | ```{r setup, include=FALSE}
 14 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
 15 | ```
 16 | 
 17 | ## Introduction
 18 | 
 19 | In this R markdown document we are going to look at the QC of breast carcinoma datasets publicly put out by 10X genomics [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/). Specifications of the tissue are the following:
 20 | 
 21 | 10x Genomics obtained fresh frozen Invasive Ductal Carcinoma breast tissue from BioIVT Asterand.  
 22 | The tissue was AJCC/UICC Stage Group IIA, ER positive, PR negative, Her2 positive and annotated with:  
 23 | 
 24 | * Ductal carcinoma in situ
 25 | * Lobular carcinoma in situ
 26 | * Invasive Carcinoma
 27 | 
 28 | ## Libraries
 29 | ```{r}
 30 | library(Seurat)
 31 | library(tidyverse)
 32 | library(Matrix)
 33 | # source("utils/spatial_plot_spaniel.R")
 34 | ```
 35 | 
 36 | ## Paths
 37 | ```{r}
 38 | source(here::here("misc/paths.R"))
 39 | source(here::here("utils/bin.R"))
 40 | 
 41 | "{an_breast_10x}/{robj_dir}" %>%
 42 |   glue::glue() %>%
 43 |   here::here() %>%
 44 |   dir.create(
 45 |     path = .,
 46 |     showWarnings = FALSE,
 47 |     recursive = TRUE)
 48 | 
 49 | "{an_breast_10x}/{plt_dir}" %>%
 50 |   glue::glue() %>%
 51 |   here::here() %>%
 52 |   dir.create(
 53 |     path = .,
 54 |     showWarnings = FALSE,
 55 |     recursive = TRUE)
 56 | ```
 57 | 
 58 | ## Load data
 59 | Within this project the data is located at data/breast_visium/section_1 and data/breast_visium/section_2 respectively
 60 | ```{r}
 61 | breast_1 <- Seurat::Load10X_Spatial(data.dir = here::here("data/breast_visium/section_1"),
 62 |                         filename = "filtered_feature_bc_matrix.h5",
 63 |                         slice = "breast_1")
 64 | breast_1[["slice"]] <- "breast_1"
 65 | 
 66 | breast_2 <- Seurat::Load10X_Spatial(data.dir = here::here("data/breast_visium/section_2"),
 67 |                         filename = "filtered_feature_bc_matrix.h5",
 68 |                         slice = "breast_2")
 69 | breast_2[["slice"]] <- "breast_2"
 70 | ```
 71 | 
 72 | The first slice has `r nrow(breast_1)` and `r ncol(breast_1)` samples, the second one in turn has `r nrow(breast_2)` and `r ncol(breast_2)` spots
 73 | 
 74 | ### Remove empty genes
 75 | We start by removing those genes that aren't expressed in any of the spots overlaying the tissue
 76 | ```{r}
 77 | table(rowSums(as.matrix(breast_1@assays$Spatial@counts)) == 0)
 78 | 
 79 | keep_genes1 <- rowSums(as.matrix(breast_1@assays$Spatial@counts)) != 0
 80 | breast_1 <- breast_1[keep_genes1, ]
 81 | 
 82 | table(rowSums(as.matrix(breast_2@assays$Spatial@counts)) == 0)
 83 | 
 84 | keep_genes2 <- rowSums(as.matrix(breast_2@assays$Spatial@counts)) != 0
 85 | breast_2 <- breast_2[keep_genes2, ]
 86 | ```
 87 | 
 88 | In order to work with multiple slices in the same Seurat object, we provide the merge function.
 89 | ```{r}
 90 | breast_merged <- merge(breast_1, breast_2)
 91 | ```
 92 | 
 93 | ## Process data
 94 | ```{r}
 95 | breast_merged <- Seurat::SCTransform(object = breast_merged,
 96 |                                      assay = "Spatial")
 97 | # breast_merged <- Seurat::FindVariableFeatures(breast_merged)
 98 | ```
 99 | 
100 | This then enables joint dimensional reduction and clustering on the underlying RNA expression data.
101 | ```{r}
102 | # Seurat::DefaultAssay(breast_merged) <- "SCT"
103 | # Seurat::VariableFeatures(breast_merged) <- c(Seurat::VariableFeatures(breast_1),
104 | #                                              Seurat::VariableFeatures(breast_2))
105 | breast_merged <- Seurat::RunPCA(breast_merged,
106 |                                 verbose = FALSE) %>%
107 |   Seurat::FindNeighbors(., dims = 1:30, verbose = FALSE) %>%
108 |   Seurat::FindClusters(.,
109 |                        verbose = FALSE,
110 |                        resolution = c(0.1, 0.25, 0.5, 0.8, 1)) %>%
111 |   Seurat::RunUMAP(., dims = 1:30, verbose = FALSE)
112 | ```
113 | 
114 | Add mitochondrial and ribosomal %
115 | ```{r}
116 | # Collect all genes coded on the mitochondrial genome
117 | breast_merged[["percent.mito"]] <- Seurat::PercentageFeatureSet(
118 |   object = breast_merged,
119 |   pattern = "^MT-")
120 | summary(breast_merged[["percent.mito"]])
121 | 
122 | # Collect all genes coding for ribosomal proteins
123 | breast_merged[["percent.ribo"]] <- Seurat::PercentageFeatureSet(
124 |   object = breast_merged,
125 |   pattern = "^RPL|^RPS")
126 | summary(breast_merged[["percent.ribo"]])
127 | ```
128 | 
129 | ## QC Analysis
130 | 
131 | ### Basic features
132 | #### Number of genes
133 | We start by plotting some basic features that will help us visualize and define filtering options.
134 | We start by plotting the number of genes per spot, *complexity*, to assess if there are empty spots or huge disparity.
135 | ```{r fig.height=8, fig.width=12}
136 | p1 <- ggplot2::ggplot() +
137 |   ggplot2::geom_histogram(data = breast_merged[[]], 
138 |                           ggplot2::aes(nFeature_Spatial),
139 |                           fill = "red",
140 |                           alpha = 0.7,
141 |                           color = "red",
142 |                           bins = 50) +
143 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
144 |   ggplot2::ggtitle("Unique genes per spot") +
145 |   ggplot2::labs(x = "Number of Detected Genes",
146 |                 y = "Number of Spots") +
147 |   ggpubr::theme_pubr()
148 | 
149 | p1
150 | ```
151 | 
152 | After looking at the distribution we are also going to look at how these spots look on the tissue
153 | ```{r fig.height=12, fig.width=12}
154 | Seurat::SpatialFeaturePlot(
155 |     object = breast_merged,
156 |     features = "nFeature_Spatial") &
157 |   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
158 | ```
159 | 
160 | #### Number of reads
161 | Next we want to look at the number of reads captured per spot, this should correlate with spot complexity and will allow us to see regions with higher transcriptional activity.
162 | ```{r fig.height=8, fig.width=12}
163 | p2 <- ggplot2::ggplot() +
164 |   ggplot2::geom_histogram(data = breast_merged[[]],
165 |                           ggplot2::aes(nCount_Spatial),
166 |                           fill = "red",
167 |                           alpha = 0.7,
168 |                           color = "red",
169 |                           bins = 50) +
170 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
171 |   ggplot2::ggtitle("Total counts per spots") +
172 |   ggplot2::labs(x = "Library Size (total UMI)",
173 |                 y = "Number of Spots") +
174 |   ggpubr::theme_pubr()
175 | 
176 | p2
177 | ```
178 | 
179 | 
180 | ```{r fig.height=12, fig.width=12}
181 | Seurat::SpatialFeaturePlot(
182 |     object = breast_merged,
183 |     features = "nCount_Spatial") &
184 |   ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 18))
185 | ```
186 | 
187 | #### Gene counts
188 | Another characteristic we want to look at is how many counts per gene there are since we want to remove lowly expressed genes which aren't giving information but potentially introducing noise.
189 | ```{r fig.height=8, fig.width=12}
190 | count_mtrx <- Seurat::GetAssayData(object = breast_merged,
191 |                                    slot = "counts",
192 |                                    assay = "Spatial")
193 | 
194 | gene_attr <- lapply(c("slice_1", "slice_2"), function(id) {
195 |   if (id == "slice_1") tail_id <- "_1" else tail_id <- "_2"
196 |   
197 |   mask <- stringr::str_detect(
198 |     string = colnames(count_mtrx),
199 |     pattern = tail_id)
200 |   
201 |   gene_attr <- data.frame(
202 |     nUMI = Matrix::rowSums(count_mtrx[, mask]), 
203 |     nSpots = Matrix::rowSums(count_mtrx[, mask] > 0),
204 |     slice = id)
205 | }) %>%
206 |   dplyr::bind_rows()
207 | 
208 | p3 <- ggplot2::ggplot() +
209 |   ggplot2::geom_histogram(data = gene_attr,
210 |                           ggplot2::aes(nUMI),
211 |                           fill = "red",
212 |                           alpha = 0.7,
213 |                           color = "red",
214 |                           bins = 50) +
215 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
216 |   ggplot2::scale_x_log10() +
217 |   ggplot2::ggtitle("Total counts per gene (log10 scale)") +
218 |   ggpubr::theme_pubr()
219 | p3
220 | ```
221 | 
222 | #### Gene ubiquitousness
223 | We also look at on how many spots each gene is detected, we see there are a few genes expressed in almost all the spots while and a majority of genes detected in few spots.
224 | ```{r fig.height=8, fig.width=12}
225 | p4 <- ggplot2::ggplot() +
226 |   ggplot2::geom_histogram(data = gene_attr,
227 |                           ggplot2::aes(nSpots),
228 |                           fill = "red",
229 |                           alpha = 0.7,
230 |                           color = "red",
231 |                           bins = 50) +
232 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
233 |   ggplot2::ggtitle("Total spots per gene") +
234 |   ggpubr::theme_pubr()
235 | 
236 | p4
237 | ```
238 | 
239 | #### Mitochondrial %
240 | Next we take a look at the mitochondrial %; This metric can help us get a first glimpse of metabolic activity and/or necrotic regions - 10X explains [here](https://kb.10xgenomics.com/hc/en-us/articles/360001086611-Why-do-I-see-a-high-level-of-mitochondrial-gene-expression-)
241 | ```{r fig.height=8, fig.width=12}
242 | p5 <- ggplot2::ggplot() +
243 |   ggplot2::geom_histogram(data = breast_merged[[]], 
244 |                           ggplot2::aes(percent.mito),
245 |                           fill = "red",
246 |                           alpha = 0.7,
247 |                           color = "red",
248 |                           bins = 50) +
249 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
250 |   ggplot2::ggtitle("Mitochondrial % per spot") +
251 |   ggplot2::labs(x = "Mitochondrial % ",
252 |                 y = "Number of Spots") +
253 |   ggpubr::theme_pubr()
254 | p5
255 | ```
256 | 
257 | ```{r fig.height=12, fig.width=12}
258 | Seurat::SpatialFeaturePlot(
259 |   object = breast_merged,
260 |   features = "percent.mito") &
261 |   ggplot2::theme(plot.title = element_text(hjust = 0.5, size = 18))
262 | ```
263 | 
264 | #### Ribosomal %
265 | Lastly we look at the ribosomal % which gives us insight into which regions are the most transcriptomically active when looked side by side with the number of detected genes.
266 | ```{r fig.height=8, fig.width=12}
267 | p6 <- ggplot2::ggplot() +
268 |   ggplot2::geom_histogram(data = breast_merged[[]], 
269 |                           ggplot2::aes(percent.ribo),
270 |                           fill = "red",
271 |                           alpha = 0.7,
272 |                           color = "red",
273 |                           bins = 50) +
274 |   ggplot2::facet_wrap(. ~ slice, scales = "free") +
275 |   ggplot2::ggtitle("Ribosomal % per spot") +
276 |   ggplot2::labs(x = "Ribosomal % ",
277 |                 y = "Number of Spots") +
278 |   ggpubr::theme_pubr()
279 | p6
280 | ```
281 | 
282 | ```{r fig.height=12, fig.width=12}
283 | Seurat::SpatialFeaturePlot(
284 |   object = breast_merged,
285 |   features = "percent.ribo") +
286 |   ggplot2::theme(plot.title = element_text(hjust = 0.5, size = 18))
287 | ```
288 | 
289 | ### Feature covariation
290 | Next we look at how these features covariate.
291 | ```{r fig.height=15, fig.width=15}
292 | plt_covar_ls <- qc_covar_plots(se = breast_merged,
293 |                                nfeat = "nFeature_Spatial",
294 |                                ncount = "nCount_Spatial",
295 |                                slot = "counts",
296 |                                assay = "Spatial",
297 |                                percent.mito = "percent.mito",
298 |                                percent.ribo = "percent.ribo",
299 |                                facet = "slice")
300 | 
301 | cowplot::plot_grid(
302 |   plotlist = plt_covar_ls,
303 |   ncol = 2,
304 |   align = "hv",
305 |   axis = "trbl")
306 | ```
307 | 
308 | ## Save RDS
309 | Save the object to use downstream.
310 | ```{r}
311 | lapply(c("breast_1", "breast_2"), function(id) {
312 |   
313 |   breast_sub <- breast_merged[, breast_merged$slice == id]
314 |   
315 |   # Remove other images
316 |   breast_sub@images <- breast_sub@images[Seurat::Images(breast_sub) == id]
317 |   
318 |   "{an_breast_10x}/{robj_dir}/qc_se_{id}.rds" %>%
319 |     glue::glue() %>%
320 |     here::here() %>%
321 |     saveRDS(
322 |       object = breast_sub,
323 |       file = .)
324 | })
325 | ```
326 | 
327 | ## Session Info
328 | ```{r}
329 | sessionInfo()
330 | ```
331 | 


--------------------------------------------------------------------------------
/ST-breast/2-10x_breast_GO_enrichment.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "2-10x breast GO enrichment"
  3 | author: "Marc Elosua-Bayes"
  4 | date: "6/12/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | ```
 11 | 
 12 | ## Introduction
 13 | 
 14 | In this R markdown document we are going to carry out Gene marker and Gene Ontology enrichment analysis.
 15 | 
 16 | ## Libraries
 17 | ```{r}
 18 | library(Seurat)
 19 | library(Spaniel)
 20 | library(tidyverse)
 21 | library(Matrix)
 22 | library(SPOTlight)
 23 | source("utils/bin.r")
 24 | source("utils/spatial_plot_spaniel.R")
 25 | ```
 26 | 
 27 | ## Paths
 28 | ```{r}
 29 | source("misc/paths.R")
 30 | dir.create(path = sprintf("%s/%s", an_breast_10x, robj_dir),
 31 |            showWarnings = FALSE,
 32 |            recursive = TRUE)
 33 | 
 34 | dir.create(path = sprintf("%s/%s", an_breast_10x, plt_dir),
 35 |            showWarnings = FALSE,
 36 |            recursive = TRUE)
 37 | 
 38 | ```
 39 | 
 40 | ## Load data
 41 | Data loaded here comes from 1-10x_breast_QC.Rmd and 3-10x_breast_immune_reference.Rmd
 42 | ```{r}
 43 | breast_merged <- readRDS(file = sprintf("%s/%s/breast_merged_processed.RDS", an_breast_10x, robj_dir))
 44 | decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x.RDS", an_breast_10x, robj_dir))
 45 | ```
 46 | 
 47 | ## Analysis
 48 | Start by getting the marker genes for the clusters
 49 | ```{r}
 50 | Seurat::Idents(breast_merged) <- breast_merged$SCT_snn_res.0.1
 51 | st_markers <- Seurat::FindAllMarkers(object = breast_merged,
 52 |                        assay = "SCT",
 53 |                        slot = "data",
 54 |                        only.pos = TRUE)
 55 | st_markers %>% count(cluster)
 56 | saveRDS(object = st_markers,
 57 |         file = sprintf("%s/%s/markers_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir))
 58 | ```
 59 | 
 60 | Filter top genes
 61 | ```{r}
 62 | pull_col <- function(.x) {
 63 |   return(.x %>% pull(var = "gene"))
 64 | }
 65 | 
 66 | marker_ls <- st_markers %>% 
 67 |   dplyr::filter(p_val_adj < 0.01) %>%
 68 |   dplyr::group_by(cluster) %>% 
 69 |   dplyr::group_split() %>% 
 70 |   purrr::map(.f = pull_col)
 71 | 
 72 | ```
 73 | 
 74 | ### GO term enrichment analysis
 75 | ```{r eval = FALSE}
 76 | library(org.Hs.eg.db)
 77 | library(GOstats)
 78 | 
 79 | univ <- stringr::str_split(string = rownames(breast_merged@assays$Spatial@counts),
 80 |                            pattern = " ",
 81 |                            simplify = TRUE)[, 1]
 82 | 
 83 | DE_df1 <- lapply(seq_len(length(marker_ls)), function(i) {
 84 |   # gene_set <- convert_symb_entrez(gene_vec = stringr::str_to_upper(markers_ln1_ls[[i]]))
 85 |   # gene_universe <- convert_symb_entrez(gene_vec = stringr::str_to_upper(rownames(ln_1)))
 86 |   print(i)
 87 |   go_clust <- gene_enrichment_GO(gene_de = stringr::str_to_upper(marker_ls[[i]]),
 88 |                      gene_universe = univ) %>% 
 89 |     summary(.) %>% 
 90 |     dplyr::mutate(cluster = i)
 91 |   
 92 |   return(go_clust)
 93 | }) %>% 
 94 |   dplyr::bind_rows()
 95 | 
 96 | saveRDS(object = DE_df1, file = sprintf("%s/%s/go_enrichment_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir))
 97 | ```
 98 | 
 99 | 
100 | ```{r}
101 | DE_df1 <- readRDS(file = sprintf("%s/%s/go_enrichment_breast_10x_res.0.1.RDS", an_breast_10x, robj_dir))
102 | ```
103 | 
104 | #### Visualization
105 | For visualization purposses we are going to check GO terms at varying levels of specificities.
106 | 
107 | ```{r}
108 | DE_specific <- DE_df1 %>%
109 |   dplyr::filter(Size <= 50 & Pvalue < 0.01 & Count >= 3)
110 | 
111 | DE_mid <- DE_df1 %>%
112 |   dplyr::filter(Size <= 300 & Size >= 30 & Pvalue < 0.01 & Count >= 3)
113 | 
114 | DE_general <- DE_df1 %>%
115 |   dplyr::filter(Size > 300 & Pvalue < 0.01 & Count >= 3)
116 | ```
117 | 
118 | ```{r}
119 | GO_visualization <- function(ds) {
120 |   plt_ls <- lapply(unique(ds$cluster), function(i) {
121 |     tmp_plt <- ds %>%
122 |       dplyr::filter(cluster == i) %>%
123 |       dplyr::arrange(desc(OddsRatio)) %>%
124 |       head(20) %>% 
125 |       ggplot(.) +
126 |         geom_point(aes(x = OddsRatio,
127 |                        y = reorder(Term, OddsRatio),
128 |                        size = -Pvalue,
129 |                        color = Pvalue)) +
130 |         labs(title = sprintf("Cluster %s", i - 1)) +
131 |         scale_color_gradient(low = "green",
132 |                              high = "red") +
133 |       theme_classic()
134 |     return(tmp_plt)
135 |   })
136 |   return(plt_ls)
137 | }
138 | ```
139 | 
140 | Specific
141 | ```{r}
142 | GO_visualization(ds = DE_specific) %>%
143 |   ggpubr::ggexport(plotlist = .,
144 |                    filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_specific.pdf",
145 |                                       an_breast_10x, plt_dir),
146 |                    width = 12)
147 | ```
148 | 
149 | Middle
150 | ```{r}
151 | GO_visualization(ds = DE_mid) %>%
152 |   ggpubr::ggexport(plotlist = .,
153 |                    filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_mid.pdf",
154 |                                       an_breast_10x, plt_dir),
155 |                    width = 12)
156 | ```
157 | 
158 | General
159 | ```{r}
160 | GO_visualization(ds = DE_general) %>%
161 |   ggpubr::ggexport(plotlist = .,
162 |                    filename = sprintf("%s/%s/go_enrich_breast_10x_res.0.1_general.pdf",
163 |                                       an_breast_10x, plt_dir),
164 |                    width = 12)
165 | ```
166 | 
167 | Look at estrogen, progesterone and HER2 receptor genes
168 | ```{r}
169 | Seurat::SpatialFeaturePlot(object = breast_merged,
170 |                            features = c("ESR1", # Estrogen receptor
171 |                                         "PGR", # Progesterone receptor
172 |                                         "ERBB2"), # HER2 positive
173 |                            alpha = c(0, 1))
174 | 
175 | 
176 | Seurat::SpatialDimPlot(object = breast_merged,
177 |                        group.by = "seurat_clusters")
178 | 
179 | ```
180 | 
181 | 


--------------------------------------------------------------------------------
/ST-breast/3-10x_breast_immune_reference.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "Default!"
 15 | title: "`r sprintf('3-10x_breast_immune_reference %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 20 |                       message = FALSE, warning = FALSE)
 21 | options(width = 1200)
 22 | ```
 23 | 
 24 | ## Introduction
 25 | In this R markdown document we will map immune cells onto the breast cancer tissue. 
 26 | 
 27 | ```{r}
 28 | library(Seurat)
 29 | library(tidyverse)
 30 | library(Matrix)
 31 | library(SPOTlight)
 32 | source(here::here("utils/bin.r"))
 33 | ```
 34 | 
 35 | ## Paths
 36 | ```{r}
 37 | source(here::here("misc/paths.R"))
 38 | 
 39 | "{an_breast_10x}/{robj_dir}" %>%
 40 |   glue::glue() %>%
 41 |   here::here() %>%
 42 |   dir.create(
 43 |     path = ,
 44 |     showWarnings = FALSE,
 45 |     recursive = TRUE)
 46 | 
 47 | "{an_breast_10x}/{plt_dir}" %>%
 48 |   glue::glue() %>%
 49 |   here::here() %>%
 50 |   dir.create(
 51 |     path = ,
 52 |     showWarnings = FALSE,
 53 |     recursive = TRUE)
 54 | ```
 55 | 
 56 | ## Parameters
 57 | ```{r}
 58 | set.seed(1243)
 59 | 
 60 | sample_id <- params$sample_id
 61 | # sample_id <- "breast_1"
 62 | # sample_id <- "breast_2"
 63 | clust_vr <- params$clust_vr
 64 | # clust_vr <- "lv1_annot"
 65 | # clust_vr <- "lv2_annot"
 66 | 
 67 | trn <- "melanoma"
 68 | cl_n <- 100
 69 | hvg <- 3000
 70 | ntop <- NULL
 71 | transf <- "uv"
 72 | method <- "nsNMF"
 73 | min_cont <- 0
 74 | 
 75 | if (is.null(ntop)) {
 76 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}")
 77 | } else {
 78 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}")
 79 | }
 80 | ```
 81 | 
 82 | Define cell types of interest depending on the annotation level
 83 | ```{r}
 84 | if (clust_vr == "lv2_annot") {
 85 |   ct_interest <- c("Plasma.B.cells", "T.regs", "T.helper.Th17",
 86 |                    "Proliferation", "CD8.exhausted",
 87 |                    "CD8.cytotoxic", "NK", "Macrophages.SPP1", "TAMs.C1QC")
 88 | } else if (clust_vr == "lv1_annot") {
 89 |   ct_interest <- c("Plasma B cells", "T cells regulatory", "T helper cells",
 90 |                    "Proliferation", "CD8 pre-exhausted",
 91 |                    "CD8 terminally exhausted", "NK", "Macrophages SPP1", "TAMs C1QC")
 92 | }
 93 | ```
 94 | 
 95 | 
 96 | ## Load data
 97 | Breast cancer data loaded here comes from 1-10x_breast_QC.Rmd
 98 | ```{r}
 99 | ### Spatial breast cancer
100 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>%
101 |   glue::glue() %>%
102 |   here::here() %>%
103 |   readRDS(file = .)
104 | ```
105 | 
106 | Create a name/color dataframe
107 | ```{r}
108 | # source(here::here("misc/col_df_scrpt.R"))
109 | # source(here::here("misc/col_df_scrpt2.R"))
110 | 
111 | # col_vec <- readRDS(here::here("data/immune_reference/complete_cell_type_palette.rds"))
112 | # col_df <- data.frame(col_vec) %>%
113 | #   tibble::rownames_to_column("plt_name") %>%
114 | #   dplyr::mutate(ct_name = stringr::str_replace_all(
115 | #     string = plt_name,
116 | #     pattern = "[[:punct:]]|[[:blank:]]",
117 | #     replacement = "."))
118 | # prloif_df <- data.frame(plt_name = "Proliferation", col_vec = "Brown", ct_name = "Proliferation")
119 | # col_df <- rbind(col_df, prloif_df)
120 | # col_df <- col_df %>%
121 | #   dplyr::mutate(
122 | #     # plt_name = dplyr::if_else(plt_name == "Th17 cells", "CD4 effector memory", plt_name),
123 | #     # ct_name = dplyr::if_else(ct_name == "Th17.cells", "CD4.effector.memory", ct_name),
124 | #     plt_name = dplyr::if_else(plt_name == "TAMs M2", "TAMs C1QC", plt_name)
125 | #   )
126 | # saveRDS(object = col_df, file = here::here("misc/col_df.rds"))
127 | col_df <- readRDS(here::here("misc/col_df.rds"))
128 | ```
129 | 
130 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset.
131 | ```{r eval = FALSE}
132 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>%
133 |   glue::glue() %>%
134 |   here::here()
135 | 
136 | if (file.exists(ica_melanoma2_path)) {
137 |   
138 |   ica_sub <- readRDS(file = ica_melanoma2_path)
139 | } else {
140 |   # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds")
141 |   ica_se <- "/scratch/devel/pnieto/TIL_Atlas/atlas_final_dataset/new_annot/TICAtlas.rds" %>%
142 |     here::here() %>%
143 |     readRDS(.)
144 |   ### Immune reference atlas
145 |   ica_sub <- subset(ica_se, subset = source == "melanoma2")
146 |   rm(ica_se)
147 |   
148 |   saveRDS(object = ica_sub, file = ica_melanoma2_path)
149 | }
150 | 
151 | table(ica_sub@meta.data$lv1_annot)
152 | table(ica_sub@meta.data$lv2_annot)
153 | ```
154 | 
155 | Change variable names to remove non-standard characters
156 | ```{r eval = FALSE}
157 | ica_sub[["specific_cell_type_mod"]] <- stringr::str_replace_all(
158 |   string = as.character(ica_sub@meta.data[, clust_vr]),
159 |   pattern = "[[:punct:]]|[[:blank:]]",
160 |   replacement = ".")
161 | 
162 | prolif_vec <- c("T.cells.proliferative", "Macrophages.and.monocytes.proliferative",
163 |                 "B.cells.proliferative", "Macrophages.proliferative")
164 | 
165 | ica_sub[["specific_cell_type_mod"]] <- 
166 |   dplyr::if_else(ica_sub@meta.data[, "specific_cell_type_mod"] %in% prolif_vec,
167 |                  "Proliferation", ica_sub@meta.data[, "specific_cell_type_mod"])
168 | 
169 | table(ica_sub@meta.data$specific_cell_type_mod)
170 | ```
171 | 
172 | Immune cell marker genes
173 | ```{r eval = FALSE}
174 | ica_sub <- Seurat::SCTransform(object = ica_sub, assay = "RNA")
175 | 
176 | Seurat::Idents(ica_sub) <- as.character(ica_sub@meta.data[, "specific_cell_type_mod"])
177 | 
178 | ica_markers <- Seurat::FindAllMarkers(object = ica_sub,
179 |                        assay = "SCT",
180 |                        slot = "data",
181 |                        only.pos = TRUE,
182 |                        logfc.threshold = 0,
183 |                        min.pct = 0,
184 |                        max.cells.per.ident = 500)
185 | 
186 | 
187 | "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>%
188 |   glue::glue()%>%
189 |   here::here( ) %>%
190 |   saveRDS(
191 |     object = ica_markers,
192 |     file = .)
193 | ```
194 | 
195 | Load marker genes per cluster
196 | ```{r eval = FALSE}
197 | ica_markers <- "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>%
198 |   glue::glue()%>%
199 |   here::here() %>%
200 |   readRDS(file = .)
201 | 
202 | # ica_markers <- readRDS(file = here::here("data/immune_reference/ica_markers_melanoma2.rds"))
203 | 
204 | # if (clust_vr == "lv1_annot") {
205 |   # Remove MAST cells
206 | ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells")
207 | # } else if (clust_vr == "lv2_annot") {
208 |   # Remove MAST cells
209 |   # ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells")
210 | 
211 | # }
212 | ```
213 | 
214 | Look at the proliferation markers
215 | ```{r  eval = FALSE}
216 | ica_markers %>%
217 |   dplyr::filter(cluster == "Proliferation") %>%
218 |   head(20)
219 | ```
220 | 
221 | ## Deconvolution
222 | Run deconvolution.
223 | ```{r message = FALSE,  eval = FALSE}
224 | # Remove Mast cells
225 | ica_sub <- subset(ica_sub, subset = specific_cell_type_mod != "Mast.cells")
226 | # Create Seurat object with filtered gene matrix after removing Mast cells
227 | ica_sub <- Seurat::CreateSeuratObject(
228 |   counts = ica_sub@assays$RNA@counts[
229 |     sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ],
230 |   meta.data = ica_sub@meta.data)
231 | 
232 | # Remove empty spatial genes
233 | # keep_genes <- rowSums(as.matrix(se_obj@assays$Spatial@counts)) != 0
234 | # table(rowSums(ica_sub@assays$RNA@counts) != 0)
235 | 
236 | # Run deconvolution
237 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution(
238 |   se_sc = ica_sub,
239 |   counts_spatial = se_obj@assays$Spatial@counts,
240 |   clust_vr = "specific_cell_type_mod",
241 |   cluster_markers = ica_markers,
242 |   cl_n = cl_n,
243 |   hvg = hvg,
244 |   ntop = ntop,
245 |   transf = transf,
246 |   method = method,
247 |   min_cont = min_cont,
248 |   assay = "RNA",
249 |   slot = "counts")
250 | 
251 | # saveRDS(object = decon_mtrx_ls,
252 | #         file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds",
253 | #                        an_breast_10x, robj_dir, spotlight_id))
254 | # 
255 | # saveRDS(object = decon_mtrx_ls,
256 | #         file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds",
257 | #                        an_breast_10x, robj_dir, spotlight_id))
258 | 
259 | "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
260 |   glue::glue() %>%
261 |   here::here() %>%
262 |   saveRDS(object = decon_mtrx_ls, file = .)
263 | ```
264 | 
265 | Add deconvolution matrix to Seurat object metadata
266 | ```{r}
267 | decon_mtrx_ls <- "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
268 |   glue::glue() %>%
269 |   here::here() %>%
270 |   readRDS(file = .)
271 | 
272 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s.rds",
273 | #                                         an_breast_10x, robj_dir, spotlight_id))
274 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_pdac_atlas.rds", an_breast_10x, robj_dir))
275 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds",
276 | #                        an_breast_10x, robj_dir, spotlight_id))
277 | # 
278 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds",
279 | #                        an_breast_10x, robj_dir, spotlight_id))
280 | 
281 | decon_mtrx <- decon_mtrx_ls[[2]]
282 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"]
283 | 
284 | # Set as 0 cell types predicted to be under 3 % of the spot
285 | decon_mtrx[decon_mtrx < 0.03] <- 0
286 | ```
287 | 
288 | Change names to original ones and reorder according to Paula
289 | ```{r}
290 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>%
291 |   dplyr::left_join(col_df, by = "ct_name") %>%
292 |   dplyr::pull(plt_name)
293 | 
294 | colnames(decon_mtrx) <- new_colnames
295 | ```
296 | 
297 | Add deconvolution matrix to Seurat object metadata
298 | ```{r}
299 | se_obj@meta.data <- cbind(se_obj@meta.data, round(decon_mtrx, 3))
300 | ```
301 | 
302 | ### Visualization
303 | Before we start we take a look at the topic profiles
304 | ```{r}
305 | # Extract coefficient matrix
306 | h <- NMF::coef(decon_mtrx_ls[[1]][[1]])
307 | 
308 | # Extract coefficient labels
309 | train_labs <- data.frame(ct_name = decon_mtrx_ls[[1]][[2]]) %>%
310 |   dplyr::left_join(col_df, by = "ct_name") %>%
311 |   dplyr::pull(plt_name)
312 | 
313 | # rownames(h) <- paste("Topic", 1:nrow(h), sep = " ")
314 | profile_plt <- SPOTlight::dot_plot_profiles_fun(
315 |   h = h,
316 |   train_cell_clust = train_labs)
317 | 
318 | plt_2 <- profile_plt[[2]] +
319 |   ggplot2::scale_x_discrete(limits = unique(train_labs)) +
320 |   ggplot2::scale_y_discrete(labels = glue::glue("Topic {1:nrow(h)}")) +
321 |   ggplot2::labs(x = "", y = "") +
322 |   ggplot2::theme(
323 |     axis.title = ggplot2::element_text(size = 15),
324 |     axis.text.x = ggplot2::element_text(hjust = 1)
325 |     )
326 | profile_plt[[1]]
327 | 
328 | "{an_breast_10x}/{plt_dir}/all_ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>%
329 |   glue::glue() %>%
330 |   here::here() %>%
331 |   cowplot::save_plot(
332 |     filename = .,
333 |     plot = profile_plt[[1]],
334 |     base_height = 30,
335 |     base_width = 30)
336 | 
337 | "{an_breast_10x}/{plt_dir}/ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>%
338 |   glue::glue() %>%
339 |   here::here() %>%
340 |   cowplot::save_plot(
341 |     filename = .,
342 |     plot = plt_2 +
343 |       ggplot2::theme(axis.title = ggplot2::element_text(size = 18)),
344 |     base_height = 15,
345 |     base_width = 15)
346 | ```
347 | 
348 | Lastly we can take a look at which genes are the most important for each topic and get an insight into which genes are driving them.
349 | ```{r}
350 | basis_spotlight <- data.frame(NMF::basis(decon_mtrx_ls[[1]][[1]]))
351 | 
352 | # colnames(basis_spotlight) <- unique(stringr::str_wrap(decon_mtrx_ls[[1]][[2]], width = 30))
353 | colnames(basis_spotlight) <- glue::glue("Topic-{1:length(unique(decon_mtrx_ls[[1]][[2]]))}")
354 | 
355 | basis_spotlight %>%
356 |   round(., 5) %>% 
357 |   DT::datatable(., filter = "top")
358 | ```
359 | 
360 | We will start by looking at the location of each cell type
361 | ```{r}
362 | # https://stackoverflow.com/questions/38722202/how-do-i-change-the-number-of-decimal-places-on-axis-labels-in-ggplot2
363 | # Our transformation function to ensure 2 decimals in the legend
364 | scaleFUN <- function(x) sprintf("%.2f", x)
365 | 
366 | ct_all <- colnames(decon_mtrx)
367 | ct_plt_ls <- lapply(ct_all, function(ct) {
368 |   tmp_plt <- Seurat::SpatialPlot(
369 |     object = se_obj,
370 |     features = ct,
371 |     alpha = c(0, 1),
372 |     # Remove background image
373 |     image.alpha = 0)
374 |   
375 |   if (sum(se_obj@meta.data[, ct]) == 0) {
376 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0))
377 |   } else {
378 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0, 1))
379 |   }
380 |   
381 |   tmp_plt <- tmp_plt +
382 |     ggplot2::theme(
383 |       legend.title = ggplot2::element_blank()
384 |       ) +
385 |     ggplot2::labs(title = ct) +
386 |     ggplot2::scale_fill_gradientn(
387 |       colors = grDevices::heat.colors(10, rev = TRUE),
388 |       # Same number of breaks for all plots
389 |       breaks = seq(min(se_obj@meta.data[, ct]),
390 |                    max(se_obj@meta.data[, ct]),
391 |                    length.out = 4),
392 |       # 2 decimals in the legend
393 |       labels = scaleFUN
394 |       # limits = c(0, 1)
395 |       )
396 | return(tmp_plt)
397 | })
398 | 
399 | ct_grid <- cowplot::plot_grid(
400 |   plotlist = ct_plt_ls,
401 |   axis = "trbl",
402 |   align = "hv",
403 |   nrow = 5,
404 |   ncol = 5)
405 | 
406 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_immune_reference_arrangement_{clust_vr}.pdf" %>%
407 |   glue::glue() %>%
408 |   here::here() %>%
409 |   cowplot::save_plot(
410 |     filename = .,
411 |     plot = ct_grid,
412 |     base_height = 25,
413 |     base_width = 25)
414 | ```
415 | 
416 | Next we will only plot the location of the cell types of interest for the main plot
417 | ```{r eval = FALSE}
418 | # i <- unique(se_obj$slice)[1]
419 | # ct <- cell_types[[1]]
420 | ct_plt_ls <- lapply(ct_interest, function(ct) {
421 |   print(ct)
422 |   tmp_plt <- Seurat::SpatialPlot(
423 |     object = se_obj,
424 |     features = ct,
425 |     alpha = c(0, 1),
426 |     # Remove background image
427 |     image.alpha = 0) +
428 |     theme(legend.title = ggplot2::element_blank())
429 |   
430 |   if (sum(se_obj@meta.data[, ct]) == 0) {
431 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0))
432 |   } else {
433 |     tmp_plt <- tmp_plt +
434 |     ggplot2::scale_fill_gradientn(
435 |       colors = grDevices::heat.colors(10, rev = TRUE),
436 |       # Same number of breaks for all plots
437 |       breaks = seq(min(se_obj@meta.data[, ct]),
438 |                    max(se_obj@meta.data[, ct]),
439 |                    length.out = 4),
440 |       # 2 decimals in the legend
441 |       labels = scaleFUN
442 |       # limits = c(0, 1)
443 |       )
444 |   }
445 |   
446 |   tmp_plt <- tmp_plt +
447 |     ggplot2::labs(title = ct) +
448 |     ggplot2::theme(
449 |       plot.title = ggplot2::element_text(hjust = 0.5, size = 20, face = "bold"),
450 |       legend.title = ggplot2::element_blank())
451 | 
452 | return(tmp_plt)
453 | })
454 | 
455 | interest_grid <- cowplot::plot_grid(plotlist = ct_plt_ls,
456 |                    axis = "trbl",
457 |                    align = "hv",
458 |                    nrow = 3,
459 |                    ncol = 3)
460 | 
461 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_immune_reference_arrangement_interest_{clust_vr}.pdf" %>%
462 |   glue::glue() %>%
463 |   here::here() %>%
464 |   cowplot::save_plot(
465 |     filename = .,
466 |     plot = interest_grid,
467 |     base_height = 18,
468 |     base_width = 15)
469 | ```
470 | 
471 | Then we will plot the spatial-scatterpie for both slices to get a gist of what is going on
472 | ```{r eval = FALSE}
473 | spsct_plt1 <- SPOTlight::spatial_scatterpie(
474 |   se_obj = se_obj,
475 |   cell_types_all = cell_types,
476 |   img_path = here::here("data/breast_visium/section_1/spatial/tissue_lowres_image.pdf"),
477 |   pie_scale = 0.4,
478 |   slice = sample_id)
479 | 
480 | if (clust_vr == "lv1_annot") {
481 |   spsct_plt1 <- spsct_plt1 +
482 |     ggplot2::scale_fill_manual(
483 |     values = col_df[col_df$plt_name %in% cell_types, "ct_col"],
484 |     breaks = cell_types) +
485 |     ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1))
486 | }
487 | 
488 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_spatial_scatterpie_{clust_vr}.pdf" %>%
489 |   glue::glue() %>%
490 |   here::here() %>%
491 |   cowplot::save_plot(
492 |     filename = .,
493 |     plot = spsct_plt1,
494 |     base_width = 12,
495 |     base_height = 9)
496 | ```
497 | 
498 | We will also plot the scatterpie without the slice image underneath
499 | ```{r}
500 | sct_plt1 <- SPOTlight::scatterpie_plot(se_obj = se_obj,
501 |                                        cell_types_all = ct_all,
502 |                                        pie_scale = 0.7,
503 |                                        slice = sample_id) +
504 |   ggplot2::scale_fill_manual(
505 |     values = col_df[col_df$plt_name %in% ct_all, "col_df"],
506 |     breaks = colnames(decon_mtrx)) +
507 |   ggplot2::coord_fixed(ratio = 1) +
508 |   ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1))
509 | 
510 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_scatterpie_{clust_vr}.pdf" %>%
511 |   glue::glue() %>%
512 |   here::here() %>%
513 |   cowplot::save_plot(
514 |     filename = .,
515 |     plot = sct_plt1,
516 |     base_width = 12,
517 |     base_height = 9)
518 | ```
519 | 
520 | We can also take a loot at the spatial scatterpie by looking at cell types which are not present throughout the entire tissue.
521 | ```{r}
522 | # Subset cell types from metadata
523 | metadata_subset <- se_obj@meta.data[, ct_all]
524 | 
525 | # Create masks
526 | keep_0.9 <- colSums(se_obj@meta.data[, ct_all] > 0) < 0.75 * ncol(se_obj)
527 | keep_g0 <- colSums(se_obj@meta.data[, ct_all] > 0) > 0
528 | 
529 | # Select cell types fullfiling the conditions
530 | ct_var <- colnames(decon_mtrx)[keep_0.9 & keep_g0]
531 | 
532 | col_df <- col_df %>% dplyr::arrange(plt_name)
533 | sct_plt_int <- SPOTlight::scatterpie_plot(se_obj = se_obj,
534 |                                        cell_types_all = ct_var,
535 |                                        pie_scale = 0.4) +
536 |   ggplot2::scale_fill_manual(
537 |     values = col_df[col_df$plt_name %in% ct_var, "col_vec"],
538 |     breaks = col_df[col_df$plt_name %in% ct_var, "plt_name"]) +
539 |   ggplot2::labs(fill = "") +
540 |   ggplot2::coord_fixed(ratio = 1) +
541 |   ggplot2::theme(legend.position = "top") +
542 |   ggplot2::guides(fill = ggplot2::guide_legend(nrow = 3))
543 | 
544 | "{an_breast_10x}/{plt_dir}/{sample_id}_10x_scatterpie_interest_{clust_vr}.pdf" %>%
545 |   glue::glue() %>%
546 |   here::here() %>%
547 |   cowplot::save_plot(
548 |     filename = .,
549 |     plot = sct_plt_int,
550 |     base_width = 12,
551 |     base_height = 9)
552 | ```
553 | 
554 | ### Gene - Cell correlation
555 | Check assumptions
556 | ```{r eval = FALSE}
557 | lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) {
558 |   # Shapiro-Wilk normality test for mpg
559 |   print(shapiro.test(se_obj@meta.data[, ct]))
560 |   ggpubr::ggqqplot(se_obj@meta.data[, ct])
561 | 
562 | })
563 | 
564 | lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) {
565 |   # Shapiro-Wilk normality test for mpg
566 |   print(shapiro.test(se_obj@assays$SCT@data[gene, ]))
567 |   ggpubr::ggqqplot(se_obj@assays$SCT@data[gene, ])
568 | 
569 | })
570 | ```
571 | 
572 | Next we want to check if the fibrotic marker TGFB1 with Treg and Cytotoxic CD8 T cells"
573 | ```{r eval = FALSE}
574 | library(ggpubr)
575 | 
576 | metadata <- dplyr::bind_cols(
577 |   se_obj@meta.data,
578 |   data.frame(t(se_obj@assays$SCT@data[c("ENTPD1", "NT5E", "TGFB1"), ])))
579 | 
580 | cor_plts <- lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) {
581 |   tmp_ls <- lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) {
582 |     
583 |     tmp_plt <- ggpubr::ggscatter(data = metadata,
584 |                       x = gene,
585 |                       y = ct,
586 |                       add = "reg.line") +
587 |        ggpubr::stat_cor(method = "pearson")
588 |     return(tmp_plt)
589 |     })
590 |   cowplot::plot_grid(plotlist = tmp_ls, ncol = 1)
591 |   })
592 |   
593 | tmp_arr <- cowplot::plot_grid(plotlist = cor_plts,
594 |                               nrow = 1,
595 |                               ncol = 2)
596 | tmp_arr
597 | ```
598 | 
599 | ## Session Info
600 | ```{r}
601 | sessionInfo()
602 | ```
603 | 


--------------------------------------------------------------------------------
/ST-breast/RCTD-10x_breast_immune_reference.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "lv1_annot"
 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 20 |                       message = FALSE, warning = FALSE)
 21 | options(width = 1200)
 22 | ```
 23 | 
 24 | ## Introduction
 25 | In this R markdo9wn document we will map immune cells onto the breast cancer tissue using the RCTD  tool.
 26 | Robust Cell Type Deconvolution is developed by Dylan M. Cable from Rafa Irizarry's lab
 27 | 
 28 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD).
 29 | 
 30 | 
 31 | ```{r}
 32 | library(Seurat)
 33 | library(tidyverse)
 34 | library(Matrix)
 35 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE)
 36 | library(RCTD) 
 37 | source(here::here("utils/bin.r"))
 38 | ```
 39 | 
 40 | ## Paths
 41 | ```{r}
 42 | source(here::here("misc/paths.R"))
 43 | 
 44 | "{an_breast_10x}/{robj_dir}" %>%
 45 |   glue::glue() %>%
 46 |   here::here() %>%
 47 |   dir.create(
 48 |     path = ,
 49 |     showWarnings = FALSE,
 50 |     recursive = TRUE)
 51 | 
 52 | "{an_breast_10x}/{plt_dir}" %>%
 53 |   glue::glue() %>%
 54 |   here::here() %>%
 55 |   dir.create(
 56 |     path = ,
 57 |     showWarnings = FALSE,
 58 |     recursive = TRUE)
 59 | ```
 60 | 
 61 | ## Parameters
 62 | ```{r}
 63 | set.seed(1243)
 64 | 
 65 | sample_id <- params$sample_id
 66 | # sample_id <- "breast_2"
 67 | clust_vr <- params$clust_vr
 68 | clust_vr <- "lv1_annot"
 69 | # clust_vr <- "lv2_annot"
 70 | 
 71 | trn <- "melanoma"
 72 | cl_n <- 100
 73 | hvg <- 3000
 74 | ntop <- NULL
 75 | transf <- "uv"
 76 | method <- "nsNMF"
 77 | min_cont <- 0
 78 | 
 79 | if (is.null(ntop)) {
 80 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}")
 81 | } else {
 82 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}")
 83 | }
 84 | ```
 85 | 
 86 | RCTD paths
 87 | ```{r}
 88 | refdir <- "{an_breast_10x}/{robj_dir}/RCTD_data/reference" %>%
 89 |   glue::glue() %>%
 90 |   here::here()
 91 | 
 92 | dir.create(path = refdir, showWarnings = FALSE, recursive = TRUE)
 93 | 
 94 | stdir <- "{an_breast_10x}/{robj_dir}/{sample_id}/RCTD_data/spatial" %>%
 95 |   glue::glue() %>%
 96 |   here::here()
 97 | 
 98 | dir.create(path = stdir, showWarnings = FALSE, recursive = TRUE)
 99 | ```
100 | 
101 | Define cell types of interest depending on the annotation level
102 | ```{r}
103 | if (clust_vr == "new_annot") {
104 |   ct_interest <- c("Plasma.B.cells", "T.regs", "T.helper.Th17",
105 |                  "Proliferation", "CD8.exhausted",
106 |                  "CD8.cytotoxic", "NK", "Macrophages.SPP1", "TAMs.C1QC")
107 | } else if (clust_vr == "old_cell_types") {
108 |   ct_interest <- c("Plasma B cells", "Regulatory T cells", "T helper cells",
109 |                  "Proliferation", "Pre-exhausted CD8 T cells",
110 |                  "Cytotoxic CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs")
111 | }
112 | ```
113 | 
114 | ## Load data
115 | Breast cancer data loaded here comes from 1-10x_breast_QC.Rmd
116 | ```{r eval = FALSE}
117 | ### Spatial breast cancer
118 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>%
119 |   glue::glue() %>%
120 |   here::here() %>%
121 |   readRDS(file = .)
122 | ```
123 | 
124 | 
125 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset.
126 | ```{r eval = FALSE}
127 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>%
128 |   glue::glue() %>%
129 |   here::here()
130 | 
131 | if (file.exists(ica_melanoma2_path)) {
132 |   
133 |   ica_sub <- readRDS(file = ica_melanoma2_path)
134 | } else {
135 |   # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds")
136 |   ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>%
137 |     here::here() %>%
138 |     readRDS(.)
139 |   ### Immune reference atlas
140 |   ica_sub <- subset(ica_se, subset = source == "melanoma2")
141 |   rm(ica_se)
142 |   
143 |   saveRDS(object = ica_sub, file = ica_melanoma2_path)
144 | }
145 | 
146 | table(ica_sub@meta.data$lv1_annot)
147 | table(ica_sub@meta.data$lv2_annot)
148 | ```
149 | 
150 | Remove MAST cells from the training set
151 | ```{r eval = FALSE}
152 | ica_sub <- subset(ica_sub, subset = lv1_annot != "Mast cells")
153 | # Create Seurat object with filtered gene matrix after removing Mast cells
154 | ica_sub <- Seurat::CreateSeuratObject(
155 |   counts = ica_sub@assays$RNA@counts[
156 |     sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ],
157 |   meta.data = ica_sub@meta.data)
158 | ```
159 | 
160 | Join proliferatiing cell types to get a proliferation signature
161 | ```{r eval = FALSE}
162 | prolif_vec <- c("T cells proliferative", "Macrophages and monocytes proliferative",
163 |                 "B cells proliferative", "Macrophages proliferative")
164 | 
165 | ica_sub[["cell_type_mod"]] <- dplyr::if_else(
166 |   ica_sub@meta.data[, clust_vr] %in% prolif_vec,
167 |   "Proliferation", as.character(ica_sub@meta.data[, clust_vr]))
168 | 
169 | ```
170 | 
171 | ## Prepare data for RCTD
172 | ### Reference scRNAseq
173 | We are going to follow the vignette steps.
174 | 
175 | In order to run RCTD, the first step is to process the single cell reference. Create a folder in ‘data/Reference’ e.g. ‘data/Reference/Vignette’ containing the following three files:
176 | 1. meta_data.csv: a CSV file (with 3 columns, with headers "barcode", "cluster", and "nUMI") containing the numeric cluster assignment for each cell. <br/>
177 | 2. cell_type_dict.csv: a CSV file (with 2 columns, with headers "Cluster" and "Name") containing the mapping between numeric cluster ID and cluster name. If you want a cluster to be filtered out of the single cell reference, you can leave the cluster name blank. The cell types must not contain the character ‘/’ or ‘-’. <br/>
178 | 3. dge.csv: a Digital Gene Expression (DGE) (barcodes by gene counts) CSV file in the standard 10x format. <br/>
179 | We use the dgeToSeurat function:
180 | 
181 | ```{r eval = FALSE}
182 | scRCTD_structure <- function(sc_obj, clust_vr) {
183 |   
184 |   sc_obj[["Name"]] = sc_obj@meta.data[, clust_vr]
185 |   
186 |   # Cell type dictionary between cluster and cell-type
187 |   ct <- unique(as.character(sc_obj@meta.data[, clust_vr]))
188 |   df_ct <- data.frame("Cluster" = 1:length(ct),
189 |                       "Name" = ct)
190 |   
191 |   # 
192 |   metadata <- sc_obj@meta.data %>%
193 |     # Rownames to columns must be before left join since after it the rownames are erased
194 |     tibble::rownames_to_column("barcode") %>%
195 |     dplyr::left_join(df_ct, by = c("Name" = "Name")) %>%
196 |     # Change names to “barcode”, “cluster”, “nUMI”
197 |     mutate(
198 |       cluster = Cluster,
199 |       nUMI = nCount_RNA
200 |       ) %>%
201 |     dplyr::select(barcode, cluster, nUMI)
202 |   
203 |   expr_mtrx <- sc_obj@assays$RNA@counts
204 |   
205 |   return(list("meta_data" = metadata,
206 |               "cell_type_dict" = df_ct,
207 |               "dge" = expr_mtrx))
208 | }
209 | ```
210 | 
211 | Save data to reference directory
212 | ```{r eval = FALSE}
213 | sc_ls <- scRCTD_structure(sc_obj = ica_sub, clust_vr = "cell_type_mod")
214 | 
215 | "{refdir}/meta_data.csv" %>%
216 |   glue::glue() %>%
217 |   here::here() %>%
218 |   readr::write_csv(x = sc_ls[[1]], file = .)
219 | 
220 | "{refdir}/cell_type_dict.csv" %>%
221 |   glue::glue() %>%
222 |   here::here() %>%
223 |   readr::write_csv(x = sc_ls[[2]], path = .)
224 | 
225 | dge_path <- "{refdir}/dge.csv" %>%
226 |   glue::glue() %>%
227 |   here::here()
228 | 
229 | sc_ls[[3]] %>%
230 |   data.frame() %>%
231 |   tibble::rownames_to_column("gene") %>%
232 |   readr::write_csv(x = .,
233 |                    path = dge_path,
234 |                    col_names = TRUE)
235 | 
236 | rm(list = ls("ica_sub", "sc_ls"))
237 | ```
238 | 
239 | **Of note, here go to analysis/tool_benchmarking/RCTD_data/reference/dge.csv and remove the column name gene by "". If you don't it will detect the column as a gene and RCTD::dgeToSeurat won't be able to load the data.**
240 | 
241 | ### Reference ST
242 | Next, put the SpatialRNA data in your ‘data/SpatialRNA’ directory (here ‘data/SpatialRNA/Vignette’). This needs to contain:
243 | 1. BeadLocationsForR.csv: a CSV file (with 3 columns, with headers “barcodes”, “xcoord”, and “ycoord”) containing the spatial locations of the pixels. <br/>
244 | 2. MappedDGEForR.csv: a DGE (gene counts by barcodes) CSV file. Represents raw counts at each pixel. <br/>
245 | 
246 | ```{r eval = FALSE}
247 | dgef_path <- "{stdir}/MappedDGEForR.csv" %>%
248 |   glue::glue() %>%
249 |   here::here()
250 | 
251 | se_obj@assays$Spatial@counts %>%
252 |   data.frame() %>%
253 |   tibble::rownames_to_column("gene") %>% 
254 |   dplyr::select(gene, everything()) %>% 
255 |   readr::write_csv(x = .,
256 |                    file = dgef_path,
257 |                    col_names = TRUE)
258 | 
259 | coord_path <- "{stdir}/BeadLocationsForR.csv" %>%
260 |   glue::glue() %>%
261 |   here::here()
262 | 
263 | # Extract spatial coordinates
264 | se_obj@images[[sample_id]]@coordinates %>%
265 |   dplyr::select(imagecol, imagerow) %>% 
266 |   tibble::rownames_to_column("barcodes") %>%
267 |   dplyr::rename(xcoord = imagecol, ycoord = imagerow) %>%
268 |   readr::write_csv(
269 |     x = .,
270 |     file = coord_path)
271 | 
272 | ```
273 | 
274 | ## RCTD deconvolution
275 | ### Read data in RCTD
276 | Read data in RCTD format
277 | ```{r}
278 | reference <- RCTD::dgeToSeurat(refdir = refdir)
279 | puck <- RCTD::read.SpatialRNA(datadir = stdir)
280 | ```
281 | 
282 | ### Creating and running RCTD
283 | We are now ready to create an RCTD object using the create.RCTD function. We simply need to pass in the SpatialRNA and scRNA-seq objects. There are several configuration options that can be set with this function:
284 | ```{r}
285 | myRCTD <- RCTD::create.RCTD(spatialRNA = puck,
286 |                             reference = reference,
287 |                             max_cores = 1,
288 |                             CELL_MIN = 18)
289 | ```
290 | 
291 | Now, we are ready to run RCTD, using the run.RCTD function. This function is equivalent to sequentially running the functions fitBulk, choose_sigma_c, and fitPixels. The doublet_mode argument sets whether RCTD will be run in ‘doublet mode’ (at most 1-2 cell types per pixel) or ‘full mode’ (no restrictions on number of cell types).
292 | ```{r}
293 | myRCTD <- RCTD::run.RCTD(RCTD = myRCTD,
294 |                          doublet_mode = FALSE)
295 | ```
296 | 
297 | Save RCTD resulting
298 | ```{r}
299 | "{an_breast_10x}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>%
300 |   glue::glue() %>%
301 |   here::here() %>%
302 |   saveRDS(object = myRCTD, file = .)
303 | ```
304 | 
305 | 
306 | ## Session Info
307 | ```{r}
308 | sessionInfo()
309 | ```
310 | 
311 | 


--------------------------------------------------------------------------------
/ST-breast/RCTD-SPOTlight_comparison.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "lv1_annot"
 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 20 |                       message = FALSE, warning = FALSE)
 21 | options(width = 1200)
 22 | ```
 23 | 
 24 | ## Introduction
 25 | In this R markdown document we will compare the deconvolution obtained from SPOTlight and RCTD.
 26 | 
 27 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD).
 28 | 
 29 | 
 30 | ```{r}
 31 | library(Seurat)
 32 | library(tidyverse)
 33 | library(Matrix)
 34 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE)
 35 | library(RCTD)
 36 | source(here::here("utils/bin.r"))
 37 | ```
 38 | 
 39 | ## Paths
 40 | ```{r}
 41 | source(here::here("misc/paths.R"))
 42 | 
 43 | "{an_breast_10x}/{robj_dir}" %>%
 44 |   glue::glue() %>%
 45 |   here::here() %>%
 46 |   dir.create(
 47 |     path = ,
 48 |     showWarnings = FALSE,
 49 |     recursive = TRUE)
 50 | 
 51 | "{an_breast_10x}/{plt_dir}" %>%
 52 |   glue::glue() %>%
 53 |   here::here() %>%
 54 |   dir.create(
 55 |     path = ,
 56 |     showWarnings = FALSE,
 57 |     recursive = TRUE)
 58 | ```
 59 | 
 60 | ## Parameters
 61 | ```{r}
 62 | set.seed(1243)
 63 | 
 64 | sample_id <- params$sample_id
 65 | # sample_id <- "breast_2"
 66 | # clust_vr <- params$clust_vr
 67 | clust_vr <- "lv1_annot"
 68 | # clust_vr <- "lv2_annot"
 69 | 
 70 | trn <- "melanoma"
 71 | cl_n <- 100
 72 | hvg <- 3000
 73 | ntop <- NULL
 74 | transf <- "uv"
 75 | method <- "nsNMF"
 76 | min_cont <- 0
 77 | 
 78 | if (is.null(ntop)) {
 79 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}")
 80 | } else {
 81 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}")
 82 | }
 83 | ```
 84 | 
 85 | Create a name/color dataframe
 86 | ```{r}
 87 | # source(here::here("misc/col_df_scrpt.R"))
 88 | # source(here::here("misc/col_df_scrpt2.R"))
 89 | col_df <- readRDS(here::here("misc/col_df.rds"))
 90 | ```
 91 | 
 92 | ## Load data
 93 | Load spatial object
 94 | ```{r}
 95 | ### Spatial breast cancer
 96 | se_obj <- "{an_breast_10x}/{robj_dir}/qc_se_{sample_id}.rds" %>%
 97 |   glue::glue() %>%
 98 |   here::here() %>%
 99 |   readRDS(file = .)
100 | ```
101 | 
102 | SPOTlight deconvolution
103 | ```{r}
104 | decon_mtrx_ls <- "{an_breast_10x}/{robj_dir}/decon_mtrx_breast_cancer_10x_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
105 |   glue::glue() %>%
106 |   here::here() %>%
107 |   readRDS(file = .)
108 | ```
109 | 
110 | RCTD deconvolution
111 | ```{r}
112 | myRCTD <- "{an_breast_10x}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>%
113 |   glue::glue() %>%
114 |   here::here() %>%
115 |   readRDS(file = .)
116 | ```
117 | 
118 | ## Analysis
119 | 
120 | ### Extract deconvolutions
121 | 
122 | SPOTlight extraction and renaming
123 | ```{r}
124 | decon_mtrx <- decon_mtrx_ls[[2]]
125 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"]
126 | 
127 | # Set as 0 cell types predicted to be under 5 % of the spot
128 | # decon_mtrx[decon_mtrx < 0.05] <- 0
129 | 
130 | # Change names to original ones and reorder according to Paula
131 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>%
132 |   dplyr::left_join(col_df, by = "ct_name") %>%
133 |   dplyr::pull(plt_name)
134 | 
135 | colnames(decon_mtrx) <- new_colnames
136 | 
137 | 
138 | colnames(decon_mtrx) <- glue::glue("SPOTlight-{colnames(decon_mtrx)}")
139 | rownames(decon_mtrx) <- colnames(se_obj)
140 | ```
141 | 
142 | RCTD extraction
143 | ```{r}
144 | RCTD_mtrx <- as.matrix(myRCTD@results[[1]])
145 | colnames(RCTD_mtrx) <- glue::glue("RCTD-{colnames(RCTD_mtrx)}")
146 | # RCTD_mtrx[RCTD_mtrx < 0.05] <- 0
147 | 
148 | # Normalize RCTD matrix to 1
149 | RCTD_mtrx <- RCTD_mtrx / rowSums(RCTD_mtrx)
150 | ```
151 | 
152 | Add deconvolution matrices to Seurat object
153 | ```{r}
154 | rownames(RCTD_mtrx) <- stringr::str_replace(
155 |   string = rownames(RCTD_mtrx),
156 |   pattern = "\\.",
157 |   replacement = "-")
158 | 
159 | barcodes <- purrr::reduce(list(rownames(se_obj@meta.data),
160 |                    rownames(RCTD_mtrx),
161 |                    rownames(decon_mtrx)),
162 |               dplyr::intersect)
163 | 
164 | RCTD_mtrx <- RCTD_mtrx[barcodes, ]
165 | decon_mtrx <- decon_mtrx[barcodes, ]
166 | 
167 | se_obj@meta.data <- cbind(se_obj@meta.data[barcodes, ],
168 |                           RCTD_mtrx[barcodes, ],
169 |                           decon_mtrx[barcodes, ])
170 | ```
171 | 
172 | ### Deconvolution comparison
173 | Here we are going to compare RCTD and SPOTlight deconvolutions by correlating the cell types
174 | ```{r fig.width=20, fig.height=20}
175 | cor_plots <- lapply(colnames(decon_mtrx), function(i) {
176 |   print(i)
177 |   i_spot <- i
178 |   i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2)
179 |   i_rctd <- glue::glue("RCTD-{i_sub[2]}")
180 |   
181 |   dat <- data.frame(
182 |     SPOTlight = decon_mtrx[, i_spot],
183 |     RCTD = RCTD_mtrx[, i_rctd],
184 |     check.names = FALSE)
185 |   
186 |   print(cor.test(decon_mtrx[, i_spot],
187 |            RCTD_mtrx[, i_rctd]))
188 |   
189 |   ggpubr::ggscatter(dat, x = "SPOTlight", y = "RCTD",
190 |      color = "black",
191 |      add = "reg.line",  # Add regressin line
192 |      add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
193 |      cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
194 |      cor.coeff.args = list(method = "pearson", label.sep = "\n"),
195 |      title = i_sub[2])
196 |   })
197 | 
198 | 
199 | plt_grid <- cowplot::plot_grid(plotlist = cor_plots,align = "hv", axis = "trbl")
200 | 
201 | "{an_breast_10x}/{plt_dir}/cor_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>%
202 |   glue::glue() %>%
203 |   here::here() %>%
204 |   cowplot::save_plot(
205 |     filename = .,
206 |     plot = plt_grid,
207 |     base_height = 20,
208 |     base_width = 20)
209 | ```
210 | 
211 | ### Visualize cell types
212 | Next we want to visualize the cell types side by side
213 | ```{r}
214 | tissue_plots <- lapply(colnames(decon_mtrx), function(i) {
215 |   print(i)
216 |   i_spot <- i
217 |   i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2)
218 |   i_rctd <- glue::glue("RCTD-{i_sub[2]}")
219 |   
220 |   Seurat::SpatialPlot(
221 |     object = se_obj,
222 |     features = c(i_spot, i_rctd),
223 |     alpha = c(0, 1),
224 |     image.alpha = 1)
225 | })
226 | 
227 | 
228 | tissue_grid <- cowplot::plot_grid(plotlist = tissue_plots,
229 |                                   align = "hv",
230 |                                   axis = "trbl")
231 | 
232 | "{an_breast_10x}/{plt_dir}/tissue_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>%
233 |   glue::glue() %>%
234 |   here::here() %>%
235 |   cowplot::save_plot(
236 |     filename = .,
237 |     plot = tissue_grid,
238 |     base_height = 25,
239 |     base_width = 35)
240 | ```
241 | 
242 | ## Session Info
243 | ```{r}
244 | sessionInfo()
245 | ```
246 | 


--------------------------------------------------------------------------------
/ST-breast/README.md:
--------------------------------------------------------------------------------
 1 | # Breast Invasive Ductal Carcinoma
 2 | 
 3 | ## Data description
 4 | This data corresponds to 2 serial breast carcinoma sections and comes from the 10x Genomics website with the following definition: 
 5 | 
 6 | *10x Genomics obtained fresh frozen Invasive Ductal Carcinoma breast tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides.*
 7 | 
 8 | *The tissue was AJCC/UICC Stage Group IIA, ER positive, PR negative, Her2 positive and annotated with:*
 9 | - *Ductal carcinoma in situ*
10 | - *Lobular carcinoma in situ*
11 | - *Invasive Carcinoma*
12 | 
13 | ## Data availability
14 | This data is freely available to download from the 10X Genomics website reference datasets and can be downloaded directly from their website - Slice 1 is available [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/1.0.0/V1_Breast_Cancer_Block_A_Section_1) and Slice 2 [here](https://support.10xgenomics.com/spatial-gene-expression/datasets/1.0.0/V1_Breast_Cancer_Block_A_Section_2).
15 | In this case we are using the data mapped using spaceranger 1.0.0.
16 | 
17 | ## Code
18 | Scripts *1-10x_breast_QC.Rmd* and *2-10x_breast_GO_enrichment.Rmd* are in charge of preprocessing the data and mapping the TCA immune cell states onto the tissue. 
19 | Scripts *3-10x_breast_immune_reference.Rmd*  and *4-10x_breast_stratification.Rmd*, in turn, plot the Figure panels of Figure 6 and Supplementary Figure 13-17.
20 | 
21 | ## Dependencies
22 | * [R 3.6.0](https://cran.r-project.org/)
23 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html)
24 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html)
25 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html)
26 | * [SPOTlight 1.0.0](https://github.com/MarcElosua/SPOTlight)
27 | * [Matrix 1.2.18](https://cran.r-project.org/web/packages/Matrix/index.html)
28 | * [svglite 1.2.3.2](https://cran.r-project.org/web/packages/svglite/index.html)
29 | * [ggcorrplot 0.1.3](https://cran.r-project.org/web/packages/ggcorrplot/index.html)
30 | * [cowplot 1.1.0](https://cran.r-project.org/web/packages/cowplot/index.html)
31 | 
32 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/1-australia_oroph_processing.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "1- Australia oropharingeal cancer preprocessing"
  3 | author: "Marc Elosua-Bayes"
  4 | date: '`r format(Sys.Date(), "%B %d, %Y")`'
  5 | output:
  6 |   BiocStyle::html_document:
  7 |     toc: yes
  8 |     toc_float: yes
  9 |     number_sections: yes
 10 |     df_print: paged
 11 | ---
 12 | 
 13 | ```{r setup, include=FALSE}
 14 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 15 |                       message = FALSE, warning = FALSE)
 16 | options(width = 1200)
 17 | ```
 18 | 
 19 | ## Introduction
 20 | In this Rmarkdown document we will load and preprocess samples 161429, 161430, 161431, and 161432.
 21 | These samples are human oropharyngeal cancer and were provided to use by Joseph Powell from the Garvand Institute of Medical Research.
 22 | 
 23 | Here is the methodology used to obtain and process this samples up to this point:
 24 | 
 25 | All patients provided informed consent for the collection of human specimens and data.  This was approved by the St Vincent’s Hospital Research Office (2019/PID04335) in accordance with the National Health and Medical Research Council’s National Statement of Ethical Conduct in Human Research.  Patients undergoing surgical resection for a locally advanced oropharyngeal cancer were recruited to the study.  After surgical removal, the anatomical pathologist dissected a sample of both the primary and nodal metastasis.   Samples were tumour banked in accordance with our ethically approved protocol.  
 26 | 
 27 | **Sample storage**
 28 | Within 30 minutes of collection, tumour samples were tumour banked.  Samples were cut into 1mm x 1mm chunks with a scalpel blade.    For Visium, a tissue chunk was snap frozen in OCT. After freezing, samples were moved to liquid nitrogen for long term storage.
 29 | 
 30 | Frozen tissue samples were processed using the Visium Spatial Gene Expression slide and reagent kit (10X Genomics, US) following the manufacturer’s instruction. Briefly, 10 μm sections were placed into the capture areas of the Visium slide. Tissue morphology was assessed with H&E staining and imaging using a Leica DM6000 microscope equipped with a 20x lens (Leica, DE). The imaged sections were then permeabilized for 12 minutes using the supplied reagents. The permeabilization condition was previously optimised using the Visium Spatial Tissue Optimisation slide and reagent kit (10X Genomics, US). After permeabilization, cDNA libraries were prepared, checked for quality and sequenced on a NovaSeq 6000 platform (Illumina, US). Around 300 million pair-ended reads were obtained for each tissue section. Read 1, i7 index and Read 2 were sequenced with 28, 8 and 98 cycles respectively. Reads were demultiplexed and mapped to the reference genome GRCh38 using the Space Ranger Software v1.0.0 (10X Genomics).
 31 | 
 32 | 
 33 | ## Libraries
 34 | ```{r}
 35 | library(Seurat)
 36 | library(dplyr)
 37 | library(cowplot)
 38 | source(here::here("misc/paths.R"))
 39 | source(here::here("utils/bin.R"))
 40 | source(here::here("utils/bin.r"))
 41 | ```
 42 | 
 43 | ## Create directory structure
 44 | ```{r}
 45 | "{an_oro}/{robj_dir}" %>%
 46 |   glue::glue() %>%
 47 |   here::here() %>%
 48 |   dir.create(
 49 |     path = ,
 50 |     showWarnings = FALSE,
 51 |     recursive = TRUE)
 52 | 
 53 | "{an_oro}/{plt_dir}" %>%
 54 |   glue::glue() %>%
 55 |   here::here() %>%
 56 |   dir.create(
 57 |     path = ,
 58 |     showWarnings = FALSE,
 59 |     recursive = TRUE)
 60 | ```
 61 | 
 62 | ## Parameters
 63 | Set common parameters for the entire document.
 64 | ```{r}
 65 | id_ls <- c("161429", "161430", "161431", "161432")
 66 | ```
 67 | 
 68 | ## Load data
 69 | We are going to start by loading the data
 70 | ```{r}
 71 | st_ls <- lapply(id_ls, function(id) {
 72 |   print(id)
 73 |   tmp_se <- Seurat::Load10X_Spatial(data.dir = here::here(sprintf("data/australia_hn/%s/", id)),
 74 |                                   filename = sprintf("%s_filtered_feature_bc_matrix.h5", id),
 75 |                                   slice = id)
 76 |   return(tmp_se)
 77 | })
 78 | names(st_ls) <- id_ls
 79 | ```
 80 | 
 81 | ## QC
 82 | 1st thing we do is remove empty gene
 83 | ```{r}
 84 | for (id in names(st_ls)) {
 85 |   table(rowSums(as.matrix(st_ls[[id]]@assays$Spatial@counts)) == 0)
 86 |   
 87 |   keep_genes <- rowSums(as.matrix(st_ls[[id]]@assays$Spatial@counts)) != 0
 88 |   st_ls[[id]] <- st_ls[[id]][keep_genes, ]
 89 | }
 90 | ```
 91 | 
 92 | Then we look at the number of reads, genes, and mitochondrial per spot to check how this tissue looks.
 93 | First we add the mitochondrial and ribosomal percentage within each spot.
 94 | ```{r}
 95 | for (id in names(st_ls)) {
 96 |   
 97 |   # store mitochondrial percentage in object meta data
 98 |   st_ls[[id]] <- Seurat::PercentageFeatureSet(st_ls[[id]],
 99 |                                      pattern = "^MT-",
100 |                                      col.name = "percent.mt")
101 |   
102 |   st_ls[[id]] <- Seurat::PercentageFeatureSet(st_ls[[id]],
103 |                                      pattern = "^RPS|^RPS",
104 |                                      col.name = "percent.rp")
105 | }
106 | ```
107 | 
108 | ### Visualize QC metrics
109 | #### Descriptive histograms
110 | ```{r}
111 | lapply(names(st_ls), function(id) {
112 |   print(id)
113 |   hist_qc <- QC_seurat_hist(se = st_ls[[id]],
114 |                           assay = "Spatial",
115 |                           slot = "counts",
116 |                           nfeat = "nFeature_Spatial",
117 |                           ncount = "nCount_Spatial",
118 |                           pctmt = "percent.mt",
119 |                           pctrp = "percent.rp")
120 |   
121 |   "{an_oro}/{plt_dir}/QC_histogram_{id}.pdf" %>%
122 |     glue::glue() %>%
123 |     here::here() %>%
124 |     cowplot::save_plot(
125 |       filename = .,
126 |       plot = hist_qc,
127 |       base_height = 9,
128 |       base_width = 16)
129 |   return(hist_qc)
130 | })
131 | ```
132 | 
133 | When we look at all the histograms we appreciate there is a peak on 0 *genes per spot* and *total counts per spot*, these we need to look at in more detail since they could be empty spots which we would discard.
134 | 
135 | #### Spatial visualization
136 | Next we want to visualize the spots on the tissue to assess their spatial behaviour. We are mainly interested to see where the empty spots map.
137 | ```{r}
138 | lapply(names(st_ls), function(id) {
139 |   print(id)
140 |   sp_qc <- Seurat::SpatialFeaturePlot(object =  st_ls[[id]],
141 |                                       features = c("nFeature_Spatial",
142 |                                                    "nCount_Spatial",
143 |                                                    "percent.mt",
144 |                                                    "percent.rp"))
145 |   
146 |   "{an_oro}/{plt_dir}/QC_spatial_{id}.pdf" %>%
147 |     glue::glue() %>%
148 |     here::here() %>%
149 |     cowplot::save_plot(
150 |       filename = .,
151 |       plot = sp_qc,
152 |       base_height = 12,
153 |       base_width = 12)
154 | 
155 |   return(sp_qc)
156 | })
157 | ```
158 | 
159 | We see that on all tissues there are spots with 0 or near 0 counts per spot. We will remove them by setting a thresholds since these are low quality spots that would distort the analysis.
160 | 
161 | We will subset and remove those spots with < 500 counts.
162 | ```{r}
163 | for (id in names(st_ls)) {
164 |   print(id)
165 |   st_ls[[id]] <- st_ls[[id]][, st_ls[[id]]$nCount_Spatial > 500]
166 | }
167 | ```
168 | 
169 | Take a new look at the spatial plots to assess how we removed those spots
170 | ```{r}
171 | lapply(names(st_ls), function(id) {
172 |   print(id)
173 |   sp_qc <- Seurat::SpatialFeaturePlot(object =  st_ls[[id]],
174 |                                       features = c("nFeature_Spatial",
175 |                                                    "nCount_Spatial",
176 |                                                    "percent.mt",
177 |                                                    "percent.rp"))
178 |   
179 |     "{an_oro}/{plt_dir}/QC_spatial_subset_{id}.pdf" %>%
180 |     glue::glue() %>%
181 |     here::here() %>%
182 |     cowplot::save_plot(
183 |       filename = .,
184 |       plot = sp_qc,
185 |       base_height = 12,
186 |       base_width = 12)
187 |   
188 |   return(sp_qc)
189 | })
190 | ```
191 | 
192 | 161430 and 161431 appear to have regions with very low reads so we'll keep an eye on these 2 slices specially in the downstream analysis.
193 | 
194 | ## Normalization
195 | Once we've subset the data to remove low quality spots we will scale, normalize and cluster the data.
196 | ```{r}
197 | for (id in names(st_ls)) {
198 |   print(id)
199 |   st_ls[[id]] <- Seurat::SCTransform(object = st_ls[[id]],
200 |                                      assay = "Spatial")
201 |   
202 |   # Dimensionality reduction and clustering
203 |   st_ls[[id]] <- Seurat::RunPCA(st_ls[[id]], assay = "SCT", verbose = TRUE)
204 |   st_ls[[id]] <- Seurat::FindNeighbors(st_ls[[id]], reduction = "pca", dims = 1:30)
205 |   st_ls[[id]] <- Seurat::FindClusters(st_ls[[id]], verbose = TRUE, resolution = c(0.09, 0.1, 0.25, 0.5, 0.75, 1))
206 |   st_ls[[id]] <- Seurat::RunUMAP(st_ls[[id]], reduction = "pca", dims = 1:30)
207 | }
208 | ```
209 | 
210 | Save processed dataset
211 | ```{r}
212 | for (id in names(st_ls)) {
213 |   "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{id}.RDS" %>%
214 |     glue::glue() %>%
215 |     here::here() %>%
216 |     saveRDS(
217 |       object = st_ls[[id]],
218 |       file = .)
219 | }
220 | ```
221 | 
222 | ### Cluster visualization
223 | Plot cluster on the spatial context at different resolutions.
224 | ```{r}
225 | lapply(names(st_ls), function(id) {
226 |   print(id)
227 |   
228 |   sp_clust <- Seurat::SpatialDimPlot(object =  st_ls[[id]],
229 |                                       group.by = c("SCT_snn_res.0.1",
230 |                                                    "SCT_snn_res.0.25")) +
231 |     geom_point(color = NA)
232 |   
233 |   "{an_oro}/{plt_dir}/Cluster_spatial_{id}.pdf" %>%
234 |     glue::glue() %>%
235 |     here::here() %>%
236 |     cowplot::save_plot(
237 |       filename = .,
238 |       plot = sp_clust,
239 |       base_height = 9,
240 |       base_width = 15)
241 | 
242 |   return(sp_clust)
243 | })
244 | ```
245 | 
246 | 
247 | ## Session Info
248 | ```{r}
249 | sessionInfo()
250 | ```
251 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/2-australia_oroph_biological.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 | title: "`r sprintf('2-Australia Oropharyngeal Biological processing %s', {params$sample_id})`"
 15 | ---
 16 | ```{r setup, include=FALSE}
 17 | knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
 18 | knitr::opts_knit$set(root.dir = "../..")
 19 | ```
 20 | 
 21 | ## Introduction
 22 | In this Rmarkdown document we are going to do some downstream analysis to try to elucidate what the clustered regions represent.
 23 | We are going to use the data obtained from script *1-australis_oroph_processing.Rmd*.
 24 | 
 25 | ## Libraries
 26 | ```{r}
 27 | library(Seurat)
 28 | library(dplyr)
 29 | library(ggplot2)
 30 | library(cowplot)
 31 | library(purrr)
 32 | library(DT)
 33 | library(org.Hs.eg.db)
 34 | library(GOstats)
 35 | library(progeny)
 36 | 
 37 | source(here::here("misc/paths.R"))
 38 | source(here::here("utils/bin.r"))
 39 | source(here::here("utils/bin.R"))
 40 | ```
 41 | 
 42 | ## Parameters
 43 | Set common parameters for the entire document.
 44 | ```{r}
 45 | id_ls <- c("161429", "161430", "161431", "161432")
 46 | sample <- params$sample_id
 47 | sample <- "161429"
 48 | ```
 49 | 
 50 | ## Load data
 51 | We are going to start by loading the data
 52 | ```{r}
 53 | sp_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample}.RDS" %>%
 54 |   glue::glue() %>%
 55 |   here::here() %>%
 56 |   readRDS(file = .)
 57 | ```
 58 | 
 59 | ## Go enrichment
 60 | ### Marker genes
 61 | Iterate over the spatial seurat objects, set Identities to the lowest resolution and save the marker genes.
 62 | ```{r}
 63 | Seurat::Idents(sp_obj) <- sp_obj@meta.data[, "SCT_snn_res.0.1"]
 64 | markers_all <- Seurat::FindAllMarkers(object = sp_obj,
 65 |                                       assay = "SCT",
 66 |                                       slot = "data",
 67 |                                       only.pos = TRUE)
 68 | 
 69 | "{an_oro}/{robj_dir}/markers_{sample}_res.0.1.rds" %>%
 70 |   glue::glue() %>%
 71 |   here::here() %>%
 72 |   saveRDS(
 73 |     object = tmp_markers,
 74 |     file = .)
 75 | ```
 76 | 
 77 | Interactive table to look at marker genes
 78 | ```{r}
 79 | DT::datatable(data = markers_all,
 80 |               filter = "top",
 81 |               options = list(pageLength = 20, autoWidth = TRUE))
 82 | ```
 83 | 
 84 | 
 85 | ```{r echo = FALSE}
 86 | # Extracting the markers from each cluster
 87 | # Function to pull the gene variable
 88 | pull_col <- function(.x) {
 89 |   return(.x %>% pull(var = "gene"))
 90 | }
 91 | 
 92 | unique_markers <- lapply(marker_ls, function(marker_df) {
 93 |   markers_tmp_ls <- marker_df %>%
 94 |   dplyr::filter(p_val < 0.01) %>%
 95 |   dplyr::group_by(cluster) %>% 
 96 |   dplyr::group_split() %>% 
 97 |   purrr::map(.f = pull_col)
 98 | 
 99 |   # Next we find unique marker genes for each clusters
100 |   unique_hn1_ls <- lapply(seq_len(length(markers_tmp_ls)), function(i) {
101 |     markers_tmp_ls[[i]][! markers_tmp_ls[[i]] %in% unique(unlist(markers_tmp_ls[-i]))]
102 |   })
103 |   return(unique_hn1_ls)
104 | 
105 | })
106 | 
107 | names(unique_markers) <- names(marker_ls)
108 | ```
109 | 
110 | ### Cluster annotation
111 | #### GO analysis
112 | For ALL the slices
113 | ```{r}
114 | DE_df <- lapply(names(marker_ls), function(nm) {
115 |   # print(nm)
116 |   # Extract objects of interest
117 |   markers_st <- marker_ls[[nm]]
118 |   sp_obj <- st_ls[[nm]]
119 |   
120 |   tmp_df <- lapply(unique(markers_st$cluster), function(clust) {
121 |     # Subset cluster of interest
122 |     gene_de <- markers_st %>%
123 |       dplyr::filter(cluster == all_of(clust)) %>%
124 |       dplyr::pull(gene) %>%
125 |       stringr::str_to_upper(.)
126 |     
127 |     go_clust <- gene_enrichment_GO(
128 |       gene_de = gene_de,
129 |       gene_universe = stringr::str_to_upper(rownames(sp_obj)))
130 |     
131 |     tmp <- summary(go_clust) %>%
132 |       dplyr::filter(Size <= 300 & Size >=33 & Pvalue < 0.01 & Count >= 3) %>% 
133 |       dplyr::mutate(
134 |         sample = nm,
135 |         cluster = clust)
136 |     
137 |     return(tmp)
138 |   }) %>% 
139 |   dplyr::bind_rows()
140 |   
141 |   return(tmp_df)
142 | }) %>% dplyr::bind_rows()
143 | 
144 | # Save DE gene analysis
145 | "{an_oro}/{robj_dir}/go_enrichment_all_res.0.1_{sample}.rds" %>%
146 |   glue::glue() %>%
147 |   here::here() %>%
148 |   saveRDS(
149 |     object = DE_df,
150 |     file = .)
151 | ```
152 | 
153 | Load enrichment RDS files
154 | ```{r}
155 | # Save DE gene analysis
156 | DE_df <- "{an_oro}/{robj_dir}/go_enrichment_all_res.0.1_{sample}.rds" %>%
157 |   glue::glue() %>%
158 |   here::here() %>%
159 |   readRDS(file = .)
160 | ```
161 | 
162 | ##### Visualization
163 | For all the slices
164 | ```{r}
165 | plt_ls <- lapply(unique(as.character(tmp_de$cluster)), function(i) {
166 |   # print(i)
167 |   tmp_plt <- DE_df %>%
168 |     dplyr::filter(cluster == i) %>%
169 |     dplyr::arrange(desc(OddsRatio)) %>%
170 |     head(25) %>% 
171 |     ggplot(.) +
172 |       geom_point(aes(x = OddsRatio,
173 |                      y = reorder(Term, OddsRatio),
174 |                      size = -Pvalue,
175 |                      color = Pvalue)) +
176 |       scale_color_gradient(low = "green",
177 |                            high = "red") +
178 |       labs(title = glue::glue("Sample: {sample}; Cluster: {i}"),
179 |            y = "")
180 |   # cowplot::save_plot(filename = sprintf("%s/%s/go_enrich_hn1_clust-%s.pdf",
181 |   # an_aussie, plt_dir, i),
182 |   #                    plot = tmp_plt)
183 |   })
184 | 
185 | "{an_oro}/{plt_dir}/go_enrich_res.0.1_{sample}.pdf" %>%
186 |   glue::glue() %>%
187 |   here::here() %>%
188 |   ggpubr::ggexport(
189 |     plotlist = plt_ls, 
190 |     filename = .,
191 |     width = 10,
192 |     height = 7)
193 | ```
194 | 
195 | ### PROGENy
196 | PROGENy (Pathway RespOnsive GENes) estimates the activity of relevant signaling pathways based on consensus gene signatures obtained from perturbation experiments, in other words, the footprint of the pathway on gene expression
197 | We compute PROGENy pathway activity scores on the scRNA-seq data, and we then characterice the different clusters based on these scores.
198 | ```{r}
199 | ## We compute the Progeny activity scores and add them to our Seurat object
200 | ## as a new assay called Progeny. 
201 | sp_obj <- progeny::progeny(
202 |   sp_obj,
203 |   scale = FALSE,
204 |   organism = "Human",
205 |   top = 500,
206 |   perm = 1,
207 |   assay_name = "Spatial",
208 |   return_assay = TRUE)
209 | 
210 | ## We can now directly apply Seurat functions in our Progeny scores. 
211 | ## For instance, we scale the pathway activity scores. 
212 | sp_obj <- Seurat::ScaleData(sp_obj, assay = "progeny") 
213 | ```
214 | 
215 | Next we create a data frame with the specification of the spot id that belong to each cluster to match with the Progeny scores.
216 | ```{r}
217 | ## We transform Progeny scores into a data frame to better handling the results
218 | progeny_scores_df <- 
219 |     as.data.frame(t(GetAssayData(sp_obj, slot = "scale.data", 
220 |         assay = "progeny"))) %>%
221 |     tibble::rownames_to_column("Cell") %>%
222 |     tidyr::gather(Pathway, Activity, -Cell)
223 | 
224 | Seurat::Idents(sp_obj) <- sp_obj@meta.data[, "SCT_snn_res.0.1"]
225 | ## We create a data frame with the specification of the spot id that belong to 
226 | ## each cluster to match with the Progeny scores.
227 | CellsClusters <- data.frame(Cell = names(Idents(sp_obj)), 
228 |     CellType = as.character(Idents(sp_obj)),
229 |     stringsAsFactors = FALSE)
230 | 
231 | ## We match Progeny scores with the cell clusters.
232 | progeny_scores_df <- inner_join(progeny_scores_df, CellsClusters, by = "Cell")
233 | 
234 | ## We summarize the Progeny scores by cellpopulation
235 | summarized_progeny_scores <- progeny_scores_df %>% 
236 |     group_by(Pathway, CellType) %>%
237 |     summarise(avg = mean(Activity), std = sd(Activity))
238 | ```
239 | 
240 | Lastly we want to visualize the pathways scores in a heatmap
241 | ```{r}
242 | ## We prepare the data for the plot
243 | summarized_progeny_scores_df <- summarized_progeny_scores %>%
244 |     dplyr::select(-std) %>%   
245 |     tidyr::spread(Pathway, avg) %>%
246 |     data.frame(row.names = 1, check.names = FALSE, stringsAsFactors = FALSE) 
247 | 
248 | paletteLength = 100
249 | myColor = colorRampPalette(c("Darkblue", "white","red"))(paletteLength)
250 | 
251 | progenyBreaks <- c(seq(min(summarized_progeny_scores_df), 0, 
252 |                       length.out = ceiling(paletteLength / 2) + 1),
253 |                   seq(max(summarized_progeny_scores_df) / paletteLength, 
254 |                       max(summarized_progeny_scores_df), 
255 |                       length.out = floor(paletteLength / 2)))
256 | 
257 | "{an_oro}/{plt_dir}/progeny_hmap_{sample}.png" %>%
258 |   glue::glue() %>%
259 |   here::here() %>%
260 |   pheatmap::pheatmap(
261 |     mat = t(summarized_progeny_scores_df[, -1]),
262 |     fontsize = 14,
263 |     fontsize_row = 10,
264 |     color = myColor,
265 |     breaks = progenyBreaks,
266 |     main = "PROGENy (500)",
267 |     angle_col = 45,
268 |     treeheight_col = 0,
269 |     border_color = NA,
270 |     filename = .)
271 | 
272 | progeny_hmap <- pheatmap::pheatmap(mat = t(summarized_progeny_scores_df[, -1]),
273 |                                  fontsize = 14,
274 |                                  fontsize_row = 10,
275 |                                  color = myColor,
276 |                                  breaks = progenyBreaks,
277 |                                  main = "PROGENy (500)",
278 |                                  angle_col = 45,
279 |                                  treeheight_col = 0,
280 |                                  border_color = NA)
281 | ```
282 | 
283 | ## Session Info
284 | ```{r}
285 | sessionInfo()
286 | ```
287 | 
288 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/3-australia_oroph_deconv.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "Default!"
 15 | title: "`r sprintf('3- Australia oropharyngeal cancer immune deconvolution %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE)
 20 | ```
 21 | 
 22 | ## Introduction
 23 | In this Rmarkdown I am going to map immune cells from TICA on to the spatial tissue using [SPOTlight](https://github.com/MarcElosua/SPOTlight).
 24 | 
 25 | ## Libraries
 26 | ```{r}
 27 | library(Seurat)
 28 | library(dplyr)
 29 | library(ggplot2)
 30 | library(purrr)
 31 | library(SPOTlight)
 32 | source(here::here("misc/paths.R"))
 33 | source(here::here("utils/bin.r"))
 34 | ```
 35 | 
 36 | ## Parameters
 37 | ```{r}
 38 | set.seed(1243)
 39 | 
 40 | sample_id <- params$sample_id
 41 | # sample_id <- "161429"
 42 | # sample_id <- "161430"
 43 | clust_vr <- params$clust_vr
 44 | # clust_vr <- "lv1_annot"
 45 | # clust_vr <- "lv2_annot"
 46 | 
 47 | trn <- "melanoma"
 48 | cl_n <- 100
 49 | hvg <- 3000
 50 | ntop <- NULL
 51 | transf <- "uv"
 52 | method <- "nsNMF"
 53 | min_cont <- 0
 54 | 
 55 | if (is.null(ntop)) {
 56 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s",
 57 |                           trn, cl_n, hvg, transf, method, min_cont)
 58 | } else {
 59 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s",
 60 |                           trn, cl_n, hvg, ntop, transf, method, min_cont)
 61 | }
 62 | ```
 63 | 
 64 | Define cell types of interest depending on the annotation level
 65 | ```{r}
 66 | if (sample_id == "161430") {
 67 |   ct_interest <- c("Plasma B cells", "T cells regulatory",
 68 |                    "CD8 pre-exhausted", "Macrophages SPP1", "TAMs C1QC",
 69 |                    "CD8 cytotoxic", "Proliferation", "CD4 naive-memory")
 70 | } else if (sample_id == "161429") {
 71 |   ct_interest <- c("B cells", "Plasma B cells", "T cells regulatory",
 72 |                    "CD8 terminally exhausted", "Macrophages SPP1", "TAMs C1QC",
 73 |                    "cDC", "Proliferation")
 74 | }
 75 | ```
 76 | 
 77 | ## Load data
 78 | We are going to start by loading the data from the previosu script *1-australis_oroph_processing.Rmd*.
 79 | 
 80 | ```{r}
 81 | ### Spatial oropharyngeal cancer
 82 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>%
 83 |   glue::glue() %>%
 84 |   here::here() %>%
 85 |   readRDS(file = .)
 86 | ```
 87 | 
 88 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset.
 89 | ```{r}
 90 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>%
 91 |   glue::glue() %>%
 92 |   here::here()
 93 | 
 94 | if (file.exists(ica_melanoma2_path)) {
 95 |   
 96 |   ica_sub <- readRDS(file = ica_melanoma2_path)
 97 | } else {
 98 |   # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds")
 99 |   ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>%
100 |     here::here() %>%
101 |     readRDS(.)
102 |   ### Immune reference atlas
103 |   ica_sub <- subset(ica_se, subset = source == "melanoma2")
104 |   rm(ica_se)
105 |   
106 |   saveRDS(object = ica_sub, file = ica_melanoma2_path)
107 | }
108 | 
109 | table(ica_sub@meta.data$lv1_annot)
110 | table(ica_sub@meta.data$lv2_annot)
111 | ```
112 | 
113 | Create a name/color dataframe
114 | ```{r}
115 | # source(here::here("misc/col_df_scrpt.R"))
116 | # source(here::here("misc/col_df_scrpt2.R"))
117 | 
118 | col_df <- readRDS(here::here("misc/col_df.rds"))
119 | ```
120 | 
121 | Change variable names to remove non-standard characters
122 | ```{r}
123 | ica_sub[["specific_cell_type_mod"]] <- stringr::str_replace_all(
124 |   string = as.character(ica_sub@meta.data[, clust_vr]),
125 |   pattern = "[[:punct:]]|[[:blank:]]",
126 |   replacement = ".")
127 | 
128 | prolif_vec <- c("T.cells.proliferative", "Macrophages.and.monocytes.proliferative",
129 |                 "B.cells.proliferative", "Macrophages.proliferative")
130 | 
131 | ica_sub[["specific_cell_type_mod"]] <- 
132 |   dplyr::if_else(ica_sub@meta.data[, "specific_cell_type_mod"] %in% prolif_vec,
133 |                  "Proliferation", ica_sub@meta.data[, "specific_cell_type_mod"])
134 | 
135 | table(ica_sub@meta.data$specific_cell_type_mod)
136 | ```
137 | 
138 | Immune cell marker genes
139 | ```{r eval = FALSE}
140 | ica_sub <- Seurat::SCTransform(object = ica_sub, assay = "RNA")
141 | 
142 | Seurat::Idents(ica_sub) <- as.character(ica_sub@meta.data[, "specific_cell_type_mod"])
143 | 
144 | ica_markers <- Seurat::FindAllMarkers(object = ica_sub,
145 |                        assay = "SCT",
146 |                        slot = "data",
147 |                        only.pos = TRUE,
148 |                        logfc.threshold = 0,
149 |                        min.pct = 0,
150 |                        max.cells.per.ident = 500)
151 | 
152 | 
153 | "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>%
154 |   glue::glue()%>%
155 |   here::here( ) %>%
156 |   saveRDS(
157 |     object = ica_markers,
158 |     file = .)
159 | ```
160 | 
161 | Load marker genes per cluster
162 | ```{r}
163 | ica_markers <- "data/immune_reference/ica_markers_melanoma2_{clust_vr}.rds" %>%
164 |   glue::glue()%>%
165 |   here::here() %>%
166 |   readRDS(file = .)
167 | 
168 | ica_markers <- ica_markers %>% dplyr::filter(cluster != "Mast.cells")
169 | ```
170 | 
171 | Look at the proliferation markers
172 | ```{r}
173 | ica_markers %>%
174 |   dplyr::filter(cluster == "Prolifeºration") %>%
175 |   head(20)
176 | ```
177 | 
178 | ## Deconvolution
179 | ```{r message = FALSE}
180 | # Remove Mast cells
181 | ica_sub <- subset(ica_sub, subset = specific_cell_type_mod != "Mast.cells")
182 | # Create Seurat object with filtered gene matrix after removing Mast cells
183 | ica_sub <- Seurat::CreateSeuratObject(
184 |   counts = ica_sub@assays$RNA@counts[
185 |     sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ],
186 |   meta.data = ica_sub@meta.data)
187 | 
188 | # Remove empty spatial genes
189 | # keep_genes <- rowSums(as.matrix(se_obj@assays$Spatial@counts)) != 0
190 | # table(rowSums(ica_sub@assays$RNA@counts) != 0)
191 | 
192 | # Run deconvolution
193 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution(
194 |   se_sc = ica_sub,
195 |   counts_spatial = se_obj@assays$Spatial@counts,
196 |   clust_vr = "specific_cell_type_mod",
197 |   cluster_markers = ica_markers,
198 |   cl_n = cl_n,
199 |   hvg = hvg,
200 |   ntop = ntop,
201 |   transf = transf,
202 |   method = method,
203 |   min_cont = min_cont,
204 |   assay = "RNA",
205 |   slot = "counts")
206 | 
207 | # saveRDS(object = decon_mtrx_ls,
208 | #         file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds",
209 | #                        an_oro, robj_dir, spotlight_id))
210 | # 
211 | # saveRDS(object = decon_mtrx_ls,
212 | #         file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds",
213 | #                        an_oro, robj_dir, spotlight_id))
214 | 
215 | "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
216 |   glue::glue() %>%
217 |   here::here() %>%
218 |   saveRDS(object = decon_mtrx_ls, file = .)
219 | ```
220 | 
221 | Add deconvolution matrix to Seurat object metadata
222 | ```{r}
223 | decon_mtrx_ls <- "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
224 |   glue::glue() %>%
225 |   here::here() %>%
226 |   readRDS(file = .)
227 | 
228 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s.rds",
229 | #                                         an_oro, robj_dir, spotlight_id))
230 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_pdac_atlas.rds", an_oro, robj_dir))
231 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_reg.rds",
232 | #                        an_oro, robj_dir, spotlight_id))
233 | # 
234 | # decon_mtrx_ls <- readRDS(file = sprintf("%s/%s/decon_mtrx_breast_cancer_10x_atlas_%s_latent.rds",
235 | #                        an_oro, robj_dir, spotlight_id))
236 | 
237 | decon_mtrx <- decon_mtrx_ls[[2]]
238 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"]
239 | 
240 | # Set as 0 cell types predicted to be under 3 % of the spot
241 | decon_mtrx[decon_mtrx < 0.03] <- 0
242 | ```
243 | 
244 | Change names to original ones and reorder according to Paula
245 | ```{r}
246 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>%
247 |   dplyr::left_join(col_df, by = "ct_name") %>%
248 |   dplyr::pull(plt_name)
249 | 
250 | colnames(decon_mtrx) <- new_colnames
251 | ```
252 | 
253 | Add deconvolution matrix to Seurat object metadata
254 | ```{r}
255 | se_obj@meta.data <- cbind(se_obj@meta.data, round(decon_mtrx, 3))
256 | ```
257 | 
258 | ### Visualization
259 | Before we start we take a look at the topic profiles
260 | ```{r}
261 | # Extract coefficient matrix
262 | h <- NMF::coef(decon_mtrx_ls[[1]][[1]])
263 | 
264 | # Extract coefficient labels
265 | train_labs <- data.frame(ct_name = decon_mtrx_ls[[1]][[2]]) %>%
266 |   dplyr::left_join(col_df, by = "ct_name") %>%
267 |   dplyr::pull(plt_name)
268 | 
269 | # rownames(h) <- paste("Topic", 1:nrow(h), sep = " ")
270 | profile_plt <- SPOTlight::dot_plot_profiles_fun(
271 |   h = h,
272 |   train_cell_clust = train_labs)
273 | 
274 | profile_plt[[1]]
275 | 
276 | "{an_oro}/{plt_dir}/all_ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>%
277 |   glue::glue() %>%
278 |   here::here() %>%
279 |   cowplot::save_plot(
280 |     filename = .,
281 |     plot = profile_plt[[1]],
282 |     base_height = 30,
283 |     base_width = 30)
284 | 
285 | h_df <- data.frame(t(h))
286 | 
287 | # Fix column names after converting to dataframe
288 | colnames(h_df) <- gsub(".", " ", colnames(h_df), fixed = TRUE)
289 | 
290 | # Get proportions for each row
291 | h_ds <- round(h_df/rowSums(h_df), 4)
292 | h_ds[, "clust_vr"] <- train_labs
293 | 
294 | 
295 | ct_topic_profiles <- h_ds %>%
296 |   dplyr::group_by(clust_vr) %>%
297 |   dplyr::summarise_all(list(median)) %>%
298 |   tibble::remove_rownames(.) %>%
299 |   tibble::column_to_rownames("clust_vr")
300 | 
301 | ct_topic_profiles <- ct_topic_profiles / rowSums(ct_topic_profiles)
302 | # In case a row is all 0
303 | ct_topic_profiles[is.na(ct_topic_profiles)] <- 0
304 | 
305 | cell_type_plt <- round(ct_topic_profiles, 2) %>%
306 |   tibble::rownames_to_column('Cell type') %>%
307 |   tidyr::pivot_longer(cols = -`Cell type`, names_to = "Topics") %>%
308 |   dplyr::mutate(
309 |     value_txt = dplyr::if_else(value > 0.1, round(value, 2), NULL),
310 |     Topics = factor(x = Topics,
311 |                     levels = stringr::str_sort(colnames(ct_topic_profiles),
312 |                                                numeric = TRUE)),
313 |     topics_num = stringr::str_replace_all(
314 |       string = Topics,
315 |       pattern =  "X",
316 |       replacement = ""),
317 |     topics_num = factor(topics_num, levels = as.character(1:length(unique(ct_topic_profiles))))
318 |     
319 |   ) %>%
320 |   ggplot2::ggplot(ggplot2::aes(x = `Cell type`, y = topics_num)) +
321 |   ggplot2::geom_point(ggplot2::aes(size = value, colour = value)) +
322 |   ggplot2::scale_color_continuous(low = "grey", high = "#59b371") +
323 |   ggplot2::theme_classic() +
324 |   ggplot2::labs(title = "NMF: Topic profiles by cell type") +
325 |   ggplot2::theme(
326 |     plot.title = ggplot2::element_text(hjust = 0.5, size = 20),
327 |     axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5),
328 |     axis.text = ggplot2::element_text(size = 15)) +
329 |   ggplot2::scale_size(range = c(0, 10)) +
330 |   ggplot2::guides(colour = ggplot2::guide_legend("Proportion"),
331 |                   size = ggplot2::guide_legend("Proportion")) +
332 |   ggplot2::labs(
333 |     title = glue::glue("NMF: Topic profiles by cell type"),
334 |     x = "",
335 |     y = "Topics") +
336 |   ggplot2::scale_x_discrete(
337 |     limits = unique(stringr::str_wrap(train_labs, width = 30))) +
338 |   # ggplot2::scale_y_discrete(breaks = c(1:length(unique(train_labs))))
339 |   ggplot2::theme(
340 |     axis.text = ggplot2::element_text(size = 22),
341 |     axis.text.x = ggplot2::element_text(hjust = 1, vjust = 0.5),
342 |     legend.title = ggplot2::element_text(size = 20, face = "bold"),
343 |     legend.text = ggplot2::element_text(size = 18),
344 |     axis.title.y = ggplot2::element_text(size = 26),
345 |     # plot.title = element_blank()
346 |     plot.title = ggplot2::element_text(size = 30))
347 | 
348 | cell_type_plt
349 | 
350 | "{an_oro}/{plt_dir}/ct_profiles_{spotlight_id}_{sample_id}_{clust_vr}.pdf" %>%
351 |   glue::glue() %>%
352 |   here::here() %>%
353 |   cowplot::save_plot(
354 |     filename = .,
355 |     plot = cell_type_plt,
356 |     base_height = 10,
357 |     base_width = 10)
358 | ```
359 | 
360 | Lastly we can take a look at which genes are the most important for each topic and get an insight into which genes are driving them.
361 | ```{r}
362 | basis_spotlight <- data.frame(NMF::basis(decon_mtrx_ls[[1]][[1]]))
363 | 
364 | colnames(basis_spotlight) <- unique(stringr::str_wrap(decon_mtrx_ls[[1]][[2]], width = 30))
365 | 
366 | basis_spotlight %>%
367 |   round(., 5) %>% 
368 |   DT::datatable(., filter = "top")
369 | ```
370 | 
371 | We will start by looking at the location of each cell type
372 | ```{r}
373 | # https://stackoverflow.com/questions/38722202/how-do-i-change-the-number-of-decimal-places-on-axis-labels-in-ggplot2
374 | # Our transformation function to ensure 2 decimals in the legend
375 | scaleFUN <- function(x) sprintf("%.2f", x)
376 | 
377 | ct_all <- colnames(decon_mtrx)
378 | ct_plt_ls <- lapply(ct_all, function(ct) {
379 |   tmp_plt <- Seurat::SpatialFeaturePlot(object = se_obj,
380 |                              features = ct,
381 |                              alpha = c(0, 1))
382 |   
383 |   if (sum(se_obj@meta.data[, ct]) == 0) {
384 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0))
385 |   } else {
386 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0, 1))
387 |   }
388 |   
389 |   tmp_plt <- tmp_plt +
390 |     ggplot2::theme(
391 |       legend.title = ggplot2::element_blank()
392 |       ) +
393 |     ggplot2::labs(title = ct) +
394 |     ggplot2::scale_fill_gradientn(
395 |       colors = grDevices::heat.colors(10, rev = TRUE),
396 |       # Same number of breaks for all plots
397 |       breaks = seq(min(se_obj@meta.data[, ct]),
398 |                    max(se_obj@meta.data[, ct]),
399 |                    length.out = 4),
400 |       # 2 decimals in the legend
401 |       labels = scaleFUN
402 |       # limits = c(0, 1)
403 |       )
404 | return(tmp_plt)
405 | })
406 | 
407 | ct_grid <- cowplot::plot_grid(
408 |   plotlist = ct_plt_ls,
409 |   axis = "trbl",
410 |   align = "hv",
411 |   nrow = 5,
412 |   ncol = 5)
413 | 
414 | "{an_oro}/{plt_dir}/{sample_id}_immune_reference_arrangement_{clust_vr}.pdf" %>%
415 |   glue::glue() %>%
416 |   here::here() %>%
417 |   cowplot::save_plot(
418 |     filename = .,
419 |     plot = ct_grid,
420 |     base_height = 25,
421 |     base_width = 25)
422 | ```
423 | 
424 | Next we will only plot the location of the cell types of interest for the main plot
425 | ```{r}
426 | # i <- unique(se_obj$slice)[1]
427 | # ct <- cell_types[[1]]
428 | ct_plt_ls <- lapply(ct_interest, function(ct) {
429 |   print(ct)
430 |   tmp_plt <- Seurat::SpatialPlot(
431 |     object = se_obj,
432 |     features = ct,
433 |     alpha = c(0, 1),
434 |     # Remove background image
435 |     image.alpha = 0,
436 |     crop = FALSE,
437 |     pt.size.factor = 1
438 |     ) +
439 |     theme(
440 |       legend.title = ggplot2::element_blank(),
441 |       legend.key.size = unit(1, 'cm'))
442 |   
443 |   if (sum(se_obj@meta.data[, ct]) == 0) {
444 |     tmp_plt <- tmp_plt + ggplot2::scale_alpha(range = c(0,0))
445 |   } else {
446 |     tmp_plt <- tmp_plt +
447 |     ggplot2::scale_fill_gradientn(
448 |       colors = grDevices::heat.colors(10, rev = TRUE),
449 |       # Same number of breaks for all plots with 2 decimals
450 |       breaks = round(seq(min(se_obj@meta.data[, ct]),
451 |                    max(se_obj@meta.data[, ct]),
452 |                    length.out = 3), 2),
453 |       # 2 decimals in the legend
454 |       labels = function(x) sapply(x, FUN = function(i) format(x = i, nsmall = 0))
455 |       # labels = scaleFUN
456 |       # limits = c(0, 1)
457 |       )
458 |   }
459 |   
460 |   tmp_plt <- tmp_plt +
461 |     ggplot2::labs(title = ct) +
462 |     ggplot2::theme(
463 |       plot.title = ggplot2::element_text(hjust = 0.5, size = 30, face = "bold"),
464 |       legend.title = ggplot2::element_blank(),
465 |       legend.text = ggplot2::element_text(size = 20))
466 | 
467 | return(tmp_plt)
468 | })
469 | 
470 | interest_grid <- cowplot::plot_grid(plotlist = ct_plt_ls,
471 |                    axis = "trbl",
472 |                    align = "hv",
473 |                    nrow = 4,
474 |                    ncol = 2)
475 | 
476 | "{an_oro}/{plt_dir}/{sample_id}_immune_reference_arrangement_interest_{clust_vr}.pdf" %>%
477 |   glue::glue() %>%
478 |   here::here() %>%
479 |   cowplot::save_plot(
480 |     filename = .,
481 |     plot = interest_grid,
482 |     base_height = 25,
483 |     base_width = 15)
484 | ```
485 | 
486 | Then we will plot the spatial-scatterpie for both slices to get a gist of what is going on
487 | ```{r eval = FALSE}
488 | spsct_plt1 <- SPOTlight::spatial_scatterpie(
489 |   se_obj = se_obj,
490 |   cell_types_all = cell_types,
491 |   img_path = here::here("data/breast_visium/section_1/spatial/tissue_lowres_image.pdf"),
492 |   pie_scale = 0.4,
493 |   slice = sample_id) +
494 |     ggplot2::scale_fill_manual(
495 |     values = col_df[col_df$plt_name %in% cell_types, "col_df"],
496 |     breaks = cell_types) +
497 |     ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1))
498 | 
499 | "{an_oro}/{plt_dir}/{sample_id}_spatial_scatterpie_{clust_vr}.pdf" %>%
500 |   glue::glue() %>%
501 |   here::here() %>%
502 |   cowplot::save_plot(
503 |     filename = .,
504 |     plot = spsct_plt1,
505 |     base_width = 12,
506 |     base_height = 9)
507 | ```
508 | 
509 | We will also plot the scatterpie without the slice image underneath
510 | ```{r}
511 | sct_plt1 <- SPOTlight::scatterpie_plot(se_obj = se_obj,
512 |                                        cell_types_all = ct_all,
513 |                                        pie_scale = 0.7,
514 |                                        slice = sample_id) +
515 |     ggplot2::scale_fill_manual(
516 |       values = col_df[col_df$plt_name %in% ct_all, "col_df"],
517 |       breaks = colnames(decon_mtrx)) +
518 |     ggplot2::coord_fixed(ratio = 1) +
519 |     ggplot2::guides(fill = ggplot2::guide_legend(ncol = 1))
520 | 
521 | "{an_oro}/{plt_dir}/{sample_id}_scatterpie_{clust_vr}.pdf" %>%
522 |   glue::glue() %>%
523 |   here::here() %>%
524 |   cowplot::save_plot(
525 |     filename = .,
526 |     plot = sct_plt1,
527 |     base_width = 12,
528 |     base_height = 9)
529 | ```
530 | 
531 | We can also take a loot at the spatial scatterpie by looking at cell types which are not present throughout the entire tissue.
532 | ```{r}
533 | # Subset cell types from metadata
534 | metadata_subset <- se_obj@meta.data[, ct_all]
535 | 
536 | # Create masks
537 | keep_0.75 <- colSums(se_obj@meta.data[, ct_all] > 0) < 0.75 * ncol(se_obj)
538 | keep_g0 <- colSums(se_obj@meta.data[, ct_all] > 0) > 0
539 | 
540 | # Select cell types fullfiling the conditions
541 | ct_var <- c(colnames(decon_mtrx)[keep_0.75 & keep_g0])
542 | 
543 | # Get color order
544 | col_vec <- data.frame(plt_name = ct_var) %>%
545 |   dplyr::left_join(col_df, by = "plt_name") %>%
546 |   dplyr::pull(col_df)
547 | 
548 | sct_plt_int <- SPOTlight::scatterpie_plot(
549 |   se_obj = se_obj,
550 |   cell_types_all = ct_var,
551 |   pie_scale = 0.4) +
552 |   ggplot2::scale_fill_manual(
553 |     values = col_vec,
554 |     breaks = ct_var) +
555 |   ggplot2::labs(fill = "") +
556 |   ggplot2::coord_fixed(ratio = 1) +
557 |   ggplot2::theme(legend.position = "top",
558 |                  legend.text = ggplot2::element_text(size = 12)) +
559 |   ggplot2::guides(fill = ggplot2::guide_legend(nrow = 3))
560 | 
561 | "{an_oro}/{plt_dir}/{sample_id}_scatterpie_interest_{clust_vr}.pdf" %>%
562 |   glue::glue() %>%
563 |   here::here() %>%
564 |   cowplot::save_plot(
565 |     filename = .,
566 |     plot = sct_plt_int,
567 |     base_width = 12,
568 |     base_height = 9)
569 | ```
570 | 
571 | ### Gene - Cell correlation
572 | Check assumptions
573 | ```{r eval = FALSE}
574 | 
575 | lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) {
576 |   # Shapiro-Wilk normality test for mpg
577 |   print(shapiro.test(se_obj@meta.data[, ct]))
578 |   ggpubr::ggqqplot(se_obj@meta.data[, ct])
579 | 
580 | })
581 | 
582 | lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) {
583 |   # Shapiro-Wilk normality test for mpg
584 |   print(shapiro.test(se_obj@assays$SCT@data[gene, ]))
585 |   ggpubr::ggqqplot(se_obj@assays$SCT@data[gene, ])
586 | 
587 | })
588 | ```
589 | 
590 | Next we want to check if the fibrotic marker TGFB1 with Treg and Cytotoxic CD8 T cells"
591 | ```{r eval = FALSE}
592 | library(ggpubr)
593 | 
594 | metadata <- dplyr::bind_cols(
595 |   se_obj@meta.data,
596 |   data.frame(t(se_obj@assays$SCT@data[c("ENTPD1", "NT5E", "TGFB1"), ])))
597 | 
598 | cor_plts <- lapply(c("Regulatory T cells", "Cytotoxic CD8 T cells"), function(ct) {
599 |   tmp_ls <- lapply(c("ENTPD1", "NT5E", "TGFB1"), function(gene) {
600 |     
601 |     tmp_plt <- ggpubr::ggscatter(data = metadata,
602 |                       x = gene,
603 |                       y = ct,
604 |                       add = "reg.line") +
605 |        ggpubr::stat_cor(method = "pearson")
606 |     return(tmp_plt)
607 |     })
608 |   cowplot::plot_grid(plotlist = tmp_ls, ncol = 1)
609 |   })
610 |   
611 | tmp_arr <- cowplot::plot_grid(plotlist = cor_plts,
612 |                               nrow = 1,
613 |                               ncol = 2)
614 | tmp_arr
615 | ```
616 | 
617 | ## Session Info
618 | ```{r}
619 | sessionInfo()
620 | ```
621 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/RCTD-SPOTlight_comparison.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "lv1_annot"
 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 20 |                       message = FALSE, warning = FALSE)
 21 | options(width = 1200)
 22 | ```
 23 | 
 24 | ## Introduction
 25 | In this R markdown document we will compare the deconvolution obtained from SPOTlight and RCTD.
 26 | 
 27 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD).
 28 | 
 29 | 
 30 | ```{r}
 31 | library(Seurat)
 32 | library(tidyverse)
 33 | library(Matrix)
 34 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE)
 35 | library(RCTD)
 36 | source(here::here("utils/bin.r"))
 37 | ```
 38 | 
 39 | ## Paths
 40 | ```{r}
 41 | source(here::here("misc/paths.R"))
 42 | 
 43 | "{an_oro}/{robj_dir}" %>%
 44 |   glue::glue() %>%
 45 |   here::here() %>%
 46 |   dir.create(
 47 |     path = ,
 48 |     showWarnings = FALSE,
 49 |     recursive = TRUE)
 50 | 
 51 | "{an_oro}/{plt_dir}" %>%
 52 |   glue::glue() %>%
 53 |   here::here() %>%
 54 |   dir.create(
 55 |     path = ,
 56 |     showWarnings = FALSE,
 57 |     recursive = TRUE)
 58 | ```
 59 | 
 60 | ## Parameters
 61 | ```{r}
 62 | set.seed(1243)
 63 | 
 64 | sample_id <- params$sample_id
 65 | # sample_id <- "161429"
 66 | clust_vr <- params$clust_vr
 67 | # clust_vr <- "new_annot"
 68 | # clust_vr <- "old_cell_types"
 69 | 
 70 | trn <- "melanoma"
 71 | cl_n <- 100
 72 | hvg <- 3000
 73 | ntop <- NULL
 74 | transf <- "uv"
 75 | method <- "nsNMF"
 76 | min_cont <- 0
 77 | 
 78 | if (is.null(ntop)) {
 79 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-NULL_transf-{transf}_method-{method}_mincont-{min_cont}")
 80 | } else {
 81 |   spotlight_id <- glue::glue("trn-{trn}_cln-{cl_n}_hvg-{hvg}_ntop-{ntop}_transf-{transf}_method-{method}_mincont-{min_cont}")
 82 | }
 83 | ```
 84 | 
 85 | Create a name/color dataframe
 86 | ```{r}
 87 | # source(here::here("misc/col_df_scrpt.R"))
 88 | # source(here::here("misc/col_df_scrpt2.R"))
 89 | col_df <- readRDS(here::here("misc/col_df.rds"))
 90 | ```
 91 | 
 92 | ## Load data
 93 | Load spatial object
 94 | ```{r}
 95 | ### Spatial breast cancer
 96 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>%
 97 |   glue::glue() %>%
 98 |   here::here() %>%
 99 |   readRDS(file = .)
100 | ```
101 | 
102 | SPOTlight deconvolution
103 | ```{r}
104 | decon_mtrx_ls <- "{an_oro}/{robj_dir}/decon_mtrx_atlas_{sample_id}_{spotlight_id}_{clust_vr}.rds" %>%
105 |   glue::glue() %>%
106 |   here::here() %>%
107 |   readRDS(file = .)
108 | ```
109 | 
110 | RCTD deconvolution
111 | ```{r}
112 | myRCTD <- "{an_oro}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>%
113 |   glue::glue() %>%
114 |   here::here() %>%
115 |   readRDS(file = .)
116 | ```
117 | 
118 | ## Analysis
119 | 
120 | ### Extract deconvolutions
121 | 
122 | SPOTlight extraction and renaming
123 | ```{r}
124 | decon_mtrx <- decon_mtrx_ls[[2]]
125 | decon_mtrx <- decon_mtrx[, colnames(decon_mtrx) != "res_ss"]
126 | 
127 | # Set as 0 cell types predicted to be under 5 % of the spot
128 | # decon_mtrx[decon_mtrx < 0.05] <- 0
129 | 
130 | # Change names to original ones and reorder according to Paula
131 | new_colnames <- data.frame(ct_name = colnames(decon_mtrx), stringsAsFactors = FALSE) %>%
132 |   dplyr::left_join(col_df, by = "ct_name") %>%
133 |   dplyr::pull(plt_name)
134 | 
135 | colnames(decon_mtrx) <- new_colnames
136 | colnames(decon_mtrx) <- glue::glue("SPOTlight-{colnames(decon_mtrx)}")
137 | rownames(decon_mtrx) <- colnames(se_obj)
138 | ```
139 | 
140 | RCTD extraction
141 | ```{r}
142 | RCTD_mtrx <- as.matrix(myRCTD@results[[1]])
143 | colnames(RCTD_mtrx) <- glue::glue("RCTD-{colnames(RCTD_mtrx)}")
144 | # RCTD_mtrx[RCTD_mtrx < 0.05] <- 0
145 | 
146 | # Normalize RCTD matrix to 1
147 | RCTD_mtrx <- RCTD_mtrx / rowSums(RCTD_mtrx)
148 | ```
149 | 
150 | Add deconvolution matrices to Seurat object
151 | ```{r}
152 | rownames(RCTD_mtrx) <- stringr::str_replace(
153 |   string = rownames(RCTD_mtrx),
154 |   pattern = "\\.",
155 |   replacement = "-")
156 | 
157 | barcodes <- purrr::reduce(list(rownames(se_obj@meta.data),
158 |                    rownames(RCTD_mtrx),
159 |                    rownames(decon_mtrx)),
160 |               dplyr::intersect)
161 | 
162 | RCTD_mtrx <- RCTD_mtrx[barcodes, ]
163 | decon_mtrx <- decon_mtrx[barcodes, ]
164 | 
165 | se_obj@meta.data <- cbind(se_obj@meta.data[barcodes, ],
166 |                           RCTD_mtrx[barcodes, ],
167 |                           decon_mtrx[barcodes, ])
168 | 
169 | 
170 | ```
171 | 
172 | ### Deconvolution comparison
173 | Here we are going to compare RCTD and SPOTlight deconvolutions by correlating the cell types
174 | ```{r fig.width=20, fig.height=20}
175 | cor_plots <- lapply(colnames(decon_mtrx), function(i) {
176 |   print(i)
177 |   i_spot <- i
178 |   i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2)
179 |   i_rctd <- glue::glue("RCTD-{i_sub[2]}")
180 |   
181 |   dat <- data.frame(
182 |     SPOTlight = decon_mtrx[, i_spot],
183 |     RCTD = RCTD_mtrx[, i_rctd],
184 |     check.names = FALSE)
185 |   
186 |   print(cor.test(decon_mtrx[, i_spot],
187 |            RCTD_mtrx[, i_rctd]))
188 |   
189 |   ggpubr::ggscatter(dat, x = "SPOTlight", y = "RCTD",
190 |      color = "black",
191 |      add = "reg.line",  # Add regressin line
192 |      add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
193 |      cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
194 |      cor.coeff.args = list(method = "pearson", label.sep = "\n"),
195 |      title = i_sub[2])
196 |   })
197 | 
198 | 
199 | plt_grid <- cowplot::plot_grid(plotlist = cor_plots,align = "hv", axis = "trbl")
200 | 
201 | "{an_oro}/{plt_dir}/cor_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>%
202 |   glue::glue() %>%
203 |   here::here() %>%
204 |   cowplot::save_plot(
205 |     filename = .,
206 |     plot = plt_grid,
207 |     base_height = 20,
208 |     base_width = 20)
209 | ```
210 | 
211 | ### Visualize cell types
212 | Next we want to visualize the cell types side by side
213 | ```{r}
214 | tissue_plots <- lapply(colnames(decon_mtrx), function(i) {
215 |   print(i)
216 |   i_spot <- i
217 |   i_sub <- stringr::str_split(string = i, pattern = "-", simplify = TRUE, n = 2)
218 |   i_rctd <- glue::glue("RCTD-{i_sub[2]}")
219 |   
220 |   Seurat::SpatialPlot(
221 |     object = se_obj,
222 |     features = c(i_spot, i_rctd),
223 |     alpha = c(0, 1),
224 |     image.alpha = 1)
225 | })
226 | 
227 | 
228 | tissue_grid <- cowplot::plot_grid(plotlist = tissue_plots,
229 |                                   align = "hv",
230 |                                   axis = "trbl")
231 | 
232 | "{an_oro}/{plt_dir}/tissue_grid_RCTD-SPOTlight_{sample_id}_{clust_vr}.pdf" %>%
233 |   glue::glue() %>%
234 |   here::here() %>%
235 |   cowplot::save_plot(
236 |     filename = .,
237 |     plot = tissue_grid,
238 |     base_height = 25,
239 |     base_width = 35)
240 | ```
241 | 
242 | ## Session Info
243 | ```{r}
244 | sessionInfo()
245 | ```
246 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/RCTD-australia_oroph_deconv.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | author: "Marc Elosua-Bayes"
  3 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 |     toc_float: yes
  8 |     number_sections: yes
  9 |     df_print: paged
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | params: 
 13 |     sample_id: "Default!"
 14 |     clust_vr: "lv1_annot"
 15 | title: "`r sprintf('RCTD-10x_breast_immune_reference %s', {params$sample_id})`"
 16 | ---
 17 | 
 18 | ```{r setup, include=FALSE}
 19 | knitr::opts_chunk$set(echo = TRUE, out.width = "100%", fig.align='center', 
 20 |                       message = FALSE, warning = FALSE)
 21 | options(width = 1200)
 22 | ```
 23 | 
 24 | ## Introduction
 25 | In this R markdo9wn document we will map immune cells onto the Oropharyngeal cancer tissue using the RCTD  tool.
 26 | Robust Cell Type Deconvolution is developed by Dylan M. Cable from Rafa Irizarry's lab
 27 | 
 28 | RCTD original paper can be found [here](https://doi.org/10.1038/s41587-021-00830-w) and the GitHub repo [here](https://github.com/dmcable/RCTD).
 29 | 
 30 | 
 31 | ```{r}
 32 | library(Seurat)
 33 | library(tidyverse)
 34 | library(Matrix)
 35 | # devtools::install_github("dmcable/RCTD", build_vignettes = TRUE)
 36 | library(RCTD)
 37 | source(here::here("utils/bin.r"))
 38 | ```
 39 | 
 40 | ## Paths
 41 | ```{r}
 42 | source(here::here("misc/paths.R"))
 43 | 
 44 | "{an_breast_10x}/{robj_dir}" %>%
 45 |   glue::glue() %>%
 46 |   here::here() %>%
 47 |   dir.create(
 48 |     path = ,
 49 |     showWarnings = FALSE,
 50 |     recursive = TRUE)
 51 | 
 52 | "{an_breast_10x}/{plt_dir}" %>%
 53 |   glue::glue() %>%
 54 |   here::here() %>%
 55 |   dir.create(
 56 |     path = ,
 57 |     showWarnings = FALSE,
 58 |     recursive = TRUE)
 59 | ```
 60 | 
 61 | ## Parameters
 62 | ```{r}
 63 | set.seed(1243)
 64 | 
 65 | sample_id <- params$sample_id
 66 | # sample_id <- "161429"
 67 | # sample_id <- "161430"
 68 | # clust_vr <- params$clust_vr
 69 | clust_vr <- "lv1_annot"
 70 | # clust_vr <- "lv2_annot"
 71 | ```
 72 | 
 73 | RCTD paths
 74 | ```{r}
 75 | refdir <- "{an_breast_10x}/{robj_dir}/RCTD_data/reference" %>%
 76 |   glue::glue() %>%
 77 |   here::here()
 78 | 
 79 | dir.create(path = refdir, showWarnings = FALSE, recursive = TRUE)
 80 | 
 81 | stdir <- "{an_oro}/{robj_dir}/{sample_id}/RCTD_data/spatial" %>%
 82 |   glue::glue() %>%
 83 |   here::here()
 84 | 
 85 | dir.create(path = stdir, showWarnings = FALSE, recursive = TRUE)
 86 | ```
 87 | 
 88 | ## Load data
 89 | 
 90 | We are going to start by loading the data from the previosu script *1-australis_oroph_processing.Rmd*.
 91 | ```{r eval = FALSE}
 92 | ### Spatial oropharyngeal cancer
 93 | se_obj <- "{an_oro}/{robj_dir}/processed_sp_oropharyngeal_{sample_id}.RDS" %>%
 94 |   glue::glue() %>%
 95 |   here::here() %>%
 96 |   readRDS(file = .)
 97 | ```
 98 | 
 99 | Subset ICA to just use cells from the melanoma subset since we have a good representation of cells for each cluster from this dataset.
100 | ```{r eval = FALSE}
101 | ica_melanoma2_path <- "{an_breast_10x}/{robj_dir}/ica_melanoma2.rds" %>%
102 |   glue::glue() %>%
103 |   here::here()
104 | 
105 | if (file.exists(ica_melanoma2_path)) {
106 |   
107 |   ica_sub <- readRDS(file = ica_melanoma2_path)
108 | } else {
109 |   # ica_se <- readRDS("/scratch/devel/pnieto/TIL_Atlas/TICA/output/integrated_renamed_filtered.rds")
110 |   ica_se <- "data/immune_reference/integrated_clustered_complete.rds" %>%
111 |     here::here() %>%
112 |     readRDS(.)
113 |   ### Immune reference atlas
114 |   ica_sub <- subset(ica_se, subset = source == "melanoma2")
115 |   rm(ica_se)
116 |   
117 |   saveRDS(object = ica_sub, file = ica_melanoma2_path)
118 | }
119 | 
120 | table(ica_sub@meta.data$lv1_annot)
121 | table(ica_sub@meta.data$lv2_annot)
122 | ```
123 | 
124 | Remove MAST cells from the training set
125 | ```{r eval = FALSE}
126 | ica_sub <- subset(ica_sub, subset = lv1_annot != "Mast cells")
127 | # Create Seurat object with filtered gene matrix after removing Mast cells
128 | ica_sub <- Seurat::CreateSeuratObject(
129 |   counts = ica_sub@assays$RNA@counts[
130 |     sparseMatrixStats::rowSums2(ica_sub@assays$RNA@counts) != 0, ],
131 |   meta.data = ica_sub@meta.data)
132 | ```
133 | 
134 | Join proliferatiing cell types to get a proliferation signature
135 | ```{r eval = FALSE}
136 | prolif_vec <- c("T cells proliferative", "Macrophages and monocytes proliferative",
137 |                 "B cells proliferative", "Macrophages proliferative")
138 | 
139 | ica_sub[["cell_type_mod"]] <- dplyr::if_else(
140 |   ica_sub@meta.data[, clust_vr] %in% prolif_vec,
141 |   "Proliferation", as.character(ica_sub@meta.data[, clust_vr]))
142 | 
143 | ```
144 | 
145 | ## Prepare data for RCTD
146 | ### Reference scRNAseq
147 | We are going to follow the vignette steps.
148 | 
149 | In order to run RCTD, the first step is to process the single cell reference. Create a folder in ‘data/Reference’ e.g. ‘data/Reference/Vignette’ containing the following three files:
150 | 1. meta_data.csv: a CSV file (with 3 columns, with headers "barcode", "cluster", and "nUMI") containing the numeric cluster assignment for each cell. <br/>
151 | 2. cell_type_dict.csv: a CSV file (with 2 columns, with headers "Cluster" and "Name") containing the mapping between numeric cluster ID and cluster name. If you want a cluster to be filtered out of the single cell reference, you can leave the cluster name blank. The cell types must not contain the character ‘/’ or ‘-’. <br/>
152 | 3. dge.csv: a Digital Gene Expression (DGE) (barcodes by gene counts) CSV file in the standard 10x format. <br/>
153 | We use the dgeToSeurat function:
154 | 
155 | ```{r eval = FALSE}
156 | scRCTD_structure <- function(sc_obj, clust_vr) {
157 |   
158 |   sc_obj[["Name"]] = sc_obj@meta.data[, clust_vr]
159 |   
160 |   # Cell type dictionary between cluster and cell-type
161 |   ct <- unique(as.character(sc_obj@meta.data[, clust_vr]))
162 |   df_ct <- data.frame("Cluster" = 1:length(ct),
163 |                       "Name" = ct)
164 |   
165 |   # 
166 |   metadata <- sc_obj@meta.data %>%
167 |     # Rownames to columns must be before left join since after it the rownames are erased
168 |     tibble::rownames_to_column("barcode") %>%
169 |     dplyr::left_join(df_ct, by = c("Name" = "Name")) %>%
170 |     # Change names to “barcode”, “cluster”, “nUMI”
171 |     mutate(
172 |       cluster = Cluster,
173 |       nUMI = nCount_RNA
174 |       ) %>%
175 |     dplyr::select(barcode, cluster, nUMI)
176 |   
177 |   expr_mtrx <- sc_obj@assays$RNA@counts
178 |   
179 |   return(list("meta_data" = metadata,
180 |               "cell_type_dict" = df_ct,
181 |               "dge" = expr_mtrx))
182 | }
183 | ```
184 | 
185 | Save data to reference directory
186 | ```{r eval = FALSE}
187 | sc_ls <- scRCTD_structure(sc_obj = ica_sub, clust_vr = "cell_type_mod")
188 | 
189 | "{refdir}/meta_data.csv" %>%
190 |   glue::glue() %>%
191 |   here::here() %>%
192 |   readr::write_csv(x = sc_ls[[1]], file = .)
193 | 
194 | "{refdir}/cell_type_dict.csv" %>%
195 |   glue::glue() %>%
196 |   here::here() %>%
197 |   readr::write_csv(x = sc_ls[[2]], file = .)
198 | 
199 | dge_path <- "{refdir}/dge.csv" %>%
200 |   glue::glue() %>%
201 |   here::here()
202 | 
203 | sc_ls[[3]] %>%
204 |   data.frame() %>%
205 |   tibble::rownames_to_column("gene") %>%
206 |   readr::write_csv(x = .,
207 |                    file = dge_path,
208 |                    col_names = TRUE)
209 | 
210 | ```
211 | 
212 | **Of note, here go to analysis/tool_benchmarking/RCTD_data/reference/dge.csv and remove the column name gene by "". If you don't it will detect the column as a gene and RCTD::dgeToSeurat won't be able to load the data.**
213 | 
214 | ### Reference ST
215 | Next, put the SpatialRNA data in your ‘data/SpatialRNA’ directory (here ‘data/SpatialRNA/Vignette’). This needs to contain:
216 | 1. BeadLocationsForR.csv: a CSV file (with 3 columns, with headers “barcodes”, “xcoord”, and “ycoord”) containing the spatial locations of the pixels. <br/>
217 | 2. MappedDGEForR.csv: a DGE (gene counts by barcodes) CSV file. Represents raw counts at each pixel. <br/>
218 | 
219 | ```{r eval = FALSE}
220 | dgef_path <- "{stdir}/MappedDGEForR.csv" %>%
221 |   glue::glue() %>%
222 |   here::here()
223 | 
224 | se_obj@assays$Spatial@counts %>%
225 |   data.frame() %>%
226 |   tibble::rownames_to_column("gene") %>% 
227 |   dplyr::select(gene, everything()) %>% 
228 |   readr::write_csv(x = .,
229 |                    file = dgef_path,
230 |                    col_names = TRUE)
231 | 
232 | coord_path <- "{stdir}/BeadLocationsForR.csv" %>%
233 |   glue::glue() %>%
234 |   here::here()
235 | 
236 | # Extract spatial coordinates
237 | image_id <- names(se_obj@images)
238 | se_obj@images[[image_id]]@coordinates %>%
239 |   dplyr::select(imagecol, imagerow) %>% 
240 |   tibble::rownames_to_column("barcodes") %>%
241 |   dplyr::rename(xcoord = imagecol, ycoord = imagerow) %>%
242 |   readr::write_csv(
243 |     x = .,
244 |     file = coord_path)
245 | 
246 | ```
247 | 
248 | ## RCTD deconvolution
249 | ### Read data in RCTD
250 | Read data in RCTD format
251 | ```{r}
252 | reference <- RCTD::dgeToSeurat(refdir = refdir)
253 | puck <- RCTD::read.SpatialRNA(datadir = stdir)
254 | ```
255 | 
256 | ### Creating and running RCTD
257 | We are now ready to create an RCTD object using the create.RCTD function. We simply need to pass in the SpatialRNA and scRNA-seq objects. There are several configuration options that can be set with this function:
258 | ```{r}
259 | myRCTD <- RCTD::create.RCTD(spatialRNA = puck,
260 |                             reference = reference,
261 |                             max_cores = 1,
262 |                             CELL_MIN = 18)
263 | ```
264 | 
265 | Now, we are ready to run RCTD, using the run.RCTD function. This function is equivalent to sequentially running the functions fitBulk, choose_sigma_c, and fitPixels. The doublet_mode argument sets whether RCTD will be run in ‘doublet mode’ (at most 1-2 cell types per pixel) or ‘full mode’ (no restrictions on number of cell types).
266 | ```{r}
267 | myRCTD <- RCTD::run.RCTD(RCTD = myRCTD,
268 |                          doublet_mode = FALSE)
269 | ```
270 | 
271 | Save RCTD resulting
272 | ```{r}
273 | "{an_oro}/{robj_dir}/RCTDobj_{sample_id}_{clust_vr}.rds" %>%
274 |   glue::glue() %>%
275 |   here::here() %>%
276 |   saveRDS(object = myRCTD, file = .)
277 | ```
278 | 
279 | 
280 | ## Session Info
281 | ```{r}
282 | sessionInfo()
283 | ```
284 | 
285 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/README.md:
--------------------------------------------------------------------------------
 1 | # Oropharyngeal 
 2 | 
 3 | ## Data description
 4 | **Data acquisition**
 5 | All patients provided informed consent for the collection of human specimens and data.
 6 | This was approved by the St Vincent’s Hospital Research Office (2019/PID04335) in accordance with the National Health and Medical Research Council’s National Statement of Ethical Conduct in Human Research.  Patients undergoing surgical resection for a locally advanced oropharyngeal cancer were recruited to the study.  After surgical removal, the anatomical pathologist dissected a sample of both the primary and nodal metastasis.
 7 | Samples were tumour banked in accordance with our ethically approved protocol.
 8 | 
 9 | **Sample storage**
10 | Within 30 minutes of collection, tumour samples were tumour banked.  Samples were cut into 1mm x 1mm chunks with a scalpel blade.
11 | For Visium, a tissue chunk was snap frozen in OCT. After freezing, samples were moved to liquid nitrogen for long term storage.
12 | 
13 | **Visium Spatial Gene Expression**
14 | Frozen tissue samples were processed using the Visium Spatial Gene Expression slide and reagent kit (10X Genomics, US) following the manufacturer’s instruction. 
15 | Briefly, 10 μm sections were placed into the capture areas of the Visium slide.
16 | Tissue morphology was assessed with H&E staining and imaging using a Leica DM6000 microscope equipped with a 20x lens (Leica, DE).
17 | The imaged sections were then permeabilized for 12 minutes using the supplied reagents.
18 | The permeabilization condition was previously optimised using the Visium Spatial Tissue Optimisation slide and reagent kit (10X Genomics, US).
19 | After permeabilization, cDNA libraries were prepared, checked for quality and sequenced on a NovaSeq 6000 platform (Illumina, US).
20 | Around 300 million pair-ended reads were obtained for each tissue section. Read 1, i7 index and Read 2 were sequenced with 28, 8 and 98 cycles respectively.
21 | 
22 | ## Data availability
23 | Pending to get approval to post to GEO.
24 | 
25 | ## Code
26 | Scripts *1-_australia_oroph_processing.Rmd*, *2-australia_oroph_biological.Rmd*, and *3-australia_oroph_deconv.Rmd* are in charge of preprocessing and mapping 
27 | the TICA immune cell states to the tissue slices. *4-australia_oro_srtatification.Rmd*  and *5-australia_oro_plots.Rmd*, in turn, are in charge of making the plots for Figure 5 and Supplementary Figures 8-12.
28 | 
29 | ## Dependencies
30 | * [R 3.6.0](https://cran.r-project.org/)
31 | * [Seurat 3.2.0](https://cran.r-project.org/web/packages/Seurat/index.html)
32 | * [tidyverse 1.3.0](https://cran.r-project.org/web/packages/tidyverse/index.html)
33 | * [ggpubr 0.3.0](https://cran.r-project.org/web/packages/ggpubr/index.html)
34 | * [SPOTlight 1.0.0](https://github.com/MarcElosua/SPOTlight)
35 | * [Matrix 1.2.18](https://cran.r-project.org/web/packages/Matrix/index.html)
36 | * [svglite 1.2.3.2](https://cran.r-project.org/web/packages/svglite/index.html)
37 | * [ggcorrplot 0.1.3](https://cran.r-project.org/web/packages/ggcorrplot/index.html)
38 | * [cowplot 1.1.0](https://cran.r-project.org/web/packages/cowplot/index.html)
39 | 
40 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/make_oroph_deconv.cmd:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | pipe_dir='/scratch/devel/melosua/phd/projects/TICA/misc/jobs/pipeline_dir'
 4 | 
 5 | pipe_fn="$pipe_dir/pipe_spotlight_oroph.pipe"
 6 | rm "$pipe_fn"
 7 | touch "$pipe_fn"
 8 | 
 9 | trn='melanoma'
10 | for spatial in 161429 161430 161431 161432
11 | do
12 | printf "$spatial.decon\t$spatial\t-\t.\tn\t8:00:00\t1\t12\t.\tmodule purge; module load gsl/1.9_64 gsl/2.4 gcc/6.3.0 gmp/6.1.2 R/3.6.0 hdf5/1.10.1; analysis/aussie_oro/spotlight_deconv_oroph_job.R $trn $spatial\n" >> "$pipe_fn"
13 | done
14 | /home/devel/melosua/bin/cnag_pipeline.pl "$pipe_fn"
15 | 


--------------------------------------------------------------------------------
/ST-oropharyngeal/spotlight_deconv_oroph_job.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Train Prostate model
 4 | library(SPOTlight)
 5 | library(NMF)
 6 | library(Seurat)
 7 | library(dplyr)
 8 | 
 9 | # Set parameters
10 | # trn <- "melanoma"
11 | cl_n <- 100
12 | hvg <- 3000
13 | ntop <- NULL
14 | transf <- "uv"
15 | method <- "nsNMF"
16 | min_cont <- 0
17 | clust_vr <- "new_cell_types"
18 | 
19 | args <- commandArgs(trailingOnly=TRUE)
20 | # trn <- "melanoma"
21 | # spatial <- "161432"
22 | trn <- args[[1]]
23 | spatial <- args[[2]]
24 | 
25 | if (is.null(ntop)) {
26 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s",
27 |                           trn, cl_n, hvg, transf, method, min_cont)
28 | } else {
29 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s",
30 |                           trn, cl_n, hvg, ntop, transf, method, min_cont)
31 | }
32 | 
33 | source("misc/paths.R")
34 | source("utils/bin.r")
35 | source("utils/spatial_plot_spaniel.R")
36 | 
37 | # Load data
38 | ### Spatial data
39 | st_ls <- readRDS(file = sprintf("%s/%s/processed_st_ls_oropharyngeal.RDS",
40 |                                 an_oro, robj_dir))
41 | 
42 | st_se <- st_ls[[spatial]]
43 | 
44 | ### Immune reference atlas
45 | if (trn == "full") {
46 |   ica_se <- readRDS("data/immune_reference/atlas_250_specific_cell_type.rds")
47 | } else if (trn == "breast") {
48 |   ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_breast.rds")
49 | } else if (trn == "melanoma") {
50 |   ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_melanoma.rds")
51 | }  else if (trn == "pancreas") {
52 |   ica_se <- readRDS("data/immune_reference/atlas_complete_annotation_pancreas.rds")
53 | }
54 | 
55 | # ica_se <- ica_se[, ica_se$new_cell_types != "MAST"]
56 | ica_se[["specific_cell_type_mod"]] <- gsub(pattern = "[[:punct:]]|[[:blank:]]", ".",
57 |                                            x = as.character(ica_se@meta.data[, clust_vr]),
58 |                                            perl = TRUE)
59 | 
60 | ### Markers TICA
61 | if (trn == "full") {
62 |   ica_markers <- readRDS(file = "data/immune_reference/ica_markers_all.RDS")
63 | } else if (trn == "breast") {
64 |   ica_markers <- readRDS(file = "data/immune_reference/ica_markers_breast.RDS")
65 | } else if (trn == "melanoma") {
66 |   ica_markers <- readRDS(file = "data/immune_reference/ica_markers_melanoma.RDS")
67 | } else if (trn == "pancreas") {
68 |   ica_markers <- readRDS(file = "data/immune_reference/ica_markers_melanoma.RDS")
69 | }
70 | 
71 | # ica_markers <- ica_markers %>% filter(cluster != "MAST")
72 | 
73 | deconv_ls <- SPOTlight::spotlight_deconvolution(se_sc = ica_se,
74 |                                                 counts_spatial = st_se@assays$Spatial@counts,
75 |                                                 clust_vr = "specific_cell_type_mod",
76 |                                                 cluster_markers = ica_markers,
77 |                                                 cl_n = cl_n,
78 |                                                 hvg = hvg,
79 |                                                 ntop = ntop,
80 |                                                 transf = transf,
81 |                                                 method = method,
82 |                                                 min_cont = min_cont,
83 |                                                 assay = "RNA",
84 |                                                 slot = "counts")
85 | 
86 | saveRDS(object = deconv_ls,
87 |         file = sprintf("%s/%s/spotlight_deconv_ls_%s_%s.RDS",
88 |                        an_oro, robj_dir, spatial, spotlight_id))
89 | 


--------------------------------------------------------------------------------
/misc/SPOTlight_deconvolution_job.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ## Load libraries
  4 | library(SPOTlight)
  5 | library(dplyr)
  6 | 
  7 | ## Set parameters
  8 | source("misc/paths.R")
  9 | dir.create(path = sprintf("%s/%s", an_breast_10x, robj_dir),
 10 |            showWarnings = FALSE,
 11 |            recursive = TRUE)
 12 | 
 13 | dir.create(path = sprintf("%s/%s", an_breast_10x, plt_dir),
 14 |            showWarnings = FALSE,
 15 |            recursive = TRUE)
 16 | 
 17 | cl_n <- 100
 18 | hvg <- 3000
 19 | ntop <- NULL
 20 | transf <- "uv"
 21 | method <- "nsNMF"
 22 | min_cont <- 0
 23 | clust_vr <- "new_cell_types"
 24 | 
 25 | ### Variable passed from command line to determine the SC and spatial dataset  
 26 | args <- commandArgs(trailingOnly=TRUE)
 27 | # trn <- "melanoma"
 28 | # spatial <- "breast"
 29 | trn <- args[[1]]
 30 | spatial <- args[[2]]
 31 | 
 32 | 
 33 | 
 34 | if (is.null(ntop)) {
 35 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-NULL_transf-%s_method-%s_mincont-%s",
 36 |                           trn, cl_n, hvg, transf, method, min_cont)
 37 | } else {
 38 |   spotlight_id <- sprintf("trn-%s_cln-%s_hvg-%s_ntop-%s_transf-%s_method-%s_mincont-%s",
 39 |                           trn, cl_n, hvg, ntop, transf, method, min_cont)
 40 | }
 41 | 
 42 | 
 43 | ## Load data
 44 | ### Reference scRNAseq dataset
 45 | if (trn == "full") {
 46 |   print("Loading full 250 TICA")
 47 |   ica_se <- readRDS(file = "data/immune_reference/ica_se_full_processed.RDS")
 48 | } else {
 49 |   print(sprintf("Loading: data/immune_reference/ica_se_%s_processed.RDS",trn))
 50 |   ica_se <- readRDS(file = sprintf("data/immune_reference/ica_se_%s_processed.RDS",
 51 |                                    trn))
 52 | }
 53 | 
 54 | ### Cell type markers obtained from Seruat::FindAllMarkers
 55 | if (trn == "full") {
 56 |   print("Loading full TICA markers")
 57 |   ica_markers <- readRDS(file = "data/immune_reference/ica_markers_all.RDS")
 58 | } else {
 59 |   print(sprintf("Loading: data/immune_reference/ica_markers_%s.RDS",
 60 |                 trn))
 61 |   ica_markers <- readRDS(file = sprintf("data/immune_reference/ica_markers_%s.RDS",
 62 |                                         trn))
 63 | }
 64 | 
 65 | ### Spatial data
 66 | if (spatial == "breast") {
 67 |   print("Loading breast spatial")
 68 |   spatial_se <- readRDS(file = sprintf("%s/%s/breast_merged_processed.RDS",
 69 |                                     an_breast_10x, robj_dir))
 70 | } else if (spatial == "hn") {
 71 |   print("Loading hn spatial")
 72 |   spatial_se <- readRDS(file = sprintf("%s/%s/hn1_processed.RDS",
 73 |                                  an_aussie, robj_dir))
 74 | } else if (spatial == "pdac") {
 75 |   print("Loading pdac spatial")
 76 |   
 77 | } else if (spatial == "prostate") {
 78 |   print("Loading prostate spatial")
 79 |   
 80 | }
 81 | 
 82 | # Run deconvolution
 83 | ## Here we set transf = RAW because when we carried out SCTransform on the SC and spatial dataset we already performed the normalization.
 84 | ## That is why we point directly to the SCT assay and data slot 
 85 | decon_mtrx_ls <- SPOTlight::spotlight_deconvolution(
 86 |   se_sc = ica_se,
 87 |   counts_spatial = spatial_se@assays$SCT@data,
 88 |   clust_vr = "specific_cell_type_mod",
 89 |   cluster_markers = ica_markers,
 90 |   cl_n = cl_n,
 91 |   hvg = hvg,
 92 |   ntop = ntop,
 93 |   transf = "raw", # Set to raw 
 94 |   method = method,
 95 |   min_cont = min_cont,
 96 |   assay = "SCT",
 97 |   slot = "data")
 98 | 
 99 | saveRDS(object = decon_mtrx_ls,
100 |         file = sprintf("%s/%s/decon_mtrx_TICA_%s_%s.RDS",
101 |                        an_breast_10x, robj_dir, spatial, spotlight_id))
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/misc/atlas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Single-Cell-Genomics-Group-CNAG-CRG/Tumor-Immune-Cell-Atlas/f309409beb058ede05dfc85927b952cda51fc119/misc/atlas.png


--------------------------------------------------------------------------------
/misc/col_df_scrpt.R:
--------------------------------------------------------------------------------
 1 | # library(RColorBrewer)
 2 | library(pals)
 3 | library(dplyr)
 4 | # n <- 60
 5 | # qual_col_pals <- RColorBrewer::brewer.pal.info[RColorBrewer::brewer.pal.info$category == 'qual',]
 6 | # Load color palette
 7 | col_pal_v <- readRDS(file = "data/immune_reference/cell_type_palette.rds")
 8 | 
 9 | # Save as df
10 | col_df <- data.frame(ct_col = col_pal_v, stringsAsFactors = FALSE) %>%
11 |   tibble::rownames_to_column("plt_name") %>%
12 |   dplyr::mutate(
13 |     ct_name =  gsub(pattern = "[[:punct:]]|[[:blank:]]", ".",
14 |                     x = plt_name, 
15 |                     perl = TRUE),
16 |     plt_name = dplyr::if_else(plt_name == "Central memory CD4 T cells", 
17 |                                "Transitional Memory CD4 T cells",
18 |                                plt_name),
19 |     ct_name = dplyr::case_when(
20 |       plt_name == "Recently activated CD4 T cells" ~ "Effector.precursor.CD4.T.cells",
21 |       plt_name == "B cells" ~ "B.cell",
22 |       plt_name == "Proliferative B cells" ~ "Proliferative.B.cell",
23 |       plt_name == "Naive-memory CD4 T cells" ~ "Effector.memory.CD4.T.cells",
24 |       plt_name == "Plasma B cells" ~ "Plasma.B",
25 |       plt_name == "Proinflamatory TAMs" ~ "Proinflamatory.TAMs.and.neutrophils",
26 |       # plt_name == "Central memory CD4 T cells" ~ "Transitional.Memory.CD4.T.cells",
27 |       plt_name == "Mast cells" ~ "MAST",
28 |       TRUE ~ ct_name)
29 |     ) %>%
30 |   dplyr::filter(plt_name != "Unknown")
31 | 
32 | # col_vector  <-  as.vector(pals::polychrome())
33 | paula_order_old <- c("B cell", "Proliferative B cell", "Plasma B", "Naive T cells", "Regulatory T cells", "T helper cells", "Th17 cells", "Proliferative T cells", "Effector precursor CD4 T cells", "Effector memory CD4 T cells", "Central memory CD4 T cells", "Pre-exhausted CD8 T cells", "Cytotoxic CD8 T cells", "Effector memory CD8 T cells", "Terminally exhausted CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs", "Proinflamatory TAMs and neutrophils", "Proliferative monocytes and macrophages", "Monocytes", "cDC", "pDC", "mDC", "MAST")
34 | paula_order <- c("B cells", "Proliferative B cells", "Plasma B cells", "Naive T cells", "Regulatory T cells", "T helper cells", "Th17 cells", "Proliferative T cells", "Recently activated CD4 T cells", "Naive-memory CD4 T cells", "Transitional Memory CD4 T cells", "Pre-exhausted CD8 T cells", "Cytotoxic CD8 T cells", "Effector memory CD8 T cells", "Terminally exhausted CD8 T cells", "NK", "SPP1 TAMs", "M2 TAMs", "Proinflamatory TAMs", "Proliferative monocytes and macrophages", "Monocytes", "cDC", "pDC", "mDC", "Mast cells")
35 | 
36 | # col_df <- data.frame(ct_name = gsub(pattern = "[[:punct:]]|[[:blank:]]", ".",
37 | #                                     x = paula_order_old,
38 | #                                     perl = TRUE),
39 | #                      plt_name = paula_order_old,
40 | #                      ct_col = col_vector[1:length(paula_order_old)],
41 | #                      stringsAsFactors = FALSE)
42 | # 
43 | # # Change label names
44 | # col_df <- col_df %>%
45 | #   dplyr::mutate(plt_name = dplyr::if_else(plt_name == "Effector precursor CD4 T cells",
46 | #                                           "Recently activated CD4 T cells", plt_name))
47 | 


--------------------------------------------------------------------------------
/misc/paths.R:
--------------------------------------------------------------------------------
 1 | # Variables
 2 | ver <- "2020-06-04"
 3 | 
 4 | ## PATHS and common variables
 5 | version_dir <- sprintf("%s",ver)
 6 | dir.create(path = version_dir, 
 7 |            showWarnings = F, 
 8 |            recursive = T)
 9 | 
10 | ## Create directory for plots
11 | plt_dir <- sprintf("%s/plots_%s", ver, ver) 
12 | # dir.create(path = plt_dir, 
13 | #            showWarnings = FALSE, 
14 | #            recursive = TRUE)
15 | 
16 | ## Create directory for RDS objects
17 | robj_dir <- sprintf("%s/R_objects_%s", ver, ver)
18 | # dir.create(path = robj_dir, 
19 | #            showWarnings = FALSE, 
20 | #            recursive = TRUE)
21 | 
22 | ## Paths to all the folders
23 | an <- "analysis"
24 | an_aussie <- sprintf("%s/aussie_ln", an)
25 | an_oro <- sprintf("%s/aussie_oro", an)
26 | an_melanoma <- sprintf("%s/melanoma", an)
27 | an_prostate <- sprintf("%s/prostate", an)
28 | an_breast_10x <- sprintf("%s/breast_10x", an)
29 | an_pdac <- sprintf("%s/pdac_st", an)
30 | an_epid <- sprintf("%s/epid_27", an)
31 | 


--------------------------------------------------------------------------------
/utils/process_gene_names.R:
--------------------------------------------------------------------------------
 1 | # ideally, this should be done before creating the Seurat object,
 2 | # but because we used already processed data (not raw fastqs), in most cases this wasn't possible
 3 | # this function filters dataset genes by removing genes expressed in few cells
 4 | # and replacing non accepted genes with their accepted version
 5 | 
 6 | library(Seurat)
 7 | library(tidyverse)
 8 | 
 9 | process_gene_names <- function(data, n_cells = NULL, remove_ensembl = TRUE) {
10 | 
11 |   # 1 - remove genes expressed in less than n cells
12 |   # (default assay should be defined beforehand)
13 |   # not recommended in most cases, as it can remove too many genes
14 | 
15 |   if (is.numeric(n_cells)) {
16 |     genes_keep <- c()
17 | 
18 |     for (gene in rownames(data)) {
19 |       # print(gene)
20 |       if (sum(GetAssayData(object = data, slot = "counts")[gene, ] > n_cells)) {
21 |         genes_keep <- append(genes_keep, gene)
22 |       }
23 |     }
24 | 
25 |     # keep those genes from the seurat object
26 | 
27 |     data <- subset(data, features = genes_keep)
28 |   }
29 | 
30 | 
31 |   # 2- convert ENSEMBL gene names to HGNC SYMBOL
32 | 
33 |   # check if there are ENSMBL genes, otherwise omit
34 | 
35 |   if (grepl(rownames(data), pattern = "^ENSG")) {
36 |     require(org.Hs.eg.db)
37 | 
38 |     symbols_match <- mapIds(org.Hs.eg.db, keys = rownames(data), keytype = "ENSEMBL", column = "SYMBOL", multiVals = "first")
39 | 
40 |     symbols <- c()
41 |     for (i in 1:length(symbols_match)) {
42 |       correct_symbol <- if_else(is.na(symbols_match[i]), names(symbols_match)[i], symbols_match[i])
43 |       symbols <- append(symbols, correct_symbol)
44 |     }
45 |   } else {
46 |     symbols <- rownames(data)
47 |   }
48 | 
49 | 
50 |   # 3 - make sure the naming is homogeneous (synonym symbols, etc.)
51 | 
52 |   library(HGNChelper)
53 | 
54 |   symbols <- checkGeneSymbols(symbols,
55 |     unmapped.as.na = FALSE, # if there is no mapping, return original gene name
56 |     # map = getCurrentHumanMap(), # downloads latest version of aliases (needs internet connection)
57 |     species = "human"
58 |   )
59 | 
60 |   print(length(symbols$Suggested.Symbol))
61 |   print(length(unique(symbols$Suggested.Symbol)))
62 | 
63 |   symbols <- make.unique(symbols$Suggested.Symbol) # keep only suggested symbols for renaming
64 | 
65 | 
66 |   # rename
67 | 
68 |   if (nrow(data) == length(symbols)) {
69 |     if (!all(is.na(data[[data@active.assay]]@counts))) {
70 |       rownames(data[[data@active.assay]]@counts) <- symbols
71 |     }
72 |     if (!all(is.na(data[[data@active.assay]]@data))) {
73 |       rownames(data[[data@active.assay]]@data) <- symbols
74 |     }
75 |   } else {
76 |     "Different number of genes. Cannot rename"
77 |   }
78 | 
79 | 
80 |   # if after this step we still have (ENSG...) genes they probably are novel transcripts or other not-so-relevant genes, thus could be deleted
81 |   if (remove_ensembl == TRUE) {
82 |     print(paste(
83 |       grep(rownames(data), pattern = "^ENSG", value = TRUE),
84 |       "will be removed",
85 |       "\\n",
86 |       sep = " "
87 |     ))
88 |     data <- subset(data, features = grep(rownames(data), pattern = "^ENSG", value = TRUE, invert = TRUE))
89 |   }
90 | 
91 |   # finally make sure the meta.features are correctly named
92 |   data[["RNA"]]@meta.features <- data.frame(row.names = rownames(data[["RNA"]]))
93 |   return(data)
94 | }
95 | 


--------------------------------------------------------------------------------
/utils/spatial_plot_spaniel.R:
--------------------------------------------------------------------------------
  1 | plot_spaniel <- function(data_df, 
  2 |                          grob, 
  3 |                          x, 
  4 |                          y, 
  5 |                          point_colour, 
  6 |                          point_size,
  7 |                          point_alpha){
  8 |   
  9 |   # Inverse Y to flip the coordinates
 10 |   # data_df$y_inv <- 36 - data_df$y
 11 |   # data_df$y_inv <- data_df$y
 12 |   
 13 |   data_df[, point_size] <- if_else(data_df[, point_size] == 0, NA_real_, data_df[, point_size])
 14 |   
 15 |   tmp_plt <- ggplot2::ggplot(data_df,
 16 |                   ggplot2::aes_string(x, y,
 17 |                                       color = point_colour, 
 18 |                                       size = point_size,
 19 |                                       alpha = point_alpha)) +
 20 |     ggplot2::xlim(1, 36) +
 21 |     ggplot2::ylim(1, 36) +
 22 |     # Layer 1 - Plot image
 23 |     ggplot2::annotation_custom(grob,
 24 |                                xmin = 1,
 25 |                                xmax = 36,
 26 |                                ymin = 1,
 27 |                                ymax = 36) +
 28 |     # Layer 2 - Plot points
 29 |     geom_point() +
 30 |     # Layer 3 - Join legends all with same name
 31 |     labs(color = "Proportion",
 32 |          size = "Proportion"
 33 |          # alpha = "Proportion"
 34 |     ) +
 35 |     # Coordinates fixed so x and y keep the proportions
 36 |     coord_fixed(1) +
 37 |     # Tune colour parameters
 38 |     ggplot2::scale_size_continuous(range=c(0, 3), limits = c(0, 1)) +
 39 |     ggplot2::scale_color_gradientn(
 40 |       colours = heat.colors(10, rev = TRUE),
 41 |       limits = c(0, 1)) +
 42 |     ggplot2::scale_alpha(range = c(0, 1), limits = c(0, 1)) +
 43 |     # Join legends into one
 44 |     ggplot2::guides(color = ggplot2::guide_legend(), 
 45 |                     size = ggplot2::guide_legend(),
 46 |                     alpha = ggplot2::guide_legend()
 47 |     )
 48 |   
 49 |   return(tmp_plt)
 50 | }
 51 | 
 52 | 
 53 | ################################################################################
 54 | ################################################################################
 55 | ############################Join Spatial plots##################################
 56 | ################################################################################
 57 | ################################################################################
 58 | 
 59 | plt_theme <- function(plt, rm_leg) {
 60 |   tmp_plt <- plt + 
 61 |     scale_fill_gradientn(
 62 |       colours = heat.colors(10, rev = TRUE),
 63 |       limits = c(0, 1)) +
 64 |     scale_alpha(range = c(0, 1)) +
 65 |     labs(title = "",
 66 |          fill = "Proportion") +
 67 |     theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
 68 |           plot.title = element_text(hjust = 0.5))
 69 |   
 70 |   if (rm_leg) {
 71 |     tmp_plt + theme(legend.position = "none")
 72 |   } else {
 73 |     tmp_plt + theme(legend.position = "right")
 74 |   }
 75 | }
 76 | 
 77 | draw_title <- function(feat,
 78 |                        title_size = 20,
 79 |                        title_face = "bold",
 80 |                        title_margin_t = 0,
 81 |                        title_margin_r = 0,
 82 |                        title_margin_b = 0,
 83 |                        title_margin_l = 10) {
 84 |   title <- cowplot::ggdraw() + 
 85 |     cowplot::draw_label(
 86 |       feat,
 87 |       fontface = 'bold',
 88 |       x = 0,
 89 |       hjust = 0
 90 |     ) +
 91 |     ggplot2::theme(
 92 |       # add margin on the left of the drawing canvas,
 93 |       # so title is aligned with left edge of first plot
 94 |       plot.margin = margin(0, 0, 0, 7)
 95 |     )
 96 |   return(title)
 97 | }
 98 | 
 99 | 
100 | join_seurat_spatial <- function(se_obj,
101 |                                 plt,
102 |                                 grp,
103 |                                 feat,
104 |                                 title_size = 20,
105 |                                 title_face = "bold",
106 |                                 title_margin_t = 0,
107 |                                 title_margin_r = 0,
108 |                                 title_margin_b = 0,
109 |                                 title_margin_l = 10) {
110 |   
111 |   # Determine which groups are 0
112 |   metadata_ds <- se_obj@meta.data
113 |   slices <- unique(metadata_ds[, grp])
114 |   grp1 <- sum(metadata_ds[metadata_ds$slice == slices[[1]], feat])
115 |   grp2 <- sum(metadata_ds[metadata_ds$slice == slices[[2]], feat])
116 |   
117 |   # Process individual pannels
118 |   plt_1 <- suppressMessages(plt_theme(plt = plt[[1]], rm_leg = TRUE))
119 |   ### Remove dots if 0s everywhere
120 |   if (grp1 == 0) {
121 |     plt_1 <- suppressMessages(plt_1 + scale_alpha(range = c(0,0)))
122 |   }
123 |   
124 |   plt_2 <- suppressMessages(plt_theme(plt = plt[[2]], rm_leg = FALSE))
125 |   ### Remove dots if 0s everywhere
126 |   if (grp2 == 0) {
127 |     plt_2 <- suppressMessages(plt_2 + scale_alpha(range = c(0,0)))
128 |   }
129 |   
130 |   # Recombine plt 1 and 2
131 |   tmp_plt <- cowplot::plot_grid(plt_1,
132 |                                 plt_2,
133 |                                 align = "hv",
134 |                                 axis = "trbl",
135 |                                 nrow = 1)
136 |   
137 |   # Add title to plot composition
138 |   title <- draw_title(feat = feat,
139 |                       title_size = title_size,
140 |                       title_face = title_face,
141 |                       title_margin_t = title_margin_t,
142 |                       title_margin_r = title_margin_r,
143 |                       title_margin_b = title_margin_b,
144 |                       title_margin_l = title_margin_l)
145 |   
146 |   plt_arr <- cowplot::plot_grid(
147 |     title, tmp_plt,
148 |     ncol = 1,
149 |     # rel_heights values control vertical title margins
150 |     rel_heights = c(0.1, 1)
151 |   )
152 |   
153 |   return(plt_arr)
154 |   
155 | }
156 | 
157 | 


--------------------------------------------------------------------------------