├── Figure 1
    ├── Cancer Cell Identification
    │   └── identification.R
    ├── Deconvolution
    │   ├── Deconvolution.R
    │   └── reference_processing.R
    └── Numbat
    │   ├── processing.R
    │   └── run_numbat.R
├── Figure 2
    ├── Core Edge Identification
    │   └── state_anno.R
    ├── Correlation Analysis
    │   └── corplot.R
    ├── Differential Expression
    │   └── degs.R
    ├── IPA
    │   ├── IPA Upstream regulator.R
    │   └── IPA cannonical.R
    └── TF anaysis
    │   └── pySCENIC_oscc.R
├── Figure 3
    ├── Cancer Stem Cell
    │   └── csc_analysis.R
    ├── Cell Neighbor Analysis
    │   └── cell_neighbor_analysis.R
    └── Cell Signaling
    │   └── cellchat.R
├── Figure 4
    ├── Classifier Training
    │   └── script.R
    └── Plotting
    │   └── analysis.R
├── Figure 5
    ├── OSCC survival
    │   └── oscc_tcga.R
    ├── Pan-cancer survival
    │   └── Survival_pan_cancer.R
    └── Validation
    │   └── GSE41613_survival.R
├── Figure 6
    ├── Dynamo
    │   ├── dynamo.R
    │   └── dynamo.ipynb
    ├── PharmacoDB and DGIdb
    │   └── phamacogx.R
    ├── Velocity on Sample
    │   └── scvelo_sample.ipynb
    └── Velocity
    │   ├── differential_splicing.R
    │   ├── scvelo_analysis.ipynb
    │   └── tissue_position_splitting.ipynb
├── LICENSE
└── README.md


/Figure 1/Cancer Cell Identification/identification.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(dplyr)
 3 | library(Seurat)
 4 | library(patchwork)
 5 | load(file = "Seurat_objs.Robj")
 6 | 
 7 | load(file = "nb.Robj")
 8 | 
 9 | samples = c('sample_1','sample_2','sample_3','sample_4','sample_5','sample_6','sample_7','sample_8','sample_9','sample_10','sample_11','sample_12')
10 | for (sample in samples) {
11 |   
12 |   Seurat_meta <- Seurat_objs[[sample]]@meta.data
13 |   Seurat_meta$barcode <- rownames(Seurat_meta)
14 |   barcode_id <- strsplit(rownames(Seurat_meta), "-")[[1]][[2]]
15 |   p_cnvs <- nb[[sample]]$clone_post
16 |   
17 |   p_cnvs$cell <- str_replace(p_cnvs$cell, "1", barcode_id)
18 |   
19 |   Seurat_objs[[sample]]@meta.data <- Seurat_meta %>%
20 |     left_join(
21 |       p_cnvs, 
22 |       by = c('barcode' = 'cell')
23 |     ) %>%
24 |     column_to_rownames("barcode")
25 |   
26 |   #define cancer cell spots as being either p cnv > 0.99 or deconvolution probability > 0.99
27 |   Seurat_objs[[sample]]$is_cancer_cell <- ifelse( (Seurat_objs[[sample]]$p_cnv > 0.99 |  Seurat_objs[[sample]]$cancer.cell > 0.99) & Seurat_objs[[sample]]$pathologist_anno == "SCC", "1", "0")
28 |   
29 |   #define non cancer cell states
30 |   Seurat_objs[[sample]]$celltype <- ifelse(Seurat_objs[[sample]]$is_cancer_cell == "1","cancer cell","other celltype")
31 |   
32 |   meta <- Seurat_objs[[sample]]@meta.data
33 | 
34 |   noncancer_meta <- meta[c("myofibroblast","B.cell","ecm.myCAF","Intermediate.fibroblast","detox.iCAF","macrophage","endothelial","dendritic.","mast",
35 |          "conventional.CD4..T.helper.cells","cytotoxic.CD8..T.","Tregs","cytotoxic.CD8..T.exhausted")]
36 |   
37 |   #identify the noncancer celltype as the noncancer celltype with the greatest proportion
38 |   noncancer_meta <- noncancer_meta %>% mutate(noncancer_celltype=names(.)[max.col(.)])
39 |   noncancer_meta$barcode <- rownames(noncancer_meta)
40 |   
41 |   Seurat_objs[[sample]]@meta.data <- merge(x = Seurat_objs[[sample]]@meta.data, y = noncancer_meta[ , c("noncancer_celltype","barcode")], all.x=TRUE, by = 0) %>%
42 |     column_to_rownames("barcode")
43 |   
44 |   Seurat_objs[[sample]]@meta.data$final_celltype <- ifelse(Seurat_objs[[sample]]$is_cancer_cell == "1","cancer cell",Seurat_objs[[sample]]@meta.data$noncancer_celltype)
45 | }
46 | 
47 | #check all annotations
48 | lapply(
49 |   samples,
50 |   function(sample) {
51 |     SpatialDimPlot(Seurat_objs[[sample]], group.by = "final_celltype", interactive = T)
52 |   }
53 | ) %>% 
54 |   wrap_plots(guides = 'collect')
55 | 
56 | annotated_objects <- Seurat_objs
57 | 
58 | save(annotated_objects, file = "annotated_objects.Robj")
59 | 
60 | #plots for Figure 1 were generated in Figure 2, Core Edge Identification code
61 | 


--------------------------------------------------------------------------------
/Figure 1/Deconvolution/Deconvolution.R:
--------------------------------------------------------------------------------
 1 | library(CARD)
 2 | library(Seurat)
 3 | library(tidyverse)
 4 | library(dplyr)
 5 | library(RColorBrewer)
 6 | library(ggplot2)
 7 | #load all objects
 8 | load(file = "Seurat/obj_1.Robj")
 9 | load(file = "Seurat/obj_2.Robj")
10 | load(file = "Seurat/obj_3.Robj")
11 | load(file = "Seurat/obj_4.Robj")
12 | load(file = "Seurat/obj_5.Robj")
13 | load(file = "Seurat/obj_6.Robj")
14 | load(file = "Seurat/obj_7.Robj")
15 | load(file = "Seurat/obj_8.Robj")
16 | load(file = "Seurat/obj_9.Robj")
17 | load(file = "Seurat/obj_10.Robj")
18 | load(file = "Seurat/obj_11.Robj")
19 | load(file = "Seurat/obj_12.Robj")
20 | 
21 | Seurat_objs <- c(obj_1,obj_2,obj_3,obj_4,obj_5,obj_6,obj_7,obj_8,
22 |                  obj_9,obj_10,obj_11,obj_12)
23 | 
24 | samples <- c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8",
25 |              "sample_9","sample_10", "sample_11","sample_12")
26 | 
27 | names(Seurat_objs) <- samples
28 | 
29 | 
30 | 
31 | for (sample in Seurat_objs) {
32 | sample@meta.data$sample <- NULL
33 | spatial_count <- sample@assays$Spatial@counts
34 | 
35 | spatial_location <- sample@images$tumor@coordinates %>% select(row, col)
36 | names(spatial_location) <- c("x","y")
37 | 
38 | if (!identical(colnames(spatial_count), rownames(spatial_location))){
39 | #ensure that count matrix contains the same spots as the location matrix 
40 | spatial_count <- spatial_count[, colnames(spatial_count) %in% rownames(spatial_location)]
41 | }
42 | #match order
43 | spatial_location <- spatial_location[order(match(rownames(spatial_location), colnames(spatial_count))), , drop = FALSE]
44 | 
45 | load(file = "puram/puram_data.Robj")
46 | sc_count <- puram_data@assays$RNA@counts
47 | sc_meta <- puram_data@meta.data
48 | 
49 | #deconvolute using CARD
50 | CARD_obj = createCARDObject(
51 |   sc_count = sc_count,
52 |   sc_meta = sc_meta,
53 |   spatial_count = spatial_count,
54 |   spatial_location = spatial_location,
55 |   ct.varname = "cellype_fine",
56 |   ct.select = unique(sc_meta$cellype_fine),
57 |   sample.varname = "orig.ident",
58 |   minCountGene = 100,
59 |   minCountSpot = 5) 
60 | 
61 | CARD_obj = CARD_deconvolution(CARD_object = CARD_obj)
62 | 
63 | proportions <- as.data.frame(CARD_obj@Proportion_CARD)
64 | 
65 | proportions$dominant_celltype <- apply(proportions, 1, function(x) names(proportions)[which.max(x)])
66 | 
67 | sample <- Seurat::AddMetaData(sample,proportions)
68 | 
69 | Seurat_objs[unique(sample@meta.data$sample_id)] <- sample
70 | 
71 | }
72 | 
73 | # save(Seurat_objs,file = "Seurat_objs.Robj")
74 | # 
75 | # load(file = "Seurat_objs.Robj")
76 | 
77 | lapply(
78 |   Seurat_objs,
79 |   function(sample) {
80 |     SpatialFeaturePlot(sample,features = "cancer.cell", min.cutoff = 1)
81 |   }
82 | ) %>% 
83 |   wrap_plots(guides = 'collect')
84 | 
85 | 


--------------------------------------------------------------------------------
/Figure 1/Deconvolution/reference_processing.R:
--------------------------------------------------------------------------------
 1 | ###load in and process single-cell data from GEO
 2 | library(Seurat)
 3 | 
 4 | all_data <- read.delim2("HNSCC_all_data.txt")
 5 | 
 6 | all_data$X  <- as.list(sapply(all_data$X , function(x) gsub("\'", "", x)))
 7 | 
 8 | rownames(all_data) <- all_data$X
 9 | 
10 | all_data$X <- NULL
11 | 
12 | meta_data <- all_data[0:5,]
13 | meta_data <- as.data.frame(t(meta_data))
14 | 
15 | expr_data <- all_data[6:23691,]
16 | 
17 | seurat_data <- CreateSeuratObject(expr_data, project = "puram_data")
18 | 
19 | seurat_data <- AddMetaData(seurat_data,metadata = meta_data)
20 | 
21 | library(reshape2)
22 | library(Seurat)
23 | library(ggplot2)
24 | library(tidyverse)
25 | library(tidyr)
26 | #heatmap
27 | 
28 | seurat_data <- FindVariableFeatures(seurat_data)
29 | seurat_data <- ScaleData(seurat_data)
30 | seurat_data <- Seurat::RunPCA(seurat_data, verbose = FALSE) %>%
31 |   Seurat::RunUMAP(., dims = 1:30, verbose = FALSE)%>%
32 |   FindNeighbors(., dims = 1:30)%>%
33 |   FindClusters(., resolution = 0.5)
34 | 
35 | seurat_data@meta.data$non.cancer.cell.type <- gsub("-","",as.character(seurat_data@meta.data$non.cancer.cell.type))
36 | 
37 | #rename cancer cells
38 | seurat_data@meta.data$non.cancer.cell.type <- gsub("0","Cancer cell",as.character(seurat_data@meta.data$non.cancer.cell.type))
39 | 
40 | Seurat::DimPlot(seurat_data,
41 |                 group.by = "RNA_snn_res.0.5",
42 |                 label = TRUE) + Seurat::NoLegend()
43 | 
44 | #identify celltypes based on published markers
45 | cluster_ids <- read.csv(file = "cluster_identification.csv")
46 | 
47 | Idents(seurat_data) = "RNA_snn_res.0.5"
48 | new.cluster.ids <- cluster_ids$cell_type
49 | names(new.cluster.ids) <- levels(seurat_data)
50 | seurat_data <- RenameIdents(seurat_data, new.cluster.ids)
51 | seurat_data@meta.data$cellype_fine <- Idents(seurat_data)
52 | 
53 | colourCount = length(unique(seurat_data@meta.data$cellype_fine))
54 | library(RColorBrewer)
55 | mycolors <- colorRampPalette(brewer.pal(7, "Set1"))(colourCount)
56 | 
57 | #refactor
58 | 
59 | seurat_data@meta.data$cellype_fine <- factor(seurat_data@meta.data$cellype_fine, levels = 
60 |                                                c("myofibroblast","cancer cell","B cell","ecm-myCAF",
61 |                                                  "Intermediate fibroblast","detox-iCAF","macrophage",
62 |                                                  "endothelial","dendritic ","mast","conventional CD4+ T-helper cells",
63 |                                                  "cytotoxic CD8+ T ","Tregs","cytotoxic CD8+ T exhausted"))
64 | 
65 | Idents(seurat_data) = "cellype_fine"
66 | png("puram_corescore_subtype.png",units = "in", res = 300, width = 12, height = 8)
67 | VlnPlot(seurat_data, features = "core_genes1", cols = mycolors)
68 | dev.off()
69 | 
70 | png("puram_edgescore_subtype.png",units = "in", res = 300, width = 12, height = 8)
71 | VlnPlot(seurat_data, features = "edge_genes1", cols = mycolors)
72 | dev.off()
73 | 
74 | 
75 | png("puram_annotated.png",units = "in", res = 300, width = 8, height = 6)
76 | Seurat::DimPlot(seurat_data,
77 |                 group.by = "cellype_fine",
78 |                 label = F, cols = mycolors) 
79 | dev.off()
80 | puram_data <- seurat_data
81 | save(puram_data, file = "puram_data.Robj")                    
82 | 
83 | 


--------------------------------------------------------------------------------
/Figure 1/Numbat/processing.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(dplyr)
  3 | library(glue)
  4 | library(stringr)
  5 | library(numbat)
  6 | library(ggplot2)
  7 | library(patchwork)
  8 | 
  9 | # read in the numbat results and spot coordinates
 10 | nb = list()
 11 | spots = list()
 12 | samples = c('sample_1','sample_2','sample_3','sample_4','sample_5','sample_6','sample_7','sample_8','sample_9','sample_10','sample_11','sample_12')
 13 | for (sample in samples) {
 14 |   nb[[sample]] = Numbat$new(glue('outs/{sample}'))
 15 |   spots[[sample]] = fread(glue('tp/{sample}/tissue_positions_list.csv'))
 16 |   colnames(spots[[sample]]) <- c("barcode","in_tissue","array_row","array_col","tissue_scaled_x","tissue_scaled_y")
 17 | }
 18 | 
 19 | options(repr.plot.width = 8.5, repr.plot.height = 8, repr.plot.res = 300)
 20 | 
 21 | #plot CNV probability on samples
 22 | lapply(
 23 |   samples,
 24 |   function(sample) {
 25 |     
 26 |     spots[[sample]] %>%
 27 |       left_join(
 28 |         nb[[sample]]$clone_post, 
 29 |         by = c('barcode' = 'cell')
 30 |       ) %>%
 31 |       filter(in_tissue == 1) %>%
 32 |       ggplot(
 33 |         aes(x = array_col, y = array_row)
 34 |       ) +
 35 |       geom_point(aes(color = p_cnv), size = 1, alpha = 0.8, pch = 16) +
 36 |       scale_color_gradient2(
 37 |         low = 'darkgreen', high = 'red3', mid = 'yellow', 
 38 |         midpoint = 0.5, limits = c(0,1), oob = scales::oob_squish
 39 |       ) +
 40 |       theme_bw() +
 41 |       ggtitle(sample) + 
 42 |       scale_y_reverse()
 43 |   }
 44 | ) %>% 
 45 |   wrap_plots(guides = 'collect')
 46 | 
 47 | 
 48 | chrom_labeller <- function(chr) {
 49 |   chr[chr %in% c(19, 21, 22)] = ""
 50 |   return(chr)
 51 | }
 52 | 
 53 | pal = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF")
 54 | getPalette = colorRampPalette(pal)
 55 | 
 56 | cnv_colors = c("neu" = "gray",
 57 |                "neu_up" = "darkgray", "neu_down" = "gray",
 58 |                "del_up" = "royalblue", "del_down" = "darkblue",
 59 |                "loh_up" = "darkgreen", "loh_down" = "olivedrab4",
 60 |                "amp_up" = "red", "amp_down" = "tomato3",
 61 |                "del_1_up" = "royalblue", "del_1_down" = "darkblue",
 62 |                "loh_1_up" = "darkgreen", "loh_1_down" = "olivedrab4",
 63 |                "amp_1_up" = "red", "amp_1_down" = "tomato3",
 64 |                "del_2_up" = "royalblue", "del_2_down" = "darkblue",
 65 |                "loh_2_up" = "darkgreen", "loh_2_down" = "olivedrab4",
 66 |                "amp_2_up" = "red", "amp_2_down" = "tomato3",
 67 |                "del_up_1" = "royalblue", "del_down_1" = "darkblue",
 68 |                "loh_up_1" = "darkgreen", "loh_down_1" = "olivedrab4",
 69 |                "amp_up_1" = "red", "amp_down_1" = "tomato3",
 70 |                "del_up_2" = "royalblue", "del_down_2" = "darkblue",
 71 |                "loh_up_2" = "darkgreen", "loh_down_2" = "olivedrab4",
 72 |                "amp_up_2" = "red", "amp_down_2" = "tomato3",
 73 |                "bamp" = "salmon", "bdel" = "skyblue",
 74 |                "amp" = "tomato3", "loh" = "olivedrab4", "del" = "royalblue",
 75 |                "theta_up" = "darkgreen", "theta_down" = "olivedrab4",
 76 |                "theta_1_up" = "darkgreen", "theta_1_down" = "olivedrab4",
 77 |                "theta_2_up" = "darkgreen", "theta_2_down" = "olivedrab4",
 78 |                "theta_up_1" = "darkgreen", "theta_down_1" = "olivedrab4",
 79 |                "theta_up_2" = "darkgreen", "theta_down_2" = "olivedrab4",
 80 |                '0|1' = 'red', '1|0' = 'blue','major' = '#66C2A5', 'minor' = '#FC8D62')
 81 | 
 82 | cnv_labels = names(cnv_colors) %>%
 83 |   stringr::str_remove_all('_') %>%
 84 |   stringr::str_to_upper() %>%
 85 |   stringr::str_replace('UP', '(major)') %>%
 86 |   stringr::str_replace('DOWN', '(minor)') %>%
 87 |   stringr::str_replace('LOH', 'CNLoH') %>%
 88 |   setNames(names(cnv_colors))
 89 | 
 90 | numbat_plot <- function (segs) {
 91 |   chrom_labeller <- function(chr) {
 92 |     chr[chr %in% c(19, 21, 22)] = ""
 93 |     return(chr)
 94 |   }
 95 |   ggplot(segs) + geom_rect(aes(xmin = seg_start, xmax = seg_end, 
 96 |                                ymin = -0.5, ymax = 0.5, fill = cnv_state_post)) + theme_void() + 
 97 |     theme(panel.spacing = unit(1, "mm"), strip.background = element_blank(), 
 98 |           strip.text.y = element_text(angle = 0), plot.margin = margin(0, 
 99 |            0, 0, 0), legend.position = "none") + facet_grid(~CHROM, 
100 |     space = "free_x", scales = "free") + 
101 |     theme(
102 |       strip.background = element_blank(),
103 |       strip.text.x = element_blank()
104 |     )+ 
105 |     scale_fill_manual(values = cnv_colors, labels = cnv_labels, 
106 |                       name = "CN states") +
107 |     ggrepel::geom_text_repel(aes(x = (seg_start +
108 |     seg_end)/2, y = -0.5, label = str_remove(seg_cons, "\\d+")),
109 |     min.segment.length = 0, vjust = 1, hjust = 0, direction = "x",
110 |     segment.curvature = -0.2, segment.ncp = 3, segment.angle = 30,
111 |     segment.inflect = TRUE, max.overlaps = 3) + scale_y_continuous(expand = expansion(add = c(0.5,
112 |     0))) + scale_x_continuous(expand = expansion(mult = 0.05)) +
113 |     guides(fill = "none")
114 | }
115 | 
116 | png("numbat_plot.png", units = "in", res = 300, height = 5, width = 10, type = "cairo")
117 | options(repr.plot.width = 14, repr.plot.height = 12, repr.plot.res = 300)
118 | lapply(
119 |   samples,
120 |   function(sample) {
121 |     numbat_plot(nb[[sample]]$segs_consensus) 
122 |   }
123 | ) %>% 
124 |   wrap_plots(guides = 'collect', ncol = 1)
125 | dev.off()
126 | 


--------------------------------------------------------------------------------
/Figure 1/Numbat/run_numbat.R:
--------------------------------------------------------------------------------
  1 | module load singularity/3.8.1
  2 | 
  3 | #singularity pull docker://pkharchenkolab/numbat-rbase:latest
  4 | 
  5 | salloc --time 4:59:00 --ntasks=40 --mem-per-cpu=40G
  6 | 
  7 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
  8 | --label "sample_1" \
  9 | --samples "sample_1" \
 10 | --bams sample_data/sample_1/possorted_genome_bam.bam \
 11 | --barcodes sample_data/sample_1/barcodes.tsv \
 12 | --outdir "pap/sample_1" \
 13 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 14 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 15 | --paneldir /data/1000G_hg38 \
 16 | --ncores 30
 17 | 
 18 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 19 | --label "sample_2" \
 20 | --samples "sample_2" \
 21 | --bams sample_data/sample_2/possorted_genome_bam.bam \
 22 | --barcodes sample_data/sample_2/barcodes.tsv \
 23 | --outdir "pap/sample_2" \
 24 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 25 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 26 | --paneldir /data/1000G_hg38 \
 27 | --ncores 30
 28 | 
 29 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 30 | --label "sample_3" \
 31 | --samples "sample_3" \
 32 | --bams sample_data/sample_3/possorted_genome_bam.bam \
 33 | --barcodes sample_data/sample_3/barcodes.tsv \
 34 | --outdir "pap/sample_3" \
 35 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 36 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 37 | --paneldir /data/1000G_hg38 \
 38 | --ncores 30
 39 | 
 40 | 
 41 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 42 | --label "sample_4" \
 43 | --samples "sample_4" \
 44 | --bams sample_data/sample_4/possorted_genome_bam.bam \
 45 | --barcodes sample_data/sample_4/barcodes.tsv \
 46 | --outdir "pap/sample_4" \
 47 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 48 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 49 | --paneldir /data/1000G_hg38 \
 50 | --ncores 30
 51 | 
 52 | 
 53 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 54 | --label "sample_5" \
 55 | --samples "sample_5" \
 56 | --bams sample_data/sample_5/possorted_genome_bam.bam \
 57 | --barcodes sample_data/sample_5/barcodes.tsv \
 58 | --outdir "pap/sample_5" \
 59 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 60 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 61 | --paneldir /data/1000G_hg38 \
 62 | --ncores 30
 63 | 
 64 | 
 65 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 66 | --label "sample_6" \
 67 | --samples "sample_6" \
 68 | --bams sample_data/sample_6/possorted_genome_bam.bam \
 69 | --barcodes sample_data/sample_6/barcodes.tsv \
 70 | --outdir "pap/sample_6" \
 71 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 72 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 73 | --paneldir /data/1000G_hg38 \
 74 | --ncores 30
 75 | 
 76 | 
 77 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 78 | --label "sample_7" \
 79 | --samples "sample_7" \
 80 | --bams sample_data/sample_7/possorted_genome_bam.bam \
 81 | --barcodes sample_data/sample_7/barcodes.tsv \
 82 | --outdir "pap/sample_7" \
 83 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 84 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 85 | --paneldir /data/1000G_hg38 \
 86 | --ncores 30
 87 | 
 88 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
 89 | --label "sample_8" \
 90 | --samples "sample_8" \
 91 | --bams sample_data/sample_8/possorted_genome_bam.bam \
 92 | --barcodes sample_data/sample_8/barcodes.tsv \
 93 | --outdir "pap/sample_8" \
 94 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
 95 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
 96 | --paneldir /data/1000G_hg38 \
 97 | --ncores 30
 98 | 
 99 | 
100 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
101 | --label "sample_9" \
102 | --samples "sample_9" \
103 | --bams sample_data/sample_9/possorted_genome_bam.bam \
104 | --barcodes sample_data/sample_9/barcodes.tsv \
105 | --outdir "pap/sample_9" \
106 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
107 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
108 | --paneldir /data/1000G_hg38 \
109 | --ncores 30
110 | 
111 | 
112 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
113 | --label "sample_10" \
114 | --samples "sample_10" \
115 | --bams sample_data/sample_10/possorted_genome_bam.bam \
116 | --barcodes sample_data/sample_10/barcodes.tsv \
117 | --outdir "pap/sample_10" \
118 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
119 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
120 | --paneldir /data/1000G_hg38 \
121 | --ncores 30
122 | 
123 | 
124 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
125 | --label "sample_11" \
126 | --samples "sample_11" \
127 | --bams sample_data/sample_11/possorted_genome_bam.bam \
128 | --barcodes sample_data/sample_11/barcodes.tsv \
129 | --outdir "pap/sample_11" \
130 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
131 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
132 | --paneldir /data/1000G_hg38 \
133 | --ncores 30
134 | 
135 | 
136 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \
137 | --label "sample_12" \
138 | --samples "sample_12" \
139 | --bams sample_data/sample_12/possorted_genome_bam.bam \
140 | --barcodes sample_data/sample_12/barcodes.tsv \
141 | --outdir "pap/sample_12" \
142 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
143 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
144 | --paneldir /data/1000G_hg38 \
145 | --ncores 30
146 | 
147 | 
148 | 
149 | #prep h5 files for each sample
150 | library(Seurat)
151 | mtx1 <- Read10X_h5("mtx_files/s1_filtered_feature_bc_matrix.h5")
152 | saveRDS(mtx1, file ="mtx_processed/sample_1/mtx.RDS")
153 | 
154 | mtx2 <- Read10X_h5("mtx_files/s2_filtered_feature_bc_matrix.h5")
155 | saveRDS(mtx2, file ="mtx_processed/sample_2/mtx.RDS")
156 | 
157 | mtx3 <- Read10X_h5("mtx_files/s3_filtered_feature_bc_matrix.h5")
158 | saveRDS(mtx3, file ="mtx_processed/sample_3/mtx.RDS")
159 | 
160 | mtx4 <- Read10X_h5("mtx_files/s4_filtered_feature_bc_matrix.h5")
161 | saveRDS(mtx4, file ="mtx_processed/sample_4/mtx.RDS")
162 | 
163 | mtx5 <- Read10X_h5("mtx_files/s5_filtered_feature_bc_matrix.h5")
164 | saveRDS(mtx5, file ="mtx_processed/sample_5/mtx.RDS")
165 | 
166 | mtx6 <- Read10X_h5("mtx_files/s6_filtered_feature_bc_matrix.h5")
167 | saveRDS(mtx6, file ="mtx_processed/sample_6/mtx.RDS")
168 | 
169 | mtx7 <- Read10X_h5("mtx_files/s7_filtered_feature_bc_matrix.h5")
170 | saveRDS(mtx7, file ="mtx_processed/sample_7/mtx.RDS")
171 | 
172 | mtx8 <- Read10X_h5("mtx_files/s8_filtered_feature_bc_matrix.h5")
173 | saveRDS(mtx8, file ="mtx_processed/sample_8/mtx.RDS")
174 | 
175 | mtx9 <- Read10X_h5("mtx_files/s9_filtered_feature_bc_matrix.h5")
176 | saveRDS(mtx9, file ="mtx_processed/sample_9/mtx.RDS")
177 | 
178 | mtx10 <- Read10X_h5("mtx_files/s10_filtered_feature_bc_matrix.h5")
179 | saveRDS(mtx10, file ="mtx_processed/sample_10/mtx.RDS")
180 | 
181 | mtx11 <- Read10X_h5("mtx_files/s11_filtered_feature_bc_matrix.h5")
182 | saveRDS(mtx11, file ="mtx_processed/sample_11/mtx.RDS")
183 | 
184 | mtx12 <- Read10X_h5("mtx_files/s12_filtered_feature_bc_matrix.h5")
185 | saveRDS(mtx12, file ="mtx_processed/sample_12/mtx.RDS")
186 | 
187 | #gunzip allele counts file
188 | gunzip pap/sample_1/sample_1_allele_counts.tsv.gz 
189 | gunzip pap/sample_2/sample_2_allele_counts.tsv.gz 
190 | gunzip pap/sample_3/sample_3_allele_counts.tsv.gz 
191 | gunzip pap/sample_4/sample_4_allele_counts.tsv.gz 
192 | gunzip pap/sample_5/sample_5_allele_counts.tsv.gz 
193 | gunzip pap/sample_6/sample_6_allele_counts.tsv.gz 
194 | gunzip pap/sample_7/sample_7_allele_counts.tsv.gz 
195 | gunzip pap/sample_8/sample_8_allele_counts.tsv.gz 
196 | gunzip pap/sample_9/sample_9_allele_counts.tsv.gz 
197 | gunzip pap/sample_10/sample_10_allele_counts.tsv.gz 
198 | gunzip pap/sample_11/sample_11_allele_counts.tsv.gz 
199 | gunzip pap/sample_12/sample_12_allele_counts.tsv.gz 
200 | 
201 | 
202 | salloc --time 4:59:00 --ntasks=40 --mem-per-cpu=40G
203 | #run once for each sample
204 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
205 | 
206 | library(numbat)
207 | 
208 | count_mat <- readRDS("sample_data/sample_1/mtx.RDS")
209 | allele_df <- read.delim("pap/sample_1/sample_1_allele_counts.tsv") 
210 | out = run_numbat(
211 |   count_mat, # gene x cell integer UMI count matrix 
212 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
213 |   allele_df, # allele dataframe generated by pileup_and_phase script
214 |   genome = "hg38",
215 |   t = 1e-5,
216 |   ncores = 4,
217 |   plot = TRUE,
218 |   out_dir = 'outs/sample_1',
219 |   max_entropy = 0.8
220 | )
221 | 
222 | q() 
223 | 
224 | n 
225 | 
226 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
227 | 
228 | library(numbat)
229 | 
230 | count_mat <- readRDS("sample_data/sample_2/mtx.RDS")
231 | allele_df <- read.delim("pap/sample_2/sample_2_allele_counts.tsv") 
232 | out = run_numbat(
233 |   count_mat, # gene x cell integer UMI count matrix 
234 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
235 |   allele_df, # allele dataframe generated by pileup_and_phase script
236 |   genome = "hg38",
237 |   t = 1e-5,
238 |   ncores = 4,
239 |   plot = TRUE,
240 |   out_dir = 'outs/sample_2',
241 |   max_entropy = 0.8
242 | )
243 | 
244 | q() 
245 | 
246 | n 
247 | 
248 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
249 | 
250 | library(numbat)
251 | 
252 | count_mat <- readRDS("sample_data/sample_3/mtx.RDS")
253 | allele_df <- read.delim("pap/sample_3/sample_3_allele_counts.tsv") 
254 | out = run_numbat(
255 |   count_mat, # gene x cell integer UMI count matrix 
256 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
257 |   allele_df, # allele dataframe generated by pileup_and_phase script
258 |   genome = "hg38",
259 |   t = 1e-5,
260 |   ncores = 4,
261 |   plot = TRUE,
262 |   out_dir = 'outs/sample_3',
263 |   max_entropy = 0.8
264 | )
265 | 
266 | q() 
267 | 
268 | n 
269 | 
270 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
271 | 
272 | library(numbat)
273 | 
274 | count_mat <- readRDS("sample_data/sample_4/mtx.RDS")
275 | allele_df <- read.delim("pap/sample_4/sample_4_allele_counts.tsv") 
276 | out = run_numbat(
277 |   count_mat, # gene x cell integer UMI count matrix 
278 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
279 |   allele_df, # allele dataframe generated by pileup_and_phase script
280 |   genome = "hg38",
281 |   t = 1e-5,
282 |   ncores = 4,
283 |   plot = TRUE,
284 |   out_dir = 'outs/sample_4',
285 |   max_entropy = 0.8
286 | )
287 | 
288 | q() 
289 | 
290 | n 
291 | 
292 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
293 | 
294 | library(numbat)
295 | 
296 | count_mat <- readRDS("sample_data/sample_5/mtx.RDS")
297 | allele_df <- read.delim("pap/sample_5/sample_5_allele_counts.tsv") 
298 | out = run_numbat(
299 |   count_mat, # gene x cell integer UMI count matrix 
300 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
301 |   allele_df, # allele dataframe generated by pileup_and_phase script
302 |   genome = "hg38",
303 |   t = 1e-5,
304 |   ncores = 4,
305 |   plot = TRUE,
306 |   out_dir = 'outs/sample_5',
307 |   max_entropy = 0.8
308 | )
309 | 
310 | q() 
311 | 
312 | n 
313 | 
314 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
315 | 
316 | library(numbat)
317 | 
318 | count_mat <- readRDS("sample_data/sample_6/mtx.RDS")
319 | allele_df <- read.delim("pap/sample_6/sample_6_allele_counts.tsv") 
320 | out = run_numbat(
321 |   count_mat, # gene x cell integer UMI count matrix 
322 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
323 |   allele_df, # allele dataframe generated by pileup_and_phase script
324 |   genome = "hg38",
325 |   t = 1e-5,
326 |   ncores = 4,
327 |   plot = TRUE,
328 |   out_dir = 'outs/sample_6',
329 |   max_entropy = 0.8
330 | )
331 | 
332 | q() 
333 | 
334 | n 
335 | 
336 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
337 | 
338 | library(numbat)
339 | 
340 | count_mat <- readRDS("sample_data/sample_7/mtx.RDS")
341 | allele_df <- read.delim("pap/sample_7/sample_7_allele_counts.tsv") 
342 | out = run_numbat(
343 |   count_mat, # gene x cell integer UMI count matrix 
344 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
345 |   allele_df, # allele dataframe generated by pileup_and_phase script
346 |   genome = "hg38",
347 |   t = 1e-5,
348 |   ncores = 4,
349 |   plot = TRUE,
350 |   out_dir = 'outs/sample_7',
351 |   max_entropy = 0.8
352 | )
353 | 
354 | q() 
355 | 
356 | n 
357 | 
358 | 
359 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
360 | 
361 | library(numbat)
362 | 
363 | count_mat <- readRDS("sample_data/sample_8/mtx.RDS")
364 | allele_df <- read.delim("pap/sample_8/sample_8_allele_counts.tsv") 
365 | out = run_numbat(
366 |   count_mat, # gene x cell integer UMI count matrix 
367 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
368 |   allele_df, # allele dataframe generated by pileup_and_phase script
369 |   genome = "hg38",
370 |   t = 1e-5,
371 |   ncores = 4,
372 |   plot = TRUE,
373 |   out_dir = 'outs/sample_8',
374 |   max_entropy = 0.8
375 | )
376 | 
377 | q() 
378 | 
379 | n 
380 | 
381 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
382 | 
383 | library(numbat)
384 | 
385 | count_mat <- readRDS("sample_data/sample_9/mtx.RDS")
386 | allele_df <- read.delim("pap/sample_9/sample_9_allele_counts.tsv") 
387 | out = run_numbat(
388 |   count_mat, # gene x cell integer UMI count matrix 
389 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
390 |   allele_df, # allele dataframe generated by pileup_and_phase script
391 |   genome = "hg38",
392 |   t = 1e-5,
393 |   ncores = 4,
394 |   plot = TRUE,
395 |   out_dir = 'outs/sample_9',
396 |   max_entropy = 0.8
397 | )
398 | 
399 | q() 
400 | 
401 | n 
402 | 
403 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
404 | 
405 | library(numbat)
406 | 
407 | count_mat <- readRDS("sample_data/sample_10/mtx.RDS")
408 | allele_df <- read.delim("pap/sample_10/sample_10_allele_counts.tsv") 
409 | out = run_numbat(
410 |   count_mat, # gene x cell integer UMI count matrix 
411 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
412 |   allele_df, # allele dataframe generated by pileup_and_phase script
413 |   genome = "hg38",
414 |   t = 1e-5,
415 |   ncores = 4,
416 |   plot = TRUE,
417 |   out_dir = 'outs/sample_10',
418 |   max_entropy = 0.8
419 | )
420 | 
421 | q() 
422 | 
423 | n 
424 | 
425 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
426 | 
427 | library(numbat)
428 | 
429 | count_mat <- readRDS("sample_data/sample_11/mtx.RDS")
430 | allele_df <- read.delim("pap/sample_11/sample_11_allele_counts.tsv") 
431 | out = run_numbat(
432 |   count_mat, # gene x cell integer UMI count matrix 
433 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
434 |   allele_df, # allele dataframe generated by pileup_and_phase script
435 |   genome = "hg38",
436 |   t = 1e-5,
437 |   ncores = 4,
438 |   plot = TRUE,
439 |   out_dir = 'outs/sample_11',
440 |   max_entropy = 0.8
441 | )
442 | 
443 | q() 
444 | 
445 | n 
446 | 
447 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R
448 | 
449 | library(numbat)
450 | 
451 | count_mat <- readRDS("sample_data/sample_12/mtx.RDS")
452 | allele_df <- read.delim("pap/sample_12/sample_12_allele_counts.tsv") 
453 | out = run_numbat(
454 |   count_mat, # gene x cell integer UMI count matrix 
455 |   ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix
456 |   allele_df, # allele dataframe generated by pileup_and_phase script
457 |   genome = "hg38",
458 |   t = 1e-5,
459 |   ncores = 4,
460 |   plot = TRUE,
461 |   out_dir = 'outs/sample_12',
462 |   max_entropy = 0.8
463 | )
464 | 
465 | q() 
466 | 
467 | n 
468 | 
469 | 


--------------------------------------------------------------------------------
/Figure 2/Core Edge Identification/state_anno.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | library(harmony)
  3 | library(cowplot)
  4 | library(RColorBrewer)
  5 | library(ggplot2)
  6 | load(file = "annotated_objects.Robj")
  7 | 
  8 | all_spots <- purrr::reduce(annotated_objects, merge)
  9 | 
 10 | DefaultAssay(all_spots) <- "Spatial"
 11 | 
 12 | #integration by sample
 13 | 
 14 | obj_list <- SplitObject(all_spots, split.by = "sample_id")
 15 | rm(list=setdiff(ls(), "obj_list"))
 16 | 
 17 | obj_list <- lapply(X = obj_list, FUN = function(x) {
 18 |   x <- NormalizeData(x)
 19 |   x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
 20 | })
 21 | 
 22 | anchors <- FindIntegrationAnchors(object.list = obj_list, dims = 1:30)
 23 | 
 24 | comb_as <- IntegrateData(anchorset = anchors, dims = 1:30)
 25 | comb_as <- ScaleData(comb_as, verbose = FALSE)
 26 | comb_as <- RunPCA(comb_as, npcs = 50, verbose = FALSE)
 27 | 
 28 | png("elbowplot_all.png",units = "in", res = 300, width = 6, height = 4, type = "cairo")
 29 | ElbowPlot(comb_as, ndims = 25)
 30 | dev.off()
 31 | 
 32 | comb_as <- RunUMAP(comb_as, reduction = "pca", dims = 1:15)
 33 | comb_as <- FindNeighbors(comb_as, reduction = "pca", dims = 1:15)
 34 | #comb_as <- FindClusters(comb_as, resolution = 0.5)
 35 | 
 36 | DefaultAssay(comb_as) <- "integrated"
 37 | 
 38 | save(comb_as, file = "comb_as.Robj")
 39 | 
 40 | comb_as <- RunPCA(comb_as, npcs = 6, verbose = TRUE, seed.use = 12)
 41 | comb_as <- RunUMAP(comb_as, reduction = "pca", dims = 1:6, seed.use = 12)
 42 | comb_as <- FindNeighbors(comb_as, reduction = "pca", dims = 1:6, seed.use = 12)
 43 | 
 44 | #plot for pathologist annotations used in Figure 1
 45 | png("all_samples_patho.png",units = "in", res = 300, width = 6, height = 4, type = "cairo")
 46 | DimPlot(comb_as, reduction = "umap", group.by = "pathologist_anno", 
 47 |         cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 
 48 |         "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 
 49 |         "Artifact" = "grey", "Artery/Vein" = "orange",
 50 |         "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple",
 51 |         "Glandular Stroma" = "violet"))
 52 | dev.off()
 53 | 
 54 | #plot for cancer cell identification by pcnv used in Figure 1
 55 | png("all_samples_pcnv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
 56 | FeaturePlot(comb_as,features = "p_cnv", min.cutoff = 0.99, pt.size = 0.2,cols = c("grey", "#D9381E"),
 57 |             max.cutoff = 1)
 58 | dev.off()
 59 | 
 60 | #plot for cancer cell identification by deconvolution used in Figure 1
 61 | png("all_samples_deconv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
 62 | FeaturePlot(comb_as,features = "cancer.cell", min.cutoff = 0.99, pt.size = 0.2,cols = c("grey", "#DC143C"),
 63 |             max.cutoff = 1)
 64 | dev.off()
 65 | 
 66 | colourCount = length(unique(comb_as$final_celltype))
 67 | 
 68 | #plot for celltype used in Figure 1.
 69 | png("all_samples_final_celltype.png",units = "in", res = 300, width = 6.5, height = 5, type = "cairo")
 70 | DimPlot(comb_as, reduction = "umap", group.by = "final_celltype", 
 71 |         cols = c("B.cell" = cols[1], "cancer cell"= cols[2], "cytotoxic.CD8..T." = cols[3],
 72 |         "cytotoxic.CD8..T.exhausted" = cols[4],
 73 |         "dendritic." = cols[5],
 74 |         "detox.iCAF" = cols[6],
 75 |         "ecm.myCAF" = cols[7],
 76 |         "endothelial" = cols[8],
 77 |         "Intermediate.fibroblast" = cols[9],
 78 |         "macrophage" = cols[10],
 79 |         "mast" = cols[11],
 80 |         "myofibroblast" = cols[12],
 81 |         "Tregs" = cols[13],
 82 |         "NA" = cols[14]))
 83 | dev.off()
 84 | 
 85 | #plot for cancer cell identification used in figure 1
 86 | png("all_samples_iscancercell.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
 87 | DimPlot(comb_as, reduction = "umap", group.by = "is_cancer_cell",cols = c("grey", "red"))
 88 | dev.off()
 89 | 
 90 | #example plot for sample 1 of pathologist annotations used in Figure 1
 91 | png("sample_1_patho_anno.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
 92 | SpatialDimPlot(obj_list[[1]], group.by = "pathologist_anno", images = "tumor", image.alpha = 0.5,
 93 |                stroke = 0,
 94 |         cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 
 95 |                  "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 
 96 |                  "Artifact" = "grey", "Artery/Vein" = "orange",
 97 |                  "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple",
 98 |                  "Glandular Stroma" = "violet")) + NoLegend()
 99 | dev.off()
100 | 
101 | #example plot for pcnv for sample 1 used in Figure 1
102 | png("sample_1_pcnv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
103 | SpatialFeaturePlot(obj_list[[1]],features = "p_cnv", images = "tumor", min.cutoff = 0.99,stroke = 0,
104 |                    image.alpha = 0.5) +
105 |   scale_fill_gradient(low = "grey", high = "#D9381E")+ NoLegend()
106 | dev.off()
107 | 
108 | #example plot for deconvolution for sample 1 used in Figure 1
109 | png("sample_1_deconv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
110 | SpatialFeaturePlot(obj_list[[1]],features = "cancer.cell", images = "tumor", min.cutoff = 0.99,
111 |                    stroke = 0,
112 |                    image.alpha = 0.5) +
113 |   scale_fill_gradient(low = "grey", high = "#DC143C")+ NoLegend()
114 | dev.off()
115 | 
116 | #example plot for cancer cell status for sample 1 used in Figure 1
117 | png("sample_1_iscancercell.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
118 | SpatialDimPlot(obj_list[[1]], group.by = "is_cancer_cell", images = "tumor",image.alpha = 0.5,
119 | 
120 |                stroke = 0,
121 |                cols = c("0" = "grey","1" = "red")) + NoLegend()
122 | dev.off()
123 | 
124 | cols = colorRampPalette(brewer.pal(9, "Set1"))(colourCount)
125 | #example plot for final celltype for sample 1 used in Figure 1
126 | png("sample_1_final_celltype.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
127 | SpatialDimPlot(annotated_objects[[1]], group.by = "final_celltype", images = "tumor",image.alpha = 0.5,
128 |                stroke = 0,
129 |               cols = c("B.cell" = cols[1], "cancer cell"= cols[2], "cytotoxic.CD8..T." = cols[3],
130 |                        "cytotoxic.CD8..T.exhausted" = cols[4],
131 |                        "detox.iCAF" = cols[6],
132 |                        "ecm.myCAF" = cols[7],
133 |                        "endothelial" = cols[8],
134 |                        "Intermediate.fibroblast" = cols[9],
135 |                        "macrophage" = cols[10],
136 |                        "mast" = cols[11],
137 |                        "myofibroblast" = cols[12],
138 |                        "Tregs" = cols[13]))+ NoLegend()
139 | 
140 | #plot proportions of celltypes for supplementary fig 1
141 | all_data <- all_spots@meta.data
142 | 
143 | all_data[is.na(all_data)] <- "unknown"
144 | colourCount = length(unique(all_data$final_celltype))
145 | 
146 | library(RColorBrewer)
147 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8",
148 |                                                             "sample_9","sample_10", "sample_11","sample_12"))
149 | png("celltype_bar.png",units = "in", res = 300, width = 7, height = 5, type = "cairo")
150 | ggplot(all_data, aes(x = sample_id, fill = final_celltype))  + 
151 |   geom_bar(position="fill")+
152 |   theme_bw() + 
153 |   labs(x=c("spatial annotation"), y="Proportion")+ 
154 |   theme(panel.grid.major = element_blank(), 
155 |         panel.grid.minor = element_blank(),
156 |         panel.border = element_blank(), panel.background = element_blank()) +
157 |   guides(fill = guide_legend(title = 'cell type')) +
158 |   scale_fill_manual(values = colorRampPalette(brewer.pal(9, "Set1"))(colourCount))+
159 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
160 | dev.off()
161 | 
162 | #plot proportions of pathologist regions for supplementary fig 1
163 | all_data <- all_spots@meta.data
164 | 
165 | all_data[is.na(all_data)] <- "unknown"
166 | colourCount = length(unique(all_data$pathologist_anno))
167 | 
168 | library(RColorBrewer)
169 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8",
170 |                                                             "sample_9","sample_10", "sample_11","sample_12"))
171 | png("celltype_bar_pathologist_anno.png",units = "in", res = 300, width = 7, height = 5, type = "cairo")
172 | ggplot(all_data, aes(x = sample_id, fill = pathologist_anno))  + 
173 |   geom_bar(position="fill")+
174 |   theme_bw() + 
175 |   labs(x=c("spatial annotation"), y="Proportion")+ 
176 |   theme(panel.grid.major = element_blank(), 
177 |         panel.grid.minor = element_blank(),
178 |         panel.border = element_blank(), panel.background = element_blank()) +
179 |   guides(fill = guide_legend(title = 'pathologist annotations')) +
180 |   scale_fill_manual(values = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 
181 |                                "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 
182 |                                "Artifact" = "grey", "Artery/Vein" = "orange",
183 |                                "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple","Glandular Stroma" = "violet"))+
184 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
185 | dev.off()
186 | 
187 | #plot dimplot of pathologist region for each sample and cancer cell annotation for each sample for supplementary fig 1
188 | for(object in annotated_objects) {
189 |   #image = paste0("tumor.",as.numeric(stringr::str_split(unique(object@meta.data$sample_id),"_")[[1]][2])-1)
190 |   png(paste0("pathologist_anno/",unique(object@meta.data$sample_id),"patho_anno.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo")
191 |   print(SpatialDimPlot(object, group.by = "pathologist_anno", images = "tumor", image.alpha = 0.5,
192 |                        stroke = 0,
193 |                        cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 
194 |                                 "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 
195 |                                 "Artifact" = "grey", "Artery/Vein" = "orange",
196 |                                 "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple",
197 |                                 "Glandular Stroma" = "violet")) + NoLegend())
198 |   dev.off()
199 |   
200 |   png(paste0("cancer_cell_anno/",unique(object@meta.data$sample_id),"iscancercell.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo")
201 |   print(SpatialDimPlot(object, group.by = "is_cancer_cell", images = "tumor",image.alpha = 0.5,
202 |                        stroke = 0,
203 |                        cols = c("0" = "grey","1" = "red")) + NoLegend())
204 |   dev.off()
205 |   
206 | }
207 | 
208 | all_data <- comb_as@meta.data
209 | 
210 | #Only unknown spots are in sample 3 and this is because there are a few low quality spots that are not on the tissue section
211 | #Due to this, the "unknown status" will be assumed to be noncancer 
212 | View(all_data$is_cancer_cell[is.na(all_data$is_cancer_cell)])
213 | all_data[is.na(all_data)] <- "0"
214 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8",
215 |                                                             "sample_9","sample_10", "sample_11","sample_12"))
216 | 
217 | png("malignant_barplot.png",units = "in", res = 300, width = 7, height = 5, type = "cairo")
218 | ggplot(all_data, aes(x = sample_id, fill = is_cancer_cell))  + 
219 |   geom_bar(position="fill")+
220 |   theme_bw() + 
221 |   labs(x=c("spatial annotation"), y="Proportion")+ 
222 |   theme(panel.grid.major = element_blank(), 
223 |         panel.grid.minor = element_blank(),
224 |         panel.border = element_blank(), panel.background = element_blank()) +
225 |   guides(fill = guide_legend(title = 'malignant status')) +
226 |   scale_fill_manual(values = c("0" = "grey","1" = "red"))+
227 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
228 | dev.off()
229 | 
230 | comb_d2c <- comb
231 | DefaultAssay(comb_d2c) <- "Spatial"
232 | 
233 | comb_d2c@assays$SCT = NULL
234 | 
235 | comb_d2c$cluster_annotations <- as.character(comb_d2c$cluster_annotations)
236 | SaveH5Seurat(comb_d2c, filename = "comb_d2c.h5Seurat")
237 | Convert("comb_d2c.h5Seurat", dest = "h5ad")
238 | 
239 | #integrate only cancer cells together
240 | all_spots_c <- subset(all_spots, subset = final_celltype == "cancer cell")
241 | 
242 | DefaultAssay(all_spots_c) <- "Spatial"
243 | #integration
244 | obj_list <- SplitObject(all_spots_c, split.by = "sample_id")
245 | rm(list=setdiff(ls(), "obj_list"))
246 | obj_list <- lapply(X = obj_list, FUN = function(x) {
247 |   x <- NormalizeData(x)
248 |   x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
249 | })
250 | 
251 | anchors <- FindIntegrationAnchors(object.list = obj_list, dims = 1:30)
252 | comb <- IntegrateData(anchorset = anchors, dims = 1:30)
253 | comb <- ScaleData(comb, verbose = FALSE)
254 | comb <- RunPCA(comb, npcs = 50, verbose = FALSE)
255 | comb <- RunUMAP(comb, reduction = "pca", dims = 1:50)
256 | 
257 | ElbowPlot(comb, ndims = 25)
258 | 
259 | comb <- RunUMAP(comb, reduction = "pca", dims = 1:10)
260 | comb <- FindNeighbors(comb, reduction = "pca", dims = 1:10)
261 | 
262 | save(comb, file = "comb.Robj")
263 | 
264 | library(Seurat)
265 | 
266 | load(file = "comb.Robj")
267 | 
268 | comb <- Seurat::FindClusters(object = comb, resolution = 1)
269 | # pull the tree
270 | data.tree <- Tool(object = comb, slot = "BuildClusterTree")
271 | 
272 | #plot of clusters for fig 2
273 | png("cluster_res1.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
274 | Seurat::DimPlot(comb)
275 | dev.off()
276 | 
277 | #plot of phylogenetic tree for fig 2
278 | png("cluster_res1_phylo.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
279 | ape::plot.phylo(x = data.tree, direction = "right")
280 | dev.off()
281 | 
282 | comb <- Seurat::FindClusters(object = comb, resolution = 1)
283 | 
284 | trial_ids <- c("n3", "n2","n3","n2","n3", "n1","n3","n3","n2","n1","n3","n2","n2","n2","n1")
285 | 
286 | names(trial_ids) <- levels(comb)
287 | 
288 | comb <- RenameIdents(comb, trial_ids)
289 | 
290 | library(SCpubr)
291 | library(ggplot2)
292 | 
293 | #plot of nodes for Fig 2
294 | png("nodal_definitions.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
295 | DimPlot(comb, reduction = "umap", label = FALSE, pt.size = 0.5, 
296 |         cols = c("n1" = "#4DBBD5FF",
297 |                  "n2" = "#F9E076", "n3" = "#E64B35FF")) 
298 | dev.off()
299 | 
300 | load(file = "comb.Robj")
301 | #heatmap of top markers
302 | DefaultAssay(comb) <- "integrated"
303 | 
304 | comb <- NormalizeData(comb)
305 | top_markers <- FindAllMarkers(comb, only.pos = TRUE)
306 | 
307 | #plot of top markers in each nodal group 
308 | png("logfc_top_markers.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
309 | SCpubr::do_GroupwiseDEPlot(sample = comb,
310 |                            de_genes = tibble::tibble(top_markers),
311 |                            top_genes = 5)
312 | dev.off()
313 | 
314 | write.csv(top_markers, file = "node_markers.csv")
315 | 
316 | #nebulosa plot of validated markers for fig 2 
317 | png("nebulosa_CLDN4.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo")
318 | SCpubr::do_NebulosaPlot(comb, "CLDN4")
319 | dev.off()
320 | 
321 | png("nebulosa_SPRR1B.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo")
322 | SCpubr::do_NebulosaPlot(comb, "SPRR1B")
323 | dev.off()
324 | 
325 | png("nebulosa_LAMC2.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo")
326 | SCpubr::do_NebulosaPlot(comb, "LAMC2")
327 | dev.off()
328 | 
329 | png("nebulosa_ITGA5.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo")
330 | SCpubr::do_NebulosaPlot(comb, "ITGA5")
331 | dev.off()
332 | 
333 | comb <- Seurat::FindClusters(object = comb, resolution = 1)
334 | 
335 | trial_ids <- c("edge", "transitory","edge","transitory","edge", "core","edge","edge","transitory","core","edge","transitory","transitory","transitory","core")
336 | 
337 | names(trial_ids) <- levels(comb)
338 | 
339 | comb <- RenameIdents(comb, trial_ids)
340 | 
341 | library(SCpubr)
342 | library(ggplot2)
343 | 
344 | #renaming regions as core and edge
345 | png("core_edge_transitory_anno.png",units = "in", res = 300, width = 5.5, height = 4, type = "cairo")
346 | DimPlot(comb, reduction = "umap", label = FALSE, pt.size = 0.5, 
347 |         cols = c("core" = "#4DBBD5FF",
348 |                  "transitory" = "#F9E076", "edge" = "#E64B35FF")) 
349 | dev.off()
350 | 
351 | ##plot of clonotypes by state for supplementary fig 3
352 | library(ggplot2)
353 | library(RColorBrewer)
354 | comb@meta.data$core_edge_anno
355 | meta_df <- comb@meta.data
356 | meta_df$clone_opt <- as.factor(meta_df$clone_opt)
357 | 
358 | png("cancer_cell_clone_byregion.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
359 | ggplot(meta_df, aes(x = core_edge_anno, fill = clone_opt))  + 
360 |   geom_bar(position="fill")+
361 |   facet_wrap(~sample_id)+
362 |   theme_bw() + 
363 |   labs(x=c("sample ID"), y="Proportion of each clone in cancer cell regions")+ 
364 |   theme(panel.grid.major = element_blank(), 
365 |         panel.grid.minor = element_blank(),
366 |         panel.border = element_blank(), panel.background = element_blank()) +
367 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
368 |   scale_fill_manual(values = colorRampPalette(brewer.pal(6, "Set2"))(6))
369 | dev.off()
370 | 
371 | # top_markers <- FindAllMarkers(comb, only.pos = TRUE)
372 | # write.csv(top_markers, file = "node_markers.csv")
373 | 
374 | 
375 | 
376 | 


--------------------------------------------------------------------------------
/Figure 2/Correlation Analysis/corplot.R:
--------------------------------------------------------------------------------
 1 | library(reshape2)
 2 | library(patchwork)
 3 | library(ggplot2)
 4 | library(dplyr)
 5 | library(Seurat)
 6 | library(RColorBrewer)
 7 | library(sctransform)
 8 | 
 9 | load(file = "comb.Robj")
10 | 
11 | Idents(comb) = "cluster_annotations"
12 | comb_core_edge <- subset(comb, idents = c("core","edge"))
13 | 
14 | comb_core_edge@meta.data$sample_group <- paste(comb_core_edge@meta.data$sample_id, 
15 |                                                       comb_core_edge@meta.data$cluster_annotations, sep = "_")
16 | 
17 | object <- comb_core_edge
18 | Idents(object) = "sample_group"
19 | object@meta.data$sample_group <- factor(object@meta.data$sample_group, levels = c(
20 |   "sample_1_edge",
21 |   "sample_2_edge",
22 |   "sample_3_edge",
23 |   "sample_4_edge",
24 |   "sample_5_edge",
25 |   "sample_6_edge",
26 |   "sample_7_edge",
27 |   "sample_8_edge",
28 |   "sample_9_edge",
29 |   "sample_10_edge",
30 |   "sample_11_edge",
31 |   "sample_12_edge",
32 |   "sample_1_core",
33 |   "sample_2_core",
34 |   "sample_3_core",
35 |   "sample_4_core",
36 |   "sample_5_core",
37 |   "sample_6_core",
38 |   "sample_7_core",
39 |   "sample_8_core",
40 |   "sample_9_core",
41 |   "sample_10_core",
42 |   "sample_11_core",
43 |   "sample_12_core"))
44 | Idents(object) = "sample_group"
45 | 
46 | av.exp <- AverageExpression(object)$SCT
47 | av.exp <- na.omit(av.exp)
48 | cor.exp <- as.data.frame(cor(av.exp))
49 | cor.exp$x <- rownames(cor.exp)
50 | cor.df <- tidyr::gather(data = cor.exp, y, correlation, unique(comb_core_edge@meta.data$sample_group))
51 | 
52 | setwd("corplot/")
53 | 
54 | group = "cluster_annotations"
55 | cor.exp$x = NULL
56 | cor.exp <- as.matrix(cor.exp)
57 | cols <- RColorBrewer::brewer.pal(11, "Spectral")
58 | cols <- rev(cols)
59 | 
60 | group1 <- unique(object@meta.data[[group]])[1]
61 | group2 <- unique(object@meta.data[[group]])[2]
62 | 
63 | 
64 | cor.exp2 = data.frame(group = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 
65 |                                 rep(group2, length(unique(object@meta.data$sample_group))/2)))
66 | 
67 | 
68 | list1 = list(group = c(group1 = '#E64B35FF',
69 |                        group2 = '#4DBBD5FF'))
70 | 
71 | list2 = list(setNames(list1$group, c(group1, group2)))
72 | 
73 | names(list2) = "group"
74 | library(ComplexHeatmap)
75 | ha = HeatmapAnnotation(df = cor.exp2, col = list2)
76 | hab = rowAnnotation(df = cor.exp2, col = list2)
77 | 
78 | #generate correlation plot for Figure 2
79 | png("heatmap_corr.png",units = "in", res = 300, width = 10, height = 9,type = "cairo")
80 | 
81 | print(ComplexHeatmap::Heatmap(cor.exp, col = cols, bottom_annotation = ha, 
82 |                               right_annotation = hab, column_split = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 
83 |                                                                        rep(group2, length(unique(object@meta.data$sample_group))/2)), 
84 |                               
85 |                               row_split = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 
86 |                                             rep(group2, length(unique(object@meta.data$sample_group))/2)),
87 |                               cluster_rows = T,
88 |                               cluster_columns = T
89 | ))
90 | dev.off()
91 | 


--------------------------------------------------------------------------------
/Figure 2/Differential Expression/degs.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | library(dplyr)
  3 | library(tibble)
  4 | library(cowplot)
  5 | library(patchwork)
  6 | load(file = "annotated_objects.Robj")
  7 | load(file = "comb.Robj")
  8 |  
  9 | comb_meta <- comb@meta.data
 10 | #transfer annotations of core/transitory/edge back to objects with spatial positions
 11 | for (object in annotated_objects) {
 12 |   meta <- object@meta.data
 13 |   
 14 |   meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names")
 15 |   
 16 |   meta_sample_id$cluster_annotations <- factor(meta_sample_id$cluster_annotations, levels = c("core","edge","transitory","nc"))
 17 |   meta_sample_id$cluster_annotations[is.na(meta_sample_id$cluster_annotations)] <- "nc"
 18 |   
 19 |   annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id
 20 | }
 21 | 
 22 | #barplot of state for core transtiroy adn edge regions 
 23 | png("core_transitory_edge/barplot.png",units = "in", res = 300, width = 5, height = 5, type = "cairo")
 24 | ggplot(comb@meta.data, aes(x = factor(comb@meta.data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8",
 25 |                                                        "sample_9","sample_10", "sample_11","sample_12")), fill = cluster_annotations))  + 
 26 |   geom_bar(position="fill")+
 27 |   theme_bw() + 
 28 |   labs(x=c("spatial annotation"), y="Proportion")+ 
 29 |   theme(panel.grid.major = element_blank(), 
 30 |         panel.grid.minor = element_blank(),
 31 |         panel.border = element_blank(), panel.background = element_blank()) +
 32 |   guides(fill = guide_legend(title = 'cancer cell state')) +
 33 |   scale_fill_manual(values = c('edge' = '#E64B35FF',
 34 |                                'transitory' = '#F9E076',
 35 |                                'core' = '#4DBBD5FF'))+
 36 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
 37 | 
 38 | dev.off()
 39 | 
 40 | #core and edge annoations for each sample (used in Figure 2)
 41 | for(object in annotated_objects) {
 42 |   png(paste0("core_transitory_edge/",unique(object@meta.data$sample_id.x),"core_edge_anno.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo")
 43 |   print(SpatialDimPlot(object, group.by = "cluster_annotations", images = "tumor",image.alpha = 0.5,
 44 |                        stroke = 0,
 45 |                        cols = c('edge' = '#E64B35FF',
 46 |                                 'transitory' = '#F9E076',
 47 |                                 'core' = '#4DBBD5FF')) + NoLegend())
 48 |   dev.off()
 49 |   
 50 | }
 51 | 
 52 | # lapply(
 53 | #   annotated_objects,
 54 | #   function(sample) {
 55 | #     SpatialDimPlot(sample,group.by  = "cluster_annotations")
 56 | #   }
 57 | # ) %>% 
 58 | #   wrap_plots(guides = 'collect')
 59 | 
 60 | all_samples <- annotated_objects
 61 | 
 62 | save(all_samples, file = "all_samples.Robj")
 63 | 
 64 | load(file = "all_samples.Robj")
 65 | 
 66 | #### DEG analysis
 67 | 
 68 | #define colours
 69 | patient_col_dict <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3",
 70 |                       "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 
 71 |                       "sample_7" = "#F7CE5B", 
 72 |                       "sample_8" = "#7CEA9C","sample_9" = "#7C90A0",
 73 |                       "sample_10" = "#DCCCFF","sample_11" = "#EE6055",
 74 |                       "sample_12" = "#C492B1")
 75 | markers_list = c()
 76 | i = 1
 77 | for (Seurat_obj in all_samples) { 
 78 | 
 79 |   Idents(Seurat_obj) <- "cluster_annotations"
 80 |   #scale and normalize each sample individually
 81 |   Seurat_obj <- ScaleData(Seurat_obj)
 82 |   Seurat_obj <- NormalizeData(Seurat_obj)
 83 |   sub_obj <- subset(Seurat_obj, idents = c("core","edge"))
 84 |   
 85 |   #core is positive, edge is negative
 86 |   results <- FindMarkers(sub_obj,ident.1  = "core",ident.2 = "edge", group.by = "cluster_annotations")%>%
 87 |     tibble::rownames_to_column(var = "gene")%>%
 88 |     tibble::add_column("sample_id.x" = unique(sub_obj@meta.data$sample_id.x))
 89 |   
 90 |   write.csv(results, file = paste("differential_expression/raw_csvs/",unique(sub_obj@meta.data$sample_id.x),"_raw_deg.csv", sep = ""))
 91 |   markers_list[[i]] <- results
 92 |   i = i+1
 93 | }
 94 | 
 95 | #code adapted to make consensus plots
 96 | library(dplyr)
 97 | 
 98 | markers_list_saved <- markers_list
 99 | 
100 | markers.list <- plyr::ldply(markers_list, data.frame)
101 | 
102 | #pval cutoff of 0.001
103 | 
104 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.001,]
105 | 
106 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC", "sample_id.x")]
107 | 
108 | markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id.x, value.var = "avg_log2FC")
109 | 
110 | # markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id.x, value.var = "log2FC")
111 | 
112 | markers.list.mat[is.na(markers.list.mat)] = 0
113 | markers.list.mat <- markers.list.mat %>% tibble::column_to_rownames(var = "gene")
114 | 
115 | markers.matrix = markers.list.mat
116 | 
117 | markers.matrix.m <- markers.matrix %>% tibble::rownames_to_column(var = "gene")
118 | markers.matrix.m <- reshape2::melt(markers.matrix.m)
119 | order.markers <- rownames(markers.matrix[order(rowSums(-markers.matrix)),])
120 | markers.sum <- as.data.frame(ifelse(markers.matrix != 0, 1, 0))
121 | markers.sum$total <- rowSums(markers.sum)
122 | #must be differnetially expressed in at least 9 samples
123 | keep <- rownames(markers.sum[markers.sum$total>9,])
124 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% keep,]
125 | 
126 | markers_table <- reshape2::dcast(markers.matrix.m, formula = gene~variable, value.var = "value")
127 | markers_table <- column_to_rownames(markers_table, "gene")
128 | markers_table$sum <- rowSums(markers_table)
129 | write.csv(markers_table, file = "markers_table.csv")
130 | 
131 | core_genes <- markers.matrix.m[markers.matrix.m$value > 0,]
132 | edge_genes <- markers.matrix.m[markers.matrix.m$value < 0,]
133 | 
134 | core_genes <- unique(core_genes$gene)
135 | edge_genes <- unique(edge_genes$gene)
136 | 
137 | saveRDS(core_genes,"gene_lists/core_genes.RDS")
138 | saveRDS(edge_genes,"gene_lists/edge_genes.RDS")
139 | 
140 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% union(edge_genes,core_genes),]
141 | 
142 | setwd("differential_expression/")
143 | #create consensus plot 
144 | png("differentially_expressed_genes_sct_seurat.png", units="in", width=20, height=7, res=300, type = "cairo")
145 | ggplot(markers.matrix.m, aes(x = factor(gene, level = order.markers), 
146 |                              y = value, fill = variable)) +
147 |   geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 
148 |   scale_fill_manual( values = patient_col_dict)+
149 |   labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id.x") +
150 |   ggpubr::rotate_x_text()
151 | dev.off()
152 | 
153 | #only plot top 25 genes for Figure 2
154 | topweights <-aggregate(value~gene,markers.matrix.m,sum)
155 | 
156 | topweights <- topweights[order(-topweights$value),] 
157 | 
158 | markers_top_25 <- markers.matrix.m[markers.matrix.m$gene %in% c(head(topweights,25)$gene,tail(topweights,25)$gene),]
159 | 
160 | png("deg_top25.png", units="in", width=9, height=3, res=300, type = "cairo")
161 | ggplot(markers_top_25, aes(x = factor(gene, level = order.markers), 
162 |                              y = value, fill = variable)) +
163 |   geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 
164 |   scale_fill_manual( values = patient_col_dict)+
165 |   labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id.x") +
166 |   ggpubr::rotate_x_text()
167 | dev.off()
168 | 
169 | 
170 | setwd("..")
171 | 
172 | #do similar analysis to extract raw data for each sample for IPA
173 | 
174 | markers_list = c()
175 | i = 1
176 | for (Seurat_obj in all_samples) { 
177 |  
178 |   Idents(Seurat_obj) <- "cluster_annotations"
179 |   Seurat_obj <- ScaleData(Seurat_obj)
180 |   Seurat_obj <- NormalizeData(Seurat_obj)
181 |   sub_obj <- subset(Seurat_obj, idents = c("core","edge"))
182 |   
183 | results <- FindMarkers(sub_obj,ident.1  = "core",ident.2 = "edge", group.by = "cluster_annotations")%>%
184 |   tibble::rownames_to_column(var = "gene")%>%
185 |   tibble::add_column("sample_id.x" = unique(sub_obj@meta.data$sample_id.x))
186 |  
187 |  results <- results[results$p_val_adj < 0.001,]
188 |  results_edge <- results[results$avg_log2FC < 0,]
189 |  results_edge$avg_log2FC <- results_edge$avg_log2FC*-1
190 |  results_core <- results[results$avg_log2FC > 0,]
191 |  write.csv(results_edge, file = paste("differential_expression/IPA/",unique(sub_obj@meta.data$sample_id.x),"_edge_raw_deg.csv", sep = ""))
192 |  write.csv(results_core, file = paste("differential_expression/IPA/",unique(sub_obj@meta.data$sample_id.x),"_core_raw_deg.csv", sep = ""))
193 |  
194 |  markers_list[[i]] <- results
195 |  i = i+1
196 | }
197 | 
198 | 
199 | #check overlap with puram et al. genesets
200 | 
201 | puram_core <- scan("puram_genesets/epithelial_diff.txt", what="", sep="\n")
202 | 
203 | puram_edge <- scan("puram_genesets/p_emt.txt", what="", sep="\n")
204 | 
205 | core_genes <- readRDS("gene_lists/core_genes.RDS")
206 | 
207 | edge_genes <- readRDS("gene_lists/edge_genes.RDS")
208 | 
209 | length(intersect(puram_core,core_genes))
210 | 
211 | #47/80 overlapping for epithelial diff
212 | 
213 | length(intersect(puram_edge,edge_genes))
214 | 
215 | #only 7/100 edge overlap
216 |            


--------------------------------------------------------------------------------
/Figure 2/IPA/IPA Upstream regulator.R:
--------------------------------------------------------------------------------
  1 | library(remotes)
  2 | library(clusterProfiler)
  3 | library(rlang)
  4 | library(multienrichjam);
  5 | library(jamba)
  6 | library(amap)
  7 | #> 
  8 | #> Attaching package: 'jamba'
  9 | #> The following objects are masked from 'package:multienrichjam':
 10 | #> 
 11 | #>     heatmap_column_order, heatmap_row_order
 12 | library(colorjam);
 13 | suppressPackageStartupMessages(library(ComplexHeatmap));
 14 | options("stringsAsFactors"=FALSE, "warn"=-1);
 15 | knitr::opts_chunk$set(
 16 |   fig.height=10,
 17 |   fig.width=10,
 18 |   fig.align="center"
 19 | )
 20 | ragg_png = function(..., res = 192) {
 21 |   ragg::agg_png(..., res = res, units = "in")
 22 | }
 23 | knitr::opts_chunk$set(dev = "ragg_png", fig.ext = "png")
 24 | 
 25 | 
 26 | #Importing IPA Text file
 27 | ipa_files <- c(Patient_1_Core='patient1-core.txt',
 28 |                Patient_2_Core='patient2-core.txt',
 29 |                Patient_3_Core='patient3-core.txt',
 30 |                Patient_4_Core='patient4-core.txt',
 31 |                Patient_5_Core='patient5-core.txt',
 32 |                Patient_6_Core='patient6-core.txt',
 33 |                Patient_7_Core='patient7-core.txt',
 34 |                Patient_8_Core='patient8-core.txt',
 35 |                Patient_9_Core='patient9-core.txt',
 36 |                Patient_10_Core='patient10-core.txt',
 37 |                Patient_11_Core='patient11-core.txt',
 38 |                Patient_12_Core='patient12-core.txt',
 39 |                Patient_1_Edge='patient1-edge.txt',
 40 |                Patient_2_Edge='patient2-edge.txt',
 41 |                Patient_3_Edge='patient3-edge.txt',
 42 |                Patient_4_Edge='patient4-edge.txt',
 43 |                Patient_5_Edge='patient5-edge.txt',
 44 |                Patient_6_Edge='patient6-edge.txt',
 45 |                Patient_7_Edge='patient7-edge.txt',
 46 |                Patient_8_Edge='patient8-edge.txt',
 47 |                Patient_9_Edge='patient9-edge.txt',
 48 |                Patient_10_Edge='patient10-edge.txt',
 49 |                Patient_11_Edge='patient11-edge.txt',
 50 |                Patient_12_Edge='patient12-edge.txt')
 51 | 
 52 | ipa_l <- lapply(ipa_files, importIPAenrichment)
 53 | ssdim(ipa_l)
 54 | 
 55 | 
 56 | #Analyze IPA enrichments from canonical enrichment test
 57 | enrichList_canonical <- lapply(ipa_l, function(i){
 58 |   i[["Upstream Regulators"]];
 59 | });
 60 | sdim(enrichList_canonical);
 61 | 
 62 | ## Convert data.frame to enrichResult
 63 | ## multienrichjam::enrichDF2enrichResult
 64 | er_canonical <- lapply(enrichList_canonical, function(i){
 65 |   enrichDF2enrichResult(i,
 66 |                         keyColname="Name",
 67 |                         pvalueColname="P-value",
 68 |                         geneColname="geneNames",
 69 |                         pathGenes="Bias Term",
 70 |                         pvalueCutoff=1)
 71 | });
 72 | sdim(er_canonical);
 73 | head(as.data.frame(er_canonical[[1]]));
 74 | 
 75 | 
 76 | ?multiEnrichMap
 77 | #Running multiEnrichMap()
 78 | mem_canonical <- multiEnrichMap(er_canonical,
 79 |                                 enrichBaseline=1,
 80 |                                 cutoffRowMinP=0.05,
 81 |                                 colorV=c("purple", "orange"),
 82 |                                 topEnrichN=20)
 83 | sdim(mem_canonical)
 84 | 
 85 | nameColname = c("Name", "pathway", "Description", "itemsetID", 
 86 |                 "ID")
 87 | nameColname <- find_colname(nameColname, er_canonical)
 88 | #
 89 | # for upstreeam regulator use Activation z-score
 90 | enrichIM <- enrichList2IM(er_canonical, valueColname = "Activation z-score", 
 91 |                           keyColname = nameColname, verbose = F, emptyValue = 0, 
 92 |                           GmtT = NULL)
 93 | 
 94 | enrichIM <- enrichIM[rownames(enrichIM) %in% rownames(mem_canonical$enrichIM),]
 95 | 
 96 | #change enrichIM if you want to change the whole plot
 97 | 
 98 | enrichIM <- enrichIM[rowSums(enrichIM[]) != 0 ,]
 99 | 
100 | mem_canonical$enrichIM <- mem_canonical$enrichIM[rownames(mem_canonical$enrichIM) %in% rownames(mem_canonical$enrichIM),]
101 | 
102 | binarydf <- enrichIM
103 | binarydf <- as.data.frame(ifelse(enrichIM != 0, 1, 0))
104 | binarydf$total <- rowSums(binarydf)
105 | keep <- rownames(binarydf[binarydf$total>10,])
106 | enrichIM <- enrichIM[rownames(enrichIM) %in% keep,]
107 | 
108 | use_matrix <- enrichIM
109 | 
110 | name = "Activation z-score"
111 | col_logp <- circlize::colorRamp2(breaks = c(min(enrichIM), 0, max(enrichIM)), colors = c(rev(jamba::getColorRamp("RdBu", 
112 |                                                                                                                  lens = 2, n = 3, trimRamp = c(2, 2)))))
113 | 
114 | er_hc2 <- amap::hcluster(link = "ward", jamba::noiseFloor(enrichIM[rownames(enrichIM), 
115 |                                                                    , drop = FALSE], minimum = min(enrichIM), 
116 |                                                           newValue = 0, ceiling = max(enrichIM)), method = "euclidean")
117 | er_hc2 <- as.dendrogram(er_hc2)
118 | row_cex = 1
119 | row_fontsize <- jamba::noiseFloor(row_cex * 60/(nrow(enrichIM))^(1/2), 
120 |                                   minimum = 12, ceiling = 18)
121 | column_cex = 1
122 | 
123 | column_fontsize <- jamba::noiseFloor(column_cex * 60/(ncol(enrichIM))^(1/2), 
124 |                                      minimum = 1, ceiling = 20)
125 | 
126 | show_heatmap_legend <- TRUE
127 | top_annotation <- NULL 
128 | show = NULL
129 | cell_fun_custom <- cell_fun_bivariate(list(use_matrix, 
130 |                                            FALSE, mem_canonical$enrichIMgeneCount), pch = 21, 
131 |                                       size_fun = ct_approxfun, size_by = 3, outline_style = "darker", 
132 |                                       col_hm = col_logp, show = show, cex = 0, 
133 |                                       type = "univariate", prefix = c("z-score: ", 
134 |                                                                       "-log10P: ", "genes: ")[show])
135 | 
136 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 
137 |                        name = name, col = col_logp, cluster_rows = er_hc2, 
138 |                        row_dend_reorder = TRUE, border = TRUE, 
139 |                        row_names_gp = grid::gpar(fontsize = row_fontsize), 
140 |                        row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = column_fontsize), 
141 |                        column_names_max_height = grid::unit(12, "cm"), 
142 |                        cluster_columns = FALSE, row_dend_width = grid::unit(30, "cm"), 
143 |                        column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")), 
144 |                        rect_gp = grid::gpar(type = "none"), cell_fun = cell_fun_custom, 
145 |                        show_heatmap_legend = show_heatmap_legend, top_annotation = top_annotation)
146 | 
147 | gene_count_max <- NULL
148 | min_count = 1
149 | if (length(gene_count_max) == 0) {
150 |   ctmax <- ceiling(max(mem_canonical$enrichIMgeneCount, na.rm = TRUE))
151 | } else {
152 |   ctmax <- gene_count_max
153 | }
154 | if (ctmax <= 1) {
155 |   ct_ticks <- c(0, 1)
156 | } else {
157 |   n <- 8
158 |   ct_ticks <- setdiff(unique(c(min_count, round(pretty(c(0, 
159 |                                                          ctmax), n = n)))), 0)
160 |   ct_step <- median(diff(ct_ticks))
161 |   if (max(ct_ticks) > ctmax) {
162 |     ct_ticks[which.max(ct_ticks)] <- ctmax
163 |     if (tail(diff(ct_ticks), 1) < ceiling(ct_step/4)) {
164 |       ct_ticks <- head(ct_ticks, -2)
165 |     }
166 |     else if (tail(diff(ct_ticks), 1) < ceiling(ct_step/2)) {
167 |       ct_ticks <- c(head(ct_ticks, -2), ctmax)
168 |     }
169 |   }
170 | }
171 | point_size_factor = 1
172 | point_size_max = 8
173 | point_size_min = 1
174 | 
175 | ct_approxfun <- function(x, ...) {
176 |   approxfun(x = sqrt(c(min_count, ctmax)), yleft = 0, 
177 |             ties = "ordered", yright = point_size_max, y = c(point_size_min, 
178 |                                                              point_size_max * point_size_factor))(sqrt(x), 
179 |                                                                                                   ...)
180 | }
181 | ct_tick_sizes <- ct_approxfun(ct_ticks)
182 | pt_legend_ncol <- 1
183 | if (length(ct_ticks) >= 8) {
184 |   pt_legend_ncol <- 2
185 | }
186 | 
187 | ct_tick_sizes <- ct_approxfun(ct_ticks)
188 | pch <- 21
189 | pt_legend <- ComplexHeatmap::Legend(labels = ct_ticks, 
190 |                                     title = "Gene Count", type = "points", pch = pch, 
191 |                                     ncol = pt_legend_ncol, size = grid::unit(ct_tick_sizes, 
192 |                                                                              "mm"), grid_height = grid::unit(max(ct_tick_sizes) * 
193 |                                                                                                                0.95, "mm"), grid_width = grid::unit(max(ct_tick_sizes) * 
194 |                                                                                                                                                       0.95, "mm"), background = "transparent", legend_gp = grid::gpar(col = "black", 
195 |                                                                                                                                                                                                                       fill = "grey85"))
196 | anno_legends <- list(pt_legend)
197 | attr(hm, "annotation_legend_list") <- anno_legends
198 | draw(hm, annotation_legend_list = anno_legends)
199 | 
200 | 
201 | cor.exp2 = data.frame(group = c(rep("core", 12), 
202 |                                 rep("edge", 12)))
203 | 
204 | 
205 | list1 = list(group = c("edge" = '#E64B35FF',
206 |                        "core" = '#4DBBD5FF'))
207 | 
208 | list2 = list(setNames(list1$group, c("edge", "core")))
209 | names(list2) = "group"
210 | 
211 | ha = HeatmapAnnotation(df = cor.exp2, col = list2)
212 | 
213 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 
214 |                        name = name, col = col_logp, cluster_rows = er_hc2, 
215 |                        row_dend_reorder = TRUE, border = TRUE, 
216 |                        row_names_gp = grid::gpar(fontsize = row_fontsize), 
217 |                        row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = 14), 
218 |                        column_names_max_height = grid::unit(12, "cm"), 
219 |                        cluster_columns = FALSE, row_dend_width = grid::unit(2, "cm"), 
220 |                        column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")),
221 |                        bottom_annotation = ha)
222 | 
223 | 
224 | png("IPA_Heatmap_UpReg.png", units = "in", res=300, width=9.5, height=16)
225 | draw(hm)
226 | dev.off()
227 | 


--------------------------------------------------------------------------------
/Figure 2/IPA/IPA cannonical.R:
--------------------------------------------------------------------------------
  1 | library(remotes)
  2 | library(clusterProfiler)
  3 | library(rlang)
  4 | library(multienrichjam);
  5 | library(jamba)
  6 | library(amap)
  7 | #> 
  8 | #> Attaching package: 'jamba'
  9 | #> The following objects are masked from 'package:multienrichjam':
 10 | #> 
 11 | #>     heatmap_column_order, heatmap_row_order
 12 | library(colorjam);
 13 | suppressPackageStartupMessages(library(ComplexHeatmap));
 14 | options("stringsAsFactors"=FALSE, "warn"=-1);
 15 | knitr::opts_chunk$set(
 16 |   fig.height=10,
 17 |   fig.width=10,
 18 |   fig.align="center"
 19 | )
 20 | ragg_png = function(..., res = 192) {
 21 |   ragg::agg_png(..., res = res, units = "in")
 22 | }
 23 | knitr::opts_chunk$set(dev = "ragg_png", fig.ext = "png")
 24 | 
 25 | 
 26 | #Importing IPA Text file
 27 | ipa_files <- c(Patient_1_Core='patient1-core.txt',
 28 |                Patient_2_Core='patient2-core.txt',
 29 |                Patient_3_Core='patient3-core.txt',
 30 |                Patient_4_Core='patient4-core.txt',
 31 |                Patient_5_Core='patient5-core.txt',
 32 |                Patient_6_Core='patient6-core.txt',
 33 |                Patient_7_Core='patient7-core.txt',
 34 |                Patient_8_Core='patient8-core.txt',
 35 |                Patient_9_Core='patient9-core.txt',
 36 |                Patient_10_Core='patient10-core.txt',
 37 |                Patient_11_Core='patient11-core.txt',
 38 |                Patient_12_Core='patient12-core.txt',
 39 |                Patient_1_Edge='patient1-edge.txt',
 40 |                Patient_2_Edge='patient2-edge.txt',
 41 |                Patient_3_Edge='patient3-edge.txt',
 42 |                Patient_4_Edge='patient4-edge.txt',
 43 |                Patient_5_Edge='patient5-edge.txt',
 44 |                Patient_6_Edge='patient6-edge.txt',
 45 |                Patient_7_Edge='patient7-edge.txt',
 46 |                Patient_8_Edge='patient8-edge.txt',
 47 |                Patient_9_Edge='patient9-edge.txt',
 48 |                Patient_10_Edge='patient10-edge.txt',
 49 |                Patient_11_Edge='patient11-edge.txt',
 50 |                Patient_12_Edge='patient12-edge.txt')
 51 | ipa_l <- lapply(ipa_files, importIPAenrichment)
 52 | ssdim(ipa_l)
 53 | 
 54 | 
 55 | #Analyze IPA enrichments from canonical enrichment test
 56 | enrichList_canonical <- lapply(ipa_l, function(i){
 57 |   i[["Canonical Pathways"]];
 58 | });
 59 | sdim(enrichList_canonical);
 60 | 
 61 | ## Convert data.frame to enrichResult
 62 | ## multienrichjam::enrichDF2enrichResult
 63 | er_canonical <- lapply(enrichList_canonical, function(i){
 64 |   enrichDF2enrichResult(i,
 65 |                         keyColname="Name",
 66 |                         pvalueColname="P-value",
 67 |                         geneColname="geneNames",
 68 |                         geneRatioColname="Ratio",
 69 |                         pvalueCutoff=1)
 70 | });
 71 | sdim(er_canonical);
 72 | head(as.data.frame(er_canonical[[1]]));
 73 | 
 74 | 
 75 | ?multiEnrichMap
 76 | #Running multiEnrichMap()
 77 | mem_canonical <- multiEnrichMap(er_canonical,
 78 |                                 enrichBaseline=1,
 79 |                                 cutoffRowMinP=0.05,
 80 |                                 colorV=c("purple", "orange"),
 81 |                                 topEnrichN=20)
 82 | sdim(mem_canonical)
 83 | 
 84 | nameColname = c("Name", "pathway", "Description", "itemsetID", 
 85 |                 "ID")
 86 | nameColname <- find_colname(nameColname, er_canonical)
 87 | #
 88 | # for upstreeam regulator use Activation z-score
 89 | enrichIM <- enrichList2IM(er_canonical, valueColname = "zScore", 
 90 |                           keyColname = nameColname, verbose = F, emptyValue = 0, 
 91 |                           GmtT = NULL)
 92 | 
 93 | enrichIM <- enrichIM[rownames(enrichIM) %in% rownames(mem_canonical$enrichIM),]
 94 | 
 95 | #change enrichIM if you want to change the whole plot
 96 | 
 97 | enrichIM <- enrichIM[rowSums(enrichIM[]) != 0 ,]
 98 | 
 99 | mem_canonical$enrichIM <- mem_canonical$enrichIM[rownames(mem_canonical$enrichIM) %in% rownames(mem_canonical$enrichIM),]
100 | 
101 | binarydf <- enrichIM
102 | binarydf <- as.data.frame(ifelse(enrichIM != 0, 1, 0))
103 | binarydf$total <- rowSums(binarydf)
104 | keep <- rownames(binarydf[binarydf$total>9,])
105 | enrichIM <- enrichIM[rownames(enrichIM) %in% keep,]
106 | 
107 | use_matrix <- enrichIM
108 | 
109 | name = "zscore"
110 | col_logp <- circlize::colorRamp2(breaks = c(min(enrichIM), 0, max(enrichIM)), colors = c(rev(jamba::getColorRamp("RdBu", 
111 |                                                                                                                  lens = 2, n = 3, trimRamp = c(2, 2)))))
112 | 
113 | er_hc2 <- amap::hcluster(link = "ward", jamba::noiseFloor(enrichIM[rownames(enrichIM), 
114 |                                                                    , drop = FALSE], minimum = min(enrichIM), 
115 |                                                           newValue = 0, ceiling = max(enrichIM)), method = "euclidean")
116 | er_hc2 <- as.dendrogram(er_hc2)
117 | row_cex = 1
118 | row_fontsize <- jamba::noiseFloor(row_cex * 60/(nrow(enrichIM))^(1/2), 
119 |                                   minimum = 16, ceiling = 20)
120 | column_cex = 1
121 | 
122 | column_fontsize <- jamba::noiseFloor(column_cex * 60/(ncol(enrichIM))^(1/2), 
123 |                                      minimum = 1, ceiling = 20)
124 | 
125 | show_heatmap_legend <- TRUE
126 | top_annotation <- NULL 
127 | show = NULL
128 | cell_fun_custom <- cell_fun_bivariate(list(use_matrix, 
129 |                                            FALSE, mem_canonical$enrichIMgeneCount), pch = 21, 
130 |                                       size_fun = ct_approxfun, size_by = 3, outline_style = "darker", 
131 |                                       col_hm = col_logp, show = show, cex = 0, 
132 |                                       type = "univariate", prefix = c("z-score: ", 
133 |                                                                       "-log10P: ", "genes: ")[show])
134 | 
135 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 
136 |                        name = name, col = col_logp, cluster_rows = er_hc2, 
137 |                        row_dend_reorder = TRUE, border = TRUE, 
138 |                        row_names_gp = grid::gpar(fontsize = row_fontsize), 
139 |                        row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = column_fontsize), 
140 |                        column_names_max_height = grid::unit(12, "cm"), 
141 |                        cluster_columns = FALSE, row_dend_width = grid::unit(30, "cm"), 
142 |                        column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")), 
143 |                        rect_gp = grid::gpar(type = "none"), cell_fun = cell_fun_custom, 
144 |                        show_heatmap_legend = show_heatmap_legend, top_annotation = top_annotation)
145 | 
146 | gene_count_max <- NULL
147 | min_count = 1
148 | if (length(gene_count_max) == 0) {
149 |   ctmax <- ceiling(max(mem_canonical$enrichIMgeneCount, na.rm = TRUE))
150 | } else {
151 |   ctmax <- gene_count_max
152 | }
153 | if (ctmax <= 1) {
154 |   ct_ticks <- c(0, 1)
155 | } else {
156 |   n <- 8
157 |   ct_ticks <- setdiff(unique(c(min_count, round(pretty(c(0, 
158 |                                                          ctmax), n = n)))), 0)
159 |   ct_step <- median(diff(ct_ticks))
160 |   if (max(ct_ticks) > ctmax) {
161 |     ct_ticks[which.max(ct_ticks)] <- ctmax
162 |     if (tail(diff(ct_ticks), 1) < ceiling(ct_step/4)) {
163 |       ct_ticks <- head(ct_ticks, -2)
164 |     }
165 |     else if (tail(diff(ct_ticks), 1) < ceiling(ct_step/2)) {
166 |       ct_ticks <- c(head(ct_ticks, -2), ctmax)
167 |     }
168 |   }
169 | }
170 | point_size_factor = 1
171 | point_size_max = 8
172 | point_size_min = 1
173 | 
174 | ct_approxfun <- function(x, ...) {
175 |   approxfun(x = sqrt(c(min_count, ctmax)), yleft = 0, 
176 |             ties = "ordered", yright = point_size_max, y = c(point_size_min, 
177 |                                                              point_size_max * point_size_factor))(sqrt(x), 
178 |                                                                                                   ...)
179 | }
180 | ct_tick_sizes <- ct_approxfun(ct_ticks)
181 | pt_legend_ncol <- 1
182 | if (length(ct_ticks) >= 8) {
183 |   pt_legend_ncol <- 2
184 | }
185 | 
186 | ct_tick_sizes <- ct_approxfun(ct_ticks)
187 | pch <- 21
188 | pt_legend <- ComplexHeatmap::Legend(labels = ct_ticks, 
189 |                                     title = "Gene Count", type = "points", pch = pch, 
190 |                                     ncol = pt_legend_ncol, size = grid::unit(ct_tick_sizes, 
191 |                                                                              "mm"), grid_height = grid::unit(max(ct_tick_sizes) * 
192 |                                                                                                                0.95, "mm"), grid_width = grid::unit(max(ct_tick_sizes) * 
193 |                                                                                                                                                       0.95, "mm"), background = "transparent", legend_gp = grid::gpar(col = "black", 
194 |                                                                                                                                                                                                                       fill = "grey85"))
195 | anno_legends <- list(pt_legend)
196 | attr(hm, "annotation_legend_list") <- anno_legends
197 | draw(hm, annotation_legend_list = anno_legends)
198 | 
199 | 
200 | cor.exp2 = data.frame(group = c(rep("core", 12), 
201 |                                 rep("edge", 12)))
202 | 
203 | 
204 | list1 = list(group = c("edge" = '#E64B35FF',
205 |                        "core" = '#4DBBD5FF'))
206 | 
207 | list2 = list(setNames(list1$group, c("edge", "core")))
208 | names(list2) = "group"
209 | 
210 | ha = HeatmapAnnotation(df = cor.exp2, col = list2)
211 | 
212 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 
213 |                        name = name, col = col_logp, cluster_rows = er_hc2, 
214 |                        row_dend_reorder = TRUE, border = TRUE, 
215 |                        row_names_gp = grid::gpar(fontsize = row_fontsize), 
216 |                        row_names_max_width = grid::unit(20, "cm"), column_names_gp = grid::gpar(fontsize = 16), 
217 |                        column_names_max_height = grid::unit(12, "cm"), 
218 |                        cluster_columns = FALSE, row_dend_width = grid::unit(2, "cm"), 
219 |                        column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(8, "cm")),
220 |                        bottom_annotation = ha)
221 | 
222 | png("IPA_Heatmap_Canonical.png", units = "in", res=300, width=18, height=14)
223 | draw(hm)
224 | dev.off()
225 | 


--------------------------------------------------------------------------------
/Figure 2/TF anaysis/pySCENIC_oscc.R:
--------------------------------------------------------------------------------
  1 | library(Matrix)
  2 | library(plyr)
  3 | library(dplyr)
  4 | library(Seurat)
  5 | library(igraph)
  6 | library(ComplexHeatmap)
  7 | library(circlize)
  8 | require(dplyr)
  9 | require(ggplot2)
 10 | library(ggpubr)
 11 | require(cowplot)
 12 | library(data.table)
 13 | library(RColorBrewer)
 14 | library(scater)
 15 | library(tidyverse)
 16 | library(AUCell)
 17 | library(SCENIC)
 18 | library(tibble)
 19 | 
 20 | custom_fill_colors <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3",
 21 |                                             "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 
 22 |                                             "sample_7" = "#F7CE5B", 
 23 |                                             "sample_8" = "#7CEA9C","sample_9" = "#7C90A0",
 24 |                                             "sample_10" = "#DCCCFF","sample_11" = "#EE6055",
 25 |                                             "sample_12" = "#C492B1")
 26 | 
 27 | #load file where TF AUC has been added as an assay to a Seurat object of "comb"
 28 | 
 29 | SCENIC <- readRDS("x_AUC.rds")
 30 | 
 31 | Idents(SCENIC) = "cluster_annotations"
 32 | DefaultAssay(SCENIC) <- "AUC"
 33 | SCENIC_proj <- subset(SCENIC, idents = c("core","edge"))
 34 | 
 35 | sub <- SCENIC_proj
 36 | sample_ids.2 <- list(unique(sub$sample_id))
 37 | 
 38 | markers_list <- list()
 39 | 
 40 | for (k in 1:length(sample_ids.2[[1]])) {
 41 |   
 42 |       markers <- subset(sub, cells = c(which(sub$sample_id == sample_ids.2[[1]][k]))) %>%
 43 |         FindMarkers(ident.1 ="core", group.by = "cluster_annotations",logfc.threshold = 0,min.pct = 0) %>%
 44 |         rownames_to_column(var = "gene")%>%
 45 |         add_column(sample_id = sample_ids.2[[1]][k])
 46 |       
 47 |       markers_list[[k]] <- markers
 48 |   
 49 |   
 50 | }
 51 | 
 52 | markers.list <- plyr::ldply(markers_list, data.frame)
 53 | #pval adj 0.05
 54 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.05,]
 55 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC", "sample_id")]
 56 | 
 57 | markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id, value.var = "avg_log2FC")
 58 | 
 59 | markers.list.mat[is.na(markers.list.mat)] = 0
 60 | markers.list.mat <- markers.list.mat %>% tibble::column_to_rownames(var = "gene")
 61 | 
 62 | 
 63 | markers.matrix = markers.list.mat
 64 | 
 65 | markers.matrix.m <- markers.matrix %>% tibble::rownames_to_column(var = "gene")
 66 | markers.matrix.m <- reshape2::melt(markers.matrix.m)
 67 | order.markers <- rownames(markers.matrix[order(rowSums(-markers.matrix)),])
 68 | markers.sum <- as.data.frame(ifelse(markers.matrix != 0, 1, 0))
 69 | markers.sum$total <- rowSums(markers.sum)
 70 | #at least 9 samples
 71 | keep <- rownames(markers.sum[markers.sum$total>9,])
 72 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% keep,]
 73 | 
 74 | #genes is only used as this code is repurposed from the degs script, these are not genes
 75 | #these are TFs
 76 | 
 77 | core_genes <- markers.matrix.m[markers.matrix.m$value > 0,]
 78 | edge_genes <- markers.matrix.m[markers.matrix.m$value < 0,]
 79 | core_genes <- unique(core_genes$gene)
 80 | edge_genes <- unique(edge_genes$gene)
 81 | 
 82 | saveRDS(core_genes,"TF_lists//core_TFS.RDS")
 83 | saveRDS(edge_genes,"TF_lists/edge_TFS.RDS")
 84 | 
 85 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% union(edge_genes,core_genes),]
 86 | 
 87 | markers_table <- reshape2::dcast(markers.matrix.m, formula = gene~variable, value.var = "value")
 88 | markers_table <- column_to_rownames(markers_table, "gene")
 89 | markers_table$sum <- rowSums(markers_table)
 90 | write.csv(markers_table, file = "markers_table.csv")
 91 | 
 92 | 
 93 | png("diff_exp.png", units="in", width=15, height=4, res=300, type = "cairo")
 94 | print(ggplot(markers.matrix.m, aes(x = factor(gene, level = order.markers),
 95 |                                  y = value, fill = variable)) +
 96 |         geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() +
 97 |         labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id") +
 98 |         ggpubr::rotate_x_text() + scale_fill_manual(values = custom_fill_colors)+ 
 99 |         theme(axis.text = element_text(face="bold"), legend.text = element_text(face="bold"), 
100 |               axis.text.y = element_text(face="bold")))
101 | dev.off()
102 | 
103 | #only plot top 30 of each for supplementary fig
104 | topweights <-aggregate(value~gene,markers.matrix.m,sum)
105 | 
106 | topweights <- topweights[order(-topweights$value),] 
107 | 
108 | markers_top_25 <- markers.matrix.m[markers.matrix.m$gene %in% c(head(topweights,25)$gene,tail(topweights,25)$gene),]
109 | 
110 | png("diff_exp_top.png", units="in", width=8, height=4, res=300, type = "cairo")
111 | print(ggplot(markers_top_25, aes(x = factor(gene, level = order.markers),
112 |                                    y = value, fill = variable)) +
113 |         geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() +
114 |         labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id") +
115 |         ggpubr::rotate_x_text() + scale_fill_manual(values = custom_fill_colors)+ 
116 |         theme(axis.text = element_text(face="bold"), legend.text = element_text(face="bold"), 
117 |               axis.text.y = element_text(face="bold")))
118 | dev.off()
119 | 
120 | setwd("..")
121 | 
122 | #reformat for interpretability
123 | markers.list <- plyr::ldply(markers_list, data.frame)
124 | 
125 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.05,]
126 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC","p_val_adj","sample_id")]
127 | 
128 | markers.list.mat$enrichment <- ifelse(markers.list.mat$gene %in% core_genes, "core",
129 |                                       "edge")
130 | 
131 | write.csv(markers.list.mat, file = "markers.list.mat.csv")
132 | 


--------------------------------------------------------------------------------
/Figure 3/Cancer Stem Cell/csc_analysis.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(Seurat)
  3 | library(ggplot2)
  4 | library(reshape2)
  5 | library(ggpubr)
  6 | 
  7 | load(file = "comb.Robj")
  8 | 
  9 | #function to summarize data
 10 | data_summary <- function(data, varname, groupnames){
 11 |   require(plyr)
 12 |   summary_func <- function(x, col){
 13 |     c(mean = mean(x[[col]], na.rm=TRUE),
 14 |       sd = sd(x[[col]], na.rm=TRUE))
 15 |   }
 16 |   data_sum<-ddply(data, groupnames, .fun=summary_func,
 17 |                   varname)
 18 |   data_sum <- rename(data_sum, c("mean" = varname))
 19 |   return(data_sum)
 20 | }
 21 | 
 22 | #funtion to plot individual data points
 23 | plot_effects <- function(data,comparision_group,comparisions,values,sample_id,
 24 |                          patient_col_dict) {
 25 |   geneset <- values
 26 |   
 27 |   Data = data[,c(sample_id,comparision_group,geneset)]
 28 |   
 29 |   plot <- Data
 30 |   
 31 |   plot_summary <- data_summary(plot, varname=geneset, 
 32 |                                groupnames=c(comparision_group, sample_id))
 33 |   
 34 |   plot_summary$geneset <- plot_summary[[geneset]]
 35 |   plot_summary$comparision_group <- plot_summary[[comparision_group]]
 36 |   
 37 |   p1 <- ggplot(data = plot_summary, aes_string(x=comparision_group, y= geneset), fill = sample_id) +
 38 |     geom_pointrange(aes(ymin=geneset-sd, 
 39 |                         ymax=geneset+sd, color = sample_id),
 40 |                     position = position_dodge(0.5), width = 0.2)+ 
 41 |     scale_color_manual(values = patient_col_dict) +
 42 |     stat_compare_means(comparisons = comparisions,label = "p.signif", paired = F)+
 43 |     theme_light() +
 44 |     theme(axis.text.x = element_text(angle = 45, hjust=1)) +
 45 |     xlab("") + ylab(paste(geneset, " scores")) +
 46 |     theme(strip.background =element_rect(fill="white"))+
 47 |     theme(strip.text = element_text(colour = 'black'), ) +
 48 |     theme(legend.position="none") 
 49 |   
 50 |   return(p1)
 51 | }
 52 | 
 53 | #adding score for each gene separately, ALDH1 has all isotypes
 54 | Seurat_obj <- comb
 55 | DefaultAssay(Seurat_obj) <- "SCT"
 56 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("ALDH1A1","ALDH1A2","ALDH1A3")),
 57 |                              name = "ALDH1")
 58 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("VIM")),
 59 |                              name = "VIM")
 60 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("EPCAM")),
 61 |                              name = "EPCAM")
 62 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CDH1")),
 63 |                              name = "CDH1")
 64 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CD24")),
 65 |                              name = "CD24")
 66 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CD44")),
 67 |                              name = "CD44")
 68 | 
 69 | #MET like CSC score -VIM + EPCAM +CDH1 +ALDH1
 70 | Seurat_obj@meta.data$MET_like_CSC <- -1*Seurat_obj@meta.data$VIM1 + Seurat_obj@meta.data$EPCAM1+ Seurat_obj@meta.data$CDH1 + Seurat_obj@meta.data$ALDH1
 71 | 
 72 | #EMT score VIM - EPCAM - CDH1 + CD44 - CD24
 73 | Seurat_obj@meta.data$EMT_like_CSC <- Seurat_obj@meta.data$VIM1 + -1*Seurat_obj@meta.data$EPCAM1+ -1*Seurat_obj@meta.data$CDH1 + Seurat_obj@meta.data$CD441 + -1*Seurat_obj@meta.data$CD241 
 74 | 
 75 | #add broad score in
 76 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("ALDH1A1","ALDH1A2","ALDH1A3","NANOG","PROM1","POU5F1","SOX2","STAT3","ATP6AP2","MSI1","CD44")),
 77 |                              name = "CSC_broad")
 78 | 
 79 | samples <- Seurat_obj@meta.data
 80 | 
 81 | patient_col_dict <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3",
 82 |                       "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 
 83 |                       "sample_7" = "#F7CE5B", 
 84 |                       "sample_8" = "#7CEA9C","sample_9" = "#7C90A0",
 85 |                       "sample_10" = "#DCCCFF","sample_11" = "#EE6055",
 86 |                       "sample_12" = "#C492B1")
 87 | 
 88 | samples$sample_id <- factor(samples$sample_id, levels = 
 89 |                            c(names(patient_col_dict)))
 90 | 
 91 | samples$cluster_annotations <- factor(samples$cluster_annotations, levels = 
 92 |                                    c("core","transitory","edge"))
 93 | 
 94 | comparision_group <- "cluster_annotations"
 95 | comparisions <- list(c("core","edge"))
 96 | sample <- "sample_id"
 97 | 
 98 | #CSC score plots for Fig 3
 99 | for (values in c("MET_like_CSC","EMT_like_CSC","CSC_broad1")) {
100 |   
101 |   png(paste(values,"stat_plot.png",sep = "_"), units="in", width=5, height=4, res=300, type = "cairo")
102 |   print(plot_effects(samples,comparision_group, comparisions,values,sample,patient_col_dict))
103 |   dev.off()
104 |   
105 | }
106 | 
107 | #nebulosa plots for Fig 3
108 | 
109 | png("CSC_broad_nebulosa.png", units="in", width=5.5, height=6, res=300, type = "cairo")
110 | print(SCpubr::do_NebulosaPlot(Seurat_obj, features = "CSC_broad1", method = "wkde"))
111 | dev.off()
112 | 
113 | png("ALDH1A1and2and3_feature.png", units="in", width=6, height=6, res=300, type = "cairo")
114 | SCpubr::do_FeaturePlot(Seurat_obj, features = c("ALDH11"), order = TRUE)
115 | dev.off()
116 | 
117 | png("ALDH1A1and2and3_density.png", units="in", width=6, height=6, res=300, type = "cairo")
118 | SCpubr::do_NebulosaPlot(Seurat_obj, features = c("ALDH11"))
119 | dev.off()
120 | 
121 | png("CD44_feature.png", units="in", width=6, height=6, res=300, type = "cairo")
122 | SCpubr::do_FeaturePlot(Seurat_obj, features = "CD44", order = TRUE)
123 | dev.off()
124 | 
125 | png("CD44_density.png", units="in", width=6, height=6, res=300, type = "cairo")
126 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD44")
127 | dev.off()
128 | 
129 | png("CD24_feature.png", units="in", width=6, height=6, res=300, type = "cairo")
130 | SCpubr::do_FeaturePlot(Seurat_obj, features = "CD24", order = TRUE)
131 | dev.off()
132 | 
133 | png("CD24_density.png", units="in", width=6, height=6, res=300, type = "cairo")
134 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD24")
135 | dev.off()
136 | 
137 | png("CD24_inverse_density.png", units="in", width=6, height=6, res=300, type = "cairo")
138 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD24", viridis_direction = -1)
139 | dev.off()
140 | 


--------------------------------------------------------------------------------
/Figure 3/Cell Neighbor Analysis/cell_neighbor_analysis.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | library(dplyr)
  3 | library(tibble)
  4 | library(cowplot)
  5 | library(patchwork)
  6 | load(file = "annotated_objects.Robj")
  7 | load(file = "comb.Robj")
  8 | 
  9 | comb_meta <- comb@meta.data
 10 | 
 11 | #transfer annotations over from combined object to individual objects with spatial positions
 12 | #in similar fashion to degs script 
 13 | 
 14 | for (object in annotated_objects) {
 15 |   meta <- object@meta.data
 16 |   
 17 |   meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names")
 18 |   
 19 |   meta_sample_id$cluster_annotations <- as.character(meta_sample_id$cluster_annotations)
 20 |   for(row in rownames(meta_sample_id)){
 21 |     if(is.na(meta_sample_id[row,]$cluster_annotations)) {
 22 |       meta_sample_id[row,]$cluster_annotations <- meta_sample_id[row,]$noncancer_celltype.x
 23 |     }
 24 |   }
 25 |   annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id
 26 | }
 27 | 
 28 | #now find neighboring cells and quanitfy neighbors for core cells and neighbors for edge cells 
 29 | 
 30 | annotations_bcs<- list()
 31 | i = 1
 32 | for (object in annotated_objects) {
 33 | 
 34 | #check if a spot is an edge spot,# if the spot is an edge spot check its neighbors,note down the barcodes
 35 | #of its neighbors, quantify the neighboring cells based on the barcode matrix
 36 | 
 37 | BC_mtx_neighbors <- object@meta.data[c("sample_id.x","cluster_annotations")]
 38 | 
 39 | 
 40 | offsets <- data.frame(x.offset=c(-2, 2, -1,  1, -1, 1),
 41 |                       y.offset=c( 0, 0, -1, -1,  1, 1))
 42 | 
 43 | spot.positions <- object@images$tumor@coordinates[,c("row","col")]
 44 | spot.positions$spot.idx <- rownames(spot.positions)
 45 | 
 46 | ## Compute coordinates of each possible spot neighbor
 47 | neighbor.positions <- merge(spot.positions, offsets)
 48 | neighbor.positions$x.pos <- neighbor.positions$col + neighbor.positions$x.offset
 49 | neighbor.positions$y.pos <- neighbor.positions$row + neighbor.positions$y.offset
 50 | 
 51 | ## Select spots that exist at neighbor coordinates
 52 | neighbors <- merge(as.data.frame(neighbor.positions), 
 53 |                    as.data.frame(spot.positions), 
 54 |                    by.x=c("x.pos", "y.pos"), by.y=c("col", "row"),
 55 |                    suffixes=c(".primary", ".neighbor"),
 56 |                    all.x=TRUE)
 57 | 
 58 | neighbors_anno <- neighbors[complete.cases(neighbors), ]
 59 | 
 60 | neighbors_anno$celltype_primary <- object@meta.data[neighbors_anno$spot.idx.primary,]$cluster_annotations
 61 | neighbors_anno$celltype_neighbor <- object@meta.data[neighbors_anno$spot.idx.neighbor,]$cluster_annotations
 62 | annotations_bcs[[i]] <- neighbors_anno
 63 | #based on neighbor graph,identify neighboring celltypes for each cell state
 64 | object@meta.data$neighbor_clust_anno <- "none"
 65 | object@meta.data$neighbor_bcs <- "none"
 66 | 
 67 | for(bc in rownames(object@meta.data)) {
 68 |   nbs <- neighbors[neighbors$spot.idx.primary == bc,]
 69 |   nb_bc <- na.omit(nbs$spot.idx.neighbor)
 70 |   celltypes <- object@meta.data[nb_bc,]$cluster_annotations
 71 |   
 72 |   object@meta.data[bc,]$neighbor_clust_anno <- paste(celltypes, collapse = ",")
 73 |   object@meta.data[bc,]$neighbor_bcs <- paste(nb_bc, collapse = ",")
 74 | }
 75 | annotated_objects[[i]] <- object
 76 | i =i+1
 77 | 
 78 | }
 79 | 
 80 | nb_list <- list()
 81 | for (object in annotated_objects) {
 82 |   # Convert to a data frame
 83 |   ct_nbs_df <- object@meta.data[,c("cluster_annotations","neighbor_clust_anno", "neighbor_bcs")]
 84 |   
 85 |   all_ct_nbs <- strsplit(ct_nbs_df$neighbor_clust_anno, ",")
 86 |   all_ct_nbs_barcodes <- strsplit(ct_nbs_df$neighbor_bcs, ",")
 87 |   
 88 |   neigh_df <- data.frame(primary = rep(ct_nbs_df$cluster_annotations, sapply(all_ct_nbs, length)),
 89 |                          neighbor = unlist(all_ct_nbs), neighbor_barcodes = unlist(all_ct_nbs_barcodes))
 90 |   
 91 |   cne_neighbors <- neigh_df[neigh_df$primary %in% c("core","edge"),]
 92 |   `%!in%` = Negate(`%in%`)
 93 |   cne_neighbors <- as.data.frame(cne_neighbors[cne_neighbors$neighbor %!in% c("core","edge","transitory"),])
 94 |   
 95 |   cne_neighbors <- unique(subset(cne_neighbors, !duplicated(cne_neighbors)))
 96 |   
 97 |   cne_nbs <- as.data.frame(table(cne_neighbors$primary, cne_neighbors$neighbor))
 98 |   
 99 |   colnames(cne_nbs) <- c("primary","neighbor","#neighbors")
100 |   `%!in%` = Negate(`%in%`)
101 |   
102 |   cne_nbs$sample <- unique(object$sample_id.x)
103 |   
104 |   nb_list <- append(nb_list, list(cne_nbs), 0)
105 | }
106 | 
107 | allc2c_net <- plyr::ldply(nb_list)
108 | 
109 | library(rstatix)
110 | 
111 | signif_abbreviation <- function(p_values) {
112 |   sapply(p_values, function(x) {
113 |     if (is.na(x)) {
114 |       return(NA)
115 |     } else if (x < 0.0001) {
116 |       return("****")
117 |     } else if (x < 0.001) {
118 |       return("***")
119 |     } else if (x < 0.01) {
120 |       return("**")
121 |     } else if (x < 0.05) {
122 |       return("*")
123 |     } else if (x > 0.05) {
124 |       return("NS")
125 |     } else {
126 |       return("NS")
127 |     }
128 |   })
129 | }
130 | 
131 | allc2c_net_sig <- allc2c_net[allc2c_net$neighbor %in% c("ecm.myCAF","macrophage","cytotoxic.CD8..T.","Intermediate.fibroblast"),]
132 | pairwise_comparisons <- allc2c_net_sig %>%
133 |   group_by(neighbor) %>%
134 |   filter(n_distinct(primary) > 1) %>%# skip groups with only one observation
135 |   wilcox_test(`#neighbors` ~ primary, p.adjust.method = "bonferroni")
136 | 
137 | #adjust p value
138 | pairwise_comparisons$p.adj <- p.adjust(pairwise_comparisons$p, method = "BH")
139 | pairwise_comparisons <- pairwise_comparisons %>%
140 |   mutate(p.adj.signif = as.character(signif_abbreviation(p.adj)))
141 | 
142 | allc2c_net <- left_join(allc2c_net, pairwise_comparisons, by = "neighbor")
143 | 
144 | summary_data <-allc2c_net[allc2c_net$primary %in% c("core"),]
145 | 
146 | png("signaling_cellneighbor_sig_adj.png", units="in", width=12, height=5, res=300, type = "cairo")
147 | ggplot(allc2c_net, aes(x = primary, y = `#neighbors`, color = factor(primary), fill = primary)) +
148 |   geom_boxplot(outlier.shape = NA, aes(fill = factor(primary)), alpha = 0.5) +
149 |   scale_fill_manual(values = c('edge' = '#E64B35FF',
150 |                                'core' = '#4DBBD5FF')) +
151 |   facet_wrap(~neighbor, scales = "free", nrow = 1) +
152 |   geom_point(alpha = 1, aes(color = factor(sample), group = primary, fill = primary)) +
153 |   scale_color_manual(values = c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3",
154 |                                 "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1",
155 |                                 "sample_7" = "#F7CE5B",
156 |                                 "sample_8" = "#7CEA9C","sample_9" = "#7C90A0",
157 |                                 "sample_10" = "#DCCCFF","sample_11" = "#EE6055",
158 |                                 "sample_12" = "#C492B1")) +
159 |   ggtitle("Number of Neighboring Spots") +
160 |   xlab("Neighbor") +
161 |   ylab("Number of Neighbors") +
162 |   theme_bw() +
163 |   theme(legend.position = "bottom") +
164 |   guides(color = guide_legend(title = "Sample")) +
165 |   geom_text(data = summary_data, aes(x = primary, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") +
166 |   theme(axis.text.x = element_text(angle = 45, hjust = 1),   # Rotate and adjust axis labels
167 |         plot.margin = unit(c(1, 5, 1, 1), "lines"),
168 |         axis.title = element_text(face = "bold"),
169 |         axis.title.y = element_blank(),
170 |         axis.text = element_text(size = 10, face = "bold"),
171 |         axis.ticks.y = element_blank())
172 | dev.off()
173 | 


--------------------------------------------------------------------------------
/Figure 3/Cell Signaling/cellchat.R:
--------------------------------------------------------------------------------
  1 | library(reshape2)
  2 | library(patchwork)
  3 | library(ggplot2)
  4 | library(gprofiler2)
  5 | library(dplyr)
  6 | library(Seurat)
  7 | library(RColorBrewer)
  8 | library(sctransform)
  9 | library(CellChat)
 10 | library(patchwork)
 11 | 
 12 | CellChatDB <- CellChatDB.human
 13 | showDatabaseCategory(CellChatDB)
 14 | CellChatDB.use <- CellChatDB
 15 | 
 16 | library(Seurat)
 17 | library(dplyr)
 18 | library(tibble)
 19 | library(cowplot)
 20 | library(patchwork)
 21 | load(file = "annotated_objects.Robj")
 22 | load(file = "comb.Robj")
 23 | 
 24 | comb_meta <- comb@meta.data
 25 | 
 26 | for (object in annotated_objects) {
 27 |   meta <- object@meta.data
 28 |   
 29 |   meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names")
 30 |   
 31 |   meta_sample_id$cluster_annotations <- as.character(meta_sample_id$cluster_annotations)
 32 |   for(row in rownames(meta_sample_id)){
 33 |     if(is.na(meta_sample_id[row,]$cluster_annotations)) {
 34 |       meta_sample_id[row,]$cluster_annotations <- meta_sample_id[row,]$noncancer_celltype.x
 35 |     }
 36 |   }
 37 |   annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id
 38 | }
 39 | 
 40 | merged_obj <- purrr::reduce(annotated_objects,merge)
 41 | 
 42 | Idents(merged_obj) <- "cluster_annotations"
 43 | 
 44 | #Analysis only with core, edge, and ecm myCAFs
 45 | 
 46 | merged_obj <- subset(merged_obj, idents = c("core","edge","ecm.myCAF"))
 47 | merged_obj@images <- list()
 48 | 
 49 | sp_cc_full <- createCellChat(merged_obj)
 50 | sp_cc_full@DB <- CellChatDB.use
 51 | sp_cc_full <- subsetData(sp_cc_full)
 52 | sp_cc_full <- identifyOverExpressedGenes(sp_cc_full)
 53 | sp_cc_full <- identifyOverExpressedInteractions(sp_cc_full)
 54 | 
 55 | sp_cc_full <- computeCommunProb(sp_cc_full)
 56 | sp_cc_full <- filterCommunication(sp_cc_full, min.cells = 10)
 57 | sp_cc_full <- computeCommunProbPathway(sp_cc_full)
 58 | sp_cc_full <- aggregateNet(sp_cc_full)
 59 | cellchat <- sp_cc_full
 60 | cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP")
 61 | 
 62 | 
 63 | count_interactions <- as.data.frame(cellchat@net[["count"]])
 64 | count_interactions <- melt(count_interactions)
 65 | library(dplyr)
 66 | count_interactions <- count_interactions[c(1,5,9),]
 67 | 
 68 | gg1 <- ggplot(count_interactions, aes(x=variable, y=value, fill=variable))+
 69 |   geom_bar(stat="identity", color="black")+
 70 |   scale_fill_manual(values=c("core" = "#4DBBD5FF", 
 71 |                              "edge" = "#E64B35FF",
 72 |                              "ecm.myCAF" = "#3f908d"))+
 73 |   labs(y = "Number of intracellular interactions") +
 74 |   theme_minimal()
 75 | 
 76 | color_map = c("core" = "#4DBBD5FF", 
 77 |               "edge" = "#E64B35FF",
 78 |               "ecm.myCAF" = "#3f908d")
 79 | 
 80 | #intercellular interaction strength
 81 | 
 82 | weight <- as.data.frame(cellchat@net[["weight"]])
 83 | 
 84 | weight_interactions <- melt(weight)
 85 | library(dplyr)
 86 | weight_interactions <- weight_interactions[c(1,5,9),]
 87 | 
 88 | gg2 <- ggplot(weight_interactions, aes(x=variable, y=value, fill=variable))+
 89 |   geom_bar(stat="identity", color="black")+
 90 |   scale_fill_manual(values=c("core" = "#4DBBD5FF", 
 91 |                              "edge" = "#E64B35FF",
 92 |                              "ecm.myCAF" = "#3f908d"))+
 93 |   labs(y = "Intracellular interaction strength") + 
 94 |   theme_minimal()
 95 | 
 96 | png("intracellular_weighted_caf.png", units="in", width=10, height=5, res=300, type = "cairo")
 97 | gg1 + gg2
 98 | dev.off()
 99 | 
100 | #count of interactions
101 | 
102 | count_interactions <- as.data.frame(cellchat@net[["count"]])
103 | count_interactions <- melt(count_interactions)
104 | library(dplyr)
105 | count_interactions <- count_interactions[c(4,5,9),]
106 | count_interactions[1,] <- c('ecm.myCAF',count_interactions[1,]$value)
107 | 
108 | gg1 <- ggplot(count_interactions, aes(x=variable, y=value, fill=variable))+
109 |   geom_bar(stat="identity", color="black")+
110 |   scale_fill_manual(values=c("core" = "#4DBBD5FF", 
111 |                              "edge" = "#E64B35FF",
112 |                              "ecm.myCAF" = "#3f908d"))+
113 |   labs(y = "cell-cancer interactions") +
114 |   theme_minimal()
115 | 
116 | #
117 | weight <- as.data.frame(cellchat@net[["weight"]])
118 | 
119 | weight_interactions <- melt(weight)
120 | library(dplyr)
121 | weight_interactions <- weight_interactions[c(4,5,9),]
122 | weight_interactions[1,] <- c('ecm.myCAF',weight_interactions[1,]$value)
123 | 
124 | gg2 <- ggplot(weight_interactions, aes(x=variable, y=value, fill=variable))+
125 |   geom_bar(stat="identity", color="black")+
126 |   scale_fill_manual(values=c("core" = "#4DBBD5FF", 
127 |                              "edge" = "#E64B35FF",
128 |                              "ecm.myCAF" = "#3f908d"))+
129 |   labs(y = "cell-cancer interaction strength") + 
130 |   theme_minimal()
131 | 
132 | png("cell_to_cancer.png", units="in", width=10, height=5, res=300, type = "cairo")
133 | gg1 + gg2
134 | dev.off()
135 | 
136 | color_map = c("core" = "#4DBBD5FF", 
137 |               "edge" = "#E64B35FF",
138 |               "ecm.myCAF" = "#3f908d")
139 | 
140 | #cell signaling from edge cells to edge cells
141 | png("e2e.png", units="in", width=20, height=10, res=300, type = "cairo")
142 | netVisual_chord_gene(cellchat, sources.use = c(2), targets.use = c(2),legend.pos.x = 8,
143 |                      color.use = c("edge" = "#E64B35FF"), lab.cex = 1.2)
144 | dev.off()
145 | 
146 | #cell signaling from core cells to core cells
147 | png("c2c.png", units="in", width=20, height=10, res=300, type = "cairo")
148 | netVisual_chord_gene(cellchat, sources.use = c(3), targets.use = c(3),legend.pos.x = 8, lab.cex = 1.2,
149 |                      color.use = c("core" = "#4DBBD5FF"))
150 | dev.off()
151 | 
152 | #cell signaling between CAF and edge cells
153 | png("caf_and_edge.png", units="in", width=20, height=10, res=300, type = "cairo")
154 | netVisual_chord_gene(cellchat, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8, lab.cex = 1.2,
155 |                      color.use = c("edge" = "#E64B35FF",
156 |                                    "ecm.myCAF" = "#3f908d"))
157 | dev.off()
158 | 
159 | #subset for only top ligand recceptor pairs 
160 | object <- cellchat
161 | prob <- slot(object, "net")$prob
162 | pval <- slot(object, "net")$pval
163 | prob[pval > 0.05] <- 0
164 | net <- reshape2::melt(prob, value.name = "prob")
165 | colnames(net)[1:3] <- c("source", "target", "interaction_name")
166 | pairLR = dplyr::select(object@LR$LRsig, c("interaction_name_2", 
167 |                                           "pathway_name", "ligand", "receptor", "annotation", 
168 |                                           "evidence"))
169 | idx <- match(net$interaction_name, rownames(pairLR))
170 | temp <- pairLR[idx, ]
171 | net <- cbind(net, temp)
172 | 
173 | net <- net[net$prob > 1.0e-03,]
174 | 
175 | # #cell signaling between CAF and edge cells with a more stringent cutoff
176 | # png("caf_and_edge_sub.png", units="in", width=20, height=10, res=300, type = "cairo")
177 | # netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8,
178 | #                      color.use = c("edge" = "#E64B35FF",
179 | #                                    "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2)
180 | # dev.off()
181 | 
182 | 
183 | # png("e2e_sub.png", units="in", width=20, height=10, res=300, type = "cairo")
184 | # netVisual_chord_gene(object, net = net, sources.use = c(2), targets.use = c(2),legend.pos.x = 8,
185 | #                      color.use = c("edge" = "#E64B35FF"), thresh = 0.001,lab.cex = 1.2)
186 | # dev.off()
187 | 
188 | #cell signaling between core and core cells with a more stringent cutoff of 0.001
189 | png("c2c_0.001.png", units="in", width=20, height=10, res=300, type = "cairo")
190 | netVisual_chord_gene(object, net = net, sources.use = c(3), targets.use = c(3),legend.pos.x = 8,
191 |                      color.use = c("core" = "#4DBBD5FF"), thresh = 0.001,lab.cex = 1.2)
192 | dev.off()
193 | 
194 | net <- net[net$prob > 5.0e-03,]
195 | #cell signaling between edge and edge cells with a more stringent cutoff of 0.005
196 | png("e2e_0.005.png", units="in", width=20, height=10, res=300, type = "cairo")
197 | netVisual_chord_gene(object, net = net, sources.use = c(2), targets.use = c(2),legend.pos.x = 8,
198 |                      color.use = c("edge" = "#E64B35FF"), thresh = 0.001,lab.cex = 1.2)
199 | dev.off()
200 | 
201 | # net <- net[net$prob > 1e-02,]
202 | # 
203 | # png("caf_and_edge_0.01.png", units="in", width=20, height=10, res=300, type = "cairo")
204 | # netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8,
205 | #                      color.use = c("edge" = "#E64B35FF",
206 | #                                    "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2)
207 | # dev.off()
208 | 
209 | net <- net[net$prob > 5e-02,]
210 | #cell signaling between edge and edge cells with a more stringent cutoff of 0.05
211 | png("caf_and_edge_0.05.png", units="in", width=20, height=10, res=300, type = "cairo")
212 | netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8,
213 |                      color.use = c("edge" = "#E64B35FF",
214 |                                    "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2)
215 | dev.off()
216 | 
217 | # extract signaling info
218 | object <- cellchat
219 | prob <- slot(object, "net")$prob
220 | pval <- slot(object, "net")$pval
221 | prob[pval > 0.05] <- 0
222 | net <- reshape2::melt(prob, value.name = "prob")
223 | colnames(net)[1:3] <- c("source", "target", "interaction_name")
224 | pairLR = dplyr::select(object@LR$LRsig, c("interaction_name_2", 
225 |                                           "pathway_name", "ligand", "receptor", "annotation", 
226 |                                           "evidence"))
227 | idx <- match(net$interaction_name, rownames(pairLR))
228 | temp <- pairLR[idx, ]
229 | net <- cbind(net, temp)
230 | 
231 | write.csv(net, file ="cellchat_signaling.csv")
232 | 
233 | #analyze only core and edge cells
234 | 
235 | merged_obj <- subset(merged_obj, idents = c("core", 
236 |                                             "edge"))
237 | 
238 | sp_cc_full <- createCellChat(merged_obj)
239 | sp_cc_full@DB <- CellChatDB.use
240 | sp_cc_full <- subsetData(sp_cc_full)
241 | sp_cc_full <- identifyOverExpressedGenes(sp_cc_full)
242 | sp_cc_full <- identifyOverExpressedInteractions(sp_cc_full)
243 | 
244 | sp_cc_full <- computeCommunProb(sp_cc_full)
245 | sp_cc_full <- filterCommunication(sp_cc_full, min.cells = 10)
246 | sp_cc_full <- computeCommunProbPathway(sp_cc_full)
247 | sp_cc_full <- aggregateNet(sp_cc_full)
248 | cellchat <- sp_cc_full
249 | cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP")
250 | 
251 | color_map = c( 
252 |   "edge" = "#E64B35FF", "core" = "#4DBBD5FF")
253 | 
254 | png("signaling_pathways_onlycancercell.png", units="in", width=15, height=7, res=300, type = "cairo")
255 | ht1 <- netAnalysis_signalingRole_heatmap(cellchat, pattern = "outgoing", color.use = color_map,
256 |                                          width = 3)
257 | ht2 <- netAnalysis_signalingRole_heatmap(cellchat, pattern = "incoming", color.use = color_map,
258 |                                          width = 3)
259 | ht1 + ht2
260 | dev.off()
261 | 
262 | #analyze other celltypes and how they interact with core/edge cancer cells
263 | 
264 | rm(list=setdiff(ls(), "annotated_objects"))
265 | CellChatDB <- CellChatDB.human
266 | CellChatDB.use <- CellChatDB
267 | 
268 | merged_obj <- purrr::reduce(annotated_objects,merge)
269 | merged_obj@images <- list()
270 | Idents(merged_obj) <- "cluster_annotations"
271 | 
272 | #cellchat  with core cancer cells and macrophages
273 | core_obj <- subset(merged_obj, idents = c("core","macrophage"), invert = FALSE)
274 | 
275 | core_cc <- createCellChat(core_obj)
276 | core_cc@DB <- CellChatDB.use
277 | core_cc <- subsetData(core_cc)
278 | core_cc <- identifyOverExpressedGenes(core_cc)
279 | core_cc <- identifyOverExpressedInteractions(core_cc)
280 | 
281 | core_cc <- computeCommunProb(core_cc)
282 | core_cc <- filterCommunication(core_cc, min.cells = 10)
283 | core_cc <- computeCommunProbPathway(core_cc)
284 | core_cc <- aggregateNet(core_cc)
285 | core_cc <- netAnalysis_computeCentrality(core_cc, slot.name = "netP")
286 | 
287 | #make a plot for core signaling to macrophages, in prominent core pathways
288 | netVisual_individual(core_cc, signaling = c(core_cc@netP$pathways), 
289 |                      pairLR.use = c(2), layout = "chord")
290 | 
291 | png("other_celltypes/core_mac_demosome.png", units="in", width=6, height=5, res=300, type = "cairo")
292 | netVisual_aggregate(core_cc, signaling = c("DESMOSOME"), layout = "chord",color.use = c("core" = "#4DBBD5FF",
293 |                                                                                         "macrophage" = "#cfa42d"))
294 | dev.off()
295 | 
296 | png("other_celltypes/core_mac_cdh.png", units="in", width=6, height=5, res=300, type = "cairo")
297 | netVisual_aggregate(core_cc, signaling = c("CDH"), layout = "chord",color.use = c("core" = "#4DBBD5FF",
298 |                                                                                   "macrophage" = "#cfa42d"))
299 | dev.off()
300 | 
301 | 
302 | #cellchat  with edge cancer cells, macropahges, ecmmyCAFs and cytotoxic CD8T cells
303 | edge_obj <- subset(merged_obj, idents = c("edge", "macrophage","cytotoxic.CD8..T.",
304 |                                           "ecm.myCAF"))
305 | edge_cc <- createCellChat(edge_obj)
306 | edge_cc@DB <- CellChatDB.use
307 | edge_cc <- subsetData(edge_cc)
308 | edge_cc <- identifyOverExpressedGenes(edge_cc)
309 | edge_cc <- identifyOverExpressedInteractions(edge_cc)
310 | 
311 | edge_cc <- computeCommunProb(edge_cc)
312 | edge_cc <- filterCommunication(edge_cc, min.cells = 10)
313 | edge_cc <- computeCommunProbPathway(edge_cc)
314 | edge_cc <- aggregateNet(edge_cc)
315 | edge_cc <- netAnalysis_computeCentrality(edge_cc, slot.name = "netP")
316 | 
317 | #make a plot for edge signaling to macrophages, CD8 T cells, and ECM myCAFs, in prominent edge pathways
318 | png("other_celltypes/edge_APP.png", units="in", width=10, height=10, res=300, type = "cairo")
319 | netVisual_aggregate(edge_cc, signaling = c("APP"), layout = "chord",color.use = c("edge" = "#E64B35FF",
320 |                                                                                         "macrophage" = "#cfa42d",
321 |                                                                                        "cytotoxic.CD8..T." = "#79577c",
322 |                                                                                        "ecm.myCAF" = "#3f908d"))
323 | dev.off()
324 | 
325 | png("other_celltypes/edge_LAMININ.png", units="in", width=10, height=10, res=300, type = "cairo")
326 | netVisual_aggregate(edge_cc, signaling = c("LAMININ"), layout = "chord",color.use = c("edge" = "#E64B35FF",
327 |                                                                                   "macrophage" = "#cfa42d",
328 |                                                                                   "cytotoxic.CD8..T." = "#79577c",
329 |                                                                                   "ecm.myCAF" = "#3f908d"))
330 | dev.off()
331 | 
332 | png("other_celltypes/edge_COLLAGEN.png", units="in", width=10, height=10, res=300, type = "cairo")
333 | netVisual_aggregate(edge_cc, signaling = c("COLLAGEN"), layout = "chord",color.use = c("edge" = "#E64B35FF",
334 |                                                                                       "macrophage" = "#cfa42d",
335 |                                                                                       "cytotoxic.CD8..T." = "#79577c",
336 |                                                                                       "ecm.myCAF" = "#3f908d"))
337 | dev.off()
338 | 


--------------------------------------------------------------------------------
/Figure 4/Classifier Training/script.R:
--------------------------------------------------------------------------------
  1 | library("hardhat")
  2 | library("scPred")
  3 | library("Seurat")
  4 | library("magrittr")
  5 | library(tibble)
  6 | 
  7 | load("annotated_objects.Robj")
  8 | 
  9 | #reduce all objects
 10 | SCC_all_samples <- purrr::reduce(annotated_objects,merge)
 11 | 
 12 | #load in cancer cell annotations
 13 | load(file = "comb.Robj")
 14 | comb@meta.data$cluster_annotations = comb@active.ident
 15 | cancer_cell_meta <-comb@meta.data[,colnames(comb@meta.data) %in% c("new_annotations_trial","cluster_annotations")]
 16 | 
 17 | #transfer cancer cell state information, core, transitory, and edge
 18 | full_meta <- list(SCC_all_samples@meta.data, cancer_cell_meta) %>% 
 19 |   purrr::map(~ .x %>% 
 20 |         as.data.frame %>%
 21 |         rownames_to_column('rn')) %>% 
 22 |         purrr::reduce(left_join, by = 'rn') %>%
 23 |         column_to_rownames('rn')
 24 | 
 25 | full_meta$cluster_annotations <- as.character(full_meta$cluster_annotations)
 26 | #label noncancer cells
 27 | full_meta$cluster_annotations[is.na(full_meta$cluster_annotations)] <- "noncancer"
 28 | 
 29 | full_meta$cluster_annotations <- factor(full_meta$cluster_annotations,levels = unique(full_meta$cluster_annotations))
 30 | 
 31 | #transfer annotations to object
 32 | SCC_all_samples@meta.data <- full_meta
 33 | 
 34 | #generate reference
 35 | reference <- SCC_all_samples
 36 | DefaultAssay(reference) <- "Spatial"
 37 | reference <- reference %>%
 38 |   Seurat::NormalizeData(verbose = FALSE) %>%
 39 |   FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 
 40 |   ScaleData(verbose = FALSE) %>% 
 41 |   RunHarmony("sample_id")  %>% 
 42 |   RunPCA(pc.genes = reference@var.genes, npcs = 20, verbose = FALSE) %>% 
 43 |   RunUMAP(reduction = "harmony", dims = 1:20) %>% 
 44 |   FindNeighbors(reduction = "harmony", dims = 1:20) %>% 
 45 |   FindClusters(resolution = 0.5) %>% 
 46 |   identity()
 47 | 
 48 | reference <- getFeatureSpace(reference, "cluster_annotations")
 49 | rm(list=setdiff(ls(), "reference"))
 50 | #train model on avNNet, svmRadial, and nb
 51 | reference_avNNet <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "avNNet",number = 10)
 52 | reference_svmRadial <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "svmRadial",number = 10)
 53 | reference_nb <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "nb",number = 10)
 54 | 
 55 | save(reference_avNNet, file = "reference_avNNet.Robj")
 56 | save(reference_svmRadial, file = "reference_svmRadial.Robj")
 57 | save(reference_nb, file = "reference_nb.Robj")
 58 | load(file = "reference_avNNet.Robj")
 59 | load(file = "reference_svmRadial.Robj")
 60 | load(file = "reference_nb.Robj")
 61 | 
 62 | get_scpred(reference_avNNet)
 63 | get_scpred(reference_svmRadial)
 64 | get_scpred(reference_nb)
 65 | 
 66 | #based on the best models for each subtype, we should do:
 67 | #core: svm
 68 | #edge: svm
 69 | #transitory: avNNet
 70 | #noncancer: svm
 71 | 
 72 | #reclassify svmRadial with avNNet for transitory 
 73 | reference <- trainModel(reference_svmRadial,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "avNNet",reclassify = c("transitory"),number = 10)
 74 | save(reference, file = "reference.Robj")
 75 | rm(list=setdiff(ls(), "reference"))
 76 | ### On server
 77 | 
 78 | library(Seurat)
 79 | library("scPred")
 80 | library("magrittr")
 81 | load(file = "reference.Robj")
 82 | 
 83 | get_scpred(reference)
 84 | model <- get_scpred(reference)
 85 | save(model,file = "model.Robj")
 86 | #plot model statistics
 87 | png(file = "plot_probabilities_reference_1.png", 
 88 |      width = 6, height = 6, units = "in", res = 300,type = "cairo")
 89 | plot_probabilities(reference)
 90 | dev.off()
 91 | 
 92 | #load all cancers and predict cancer cell states on them
 93 | 
 94 | ## CHC
 95 | load(file = "objects/CHC.Robj")
 96 | query = CHC
 97 | query <- NormalizeData(query)
 98 | query <- scPredict(query, reference, threshold = 0.2)
 99 | save(query, file = "processed_data/CHC.Robj")
100 | 
101 | ##CHC1_T 
102 | load(file = "objects/CHC1_T.Robj")
103 | query = CHC1_T
104 | query <- NormalizeData(query)
105 | query <- scPredict(query, reference, threshold = 0.2)
106 | save(query, file = "processed_data/CHC1_T.Robj")
107 | 
108 | 
109 | #crc 
110 | load(file = "objects/crc.Robj")
111 | query = crc
112 | query <- NormalizeData(query)
113 | query <- scPredict(query, reference, threshold = 0.2)
114 | save(query, file = "processed_data/crc.Robj")
115 | 
116 | #gbm
117 | load(file = "objects/gbm.Robj")
118 | query = gbm
119 | query <- NormalizeData(query)
120 | query <- scPredict(query, reference, threshold = 0.2)
121 | save(query, file = "processed_data/gbm.Robj")
122 | 
123 | 
124 | #HBC_IDC
125 | load(file = "objects/HBC_IDC.Robj")
126 | query = HBC_IDC
127 | query <- NormalizeData(query)
128 | query <- scPredict(query, reference, threshold = 0.2)
129 | save(query, file = "processed_data/HBC_IDC.Robj")
130 | 
131 | #HBC_ILC
132 | load(file = "objects/HBC_ILC.Robj")
133 | query = HBC_ILC
134 | query <- NormalizeData(query)
135 | query <- scPredict(query, reference, threshold = 0.2)
136 | save(query, file = "processed_data/HBC_ILC.Robj")
137 | 
138 | 
139 | #HCC1_T
140 | load(file = "objects/HCC1_T.Robj")
141 | query = HCC1_T
142 | query <- NormalizeData(query)
143 | query <- scPredict(query, reference, threshold = 0.2)
144 | save(query, file = "processed_data/HCC1_T.Robj")
145 | 
146 | #HCC1
147 | load(file = "objects/HCC1.Robj")
148 | query = HCC1
149 | query <- NormalizeData(query)
150 | query <- scPredict(query, reference, threshold = 0.2)
151 | save(query, file = "processed_data/HCC1.Robj")
152 | 
153 | #HCC2
154 | load(file = "objects/HCC2_T.Robj")
155 | query = HCC2_T
156 | query <- NormalizeData(query)
157 | query <- scPredict(query, reference, threshold = 0.2)
158 | save(query, file = "processed_data/HCC2_T.Robj")
159 | 
160 | #ICC
161 | load(file = "objects/ICC.Robj")
162 | query = ICC
163 | query <- NormalizeData(query)
164 | query <- scPredict(query, reference, threshold = 0.2)
165 | save(query, file = "processed_data/ICC.Robj")
166 | 
167 | #OV
168 | load(file = "objects/OV.Robj")
169 | query = OV
170 | query <- NormalizeData(query)
171 | query <- scPredict(query, reference, threshold = 0.2)
172 | save(query, file = "processed_data/OV.Robj")
173 | 
174 | #p4_scc
175 | load(file = "objects/p4_scc.Robj")
176 | query = p4_scc
177 | query <- NormalizeData(query)
178 | query <- scPredict(query, reference, threshold = 0.2)
179 | save(query, file = "processed_data/p4_cscc.Robj")
180 | 
181 | #p6_scc
182 | load(file = "objects/p6_scc.Robj")
183 | query = p6_scc
184 | query <- NormalizeData(query)
185 | query <- scPredict(query, reference, threshold = 0.2)
186 | save(query, file = "processed_data/p6_cscc.Robj")
187 | 
188 | rm(list=setdiff(ls(), "reference"))
189 | 
190 | load(file = "objects/PDAC_A.Robj")
191 | query = PDAC_A
192 | query <- NormalizeData(query)
193 | query <- scPredict(query, reference, threshold = 0.2)
194 | save(query, file = "processed_data/PDAC_A.Robj")
195 | 
196 | load(file = "objects/PDAC_B.Robj")
197 | query = PDAC_B
198 | query <- NormalizeData(query)
199 | query <- scPredict(query, reference, threshold = 0.2)
200 | save(query, file = "processed_data/PDAC_B.Robj")
201 | 
202 | load(file = "objects/PDAC_C.Robj")
203 | query = PDAC_C
204 | query <- NormalizeData(query)
205 | query <- scPredict(query, reference, threshold = 0.2)
206 | save(query, file = "processed_data/PDAC_C.Robj")
207 | 
208 | load(file = "objects/cscc1.Robj")
209 | query = cscc1
210 | query <- NormalizeData(query)
211 | query <- scPredict(query, reference, threshold = 0.2)
212 | save(query, file = "processed_data/cscc1.Robj")
213 | 
214 | load(file = "objects/cscc2.Robj")
215 | query = cscc2
216 | query <- NormalizeData(query)
217 | query <- scPredict(query, reference, threshold = 0.2)
218 | save(query, file = "processed_data/cscc2.Robj")
219 | 
220 | load(file = "objects/cscc3.Robj")
221 | query = cscc3
222 | query <- NormalizeData(query)
223 | query <- scPredict(query, reference, threshold = 0.2)
224 | save(query, file = "processed_data/cscc3.Robj")
225 | 
226 | load(file = "objects/cscc4.Robj")
227 | query = cscc4
228 | query <- NormalizeData(query)
229 | query <- scPredict(query, reference, threshold = 0.2)
230 | save(query, file = "processed_data/cscc4.Robj")
231 | 
232 | rm(list=setdiff(ls(), "reference"))
233 | 
234 | load(file = "objects/cervical_scc.Robj")
235 | query = cervical_scc
236 | query <- NormalizeData(query)
237 | query <- scPredict(query, reference, threshold = 0.2)
238 | save(query, file = "processed_data/cervical_scc.Robj")
239 | 
240 | load(file = "objects/intestinal.Robj")
241 | query = intestinal
242 | query <- NormalizeData(query)
243 | query <- scPredict(query, reference, threshold = 0.2)
244 | save(query, file = "processed_data/intestinals.Robj")
245 | 
246 | load(file = "objects/prostate.Robj")
247 | query = prostate
248 | query <- NormalizeData(query)
249 | query <- scPredict(query, reference, threshold = 0.2)
250 | save(query, file = "processed_data/prostate.Robj")
251 | 
252 | load(file = "objects/prostate_acinar.Robj")
253 | query = prostate_acinar
254 | query <- NormalizeData(query)
255 | query <- scPredict(query, reference, threshold = 0.2)
256 | save(query, file = "processed_data/prostate_acinar.Robj")
257 | 
258 | load(file = "objects/lung_scc.Robj")
259 | query = lung_scc
260 | query <- NormalizeData(query)
261 | query <- scPredict(query, reference, threshold = 0.2)
262 | save(query, file = "processed_data/lung_scc.Robj")
263 | 
264 | load(file = "objects/melanoma.Robj")
265 | query = melanoma
266 | query <- NormalizeData(query)
267 | query <- scPredict(query, reference, threshold = 0.2)
268 | save(query, file = "processed_data/melanoma.Robj")
269 | 
270 | rm(list=setdiff(ls(), "reference"))
271 | 
272 | load(file = "objects/s1_paediatric_medulloblastoma.Robj")
273 | query = s1_paediatric_medulloblastoma
274 | query <- NormalizeData(query)
275 | query <- scPredict(query, reference, threshold = 0.2)
276 | save(query, file = "processed_data/s1_paediatric_medulloblastoma.Robj")
277 | 
278 | load(file = "objects/s2_paediatric_medulloblastoma.Robj")
279 | query = s2_paediatric_medulloblastoma
280 | query <- NormalizeData(query)
281 | query <- scPredict(query, reference, threshold = 0.2)
282 | save(query, file = "processed_data/s2_paediatric_medulloblastoma.Robj")
283 | 
284 | load(file = "objects/s3_paediatric_CNS_embryonal.Robj")
285 | query = s3_paediatric_CNS_embryonal
286 | query <- NormalizeData(query)
287 | query <- scPredict(query, reference, threshold = 0.2)
288 | save(query, file = "processed_data/s3_paediatric_CNS_embryonal.Robj")
289 | 
290 | 
291 | 


--------------------------------------------------------------------------------
/Figure 4/Plotting/analysis.R:
--------------------------------------------------------------------------------
  1 | library(Seurat)
  2 | nm <- list.files(path="processed_data/")
  3 | 
  4 | df_full <- data.frame()
  5 | 
  6 | #create a full 3x13 figure
  7 | 
  8 | for (file in nm) { 
  9 |   load(paste("processed_data/",file, sep = ""))
 10 |   df <- query@meta.data
 11 |   df <- data.frame(df$scpred_prediction,stringr::str_split(file,".R")[[1]][1])
 12 |   df_full <- rbind(df_full, df)
 13 | }
 14 | 
 15 | names(df_full) = c("core_edge","dataset")
 16 | library(ggplot2)
 17 | 
 18 | library(tidyr)
 19 | library(dplyr)
 20 | df_full$core_edge <- factor(df_full$core_edge, levels = c("core","transitory","edge","noncancer"))
 21 | 
 22 | library(tidyr)
 23 | library(dplyr)
 24 | df_count <- df_full%>%
 25 |   group_by( dataset,core_edge) %>%
 26 |   summarize(count = n()) 
 27 | 
 28 | png("scpred.png",units = "in", res = 300, width = 8, height = 5, type = "cairo")
 29 | ggplot(df_count,aes(x = dataset, y = count, fill = core_edge)) +
 30 |   geom_bar(position="fill", stat="identity")+ 
 31 |   theme(panel.grid.major = element_blank(), 
 32 |         panel.grid.minor = element_blank(),
 33 |         panel.border = element_blank(), panel.background = element_blank()) +
 34 |   guides(fill = guide_legend(title = 'cell type')) +
 35 |   scale_fill_manual(values = c("core" = "#4DBBD5FF",
 36 |                                "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+
 37 |   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
 38 | dev.off()
 39 | 
 40 | #cluster on composition simillarity 
 41 | library(tidyverse)
 42 | library(ggdendro)
 43 | library(vegan)
 44 | library(colorspace)
 45 | library(cowplot)
 46 | t <- reshape2::dcast(df_count, dataset ~ core_edge, fun.aggregate = sum) 
 47 | t <- t %>%
 48 |   group_by(dataset) %>% 
 49 |   mutate(sum = as.character(rowSums(select(cur_data(), is.numeric)))) %>%
 50 |   summarise_if(is.numeric, ~ . / as.numeric(sum)) %>% tibble::column_to_rownames("dataset")
 51 | hc=hclust(dist(t),method="ward.D2")
 52 | hc=reorder(hc,wts=-as.matrix(t)%*%seq(ncol(t))^2) # vegan::reorder.hclust
 53 | tree=ggdendro::dendro_data(as.dendrogram(hc),type="rectangle")
 54 | 
 55 | p1=ggplot(ggdendro::segment(tree))+
 56 |   geom_segment(aes(x=y,y=x,xend=yend,yend=xend),lineend="round",size=.4)+
 57 |   scale_x_continuous(expand=expansion(add=c(0,.01)))+ # don't crop half of line between top-level nodes
 58 |   scale_y_continuous(limits=.5+c(0,nrow(t)),expand=c(0,0))+
 59 |   theme(
 60 |     axis.text=element_blank(),
 61 |     axis.ticks=element_blank(),
 62 |     axis.ticks.length=unit(0,"pt"), # remove extra space occupied by ticks
 63 |     axis.title=element_blank(),
 64 |     panel.background=element_rect(fill="white"),
 65 |     panel.grid=element_blank(),
 66 |     plot.margin=margin(5,5,5,0)
 67 |   )
 68 | 
 69 | t=t[hc$labels[hc$order],]
 70 | t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t)))
 71 | lab=round(100*t2$V3)
 72 | lab[lab==0]=""
 73 | 
 74 | t2$V2 <- factor(t2$V2, levels = c("core","transitory","edge","noncancer"))
 75 | p2=ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+
 76 |   geom_bar(stat="identity",width=1,position=position_fill(reverse=T))+
 77 |   geom_text(aes(label=lab),position=position_stack(vjust=.5,reverse=T),size=3.5)+
 78 |   coord_flip()+
 79 |   scale_x_discrete(expand=c(0,0))+
 80 |   scale_y_discrete(expand=c(0,0))+
 81 |   scale_fill_manual(values = c("core" = "#4DBBD5FF",
 82 |                                "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+
 83 |   theme(
 84 |     axis.text=element_text(color="black",size=11),
 85 |     axis.text.x=element_blank(),
 86 |     axis.ticks=element_blank(),
 87 |     axis.title=element_blank(),
 88 |     legend.position="none",
 89 |     plot.margin=margin(5,0,5,5)
 90 |   )
 91 | 
 92 | png("scpred_clustered.png",units = "in", res = 300, width = 7, height = 10, type = "cairo")
 93 | cowplot::plot_grid(p2,p1,rel_widths=c(1,.2))
 94 | dev.off()
 95 | 
 96 | # #cluster plot for only core, transitory, and edge
 97 | # library(tidyverse)
 98 | # library(ggdendro)
 99 | # library(vegan)
100 | # library(colorspace)
101 | # library(cowplot)
102 | # df_full_cc <- subset(df_full, df_full$core_edge %in% c("core","edge","transitory"))
103 | # df_count_cc <- df_full_cc%>%
104 | #   group_by( dataset,core_edge) %>%
105 | #   summarize(count = n()) 
106 | # 
107 | # t <- reshape2::dcast(df_count_cc, dataset ~ core_edge, fun.aggregate = sum) 
108 | # t <- t %>%
109 | #   group_by(dataset) %>% 
110 | #   mutate(sum = as.character(rowSums(select(cur_data(), is.numeric)))) %>%
111 | #   summarise_if(is.numeric, ~ . / as.numeric(sum)) %>% tibble::column_to_rownames("dataset")
112 | # hc=hclust(dist(t),method="ward.D2")
113 | # hc=reorder(hc,wts=-as.matrix(t)%*%seq(ncol(t))^2) # vegan::reorder.hclust
114 | # tree=ggdendro::dendro_data(as.dendrogram(hc),type="rectangle")
115 | # 
116 | # p1=ggplot(ggdendro::segment(tree))+
117 | #   geom_segment(aes(x=y,y=x,xend=yend,yend=xend),lineend="round",size=.4)+
118 | #   scale_x_continuous(expand=expansion(add=c(0,.01)))+ # don't crop half of line between top-level nodes
119 | #   scale_y_continuous(limits=.5+c(0,nrow(t)),expand=c(0,0))+
120 | #   theme(
121 | #     axis.text=element_blank(),
122 | #     axis.ticks=element_blank(),
123 | #     axis.ticks.length=unit(0,"pt"), # remove extra space occupied by ticks
124 | #     axis.title=element_blank(),
125 | #     panel.background=element_rect(fill="white"),
126 | #     panel.grid=element_blank(),
127 | #     plot.margin=margin(5,5,5,0)
128 | #   )
129 | # 
130 | # t=t[hc$labels[hc$order],]
131 | # t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t)))
132 | # lab=round(100*t2$V3)
133 | # lab[lab==0]=""
134 | # 
135 | # t2$V2 <- factor(t2$V2, levels = c("core","transitory","edge","noncancer"))
136 | # p2=ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+
137 | #   geom_bar(stat="identity",width=1,position=position_fill(reverse=T))+
138 | #   geom_text(aes(label=lab),position=position_stack(vjust=.5,reverse=T),size=3.5)+
139 | #   coord_flip()+
140 | #   scale_x_discrete(expand=c(0,0))+
141 | #   scale_y_discrete(expand=c(0,0))+
142 | #   scale_fill_manual(values = c("core" = "#4DBBD5FF",
143 | #                                "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+
144 | #   theme(
145 | #     axis.text=element_text(color="black",size=11),
146 | #     axis.text.x=element_blank(),
147 | #     axis.ticks=element_blank(),
148 | #     axis.title=element_blank(),
149 | #     legend.position="none",
150 | #     plot.margin=margin(5,0,5,5)
151 | #   )
152 | # 
153 | # cowplot::plot_grid(p2,p1,rel_widths=c(1,.4))
154 | 
155 | 
156 | #plot classification of every single sample 
157 | library(ggplot2)
158 | library(stringr)
159 | p <- list()
160 | i =1 
161 | 
162 | for (file in nm) { 
163 |   load(paste("processed_data/",file, sep = ""))
164 |   
165 |   p1 = SpatialDimPlot(query, alpha = c(0,0)) +theme(legend.position="none") + ggtitle(label = strsplit(file[1], ".", fixed = TRUE)[[1]][1])
166 |   
167 |   p2 = SpatialDimPlot(query, image.alpha = 0.5, group.by = "scpred_prediction", pt.size.factor = 2,
168 |                       cols = c("core" = "#4DBBD5FF",
169 |                                "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"), stroke = 0) +theme(legend.position="none")
170 |   
171 |   #find clusters
172 |   query <- FindVariableFeatures(query)
173 |   query <- ScaleData(query)
174 |   query <- RunPCA(query)
175 |   query <- RunUMAP(query,dims = 1:15,reduction = "pca")
176 |   p3 = DimPlot(query, group.by = "scpred_prediction",
177 |                cols = c("core" = "#4DBBD5FF",
178 |                         "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+theme(legend.position="none")+ theme(
179 |                           plot.title = element_blank(),axis.title.x = element_blank(),
180 |                           axis.title.y = element_blank())+ 
181 |     theme(axis.ticks.x = element_blank(),
182 |           axis.text.x = element_blank()) + 
183 |     theme(axis.ticks.y = element_blank(),
184 |           axis.text.y = element_blank())
185 |   all_plots <- p1 + p2 + p3
186 |   p[[i]] = all_plots
187 |   i = i + 1
188 |   
189 |   
190 | }
191 | 
192 | library(gridExtra)
193 | library(patchwork)
194 | png("scpred_full.png",units = "in", res = 300, width = 12, height = 20, type = "cairo")
195 | wrap_plots(p, ncol = 3)
196 | dev.off()
197 | 
198 | #plot some examples
199 | png("scpred_example_cscc.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
200 | p[[20]]
201 | dev.off()
202 | 
203 | png("scpred_example_cervicalscc.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
204 | p[[1]]
205 | dev.off()
206 | 
207 | png("scpred_example_melanoma.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
208 | p[[18]]
209 | dev.off()
210 | 
211 | png("scpred_example_colorectal.png",units = "in", res = 300, width = 5, height = 4, type = "cairo")
212 | p[[4]]
213 | dev.off()
214 | 


--------------------------------------------------------------------------------
/Figure 5/OSCC survival/oscc_tcga.R:
--------------------------------------------------------------------------------
  1 | library(UCSCXenaTools)
  2 | library(dplyr)
  3 | library(survival)
  4 | library(survminer)
  5 | library(data.table)
  6 | library(ggplot2)
  7 | library(tibble)
  8 | library(stringr)
  9 | library(plyr)
 10 | library(singscore)
 11 | library(pROC)
 12 | indication <- "HNSC"
 13 | `%!in%` = Negate(`%in%`)
 14 | 
 15 | replacement_dictionary <- c(
 16 |   "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9",
 17 |   "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2",
 18 |   'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 
 19 |   'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72')
 20 | 
 21 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X")
 22 | 
 23 | core_genes <- readRDS("core_genes.RDS")
 24 | core_genes <- c(core_genes)
 25 | for (gene in core_genes) {
 26 |   if (gene %in% names(replacement_dictionary)) {
 27 |     core_genes <- core_genes[core_genes != gene]
 28 |     core_genes <- append(core_genes, replacement_dictionary[[gene]])
 29 |   }
 30 | } 
 31 | 
 32 | core_genes <- core_genes[core_genes %!in% genes_to_remove]
 33 | 
 34 | edge_genes <- readRDS("edge_genes.RDS")
 35 | edge_genes <- c(edge_genes)
 36 | 
 37 | for (gene in edge_genes) {
 38 |   if (gene %in% names(replacement_dictionary)) {
 39 |     edge_genes <- edge_genes[edge_genes != gene]
 40 |     edge_genes <- append(edge_genes, replacement_dictionary[[gene]])
 41 |   }
 42 | } 
 43 | 
 44 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove]
 45 | 
 46 | geneset_of_interest <- list(core_genes,edge_genes)
 47 | names(geneset_of_interest) <- c("core","edge")
 48 | 
 49 | all_genes <- union(core_genes,edge_genes)
 50 | 
 51 | #load in TCGA OSCC HPV negative codes
 52 | sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv")
 53 | tcga_oscc_codes <- sample_ids$TCGA_codes 
 54 | tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = ""))
 55 | 
 56 | #initialize object
 57 | Xena_cohort = XenaData %>% 
 58 |   filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
 59 |   XenaScan(indication)   # select cohort
 60 | 
 61 | #download clinical and survival datasets
 62 | cli_query = Xena_cohort %>% 
 63 |   filter(DataSubtype == "phenotype") %>%  # select clinical dataset
 64 |   XenaGenerate() %>%  # generate a XenaHub object
 65 |   XenaQuery() %>% 
 66 |   XenaDownload()
 67 | 
 68 | cli = XenaPrepare(cli_query)
 69 | survival_data = cli[[2]]
 70 | clinical_data = cli[[1]]
 71 | clinical_data$sample = clinical_data$sampleID
 72 | 
 73 | if ("xena_sample" %in% colnames(survival_data)) {
 74 |   survival_data$sample = survival_data$xena_sample 
 75 |   survival_data$xena_sample = NULL
 76 | }
 77 | 
 78 | clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ]
 79 | survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ]  
 80 | 
 81 | ge <- XenaData %>%
 82 |   filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
 83 |   XenaScan(indication) %>%
 84 |   filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
 85 | 
 86 | gene_info_km <- data.frame()
 87 | 
 88 | gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 
 89 |                                    dataset = ge$XenaDatasets,
 90 |                                    samples = tcga_oscc_sampleID,
 91 |                                    use_probeMap = F,
 92 |                                    check = FALSE,
 93 |                                    time_limit = 10000)
 94 | 
 95 | gene_info_km <- as.data.frame(gene_info_km)
 96 | 
 97 | 
 98 | dir.create("tcga_plots_km")
 99 | setwd("tcga_plots_km")
100 | 
101 | ###use multiple time methods
102 | time_methods <- c("DSS.time","PFI.time","OS.time")
103 | 
104 | 
105 | for(time_method in time_methods) { 
106 |   
107 |   rankData <- rankGenes(gene_info_km)
108 |   
109 |   scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = F)
110 |   scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = F)
111 |   
112 |   #calculate gene sets
113 |   names(scoredf_core) <- c("core","core_dispersion")
114 |   names(scoredf_edge) <- c("edge","edge_dispersion")
115 |   
116 |   scores_df_singshot <- cbind(scoredf_core,scoredf_edge)
117 |   
118 |   scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL))
119 |   
120 |   merged_data = scores_df_singshot %>% 
121 |     left_join(survival_data, by = "sample") %>% 
122 |     dplyr::select(sample,names(geneset_of_interest), time_method, str_split(time_method,"[.]")[[1]][1])  %>% 
123 |     left_join(clinical_data, by = "sample") %>% 
124 |     dplyr::rename(time = time_method, 
125 |                   status = str_split(time_method,"[.]")[[1]][1])
126 |   
127 |   #minprop 0.2
128 |   cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = c("core","edge"), minprop = 0.2)
129 |   
130 |   res.cat <- surv_categorize(cut)
131 | 
132 | 
133 | coxph <- coxph(Surv(time, status) ~ edge, data = res.cat)
134 | kableExtra::as_image(tab::tabcoxph(coxph), file = paste(indication, str_split(time_method,"[.]")[[1]][1],"edge.png",sep = "_"))
135 | 
136 | png(paste(indication, str_split(time_method,"[.]")[[1]][1],"edge_km.png",sep = "_"), units="in", width=7, height=5, res=300, type = "cairo")
137 | print(ggsurvplot(survfit(Surv(time, status) ~ edge, data = res.cat), pval = T, conf.int = F,
138 |                  palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25,
139 |                  xlab="Time in years",
140 |                  ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]),
141 |                  surv.scale="percent"))
142 | dev.off()
143 | 
144 | 
145 | coxph <- coxph(Surv(time, status) ~ core, data = res.cat)
146 | kableExtra::as_image(tab::tabcoxph(coxph), file = paste(indication, str_split(time_method,"[.]")[[1]][1],"core.png",sep = "_"))
147 | 
148 | png(paste(indication, str_split(time_method,"[.]")[[1]][1],"core_km.png",sep = "_"), units="in", width=7, height=5, res=300, type = "cairo")
149 | print(ggsurvplot(survfit(Surv(time, status) ~ core, data = res.cat), pval = TRUE, conf.int = F,
150 |       palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25,
151 |       xlab="Time in years",
152 |       ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]),
153 |       surv.scale="percent"))
154 | dev.off()
155 | 
156 | 
157 | png(paste("figs/",indication, str_split(time_method,"[.]")[[1]][1],"edge_km.png",sep = "_"), units="in", width=6.5 , height=6, res=300, type = "cairo")
158 | print(ggsurvplot(survfit(Surv(time, status) ~ edge, data = res.cat), pval = F, conf.int = F,risk.table = T, 
159 |                  palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25,
160 |                  xlab="Time in years",
161 |                  ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]),
162 |                  surv.scale="percent"))
163 | dev.off()
164 | 
165 | png(paste("figs/",indication, str_split(time_method,"[.]")[[1]][1],"core_km.png",sep = "_"), units="in", width=6.5, height=6, res=300, type = "cairo")
166 | print(ggsurvplot(survfit(Surv(time, status) ~ core, data = res.cat), pval = F, conf.int = F,risk.table = T, 
167 |                  palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25,
168 |                  xlab="Time in years",
169 |                  ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]),
170 |                  surv.scale="percent"))
171 | dev.off()
172 | 
173 | }
174 | 
175 | setwd("..")
176 | 
177 | dir.create("other_tcga_plots")
178 | setwd("other_tcga_plots")
179 | 
180 | library(immunedeconv)
181 | png("rankplot_core.png", units="in", width=4, height=3, res=300, type = "cairo")
182 | plotRankDensity(rankData[,2,drop = FALSE], upSet = core_genes, isInteractive = F) + scale_color_manual(values = c("#4DBBD5FF"))
183 | dev.off()
184 | 
185 | png("rankplot_edge.png", units="in", width=4, height=3, res=300, type = "cairo")
186 | plotRankDensity(rankData[,2,drop = FALSE], upSet = edge_genes, isInteractive = F) + scale_color_manual(values = c("#E64B35FF"))
187 | dev.off()
188 | 
189 | res = deconvolute(gene_info_km, "epic")
190 | 
191 | t_res = as.data.frame(t(res))
192 | names(t_res) <- lapply(t_res[1, ], as.character)
193 | t_res <- t_res[-1, ] 
194 | 
195 | merged_data = scores_df_singshot %>% 
196 |   left_join(clinical_data, by = "sample") 
197 | 
198 | rownames(merged_data) <- merged_data$sample
199 | 
200 | full_df_deconv  <- merge(t_res, merged_data, by = 0, all = TRUE)
201 | png("core_edge_correlation.png", units="in", width=5, height=5, res=300, type = "cairo")
202 | 
203 | #correlation core edge
204 | sp <- ggscatter(merged_data, x = "edge", y = "core",
205 |                 add = "reg.line",  # Add regression line
206 |                 add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
207 |                 conf.int = TRUE # Add confidence interval
208 | )
209 | # Add correlation coefficient
210 | sp + stat_cor(method = "pearson", label.x = 0.87, label.y = 0.3)
211 | dev.off()
212 | library(rstatix)
213 | 
214 | ###
215 | full_df_deconv$CAF <- full_df_deconv$`Cancer associated fibroblast`
216 | full_df_deconv$CAF <- as.numeric(full_df_deconv$CAF)
217 | png("CAF_edge_correlation.png", units="in", width=6, height=5, res=300, type = "cairo")
218 | 
219 | #correlation edge CAF
220 | sp <- ggscatter(full_df_deconv, x = "edge", y = "CAF",
221 |                 add = "reg.line",  # Add regressin line
222 |                 add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
223 |                 conf.int = TRUE # Add confidence interval
224 | )
225 | # Add correlation coefficient
226 | sp + stat_cor(method = "pearson", label.x = 0.87, label.y = 0.01)
227 | dev.off()
228 | 
229 | give.n <- function(x){
230 |   return(c(y = mean(x), label = length(x)))
231 | }
232 | 
233 | merged_data_stage <- merged_data[merged_data$pathologic_stage %in% c("Stage I","Stage II","Stage III",
234 |                                                                      "Stage IVA","Stage IVB"),]
235 | 
236 | png("stage_plot_core.png", units="in", width=7, height=5, res=300, type = "cairo")
237 | ggplot(merged_data_stage, aes_string(x = "pathologic_stage", y = "core", fill = "pathologic_stage")) +
238 |   geom_boxplot(alpha=0.7)+
239 |   stat_summary(fun.data = give.n, geom = "text") +
240 |   scale_y_continuous(name = "gene expression RNAseq") +
241 |   scale_x_discrete(name = "Gene") +
242 |   theme_bw() + 
243 |   ggpubr::stat_compare_means()
244 | dev.off()
245 | 
246 | png("stage_plot_edge.png", units="in", width=7, height=5, res=300, type = "cairo")
247 | ggplot(merged_data_stage, aes_string(x = "pathologic_stage", y = "edge", fill = "pathologic_stage")) +
248 |   geom_boxplot(alpha=0.7)+
249 |   stat_summary(fun.data = give.n, geom = "text") +
250 |   scale_y_continuous(name = "gene expression RNAseq") +
251 |   scale_x_discrete(name = "Gene") +
252 |   theme_bw() + 
253 |   ggpubr::stat_compare_means()
254 | dev.off()
255 | 
256 | library(RColorBrewer)
257 | 
258 | ###comparison across clinical characteristics
259 | 
260 | merged_data = scores_df_singshot %>% 
261 |   left_join(survival_data, by = "sample") %>% 
262 |   dplyr::select(sample,names(geneset_of_interest), time_method, str_split(time_method,"[.]")[[1]][1])  %>% 
263 |   left_join(clinical_data, by = "sample") %>% 
264 |   dplyr::rename(time = time_method, 
265 |                 status = str_split(time_method,"[.]")[[1]][1])
266 | 
267 | edited_data <- merged_data %>%
268 |               mutate(margin_status = replace(margin_status,margin_status %in% c("Positive"), "present"))  %>%
269 |               mutate(pathologic_M = replace(pathologic_M,pathologic_M %in% c("MX"), "present"))  %>%
270 |               mutate(pathologic_N = replace(pathologic_N,pathologic_N %in% c("N2a","N2b","N2","N2c"), "present")) %>%
271 |               mutate(pathologic_T = replace(pathologic_T,pathologic_T %in% c("T3","T4","T4a"), "present")) %>%
272 |               mutate(presence_of_pathological_nodal_extracapsular_spread = replace(presence_of_pathological_nodal_extracapsular_spread,
273 |                                                                                    presence_of_pathological_nodal_extracapsular_spread %in% c("Gross Extension","Microscopic Extension"), "present")) %>%
274 |               mutate(lymphovascular_invasion_present = replace(lymphovascular_invasion_present,lymphovascular_invasion_present %in% c("YES"), "present"))  %>% 
275 |             mutate(neoplasm_histologic_grade = replace(neoplasm_histologic_grade,neoplasm_histologic_grade %in% c("G3"), "present"))  %>% 
276 |               select(edge,core,pathologic_M,pathologic_T,pathologic_N,
277 |                      lymphovascular_invasion_present,neoplasm_histologic_grade, margin_status,
278 |                      presence_of_pathological_nodal_extracapsular_spread) %>%
279 |                      melt(id.vars = c('core',"edge"), measure.vars = c("pathologic_M","pathologic_T","pathologic_N",
280 |                                                             "lymphovascular_invasion_present","neoplasm_histologic_grade", "margin_status",
281 |                                                             "presence_of_pathological_nodal_extracapsular_spread"))%>% na.omit
282 |               
283 | edited_data$value[!grepl('present', edited_data$value)] <- "absent"
284 | 
285 | ###
286 | 
287 | signif_abbreviation <- function(p_values) {
288 |   sapply(p_values, function(x) {
289 |     if (is.na(x)) {
290 |       return(NA)
291 |     } else if (x < 0.0001) {
292 |       return("****")
293 |     } else if (x < 0.001) {
294 |       return("***")
295 |     } else if (x < 0.01) {
296 |       return("**")
297 |     } else if (x < 0.05) {
298 |       return("*")
299 |     } else if (x > 0.05) {
300 |       return("NS")
301 |     } else {
302 |       return("NS")
303 |     }
304 |   })
305 | }
306 | 
307 | wilcox_tests <- edited_data %>%
308 |   group_by(variable) %>%
309 |   do(tidy(wilcox.test(edge ~ value, data = .))) 
310 | 
311 | wilcox_tests$p.adj <- p.adjust(wilcox_tests$p.value, method = "BH")
312 | 
313 | wilcox_tests <- wilcox_tests %>%
314 |   mutate(p.adj.signif = as.character(signif_abbreviation(p.adj)))
315 | 
316 | edited_data_with_p_adj <- left_join(edited_data, wilcox_tests, by = "variable")
317 | 
318 | 
319 | png("comparision_plot_edge.png", units="in", width=6, height=6.5, res=300, type = "cairo")
320 | 
321 | ggplot(edited_data_with_p_adj, aes(x = variable, y = edge), fill = value)+
322 |   geom_boxplot(outlier.shape = NA,aes(fill = factor(value)), alpha = 0.5)+
323 |   scale_fill_manual(values = c('present' = '#Fa5d0f',
324 |                                'absent' = '#2B1055'))+
325 |   theme_bw() +
326 |   theme(legend.position = "bottom") +
327 |   guides(color = guide_legend(title = "Sample")) +
328 |   geom_text(data = edited_data_with_p_adj, aes(x = variable, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") +
329 |   theme(axis.text.x = element_text(angle = 45, hjust=1)) +   # Bold axis labels
330 |   theme(plot.margin = unit(c(1, 5, 1, 1), "lines"),
331 |         axis.title = element_text(face = "bold"),
332 |         axis.text = element_text(size = 10, face = "bold"),
333 |         axis.ticks.y = element_blank(),  panel.border = element_rect(linewidth = 0)) 
334 | dev.off()
335 | 
336 | wilcox_tests <- edited_data %>%
337 |   group_by(variable) %>%
338 |   do(tidy(wilcox.test(core ~ value, data = .))) 
339 | 
340 | wilcox_tests$p.adj <- p.adjust(wilcox_tests$p.value, method = "BH")
341 | 
342 | wilcox_tests <- wilcox_tests %>%
343 |   mutate(p.adj.signif = as.character(signif_abbreviation(p.adj)))
344 | 
345 | edited_data_with_p_adj <- left_join(edited_data, wilcox_tests, by = "variable")
346 | 
347 | 
348 | png("comparision_plot_core.png", units="in", width=6, height=6.5, res=300, type = "cairo")
349 | ggplot(edited_data, aes(x = variable, y = core), fill = value)+
350 |   geom_boxplot(outlier.shape = NA,aes(fill = factor(value)), alpha = 0.5)+
351 |   scale_fill_manual(values = c('present' = '#Fa5d0f',
352 |                                'absent' = '#2B1055'))+
353 |   theme_bw() +
354 |   theme(legend.position = "bottom") +
355 |   guides(color = guide_legend(title = "Sample")) +
356 |   geom_text(data = edited_data_with_p_adj, aes(x = variable, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") +
357 |   theme(axis.text.x = element_text(angle = 45, hjust=1)) +   # Bold axis labels
358 |   theme(plot.margin = unit(c(1, 5, 1, 1), "lines"),
359 |         axis.title = element_text(face = "bold"),
360 |         axis.text = element_text(size = 10, face = "bold"),
361 |         axis.ticks.y = element_blank(),  panel.border = element_rect(linewidth = 0)) 
362 | dev.off()
363 | 
364 | 
365 | 
366 | 
367 | 


--------------------------------------------------------------------------------
/Figure 5/Pan-cancer survival/Survival_pan_cancer.R:
--------------------------------------------------------------------------------
  1 | library(UCSCXenaTools)
  2 | library(dplyr)
  3 | library(survival)
  4 | library(survminer)
  5 | library(data.table)
  6 | library(ggplot2)
  7 | library(tibble)
  8 | library(stringr)
  9 | library(plyr)
 10 | library(singscore)
 11 | library(pROC)
 12 | indication <- "HNSC"
 13 | `%!in%` = Negate(`%in%`)
 14 | 
 15 | #replace gene names to gene names that exist in TCGA
 16 | replacement_dictionary <- c(
 17 |   "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9",
 18 |   "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2",
 19 |   'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 
 20 |   'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72')
 21 | 
 22 | #remove genes not in TCGA
 23 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X")
 24 | 
 25 | 
 26 | core_genes <- readRDS("core_genes.RDS")
 27 | 
 28 | for (gene in core_genes) {
 29 |   if (gene %in% names(replacement_dictionary)) {
 30 |     core_genes <- core_genes[core_genes != gene]
 31 |     core_genes <- append(core_genes, replacement_dictionary[[gene]])
 32 |   }
 33 | } 
 34 | 
 35 | core_genes <- core_genes[core_genes %!in% genes_to_remove]
 36 | 
 37 | edge_genes <- readRDS("edge_genes.RDS")
 38 | 
 39 | for (gene in edge_genes) {
 40 |   if (gene %in% names(replacement_dictionary)) {
 41 |     edge_genes <- edge_genes[edge_genes != gene]
 42 |     edge_genes <- append(edge_genes, replacement_dictionary[[gene]])
 43 |   }
 44 | } 
 45 | 
 46 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove]
 47 | 
 48 | geneset_of_interest <- list(core_genes,edge_genes)
 49 | names(geneset_of_interest) <- c("core","edge")
 50 | 
 51 | all_genes <- union(core_genes,edge_genes)
 52 | 
 53 | 
 54 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC",
 55 |                         "BLCA","STAD","THCA","PRAD",
 56 |                         "CESC","GBMLGG","KIRP","MESO",
 57 |                         "KIRC","OV","SARC","SKCM")
 58 | 
 59 | #make a df to keep track of geneset, indication, pval, worse outcome, and hazard ratio
 60 | df <- data.frame (geneset  = rep(names(geneset_of_interest),length(cancer_indications)),
 61 |                   indication  = rep(cancer_indications,each = length(geneset_of_interest)),
 62 |                   p_val = 1,worse_outcome = 0, hazard_ratio = 0
 63 | )
 64 | 
 65 | for (indication in cancer_indications) {
 66 |   #initialize object
 67 |   if (indication != "OSCC") {
 68 |   Xena_cohort = XenaData %>% 
 69 |     filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
 70 |     XenaScan(indication)   # select cohort
 71 |   
 72 |   #download clinical and survival datasets
 73 |   cli_query = Xena_cohort %>% 
 74 |     filter(DataSubtype == "phenotype") %>%  # select clinical dataset
 75 |     XenaGenerate() %>%  # generate a XenaHub object
 76 |     XenaQuery() %>% 
 77 |     XenaDownload()
 78 |   
 79 |   if (indication != "OV") {
 80 |     cli = XenaPrepare(cli_query)
 81 |     survival_data = cli[[2]]
 82 |     clinical_data = cli[[1]]
 83 |     clinical_data$sample = clinical_data$sampleID
 84 |   } else {
 85 |     cli = XenaPrepare(cli_query)
 86 |     survival_data = cli[[3]]
 87 |     clinical_data = cli[[1]]
 88 |     clinical_data$sample = clinical_data$sampleID
 89 |   }
 90 |   
 91 |   if ("xena_sample" %in% colnames(survival_data)) {
 92 |     survival_data$sample = survival_data$xena_sample 
 93 |     survival_data$xena_sample = NULL
 94 |   }
 95 |   
 96 |   #process exceptions
 97 |   if (indication == "OV") {
 98 |     ge <- XenaData %>%
 99 |       filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
100 |       XenaScan(indication) %>%
101 |       filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
102 |   } else { 
103 |     if (indication == "STAD") {
104 |       ge <- XenaData %>%
105 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
106 |         XenaScan(indication) %>%
107 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
108 |       ge <- ge[2,]
109 |     } else {
110 |       ge <- XenaData %>%
111 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
112 |         XenaScan(indication) %>%
113 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
114 |     }
115 |   }
116 |   
117 |   if (indication == "SARC") {
118 |     ge = ge[1,]
119 |   } } else {
120 |     sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv")
121 |     tcga_oscc_codes <- sample_ids$TCGA_codes 
122 |     tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = ""))
123 |     
124 |     #initialize object
125 |     Xena_cohort = XenaData %>% 
126 |       filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
127 |       XenaScan("HNSC")   # select cohort
128 |     
129 |     #download clinical and survival datasets
130 |     cli_query = Xena_cohort %>% 
131 |       filter(DataSubtype == "phenotype") %>%  # select clinical dataset
132 |       XenaGenerate() %>%  # generate a XenaHub object
133 |       XenaQuery() %>% 
134 |       XenaDownload()
135 |     
136 |     cli = XenaPrepare(cli_query)
137 |     survival_data = cli[[2]]
138 |     clinical_data = cli[[1]]
139 |     clinical_data$sample = clinical_data$sampleID
140 |     
141 |     if ("xena_sample" %in% colnames(survival_data)) {
142 |       survival_data$sample = survival_data$xena_sample 
143 |       survival_data$xena_sample = NULL
144 |     }
145 |     
146 |     clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ]
147 |     survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ]  
148 |     
149 |     ge <- XenaData %>%
150 |       filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
151 |       XenaScan("HNSC") %>%
152 |       filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
153 |   }
154 |   
155 |   gene_info_km <- data.frame()
156 |   
157 |   samples <- fetch_dataset_samples( unique(ge$XenaHosts),  ge$XenaDatasets, limit = NULL)
158 |   #fetch gene expression data
159 |   gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 
160 |                                      dataset = ge$XenaDatasets,
161 |                                      samples = samples,
162 |                                      use_probeMap = F,
163 |                                      check = FALSE,
164 |                                      time_limit = 10000)
165 |   
166 |   gene_info_km <- as.data.frame((gene_info_km))
167 |   
168 |   
169 |   #calculate gene sets
170 |   
171 |   rankData <- rankGenes(gene_info_km)
172 |   
173 |   scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE)
174 |   scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE)
175 |   
176 |   #calculate gene sets
177 |   names(scoredf_core) <- c("core","core_dispersion")
178 |   names(scoredf_edge) <- c("edge","edge_dispersion")
179 |   
180 |   scores_df_singshot <- cbind(scoredf_core,scoredf_edge)
181 |   
182 |   scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL))
183 |   
184 |   merged_data = survival_data %>% 
185 |     left_join(scores_df_singshot, by = "sample") %>% 
186 |     dplyr::select(sample,names(geneset_of_interest), OS.time, OS)  %>% 
187 |     left_join(clinical_data, by = "sample") %>% 
188 |     dplyr::select(sample,names(geneset_of_interest), OS.time, OS,sample_type)  %>% 
189 |     dplyr::rename(time = OS.time, 
190 |                   status = OS)
191 |   
192 |   if (indication != "LAML") {
193 |     merged_data = merged_data[merged_data$sample_type == "Primary Tumor",]
194 |   }
195 |   
196 |   merged_data <- merged_data[complete.cases(merged_data), ]
197 |   
198 |   #identify cutpoint using minprop of 0.1
199 |   cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1)
200 |   res.cat <- surv_categorize(cut)
201 |   
202 |   for (gene in names(geneset_of_interest)) {
203 |     
204 |     res.cat$binary = res.cat[[gene]]
205 |     
206 |     row = df[df$gene == gene & df$indication == indication,]
207 |     rownumber = row.names(df[df$gene == gene & df$indication == indication,])
208 |     
209 |     res.cat$binary <- factor(res.cat$binary, levels = c("high","low"))
210 |     
211 |     fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat)
212 |     
213 |     HR <- round(exp(coef(fit.coxph)), 2)[[1]]
214 |     
215 |     if (is.na(HR)) {
216 |       row$hazard_ratio <- 1
217 |       row$worse_outcome = "n/a"
218 |     } else {
219 |       if (coef(fit.coxph) > 0) {
220 |         row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]]
221 |         row$worse_outcome = "Low"
222 |       } else {
223 |         row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]]
224 |         row$worse_outcome = "High"
225 |       }
226 |     }
227 |     
228 |     #get p val
229 |     
230 |     p_value = summary(fit.coxph)$waldtest[[3]]
231 |     
232 |     row$p_val = p_value
233 |     
234 |     df[rownumber,] = row
235 |   }
236 |   
237 | }
238 | 
239 | df$sig = 1
240 | df[is.na(df)] = 1
241 | 
242 | 
243 | for (data_row in rownames(df)) {
244 |   row = df[data_row,]
245 |   if(row$p_val > 0.05) {
246 |     row$sig = "ns"
247 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
248 |       row$hazard_ratio = 1
249 |     }
250 |   } else {
251 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
252 |       row$sig = row$worse_outcome 
253 |       row$hazard_ratio = 1
254 |     } else {
255 |       row$sig = row$worse_outcome 
256 |     }
257 |   }
258 |   df[data_row,] = row
259 | }
260 | 
261 | 
262 | dir.create("heatmap_tcga")
263 | setwd("heatmap_tcga")
264 | 
265 | 
266 | png("oss_survival_dotplot.png", units="in", width=3, height=5, res=300, type = "cairo")
267 | ggplot(df, aes(x = geneset, y = indication, color = sig)) +
268 |   geom_point(aes(size = hazard_ratio, fill = sig)) +
269 |   scale_colour_manual(values = c("High" = "#d73027",
270 |                                  "Low" = "#4575b4","ns" ="grey")) +
271 |   labs(x="geneset", y="indication",
272 |        title="Decreased OS") + 
273 |   theme_minimal()
274 | dev.off()
275 | setwd("..")
276 | 
277 | ###
278 | 
279 | library(UCSCXenaTools)
280 | library(dplyr)
281 | library(survival)
282 | library(survminer)
283 | library(data.table)
284 | library(ggplot2)
285 | library(tibble)
286 | library(stringr)
287 | library(plyr)
288 | library(singscore)
289 | library(pROC)
290 | `%!in%` = Negate(`%in%`)
291 | 
292 | 
293 | replacement_dictionary <- c(
294 |   "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9",
295 |   "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2",
296 |   'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 
297 |   'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72')
298 | 
299 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X")
300 | 
301 | 
302 | core_genes <- readRDS("core_genes.RDS")
303 | 
304 | for (gene in core_genes) {
305 |   if (gene %in% names(replacement_dictionary)) {
306 |     core_genes <- core_genes[core_genes != gene]
307 |     core_genes <- append(core_genes, replacement_dictionary[[gene]])
308 |   }
309 | } 
310 | 
311 | core_genes <- core_genes[core_genes %!in% genes_to_remove]
312 | 
313 | edge_genes <- readRDS("edge_genes.RDS")
314 | 
315 | for (gene in edge_genes) {
316 |   if (gene %in% names(replacement_dictionary)) {
317 |     edge_genes <- edge_genes[edge_genes != gene]
318 |     edge_genes <- append(edge_genes, replacement_dictionary[[gene]])
319 |   }
320 | } 
321 | 
322 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove]
323 | 
324 | geneset_of_interest <- list(core_genes,edge_genes)
325 | names(geneset_of_interest) <- c("core","edge")
326 | 
327 | all_genes <- union(core_genes,edge_genes)
328 | 
329 | 
330 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC",
331 |                         "BLCA","STAD","THCA","PRAD",
332 |                         "CESC","GBMLGG","KIRP","MESO",
333 |                         "KIRC","OV","SARC","SKCM")
334 | 
335 | #make a df row 1 is gene, row 2 indication, row 3 p value, row 4 low or high
336 | df <- data.frame (geneset  = rep(names(geneset_of_interest),length(cancer_indications)),
337 |                   indication  = rep(cancer_indications,each = length(geneset_of_interest)),
338 |                   p_val = 1,worse_outcome = 0, hazard_ratio = 0
339 | )
340 | 
341 | for (indication in cancer_indications) {
342 |   #initialize object
343 |   if (indication != "OSCC") {
344 |     Xena_cohort = XenaData %>% 
345 |       filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
346 |       XenaScan(indication)   # select cohort
347 |     
348 |     #download clinical and survival datasets
349 |     cli_query = Xena_cohort %>% 
350 |       filter(DataSubtype == "phenotype") %>%  # select clinical dataset
351 |       XenaGenerate() %>%  # generate a XenaHub object
352 |       XenaQuery() %>% 
353 |       XenaDownload()
354 |     
355 |     if (indication != "OV") {
356 |       cli = XenaPrepare(cli_query)
357 |       survival_data = cli[[2]]
358 |       clinical_data = cli[[1]]
359 |       clinical_data$sample = clinical_data$sampleID
360 |     } else {
361 |       cli = XenaPrepare(cli_query)
362 |       survival_data = cli[[3]]
363 |       clinical_data = cli[[1]]
364 |       clinical_data$sample = clinical_data$sampleID
365 |     }
366 |     
367 |     if ("xena_sample" %in% colnames(survival_data)) {
368 |       survival_data$sample = survival_data$xena_sample 
369 |       survival_data$xena_sample = NULL
370 |     }
371 |     
372 |     if (indication == "OV") {
373 |       ge <- XenaData %>%
374 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
375 |         XenaScan(indication) %>%
376 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
377 |     } else { 
378 |       if (indication == "STAD") {
379 |         ge <- XenaData %>%
380 |           filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
381 |           XenaScan(indication) %>%
382 |           filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
383 |         ge <- ge[2,]
384 |       } else {
385 |         ge <- XenaData %>%
386 |           filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
387 |           XenaScan(indication) %>%
388 |           filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
389 |       }
390 |     }
391 |     
392 |     if (indication == "SARC") {
393 |       ge = ge[1,]
394 |     } } else {
395 |       sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv")
396 |       tcga_oscc_codes <- sample_ids$TCGA_codes 
397 |       tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = ""))
398 |       
399 |       #initialize object
400 |       Xena_cohort = XenaData %>% 
401 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
402 |         XenaScan("HNSC")   # select cohort
403 |       
404 |       #download clinical and survival datasets
405 |       cli_query = Xena_cohort %>% 
406 |         filter(DataSubtype == "phenotype") %>%  # select clinical dataset
407 |         XenaGenerate() %>%  # generate a XenaHub object
408 |         XenaQuery() %>% 
409 |         XenaDownload()
410 |       
411 |       cli = XenaPrepare(cli_query)
412 |       survival_data = cli[[2]]
413 |       clinical_data = cli[[1]]
414 |       clinical_data$sample = clinical_data$sampleID
415 |       
416 |       if ("xena_sample" %in% colnames(survival_data)) {
417 |         survival_data$sample = survival_data$xena_sample 
418 |         survival_data$xena_sample = NULL
419 |       }
420 |       
421 |       clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ]
422 |       survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ]  
423 |       
424 |       ge <- XenaData %>%
425 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
426 |         XenaScan("HNSC") %>%
427 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
428 |     }
429 |   
430 |   gene_info_km <- data.frame()
431 |   
432 |   samples <- fetch_dataset_samples( unique(ge$XenaHosts),  ge$XenaDatasets, limit = NULL)
433 |   
434 |   gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 
435 |                                      dataset = ge$XenaDatasets,
436 |                                      samples = samples,
437 |                                      use_probeMap = F,
438 |                                      check = FALSE,
439 |                                      time_limit = 10000)
440 |   
441 |   gene_info_km <- as.data.frame((gene_info_km))
442 |   
443 |   
444 |   #calculate gene sets
445 |   
446 |   rankData <- rankGenes(gene_info_km)
447 |   
448 |   scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE)
449 |   scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE)
450 |   
451 |   #calculate gene sets
452 |   names(scoredf_core) <- c("core","core_dispersion")
453 |   names(scoredf_edge) <- c("edge","edge_dispersion")
454 |   
455 |   scores_df_singshot <- cbind(scoredf_core,scoredf_edge)
456 |   
457 |   scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL))
458 |   
459 |   merged_data = survival_data %>% 
460 |     left_join(scores_df_singshot, by = "sample") %>% 
461 |     dplyr::select(sample,names(geneset_of_interest), DSS.time, DSS)  %>% 
462 |     left_join(clinical_data, by = "sample") %>% 
463 |     dplyr::select(sample,names(geneset_of_interest), DSS.time, DSS,sample_type)  %>% 
464 |     dplyr::rename(time = DSS.time, 
465 |                   status = DSS)
466 |   
467 |   if (indication != "LAML") {
468 |     merged_data = merged_data[merged_data$sample_type == "Primary Tumor",]
469 |   }
470 |   
471 |   merged_data <- merged_data[complete.cases(merged_data), ]
472 |   
473 |   cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1)
474 |   res.cat <- surv_categorize(cut)
475 |   
476 |   ##switch stuff here more
477 |   for (gene in names(geneset_of_interest)) {
478 |     
479 |     res.cat$binary = res.cat[[gene]]
480 |     
481 |     row = df[df$gene == gene & df$indication == indication,]
482 |     rownumber = row.names(df[df$gene == gene & df$indication == indication,])
483 |     
484 |     #try high ref
485 |     res.cat$binary <- factor(res.cat$binary, levels = c("high","low"))
486 |     
487 |     fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat)
488 |     
489 |     HR <- round(exp(coef(fit.coxph)), 2)[[1]]
490 |     
491 |     if (is.na(HR)) {
492 |       row$hazard_ratio <- 1
493 |       row$worse_outcome = "n/a"
494 |     } else {
495 |       if (coef(fit.coxph) > 0) {
496 |         row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]]
497 |         row$worse_outcome = "Low"
498 |       } else {
499 |         row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]]
500 |         row$worse_outcome = "High"
501 |       }
502 |     }
503 |     
504 |     #get p val
505 |     
506 |     p_value = summary(fit.coxph)$waldtest[[3]]
507 |     
508 |     row$p_val = p_value
509 |     
510 |     df[rownumber,] = row
511 |   }
512 |   
513 | }
514 | 
515 | df$sig = 1
516 | df[is.na(df)] = 1
517 | 
518 | 
519 | for (data_row in rownames(df)) {
520 |   row = df[data_row,]
521 |   if(row$p_val > 0.05) {
522 |     row$sig = "ns"
523 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
524 |       row$hazard_ratio = 1
525 |     }
526 |   } else {
527 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
528 |       row$sig = row$worse_outcome 
529 |       row$hazard_ratio = 1
530 |     } else {
531 |       row$sig = row$worse_outcome 
532 |     }
533 |   }
534 |   df[data_row,] = row
535 | }
536 | 
537 | 
538 | dir.create("heatmap_tcga")
539 | setwd("heatmap_tcga")
540 | 
541 | 
542 | png("survival_dotplot_DSS.png", units="in", width=3, height=5, res=300, type = "cairo")
543 | ggplot(df, aes(x = geneset, y = indication, color = sig)) +
544 |   geom_point(aes(size = hazard_ratio, fill = sig)) +
545 |   scale_colour_manual(values = c("High" = "#d73027",
546 |                                  "Low" = "#4575b4","ns" ="grey")) +
547 |   labs(x="geneset", y="indication",
548 |        title="Decreased DSS") + 
549 |   theme_minimal()
550 | dev.off()
551 | setwd("..")
552 | ###
553 | 
554 | library(UCSCXenaTools)
555 | library(dplyr)
556 | library(survival)
557 | library(survminer)
558 | library(data.table)
559 | library(ggplot2)
560 | library(tibble)
561 | library(stringr)
562 | library(plyr)
563 | library(singscore)
564 | library(pROC)
565 | `%!in%` = Negate(`%in%`)
566 | 
567 | 
568 | replacement_dictionary <- c(
569 |   "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9",
570 |   "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2",
571 |   'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 
572 |   'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72')
573 | 
574 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X")
575 | 
576 | 
577 | core_genes <- readRDS("core_genes.RDS")
578 | 
579 | for (gene in core_genes) {
580 |   if (gene %in% names(replacement_dictionary)) {
581 |     core_genes <- core_genes[core_genes != gene]
582 |     core_genes <- append(core_genes, replacement_dictionary[[gene]])
583 |   }
584 | } 
585 | 
586 | core_genes <- core_genes[core_genes %!in% genes_to_remove]
587 | 
588 | edge_genes <- readRDS("edge_genes.RDS")
589 | 
590 | for (gene in edge_genes) {
591 |   if (gene %in% names(replacement_dictionary)) {
592 |     edge_genes <- edge_genes[edge_genes != gene]
593 |     edge_genes <- append(edge_genes, replacement_dictionary[[gene]])
594 |   }
595 | } 
596 | 
597 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove]
598 | 
599 | geneset_of_interest <- list(core_genes,edge_genes)
600 | names(geneset_of_interest) <- c("core","edge")
601 | 
602 | all_genes <- union(core_genes,edge_genes)
603 | 
604 | 
605 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC",
606 |                         "BLCA","STAD","THCA","PRAD",
607 |                         "CESC","GBMLGG","KIRP","MESO",
608 |                         "KIRC","OV","SARC","SKCM")
609 | 
610 | #make a df row 1 is gene, row 2 indication, row 3 p value, row 4 low or high
611 | df <- data.frame (geneset  = rep(names(geneset_of_interest),length(cancer_indications)),
612 |                   indication  = rep(cancer_indications,each = length(geneset_of_interest)),
613 |                   p_val = 1,worse_outcome = 0, hazard_ratio = 0
614 | )
615 | 
616 | for (indication in cancer_indications) {
617 |   #initialize object
618 |   if (indication != "OSCC") {
619 |     Xena_cohort = XenaData %>% 
620 |       filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
621 |       XenaScan(indication)   # select cohort
622 |     
623 |     #download clinical and survival datasets
624 |     cli_query = Xena_cohort %>% 
625 |       filter(DataSubtype == "phenotype") %>%  # select clinical dataset
626 |       XenaGenerate() %>%  # generate a XenaHub object
627 |       XenaQuery() %>% 
628 |       XenaDownload()
629 |     
630 |     if (indication != "OV") {
631 |       cli = XenaPrepare(cli_query)
632 |       survival_data = cli[[2]]
633 |       clinical_data = cli[[1]]
634 |       clinical_data$sample = clinical_data$sampleID
635 |     } else {
636 |       cli = XenaPrepare(cli_query)
637 |       survival_data = cli[[3]]
638 |       clinical_data = cli[[1]]
639 |       clinical_data$sample = clinical_data$sampleID
640 |     }
641 |     
642 |     if ("xena_sample" %in% colnames(survival_data)) {
643 |       survival_data$sample = survival_data$xena_sample 
644 |       survival_data$xena_sample = NULL
645 |     }
646 |     
647 |     if (indication == "OV") {
648 |       ge <- XenaData %>%
649 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
650 |         XenaScan(indication) %>%
651 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
652 |     } else { 
653 |       if (indication == "STAD") {
654 |         ge <- XenaData %>%
655 |           filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
656 |           XenaScan(indication) %>%
657 |           filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 
658 |         ge <- ge[2,]
659 |       } else {
660 |         ge <- XenaData %>%
661 |           filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
662 |           XenaScan(indication) %>%
663 |           filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
664 |       }
665 |     }
666 |     
667 |     if (indication == "SARC") {
668 |       ge = ge[1,]
669 |     } } else {
670 |       sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv")
671 |       tcga_oscc_codes <- sample_ids$TCGA_codes 
672 |       tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = ""))
673 |       
674 |       #initialize object
675 |       Xena_cohort = XenaData %>% 
676 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
677 |         XenaScan("HNSC")   # select cohort
678 |       
679 |       #download clinical and survival datasets
680 |       cli_query = Xena_cohort %>% 
681 |         filter(DataSubtype == "phenotype") %>%  # select clinical dataset
682 |         XenaGenerate() %>%  # generate a XenaHub object
683 |         XenaQuery() %>% 
684 |         XenaDownload()
685 |       
686 |       cli = XenaPrepare(cli_query)
687 |       survival_data = cli[[2]]
688 |       clinical_data = cli[[1]]
689 |       clinical_data$sample = clinical_data$sampleID
690 |       
691 |       if ("xena_sample" %in% colnames(survival_data)) {
692 |         survival_data$sample = survival_data$xena_sample 
693 |         survival_data$xena_sample = NULL
694 |       }
695 |       
696 |       clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ]
697 |       survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ]  
698 |       
699 |       ge <- XenaData %>%
700 |         filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub
701 |         XenaScan("HNSC") %>%
702 |         filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 
703 |     }
704 |   
705 |   gene_info_km <- data.frame()
706 |   
707 |   samples <- fetch_dataset_samples( unique(ge$XenaHosts),  ge$XenaDatasets, limit = NULL)
708 |   
709 |   gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 
710 |                                      dataset = ge$XenaDatasets,
711 |                                      samples = samples,
712 |                                      use_probeMap = F,
713 |                                      check = FALSE,
714 |                                      time_limit = 10000)
715 |   
716 |   gene_info_km <- as.data.frame((gene_info_km))
717 |   
718 |   
719 |   #calculate gene sets
720 |   
721 |   rankData <- rankGenes(gene_info_km)
722 |   
723 |   scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE)
724 |   scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE)
725 |   
726 |   #calculate gene sets
727 |   names(scoredf_core) <- c("core","core_dispersion")
728 |   names(scoredf_edge) <- c("edge","edge_dispersion")
729 |   
730 |   scores_df_singshot <- cbind(scoredf_core,scoredf_edge)
731 |   
732 |   scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL))
733 |   
734 |   merged_data = survival_data %>% 
735 |     left_join(scores_df_singshot, by = "sample") %>% 
736 |     dplyr::select(sample,names(geneset_of_interest), PFI.time, PFI)  %>% 
737 |     left_join(clinical_data, by = "sample") %>% 
738 |     dplyr::select(sample,names(geneset_of_interest), PFI.time, PFI,sample_type)  %>% 
739 |     dplyr::rename(time = PFI.time, 
740 |                   status = PFI)
741 |   
742 |   if (indication != "LAML") {
743 |     merged_data = merged_data[merged_data$sample_type == "Primary Tumor",]
744 |   }
745 |   
746 |   merged_data <- merged_data[complete.cases(merged_data), ]
747 |   
748 |   cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1)
749 |   res.cat <- surv_categorize(cut)
750 |   
751 |   ##switch stuff here more
752 |   for (gene in names(geneset_of_interest)) {
753 |     
754 |     res.cat$binary = res.cat[[gene]]
755 |     
756 |     row = df[df$gene == gene & df$indication == indication,]
757 |     rownumber = row.names(df[df$gene == gene & df$indication == indication,])
758 |     
759 |     #try high ref
760 |     res.cat$binary <- factor(res.cat$binary, levels = c("high","low"))
761 |     
762 |     fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat)
763 |     
764 |     HR <- round(exp(coef(fit.coxph)), 2)[[1]]
765 |     
766 |     if (is.na(HR)) {
767 |       row$hazard_ratio <- 1
768 |       row$worse_outcome = "n/a"
769 |     } else {
770 |       if (coef(fit.coxph) > 0) {
771 |         row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]]
772 |         row$worse_outcome = "Low"
773 |       } else {
774 |         row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]]
775 |         row$worse_outcome = "High"
776 |       }
777 |     }
778 |     
779 |     #get p val
780 |     
781 |     p_value = summary(fit.coxph)$waldtest[[3]]
782 |     
783 |     row$p_val = p_value
784 |     
785 |     df[rownumber,] = row
786 |   }
787 |   
788 | }
789 | 
790 | df$sig = 1
791 | df[is.na(df)] = 1
792 | 
793 | 
794 | for (data_row in rownames(df)) {
795 |   row = df[data_row,]
796 |   if(row$p_val > 0.05) {
797 |     row$sig = "ns"
798 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
799 |       row$hazard_ratio = 1
800 |     }
801 |   } else {
802 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
803 |       row$sig = row$worse_outcome 
804 |       row$hazard_ratio = 1
805 |     } else {
806 |       row$sig = row$worse_outcome 
807 |     }
808 |   }
809 |   df[data_row,] = row
810 | }
811 | 
812 | 
813 | dir.create("heatmap_tcga")
814 | setwd("heatmap_tcga")
815 | 
816 | 
817 | png("survival_dotplot_PFI.png", units="in", width=3, height=5, res=300, type = "cairo")
818 | ggplot(df, aes(x = geneset, y = indication, color = sig)) +
819 |   geom_point(aes(size = hazard_ratio, fill = sig)) +
820 |   scale_colour_manual(values = c("High" = "#d73027",
821 |                                  "Low" = "#4575b4","ns" ="grey")) +
822 |   labs(x="geneset", y="indication",
823 |        title="Decreased PFI") + 
824 |   theme_minimal()
825 | dev.off()
826 | setwd("..")
827 | 


--------------------------------------------------------------------------------
/Figure 5/Validation/GSE41613_survival.R:
--------------------------------------------------------------------------------
  1 | library(GEOquery)
  2 | library(tidyr)
  3 | GSE41613 <- GEOquery::getGEO(GEO = "GSE41613")
  4 | 
  5 | #process this dataset using limma package
  6 | 
  7 | GSE41613_meta <- GSE41613$GSE41613_series_matrix.txt.gz@phenoData@data
  8 | 
  9 | GSE41613_survival <- GSE41613_meta[,colnames(GSE41613_meta)%in% c("characteristics_ch1.4","characteristics_ch1.5")]
 10 | names(GSE41613_survival) <- c("survival","fu_time")
 11 | 
 12 | GSE41613_expr <- GSE41613$GSE41613_series_matrix.txt.gz@assayData$exprs
 13 | GSE41613_ex_info <- GSE41613$GSE41613_series_matrix.txt.gz@featureData@data
 14 | library(limma)
 15 | 
 16 | # perform background correction on fluorescent intensities
 17 | #GSE41613_expr <- backgroundCorrect(GSE41613_expr, method = 'normexp',offset=0)
 18 | GSE41613_expr <- avereps(GSE41613_expr,ID = GSE41613_ex_info$`Gene Symbol`)
 19 | rownames(GSE41613_expr) <- lapply(rownames(GSE41613_expr), function(x) stringr::str_split(x,"///")[[1]][1])
 20 | rownames(GSE41613_expr) <- lapply(rownames(GSE41613_expr), function(x) gsub(" ", "", x))
 21 | 
 22 | 
 23 | GSE41613_survival$time <- lapply(GSE41613_survival$fu_time, function(x) gsub("fu time:", "", x) )
 24 | 
 25 | GSE41613_survival$time <- as.numeric(GSE41613_survival$time)
 26 | 
 27 | GSE41613_survival$os <- ifelse(GSE41613_survival$survival == "vital: Alive", 0,1)
 28 | GSE41613_survival$dss <- ifelse(GSE41613_survival$survival == "vital: Alive", 0,ifelse(
 29 |   GSE41613_survival$survival == "vital: Dead-non OC",NA,1
 30 | ))
 31 | 
 32 | 
 33 | ###
 34 | library(singscore)
 35 | indication <- "HNSC"
 36 | 
 37 | core_genes <- readRDS("core_genes.RDS")
 38 | core_genes <- c(core_genes)
 39 | 
 40 | edge_genes <- readRDS("edge_genes.RDS")
 41 | edge_genes <- c(edge_genes)
 42 | 
 43 | geneset_of_interest <- list(core_genes,edge_genes)
 44 | names(geneset_of_interest) <- c("core","edge")
 45 | 
 46 | all_genes <- union(core_genes,edge_genes)
 47 | 
 48 | rankData <- rankGenes(as.data.frame(GSE41613_expr))
 49 | 
 50 | #add edge and croe scores
 51 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = F)
 52 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = F)
 53 | 
 54 | #calculate gene sets
 55 | names(scoredf_core) <- c("core","core_dispersion")
 56 | names(scoredf_edge) <- c("edge","edge_dispersion")
 57 | 
 58 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge)
 59 | 
 60 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL))
 61 | rownames(scores_df_singshot) = scores_df_singshot$sample
 62 | 
 63 | merged_data = scores_df_singshot %>% 
 64 |   merge(GSE41613_survival, by = 0) %>% 
 65 |   dplyr::select(Row.names,names(geneset_of_interest), time, os,dss) 
 66 | 
 67 | #survival testing
 68 | library(survminer)
 69 | library(survival)
 70 | df <- data.frame (geneset  = rep(names(geneset_of_interest),2),
 71 |                   indication  = rep(c("os","dss"),each = 2),
 72 |                   p_val = 1,worse_outcome = 0, hazard_ratio = 0
 73 | )
 74 | 
 75 | for (survival_method in c("os","dss")) {
 76 | ##switch stuff here more
 77 | for (gene in names(geneset_of_interest)) {
 78 |   
 79 |   cut = surv_cutpoint(data = merged_data, time = "time", event = survival_method, variables = names(geneset_of_interest),
 80 |                       minprop = 0.1)
 81 |   res.cat <- surv_categorize(cut)
 82 |   
 83 |   res.cat$binary = res.cat[[gene]]
 84 |   
 85 |   row = df[df$gene == gene & df$indication == survival_method,]
 86 |   rownumber = row.names(df[df$gene == gene & df$indication == survival_method,])
 87 |   
 88 |   res.cat$binary <- factor(res.cat$binary, levels = c("high","low"))
 89 |   res.cat$status <- res.cat[[survival_method]]
 90 |   fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat)
 91 |   
 92 |   HR <- round(exp(coef(fit.coxph)), 2)[[1]]
 93 |   
 94 |   if (is.na(HR)) {
 95 |     row$hazard_ratio <- 1
 96 |     row$worse_outcome = "n/a"
 97 |   } else {
 98 |     if (coef(fit.coxph) > 0) {
 99 |       row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]]
100 |       row$worse_outcome = "Low"
101 |     } else {
102 |       row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]]
103 |       row$worse_outcome = "High"
104 |     }
105 |   }
106 |   
107 |   #get p val
108 |   
109 |   p_value = summary(fit.coxph)$waldtest[[3]]
110 |   
111 |   row$p_val = p_value
112 |   
113 |   df[rownumber,] = row
114 | 
115 | }
116 | }
117 | 
118 | 
119 | df$sig = 1
120 | df[is.na(df)] = 1
121 | 
122 | 
123 | for (data_row in rownames(df)) {
124 |   row = df[data_row,]
125 |   if(row$p_val > 0.05) {
126 |     row$sig = "ns"
127 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
128 |       row$hazard_ratio = 1
129 |     }
130 |   } else {
131 |     if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) {
132 |       row$sig = row$worse_outcome 
133 |       row$hazard_ratio = 1
134 |     } else {
135 |       row$sig = row$worse_outcome 
136 |     }
137 |   }
138 |   df[data_row,] = row
139 | }
140 | 
141 | dir.create("heatmap_GSE41613")
142 | setwd("heatmap_GSE41613")
143 | 
144 | png("survival_dotplot.png", units="cm", width=9, height=7, res=300, type = "cairo")
145 | ggplot(df, aes(x = geneset, y = indication, color = sig)) +
146 |   geom_point(aes(size = hazard_ratio)) +
147 |   scale_size(range = c(7, 10))+
148 |   scale_colour_manual(values = c("High" = "#d73027",
149 |                                  "Low" = "#4575b4",
150 |                                  "ns" = "lightgray")) +
151 |   labs(x="geneset", y="survival type",
152 |        title="GSE41613") + 
153 |   theme_minimal()
154 | dev.off()
155 | setwd("..")
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/Figure 6/Dynamo/dynamo.R:
--------------------------------------------------------------------------------
  1 | scores_df <- read.csv("all_200_rescaled_vf.csv")
  2 | library(ggplot2)
  3 | library(ggpubr)
  4 | library(dplyr)
  5 | library(tidyr)
  6 | scores_df <- scores_df %>% mutate(rank=dense_rank(desc(-AUC_mean)))
  7 | 
  8 | scores_df$median_group <- ifelse(scores_df$AUC_mean > median(scores_df$AUC_mean), "High", "Low")
  9 | 
 10 | PRISM <- readRDS("./targeted_survival/files/PSet_PRISM.rds")
 11 | PRISM <- updateObject(PRISM)
 12 | treatment <- PRISM@treatment[,colnames(PRISM@treatment) %in% c("moa","treatmentid", "cid", "smiles", "phase","indication")]
 13 | 
 14 | find_closest_match <- function(drug, treatment_table, threshold = 3) {
 15 |   treatmentid <- treatment_table$treatmentid
 16 |   distances <- adist(drug, treatmentid)
 17 |   
 18 |   closest_index <- which.min(distances)
 19 |   min_distance <- min(distances)
 20 |   
 21 |   if (min_distance <= threshold) {
 22 |     return(treatment_table[closest_index, ])
 23 |   } else {
 24 |     return(data.frame(moa = NA, treatmentid = NA, cid = NA, smiles = NA, phase = NA, indication = NA))
 25 |   }
 26 | }
 27 | 
 28 | result <- scores_df %>%
 29 |   rowwise() %>%
 30 |   mutate(closest_treatment = list(find_closest_match(drugs, treatment))) %>%
 31 |   unnest(cols = closest_treatment) %>%
 32 |   select(-treatmentid) 
 33 | 
 34 | ##edge outgoing
 35 | png("dyn_plots/edge_outgoing_bar.png", units="in", width=4, height=6, res=300, type = "cairo")
 36 | ggplot(scores_df, aes(x = median_group, y = edge_outgoing, fill = median_group)) +
 37 |   geom_boxplot(alpha = 0.6) +
 38 |   geom_jitter(width = 0.1, alpha = 0.6, size = 2) +
 39 |   stat_compare_means(
 40 |     method = "wilcox.test",
 41 |     label = "p.format",
 42 |     hide.ns = TRUE,
 43 |     bracket.size = 0.5,
 44 |     label.y = max(scores_df$edge_outgoing) * 0.95,
 45 |     label.x = 1.5
 46 |   ) +
 47 |   labs(
 48 |     x = "AAC Score",
 49 |     y = "Edge Outgoing Vector Field Score"
 50 |   ) +
 51 |   theme_bw() +
 52 |   theme(
 53 |     text = element_text(face = "bold"),
 54 |     axis.text = element_text(face = "bold"),
 55 |     legend.position = "none"
 56 |   ) +
 57 |   scale_fill_manual(values = c("Low" = "#1b9e77", "High" = "#7570b3")) +
 58 |   scale_x_discrete(labels = c("High (n = 35)","Low (n = 35)"))
 59 | dev.off()
 60 | 
 61 | 
 62 | ##core incoming
 63 | png("dyn_plots/core_incoming_bar.png", units="in", width=4, height=6, res=300, type = "cairo")
 64 | ggplot(scores_df, aes(x = median_group, y = core_incoming, fill = median_group)) +
 65 |   geom_boxplot(alpha = 0.6) +
 66 |   geom_jitter(width = 0.1, alpha = 0.6, size = 2) +
 67 |   stat_compare_means(
 68 |     method = "wilcox.test",
 69 |     label = "p.format",
 70 |     hide.ns = TRUE,
 71 |     bracket.size = 0.5,
 72 |     label.y = max(scores_df$edge_outgoing) * 0.95,
 73 |     label.x = 1.5
 74 |   ) +
 75 |   labs(
 76 |     x = "AAC Score",
 77 |     y = "Core Incoming Vector Field Score"
 78 |   ) +
 79 |   theme_bw() +
 80 |   theme(
 81 |     text = element_text(face = "bold"),
 82 |     axis.text = element_text(face = "bold"),
 83 |     legend.position = "none"
 84 |   ) +
 85 |   scale_fill_manual(values = c("Low" = "#1b9e77", "High" = "#7570b3")) +
 86 |   scale_x_discrete(labels = c("High (n = 35)","Low (n = 35)"))
 87 | dev.off()
 88 | 
 89 | ###drug class analysis
 90 | 
 91 | #filter for only classes with multiple drugs
 92 | filtered_result <- result %>%
 93 |   group_by(moa) %>%
 94 |   filter(n() > 1) %>%
 95 |   ungroup()
 96 | 
 97 | filtered_result <- filtered_result[!is.na(filtered_result$moa),]
 98 | 
 99 | png("dyn_plots/edge_outgoing_drug_class.png", units="in", width=8, height=6, res=300, type = "cairo")
100 | ggplot(filtered_result, aes(x = moa, y = edge_outgoing, fill = moa)) +
101 |   geom_boxplot(alpha = 0.6) +
102 |   geom_jitter(width = 0.1, alpha = 0.6, size = 2) +
103 |   stat_compare_means(
104 |     method = "kruskal.test",
105 |     label = "p.format",
106 |     hide.ns = TRUE,
107 |     bracket.size = 0.5,
108 |     label.y = max(scores_df$edge_outgoing) * 0.95,
109 |     label.x = 1.5
110 |   ) +
111 |   labs(
112 |     x = "Drug MOA",
113 |     y = "Edge Outgoing Vector Field Score"
114 |   ) +
115 |   theme_bw() +
116 |   theme(
117 |     text = element_text(face = "bold"),
118 |     axis.text = element_text(face = "bold"),
119 |     axis.text.x = element_text(angle = 45, hjust = 1),
120 |     legend.position = "none"
121 |   )
122 | dev.off()
123 | 
124 | png("dyn_plots/core_incoming_drug_class.png", units="in", width=8, height=6, res=300, type = "cairo")
125 | 
126 | ggplot(filtered_result, aes(x = moa, y = core_incoming, fill = moa)) +
127 |   geom_boxplot(alpha = 0.6) +
128 |   geom_jitter(width = 0.1, alpha = 0.6, size = 2) +
129 |   stat_compare_means(
130 |     method = "kruskal.test",
131 |     label = "p.format",
132 |     hide.ns = TRUE,
133 |     bracket.size = 0.5,
134 |     label.y = max(scores_df$core_incoming) * 0.95,
135 |     label.x = 1.5
136 |   ) +
137 |   labs(
138 |     x = "Drug MOA",
139 |     y = "Core Incoming Vector Field Score"
140 |   ) +
141 |   theme_bw() +
142 |   theme(
143 |     text = element_text(face = "bold"),
144 |     axis.text = element_text(face = "bold"),
145 |     axis.text.x = element_text(angle = 45, hjust = 1),
146 |     legend.position = "none"
147 |   )
148 | dev.off()
149 | 
150 | write.csv(result, file = "Dyanmo_Vf_Pertrub.csv")
151 | 


--------------------------------------------------------------------------------
/Figure 6/PharmacoDB and DGIdb/phamacogx.R:
--------------------------------------------------------------------------------
  1 | library(PharmacoGx)
  2 | '%!in%' <- function(x,y)!('%in%'(x,y))
  3 | 
  4 | CCLE <- readRDS("files/CCLE.rds")
  5 | CCLE <- updateObject(CCLE)
  6 | 
  7 | CTRPv2 <- readRDS("files/PSet_CTRPv2.rds")
  8 | CTRPv2 <- updateObject(CTRPv2)
  9 | 
 10 | PRISM <- readRDS("files/PSet_PRISM.rds")
 11 | PRISM <- updateObject(PRISM)
 12 | 
 13 | GDSC2 <- readRDS("files/PSet_GDSC2020.rds")
 14 | GDSC2 <- updateObject(GDSC2)
 15 | 
 16 | gCSI <- readRDS("files/PSet_gCSI2019.rds")
 17 | gCSI <- updateObject(gCSI)
 18 | 
 19 | ##ccle
 20 | CCLE_cell_line_id <- CCLE@sample[CCLE@sample$tissueid %in% c("Head and Neck") ,]$sampleid
 21 | 
 22 | CTRPv2_cell_line_id <- CTRPv2@sample[CTRPv2@sample$tissueid %in% c("Head and Neck") ,]$sampleid
 23 | 
 24 | PRISM_cell_line_id <- PRISM@sample[PRISM@sample$tissueid %in% c("Head and Neck") ,]$sampleid
 25 | 
 26 | gCSI_cell_line_id <- gCSI@sample[gCSI@sample$tissueid %in% c("Head and Neck") ,]$sampleid
 27 | 
 28 | GDSC2_cell_line_id <- GDSC2@sample[GDSC2@sample$tissueid %in% c("Head and Neck") ,]$sampleid
 29 | 
 30 | 
 31 | #64 HNSC cell lines currently
 32 | all_cell_lines <- union(union(union(union(CCLE_cell_line_id,gCSI_cell_line_id),GDSC2_cell_line_id),CTRPv2_cell_line_id),PRISM_cell_line_id)
 33 | 
 34 | HNSC_cl_class <- read.csv(file = "HNSC_cl_class.csv")
 35 | HNSC_cl_class_pos <- HNSC_cl_class[HNSC_cl_class$HPV.Status == "HPV-negative",]$HNSC.Cell.Line
 36 |   
 37 | all_cell_lines = HNSC_cl_class_pos
 38 | 
 39 | 
 40 | i = 1
 41 | 
 42 | treat_response_studies <- c(CTRPv2,PRISM,GDSC2,CCLE,gCSI)
 43 | 
 44 | CCLE_drugs<- unique(CCLE@treatmentResponse$info$treatmentid)
 45 | CTRPv2_drugs<- unique(CTRPv2@treatmentResponse$info$treatmentid)
 46 | PRISM_drugs<- unique(PRISM@treatmentResponse$info$treatmentid)
 47 | gCSI_drugs <- unique(gCSI@treatmentResponse$info$treatmentid)
 48 | GDSC2_drugs <- unique(GDSC2@treatmentResponse$info$treatmentid)
 49 | 
 50 | all_drugs <- union(union(union(union(gCSI_drugs,GDSC2_drugs),CTRPv2_drugs),gCSI_drugs),PRISM_drugs)
 51 | 
 52 | auc_df = data.frame(matrix(nrow = length(all_drugs), ncol = length(all_cell_lines))) 
 53 | colnames(auc_df) <- all_cell_lines
 54 | rownames(auc_df) <- all_drugs
 55 | 
 56 | for (treat_response in treat_response_studies){
 57 |   
 58 |   response_to_treatment <- PharmacoGx::summarizeSensitivityProfiles(treat_response,
 59 |                                                                     sensitivity.measure='aac_recomputed',
 60 |                                                                     summary.stat="mean",
 61 |                                                                     verbose=FALSE)
 62 |   response_to_treatment <- as.data.frame(response_to_treatment)
 63 |   response_to_treatment <- response_to_treatment[,colnames(response_to_treatment) %in% all_cell_lines]
 64 |   response_to_treatment <- response_to_treatment[,colSums(is.na(response_to_treatment))<nrow(response_to_treatment)]
 65 |   for (rowenum in 1:nrow(response_to_treatment)){
 66 |     row = response_to_treatment[rowenum,]
 67 |     for (colenum in 1:ncol(row)) {
 68 |       value = response_to_treatment[rowenum,colenum]
 69 |       if (!is.na(value)){
 70 |         drug_name <- rownames(response_to_treatment)[rowenum]
 71 |         sample_name <- colnames(response_to_treatment)[colenum]
 72 |         if (is.na(auc_df[drug_name,sample_name])) {
 73 |           auc_df[drug_name,sample_name] = value
 74 |         } else {
 75 |           auc_df[drug_name,sample_name]  = paste0(auc_df[drug_name,sample_name], "&",value)
 76 |         }
 77 |       }
 78 |     }
 79 |   }
 80 | }
 81 | 
 82 | mean_applied = function(value) {
 83 |   if(!is.na(value)) {
 84 |   if(stringr:::str_detect(value, "&")){
 85 |     avg <- mean(as.numeric(unlist((stringr::str_split(value, "&")[[1]]))))
 86 |     return(avg)
 87 |   }
 88 |     return(value)
 89 |   }
 90 |     return(value)
 91 | }
 92 | 
 93 | auc_df <-apply(auc_df, FUN = mean_applied, MARGIN = c(1,2)) 
 94 | 
 95 | #rank drugs, only include drugs in more than half of cell lines
 96 | library(dplyr)
 97 | auc_df_reduced <- auc_df[,colSums(is.na(auc_df))<nrow(auc_df)]
 98 | auc_df_reduced <- auc_df_reduced[rowSums(is.na(auc_df_reduced))<ncol(auc_df_reduced)/2,]
 99 | auc_df_reduced <- as.data.frame(apply(auc_df_reduced,FUN = as.numeric, MARGIN = c(1,2)))
100 | auc_df_reduced$mean = 0
101 | 
102 | #trimmed mean calculation
103 | for (row in 1:nrow(auc_df_reduced)) {
104 |   mean <- base::mean(t(auc_df_reduced[row,])[,1], trim=0.1, na.rm=TRUE)
105 |   auc_df_reduced[row,]$mean <- mean
106 | }
107 | 
108 | auc_df_reduced <- auc_df_reduced %>% mutate(rank=dense_rank(mean))
109 | 
110 | write.csv(auc_df_reduced, file = "HNSC_hpvneg_drug_AAC.csv")
111 | 
112 | #
113 | drug_ic50summarized <- read.csv("HNSC_hpvneg_drug_AAC.csv")
114 | drug_names <- drug_ic50summarized$X
115 | 
116 | library(rDGIdb)
117 | library(httr)
118 | library(jsonlite)
119 | 
120 | body <- list(drugs = paste(drug_names, collapse = ","))
121 |             
122 | url <- "https://dgidb.org/api/v2/interactions.json"
123 | body <- body[!sapply(body, is.null)]
124 | httr::verbose()
125 | postRequest <- POST(url = url, body = body, encode = 'multipart')
126 | 
127 | text <- content(postRequest, as = "text")
128 | result <- fromJSON(text, simplifyVector = TRUE)
129 | 
130 | #create a dataframe of #drugname, #IC50 value, #genes up- or down-regulated
131 | 
132 | dgidb_ic50 <- data.frame(drugs = drug_ic50summarized$X, IC50_value <- drug_ic50summarized$mean, dgidb_drug_name = 0 ,
133 |                          genes_up = "",
134 |                          genes_down = "")
135 | 
136 | names(dgidb_ic50) <- c("drugs","AUC_mean","dgidb_drug_name","genes_up","genes_down")
137 | dgidb_ic50$genes_up <- as.list(dgidb_ic50$genes_up)
138 | dgidb_ic50$genes_down <- as.list(dgidb_ic50$genes_down)
139 | 
140 | matched_terms <- result$matchedTerms
141 | matched_terms <- as.data.frame(matched_terms)
142 | typeof(matched_terms)
143 | 
144 | #upregulated gene terms
145 | is_enriched <- c("activator", "agonist","inducer","partial agonist","positive modulator","potentiator","stimulator")
146 |   
147 | #downregulated gene terms
148 | is_depeleted <- c("inhibitor", "antagonist","partial antagonist","blocker","inverse agonist","negative modulator","suppressor")
149 |   
150 | truefunc <- function(value) {
151 |   if(length(value > 0)) {
152 |     return(all(value))
153 |   } else {
154 |     return(FALSE)
155 |   }
156 | }
157 | 
158 | for (row in rownames(matched_terms)) {
159 |   row_vals <- matched_terms[row,]
160 |   
161 |   dgidb_name <- row_vals$drugName
162 |   interactions <- as.data.frame(row_vals$interactions)
163 |   
164 |   interactions_deplete <- lapply(interactions$interactionTypes, function(x) any(x %in% is_depeleted))
165 |   interactions_deplete <- lapply(interactions_deplete, truefunc)
166 |   genes_deplete <- interactions[unlist(interactions_deplete),]$geneName
167 |   
168 |   interactions_enrich <- lapply(interactions$interactionTypes, function(x) any(x %in% is_enriched))
169 |   interactions_enrich <- lapply(interactions_enrich, truefunc)
170 |   genes_enrich <- interactions[unlist(interactions_enrich),]$geneName
171 | 
172 |   dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$dgidb_drug_name <- dgidb_name
173 |   if (length(genes_enrich) != 0 ) {
174 |     dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$genes_up<- purrr::reduce(genes_enrich, paste)
175 |   }
176 |   if (length(genes_deplete) != 0 ) {
177 |     dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$genes_down <- purrr::reduce(genes_deplete, paste)
178 |   }
179 | }
180 | 
181 | dgidb_AUC_values <- dgidb_ic50
182 | 
183 | dgidb_AUC_values$genes_up <- lapply(dgidb_AUC_values$genes_up, plyr::ldply)
184 | 
185 | dgidb_AUC_values$genes_up <- lapply(dgidb_AUC_values$genes_up, `[[`, 1)
186 |   
187 | dgidb_AUC_values$genes_down <-  lapply(dgidb_AUC_values$genes_down, plyr::ldply)
188 | 
189 | dgidb_AUC_values$genes_down <- lapply(dgidb_AUC_values$genes_down, `[[`, 1)
190 | 
191 | dgidb_AUC_values$genes_up <- as.character(dgidb_AUC_values$genes_up)
192 | 
193 | dgidb_AUC_values$genes_down <- as.character(dgidb_AUC_values$genes_down)
194 | 
195 | #exact number and AAC mean of drugs identified may be slightly variable based on version of pharmacogx datasets used
196 |                                 
197 | write.csv(dgidb_AUC_values, file = "dgidb_AUC_values.csv")
198 | 
199 | 


--------------------------------------------------------------------------------
/Figure 6/Velocity/differential_splicing.R:
--------------------------------------------------------------------------------
 1 | diff_kinetics <- read.csv(file = "diff_kinetics.csv")
 2 | 
 3 | top_genes_diff_splice <- read.csv(file = "top_genes_diff_splice.csv")
 4 | 
 5 | GOI <- head(top_genes_diff_splice,10000)
 6 | 
 7 | GOI <- right_join(GOI,diff_kinetics, by = "X")
 8 | 
 9 | GOI <- head(GOI,10000)
10 | 
11 | write.csv(GOI, "diff_spliced_genes.csv")
12 | 
13 | GOI <- GOI[GOI$fit_diff_kinetics %in% c("core","edge","transitory"),]
14 | 
15 | GOI <- head(GOI,15)
16 | 
17 | png("diff_spliced_genes.png", units="in", width=6, height=4, res=300, type = "cairo")
18 | ggplot(GOI, aes(x = factor(X, level = c(X)), 
19 |                    y = fit_likelihood, fill = fit_diff_kinetics)) + 
20 |   geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 
21 |   labs(x = "", y = "Splicing fit likelihood", fill = "Sample") +
22 |   scale_fill_manual( values = c("core" = "#4DBBD5FF", "edge" = "#E64B35FF","transitory" = "#F9E076"))+
23 |   ggpubr::rotate_x_text()+ theme(axis.text = element_text(face="bold", size = 11), legend.text = element_text(face="bold"), 
24 |                                  axis.text.y = element_text(face="bold"), axis.title.y = element_text(face="bold")) + ggpubr::rotate_y_text()
25 | dev.off()
26 | 


--------------------------------------------------------------------------------
/Figure 6/Velocity/tissue_position_splitting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 9,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from pathlib import Path, PurePath\n",
 10 |     "import csv\n",
 11 |     "from h5py import File\n",
 12 |     "import os"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 11,
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "barcode_dict = {1:\"sample_1\",2:\"sample_2\",3:\"sample_3\",4:\"sample_4\",5:\"sample_5\",\n",
 22 |     "                6:\"sample_6\",9:\"sample_7\",10:\"sample_8\",\n",
 23 |     "               11:\"sample_9\", 12:\"sample_10\",13:\"sample_11\",14:\"sample_12\"}"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 18,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "#split inital positions csv\n",
 33 |     "with open('coordinates.csv') as fin:    \n",
 34 |     "    csvin = csv.DictReader(fin)\n",
 35 |     "    # Category -> open file lookup\n",
 36 |     "    outputs = {}\n",
 37 |     "    for row in csvin:\n",
 38 |     "        cat = row['Barcodes'].split(\"-\")[1]\n",
 39 |     "# Open a new file and write the header\n",
 40 |     "        if cat not in outputs:\n",
 41 |     "            fout = open(f\"positions_scvelo.csv\", 'a')\n",
 42 |     "            dw = csv.DictWriter(fout, fieldnames=csvin.fieldnames)\n",
 43 |     "            dw.writeheader()\n",
 44 |     "            outputs[cat] = fout, dw\n",
 45 |     "        # Always write the row\n",
 46 |     "        # switch this up etc.\n",
 47 |     "        if barcode_dict[int(cat)] == \"sample_1\":\n",
 48 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_WJFV6:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 49 |     "        elif barcode_dict[int(cat)] == \"sample_2\":\n",
 50 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_IBODB:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 51 |     "        elif barcode_dict[int(cat)] == \"sample_3\":\n",
 52 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_IE2VP:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 53 |     "        elif barcode_dict[int(cat)] == \"sample_4\":\n",
 54 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_4RUJG:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 55 |     "        elif barcode_dict[int(cat)] == \"sample_5\":\n",
 56 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_NOEMP:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 57 |     "        elif barcode_dict[int(cat)] == \"sample_6\":\n",
 58 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_I4FGA:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 59 |     "        elif barcode_dict[int(cat)] == \"sample_7\":\n",
 60 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_30XDV:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 61 |     "        elif barcode_dict[int(cat)] == \"sample_8\":\n",
 62 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_U0GTQ:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 63 |     "        elif barcode_dict[int(cat)] == \"sample_9\":\n",
 64 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_W7L9N:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 65 |     "        elif barcode_dict[int(cat)] == \"sample_10\":\n",
 66 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_A0TQE:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 67 |     "        elif barcode_dict[int(cat)] == \"sample_11\":\n",
 68 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_EZTZD:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 69 |     "        elif barcode_dict[int(cat)] == \"sample_12\":\n",
 70 |     "            row[\"Barcodes\"] = \"possorted_genome_bam_DD1CT:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n",
 71 |     "        outputs[cat][1].writerow(row)\n",
 72 |     "    # Close all the files\n",
 73 |     "    for fout, _ in outputs.values():\n",
 74 |     "        fout.close()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 19,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "barcode_dict = {1:\"sample_1\",2:\"sample_2\",3:\"sample_3\",4:\"sample_4\",5:\"sample_5\",\n",
 84 |     "                6:\"sample_6\",9:\"sample_7\",10:\"sample_8\",\n",
 85 |     "               11:\"sample_9\", 12:\"sample_10\",13:\"sample_11\",14:\"sample_12\"}"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 20,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "import pandas as pd\n",
 95 |     "import scvelo as scv\n",
 96 |     "import numpy as np"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 21,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "with open(\"meta_data.csv\") as fin:    \n",
106 |     "    csvin = csv.DictReader(fin)\n",
107 |     "    # Category -> open file lookup\n",
108 |     "    outputs = {}\n",
109 |     "    for row in csvin:\n",
110 |     "        cat = row['Barcode'].split(\"-\")[1]\n",
111 |     "# Open a new file and write the header\n",
112 |     "        if cat not in outputs:\n",
113 |     "            fout = open(\"scvelo_metadata_headers.csv\", 'a')\n",
114 |     "            dw = csv.DictWriter(fout, fieldnames=csvin.fieldnames)\n",
115 |     "            dw.writeheader()\n",
116 |     "            outputs[cat] = fout, dw\n",
117 |     "        # Always write the row\n",
118 |     "        # switch this up etc.\n",
119 |     "        if barcode_dict[int(cat)] == \"sample_1\":\n",
120 |     "            row[\"Barcode\"] = \"possorted_genome_bam_WJFV6:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
121 |     "        elif barcode_dict[int(cat)] == \"sample_2\":\n",
122 |     "            row[\"Barcode\"] = \"possorted_genome_bam_IBODB:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
123 |     "        elif barcode_dict[int(cat)] == \"sample_3\":\n",
124 |     "            row[\"Barcode\"] = \"possorted_genome_bam_IE2VP:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
125 |     "        elif barcode_dict[int(cat)] == \"sample_4\":\n",
126 |     "            row[\"Barcode\"] = \"possorted_genome_bam_4RUJG:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
127 |     "        elif barcode_dict[int(cat)] == \"sample_5\":\n",
128 |     "            row[\"Barcode\"] = \"possorted_genome_bam_NOEMP:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
129 |     "        elif barcode_dict[int(cat)] == \"sample_6\":\n",
130 |     "            row[\"Barcode\"] = \"possorted_genome_bam_I4FGA:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
131 |     "        elif barcode_dict[int(cat)] == \"sample_7\":\n",
132 |     "            row[\"Barcode\"] = \"possorted_genome_bam_30XDV:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
133 |     "        elif barcode_dict[int(cat)] == \"sample_8\":\n",
134 |     "            row[\"Barcode\"] = \"possorted_genome_bam_U0GTQ:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
135 |     "        elif barcode_dict[int(cat)] == \"sample_9\":\n",
136 |     "            row[\"Barcode\"] = \"possorted_genome_bam_W7L9N:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
137 |     "        elif barcode_dict[int(cat)] == \"sample_10\":\n",
138 |     "            row[\"Barcode\"] = \"possorted_genome_bam_A0TQE:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
139 |     "        elif barcode_dict[int(cat)] == \"sample_11\":\n",
140 |     "            row[\"Barcode\"] = \"possorted_genome_bam_EZTZD:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
141 |     "        elif barcode_dict[int(cat)] == \"sample_12\":\n",
142 |     "            row[\"Barcode\"] = \"possorted_genome_bam_DD1CT:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n",
143 |     "        outputs[cat][1].writerow(row)\n",
144 |     "    # Close all the files\n",
145 |     "    for fout, _ in outputs.values():\n",
146 |     "        fout.close()"
147 |    ]
148 |   }
149 |  ],
150 |  "metadata": {
151 |   "kernelspec": {
152 |    "display_name": "Python 3 (ipykernel)",
153 |    "language": "python",
154 |    "name": "python3"
155 |   },
156 |   "language_info": {
157 |    "codemirror_mode": {
158 |     "name": "ipython",
159 |     "version": 3
160 |    },
161 |    "file_extension": ".py",
162 |    "mimetype": "text/x-python",
163 |    "name": "python",
164 |    "nbconvert_exporter": "python",
165 |    "pygments_lexer": "ipython3",
166 |    "version": "3.9.12"
167 |   }
168 |  },
169 |  "nbformat": 4,
170 |  "nbformat_minor": 4
171 | }
172 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OSCC Spatial Transcriptomics
2 | Code from Arora and Cao et al, 2023. for the analysis of oral cancer spatial transcriptomics data.
3 | 
4 | Please separately cite this code base as: [![DOI](https://zenodo.org/badge/434734032.svg)](https://zenodo.org/badge/latestdoi/434734032)
5 | 


--------------------------------------------------------------------------------