├── Figure 1 ├── Cancer Cell Identification │ └── identification.R ├── Deconvolution │ ├── Deconvolution.R │ └── reference_processing.R └── Numbat │ ├── processing.R │ └── run_numbat.R ├── Figure 2 ├── Core Edge Identification │ └── state_anno.R ├── Correlation Analysis │ └── corplot.R ├── Differential Expression │ └── degs.R ├── IPA │ ├── IPA Upstream regulator.R │ └── IPA cannonical.R └── TF anaysis │ └── pySCENIC_oscc.R ├── Figure 3 ├── Cancer Stem Cell │ └── csc_analysis.R ├── Cell Neighbor Analysis │ └── cell_neighbor_analysis.R └── Cell Signaling │ └── cellchat.R ├── Figure 4 ├── Classifier Training │ └── script.R └── Plotting │ └── analysis.R ├── Figure 5 ├── OSCC survival │ └── oscc_tcga.R ├── Pan-cancer survival │ └── Survival_pan_cancer.R └── Validation │ └── GSE41613_survival.R ├── Figure 6 ├── Dynamo │ ├── dynamo.R │ └── dynamo.ipynb ├── PharmacoDB and DGIdb │ └── phamacogx.R ├── Velocity on Sample │ └── scvelo_sample.ipynb └── Velocity │ ├── differential_splicing.R │ ├── scvelo_analysis.ipynb │ └── tissue_position_splitting.ipynb ├── LICENSE └── README.md /Figure 1/Cancer Cell Identification/identification.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(dplyr) 3 | library(Seurat) 4 | library(patchwork) 5 | load(file = "Seurat_objs.Robj") 6 | 7 | load(file = "nb.Robj") 8 | 9 | samples = c('sample_1','sample_2','sample_3','sample_4','sample_5','sample_6','sample_7','sample_8','sample_9','sample_10','sample_11','sample_12') 10 | for (sample in samples) { 11 | 12 | Seurat_meta <- Seurat_objs[[sample]]@meta.data 13 | Seurat_meta$barcode <- rownames(Seurat_meta) 14 | barcode_id <- strsplit(rownames(Seurat_meta), "-")[[1]][[2]] 15 | p_cnvs <- nb[[sample]]$clone_post 16 | 17 | p_cnvs$cell <- str_replace(p_cnvs$cell, "1", barcode_id) 18 | 19 | Seurat_objs[[sample]]@meta.data <- Seurat_meta %>% 20 | left_join( 21 | p_cnvs, 22 | by = c('barcode' = 'cell') 23 | ) %>% 24 | column_to_rownames("barcode") 25 | 26 | #define cancer cell spots as being either p cnv > 0.99 or deconvolution probability > 0.99 27 | Seurat_objs[[sample]]$is_cancer_cell <- ifelse( (Seurat_objs[[sample]]$p_cnv > 0.99 | Seurat_objs[[sample]]$cancer.cell > 0.99) & Seurat_objs[[sample]]$pathologist_anno == "SCC", "1", "0") 28 | 29 | #define non cancer cell states 30 | Seurat_objs[[sample]]$celltype <- ifelse(Seurat_objs[[sample]]$is_cancer_cell == "1","cancer cell","other celltype") 31 | 32 | meta <- Seurat_objs[[sample]]@meta.data 33 | 34 | noncancer_meta <- meta[c("myofibroblast","B.cell","ecm.myCAF","Intermediate.fibroblast","detox.iCAF","macrophage","endothelial","dendritic.","mast", 35 | "conventional.CD4..T.helper.cells","cytotoxic.CD8..T.","Tregs","cytotoxic.CD8..T.exhausted")] 36 | 37 | #identify the noncancer celltype as the noncancer celltype with the greatest proportion 38 | noncancer_meta <- noncancer_meta %>% mutate(noncancer_celltype=names(.)[max.col(.)]) 39 | noncancer_meta$barcode <- rownames(noncancer_meta) 40 | 41 | Seurat_objs[[sample]]@meta.data <- merge(x = Seurat_objs[[sample]]@meta.data, y = noncancer_meta[ , c("noncancer_celltype","barcode")], all.x=TRUE, by = 0) %>% 42 | column_to_rownames("barcode") 43 | 44 | Seurat_objs[[sample]]@meta.data$final_celltype <- ifelse(Seurat_objs[[sample]]$is_cancer_cell == "1","cancer cell",Seurat_objs[[sample]]@meta.data$noncancer_celltype) 45 | } 46 | 47 | #check all annotations 48 | lapply( 49 | samples, 50 | function(sample) { 51 | SpatialDimPlot(Seurat_objs[[sample]], group.by = "final_celltype", interactive = T) 52 | } 53 | ) %>% 54 | wrap_plots(guides = 'collect') 55 | 56 | annotated_objects <- Seurat_objs 57 | 58 | save(annotated_objects, file = "annotated_objects.Robj") 59 | 60 | #plots for Figure 1 were generated in Figure 2, Core Edge Identification code 61 | -------------------------------------------------------------------------------- /Figure 1/Deconvolution/Deconvolution.R: -------------------------------------------------------------------------------- 1 | library(CARD) 2 | library(Seurat) 3 | library(tidyverse) 4 | library(dplyr) 5 | library(RColorBrewer) 6 | library(ggplot2) 7 | #load all objects 8 | load(file = "Seurat/obj_1.Robj") 9 | load(file = "Seurat/obj_2.Robj") 10 | load(file = "Seurat/obj_3.Robj") 11 | load(file = "Seurat/obj_4.Robj") 12 | load(file = "Seurat/obj_5.Robj") 13 | load(file = "Seurat/obj_6.Robj") 14 | load(file = "Seurat/obj_7.Robj") 15 | load(file = "Seurat/obj_8.Robj") 16 | load(file = "Seurat/obj_9.Robj") 17 | load(file = "Seurat/obj_10.Robj") 18 | load(file = "Seurat/obj_11.Robj") 19 | load(file = "Seurat/obj_12.Robj") 20 | 21 | Seurat_objs <- c(obj_1,obj_2,obj_3,obj_4,obj_5,obj_6,obj_7,obj_8, 22 | obj_9,obj_10,obj_11,obj_12) 23 | 24 | samples <- c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8", 25 | "sample_9","sample_10", "sample_11","sample_12") 26 | 27 | names(Seurat_objs) <- samples 28 | 29 | 30 | 31 | for (sample in Seurat_objs) { 32 | sample@meta.data$sample <- NULL 33 | spatial_count <- sample@assays$Spatial@counts 34 | 35 | spatial_location <- sample@images$tumor@coordinates %>% select(row, col) 36 | names(spatial_location) <- c("x","y") 37 | 38 | if (!identical(colnames(spatial_count), rownames(spatial_location))){ 39 | #ensure that count matrix contains the same spots as the location matrix 40 | spatial_count <- spatial_count[, colnames(spatial_count) %in% rownames(spatial_location)] 41 | } 42 | #match order 43 | spatial_location <- spatial_location[order(match(rownames(spatial_location), colnames(spatial_count))), , drop = FALSE] 44 | 45 | load(file = "puram/puram_data.Robj") 46 | sc_count <- puram_data@assays$RNA@counts 47 | sc_meta <- puram_data@meta.data 48 | 49 | #deconvolute using CARD 50 | CARD_obj = createCARDObject( 51 | sc_count = sc_count, 52 | sc_meta = sc_meta, 53 | spatial_count = spatial_count, 54 | spatial_location = spatial_location, 55 | ct.varname = "cellype_fine", 56 | ct.select = unique(sc_meta$cellype_fine), 57 | sample.varname = "orig.ident", 58 | minCountGene = 100, 59 | minCountSpot = 5) 60 | 61 | CARD_obj = CARD_deconvolution(CARD_object = CARD_obj) 62 | 63 | proportions <- as.data.frame(CARD_obj@Proportion_CARD) 64 | 65 | proportions$dominant_celltype <- apply(proportions, 1, function(x) names(proportions)[which.max(x)]) 66 | 67 | sample <- Seurat::AddMetaData(sample,proportions) 68 | 69 | Seurat_objs[unique(sample@meta.data$sample_id)] <- sample 70 | 71 | } 72 | 73 | # save(Seurat_objs,file = "Seurat_objs.Robj") 74 | # 75 | # load(file = "Seurat_objs.Robj") 76 | 77 | lapply( 78 | Seurat_objs, 79 | function(sample) { 80 | SpatialFeaturePlot(sample,features = "cancer.cell", min.cutoff = 1) 81 | } 82 | ) %>% 83 | wrap_plots(guides = 'collect') 84 | 85 | -------------------------------------------------------------------------------- /Figure 1/Deconvolution/reference_processing.R: -------------------------------------------------------------------------------- 1 | ###load in and process single-cell data from GEO 2 | library(Seurat) 3 | 4 | all_data <- read.delim2("HNSCC_all_data.txt") 5 | 6 | all_data$X <- as.list(sapply(all_data$X , function(x) gsub("\'", "", x))) 7 | 8 | rownames(all_data) <- all_data$X 9 | 10 | all_data$X <- NULL 11 | 12 | meta_data <- all_data[0:5,] 13 | meta_data <- as.data.frame(t(meta_data)) 14 | 15 | expr_data <- all_data[6:23691,] 16 | 17 | seurat_data <- CreateSeuratObject(expr_data, project = "puram_data") 18 | 19 | seurat_data <- AddMetaData(seurat_data,metadata = meta_data) 20 | 21 | library(reshape2) 22 | library(Seurat) 23 | library(ggplot2) 24 | library(tidyverse) 25 | library(tidyr) 26 | #heatmap 27 | 28 | seurat_data <- FindVariableFeatures(seurat_data) 29 | seurat_data <- ScaleData(seurat_data) 30 | seurat_data <- Seurat::RunPCA(seurat_data, verbose = FALSE) %>% 31 | Seurat::RunUMAP(., dims = 1:30, verbose = FALSE)%>% 32 | FindNeighbors(., dims = 1:30)%>% 33 | FindClusters(., resolution = 0.5) 34 | 35 | seurat_data@meta.data$non.cancer.cell.type <- gsub("-","",as.character(seurat_data@meta.data$non.cancer.cell.type)) 36 | 37 | #rename cancer cells 38 | seurat_data@meta.data$non.cancer.cell.type <- gsub("0","Cancer cell",as.character(seurat_data@meta.data$non.cancer.cell.type)) 39 | 40 | Seurat::DimPlot(seurat_data, 41 | group.by = "RNA_snn_res.0.5", 42 | label = TRUE) + Seurat::NoLegend() 43 | 44 | #identify celltypes based on published markers 45 | cluster_ids <- read.csv(file = "cluster_identification.csv") 46 | 47 | Idents(seurat_data) = "RNA_snn_res.0.5" 48 | new.cluster.ids <- cluster_ids$cell_type 49 | names(new.cluster.ids) <- levels(seurat_data) 50 | seurat_data <- RenameIdents(seurat_data, new.cluster.ids) 51 | seurat_data@meta.data$cellype_fine <- Idents(seurat_data) 52 | 53 | colourCount = length(unique(seurat_data@meta.data$cellype_fine)) 54 | library(RColorBrewer) 55 | mycolors <- colorRampPalette(brewer.pal(7, "Set1"))(colourCount) 56 | 57 | #refactor 58 | 59 | seurat_data@meta.data$cellype_fine <- factor(seurat_data@meta.data$cellype_fine, levels = 60 | c("myofibroblast","cancer cell","B cell","ecm-myCAF", 61 | "Intermediate fibroblast","detox-iCAF","macrophage", 62 | "endothelial","dendritic ","mast","conventional CD4+ T-helper cells", 63 | "cytotoxic CD8+ T ","Tregs","cytotoxic CD8+ T exhausted")) 64 | 65 | Idents(seurat_data) = "cellype_fine" 66 | png("puram_corescore_subtype.png",units = "in", res = 300, width = 12, height = 8) 67 | VlnPlot(seurat_data, features = "core_genes1", cols = mycolors) 68 | dev.off() 69 | 70 | png("puram_edgescore_subtype.png",units = "in", res = 300, width = 12, height = 8) 71 | VlnPlot(seurat_data, features = "edge_genes1", cols = mycolors) 72 | dev.off() 73 | 74 | 75 | png("puram_annotated.png",units = "in", res = 300, width = 8, height = 6) 76 | Seurat::DimPlot(seurat_data, 77 | group.by = "cellype_fine", 78 | label = F, cols = mycolors) 79 | dev.off() 80 | puram_data <- seurat_data 81 | save(puram_data, file = "puram_data.Robj") 82 | 83 | -------------------------------------------------------------------------------- /Figure 1/Numbat/processing.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(dplyr) 3 | library(glue) 4 | library(stringr) 5 | library(numbat) 6 | library(ggplot2) 7 | library(patchwork) 8 | 9 | # read in the numbat results and spot coordinates 10 | nb = list() 11 | spots = list() 12 | samples = c('sample_1','sample_2','sample_3','sample_4','sample_5','sample_6','sample_7','sample_8','sample_9','sample_10','sample_11','sample_12') 13 | for (sample in samples) { 14 | nb[[sample]] = Numbat$new(glue('outs/{sample}')) 15 | spots[[sample]] = fread(glue('tp/{sample}/tissue_positions_list.csv')) 16 | colnames(spots[[sample]]) <- c("barcode","in_tissue","array_row","array_col","tissue_scaled_x","tissue_scaled_y") 17 | } 18 | 19 | options(repr.plot.width = 8.5, repr.plot.height = 8, repr.plot.res = 300) 20 | 21 | #plot CNV probability on samples 22 | lapply( 23 | samples, 24 | function(sample) { 25 | 26 | spots[[sample]] %>% 27 | left_join( 28 | nb[[sample]]$clone_post, 29 | by = c('barcode' = 'cell') 30 | ) %>% 31 | filter(in_tissue == 1) %>% 32 | ggplot( 33 | aes(x = array_col, y = array_row) 34 | ) + 35 | geom_point(aes(color = p_cnv), size = 1, alpha = 0.8, pch = 16) + 36 | scale_color_gradient2( 37 | low = 'darkgreen', high = 'red3', mid = 'yellow', 38 | midpoint = 0.5, limits = c(0,1), oob = scales::oob_squish 39 | ) + 40 | theme_bw() + 41 | ggtitle(sample) + 42 | scale_y_reverse() 43 | } 44 | ) %>% 45 | wrap_plots(guides = 'collect') 46 | 47 | 48 | chrom_labeller <- function(chr) { 49 | chr[chr %in% c(19, 21, 22)] = "" 50 | return(chr) 51 | } 52 | 53 | pal = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF") 54 | getPalette = colorRampPalette(pal) 55 | 56 | cnv_colors = c("neu" = "gray", 57 | "neu_up" = "darkgray", "neu_down" = "gray", 58 | "del_up" = "royalblue", "del_down" = "darkblue", 59 | "loh_up" = "darkgreen", "loh_down" = "olivedrab4", 60 | "amp_up" = "red", "amp_down" = "tomato3", 61 | "del_1_up" = "royalblue", "del_1_down" = "darkblue", 62 | "loh_1_up" = "darkgreen", "loh_1_down" = "olivedrab4", 63 | "amp_1_up" = "red", "amp_1_down" = "tomato3", 64 | "del_2_up" = "royalblue", "del_2_down" = "darkblue", 65 | "loh_2_up" = "darkgreen", "loh_2_down" = "olivedrab4", 66 | "amp_2_up" = "red", "amp_2_down" = "tomato3", 67 | "del_up_1" = "royalblue", "del_down_1" = "darkblue", 68 | "loh_up_1" = "darkgreen", "loh_down_1" = "olivedrab4", 69 | "amp_up_1" = "red", "amp_down_1" = "tomato3", 70 | "del_up_2" = "royalblue", "del_down_2" = "darkblue", 71 | "loh_up_2" = "darkgreen", "loh_down_2" = "olivedrab4", 72 | "amp_up_2" = "red", "amp_down_2" = "tomato3", 73 | "bamp" = "salmon", "bdel" = "skyblue", 74 | "amp" = "tomato3", "loh" = "olivedrab4", "del" = "royalblue", 75 | "theta_up" = "darkgreen", "theta_down" = "olivedrab4", 76 | "theta_1_up" = "darkgreen", "theta_1_down" = "olivedrab4", 77 | "theta_2_up" = "darkgreen", "theta_2_down" = "olivedrab4", 78 | "theta_up_1" = "darkgreen", "theta_down_1" = "olivedrab4", 79 | "theta_up_2" = "darkgreen", "theta_down_2" = "olivedrab4", 80 | '0|1' = 'red', '1|0' = 'blue','major' = '#66C2A5', 'minor' = '#FC8D62') 81 | 82 | cnv_labels = names(cnv_colors) %>% 83 | stringr::str_remove_all('_') %>% 84 | stringr::str_to_upper() %>% 85 | stringr::str_replace('UP', '(major)') %>% 86 | stringr::str_replace('DOWN', '(minor)') %>% 87 | stringr::str_replace('LOH', 'CNLoH') %>% 88 | setNames(names(cnv_colors)) 89 | 90 | numbat_plot <- function (segs) { 91 | chrom_labeller <- function(chr) { 92 | chr[chr %in% c(19, 21, 22)] = "" 93 | return(chr) 94 | } 95 | ggplot(segs) + geom_rect(aes(xmin = seg_start, xmax = seg_end, 96 | ymin = -0.5, ymax = 0.5, fill = cnv_state_post)) + theme_void() + 97 | theme(panel.spacing = unit(1, "mm"), strip.background = element_blank(), 98 | strip.text.y = element_text(angle = 0), plot.margin = margin(0, 99 | 0, 0, 0), legend.position = "none") + facet_grid(~CHROM, 100 | space = "free_x", scales = "free") + 101 | theme( 102 | strip.background = element_blank(), 103 | strip.text.x = element_blank() 104 | )+ 105 | scale_fill_manual(values = cnv_colors, labels = cnv_labels, 106 | name = "CN states") + 107 | ggrepel::geom_text_repel(aes(x = (seg_start + 108 | seg_end)/2, y = -0.5, label = str_remove(seg_cons, "\\d+")), 109 | min.segment.length = 0, vjust = 1, hjust = 0, direction = "x", 110 | segment.curvature = -0.2, segment.ncp = 3, segment.angle = 30, 111 | segment.inflect = TRUE, max.overlaps = 3) + scale_y_continuous(expand = expansion(add = c(0.5, 112 | 0))) + scale_x_continuous(expand = expansion(mult = 0.05)) + 113 | guides(fill = "none") 114 | } 115 | 116 | png("numbat_plot.png", units = "in", res = 300, height = 5, width = 10, type = "cairo") 117 | options(repr.plot.width = 14, repr.plot.height = 12, repr.plot.res = 300) 118 | lapply( 119 | samples, 120 | function(sample) { 121 | numbat_plot(nb[[sample]]$segs_consensus) 122 | } 123 | ) %>% 124 | wrap_plots(guides = 'collect', ncol = 1) 125 | dev.off() 126 | -------------------------------------------------------------------------------- /Figure 1/Numbat/run_numbat.R: -------------------------------------------------------------------------------- 1 | module load singularity/3.8.1 2 | 3 | #singularity pull docker://pkharchenkolab/numbat-rbase:latest 4 | 5 | salloc --time 4:59:00 --ntasks=40 --mem-per-cpu=40G 6 | 7 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 8 | --label "sample_1" \ 9 | --samples "sample_1" \ 10 | --bams sample_data/sample_1/possorted_genome_bam.bam \ 11 | --barcodes sample_data/sample_1/barcodes.tsv \ 12 | --outdir "pap/sample_1" \ 13 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 14 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 15 | --paneldir /data/1000G_hg38 \ 16 | --ncores 30 17 | 18 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 19 | --label "sample_2" \ 20 | --samples "sample_2" \ 21 | --bams sample_data/sample_2/possorted_genome_bam.bam \ 22 | --barcodes sample_data/sample_2/barcodes.tsv \ 23 | --outdir "pap/sample_2" \ 24 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 25 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 26 | --paneldir /data/1000G_hg38 \ 27 | --ncores 30 28 | 29 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 30 | --label "sample_3" \ 31 | --samples "sample_3" \ 32 | --bams sample_data/sample_3/possorted_genome_bam.bam \ 33 | --barcodes sample_data/sample_3/barcodes.tsv \ 34 | --outdir "pap/sample_3" \ 35 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 36 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 37 | --paneldir /data/1000G_hg38 \ 38 | --ncores 30 39 | 40 | 41 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 42 | --label "sample_4" \ 43 | --samples "sample_4" \ 44 | --bams sample_data/sample_4/possorted_genome_bam.bam \ 45 | --barcodes sample_data/sample_4/barcodes.tsv \ 46 | --outdir "pap/sample_4" \ 47 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 48 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 49 | --paneldir /data/1000G_hg38 \ 50 | --ncores 30 51 | 52 | 53 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 54 | --label "sample_5" \ 55 | --samples "sample_5" \ 56 | --bams sample_data/sample_5/possorted_genome_bam.bam \ 57 | --barcodes sample_data/sample_5/barcodes.tsv \ 58 | --outdir "pap/sample_5" \ 59 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 60 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 61 | --paneldir /data/1000G_hg38 \ 62 | --ncores 30 63 | 64 | 65 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 66 | --label "sample_6" \ 67 | --samples "sample_6" \ 68 | --bams sample_data/sample_6/possorted_genome_bam.bam \ 69 | --barcodes sample_data/sample_6/barcodes.tsv \ 70 | --outdir "pap/sample_6" \ 71 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 72 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 73 | --paneldir /data/1000G_hg38 \ 74 | --ncores 30 75 | 76 | 77 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 78 | --label "sample_7" \ 79 | --samples "sample_7" \ 80 | --bams sample_data/sample_7/possorted_genome_bam.bam \ 81 | --barcodes sample_data/sample_7/barcodes.tsv \ 82 | --outdir "pap/sample_7" \ 83 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 84 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 85 | --paneldir /data/1000G_hg38 \ 86 | --ncores 30 87 | 88 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 89 | --label "sample_8" \ 90 | --samples "sample_8" \ 91 | --bams sample_data/sample_8/possorted_genome_bam.bam \ 92 | --barcodes sample_data/sample_8/barcodes.tsv \ 93 | --outdir "pap/sample_8" \ 94 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 95 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 96 | --paneldir /data/1000G_hg38 \ 97 | --ncores 30 98 | 99 | 100 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 101 | --label "sample_9" \ 102 | --samples "sample_9" \ 103 | --bams sample_data/sample_9/possorted_genome_bam.bam \ 104 | --barcodes sample_data/sample_9/barcodes.tsv \ 105 | --outdir "pap/sample_9" \ 106 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 107 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 108 | --paneldir /data/1000G_hg38 \ 109 | --ncores 30 110 | 111 | 112 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 113 | --label "sample_10" \ 114 | --samples "sample_10" \ 115 | --bams sample_data/sample_10/possorted_genome_bam.bam \ 116 | --barcodes sample_data/sample_10/barcodes.tsv \ 117 | --outdir "pap/sample_10" \ 118 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 119 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 120 | --paneldir /data/1000G_hg38 \ 121 | --ncores 30 122 | 123 | 124 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 125 | --label "sample_11" \ 126 | --samples "sample_11" \ 127 | --bams sample_data/sample_11/possorted_genome_bam.bam \ 128 | --barcodes sample_data/sample_11/barcodes.tsv \ 129 | --outdir "pap/sample_11" \ 130 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 131 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 132 | --paneldir /data/1000G_hg38 \ 133 | --ncores 30 134 | 135 | 136 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif Rscript /numbat/inst/bin/pileup_and_phase.R \ 137 | --label "sample_12" \ 138 | --samples "sample_12" \ 139 | --bams sample_data/sample_12/possorted_genome_bam.bam \ 140 | --barcodes sample_data/sample_12/barcodes.tsv \ 141 | --outdir "pap/sample_12" \ 142 | --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \ 143 | --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \ 144 | --paneldir /data/1000G_hg38 \ 145 | --ncores 30 146 | 147 | 148 | 149 | #prep h5 files for each sample 150 | library(Seurat) 151 | mtx1 <- Read10X_h5("mtx_files/s1_filtered_feature_bc_matrix.h5") 152 | saveRDS(mtx1, file ="mtx_processed/sample_1/mtx.RDS") 153 | 154 | mtx2 <- Read10X_h5("mtx_files/s2_filtered_feature_bc_matrix.h5") 155 | saveRDS(mtx2, file ="mtx_processed/sample_2/mtx.RDS") 156 | 157 | mtx3 <- Read10X_h5("mtx_files/s3_filtered_feature_bc_matrix.h5") 158 | saveRDS(mtx3, file ="mtx_processed/sample_3/mtx.RDS") 159 | 160 | mtx4 <- Read10X_h5("mtx_files/s4_filtered_feature_bc_matrix.h5") 161 | saveRDS(mtx4, file ="mtx_processed/sample_4/mtx.RDS") 162 | 163 | mtx5 <- Read10X_h5("mtx_files/s5_filtered_feature_bc_matrix.h5") 164 | saveRDS(mtx5, file ="mtx_processed/sample_5/mtx.RDS") 165 | 166 | mtx6 <- Read10X_h5("mtx_files/s6_filtered_feature_bc_matrix.h5") 167 | saveRDS(mtx6, file ="mtx_processed/sample_6/mtx.RDS") 168 | 169 | mtx7 <- Read10X_h5("mtx_files/s7_filtered_feature_bc_matrix.h5") 170 | saveRDS(mtx7, file ="mtx_processed/sample_7/mtx.RDS") 171 | 172 | mtx8 <- Read10X_h5("mtx_files/s8_filtered_feature_bc_matrix.h5") 173 | saveRDS(mtx8, file ="mtx_processed/sample_8/mtx.RDS") 174 | 175 | mtx9 <- Read10X_h5("mtx_files/s9_filtered_feature_bc_matrix.h5") 176 | saveRDS(mtx9, file ="mtx_processed/sample_9/mtx.RDS") 177 | 178 | mtx10 <- Read10X_h5("mtx_files/s10_filtered_feature_bc_matrix.h5") 179 | saveRDS(mtx10, file ="mtx_processed/sample_10/mtx.RDS") 180 | 181 | mtx11 <- Read10X_h5("mtx_files/s11_filtered_feature_bc_matrix.h5") 182 | saveRDS(mtx11, file ="mtx_processed/sample_11/mtx.RDS") 183 | 184 | mtx12 <- Read10X_h5("mtx_files/s12_filtered_feature_bc_matrix.h5") 185 | saveRDS(mtx12, file ="mtx_processed/sample_12/mtx.RDS") 186 | 187 | #gunzip allele counts file 188 | gunzip pap/sample_1/sample_1_allele_counts.tsv.gz 189 | gunzip pap/sample_2/sample_2_allele_counts.tsv.gz 190 | gunzip pap/sample_3/sample_3_allele_counts.tsv.gz 191 | gunzip pap/sample_4/sample_4_allele_counts.tsv.gz 192 | gunzip pap/sample_5/sample_5_allele_counts.tsv.gz 193 | gunzip pap/sample_6/sample_6_allele_counts.tsv.gz 194 | gunzip pap/sample_7/sample_7_allele_counts.tsv.gz 195 | gunzip pap/sample_8/sample_8_allele_counts.tsv.gz 196 | gunzip pap/sample_9/sample_9_allele_counts.tsv.gz 197 | gunzip pap/sample_10/sample_10_allele_counts.tsv.gz 198 | gunzip pap/sample_11/sample_11_allele_counts.tsv.gz 199 | gunzip pap/sample_12/sample_12_allele_counts.tsv.gz 200 | 201 | 202 | salloc --time 4:59:00 --ntasks=40 --mem-per-cpu=40G 203 | #run once for each sample 204 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 205 | 206 | library(numbat) 207 | 208 | count_mat <- readRDS("sample_data/sample_1/mtx.RDS") 209 | allele_df <- read.delim("pap/sample_1/sample_1_allele_counts.tsv") 210 | out = run_numbat( 211 | count_mat, # gene x cell integer UMI count matrix 212 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 213 | allele_df, # allele dataframe generated by pileup_and_phase script 214 | genome = "hg38", 215 | t = 1e-5, 216 | ncores = 4, 217 | plot = TRUE, 218 | out_dir = 'outs/sample_1', 219 | max_entropy = 0.8 220 | ) 221 | 222 | q() 223 | 224 | n 225 | 226 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 227 | 228 | library(numbat) 229 | 230 | count_mat <- readRDS("sample_data/sample_2/mtx.RDS") 231 | allele_df <- read.delim("pap/sample_2/sample_2_allele_counts.tsv") 232 | out = run_numbat( 233 | count_mat, # gene x cell integer UMI count matrix 234 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 235 | allele_df, # allele dataframe generated by pileup_and_phase script 236 | genome = "hg38", 237 | t = 1e-5, 238 | ncores = 4, 239 | plot = TRUE, 240 | out_dir = 'outs/sample_2', 241 | max_entropy = 0.8 242 | ) 243 | 244 | q() 245 | 246 | n 247 | 248 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 249 | 250 | library(numbat) 251 | 252 | count_mat <- readRDS("sample_data/sample_3/mtx.RDS") 253 | allele_df <- read.delim("pap/sample_3/sample_3_allele_counts.tsv") 254 | out = run_numbat( 255 | count_mat, # gene x cell integer UMI count matrix 256 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 257 | allele_df, # allele dataframe generated by pileup_and_phase script 258 | genome = "hg38", 259 | t = 1e-5, 260 | ncores = 4, 261 | plot = TRUE, 262 | out_dir = 'outs/sample_3', 263 | max_entropy = 0.8 264 | ) 265 | 266 | q() 267 | 268 | n 269 | 270 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 271 | 272 | library(numbat) 273 | 274 | count_mat <- readRDS("sample_data/sample_4/mtx.RDS") 275 | allele_df <- read.delim("pap/sample_4/sample_4_allele_counts.tsv") 276 | out = run_numbat( 277 | count_mat, # gene x cell integer UMI count matrix 278 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 279 | allele_df, # allele dataframe generated by pileup_and_phase script 280 | genome = "hg38", 281 | t = 1e-5, 282 | ncores = 4, 283 | plot = TRUE, 284 | out_dir = 'outs/sample_4', 285 | max_entropy = 0.8 286 | ) 287 | 288 | q() 289 | 290 | n 291 | 292 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 293 | 294 | library(numbat) 295 | 296 | count_mat <- readRDS("sample_data/sample_5/mtx.RDS") 297 | allele_df <- read.delim("pap/sample_5/sample_5_allele_counts.tsv") 298 | out = run_numbat( 299 | count_mat, # gene x cell integer UMI count matrix 300 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 301 | allele_df, # allele dataframe generated by pileup_and_phase script 302 | genome = "hg38", 303 | t = 1e-5, 304 | ncores = 4, 305 | plot = TRUE, 306 | out_dir = 'outs/sample_5', 307 | max_entropy = 0.8 308 | ) 309 | 310 | q() 311 | 312 | n 313 | 314 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 315 | 316 | library(numbat) 317 | 318 | count_mat <- readRDS("sample_data/sample_6/mtx.RDS") 319 | allele_df <- read.delim("pap/sample_6/sample_6_allele_counts.tsv") 320 | out = run_numbat( 321 | count_mat, # gene x cell integer UMI count matrix 322 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 323 | allele_df, # allele dataframe generated by pileup_and_phase script 324 | genome = "hg38", 325 | t = 1e-5, 326 | ncores = 4, 327 | plot = TRUE, 328 | out_dir = 'outs/sample_6', 329 | max_entropy = 0.8 330 | ) 331 | 332 | q() 333 | 334 | n 335 | 336 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 337 | 338 | library(numbat) 339 | 340 | count_mat <- readRDS("sample_data/sample_7/mtx.RDS") 341 | allele_df <- read.delim("pap/sample_7/sample_7_allele_counts.tsv") 342 | out = run_numbat( 343 | count_mat, # gene x cell integer UMI count matrix 344 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 345 | allele_df, # allele dataframe generated by pileup_and_phase script 346 | genome = "hg38", 347 | t = 1e-5, 348 | ncores = 4, 349 | plot = TRUE, 350 | out_dir = 'outs/sample_7', 351 | max_entropy = 0.8 352 | ) 353 | 354 | q() 355 | 356 | n 357 | 358 | 359 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 360 | 361 | library(numbat) 362 | 363 | count_mat <- readRDS("sample_data/sample_8/mtx.RDS") 364 | allele_df <- read.delim("pap/sample_8/sample_8_allele_counts.tsv") 365 | out = run_numbat( 366 | count_mat, # gene x cell integer UMI count matrix 367 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 368 | allele_df, # allele dataframe generated by pileup_and_phase script 369 | genome = "hg38", 370 | t = 1e-5, 371 | ncores = 4, 372 | plot = TRUE, 373 | out_dir = 'outs/sample_8', 374 | max_entropy = 0.8 375 | ) 376 | 377 | q() 378 | 379 | n 380 | 381 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 382 | 383 | library(numbat) 384 | 385 | count_mat <- readRDS("sample_data/sample_9/mtx.RDS") 386 | allele_df <- read.delim("pap/sample_9/sample_9_allele_counts.tsv") 387 | out = run_numbat( 388 | count_mat, # gene x cell integer UMI count matrix 389 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 390 | allele_df, # allele dataframe generated by pileup_and_phase script 391 | genome = "hg38", 392 | t = 1e-5, 393 | ncores = 4, 394 | plot = TRUE, 395 | out_dir = 'outs/sample_9', 396 | max_entropy = 0.8 397 | ) 398 | 399 | q() 400 | 401 | n 402 | 403 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 404 | 405 | library(numbat) 406 | 407 | count_mat <- readRDS("sample_data/sample_10/mtx.RDS") 408 | allele_df <- read.delim("pap/sample_10/sample_10_allele_counts.tsv") 409 | out = run_numbat( 410 | count_mat, # gene x cell integer UMI count matrix 411 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 412 | allele_df, # allele dataframe generated by pileup_and_phase script 413 | genome = "hg38", 414 | t = 1e-5, 415 | ncores = 4, 416 | plot = TRUE, 417 | out_dir = 'outs/sample_10', 418 | max_entropy = 0.8 419 | ) 420 | 421 | q() 422 | 423 | n 424 | 425 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 426 | 427 | library(numbat) 428 | 429 | count_mat <- readRDS("sample_data/sample_11/mtx.RDS") 430 | allele_df <- read.delim("pap/sample_11/sample_11_allele_counts.tsv") 431 | out = run_numbat( 432 | count_mat, # gene x cell integer UMI count matrix 433 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 434 | allele_df, # allele dataframe generated by pileup_and_phase script 435 | genome = "hg38", 436 | t = 1e-5, 437 | ncores = 4, 438 | plot = TRUE, 439 | out_dir = 'outs/sample_11', 440 | max_entropy = 0.8 441 | ) 442 | 443 | q() 444 | 445 | n 446 | 447 | singularity exec --home /work/bose_lab/Rohit/numbat numbat-rbase_latest.sif R 448 | 449 | library(numbat) 450 | 451 | count_mat <- readRDS("sample_data/sample_12/mtx.RDS") 452 | allele_df <- read.delim("pap/sample_12/sample_12_allele_counts.tsv") 453 | out = run_numbat( 454 | count_mat, # gene x cell integer UMI count matrix 455 | ref_hca, # reference expression profile, a gene x cell type normalized expression level matrix 456 | allele_df, # allele dataframe generated by pileup_and_phase script 457 | genome = "hg38", 458 | t = 1e-5, 459 | ncores = 4, 460 | plot = TRUE, 461 | out_dir = 'outs/sample_12', 462 | max_entropy = 0.8 463 | ) 464 | 465 | q() 466 | 467 | n 468 | 469 | -------------------------------------------------------------------------------- /Figure 2/Core Edge Identification/state_anno.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(harmony) 3 | library(cowplot) 4 | library(RColorBrewer) 5 | library(ggplot2) 6 | load(file = "annotated_objects.Robj") 7 | 8 | all_spots <- purrr::reduce(annotated_objects, merge) 9 | 10 | DefaultAssay(all_spots) <- "Spatial" 11 | 12 | #integration by sample 13 | 14 | obj_list <- SplitObject(all_spots, split.by = "sample_id") 15 | rm(list=setdiff(ls(), "obj_list")) 16 | 17 | obj_list <- lapply(X = obj_list, FUN = function(x) { 18 | x <- NormalizeData(x) 19 | x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000) 20 | }) 21 | 22 | anchors <- FindIntegrationAnchors(object.list = obj_list, dims = 1:30) 23 | 24 | comb_as <- IntegrateData(anchorset = anchors, dims = 1:30) 25 | comb_as <- ScaleData(comb_as, verbose = FALSE) 26 | comb_as <- RunPCA(comb_as, npcs = 50, verbose = FALSE) 27 | 28 | png("elbowplot_all.png",units = "in", res = 300, width = 6, height = 4, type = "cairo") 29 | ElbowPlot(comb_as, ndims = 25) 30 | dev.off() 31 | 32 | comb_as <- RunUMAP(comb_as, reduction = "pca", dims = 1:15) 33 | comb_as <- FindNeighbors(comb_as, reduction = "pca", dims = 1:15) 34 | #comb_as <- FindClusters(comb_as, resolution = 0.5) 35 | 36 | DefaultAssay(comb_as) <- "integrated" 37 | 38 | save(comb_as, file = "comb_as.Robj") 39 | 40 | comb_as <- RunPCA(comb_as, npcs = 6, verbose = TRUE, seed.use = 12) 41 | comb_as <- RunUMAP(comb_as, reduction = "pca", dims = 1:6, seed.use = 12) 42 | comb_as <- FindNeighbors(comb_as, reduction = "pca", dims = 1:6, seed.use = 12) 43 | 44 | #plot for pathologist annotations used in Figure 1 45 | png("all_samples_patho.png",units = "in", res = 300, width = 6, height = 4, type = "cairo") 46 | DimPlot(comb_as, reduction = "umap", group.by = "pathologist_anno", 47 | cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 48 | "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 49 | "Artifact" = "grey", "Artery/Vein" = "orange", 50 | "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple", 51 | "Glandular Stroma" = "violet")) 52 | dev.off() 53 | 54 | #plot for cancer cell identification by pcnv used in Figure 1 55 | png("all_samples_pcnv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 56 | FeaturePlot(comb_as,features = "p_cnv", min.cutoff = 0.99, pt.size = 0.2,cols = c("grey", "#D9381E"), 57 | max.cutoff = 1) 58 | dev.off() 59 | 60 | #plot for cancer cell identification by deconvolution used in Figure 1 61 | png("all_samples_deconv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 62 | FeaturePlot(comb_as,features = "cancer.cell", min.cutoff = 0.99, pt.size = 0.2,cols = c("grey", "#DC143C"), 63 | max.cutoff = 1) 64 | dev.off() 65 | 66 | colourCount = length(unique(comb_as$final_celltype)) 67 | 68 | #plot for celltype used in Figure 1. 69 | png("all_samples_final_celltype.png",units = "in", res = 300, width = 6.5, height = 5, type = "cairo") 70 | DimPlot(comb_as, reduction = "umap", group.by = "final_celltype", 71 | cols = c("B.cell" = cols[1], "cancer cell"= cols[2], "cytotoxic.CD8..T." = cols[3], 72 | "cytotoxic.CD8..T.exhausted" = cols[4], 73 | "dendritic." = cols[5], 74 | "detox.iCAF" = cols[6], 75 | "ecm.myCAF" = cols[7], 76 | "endothelial" = cols[8], 77 | "Intermediate.fibroblast" = cols[9], 78 | "macrophage" = cols[10], 79 | "mast" = cols[11], 80 | "myofibroblast" = cols[12], 81 | "Tregs" = cols[13], 82 | "NA" = cols[14])) 83 | dev.off() 84 | 85 | #plot for cancer cell identification used in figure 1 86 | png("all_samples_iscancercell.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 87 | DimPlot(comb_as, reduction = "umap", group.by = "is_cancer_cell",cols = c("grey", "red")) 88 | dev.off() 89 | 90 | #example plot for sample 1 of pathologist annotations used in Figure 1 91 | png("sample_1_patho_anno.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 92 | SpatialDimPlot(obj_list[[1]], group.by = "pathologist_anno", images = "tumor", image.alpha = 0.5, 93 | stroke = 0, 94 | cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 95 | "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 96 | "Artifact" = "grey", "Artery/Vein" = "orange", 97 | "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple", 98 | "Glandular Stroma" = "violet")) + NoLegend() 99 | dev.off() 100 | 101 | #example plot for pcnv for sample 1 used in Figure 1 102 | png("sample_1_pcnv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 103 | SpatialFeaturePlot(obj_list[[1]],features = "p_cnv", images = "tumor", min.cutoff = 0.99,stroke = 0, 104 | image.alpha = 0.5) + 105 | scale_fill_gradient(low = "grey", high = "#D9381E")+ NoLegend() 106 | dev.off() 107 | 108 | #example plot for deconvolution for sample 1 used in Figure 1 109 | png("sample_1_deconv.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 110 | SpatialFeaturePlot(obj_list[[1]],features = "cancer.cell", images = "tumor", min.cutoff = 0.99, 111 | stroke = 0, 112 | image.alpha = 0.5) + 113 | scale_fill_gradient(low = "grey", high = "#DC143C")+ NoLegend() 114 | dev.off() 115 | 116 | #example plot for cancer cell status for sample 1 used in Figure 1 117 | png("sample_1_iscancercell.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 118 | SpatialDimPlot(obj_list[[1]], group.by = "is_cancer_cell", images = "tumor",image.alpha = 0.5, 119 | 120 | stroke = 0, 121 | cols = c("0" = "grey","1" = "red")) + NoLegend() 122 | dev.off() 123 | 124 | cols = colorRampPalette(brewer.pal(9, "Set1"))(colourCount) 125 | #example plot for final celltype for sample 1 used in Figure 1 126 | png("sample_1_final_celltype.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 127 | SpatialDimPlot(annotated_objects[[1]], group.by = "final_celltype", images = "tumor",image.alpha = 0.5, 128 | stroke = 0, 129 | cols = c("B.cell" = cols[1], "cancer cell"= cols[2], "cytotoxic.CD8..T." = cols[3], 130 | "cytotoxic.CD8..T.exhausted" = cols[4], 131 | "detox.iCAF" = cols[6], 132 | "ecm.myCAF" = cols[7], 133 | "endothelial" = cols[8], 134 | "Intermediate.fibroblast" = cols[9], 135 | "macrophage" = cols[10], 136 | "mast" = cols[11], 137 | "myofibroblast" = cols[12], 138 | "Tregs" = cols[13]))+ NoLegend() 139 | 140 | #plot proportions of celltypes for supplementary fig 1 141 | all_data <- all_spots@meta.data 142 | 143 | all_data[is.na(all_data)] <- "unknown" 144 | colourCount = length(unique(all_data$final_celltype)) 145 | 146 | library(RColorBrewer) 147 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8", 148 | "sample_9","sample_10", "sample_11","sample_12")) 149 | png("celltype_bar.png",units = "in", res = 300, width = 7, height = 5, type = "cairo") 150 | ggplot(all_data, aes(x = sample_id, fill = final_celltype)) + 151 | geom_bar(position="fill")+ 152 | theme_bw() + 153 | labs(x=c("spatial annotation"), y="Proportion")+ 154 | theme(panel.grid.major = element_blank(), 155 | panel.grid.minor = element_blank(), 156 | panel.border = element_blank(), panel.background = element_blank()) + 157 | guides(fill = guide_legend(title = 'cell type')) + 158 | scale_fill_manual(values = colorRampPalette(brewer.pal(9, "Set1"))(colourCount))+ 159 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 160 | dev.off() 161 | 162 | #plot proportions of pathologist regions for supplementary fig 1 163 | all_data <- all_spots@meta.data 164 | 165 | all_data[is.na(all_data)] <- "unknown" 166 | colourCount = length(unique(all_data$pathologist_anno)) 167 | 168 | library(RColorBrewer) 169 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8", 170 | "sample_9","sample_10", "sample_11","sample_12")) 171 | png("celltype_bar_pathologist_anno.png",units = "in", res = 300, width = 7, height = 5, type = "cairo") 172 | ggplot(all_data, aes(x = sample_id, fill = pathologist_anno)) + 173 | geom_bar(position="fill")+ 174 | theme_bw() + 175 | labs(x=c("spatial annotation"), y="Proportion")+ 176 | theme(panel.grid.major = element_blank(), 177 | panel.grid.minor = element_blank(), 178 | panel.border = element_blank(), panel.background = element_blank()) + 179 | guides(fill = guide_legend(title = 'pathologist annotations')) + 180 | scale_fill_manual(values = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 181 | "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 182 | "Artifact" = "grey", "Artery/Vein" = "orange", 183 | "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple","Glandular Stroma" = "violet"))+ 184 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 185 | dev.off() 186 | 187 | #plot dimplot of pathologist region for each sample and cancer cell annotation for each sample for supplementary fig 1 188 | for(object in annotated_objects) { 189 | #image = paste0("tumor.",as.numeric(stringr::str_split(unique(object@meta.data$sample_id),"_")[[1]][2])-1) 190 | png(paste0("pathologist_anno/",unique(object@meta.data$sample_id),"patho_anno.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo") 191 | print(SpatialDimPlot(object, group.by = "pathologist_anno", images = "tumor", image.alpha = 0.5, 192 | stroke = 0, 193 | cols = c("SCC" = "maroon", "Lymphocyte Negative Stroma" = "darkblue", 194 | "Lymphocyte Positive Stroma" = "lightblue", "Keratin" = "lightgreen", "Fold" = "grey", 195 | "Artifact" = "grey", "Artery/Vein" = "orange", 196 | "Muscle" = "lightyellow","Non-cancerous Mucosa" = "purple", 197 | "Glandular Stroma" = "violet")) + NoLegend()) 198 | dev.off() 199 | 200 | png(paste0("cancer_cell_anno/",unique(object@meta.data$sample_id),"iscancercell.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo") 201 | print(SpatialDimPlot(object, group.by = "is_cancer_cell", images = "tumor",image.alpha = 0.5, 202 | stroke = 0, 203 | cols = c("0" = "grey","1" = "red")) + NoLegend()) 204 | dev.off() 205 | 206 | } 207 | 208 | all_data <- comb_as@meta.data 209 | 210 | #Only unknown spots are in sample 3 and this is because there are a few low quality spots that are not on the tissue section 211 | #Due to this, the "unknown status" will be assumed to be noncancer 212 | View(all_data$is_cancer_cell[is.na(all_data$is_cancer_cell)]) 213 | all_data[is.na(all_data)] <- "0" 214 | all_data$sample_id <- factor(all_data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8", 215 | "sample_9","sample_10", "sample_11","sample_12")) 216 | 217 | png("malignant_barplot.png",units = "in", res = 300, width = 7, height = 5, type = "cairo") 218 | ggplot(all_data, aes(x = sample_id, fill = is_cancer_cell)) + 219 | geom_bar(position="fill")+ 220 | theme_bw() + 221 | labs(x=c("spatial annotation"), y="Proportion")+ 222 | theme(panel.grid.major = element_blank(), 223 | panel.grid.minor = element_blank(), 224 | panel.border = element_blank(), panel.background = element_blank()) + 225 | guides(fill = guide_legend(title = 'malignant status')) + 226 | scale_fill_manual(values = c("0" = "grey","1" = "red"))+ 227 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 228 | dev.off() 229 | 230 | comb_d2c <- comb 231 | DefaultAssay(comb_d2c) <- "Spatial" 232 | 233 | comb_d2c@assays$SCT = NULL 234 | 235 | comb_d2c$cluster_annotations <- as.character(comb_d2c$cluster_annotations) 236 | SaveH5Seurat(comb_d2c, filename = "comb_d2c.h5Seurat") 237 | Convert("comb_d2c.h5Seurat", dest = "h5ad") 238 | 239 | #integrate only cancer cells together 240 | all_spots_c <- subset(all_spots, subset = final_celltype == "cancer cell") 241 | 242 | DefaultAssay(all_spots_c) <- "Spatial" 243 | #integration 244 | obj_list <- SplitObject(all_spots_c, split.by = "sample_id") 245 | rm(list=setdiff(ls(), "obj_list")) 246 | obj_list <- lapply(X = obj_list, FUN = function(x) { 247 | x <- NormalizeData(x) 248 | x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000) 249 | }) 250 | 251 | anchors <- FindIntegrationAnchors(object.list = obj_list, dims = 1:30) 252 | comb <- IntegrateData(anchorset = anchors, dims = 1:30) 253 | comb <- ScaleData(comb, verbose = FALSE) 254 | comb <- RunPCA(comb, npcs = 50, verbose = FALSE) 255 | comb <- RunUMAP(comb, reduction = "pca", dims = 1:50) 256 | 257 | ElbowPlot(comb, ndims = 25) 258 | 259 | comb <- RunUMAP(comb, reduction = "pca", dims = 1:10) 260 | comb <- FindNeighbors(comb, reduction = "pca", dims = 1:10) 261 | 262 | save(comb, file = "comb.Robj") 263 | 264 | library(Seurat) 265 | 266 | load(file = "comb.Robj") 267 | 268 | comb <- Seurat::FindClusters(object = comb, resolution = 1) 269 | # pull the tree 270 | data.tree <- Tool(object = comb, slot = "BuildClusterTree") 271 | 272 | #plot of clusters for fig 2 273 | png("cluster_res1.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 274 | Seurat::DimPlot(comb) 275 | dev.off() 276 | 277 | #plot of phylogenetic tree for fig 2 278 | png("cluster_res1_phylo.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 279 | ape::plot.phylo(x = data.tree, direction = "right") 280 | dev.off() 281 | 282 | comb <- Seurat::FindClusters(object = comb, resolution = 1) 283 | 284 | trial_ids <- c("n3", "n2","n3","n2","n3", "n1","n3","n3","n2","n1","n3","n2","n2","n2","n1") 285 | 286 | names(trial_ids) <- levels(comb) 287 | 288 | comb <- RenameIdents(comb, trial_ids) 289 | 290 | library(SCpubr) 291 | library(ggplot2) 292 | 293 | #plot of nodes for Fig 2 294 | png("nodal_definitions.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 295 | DimPlot(comb, reduction = "umap", label = FALSE, pt.size = 0.5, 296 | cols = c("n1" = "#4DBBD5FF", 297 | "n2" = "#F9E076", "n3" = "#E64B35FF")) 298 | dev.off() 299 | 300 | load(file = "comb.Robj") 301 | #heatmap of top markers 302 | DefaultAssay(comb) <- "integrated" 303 | 304 | comb <- NormalizeData(comb) 305 | top_markers <- FindAllMarkers(comb, only.pos = TRUE) 306 | 307 | #plot of top markers in each nodal group 308 | png("logfc_top_markers.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 309 | SCpubr::do_GroupwiseDEPlot(sample = comb, 310 | de_genes = tibble::tibble(top_markers), 311 | top_genes = 5) 312 | dev.off() 313 | 314 | write.csv(top_markers, file = "node_markers.csv") 315 | 316 | #nebulosa plot of validated markers for fig 2 317 | png("nebulosa_CLDN4.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo") 318 | SCpubr::do_NebulosaPlot(comb, "CLDN4") 319 | dev.off() 320 | 321 | png("nebulosa_SPRR1B.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo") 322 | SCpubr::do_NebulosaPlot(comb, "SPRR1B") 323 | dev.off() 324 | 325 | png("nebulosa_LAMC2.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo") 326 | SCpubr::do_NebulosaPlot(comb, "LAMC2") 327 | dev.off() 328 | 329 | png("nebulosa_ITGA5.png",units = "in", res = 300, width = 4.5, height = 5, type = "cairo") 330 | SCpubr::do_NebulosaPlot(comb, "ITGA5") 331 | dev.off() 332 | 333 | comb <- Seurat::FindClusters(object = comb, resolution = 1) 334 | 335 | trial_ids <- c("edge", "transitory","edge","transitory","edge", "core","edge","edge","transitory","core","edge","transitory","transitory","transitory","core") 336 | 337 | names(trial_ids) <- levels(comb) 338 | 339 | comb <- RenameIdents(comb, trial_ids) 340 | 341 | library(SCpubr) 342 | library(ggplot2) 343 | 344 | #renaming regions as core and edge 345 | png("core_edge_transitory_anno.png",units = "in", res = 300, width = 5.5, height = 4, type = "cairo") 346 | DimPlot(comb, reduction = "umap", label = FALSE, pt.size = 0.5, 347 | cols = c("core" = "#4DBBD5FF", 348 | "transitory" = "#F9E076", "edge" = "#E64B35FF")) 349 | dev.off() 350 | 351 | ##plot of clonotypes by state for supplementary fig 3 352 | library(ggplot2) 353 | library(RColorBrewer) 354 | comb@meta.data$core_edge_anno 355 | meta_df <- comb@meta.data 356 | meta_df$clone_opt <- as.factor(meta_df$clone_opt) 357 | 358 | png("cancer_cell_clone_byregion.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 359 | ggplot(meta_df, aes(x = core_edge_anno, fill = clone_opt)) + 360 | geom_bar(position="fill")+ 361 | facet_wrap(~sample_id)+ 362 | theme_bw() + 363 | labs(x=c("sample ID"), y="Proportion of each clone in cancer cell regions")+ 364 | theme(panel.grid.major = element_blank(), 365 | panel.grid.minor = element_blank(), 366 | panel.border = element_blank(), panel.background = element_blank()) + 367 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 368 | scale_fill_manual(values = colorRampPalette(brewer.pal(6, "Set2"))(6)) 369 | dev.off() 370 | 371 | # top_markers <- FindAllMarkers(comb, only.pos = TRUE) 372 | # write.csv(top_markers, file = "node_markers.csv") 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /Figure 2/Correlation Analysis/corplot.R: -------------------------------------------------------------------------------- 1 | library(reshape2) 2 | library(patchwork) 3 | library(ggplot2) 4 | library(dplyr) 5 | library(Seurat) 6 | library(RColorBrewer) 7 | library(sctransform) 8 | 9 | load(file = "comb.Robj") 10 | 11 | Idents(comb) = "cluster_annotations" 12 | comb_core_edge <- subset(comb, idents = c("core","edge")) 13 | 14 | comb_core_edge@meta.data$sample_group <- paste(comb_core_edge@meta.data$sample_id, 15 | comb_core_edge@meta.data$cluster_annotations, sep = "_") 16 | 17 | object <- comb_core_edge 18 | Idents(object) = "sample_group" 19 | object@meta.data$sample_group <- factor(object@meta.data$sample_group, levels = c( 20 | "sample_1_edge", 21 | "sample_2_edge", 22 | "sample_3_edge", 23 | "sample_4_edge", 24 | "sample_5_edge", 25 | "sample_6_edge", 26 | "sample_7_edge", 27 | "sample_8_edge", 28 | "sample_9_edge", 29 | "sample_10_edge", 30 | "sample_11_edge", 31 | "sample_12_edge", 32 | "sample_1_core", 33 | "sample_2_core", 34 | "sample_3_core", 35 | "sample_4_core", 36 | "sample_5_core", 37 | "sample_6_core", 38 | "sample_7_core", 39 | "sample_8_core", 40 | "sample_9_core", 41 | "sample_10_core", 42 | "sample_11_core", 43 | "sample_12_core")) 44 | Idents(object) = "sample_group" 45 | 46 | av.exp <- AverageExpression(object)$SCT 47 | av.exp <- na.omit(av.exp) 48 | cor.exp <- as.data.frame(cor(av.exp)) 49 | cor.exp$x <- rownames(cor.exp) 50 | cor.df <- tidyr::gather(data = cor.exp, y, correlation, unique(comb_core_edge@meta.data$sample_group)) 51 | 52 | setwd("corplot/") 53 | 54 | group = "cluster_annotations" 55 | cor.exp$x = NULL 56 | cor.exp <- as.matrix(cor.exp) 57 | cols <- RColorBrewer::brewer.pal(11, "Spectral") 58 | cols <- rev(cols) 59 | 60 | group1 <- unique(object@meta.data[[group]])[1] 61 | group2 <- unique(object@meta.data[[group]])[2] 62 | 63 | 64 | cor.exp2 = data.frame(group = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 65 | rep(group2, length(unique(object@meta.data$sample_group))/2))) 66 | 67 | 68 | list1 = list(group = c(group1 = '#E64B35FF', 69 | group2 = '#4DBBD5FF')) 70 | 71 | list2 = list(setNames(list1$group, c(group1, group2))) 72 | 73 | names(list2) = "group" 74 | library(ComplexHeatmap) 75 | ha = HeatmapAnnotation(df = cor.exp2, col = list2) 76 | hab = rowAnnotation(df = cor.exp2, col = list2) 77 | 78 | #generate correlation plot for Figure 2 79 | png("heatmap_corr.png",units = "in", res = 300, width = 10, height = 9,type = "cairo") 80 | 81 | print(ComplexHeatmap::Heatmap(cor.exp, col = cols, bottom_annotation = ha, 82 | right_annotation = hab, column_split = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 83 | rep(group2, length(unique(object@meta.data$sample_group))/2)), 84 | 85 | row_split = c(rep(group1, length(unique(object@meta.data$sample_group))/2), 86 | rep(group2, length(unique(object@meta.data$sample_group))/2)), 87 | cluster_rows = T, 88 | cluster_columns = T 89 | )) 90 | dev.off() 91 | -------------------------------------------------------------------------------- /Figure 2/Differential Expression/degs.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(dplyr) 3 | library(tibble) 4 | library(cowplot) 5 | library(patchwork) 6 | load(file = "annotated_objects.Robj") 7 | load(file = "comb.Robj") 8 | 9 | comb_meta <- comb@meta.data 10 | #transfer annotations of core/transitory/edge back to objects with spatial positions 11 | for (object in annotated_objects) { 12 | meta <- object@meta.data 13 | 14 | meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names") 15 | 16 | meta_sample_id$cluster_annotations <- factor(meta_sample_id$cluster_annotations, levels = c("core","edge","transitory","nc")) 17 | meta_sample_id$cluster_annotations[is.na(meta_sample_id$cluster_annotations)] <- "nc" 18 | 19 | annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id 20 | } 21 | 22 | #barplot of state for core transtiroy adn edge regions 23 | png("core_transitory_edge/barplot.png",units = "in", res = 300, width = 5, height = 5, type = "cairo") 24 | ggplot(comb@meta.data, aes(x = factor(comb@meta.data$sample_id, levels = c("sample_1","sample_2","sample_3","sample_4","sample_5","sample_6","sample_7","sample_8", 25 | "sample_9","sample_10", "sample_11","sample_12")), fill = cluster_annotations)) + 26 | geom_bar(position="fill")+ 27 | theme_bw() + 28 | labs(x=c("spatial annotation"), y="Proportion")+ 29 | theme(panel.grid.major = element_blank(), 30 | panel.grid.minor = element_blank(), 31 | panel.border = element_blank(), panel.background = element_blank()) + 32 | guides(fill = guide_legend(title = 'cancer cell state')) + 33 | scale_fill_manual(values = c('edge' = '#E64B35FF', 34 | 'transitory' = '#F9E076', 35 | 'core' = '#4DBBD5FF'))+ 36 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 37 | 38 | dev.off() 39 | 40 | #core and edge annoations for each sample (used in Figure 2) 41 | for(object in annotated_objects) { 42 | png(paste0("core_transitory_edge/",unique(object@meta.data$sample_id.x),"core_edge_anno.png"),units = "in", res = 300, width = 5, height = 5, type = "cairo") 43 | print(SpatialDimPlot(object, group.by = "cluster_annotations", images = "tumor",image.alpha = 0.5, 44 | stroke = 0, 45 | cols = c('edge' = '#E64B35FF', 46 | 'transitory' = '#F9E076', 47 | 'core' = '#4DBBD5FF')) + NoLegend()) 48 | dev.off() 49 | 50 | } 51 | 52 | # lapply( 53 | # annotated_objects, 54 | # function(sample) { 55 | # SpatialDimPlot(sample,group.by = "cluster_annotations") 56 | # } 57 | # ) %>% 58 | # wrap_plots(guides = 'collect') 59 | 60 | all_samples <- annotated_objects 61 | 62 | save(all_samples, file = "all_samples.Robj") 63 | 64 | load(file = "all_samples.Robj") 65 | 66 | #### DEG analysis 67 | 68 | #define colours 69 | patient_col_dict <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3", 70 | "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 71 | "sample_7" = "#F7CE5B", 72 | "sample_8" = "#7CEA9C","sample_9" = "#7C90A0", 73 | "sample_10" = "#DCCCFF","sample_11" = "#EE6055", 74 | "sample_12" = "#C492B1") 75 | markers_list = c() 76 | i = 1 77 | for (Seurat_obj in all_samples) { 78 | 79 | Idents(Seurat_obj) <- "cluster_annotations" 80 | #scale and normalize each sample individually 81 | Seurat_obj <- ScaleData(Seurat_obj) 82 | Seurat_obj <- NormalizeData(Seurat_obj) 83 | sub_obj <- subset(Seurat_obj, idents = c("core","edge")) 84 | 85 | #core is positive, edge is negative 86 | results <- FindMarkers(sub_obj,ident.1 = "core",ident.2 = "edge", group.by = "cluster_annotations")%>% 87 | tibble::rownames_to_column(var = "gene")%>% 88 | tibble::add_column("sample_id.x" = unique(sub_obj@meta.data$sample_id.x)) 89 | 90 | write.csv(results, file = paste("differential_expression/raw_csvs/",unique(sub_obj@meta.data$sample_id.x),"_raw_deg.csv", sep = "")) 91 | markers_list[[i]] <- results 92 | i = i+1 93 | } 94 | 95 | #code adapted to make consensus plots 96 | library(dplyr) 97 | 98 | markers_list_saved <- markers_list 99 | 100 | markers.list <- plyr::ldply(markers_list, data.frame) 101 | 102 | #pval cutoff of 0.001 103 | 104 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.001,] 105 | 106 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC", "sample_id.x")] 107 | 108 | markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id.x, value.var = "avg_log2FC") 109 | 110 | # markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id.x, value.var = "log2FC") 111 | 112 | markers.list.mat[is.na(markers.list.mat)] = 0 113 | markers.list.mat <- markers.list.mat %>% tibble::column_to_rownames(var = "gene") 114 | 115 | markers.matrix = markers.list.mat 116 | 117 | markers.matrix.m <- markers.matrix %>% tibble::rownames_to_column(var = "gene") 118 | markers.matrix.m <- reshape2::melt(markers.matrix.m) 119 | order.markers <- rownames(markers.matrix[order(rowSums(-markers.matrix)),]) 120 | markers.sum <- as.data.frame(ifelse(markers.matrix != 0, 1, 0)) 121 | markers.sum$total <- rowSums(markers.sum) 122 | #must be differnetially expressed in at least 9 samples 123 | keep <- rownames(markers.sum[markers.sum$total>9,]) 124 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% keep,] 125 | 126 | markers_table <- reshape2::dcast(markers.matrix.m, formula = gene~variable, value.var = "value") 127 | markers_table <- column_to_rownames(markers_table, "gene") 128 | markers_table$sum <- rowSums(markers_table) 129 | write.csv(markers_table, file = "markers_table.csv") 130 | 131 | core_genes <- markers.matrix.m[markers.matrix.m$value > 0,] 132 | edge_genes <- markers.matrix.m[markers.matrix.m$value < 0,] 133 | 134 | core_genes <- unique(core_genes$gene) 135 | edge_genes <- unique(edge_genes$gene) 136 | 137 | saveRDS(core_genes,"gene_lists/core_genes.RDS") 138 | saveRDS(edge_genes,"gene_lists/edge_genes.RDS") 139 | 140 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% union(edge_genes,core_genes),] 141 | 142 | setwd("differential_expression/") 143 | #create consensus plot 144 | png("differentially_expressed_genes_sct_seurat.png", units="in", width=20, height=7, res=300, type = "cairo") 145 | ggplot(markers.matrix.m, aes(x = factor(gene, level = order.markers), 146 | y = value, fill = variable)) + 147 | geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 148 | scale_fill_manual( values = patient_col_dict)+ 149 | labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id.x") + 150 | ggpubr::rotate_x_text() 151 | dev.off() 152 | 153 | #only plot top 25 genes for Figure 2 154 | topweights <-aggregate(value~gene,markers.matrix.m,sum) 155 | 156 | topweights <- topweights[order(-topweights$value),] 157 | 158 | markers_top_25 <- markers.matrix.m[markers.matrix.m$gene %in% c(head(topweights,25)$gene,tail(topweights,25)$gene),] 159 | 160 | png("deg_top25.png", units="in", width=9, height=3, res=300, type = "cairo") 161 | ggplot(markers_top_25, aes(x = factor(gene, level = order.markers), 162 | y = value, fill = variable)) + 163 | geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 164 | scale_fill_manual( values = patient_col_dict)+ 165 | labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id.x") + 166 | ggpubr::rotate_x_text() 167 | dev.off() 168 | 169 | 170 | setwd("..") 171 | 172 | #do similar analysis to extract raw data for each sample for IPA 173 | 174 | markers_list = c() 175 | i = 1 176 | for (Seurat_obj in all_samples) { 177 | 178 | Idents(Seurat_obj) <- "cluster_annotations" 179 | Seurat_obj <- ScaleData(Seurat_obj) 180 | Seurat_obj <- NormalizeData(Seurat_obj) 181 | sub_obj <- subset(Seurat_obj, idents = c("core","edge")) 182 | 183 | results <- FindMarkers(sub_obj,ident.1 = "core",ident.2 = "edge", group.by = "cluster_annotations")%>% 184 | tibble::rownames_to_column(var = "gene")%>% 185 | tibble::add_column("sample_id.x" = unique(sub_obj@meta.data$sample_id.x)) 186 | 187 | results <- results[results$p_val_adj < 0.001,] 188 | results_edge <- results[results$avg_log2FC < 0,] 189 | results_edge$avg_log2FC <- results_edge$avg_log2FC*-1 190 | results_core <- results[results$avg_log2FC > 0,] 191 | write.csv(results_edge, file = paste("differential_expression/IPA/",unique(sub_obj@meta.data$sample_id.x),"_edge_raw_deg.csv", sep = "")) 192 | write.csv(results_core, file = paste("differential_expression/IPA/",unique(sub_obj@meta.data$sample_id.x),"_core_raw_deg.csv", sep = "")) 193 | 194 | markers_list[[i]] <- results 195 | i = i+1 196 | } 197 | 198 | 199 | #check overlap with puram et al. genesets 200 | 201 | puram_core <- scan("puram_genesets/epithelial_diff.txt", what="", sep="\n") 202 | 203 | puram_edge <- scan("puram_genesets/p_emt.txt", what="", sep="\n") 204 | 205 | core_genes <- readRDS("gene_lists/core_genes.RDS") 206 | 207 | edge_genes <- readRDS("gene_lists/edge_genes.RDS") 208 | 209 | length(intersect(puram_core,core_genes)) 210 | 211 | #47/80 overlapping for epithelial diff 212 | 213 | length(intersect(puram_edge,edge_genes)) 214 | 215 | #only 7/100 edge overlap 216 | -------------------------------------------------------------------------------- /Figure 2/IPA/IPA Upstream regulator.R: -------------------------------------------------------------------------------- 1 | library(remotes) 2 | library(clusterProfiler) 3 | library(rlang) 4 | library(multienrichjam); 5 | library(jamba) 6 | library(amap) 7 | #> 8 | #> Attaching package: 'jamba' 9 | #> The following objects are masked from 'package:multienrichjam': 10 | #> 11 | #> heatmap_column_order, heatmap_row_order 12 | library(colorjam); 13 | suppressPackageStartupMessages(library(ComplexHeatmap)); 14 | options("stringsAsFactors"=FALSE, "warn"=-1); 15 | knitr::opts_chunk$set( 16 | fig.height=10, 17 | fig.width=10, 18 | fig.align="center" 19 | ) 20 | ragg_png = function(..., res = 192) { 21 | ragg::agg_png(..., res = res, units = "in") 22 | } 23 | knitr::opts_chunk$set(dev = "ragg_png", fig.ext = "png") 24 | 25 | 26 | #Importing IPA Text file 27 | ipa_files <- c(Patient_1_Core='patient1-core.txt', 28 | Patient_2_Core='patient2-core.txt', 29 | Patient_3_Core='patient3-core.txt', 30 | Patient_4_Core='patient4-core.txt', 31 | Patient_5_Core='patient5-core.txt', 32 | Patient_6_Core='patient6-core.txt', 33 | Patient_7_Core='patient7-core.txt', 34 | Patient_8_Core='patient8-core.txt', 35 | Patient_9_Core='patient9-core.txt', 36 | Patient_10_Core='patient10-core.txt', 37 | Patient_11_Core='patient11-core.txt', 38 | Patient_12_Core='patient12-core.txt', 39 | Patient_1_Edge='patient1-edge.txt', 40 | Patient_2_Edge='patient2-edge.txt', 41 | Patient_3_Edge='patient3-edge.txt', 42 | Patient_4_Edge='patient4-edge.txt', 43 | Patient_5_Edge='patient5-edge.txt', 44 | Patient_6_Edge='patient6-edge.txt', 45 | Patient_7_Edge='patient7-edge.txt', 46 | Patient_8_Edge='patient8-edge.txt', 47 | Patient_9_Edge='patient9-edge.txt', 48 | Patient_10_Edge='patient10-edge.txt', 49 | Patient_11_Edge='patient11-edge.txt', 50 | Patient_12_Edge='patient12-edge.txt') 51 | 52 | ipa_l <- lapply(ipa_files, importIPAenrichment) 53 | ssdim(ipa_l) 54 | 55 | 56 | #Analyze IPA enrichments from canonical enrichment test 57 | enrichList_canonical <- lapply(ipa_l, function(i){ 58 | i[["Upstream Regulators"]]; 59 | }); 60 | sdim(enrichList_canonical); 61 | 62 | ## Convert data.frame to enrichResult 63 | ## multienrichjam::enrichDF2enrichResult 64 | er_canonical <- lapply(enrichList_canonical, function(i){ 65 | enrichDF2enrichResult(i, 66 | keyColname="Name", 67 | pvalueColname="P-value", 68 | geneColname="geneNames", 69 | pathGenes="Bias Term", 70 | pvalueCutoff=1) 71 | }); 72 | sdim(er_canonical); 73 | head(as.data.frame(er_canonical[[1]])); 74 | 75 | 76 | ?multiEnrichMap 77 | #Running multiEnrichMap() 78 | mem_canonical <- multiEnrichMap(er_canonical, 79 | enrichBaseline=1, 80 | cutoffRowMinP=0.05, 81 | colorV=c("purple", "orange"), 82 | topEnrichN=20) 83 | sdim(mem_canonical) 84 | 85 | nameColname = c("Name", "pathway", "Description", "itemsetID", 86 | "ID") 87 | nameColname <- find_colname(nameColname, er_canonical) 88 | # 89 | # for upstreeam regulator use Activation z-score 90 | enrichIM <- enrichList2IM(er_canonical, valueColname = "Activation z-score", 91 | keyColname = nameColname, verbose = F, emptyValue = 0, 92 | GmtT = NULL) 93 | 94 | enrichIM <- enrichIM[rownames(enrichIM) %in% rownames(mem_canonical$enrichIM),] 95 | 96 | #change enrichIM if you want to change the whole plot 97 | 98 | enrichIM <- enrichIM[rowSums(enrichIM[]) != 0 ,] 99 | 100 | mem_canonical$enrichIM <- mem_canonical$enrichIM[rownames(mem_canonical$enrichIM) %in% rownames(mem_canonical$enrichIM),] 101 | 102 | binarydf <- enrichIM 103 | binarydf <- as.data.frame(ifelse(enrichIM != 0, 1, 0)) 104 | binarydf$total <- rowSums(binarydf) 105 | keep <- rownames(binarydf[binarydf$total>10,]) 106 | enrichIM <- enrichIM[rownames(enrichIM) %in% keep,] 107 | 108 | use_matrix <- enrichIM 109 | 110 | name = "Activation z-score" 111 | col_logp <- circlize::colorRamp2(breaks = c(min(enrichIM), 0, max(enrichIM)), colors = c(rev(jamba::getColorRamp("RdBu", 112 | lens = 2, n = 3, trimRamp = c(2, 2))))) 113 | 114 | er_hc2 <- amap::hcluster(link = "ward", jamba::noiseFloor(enrichIM[rownames(enrichIM), 115 | , drop = FALSE], minimum = min(enrichIM), 116 | newValue = 0, ceiling = max(enrichIM)), method = "euclidean") 117 | er_hc2 <- as.dendrogram(er_hc2) 118 | row_cex = 1 119 | row_fontsize <- jamba::noiseFloor(row_cex * 60/(nrow(enrichIM))^(1/2), 120 | minimum = 12, ceiling = 18) 121 | column_cex = 1 122 | 123 | column_fontsize <- jamba::noiseFloor(column_cex * 60/(ncol(enrichIM))^(1/2), 124 | minimum = 1, ceiling = 20) 125 | 126 | show_heatmap_legend <- TRUE 127 | top_annotation <- NULL 128 | show = NULL 129 | cell_fun_custom <- cell_fun_bivariate(list(use_matrix, 130 | FALSE, mem_canonical$enrichIMgeneCount), pch = 21, 131 | size_fun = ct_approxfun, size_by = 3, outline_style = "darker", 132 | col_hm = col_logp, show = show, cex = 0, 133 | type = "univariate", prefix = c("z-score: ", 134 | "-log10P: ", "genes: ")[show]) 135 | 136 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 137 | name = name, col = col_logp, cluster_rows = er_hc2, 138 | row_dend_reorder = TRUE, border = TRUE, 139 | row_names_gp = grid::gpar(fontsize = row_fontsize), 140 | row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = column_fontsize), 141 | column_names_max_height = grid::unit(12, "cm"), 142 | cluster_columns = FALSE, row_dend_width = grid::unit(30, "cm"), 143 | column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")), 144 | rect_gp = grid::gpar(type = "none"), cell_fun = cell_fun_custom, 145 | show_heatmap_legend = show_heatmap_legend, top_annotation = top_annotation) 146 | 147 | gene_count_max <- NULL 148 | min_count = 1 149 | if (length(gene_count_max) == 0) { 150 | ctmax <- ceiling(max(mem_canonical$enrichIMgeneCount, na.rm = TRUE)) 151 | } else { 152 | ctmax <- gene_count_max 153 | } 154 | if (ctmax <= 1) { 155 | ct_ticks <- c(0, 1) 156 | } else { 157 | n <- 8 158 | ct_ticks <- setdiff(unique(c(min_count, round(pretty(c(0, 159 | ctmax), n = n)))), 0) 160 | ct_step <- median(diff(ct_ticks)) 161 | if (max(ct_ticks) > ctmax) { 162 | ct_ticks[which.max(ct_ticks)] <- ctmax 163 | if (tail(diff(ct_ticks), 1) < ceiling(ct_step/4)) { 164 | ct_ticks <- head(ct_ticks, -2) 165 | } 166 | else if (tail(diff(ct_ticks), 1) < ceiling(ct_step/2)) { 167 | ct_ticks <- c(head(ct_ticks, -2), ctmax) 168 | } 169 | } 170 | } 171 | point_size_factor = 1 172 | point_size_max = 8 173 | point_size_min = 1 174 | 175 | ct_approxfun <- function(x, ...) { 176 | approxfun(x = sqrt(c(min_count, ctmax)), yleft = 0, 177 | ties = "ordered", yright = point_size_max, y = c(point_size_min, 178 | point_size_max * point_size_factor))(sqrt(x), 179 | ...) 180 | } 181 | ct_tick_sizes <- ct_approxfun(ct_ticks) 182 | pt_legend_ncol <- 1 183 | if (length(ct_ticks) >= 8) { 184 | pt_legend_ncol <- 2 185 | } 186 | 187 | ct_tick_sizes <- ct_approxfun(ct_ticks) 188 | pch <- 21 189 | pt_legend <- ComplexHeatmap::Legend(labels = ct_ticks, 190 | title = "Gene Count", type = "points", pch = pch, 191 | ncol = pt_legend_ncol, size = grid::unit(ct_tick_sizes, 192 | "mm"), grid_height = grid::unit(max(ct_tick_sizes) * 193 | 0.95, "mm"), grid_width = grid::unit(max(ct_tick_sizes) * 194 | 0.95, "mm"), background = "transparent", legend_gp = grid::gpar(col = "black", 195 | fill = "grey85")) 196 | anno_legends <- list(pt_legend) 197 | attr(hm, "annotation_legend_list") <- anno_legends 198 | draw(hm, annotation_legend_list = anno_legends) 199 | 200 | 201 | cor.exp2 = data.frame(group = c(rep("core", 12), 202 | rep("edge", 12))) 203 | 204 | 205 | list1 = list(group = c("edge" = '#E64B35FF', 206 | "core" = '#4DBBD5FF')) 207 | 208 | list2 = list(setNames(list1$group, c("edge", "core"))) 209 | names(list2) = "group" 210 | 211 | ha = HeatmapAnnotation(df = cor.exp2, col = list2) 212 | 213 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 214 | name = name, col = col_logp, cluster_rows = er_hc2, 215 | row_dend_reorder = TRUE, border = TRUE, 216 | row_names_gp = grid::gpar(fontsize = row_fontsize), 217 | row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = 14), 218 | column_names_max_height = grid::unit(12, "cm"), 219 | cluster_columns = FALSE, row_dend_width = grid::unit(2, "cm"), 220 | column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")), 221 | bottom_annotation = ha) 222 | 223 | 224 | png("IPA_Heatmap_UpReg.png", units = "in", res=300, width=9.5, height=16) 225 | draw(hm) 226 | dev.off() 227 | -------------------------------------------------------------------------------- /Figure 2/IPA/IPA cannonical.R: -------------------------------------------------------------------------------- 1 | library(remotes) 2 | library(clusterProfiler) 3 | library(rlang) 4 | library(multienrichjam); 5 | library(jamba) 6 | library(amap) 7 | #> 8 | #> Attaching package: 'jamba' 9 | #> The following objects are masked from 'package:multienrichjam': 10 | #> 11 | #> heatmap_column_order, heatmap_row_order 12 | library(colorjam); 13 | suppressPackageStartupMessages(library(ComplexHeatmap)); 14 | options("stringsAsFactors"=FALSE, "warn"=-1); 15 | knitr::opts_chunk$set( 16 | fig.height=10, 17 | fig.width=10, 18 | fig.align="center" 19 | ) 20 | ragg_png = function(..., res = 192) { 21 | ragg::agg_png(..., res = res, units = "in") 22 | } 23 | knitr::opts_chunk$set(dev = "ragg_png", fig.ext = "png") 24 | 25 | 26 | #Importing IPA Text file 27 | ipa_files <- c(Patient_1_Core='patient1-core.txt', 28 | Patient_2_Core='patient2-core.txt', 29 | Patient_3_Core='patient3-core.txt', 30 | Patient_4_Core='patient4-core.txt', 31 | Patient_5_Core='patient5-core.txt', 32 | Patient_6_Core='patient6-core.txt', 33 | Patient_7_Core='patient7-core.txt', 34 | Patient_8_Core='patient8-core.txt', 35 | Patient_9_Core='patient9-core.txt', 36 | Patient_10_Core='patient10-core.txt', 37 | Patient_11_Core='patient11-core.txt', 38 | Patient_12_Core='patient12-core.txt', 39 | Patient_1_Edge='patient1-edge.txt', 40 | Patient_2_Edge='patient2-edge.txt', 41 | Patient_3_Edge='patient3-edge.txt', 42 | Patient_4_Edge='patient4-edge.txt', 43 | Patient_5_Edge='patient5-edge.txt', 44 | Patient_6_Edge='patient6-edge.txt', 45 | Patient_7_Edge='patient7-edge.txt', 46 | Patient_8_Edge='patient8-edge.txt', 47 | Patient_9_Edge='patient9-edge.txt', 48 | Patient_10_Edge='patient10-edge.txt', 49 | Patient_11_Edge='patient11-edge.txt', 50 | Patient_12_Edge='patient12-edge.txt') 51 | ipa_l <- lapply(ipa_files, importIPAenrichment) 52 | ssdim(ipa_l) 53 | 54 | 55 | #Analyze IPA enrichments from canonical enrichment test 56 | enrichList_canonical <- lapply(ipa_l, function(i){ 57 | i[["Canonical Pathways"]]; 58 | }); 59 | sdim(enrichList_canonical); 60 | 61 | ## Convert data.frame to enrichResult 62 | ## multienrichjam::enrichDF2enrichResult 63 | er_canonical <- lapply(enrichList_canonical, function(i){ 64 | enrichDF2enrichResult(i, 65 | keyColname="Name", 66 | pvalueColname="P-value", 67 | geneColname="geneNames", 68 | geneRatioColname="Ratio", 69 | pvalueCutoff=1) 70 | }); 71 | sdim(er_canonical); 72 | head(as.data.frame(er_canonical[[1]])); 73 | 74 | 75 | ?multiEnrichMap 76 | #Running multiEnrichMap() 77 | mem_canonical <- multiEnrichMap(er_canonical, 78 | enrichBaseline=1, 79 | cutoffRowMinP=0.05, 80 | colorV=c("purple", "orange"), 81 | topEnrichN=20) 82 | sdim(mem_canonical) 83 | 84 | nameColname = c("Name", "pathway", "Description", "itemsetID", 85 | "ID") 86 | nameColname <- find_colname(nameColname, er_canonical) 87 | # 88 | # for upstreeam regulator use Activation z-score 89 | enrichIM <- enrichList2IM(er_canonical, valueColname = "zScore", 90 | keyColname = nameColname, verbose = F, emptyValue = 0, 91 | GmtT = NULL) 92 | 93 | enrichIM <- enrichIM[rownames(enrichIM) %in% rownames(mem_canonical$enrichIM),] 94 | 95 | #change enrichIM if you want to change the whole plot 96 | 97 | enrichIM <- enrichIM[rowSums(enrichIM[]) != 0 ,] 98 | 99 | mem_canonical$enrichIM <- mem_canonical$enrichIM[rownames(mem_canonical$enrichIM) %in% rownames(mem_canonical$enrichIM),] 100 | 101 | binarydf <- enrichIM 102 | binarydf <- as.data.frame(ifelse(enrichIM != 0, 1, 0)) 103 | binarydf$total <- rowSums(binarydf) 104 | keep <- rownames(binarydf[binarydf$total>9,]) 105 | enrichIM <- enrichIM[rownames(enrichIM) %in% keep,] 106 | 107 | use_matrix <- enrichIM 108 | 109 | name = "zscore" 110 | col_logp <- circlize::colorRamp2(breaks = c(min(enrichIM), 0, max(enrichIM)), colors = c(rev(jamba::getColorRamp("RdBu", 111 | lens = 2, n = 3, trimRamp = c(2, 2))))) 112 | 113 | er_hc2 <- amap::hcluster(link = "ward", jamba::noiseFloor(enrichIM[rownames(enrichIM), 114 | , drop = FALSE], minimum = min(enrichIM), 115 | newValue = 0, ceiling = max(enrichIM)), method = "euclidean") 116 | er_hc2 <- as.dendrogram(er_hc2) 117 | row_cex = 1 118 | row_fontsize <- jamba::noiseFloor(row_cex * 60/(nrow(enrichIM))^(1/2), 119 | minimum = 16, ceiling = 20) 120 | column_cex = 1 121 | 122 | column_fontsize <- jamba::noiseFloor(column_cex * 60/(ncol(enrichIM))^(1/2), 123 | minimum = 1, ceiling = 20) 124 | 125 | show_heatmap_legend <- TRUE 126 | top_annotation <- NULL 127 | show = NULL 128 | cell_fun_custom <- cell_fun_bivariate(list(use_matrix, 129 | FALSE, mem_canonical$enrichIMgeneCount), pch = 21, 130 | size_fun = ct_approxfun, size_by = 3, outline_style = "darker", 131 | col_hm = col_logp, show = show, cex = 0, 132 | type = "univariate", prefix = c("z-score: ", 133 | "-log10P: ", "genes: ")[show]) 134 | 135 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 136 | name = name, col = col_logp, cluster_rows = er_hc2, 137 | row_dend_reorder = TRUE, border = TRUE, 138 | row_names_gp = grid::gpar(fontsize = row_fontsize), 139 | row_names_max_width = grid::unit(12, "cm"), column_names_gp = grid::gpar(fontsize = column_fontsize), 140 | column_names_max_height = grid::unit(12, "cm"), 141 | cluster_columns = FALSE, row_dend_width = grid::unit(30, "cm"), 142 | column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(6, "cm")), 143 | rect_gp = grid::gpar(type = "none"), cell_fun = cell_fun_custom, 144 | show_heatmap_legend = show_heatmap_legend, top_annotation = top_annotation) 145 | 146 | gene_count_max <- NULL 147 | min_count = 1 148 | if (length(gene_count_max) == 0) { 149 | ctmax <- ceiling(max(mem_canonical$enrichIMgeneCount, na.rm = TRUE)) 150 | } else { 151 | ctmax <- gene_count_max 152 | } 153 | if (ctmax <= 1) { 154 | ct_ticks <- c(0, 1) 155 | } else { 156 | n <- 8 157 | ct_ticks <- setdiff(unique(c(min_count, round(pretty(c(0, 158 | ctmax), n = n)))), 0) 159 | ct_step <- median(diff(ct_ticks)) 160 | if (max(ct_ticks) > ctmax) { 161 | ct_ticks[which.max(ct_ticks)] <- ctmax 162 | if (tail(diff(ct_ticks), 1) < ceiling(ct_step/4)) { 163 | ct_ticks <- head(ct_ticks, -2) 164 | } 165 | else if (tail(diff(ct_ticks), 1) < ceiling(ct_step/2)) { 166 | ct_ticks <- c(head(ct_ticks, -2), ctmax) 167 | } 168 | } 169 | } 170 | point_size_factor = 1 171 | point_size_max = 8 172 | point_size_min = 1 173 | 174 | ct_approxfun <- function(x, ...) { 175 | approxfun(x = sqrt(c(min_count, ctmax)), yleft = 0, 176 | ties = "ordered", yright = point_size_max, y = c(point_size_min, 177 | point_size_max * point_size_factor))(sqrt(x), 178 | ...) 179 | } 180 | ct_tick_sizes <- ct_approxfun(ct_ticks) 181 | pt_legend_ncol <- 1 182 | if (length(ct_ticks) >= 8) { 183 | pt_legend_ncol <- 2 184 | } 185 | 186 | ct_tick_sizes <- ct_approxfun(ct_ticks) 187 | pch <- 21 188 | pt_legend <- ComplexHeatmap::Legend(labels = ct_ticks, 189 | title = "Gene Count", type = "points", pch = pch, 190 | ncol = pt_legend_ncol, size = grid::unit(ct_tick_sizes, 191 | "mm"), grid_height = grid::unit(max(ct_tick_sizes) * 192 | 0.95, "mm"), grid_width = grid::unit(max(ct_tick_sizes) * 193 | 0.95, "mm"), background = "transparent", legend_gp = grid::gpar(col = "black", 194 | fill = "grey85")) 195 | anno_legends <- list(pt_legend) 196 | attr(hm, "annotation_legend_list") <- anno_legends 197 | draw(hm, annotation_legend_list = anno_legends) 198 | 199 | 200 | cor.exp2 = data.frame(group = c(rep("core", 12), 201 | rep("edge", 12))) 202 | 203 | 204 | list1 = list(group = c("edge" = '#E64B35FF', 205 | "core" = '#4DBBD5FF')) 206 | 207 | list2 = list(setNames(list1$group, c("edge", "core"))) 208 | names(list2) = "group" 209 | 210 | ha = HeatmapAnnotation(df = cor.exp2, col = list2) 211 | 212 | hm <- call_fn_ellipsis(ComplexHeatmap::Heatmap, matrix = use_matrix, 213 | name = name, col = col_logp, cluster_rows = er_hc2, 214 | row_dend_reorder = TRUE, border = TRUE, 215 | row_names_gp = grid::gpar(fontsize = row_fontsize), 216 | row_names_max_width = grid::unit(20, "cm"), column_names_gp = grid::gpar(fontsize = 16), 217 | column_names_max_height = grid::unit(12, "cm"), 218 | cluster_columns = FALSE, row_dend_width = grid::unit(2, "cm"), 219 | column_title = NULL, heatmap_legend_param = list(border = "black", legend_height = grid::unit(8, "cm")), 220 | bottom_annotation = ha) 221 | 222 | png("IPA_Heatmap_Canonical.png", units = "in", res=300, width=18, height=14) 223 | draw(hm) 224 | dev.off() 225 | -------------------------------------------------------------------------------- /Figure 2/TF anaysis/pySCENIC_oscc.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | library(plyr) 3 | library(dplyr) 4 | library(Seurat) 5 | library(igraph) 6 | library(ComplexHeatmap) 7 | library(circlize) 8 | require(dplyr) 9 | require(ggplot2) 10 | library(ggpubr) 11 | require(cowplot) 12 | library(data.table) 13 | library(RColorBrewer) 14 | library(scater) 15 | library(tidyverse) 16 | library(AUCell) 17 | library(SCENIC) 18 | library(tibble) 19 | 20 | custom_fill_colors <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3", 21 | "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 22 | "sample_7" = "#F7CE5B", 23 | "sample_8" = "#7CEA9C","sample_9" = "#7C90A0", 24 | "sample_10" = "#DCCCFF","sample_11" = "#EE6055", 25 | "sample_12" = "#C492B1") 26 | 27 | #load file where TF AUC has been added as an assay to a Seurat object of "comb" 28 | 29 | SCENIC <- readRDS("x_AUC.rds") 30 | 31 | Idents(SCENIC) = "cluster_annotations" 32 | DefaultAssay(SCENIC) <- "AUC" 33 | SCENIC_proj <- subset(SCENIC, idents = c("core","edge")) 34 | 35 | sub <- SCENIC_proj 36 | sample_ids.2 <- list(unique(sub$sample_id)) 37 | 38 | markers_list <- list() 39 | 40 | for (k in 1:length(sample_ids.2[[1]])) { 41 | 42 | markers <- subset(sub, cells = c(which(sub$sample_id == sample_ids.2[[1]][k]))) %>% 43 | FindMarkers(ident.1 ="core", group.by = "cluster_annotations",logfc.threshold = 0,min.pct = 0) %>% 44 | rownames_to_column(var = "gene")%>% 45 | add_column(sample_id = sample_ids.2[[1]][k]) 46 | 47 | markers_list[[k]] <- markers 48 | 49 | 50 | } 51 | 52 | markers.list <- plyr::ldply(markers_list, data.frame) 53 | #pval adj 0.05 54 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.05,] 55 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC", "sample_id")] 56 | 57 | markers.list.mat <- reshape2::dcast(markers.list.mat, formula = gene~sample_id, value.var = "avg_log2FC") 58 | 59 | markers.list.mat[is.na(markers.list.mat)] = 0 60 | markers.list.mat <- markers.list.mat %>% tibble::column_to_rownames(var = "gene") 61 | 62 | 63 | markers.matrix = markers.list.mat 64 | 65 | markers.matrix.m <- markers.matrix %>% tibble::rownames_to_column(var = "gene") 66 | markers.matrix.m <- reshape2::melt(markers.matrix.m) 67 | order.markers <- rownames(markers.matrix[order(rowSums(-markers.matrix)),]) 68 | markers.sum <- as.data.frame(ifelse(markers.matrix != 0, 1, 0)) 69 | markers.sum$total <- rowSums(markers.sum) 70 | #at least 9 samples 71 | keep <- rownames(markers.sum[markers.sum$total>9,]) 72 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% keep,] 73 | 74 | #genes is only used as this code is repurposed from the degs script, these are not genes 75 | #these are TFs 76 | 77 | core_genes <- markers.matrix.m[markers.matrix.m$value > 0,] 78 | edge_genes <- markers.matrix.m[markers.matrix.m$value < 0,] 79 | core_genes <- unique(core_genes$gene) 80 | edge_genes <- unique(edge_genes$gene) 81 | 82 | saveRDS(core_genes,"TF_lists//core_TFS.RDS") 83 | saveRDS(edge_genes,"TF_lists/edge_TFS.RDS") 84 | 85 | markers.matrix.m <- markers.matrix.m[markers.matrix.m$gene %in% union(edge_genes,core_genes),] 86 | 87 | markers_table <- reshape2::dcast(markers.matrix.m, formula = gene~variable, value.var = "value") 88 | markers_table <- column_to_rownames(markers_table, "gene") 89 | markers_table$sum <- rowSums(markers_table) 90 | write.csv(markers_table, file = "markers_table.csv") 91 | 92 | 93 | png("diff_exp.png", units="in", width=15, height=4, res=300, type = "cairo") 94 | print(ggplot(markers.matrix.m, aes(x = factor(gene, level = order.markers), 95 | y = value, fill = variable)) + 96 | geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 97 | labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id") + 98 | ggpubr::rotate_x_text() + scale_fill_manual(values = custom_fill_colors)+ 99 | theme(axis.text = element_text(face="bold"), legend.text = element_text(face="bold"), 100 | axis.text.y = element_text(face="bold"))) 101 | dev.off() 102 | 103 | #only plot top 30 of each for supplementary fig 104 | topweights <-aggregate(value~gene,markers.matrix.m,sum) 105 | 106 | topweights <- topweights[order(-topweights$value),] 107 | 108 | markers_top_25 <- markers.matrix.m[markers.matrix.m$gene %in% c(head(topweights,25)$gene,tail(topweights,25)$gene),] 109 | 110 | png("diff_exp_top.png", units="in", width=8, height=4, res=300, type = "cairo") 111 | print(ggplot(markers_top_25, aes(x = factor(gene, level = order.markers), 112 | y = value, fill = variable)) + 113 | geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 114 | labs(x = "", y = "Cumulative average log(fold-change)", fill = "sample_id") + 115 | ggpubr::rotate_x_text() + scale_fill_manual(values = custom_fill_colors)+ 116 | theme(axis.text = element_text(face="bold"), legend.text = element_text(face="bold"), 117 | axis.text.y = element_text(face="bold"))) 118 | dev.off() 119 | 120 | setwd("..") 121 | 122 | #reformat for interpretability 123 | markers.list <- plyr::ldply(markers_list, data.frame) 124 | 125 | markers.list.mat <- markers.list[markers.list$p_val_adj<0.05,] 126 | markers.list.mat <- markers.list.mat[,c("gene", "avg_log2FC","p_val_adj","sample_id")] 127 | 128 | markers.list.mat$enrichment <- ifelse(markers.list.mat$gene %in% core_genes, "core", 129 | "edge") 130 | 131 | write.csv(markers.list.mat, file = "markers.list.mat.csv") 132 | -------------------------------------------------------------------------------- /Figure 3/Cancer Stem Cell/csc_analysis.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(Seurat) 3 | library(ggplot2) 4 | library(reshape2) 5 | library(ggpubr) 6 | 7 | load(file = "comb.Robj") 8 | 9 | #function to summarize data 10 | data_summary <- function(data, varname, groupnames){ 11 | require(plyr) 12 | summary_func <- function(x, col){ 13 | c(mean = mean(x[[col]], na.rm=TRUE), 14 | sd = sd(x[[col]], na.rm=TRUE)) 15 | } 16 | data_sum<-ddply(data, groupnames, .fun=summary_func, 17 | varname) 18 | data_sum <- rename(data_sum, c("mean" = varname)) 19 | return(data_sum) 20 | } 21 | 22 | #funtion to plot individual data points 23 | plot_effects <- function(data,comparision_group,comparisions,values,sample_id, 24 | patient_col_dict) { 25 | geneset <- values 26 | 27 | Data = data[,c(sample_id,comparision_group,geneset)] 28 | 29 | plot <- Data 30 | 31 | plot_summary <- data_summary(plot, varname=geneset, 32 | groupnames=c(comparision_group, sample_id)) 33 | 34 | plot_summary$geneset <- plot_summary[[geneset]] 35 | plot_summary$comparision_group <- plot_summary[[comparision_group]] 36 | 37 | p1 <- ggplot(data = plot_summary, aes_string(x=comparision_group, y= geneset), fill = sample_id) + 38 | geom_pointrange(aes(ymin=geneset-sd, 39 | ymax=geneset+sd, color = sample_id), 40 | position = position_dodge(0.5), width = 0.2)+ 41 | scale_color_manual(values = patient_col_dict) + 42 | stat_compare_means(comparisons = comparisions,label = "p.signif", paired = F)+ 43 | theme_light() + 44 | theme(axis.text.x = element_text(angle = 45, hjust=1)) + 45 | xlab("") + ylab(paste(geneset, " scores")) + 46 | theme(strip.background =element_rect(fill="white"))+ 47 | theme(strip.text = element_text(colour = 'black'), ) + 48 | theme(legend.position="none") 49 | 50 | return(p1) 51 | } 52 | 53 | #adding score for each gene separately, ALDH1 has all isotypes 54 | Seurat_obj <- comb 55 | DefaultAssay(Seurat_obj) <- "SCT" 56 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("ALDH1A1","ALDH1A2","ALDH1A3")), 57 | name = "ALDH1") 58 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("VIM")), 59 | name = "VIM") 60 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("EPCAM")), 61 | name = "EPCAM") 62 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CDH1")), 63 | name = "CDH1") 64 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CD24")), 65 | name = "CD24") 66 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("CD44")), 67 | name = "CD44") 68 | 69 | #MET like CSC score -VIM + EPCAM +CDH1 +ALDH1 70 | Seurat_obj@meta.data$MET_like_CSC <- -1*Seurat_obj@meta.data$VIM1 + Seurat_obj@meta.data$EPCAM1+ Seurat_obj@meta.data$CDH1 + Seurat_obj@meta.data$ALDH1 71 | 72 | #EMT score VIM - EPCAM - CDH1 + CD44 - CD24 73 | Seurat_obj@meta.data$EMT_like_CSC <- Seurat_obj@meta.data$VIM1 + -1*Seurat_obj@meta.data$EPCAM1+ -1*Seurat_obj@meta.data$CDH1 + Seurat_obj@meta.data$CD441 + -1*Seurat_obj@meta.data$CD241 74 | 75 | #add broad score in 76 | Seurat_obj <- AddModuleScore(Seurat_obj, features = list(c("ALDH1A1","ALDH1A2","ALDH1A3","NANOG","PROM1","POU5F1","SOX2","STAT3","ATP6AP2","MSI1","CD44")), 77 | name = "CSC_broad") 78 | 79 | samples <- Seurat_obj@meta.data 80 | 81 | patient_col_dict <- c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3", 82 | "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 83 | "sample_7" = "#F7CE5B", 84 | "sample_8" = "#7CEA9C","sample_9" = "#7C90A0", 85 | "sample_10" = "#DCCCFF","sample_11" = "#EE6055", 86 | "sample_12" = "#C492B1") 87 | 88 | samples$sample_id <- factor(samples$sample_id, levels = 89 | c(names(patient_col_dict))) 90 | 91 | samples$cluster_annotations <- factor(samples$cluster_annotations, levels = 92 | c("core","transitory","edge")) 93 | 94 | comparision_group <- "cluster_annotations" 95 | comparisions <- list(c("core","edge")) 96 | sample <- "sample_id" 97 | 98 | #CSC score plots for Fig 3 99 | for (values in c("MET_like_CSC","EMT_like_CSC","CSC_broad1")) { 100 | 101 | png(paste(values,"stat_plot.png",sep = "_"), units="in", width=5, height=4, res=300, type = "cairo") 102 | print(plot_effects(samples,comparision_group, comparisions,values,sample,patient_col_dict)) 103 | dev.off() 104 | 105 | } 106 | 107 | #nebulosa plots for Fig 3 108 | 109 | png("CSC_broad_nebulosa.png", units="in", width=5.5, height=6, res=300, type = "cairo") 110 | print(SCpubr::do_NebulosaPlot(Seurat_obj, features = "CSC_broad1", method = "wkde")) 111 | dev.off() 112 | 113 | png("ALDH1A1and2and3_feature.png", units="in", width=6, height=6, res=300, type = "cairo") 114 | SCpubr::do_FeaturePlot(Seurat_obj, features = c("ALDH11"), order = TRUE) 115 | dev.off() 116 | 117 | png("ALDH1A1and2and3_density.png", units="in", width=6, height=6, res=300, type = "cairo") 118 | SCpubr::do_NebulosaPlot(Seurat_obj, features = c("ALDH11")) 119 | dev.off() 120 | 121 | png("CD44_feature.png", units="in", width=6, height=6, res=300, type = "cairo") 122 | SCpubr::do_FeaturePlot(Seurat_obj, features = "CD44", order = TRUE) 123 | dev.off() 124 | 125 | png("CD44_density.png", units="in", width=6, height=6, res=300, type = "cairo") 126 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD44") 127 | dev.off() 128 | 129 | png("CD24_feature.png", units="in", width=6, height=6, res=300, type = "cairo") 130 | SCpubr::do_FeaturePlot(Seurat_obj, features = "CD24", order = TRUE) 131 | dev.off() 132 | 133 | png("CD24_density.png", units="in", width=6, height=6, res=300, type = "cairo") 134 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD24") 135 | dev.off() 136 | 137 | png("CD24_inverse_density.png", units="in", width=6, height=6, res=300, type = "cairo") 138 | SCpubr::do_NebulosaPlot(Seurat_obj, features = "CD24", viridis_direction = -1) 139 | dev.off() 140 | -------------------------------------------------------------------------------- /Figure 3/Cell Neighbor Analysis/cell_neighbor_analysis.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | library(dplyr) 3 | library(tibble) 4 | library(cowplot) 5 | library(patchwork) 6 | load(file = "annotated_objects.Robj") 7 | load(file = "comb.Robj") 8 | 9 | comb_meta <- comb@meta.data 10 | 11 | #transfer annotations over from combined object to individual objects with spatial positions 12 | #in similar fashion to degs script 13 | 14 | for (object in annotated_objects) { 15 | meta <- object@meta.data 16 | 17 | meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names") 18 | 19 | meta_sample_id$cluster_annotations <- as.character(meta_sample_id$cluster_annotations) 20 | for(row in rownames(meta_sample_id)){ 21 | if(is.na(meta_sample_id[row,]$cluster_annotations)) { 22 | meta_sample_id[row,]$cluster_annotations <- meta_sample_id[row,]$noncancer_celltype.x 23 | } 24 | } 25 | annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id 26 | } 27 | 28 | #now find neighboring cells and quanitfy neighbors for core cells and neighbors for edge cells 29 | 30 | annotations_bcs<- list() 31 | i = 1 32 | for (object in annotated_objects) { 33 | 34 | #check if a spot is an edge spot,# if the spot is an edge spot check its neighbors,note down the barcodes 35 | #of its neighbors, quantify the neighboring cells based on the barcode matrix 36 | 37 | BC_mtx_neighbors <- object@meta.data[c("sample_id.x","cluster_annotations")] 38 | 39 | 40 | offsets <- data.frame(x.offset=c(-2, 2, -1, 1, -1, 1), 41 | y.offset=c( 0, 0, -1, -1, 1, 1)) 42 | 43 | spot.positions <- object@images$tumor@coordinates[,c("row","col")] 44 | spot.positions$spot.idx <- rownames(spot.positions) 45 | 46 | ## Compute coordinates of each possible spot neighbor 47 | neighbor.positions <- merge(spot.positions, offsets) 48 | neighbor.positions$x.pos <- neighbor.positions$col + neighbor.positions$x.offset 49 | neighbor.positions$y.pos <- neighbor.positions$row + neighbor.positions$y.offset 50 | 51 | ## Select spots that exist at neighbor coordinates 52 | neighbors <- merge(as.data.frame(neighbor.positions), 53 | as.data.frame(spot.positions), 54 | by.x=c("x.pos", "y.pos"), by.y=c("col", "row"), 55 | suffixes=c(".primary", ".neighbor"), 56 | all.x=TRUE) 57 | 58 | neighbors_anno <- neighbors[complete.cases(neighbors), ] 59 | 60 | neighbors_anno$celltype_primary <- object@meta.data[neighbors_anno$spot.idx.primary,]$cluster_annotations 61 | neighbors_anno$celltype_neighbor <- object@meta.data[neighbors_anno$spot.idx.neighbor,]$cluster_annotations 62 | annotations_bcs[[i]] <- neighbors_anno 63 | #based on neighbor graph,identify neighboring celltypes for each cell state 64 | object@meta.data$neighbor_clust_anno <- "none" 65 | object@meta.data$neighbor_bcs <- "none" 66 | 67 | for(bc in rownames(object@meta.data)) { 68 | nbs <- neighbors[neighbors$spot.idx.primary == bc,] 69 | nb_bc <- na.omit(nbs$spot.idx.neighbor) 70 | celltypes <- object@meta.data[nb_bc,]$cluster_annotations 71 | 72 | object@meta.data[bc,]$neighbor_clust_anno <- paste(celltypes, collapse = ",") 73 | object@meta.data[bc,]$neighbor_bcs <- paste(nb_bc, collapse = ",") 74 | } 75 | annotated_objects[[i]] <- object 76 | i =i+1 77 | 78 | } 79 | 80 | nb_list <- list() 81 | for (object in annotated_objects) { 82 | # Convert to a data frame 83 | ct_nbs_df <- object@meta.data[,c("cluster_annotations","neighbor_clust_anno", "neighbor_bcs")] 84 | 85 | all_ct_nbs <- strsplit(ct_nbs_df$neighbor_clust_anno, ",") 86 | all_ct_nbs_barcodes <- strsplit(ct_nbs_df$neighbor_bcs, ",") 87 | 88 | neigh_df <- data.frame(primary = rep(ct_nbs_df$cluster_annotations, sapply(all_ct_nbs, length)), 89 | neighbor = unlist(all_ct_nbs), neighbor_barcodes = unlist(all_ct_nbs_barcodes)) 90 | 91 | cne_neighbors <- neigh_df[neigh_df$primary %in% c("core","edge"),] 92 | `%!in%` = Negate(`%in%`) 93 | cne_neighbors <- as.data.frame(cne_neighbors[cne_neighbors$neighbor %!in% c("core","edge","transitory"),]) 94 | 95 | cne_neighbors <- unique(subset(cne_neighbors, !duplicated(cne_neighbors))) 96 | 97 | cne_nbs <- as.data.frame(table(cne_neighbors$primary, cne_neighbors$neighbor)) 98 | 99 | colnames(cne_nbs) <- c("primary","neighbor","#neighbors") 100 | `%!in%` = Negate(`%in%`) 101 | 102 | cne_nbs$sample <- unique(object$sample_id.x) 103 | 104 | nb_list <- append(nb_list, list(cne_nbs), 0) 105 | } 106 | 107 | allc2c_net <- plyr::ldply(nb_list) 108 | 109 | library(rstatix) 110 | 111 | signif_abbreviation <- function(p_values) { 112 | sapply(p_values, function(x) { 113 | if (is.na(x)) { 114 | return(NA) 115 | } else if (x < 0.0001) { 116 | return("****") 117 | } else if (x < 0.001) { 118 | return("***") 119 | } else if (x < 0.01) { 120 | return("**") 121 | } else if (x < 0.05) { 122 | return("*") 123 | } else if (x > 0.05) { 124 | return("NS") 125 | } else { 126 | return("NS") 127 | } 128 | }) 129 | } 130 | 131 | allc2c_net_sig <- allc2c_net[allc2c_net$neighbor %in% c("ecm.myCAF","macrophage","cytotoxic.CD8..T.","Intermediate.fibroblast"),] 132 | pairwise_comparisons <- allc2c_net_sig %>% 133 | group_by(neighbor) %>% 134 | filter(n_distinct(primary) > 1) %>%# skip groups with only one observation 135 | wilcox_test(`#neighbors` ~ primary, p.adjust.method = "bonferroni") 136 | 137 | #adjust p value 138 | pairwise_comparisons$p.adj <- p.adjust(pairwise_comparisons$p, method = "BH") 139 | pairwise_comparisons <- pairwise_comparisons %>% 140 | mutate(p.adj.signif = as.character(signif_abbreviation(p.adj))) 141 | 142 | allc2c_net <- left_join(allc2c_net, pairwise_comparisons, by = "neighbor") 143 | 144 | summary_data <-allc2c_net[allc2c_net$primary %in% c("core"),] 145 | 146 | png("signaling_cellneighbor_sig_adj.png", units="in", width=12, height=5, res=300, type = "cairo") 147 | ggplot(allc2c_net, aes(x = primary, y = `#neighbors`, color = factor(primary), fill = primary)) + 148 | geom_boxplot(outlier.shape = NA, aes(fill = factor(primary)), alpha = 0.5) + 149 | scale_fill_manual(values = c('edge' = '#E64B35FF', 150 | 'core' = '#4DBBD5FF')) + 151 | facet_wrap(~neighbor, scales = "free", nrow = 1) + 152 | geom_point(alpha = 1, aes(color = factor(sample), group = primary, fill = primary)) + 153 | scale_color_manual(values = c("sample_1" = "#66D7D1", "sample_2" = "#BEE3DB","sample_3" = "#D3D3D3", 154 | "sample_4" = "#FFD6BA","sample_5" = "#274C77","sample_6" = "#A3CEF1", 155 | "sample_7" = "#F7CE5B", 156 | "sample_8" = "#7CEA9C","sample_9" = "#7C90A0", 157 | "sample_10" = "#DCCCFF","sample_11" = "#EE6055", 158 | "sample_12" = "#C492B1")) + 159 | ggtitle("Number of Neighboring Spots") + 160 | xlab("Neighbor") + 161 | ylab("Number of Neighbors") + 162 | theme_bw() + 163 | theme(legend.position = "bottom") + 164 | guides(color = guide_legend(title = "Sample")) + 165 | geom_text(data = summary_data, aes(x = primary, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") + 166 | theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotate and adjust axis labels 167 | plot.margin = unit(c(1, 5, 1, 1), "lines"), 168 | axis.title = element_text(face = "bold"), 169 | axis.title.y = element_blank(), 170 | axis.text = element_text(size = 10, face = "bold"), 171 | axis.ticks.y = element_blank()) 172 | dev.off() 173 | -------------------------------------------------------------------------------- /Figure 3/Cell Signaling/cellchat.R: -------------------------------------------------------------------------------- 1 | library(reshape2) 2 | library(patchwork) 3 | library(ggplot2) 4 | library(gprofiler2) 5 | library(dplyr) 6 | library(Seurat) 7 | library(RColorBrewer) 8 | library(sctransform) 9 | library(CellChat) 10 | library(patchwork) 11 | 12 | CellChatDB <- CellChatDB.human 13 | showDatabaseCategory(CellChatDB) 14 | CellChatDB.use <- CellChatDB 15 | 16 | library(Seurat) 17 | library(dplyr) 18 | library(tibble) 19 | library(cowplot) 20 | library(patchwork) 21 | load(file = "annotated_objects.Robj") 22 | load(file = "comb.Robj") 23 | 24 | comb_meta <- comb@meta.data 25 | 26 | for (object in annotated_objects) { 27 | meta <- object@meta.data 28 | 29 | meta_sample_id <- left_join(meta,comb_meta, by = "Row.names")%>%column_to_rownames("Row.names") 30 | 31 | meta_sample_id$cluster_annotations <- as.character(meta_sample_id$cluster_annotations) 32 | for(row in rownames(meta_sample_id)){ 33 | if(is.na(meta_sample_id[row,]$cluster_annotations)) { 34 | meta_sample_id[row,]$cluster_annotations <- meta_sample_id[row,]$noncancer_celltype.x 35 | } 36 | } 37 | annotated_objects[[unique(object$sample_id)]]@meta.data <- meta_sample_id 38 | } 39 | 40 | merged_obj <- purrr::reduce(annotated_objects,merge) 41 | 42 | Idents(merged_obj) <- "cluster_annotations" 43 | 44 | #Analysis only with core, edge, and ecm myCAFs 45 | 46 | merged_obj <- subset(merged_obj, idents = c("core","edge","ecm.myCAF")) 47 | merged_obj@images <- list() 48 | 49 | sp_cc_full <- createCellChat(merged_obj) 50 | sp_cc_full@DB <- CellChatDB.use 51 | sp_cc_full <- subsetData(sp_cc_full) 52 | sp_cc_full <- identifyOverExpressedGenes(sp_cc_full) 53 | sp_cc_full <- identifyOverExpressedInteractions(sp_cc_full) 54 | 55 | sp_cc_full <- computeCommunProb(sp_cc_full) 56 | sp_cc_full <- filterCommunication(sp_cc_full, min.cells = 10) 57 | sp_cc_full <- computeCommunProbPathway(sp_cc_full) 58 | sp_cc_full <- aggregateNet(sp_cc_full) 59 | cellchat <- sp_cc_full 60 | cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP") 61 | 62 | 63 | count_interactions <- as.data.frame(cellchat@net[["count"]]) 64 | count_interactions <- melt(count_interactions) 65 | library(dplyr) 66 | count_interactions <- count_interactions[c(1,5,9),] 67 | 68 | gg1 <- ggplot(count_interactions, aes(x=variable, y=value, fill=variable))+ 69 | geom_bar(stat="identity", color="black")+ 70 | scale_fill_manual(values=c("core" = "#4DBBD5FF", 71 | "edge" = "#E64B35FF", 72 | "ecm.myCAF" = "#3f908d"))+ 73 | labs(y = "Number of intracellular interactions") + 74 | theme_minimal() 75 | 76 | color_map = c("core" = "#4DBBD5FF", 77 | "edge" = "#E64B35FF", 78 | "ecm.myCAF" = "#3f908d") 79 | 80 | #intercellular interaction strength 81 | 82 | weight <- as.data.frame(cellchat@net[["weight"]]) 83 | 84 | weight_interactions <- melt(weight) 85 | library(dplyr) 86 | weight_interactions <- weight_interactions[c(1,5,9),] 87 | 88 | gg2 <- ggplot(weight_interactions, aes(x=variable, y=value, fill=variable))+ 89 | geom_bar(stat="identity", color="black")+ 90 | scale_fill_manual(values=c("core" = "#4DBBD5FF", 91 | "edge" = "#E64B35FF", 92 | "ecm.myCAF" = "#3f908d"))+ 93 | labs(y = "Intracellular interaction strength") + 94 | theme_minimal() 95 | 96 | png("intracellular_weighted_caf.png", units="in", width=10, height=5, res=300, type = "cairo") 97 | gg1 + gg2 98 | dev.off() 99 | 100 | #count of interactions 101 | 102 | count_interactions <- as.data.frame(cellchat@net[["count"]]) 103 | count_interactions <- melt(count_interactions) 104 | library(dplyr) 105 | count_interactions <- count_interactions[c(4,5,9),] 106 | count_interactions[1,] <- c('ecm.myCAF',count_interactions[1,]$value) 107 | 108 | gg1 <- ggplot(count_interactions, aes(x=variable, y=value, fill=variable))+ 109 | geom_bar(stat="identity", color="black")+ 110 | scale_fill_manual(values=c("core" = "#4DBBD5FF", 111 | "edge" = "#E64B35FF", 112 | "ecm.myCAF" = "#3f908d"))+ 113 | labs(y = "cell-cancer interactions") + 114 | theme_minimal() 115 | 116 | # 117 | weight <- as.data.frame(cellchat@net[["weight"]]) 118 | 119 | weight_interactions <- melt(weight) 120 | library(dplyr) 121 | weight_interactions <- weight_interactions[c(4,5,9),] 122 | weight_interactions[1,] <- c('ecm.myCAF',weight_interactions[1,]$value) 123 | 124 | gg2 <- ggplot(weight_interactions, aes(x=variable, y=value, fill=variable))+ 125 | geom_bar(stat="identity", color="black")+ 126 | scale_fill_manual(values=c("core" = "#4DBBD5FF", 127 | "edge" = "#E64B35FF", 128 | "ecm.myCAF" = "#3f908d"))+ 129 | labs(y = "cell-cancer interaction strength") + 130 | theme_minimal() 131 | 132 | png("cell_to_cancer.png", units="in", width=10, height=5, res=300, type = "cairo") 133 | gg1 + gg2 134 | dev.off() 135 | 136 | color_map = c("core" = "#4DBBD5FF", 137 | "edge" = "#E64B35FF", 138 | "ecm.myCAF" = "#3f908d") 139 | 140 | #cell signaling from edge cells to edge cells 141 | png("e2e.png", units="in", width=20, height=10, res=300, type = "cairo") 142 | netVisual_chord_gene(cellchat, sources.use = c(2), targets.use = c(2),legend.pos.x = 8, 143 | color.use = c("edge" = "#E64B35FF"), lab.cex = 1.2) 144 | dev.off() 145 | 146 | #cell signaling from core cells to core cells 147 | png("c2c.png", units="in", width=20, height=10, res=300, type = "cairo") 148 | netVisual_chord_gene(cellchat, sources.use = c(3), targets.use = c(3),legend.pos.x = 8, lab.cex = 1.2, 149 | color.use = c("core" = "#4DBBD5FF")) 150 | dev.off() 151 | 152 | #cell signaling between CAF and edge cells 153 | png("caf_and_edge.png", units="in", width=20, height=10, res=300, type = "cairo") 154 | netVisual_chord_gene(cellchat, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8, lab.cex = 1.2, 155 | color.use = c("edge" = "#E64B35FF", 156 | "ecm.myCAF" = "#3f908d")) 157 | dev.off() 158 | 159 | #subset for only top ligand recceptor pairs 160 | object <- cellchat 161 | prob <- slot(object, "net")$prob 162 | pval <- slot(object, "net")$pval 163 | prob[pval > 0.05] <- 0 164 | net <- reshape2::melt(prob, value.name = "prob") 165 | colnames(net)[1:3] <- c("source", "target", "interaction_name") 166 | pairLR = dplyr::select(object@LR$LRsig, c("interaction_name_2", 167 | "pathway_name", "ligand", "receptor", "annotation", 168 | "evidence")) 169 | idx <- match(net$interaction_name, rownames(pairLR)) 170 | temp <- pairLR[idx, ] 171 | net <- cbind(net, temp) 172 | 173 | net <- net[net$prob > 1.0e-03,] 174 | 175 | # #cell signaling between CAF and edge cells with a more stringent cutoff 176 | # png("caf_and_edge_sub.png", units="in", width=20, height=10, res=300, type = "cairo") 177 | # netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8, 178 | # color.use = c("edge" = "#E64B35FF", 179 | # "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2) 180 | # dev.off() 181 | 182 | 183 | # png("e2e_sub.png", units="in", width=20, height=10, res=300, type = "cairo") 184 | # netVisual_chord_gene(object, net = net, sources.use = c(2), targets.use = c(2),legend.pos.x = 8, 185 | # color.use = c("edge" = "#E64B35FF"), thresh = 0.001,lab.cex = 1.2) 186 | # dev.off() 187 | 188 | #cell signaling between core and core cells with a more stringent cutoff of 0.001 189 | png("c2c_0.001.png", units="in", width=20, height=10, res=300, type = "cairo") 190 | netVisual_chord_gene(object, net = net, sources.use = c(3), targets.use = c(3),legend.pos.x = 8, 191 | color.use = c("core" = "#4DBBD5FF"), thresh = 0.001,lab.cex = 1.2) 192 | dev.off() 193 | 194 | net <- net[net$prob > 5.0e-03,] 195 | #cell signaling between edge and edge cells with a more stringent cutoff of 0.005 196 | png("e2e_0.005.png", units="in", width=20, height=10, res=300, type = "cairo") 197 | netVisual_chord_gene(object, net = net, sources.use = c(2), targets.use = c(2),legend.pos.x = 8, 198 | color.use = c("edge" = "#E64B35FF"), thresh = 0.001,lab.cex = 1.2) 199 | dev.off() 200 | 201 | # net <- net[net$prob > 1e-02,] 202 | # 203 | # png("caf_and_edge_0.01.png", units="in", width=20, height=10, res=300, type = "cairo") 204 | # netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8, 205 | # color.use = c("edge" = "#E64B35FF", 206 | # "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2) 207 | # dev.off() 208 | 209 | net <- net[net$prob > 5e-02,] 210 | #cell signaling between edge and edge cells with a more stringent cutoff of 0.05 211 | png("caf_and_edge_0.05.png", units="in", width=20, height=10, res=300, type = "cairo") 212 | netVisual_chord_gene(object, net = net, sources.use = c(1), targets.use = c(1,2),legend.pos.x = 8, 213 | color.use = c("edge" = "#E64B35FF", 214 | "ecm.myCAF" = "#3f908d"), thresh = 0.001,lab.cex = 1.2) 215 | dev.off() 216 | 217 | # extract signaling info 218 | object <- cellchat 219 | prob <- slot(object, "net")$prob 220 | pval <- slot(object, "net")$pval 221 | prob[pval > 0.05] <- 0 222 | net <- reshape2::melt(prob, value.name = "prob") 223 | colnames(net)[1:3] <- c("source", "target", "interaction_name") 224 | pairLR = dplyr::select(object@LR$LRsig, c("interaction_name_2", 225 | "pathway_name", "ligand", "receptor", "annotation", 226 | "evidence")) 227 | idx <- match(net$interaction_name, rownames(pairLR)) 228 | temp <- pairLR[idx, ] 229 | net <- cbind(net, temp) 230 | 231 | write.csv(net, file ="cellchat_signaling.csv") 232 | 233 | #analyze only core and edge cells 234 | 235 | merged_obj <- subset(merged_obj, idents = c("core", 236 | "edge")) 237 | 238 | sp_cc_full <- createCellChat(merged_obj) 239 | sp_cc_full@DB <- CellChatDB.use 240 | sp_cc_full <- subsetData(sp_cc_full) 241 | sp_cc_full <- identifyOverExpressedGenes(sp_cc_full) 242 | sp_cc_full <- identifyOverExpressedInteractions(sp_cc_full) 243 | 244 | sp_cc_full <- computeCommunProb(sp_cc_full) 245 | sp_cc_full <- filterCommunication(sp_cc_full, min.cells = 10) 246 | sp_cc_full <- computeCommunProbPathway(sp_cc_full) 247 | sp_cc_full <- aggregateNet(sp_cc_full) 248 | cellchat <- sp_cc_full 249 | cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP") 250 | 251 | color_map = c( 252 | "edge" = "#E64B35FF", "core" = "#4DBBD5FF") 253 | 254 | png("signaling_pathways_onlycancercell.png", units="in", width=15, height=7, res=300, type = "cairo") 255 | ht1 <- netAnalysis_signalingRole_heatmap(cellchat, pattern = "outgoing", color.use = color_map, 256 | width = 3) 257 | ht2 <- netAnalysis_signalingRole_heatmap(cellchat, pattern = "incoming", color.use = color_map, 258 | width = 3) 259 | ht1 + ht2 260 | dev.off() 261 | 262 | #analyze other celltypes and how they interact with core/edge cancer cells 263 | 264 | rm(list=setdiff(ls(), "annotated_objects")) 265 | CellChatDB <- CellChatDB.human 266 | CellChatDB.use <- CellChatDB 267 | 268 | merged_obj <- purrr::reduce(annotated_objects,merge) 269 | merged_obj@images <- list() 270 | Idents(merged_obj) <- "cluster_annotations" 271 | 272 | #cellchat with core cancer cells and macrophages 273 | core_obj <- subset(merged_obj, idents = c("core","macrophage"), invert = FALSE) 274 | 275 | core_cc <- createCellChat(core_obj) 276 | core_cc@DB <- CellChatDB.use 277 | core_cc <- subsetData(core_cc) 278 | core_cc <- identifyOverExpressedGenes(core_cc) 279 | core_cc <- identifyOverExpressedInteractions(core_cc) 280 | 281 | core_cc <- computeCommunProb(core_cc) 282 | core_cc <- filterCommunication(core_cc, min.cells = 10) 283 | core_cc <- computeCommunProbPathway(core_cc) 284 | core_cc <- aggregateNet(core_cc) 285 | core_cc <- netAnalysis_computeCentrality(core_cc, slot.name = "netP") 286 | 287 | #make a plot for core signaling to macrophages, in prominent core pathways 288 | netVisual_individual(core_cc, signaling = c(core_cc@netP$pathways), 289 | pairLR.use = c(2), layout = "chord") 290 | 291 | png("other_celltypes/core_mac_demosome.png", units="in", width=6, height=5, res=300, type = "cairo") 292 | netVisual_aggregate(core_cc, signaling = c("DESMOSOME"), layout = "chord",color.use = c("core" = "#4DBBD5FF", 293 | "macrophage" = "#cfa42d")) 294 | dev.off() 295 | 296 | png("other_celltypes/core_mac_cdh.png", units="in", width=6, height=5, res=300, type = "cairo") 297 | netVisual_aggregate(core_cc, signaling = c("CDH"), layout = "chord",color.use = c("core" = "#4DBBD5FF", 298 | "macrophage" = "#cfa42d")) 299 | dev.off() 300 | 301 | 302 | #cellchat with edge cancer cells, macropahges, ecmmyCAFs and cytotoxic CD8T cells 303 | edge_obj <- subset(merged_obj, idents = c("edge", "macrophage","cytotoxic.CD8..T.", 304 | "ecm.myCAF")) 305 | edge_cc <- createCellChat(edge_obj) 306 | edge_cc@DB <- CellChatDB.use 307 | edge_cc <- subsetData(edge_cc) 308 | edge_cc <- identifyOverExpressedGenes(edge_cc) 309 | edge_cc <- identifyOverExpressedInteractions(edge_cc) 310 | 311 | edge_cc <- computeCommunProb(edge_cc) 312 | edge_cc <- filterCommunication(edge_cc, min.cells = 10) 313 | edge_cc <- computeCommunProbPathway(edge_cc) 314 | edge_cc <- aggregateNet(edge_cc) 315 | edge_cc <- netAnalysis_computeCentrality(edge_cc, slot.name = "netP") 316 | 317 | #make a plot for edge signaling to macrophages, CD8 T cells, and ECM myCAFs, in prominent edge pathways 318 | png("other_celltypes/edge_APP.png", units="in", width=10, height=10, res=300, type = "cairo") 319 | netVisual_aggregate(edge_cc, signaling = c("APP"), layout = "chord",color.use = c("edge" = "#E64B35FF", 320 | "macrophage" = "#cfa42d", 321 | "cytotoxic.CD8..T." = "#79577c", 322 | "ecm.myCAF" = "#3f908d")) 323 | dev.off() 324 | 325 | png("other_celltypes/edge_LAMININ.png", units="in", width=10, height=10, res=300, type = "cairo") 326 | netVisual_aggregate(edge_cc, signaling = c("LAMININ"), layout = "chord",color.use = c("edge" = "#E64B35FF", 327 | "macrophage" = "#cfa42d", 328 | "cytotoxic.CD8..T." = "#79577c", 329 | "ecm.myCAF" = "#3f908d")) 330 | dev.off() 331 | 332 | png("other_celltypes/edge_COLLAGEN.png", units="in", width=10, height=10, res=300, type = "cairo") 333 | netVisual_aggregate(edge_cc, signaling = c("COLLAGEN"), layout = "chord",color.use = c("edge" = "#E64B35FF", 334 | "macrophage" = "#cfa42d", 335 | "cytotoxic.CD8..T." = "#79577c", 336 | "ecm.myCAF" = "#3f908d")) 337 | dev.off() 338 | -------------------------------------------------------------------------------- /Figure 4/Classifier Training/script.R: -------------------------------------------------------------------------------- 1 | library("hardhat") 2 | library("scPred") 3 | library("Seurat") 4 | library("magrittr") 5 | library(tibble) 6 | 7 | load("annotated_objects.Robj") 8 | 9 | #reduce all objects 10 | SCC_all_samples <- purrr::reduce(annotated_objects,merge) 11 | 12 | #load in cancer cell annotations 13 | load(file = "comb.Robj") 14 | comb@meta.data$cluster_annotations = comb@active.ident 15 | cancer_cell_meta <-comb@meta.data[,colnames(comb@meta.data) %in% c("new_annotations_trial","cluster_annotations")] 16 | 17 | #transfer cancer cell state information, core, transitory, and edge 18 | full_meta <- list(SCC_all_samples@meta.data, cancer_cell_meta) %>% 19 | purrr::map(~ .x %>% 20 | as.data.frame %>% 21 | rownames_to_column('rn')) %>% 22 | purrr::reduce(left_join, by = 'rn') %>% 23 | column_to_rownames('rn') 24 | 25 | full_meta$cluster_annotations <- as.character(full_meta$cluster_annotations) 26 | #label noncancer cells 27 | full_meta$cluster_annotations[is.na(full_meta$cluster_annotations)] <- "noncancer" 28 | 29 | full_meta$cluster_annotations <- factor(full_meta$cluster_annotations,levels = unique(full_meta$cluster_annotations)) 30 | 31 | #transfer annotations to object 32 | SCC_all_samples@meta.data <- full_meta 33 | 34 | #generate reference 35 | reference <- SCC_all_samples 36 | DefaultAssay(reference) <- "Spatial" 37 | reference <- reference %>% 38 | Seurat::NormalizeData(verbose = FALSE) %>% 39 | FindVariableFeatures(selection.method = "vst", nfeatures = 2000) %>% 40 | ScaleData(verbose = FALSE) %>% 41 | RunHarmony("sample_id") %>% 42 | RunPCA(pc.genes = reference@var.genes, npcs = 20, verbose = FALSE) %>% 43 | RunUMAP(reduction = "harmony", dims = 1:20) %>% 44 | FindNeighbors(reduction = "harmony", dims = 1:20) %>% 45 | FindClusters(resolution = 0.5) %>% 46 | identity() 47 | 48 | reference <- getFeatureSpace(reference, "cluster_annotations") 49 | rm(list=setdiff(ls(), "reference")) 50 | #train model on avNNet, svmRadial, and nb 51 | reference_avNNet <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "avNNet",number = 10) 52 | reference_svmRadial <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "svmRadial",number = 10) 53 | reference_nb <- trainModel(reference,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "nb",number = 10) 54 | 55 | save(reference_avNNet, file = "reference_avNNet.Robj") 56 | save(reference_svmRadial, file = "reference_svmRadial.Robj") 57 | save(reference_nb, file = "reference_nb.Robj") 58 | load(file = "reference_avNNet.Robj") 59 | load(file = "reference_svmRadial.Robj") 60 | load(file = "reference_nb.Robj") 61 | 62 | get_scpred(reference_avNNet) 63 | get_scpred(reference_svmRadial) 64 | get_scpred(reference_nb) 65 | 66 | #based on the best models for each subtype, we should do: 67 | #core: svm 68 | #edge: svm 69 | #transitory: avNNet 70 | #noncancer: svm 71 | 72 | #reclassify svmRadial with avNNet for transitory 73 | reference <- trainModel(reference_svmRadial,allowParallel = T,preProcess = c("center", "scale","YeoJohnson"),seed = 21,model = "avNNet",reclassify = c("transitory"),number = 10) 74 | save(reference, file = "reference.Robj") 75 | rm(list=setdiff(ls(), "reference")) 76 | ### On server 77 | 78 | library(Seurat) 79 | library("scPred") 80 | library("magrittr") 81 | load(file = "reference.Robj") 82 | 83 | get_scpred(reference) 84 | model <- get_scpred(reference) 85 | save(model,file = "model.Robj") 86 | #plot model statistics 87 | png(file = "plot_probabilities_reference_1.png", 88 | width = 6, height = 6, units = "in", res = 300,type = "cairo") 89 | plot_probabilities(reference) 90 | dev.off() 91 | 92 | #load all cancers and predict cancer cell states on them 93 | 94 | ## CHC 95 | load(file = "objects/CHC.Robj") 96 | query = CHC 97 | query <- NormalizeData(query) 98 | query <- scPredict(query, reference, threshold = 0.2) 99 | save(query, file = "processed_data/CHC.Robj") 100 | 101 | ##CHC1_T 102 | load(file = "objects/CHC1_T.Robj") 103 | query = CHC1_T 104 | query <- NormalizeData(query) 105 | query <- scPredict(query, reference, threshold = 0.2) 106 | save(query, file = "processed_data/CHC1_T.Robj") 107 | 108 | 109 | #crc 110 | load(file = "objects/crc.Robj") 111 | query = crc 112 | query <- NormalizeData(query) 113 | query <- scPredict(query, reference, threshold = 0.2) 114 | save(query, file = "processed_data/crc.Robj") 115 | 116 | #gbm 117 | load(file = "objects/gbm.Robj") 118 | query = gbm 119 | query <- NormalizeData(query) 120 | query <- scPredict(query, reference, threshold = 0.2) 121 | save(query, file = "processed_data/gbm.Robj") 122 | 123 | 124 | #HBC_IDC 125 | load(file = "objects/HBC_IDC.Robj") 126 | query = HBC_IDC 127 | query <- NormalizeData(query) 128 | query <- scPredict(query, reference, threshold = 0.2) 129 | save(query, file = "processed_data/HBC_IDC.Robj") 130 | 131 | #HBC_ILC 132 | load(file = "objects/HBC_ILC.Robj") 133 | query = HBC_ILC 134 | query <- NormalizeData(query) 135 | query <- scPredict(query, reference, threshold = 0.2) 136 | save(query, file = "processed_data/HBC_ILC.Robj") 137 | 138 | 139 | #HCC1_T 140 | load(file = "objects/HCC1_T.Robj") 141 | query = HCC1_T 142 | query <- NormalizeData(query) 143 | query <- scPredict(query, reference, threshold = 0.2) 144 | save(query, file = "processed_data/HCC1_T.Robj") 145 | 146 | #HCC1 147 | load(file = "objects/HCC1.Robj") 148 | query = HCC1 149 | query <- NormalizeData(query) 150 | query <- scPredict(query, reference, threshold = 0.2) 151 | save(query, file = "processed_data/HCC1.Robj") 152 | 153 | #HCC2 154 | load(file = "objects/HCC2_T.Robj") 155 | query = HCC2_T 156 | query <- NormalizeData(query) 157 | query <- scPredict(query, reference, threshold = 0.2) 158 | save(query, file = "processed_data/HCC2_T.Robj") 159 | 160 | #ICC 161 | load(file = "objects/ICC.Robj") 162 | query = ICC 163 | query <- NormalizeData(query) 164 | query <- scPredict(query, reference, threshold = 0.2) 165 | save(query, file = "processed_data/ICC.Robj") 166 | 167 | #OV 168 | load(file = "objects/OV.Robj") 169 | query = OV 170 | query <- NormalizeData(query) 171 | query <- scPredict(query, reference, threshold = 0.2) 172 | save(query, file = "processed_data/OV.Robj") 173 | 174 | #p4_scc 175 | load(file = "objects/p4_scc.Robj") 176 | query = p4_scc 177 | query <- NormalizeData(query) 178 | query <- scPredict(query, reference, threshold = 0.2) 179 | save(query, file = "processed_data/p4_cscc.Robj") 180 | 181 | #p6_scc 182 | load(file = "objects/p6_scc.Robj") 183 | query = p6_scc 184 | query <- NormalizeData(query) 185 | query <- scPredict(query, reference, threshold = 0.2) 186 | save(query, file = "processed_data/p6_cscc.Robj") 187 | 188 | rm(list=setdiff(ls(), "reference")) 189 | 190 | load(file = "objects/PDAC_A.Robj") 191 | query = PDAC_A 192 | query <- NormalizeData(query) 193 | query <- scPredict(query, reference, threshold = 0.2) 194 | save(query, file = "processed_data/PDAC_A.Robj") 195 | 196 | load(file = "objects/PDAC_B.Robj") 197 | query = PDAC_B 198 | query <- NormalizeData(query) 199 | query <- scPredict(query, reference, threshold = 0.2) 200 | save(query, file = "processed_data/PDAC_B.Robj") 201 | 202 | load(file = "objects/PDAC_C.Robj") 203 | query = PDAC_C 204 | query <- NormalizeData(query) 205 | query <- scPredict(query, reference, threshold = 0.2) 206 | save(query, file = "processed_data/PDAC_C.Robj") 207 | 208 | load(file = "objects/cscc1.Robj") 209 | query = cscc1 210 | query <- NormalizeData(query) 211 | query <- scPredict(query, reference, threshold = 0.2) 212 | save(query, file = "processed_data/cscc1.Robj") 213 | 214 | load(file = "objects/cscc2.Robj") 215 | query = cscc2 216 | query <- NormalizeData(query) 217 | query <- scPredict(query, reference, threshold = 0.2) 218 | save(query, file = "processed_data/cscc2.Robj") 219 | 220 | load(file = "objects/cscc3.Robj") 221 | query = cscc3 222 | query <- NormalizeData(query) 223 | query <- scPredict(query, reference, threshold = 0.2) 224 | save(query, file = "processed_data/cscc3.Robj") 225 | 226 | load(file = "objects/cscc4.Robj") 227 | query = cscc4 228 | query <- NormalizeData(query) 229 | query <- scPredict(query, reference, threshold = 0.2) 230 | save(query, file = "processed_data/cscc4.Robj") 231 | 232 | rm(list=setdiff(ls(), "reference")) 233 | 234 | load(file = "objects/cervical_scc.Robj") 235 | query = cervical_scc 236 | query <- NormalizeData(query) 237 | query <- scPredict(query, reference, threshold = 0.2) 238 | save(query, file = "processed_data/cervical_scc.Robj") 239 | 240 | load(file = "objects/intestinal.Robj") 241 | query = intestinal 242 | query <- NormalizeData(query) 243 | query <- scPredict(query, reference, threshold = 0.2) 244 | save(query, file = "processed_data/intestinals.Robj") 245 | 246 | load(file = "objects/prostate.Robj") 247 | query = prostate 248 | query <- NormalizeData(query) 249 | query <- scPredict(query, reference, threshold = 0.2) 250 | save(query, file = "processed_data/prostate.Robj") 251 | 252 | load(file = "objects/prostate_acinar.Robj") 253 | query = prostate_acinar 254 | query <- NormalizeData(query) 255 | query <- scPredict(query, reference, threshold = 0.2) 256 | save(query, file = "processed_data/prostate_acinar.Robj") 257 | 258 | load(file = "objects/lung_scc.Robj") 259 | query = lung_scc 260 | query <- NormalizeData(query) 261 | query <- scPredict(query, reference, threshold = 0.2) 262 | save(query, file = "processed_data/lung_scc.Robj") 263 | 264 | load(file = "objects/melanoma.Robj") 265 | query = melanoma 266 | query <- NormalizeData(query) 267 | query <- scPredict(query, reference, threshold = 0.2) 268 | save(query, file = "processed_data/melanoma.Robj") 269 | 270 | rm(list=setdiff(ls(), "reference")) 271 | 272 | load(file = "objects/s1_paediatric_medulloblastoma.Robj") 273 | query = s1_paediatric_medulloblastoma 274 | query <- NormalizeData(query) 275 | query <- scPredict(query, reference, threshold = 0.2) 276 | save(query, file = "processed_data/s1_paediatric_medulloblastoma.Robj") 277 | 278 | load(file = "objects/s2_paediatric_medulloblastoma.Robj") 279 | query = s2_paediatric_medulloblastoma 280 | query <- NormalizeData(query) 281 | query <- scPredict(query, reference, threshold = 0.2) 282 | save(query, file = "processed_data/s2_paediatric_medulloblastoma.Robj") 283 | 284 | load(file = "objects/s3_paediatric_CNS_embryonal.Robj") 285 | query = s3_paediatric_CNS_embryonal 286 | query <- NormalizeData(query) 287 | query <- scPredict(query, reference, threshold = 0.2) 288 | save(query, file = "processed_data/s3_paediatric_CNS_embryonal.Robj") 289 | 290 | 291 | -------------------------------------------------------------------------------- /Figure 4/Plotting/analysis.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | nm <- list.files(path="processed_data/") 3 | 4 | df_full <- data.frame() 5 | 6 | #create a full 3x13 figure 7 | 8 | for (file in nm) { 9 | load(paste("processed_data/",file, sep = "")) 10 | df <- query@meta.data 11 | df <- data.frame(df$scpred_prediction,stringr::str_split(file,".R")[[1]][1]) 12 | df_full <- rbind(df_full, df) 13 | } 14 | 15 | names(df_full) = c("core_edge","dataset") 16 | library(ggplot2) 17 | 18 | library(tidyr) 19 | library(dplyr) 20 | df_full$core_edge <- factor(df_full$core_edge, levels = c("core","transitory","edge","noncancer")) 21 | 22 | library(tidyr) 23 | library(dplyr) 24 | df_count <- df_full%>% 25 | group_by( dataset,core_edge) %>% 26 | summarize(count = n()) 27 | 28 | png("scpred.png",units = "in", res = 300, width = 8, height = 5, type = "cairo") 29 | ggplot(df_count,aes(x = dataset, y = count, fill = core_edge)) + 30 | geom_bar(position="fill", stat="identity")+ 31 | theme(panel.grid.major = element_blank(), 32 | panel.grid.minor = element_blank(), 33 | panel.border = element_blank(), panel.background = element_blank()) + 34 | guides(fill = guide_legend(title = 'cell type')) + 35 | scale_fill_manual(values = c("core" = "#4DBBD5FF", 36 | "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+ 37 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 38 | dev.off() 39 | 40 | #cluster on composition simillarity 41 | library(tidyverse) 42 | library(ggdendro) 43 | library(vegan) 44 | library(colorspace) 45 | library(cowplot) 46 | t <- reshape2::dcast(df_count, dataset ~ core_edge, fun.aggregate = sum) 47 | t <- t %>% 48 | group_by(dataset) %>% 49 | mutate(sum = as.character(rowSums(select(cur_data(), is.numeric)))) %>% 50 | summarise_if(is.numeric, ~ . / as.numeric(sum)) %>% tibble::column_to_rownames("dataset") 51 | hc=hclust(dist(t),method="ward.D2") 52 | hc=reorder(hc,wts=-as.matrix(t)%*%seq(ncol(t))^2) # vegan::reorder.hclust 53 | tree=ggdendro::dendro_data(as.dendrogram(hc),type="rectangle") 54 | 55 | p1=ggplot(ggdendro::segment(tree))+ 56 | geom_segment(aes(x=y,y=x,xend=yend,yend=xend),lineend="round",size=.4)+ 57 | scale_x_continuous(expand=expansion(add=c(0,.01)))+ # don't crop half of line between top-level nodes 58 | scale_y_continuous(limits=.5+c(0,nrow(t)),expand=c(0,0))+ 59 | theme( 60 | axis.text=element_blank(), 61 | axis.ticks=element_blank(), 62 | axis.ticks.length=unit(0,"pt"), # remove extra space occupied by ticks 63 | axis.title=element_blank(), 64 | panel.background=element_rect(fill="white"), 65 | panel.grid=element_blank(), 66 | plot.margin=margin(5,5,5,0) 67 | ) 68 | 69 | t=t[hc$labels[hc$order],] 70 | t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t))) 71 | lab=round(100*t2$V3) 72 | lab[lab==0]="" 73 | 74 | t2$V2 <- factor(t2$V2, levels = c("core","transitory","edge","noncancer")) 75 | p2=ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+ 76 | geom_bar(stat="identity",width=1,position=position_fill(reverse=T))+ 77 | geom_text(aes(label=lab),position=position_stack(vjust=.5,reverse=T),size=3.5)+ 78 | coord_flip()+ 79 | scale_x_discrete(expand=c(0,0))+ 80 | scale_y_discrete(expand=c(0,0))+ 81 | scale_fill_manual(values = c("core" = "#4DBBD5FF", 82 | "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+ 83 | theme( 84 | axis.text=element_text(color="black",size=11), 85 | axis.text.x=element_blank(), 86 | axis.ticks=element_blank(), 87 | axis.title=element_blank(), 88 | legend.position="none", 89 | plot.margin=margin(5,0,5,5) 90 | ) 91 | 92 | png("scpred_clustered.png",units = "in", res = 300, width = 7, height = 10, type = "cairo") 93 | cowplot::plot_grid(p2,p1,rel_widths=c(1,.2)) 94 | dev.off() 95 | 96 | # #cluster plot for only core, transitory, and edge 97 | # library(tidyverse) 98 | # library(ggdendro) 99 | # library(vegan) 100 | # library(colorspace) 101 | # library(cowplot) 102 | # df_full_cc <- subset(df_full, df_full$core_edge %in% c("core","edge","transitory")) 103 | # df_count_cc <- df_full_cc%>% 104 | # group_by( dataset,core_edge) %>% 105 | # summarize(count = n()) 106 | # 107 | # t <- reshape2::dcast(df_count_cc, dataset ~ core_edge, fun.aggregate = sum) 108 | # t <- t %>% 109 | # group_by(dataset) %>% 110 | # mutate(sum = as.character(rowSums(select(cur_data(), is.numeric)))) %>% 111 | # summarise_if(is.numeric, ~ . / as.numeric(sum)) %>% tibble::column_to_rownames("dataset") 112 | # hc=hclust(dist(t),method="ward.D2") 113 | # hc=reorder(hc,wts=-as.matrix(t)%*%seq(ncol(t))^2) # vegan::reorder.hclust 114 | # tree=ggdendro::dendro_data(as.dendrogram(hc),type="rectangle") 115 | # 116 | # p1=ggplot(ggdendro::segment(tree))+ 117 | # geom_segment(aes(x=y,y=x,xend=yend,yend=xend),lineend="round",size=.4)+ 118 | # scale_x_continuous(expand=expansion(add=c(0,.01)))+ # don't crop half of line between top-level nodes 119 | # scale_y_continuous(limits=.5+c(0,nrow(t)),expand=c(0,0))+ 120 | # theme( 121 | # axis.text=element_blank(), 122 | # axis.ticks=element_blank(), 123 | # axis.ticks.length=unit(0,"pt"), # remove extra space occupied by ticks 124 | # axis.title=element_blank(), 125 | # panel.background=element_rect(fill="white"), 126 | # panel.grid=element_blank(), 127 | # plot.margin=margin(5,5,5,0) 128 | # ) 129 | # 130 | # t=t[hc$labels[hc$order],] 131 | # t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t))) 132 | # lab=round(100*t2$V3) 133 | # lab[lab==0]="" 134 | # 135 | # t2$V2 <- factor(t2$V2, levels = c("core","transitory","edge","noncancer")) 136 | # p2=ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+ 137 | # geom_bar(stat="identity",width=1,position=position_fill(reverse=T))+ 138 | # geom_text(aes(label=lab),position=position_stack(vjust=.5,reverse=T),size=3.5)+ 139 | # coord_flip()+ 140 | # scale_x_discrete(expand=c(0,0))+ 141 | # scale_y_discrete(expand=c(0,0))+ 142 | # scale_fill_manual(values = c("core" = "#4DBBD5FF", 143 | # "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+ 144 | # theme( 145 | # axis.text=element_text(color="black",size=11), 146 | # axis.text.x=element_blank(), 147 | # axis.ticks=element_blank(), 148 | # axis.title=element_blank(), 149 | # legend.position="none", 150 | # plot.margin=margin(5,0,5,5) 151 | # ) 152 | # 153 | # cowplot::plot_grid(p2,p1,rel_widths=c(1,.4)) 154 | 155 | 156 | #plot classification of every single sample 157 | library(ggplot2) 158 | library(stringr) 159 | p <- list() 160 | i =1 161 | 162 | for (file in nm) { 163 | load(paste("processed_data/",file, sep = "")) 164 | 165 | p1 = SpatialDimPlot(query, alpha = c(0,0)) +theme(legend.position="none") + ggtitle(label = strsplit(file[1], ".", fixed = TRUE)[[1]][1]) 166 | 167 | p2 = SpatialDimPlot(query, image.alpha = 0.5, group.by = "scpred_prediction", pt.size.factor = 2, 168 | cols = c("core" = "#4DBBD5FF", 169 | "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"), stroke = 0) +theme(legend.position="none") 170 | 171 | #find clusters 172 | query <- FindVariableFeatures(query) 173 | query <- ScaleData(query) 174 | query <- RunPCA(query) 175 | query <- RunUMAP(query,dims = 1:15,reduction = "pca") 176 | p3 = DimPlot(query, group.by = "scpred_prediction", 177 | cols = c("core" = "#4DBBD5FF", 178 | "transitory" = "#F9E076", "edge" = "#E64B35FF", "noncancer" = "lightgray"))+theme(legend.position="none")+ theme( 179 | plot.title = element_blank(),axis.title.x = element_blank(), 180 | axis.title.y = element_blank())+ 181 | theme(axis.ticks.x = element_blank(), 182 | axis.text.x = element_blank()) + 183 | theme(axis.ticks.y = element_blank(), 184 | axis.text.y = element_blank()) 185 | all_plots <- p1 + p2 + p3 186 | p[[i]] = all_plots 187 | i = i + 1 188 | 189 | 190 | } 191 | 192 | library(gridExtra) 193 | library(patchwork) 194 | png("scpred_full.png",units = "in", res = 300, width = 12, height = 20, type = "cairo") 195 | wrap_plots(p, ncol = 3) 196 | dev.off() 197 | 198 | #plot some examples 199 | png("scpred_example_cscc.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 200 | p[[20]] 201 | dev.off() 202 | 203 | png("scpred_example_cervicalscc.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 204 | p[[1]] 205 | dev.off() 206 | 207 | png("scpred_example_melanoma.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 208 | p[[18]] 209 | dev.off() 210 | 211 | png("scpred_example_colorectal.png",units = "in", res = 300, width = 5, height = 4, type = "cairo") 212 | p[[4]] 213 | dev.off() 214 | -------------------------------------------------------------------------------- /Figure 5/OSCC survival/oscc_tcga.R: -------------------------------------------------------------------------------- 1 | library(UCSCXenaTools) 2 | library(dplyr) 3 | library(survival) 4 | library(survminer) 5 | library(data.table) 6 | library(ggplot2) 7 | library(tibble) 8 | library(stringr) 9 | library(plyr) 10 | library(singscore) 11 | library(pROC) 12 | indication <- "HNSC" 13 | `%!in%` = Negate(`%in%`) 14 | 15 | replacement_dictionary <- c( 16 | "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9", 17 | "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2", 18 | 'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 19 | 'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72') 20 | 21 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X") 22 | 23 | core_genes <- readRDS("core_genes.RDS") 24 | core_genes <- c(core_genes) 25 | for (gene in core_genes) { 26 | if (gene %in% names(replacement_dictionary)) { 27 | core_genes <- core_genes[core_genes != gene] 28 | core_genes <- append(core_genes, replacement_dictionary[[gene]]) 29 | } 30 | } 31 | 32 | core_genes <- core_genes[core_genes %!in% genes_to_remove] 33 | 34 | edge_genes <- readRDS("edge_genes.RDS") 35 | edge_genes <- c(edge_genes) 36 | 37 | for (gene in edge_genes) { 38 | if (gene %in% names(replacement_dictionary)) { 39 | edge_genes <- edge_genes[edge_genes != gene] 40 | edge_genes <- append(edge_genes, replacement_dictionary[[gene]]) 41 | } 42 | } 43 | 44 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove] 45 | 46 | geneset_of_interest <- list(core_genes,edge_genes) 47 | names(geneset_of_interest) <- c("core","edge") 48 | 49 | all_genes <- union(core_genes,edge_genes) 50 | 51 | #load in TCGA OSCC HPV negative codes 52 | sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv") 53 | tcga_oscc_codes <- sample_ids$TCGA_codes 54 | tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = "")) 55 | 56 | #initialize object 57 | Xena_cohort = XenaData %>% 58 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 59 | XenaScan(indication) # select cohort 60 | 61 | #download clinical and survival datasets 62 | cli_query = Xena_cohort %>% 63 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 64 | XenaGenerate() %>% # generate a XenaHub object 65 | XenaQuery() %>% 66 | XenaDownload() 67 | 68 | cli = XenaPrepare(cli_query) 69 | survival_data = cli[[2]] 70 | clinical_data = cli[[1]] 71 | clinical_data$sample = clinical_data$sampleID 72 | 73 | if ("xena_sample" %in% colnames(survival_data)) { 74 | survival_data$sample = survival_data$xena_sample 75 | survival_data$xena_sample = NULL 76 | } 77 | 78 | clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ] 79 | survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ] 80 | 81 | ge <- XenaData %>% 82 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 83 | XenaScan(indication) %>% 84 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 85 | 86 | gene_info_km <- data.frame() 87 | 88 | gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 89 | dataset = ge$XenaDatasets, 90 | samples = tcga_oscc_sampleID, 91 | use_probeMap = F, 92 | check = FALSE, 93 | time_limit = 10000) 94 | 95 | gene_info_km <- as.data.frame(gene_info_km) 96 | 97 | 98 | dir.create("tcga_plots_km") 99 | setwd("tcga_plots_km") 100 | 101 | ###use multiple time methods 102 | time_methods <- c("DSS.time","PFI.time","OS.time") 103 | 104 | 105 | for(time_method in time_methods) { 106 | 107 | rankData <- rankGenes(gene_info_km) 108 | 109 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = F) 110 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = F) 111 | 112 | #calculate gene sets 113 | names(scoredf_core) <- c("core","core_dispersion") 114 | names(scoredf_edge) <- c("edge","edge_dispersion") 115 | 116 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge) 117 | 118 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL)) 119 | 120 | merged_data = scores_df_singshot %>% 121 | left_join(survival_data, by = "sample") %>% 122 | dplyr::select(sample,names(geneset_of_interest), time_method, str_split(time_method,"[.]")[[1]][1]) %>% 123 | left_join(clinical_data, by = "sample") %>% 124 | dplyr::rename(time = time_method, 125 | status = str_split(time_method,"[.]")[[1]][1]) 126 | 127 | #minprop 0.2 128 | cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = c("core","edge"), minprop = 0.2) 129 | 130 | res.cat <- surv_categorize(cut) 131 | 132 | 133 | coxph <- coxph(Surv(time, status) ~ edge, data = res.cat) 134 | kableExtra::as_image(tab::tabcoxph(coxph), file = paste(indication, str_split(time_method,"[.]")[[1]][1],"edge.png",sep = "_")) 135 | 136 | png(paste(indication, str_split(time_method,"[.]")[[1]][1],"edge_km.png",sep = "_"), units="in", width=7, height=5, res=300, type = "cairo") 137 | print(ggsurvplot(survfit(Surv(time, status) ~ edge, data = res.cat), pval = T, conf.int = F, 138 | palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25, 139 | xlab="Time in years", 140 | ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]), 141 | surv.scale="percent")) 142 | dev.off() 143 | 144 | 145 | coxph <- coxph(Surv(time, status) ~ core, data = res.cat) 146 | kableExtra::as_image(tab::tabcoxph(coxph), file = paste(indication, str_split(time_method,"[.]")[[1]][1],"core.png",sep = "_")) 147 | 148 | png(paste(indication, str_split(time_method,"[.]")[[1]][1],"core_km.png",sep = "_"), units="in", width=7, height=5, res=300, type = "cairo") 149 | print(ggsurvplot(survfit(Surv(time, status) ~ core, data = res.cat), pval = TRUE, conf.int = F, 150 | palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25, 151 | xlab="Time in years", 152 | ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]), 153 | surv.scale="percent")) 154 | dev.off() 155 | 156 | 157 | png(paste("figs/",indication, str_split(time_method,"[.]")[[1]][1],"edge_km.png",sep = "_"), units="in", width=6.5 , height=6, res=300, type = "cairo") 158 | print(ggsurvplot(survfit(Surv(time, status) ~ edge, data = res.cat), pval = F, conf.int = F,risk.table = T, 159 | palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25, 160 | xlab="Time in years", 161 | ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]), 162 | surv.scale="percent")) 163 | dev.off() 164 | 165 | png(paste("figs/",indication, str_split(time_method,"[.]")[[1]][1],"core_km.png",sep = "_"), units="in", width=6.5, height=6, res=300, type = "cairo") 166 | print(ggsurvplot(survfit(Surv(time, status) ~ core, data = res.cat), pval = F, conf.int = F,risk.table = T, 167 | palette=c("#EA738DFF","#89ABE3FF"),xscale = 365.25,surv.median.line = "v",break.time.by = 365.25, 168 | xlab="Time in years", 169 | ylab=paste("Probability of",str_split(time_method,"[.]")[[1]][1]), 170 | surv.scale="percent")) 171 | dev.off() 172 | 173 | } 174 | 175 | setwd("..") 176 | 177 | dir.create("other_tcga_plots") 178 | setwd("other_tcga_plots") 179 | 180 | library(immunedeconv) 181 | png("rankplot_core.png", units="in", width=4, height=3, res=300, type = "cairo") 182 | plotRankDensity(rankData[,2,drop = FALSE], upSet = core_genes, isInteractive = F) + scale_color_manual(values = c("#4DBBD5FF")) 183 | dev.off() 184 | 185 | png("rankplot_edge.png", units="in", width=4, height=3, res=300, type = "cairo") 186 | plotRankDensity(rankData[,2,drop = FALSE], upSet = edge_genes, isInteractive = F) + scale_color_manual(values = c("#E64B35FF")) 187 | dev.off() 188 | 189 | res = deconvolute(gene_info_km, "epic") 190 | 191 | t_res = as.data.frame(t(res)) 192 | names(t_res) <- lapply(t_res[1, ], as.character) 193 | t_res <- t_res[-1, ] 194 | 195 | merged_data = scores_df_singshot %>% 196 | left_join(clinical_data, by = "sample") 197 | 198 | rownames(merged_data) <- merged_data$sample 199 | 200 | full_df_deconv <- merge(t_res, merged_data, by = 0, all = TRUE) 201 | png("core_edge_correlation.png", units="in", width=5, height=5, res=300, type = "cairo") 202 | 203 | #correlation core edge 204 | sp <- ggscatter(merged_data, x = "edge", y = "core", 205 | add = "reg.line", # Add regression line 206 | add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line 207 | conf.int = TRUE # Add confidence interval 208 | ) 209 | # Add correlation coefficient 210 | sp + stat_cor(method = "pearson", label.x = 0.87, label.y = 0.3) 211 | dev.off() 212 | library(rstatix) 213 | 214 | ### 215 | full_df_deconv$CAF <- full_df_deconv$`Cancer associated fibroblast` 216 | full_df_deconv$CAF <- as.numeric(full_df_deconv$CAF) 217 | png("CAF_edge_correlation.png", units="in", width=6, height=5, res=300, type = "cairo") 218 | 219 | #correlation edge CAF 220 | sp <- ggscatter(full_df_deconv, x = "edge", y = "CAF", 221 | add = "reg.line", # Add regressin line 222 | add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line 223 | conf.int = TRUE # Add confidence interval 224 | ) 225 | # Add correlation coefficient 226 | sp + stat_cor(method = "pearson", label.x = 0.87, label.y = 0.01) 227 | dev.off() 228 | 229 | give.n <- function(x){ 230 | return(c(y = mean(x), label = length(x))) 231 | } 232 | 233 | merged_data_stage <- merged_data[merged_data$pathologic_stage %in% c("Stage I","Stage II","Stage III", 234 | "Stage IVA","Stage IVB"),] 235 | 236 | png("stage_plot_core.png", units="in", width=7, height=5, res=300, type = "cairo") 237 | ggplot(merged_data_stage, aes_string(x = "pathologic_stage", y = "core", fill = "pathologic_stage")) + 238 | geom_boxplot(alpha=0.7)+ 239 | stat_summary(fun.data = give.n, geom = "text") + 240 | scale_y_continuous(name = "gene expression RNAseq") + 241 | scale_x_discrete(name = "Gene") + 242 | theme_bw() + 243 | ggpubr::stat_compare_means() 244 | dev.off() 245 | 246 | png("stage_plot_edge.png", units="in", width=7, height=5, res=300, type = "cairo") 247 | ggplot(merged_data_stage, aes_string(x = "pathologic_stage", y = "edge", fill = "pathologic_stage")) + 248 | geom_boxplot(alpha=0.7)+ 249 | stat_summary(fun.data = give.n, geom = "text") + 250 | scale_y_continuous(name = "gene expression RNAseq") + 251 | scale_x_discrete(name = "Gene") + 252 | theme_bw() + 253 | ggpubr::stat_compare_means() 254 | dev.off() 255 | 256 | library(RColorBrewer) 257 | 258 | ###comparison across clinical characteristics 259 | 260 | merged_data = scores_df_singshot %>% 261 | left_join(survival_data, by = "sample") %>% 262 | dplyr::select(sample,names(geneset_of_interest), time_method, str_split(time_method,"[.]")[[1]][1]) %>% 263 | left_join(clinical_data, by = "sample") %>% 264 | dplyr::rename(time = time_method, 265 | status = str_split(time_method,"[.]")[[1]][1]) 266 | 267 | edited_data <- merged_data %>% 268 | mutate(margin_status = replace(margin_status,margin_status %in% c("Positive"), "present")) %>% 269 | mutate(pathologic_M = replace(pathologic_M,pathologic_M %in% c("MX"), "present")) %>% 270 | mutate(pathologic_N = replace(pathologic_N,pathologic_N %in% c("N2a","N2b","N2","N2c"), "present")) %>% 271 | mutate(pathologic_T = replace(pathologic_T,pathologic_T %in% c("T3","T4","T4a"), "present")) %>% 272 | mutate(presence_of_pathological_nodal_extracapsular_spread = replace(presence_of_pathological_nodal_extracapsular_spread, 273 | presence_of_pathological_nodal_extracapsular_spread %in% c("Gross Extension","Microscopic Extension"), "present")) %>% 274 | mutate(lymphovascular_invasion_present = replace(lymphovascular_invasion_present,lymphovascular_invasion_present %in% c("YES"), "present")) %>% 275 | mutate(neoplasm_histologic_grade = replace(neoplasm_histologic_grade,neoplasm_histologic_grade %in% c("G3"), "present")) %>% 276 | select(edge,core,pathologic_M,pathologic_T,pathologic_N, 277 | lymphovascular_invasion_present,neoplasm_histologic_grade, margin_status, 278 | presence_of_pathological_nodal_extracapsular_spread) %>% 279 | melt(id.vars = c('core',"edge"), measure.vars = c("pathologic_M","pathologic_T","pathologic_N", 280 | "lymphovascular_invasion_present","neoplasm_histologic_grade", "margin_status", 281 | "presence_of_pathological_nodal_extracapsular_spread"))%>% na.omit 282 | 283 | edited_data$value[!grepl('present', edited_data$value)] <- "absent" 284 | 285 | ### 286 | 287 | signif_abbreviation <- function(p_values) { 288 | sapply(p_values, function(x) { 289 | if (is.na(x)) { 290 | return(NA) 291 | } else if (x < 0.0001) { 292 | return("****") 293 | } else if (x < 0.001) { 294 | return("***") 295 | } else if (x < 0.01) { 296 | return("**") 297 | } else if (x < 0.05) { 298 | return("*") 299 | } else if (x > 0.05) { 300 | return("NS") 301 | } else { 302 | return("NS") 303 | } 304 | }) 305 | } 306 | 307 | wilcox_tests <- edited_data %>% 308 | group_by(variable) %>% 309 | do(tidy(wilcox.test(edge ~ value, data = .))) 310 | 311 | wilcox_tests$p.adj <- p.adjust(wilcox_tests$p.value, method = "BH") 312 | 313 | wilcox_tests <- wilcox_tests %>% 314 | mutate(p.adj.signif = as.character(signif_abbreviation(p.adj))) 315 | 316 | edited_data_with_p_adj <- left_join(edited_data, wilcox_tests, by = "variable") 317 | 318 | 319 | png("comparision_plot_edge.png", units="in", width=6, height=6.5, res=300, type = "cairo") 320 | 321 | ggplot(edited_data_with_p_adj, aes(x = variable, y = edge), fill = value)+ 322 | geom_boxplot(outlier.shape = NA,aes(fill = factor(value)), alpha = 0.5)+ 323 | scale_fill_manual(values = c('present' = '#Fa5d0f', 324 | 'absent' = '#2B1055'))+ 325 | theme_bw() + 326 | theme(legend.position = "bottom") + 327 | guides(color = guide_legend(title = "Sample")) + 328 | geom_text(data = edited_data_with_p_adj, aes(x = variable, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") + 329 | theme(axis.text.x = element_text(angle = 45, hjust=1)) + # Bold axis labels 330 | theme(plot.margin = unit(c(1, 5, 1, 1), "lines"), 331 | axis.title = element_text(face = "bold"), 332 | axis.text = element_text(size = 10, face = "bold"), 333 | axis.ticks.y = element_blank(), panel.border = element_rect(linewidth = 0)) 334 | dev.off() 335 | 336 | wilcox_tests <- edited_data %>% 337 | group_by(variable) %>% 338 | do(tidy(wilcox.test(core ~ value, data = .))) 339 | 340 | wilcox_tests$p.adj <- p.adjust(wilcox_tests$p.value, method = "BH") 341 | 342 | wilcox_tests <- wilcox_tests %>% 343 | mutate(p.adj.signif = as.character(signif_abbreviation(p.adj))) 344 | 345 | edited_data_with_p_adj <- left_join(edited_data, wilcox_tests, by = "variable") 346 | 347 | 348 | png("comparision_plot_core.png", units="in", width=6, height=6.5, res=300, type = "cairo") 349 | ggplot(edited_data, aes(x = variable, y = core), fill = value)+ 350 | geom_boxplot(outlier.shape = NA,aes(fill = factor(value)), alpha = 0.5)+ 351 | scale_fill_manual(values = c('present' = '#Fa5d0f', 352 | 'absent' = '#2B1055'))+ 353 | theme_bw() + 354 | theme(legend.position = "bottom") + 355 | guides(color = guide_legend(title = "Sample")) + 356 | geom_text(data = edited_data_with_p_adj, aes(x = variable, y = Inf, label = p.adj.signif), vjust = 1.5, hjust = 0.5, fontface = "bold") + 357 | theme(axis.text.x = element_text(angle = 45, hjust=1)) + # Bold axis labels 358 | theme(plot.margin = unit(c(1, 5, 1, 1), "lines"), 359 | axis.title = element_text(face = "bold"), 360 | axis.text = element_text(size = 10, face = "bold"), 361 | axis.ticks.y = element_blank(), panel.border = element_rect(linewidth = 0)) 362 | dev.off() 363 | 364 | 365 | 366 | 367 | -------------------------------------------------------------------------------- /Figure 5/Pan-cancer survival/Survival_pan_cancer.R: -------------------------------------------------------------------------------- 1 | library(UCSCXenaTools) 2 | library(dplyr) 3 | library(survival) 4 | library(survminer) 5 | library(data.table) 6 | library(ggplot2) 7 | library(tibble) 8 | library(stringr) 9 | library(plyr) 10 | library(singscore) 11 | library(pROC) 12 | indication <- "HNSC" 13 | `%!in%` = Negate(`%in%`) 14 | 15 | #replace gene names to gene names that exist in TCGA 16 | replacement_dictionary <- c( 17 | "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9", 18 | "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2", 19 | 'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 20 | 'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72') 21 | 22 | #remove genes not in TCGA 23 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X") 24 | 25 | 26 | core_genes <- readRDS("core_genes.RDS") 27 | 28 | for (gene in core_genes) { 29 | if (gene %in% names(replacement_dictionary)) { 30 | core_genes <- core_genes[core_genes != gene] 31 | core_genes <- append(core_genes, replacement_dictionary[[gene]]) 32 | } 33 | } 34 | 35 | core_genes <- core_genes[core_genes %!in% genes_to_remove] 36 | 37 | edge_genes <- readRDS("edge_genes.RDS") 38 | 39 | for (gene in edge_genes) { 40 | if (gene %in% names(replacement_dictionary)) { 41 | edge_genes <- edge_genes[edge_genes != gene] 42 | edge_genes <- append(edge_genes, replacement_dictionary[[gene]]) 43 | } 44 | } 45 | 46 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove] 47 | 48 | geneset_of_interest <- list(core_genes,edge_genes) 49 | names(geneset_of_interest) <- c("core","edge") 50 | 51 | all_genes <- union(core_genes,edge_genes) 52 | 53 | 54 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC", 55 | "BLCA","STAD","THCA","PRAD", 56 | "CESC","GBMLGG","KIRP","MESO", 57 | "KIRC","OV","SARC","SKCM") 58 | 59 | #make a df to keep track of geneset, indication, pval, worse outcome, and hazard ratio 60 | df <- data.frame (geneset = rep(names(geneset_of_interest),length(cancer_indications)), 61 | indication = rep(cancer_indications,each = length(geneset_of_interest)), 62 | p_val = 1,worse_outcome = 0, hazard_ratio = 0 63 | ) 64 | 65 | for (indication in cancer_indications) { 66 | #initialize object 67 | if (indication != "OSCC") { 68 | Xena_cohort = XenaData %>% 69 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 70 | XenaScan(indication) # select cohort 71 | 72 | #download clinical and survival datasets 73 | cli_query = Xena_cohort %>% 74 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 75 | XenaGenerate() %>% # generate a XenaHub object 76 | XenaQuery() %>% 77 | XenaDownload() 78 | 79 | if (indication != "OV") { 80 | cli = XenaPrepare(cli_query) 81 | survival_data = cli[[2]] 82 | clinical_data = cli[[1]] 83 | clinical_data$sample = clinical_data$sampleID 84 | } else { 85 | cli = XenaPrepare(cli_query) 86 | survival_data = cli[[3]] 87 | clinical_data = cli[[1]] 88 | clinical_data$sample = clinical_data$sampleID 89 | } 90 | 91 | if ("xena_sample" %in% colnames(survival_data)) { 92 | survival_data$sample = survival_data$xena_sample 93 | survival_data$xena_sample = NULL 94 | } 95 | 96 | #process exceptions 97 | if (indication == "OV") { 98 | ge <- XenaData %>% 99 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 100 | XenaScan(indication) %>% 101 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 102 | } else { 103 | if (indication == "STAD") { 104 | ge <- XenaData %>% 105 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 106 | XenaScan(indication) %>% 107 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 108 | ge <- ge[2,] 109 | } else { 110 | ge <- XenaData %>% 111 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 112 | XenaScan(indication) %>% 113 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 114 | } 115 | } 116 | 117 | if (indication == "SARC") { 118 | ge = ge[1,] 119 | } } else { 120 | sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv") 121 | tcga_oscc_codes <- sample_ids$TCGA_codes 122 | tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = "")) 123 | 124 | #initialize object 125 | Xena_cohort = XenaData %>% 126 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 127 | XenaScan("HNSC") # select cohort 128 | 129 | #download clinical and survival datasets 130 | cli_query = Xena_cohort %>% 131 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 132 | XenaGenerate() %>% # generate a XenaHub object 133 | XenaQuery() %>% 134 | XenaDownload() 135 | 136 | cli = XenaPrepare(cli_query) 137 | survival_data = cli[[2]] 138 | clinical_data = cli[[1]] 139 | clinical_data$sample = clinical_data$sampleID 140 | 141 | if ("xena_sample" %in% colnames(survival_data)) { 142 | survival_data$sample = survival_data$xena_sample 143 | survival_data$xena_sample = NULL 144 | } 145 | 146 | clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ] 147 | survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ] 148 | 149 | ge <- XenaData %>% 150 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 151 | XenaScan("HNSC") %>% 152 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 153 | } 154 | 155 | gene_info_km <- data.frame() 156 | 157 | samples <- fetch_dataset_samples( unique(ge$XenaHosts), ge$XenaDatasets, limit = NULL) 158 | #fetch gene expression data 159 | gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 160 | dataset = ge$XenaDatasets, 161 | samples = samples, 162 | use_probeMap = F, 163 | check = FALSE, 164 | time_limit = 10000) 165 | 166 | gene_info_km <- as.data.frame((gene_info_km)) 167 | 168 | 169 | #calculate gene sets 170 | 171 | rankData <- rankGenes(gene_info_km) 172 | 173 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE) 174 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE) 175 | 176 | #calculate gene sets 177 | names(scoredf_core) <- c("core","core_dispersion") 178 | names(scoredf_edge) <- c("edge","edge_dispersion") 179 | 180 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge) 181 | 182 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL)) 183 | 184 | merged_data = survival_data %>% 185 | left_join(scores_df_singshot, by = "sample") %>% 186 | dplyr::select(sample,names(geneset_of_interest), OS.time, OS) %>% 187 | left_join(clinical_data, by = "sample") %>% 188 | dplyr::select(sample,names(geneset_of_interest), OS.time, OS,sample_type) %>% 189 | dplyr::rename(time = OS.time, 190 | status = OS) 191 | 192 | if (indication != "LAML") { 193 | merged_data = merged_data[merged_data$sample_type == "Primary Tumor",] 194 | } 195 | 196 | merged_data <- merged_data[complete.cases(merged_data), ] 197 | 198 | #identify cutpoint using minprop of 0.1 199 | cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1) 200 | res.cat <- surv_categorize(cut) 201 | 202 | for (gene in names(geneset_of_interest)) { 203 | 204 | res.cat$binary = res.cat[[gene]] 205 | 206 | row = df[df$gene == gene & df$indication == indication,] 207 | rownumber = row.names(df[df$gene == gene & df$indication == indication,]) 208 | 209 | res.cat$binary <- factor(res.cat$binary, levels = c("high","low")) 210 | 211 | fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat) 212 | 213 | HR <- round(exp(coef(fit.coxph)), 2)[[1]] 214 | 215 | if (is.na(HR)) { 216 | row$hazard_ratio <- 1 217 | row$worse_outcome = "n/a" 218 | } else { 219 | if (coef(fit.coxph) > 0) { 220 | row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]] 221 | row$worse_outcome = "Low" 222 | } else { 223 | row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]] 224 | row$worse_outcome = "High" 225 | } 226 | } 227 | 228 | #get p val 229 | 230 | p_value = summary(fit.coxph)$waldtest[[3]] 231 | 232 | row$p_val = p_value 233 | 234 | df[rownumber,] = row 235 | } 236 | 237 | } 238 | 239 | df$sig = 1 240 | df[is.na(df)] = 1 241 | 242 | 243 | for (data_row in rownames(df)) { 244 | row = df[data_row,] 245 | if(row$p_val > 0.05) { 246 | row$sig = "ns" 247 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 248 | row$hazard_ratio = 1 249 | } 250 | } else { 251 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 252 | row$sig = row$worse_outcome 253 | row$hazard_ratio = 1 254 | } else { 255 | row$sig = row$worse_outcome 256 | } 257 | } 258 | df[data_row,] = row 259 | } 260 | 261 | 262 | dir.create("heatmap_tcga") 263 | setwd("heatmap_tcga") 264 | 265 | 266 | png("oss_survival_dotplot.png", units="in", width=3, height=5, res=300, type = "cairo") 267 | ggplot(df, aes(x = geneset, y = indication, color = sig)) + 268 | geom_point(aes(size = hazard_ratio, fill = sig)) + 269 | scale_colour_manual(values = c("High" = "#d73027", 270 | "Low" = "#4575b4","ns" ="grey")) + 271 | labs(x="geneset", y="indication", 272 | title="Decreased OS") + 273 | theme_minimal() 274 | dev.off() 275 | setwd("..") 276 | 277 | ### 278 | 279 | library(UCSCXenaTools) 280 | library(dplyr) 281 | library(survival) 282 | library(survminer) 283 | library(data.table) 284 | library(ggplot2) 285 | library(tibble) 286 | library(stringr) 287 | library(plyr) 288 | library(singscore) 289 | library(pROC) 290 | `%!in%` = Negate(`%in%`) 291 | 292 | 293 | replacement_dictionary <- c( 294 | "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9", 295 | "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2", 296 | 'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 297 | 'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72') 298 | 299 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X") 300 | 301 | 302 | core_genes <- readRDS("core_genes.RDS") 303 | 304 | for (gene in core_genes) { 305 | if (gene %in% names(replacement_dictionary)) { 306 | core_genes <- core_genes[core_genes != gene] 307 | core_genes <- append(core_genes, replacement_dictionary[[gene]]) 308 | } 309 | } 310 | 311 | core_genes <- core_genes[core_genes %!in% genes_to_remove] 312 | 313 | edge_genes <- readRDS("edge_genes.RDS") 314 | 315 | for (gene in edge_genes) { 316 | if (gene %in% names(replacement_dictionary)) { 317 | edge_genes <- edge_genes[edge_genes != gene] 318 | edge_genes <- append(edge_genes, replacement_dictionary[[gene]]) 319 | } 320 | } 321 | 322 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove] 323 | 324 | geneset_of_interest <- list(core_genes,edge_genes) 325 | names(geneset_of_interest) <- c("core","edge") 326 | 327 | all_genes <- union(core_genes,edge_genes) 328 | 329 | 330 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC", 331 | "BLCA","STAD","THCA","PRAD", 332 | "CESC","GBMLGG","KIRP","MESO", 333 | "KIRC","OV","SARC","SKCM") 334 | 335 | #make a df row 1 is gene, row 2 indication, row 3 p value, row 4 low or high 336 | df <- data.frame (geneset = rep(names(geneset_of_interest),length(cancer_indications)), 337 | indication = rep(cancer_indications,each = length(geneset_of_interest)), 338 | p_val = 1,worse_outcome = 0, hazard_ratio = 0 339 | ) 340 | 341 | for (indication in cancer_indications) { 342 | #initialize object 343 | if (indication != "OSCC") { 344 | Xena_cohort = XenaData %>% 345 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 346 | XenaScan(indication) # select cohort 347 | 348 | #download clinical and survival datasets 349 | cli_query = Xena_cohort %>% 350 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 351 | XenaGenerate() %>% # generate a XenaHub object 352 | XenaQuery() %>% 353 | XenaDownload() 354 | 355 | if (indication != "OV") { 356 | cli = XenaPrepare(cli_query) 357 | survival_data = cli[[2]] 358 | clinical_data = cli[[1]] 359 | clinical_data$sample = clinical_data$sampleID 360 | } else { 361 | cli = XenaPrepare(cli_query) 362 | survival_data = cli[[3]] 363 | clinical_data = cli[[1]] 364 | clinical_data$sample = clinical_data$sampleID 365 | } 366 | 367 | if ("xena_sample" %in% colnames(survival_data)) { 368 | survival_data$sample = survival_data$xena_sample 369 | survival_data$xena_sample = NULL 370 | } 371 | 372 | if (indication == "OV") { 373 | ge <- XenaData %>% 374 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 375 | XenaScan(indication) %>% 376 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 377 | } else { 378 | if (indication == "STAD") { 379 | ge <- XenaData %>% 380 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 381 | XenaScan(indication) %>% 382 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 383 | ge <- ge[2,] 384 | } else { 385 | ge <- XenaData %>% 386 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 387 | XenaScan(indication) %>% 388 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 389 | } 390 | } 391 | 392 | if (indication == "SARC") { 393 | ge = ge[1,] 394 | } } else { 395 | sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv") 396 | tcga_oscc_codes <- sample_ids$TCGA_codes 397 | tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = "")) 398 | 399 | #initialize object 400 | Xena_cohort = XenaData %>% 401 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 402 | XenaScan("HNSC") # select cohort 403 | 404 | #download clinical and survival datasets 405 | cli_query = Xena_cohort %>% 406 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 407 | XenaGenerate() %>% # generate a XenaHub object 408 | XenaQuery() %>% 409 | XenaDownload() 410 | 411 | cli = XenaPrepare(cli_query) 412 | survival_data = cli[[2]] 413 | clinical_data = cli[[1]] 414 | clinical_data$sample = clinical_data$sampleID 415 | 416 | if ("xena_sample" %in% colnames(survival_data)) { 417 | survival_data$sample = survival_data$xena_sample 418 | survival_data$xena_sample = NULL 419 | } 420 | 421 | clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ] 422 | survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ] 423 | 424 | ge <- XenaData %>% 425 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 426 | XenaScan("HNSC") %>% 427 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 428 | } 429 | 430 | gene_info_km <- data.frame() 431 | 432 | samples <- fetch_dataset_samples( unique(ge$XenaHosts), ge$XenaDatasets, limit = NULL) 433 | 434 | gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 435 | dataset = ge$XenaDatasets, 436 | samples = samples, 437 | use_probeMap = F, 438 | check = FALSE, 439 | time_limit = 10000) 440 | 441 | gene_info_km <- as.data.frame((gene_info_km)) 442 | 443 | 444 | #calculate gene sets 445 | 446 | rankData <- rankGenes(gene_info_km) 447 | 448 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE) 449 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE) 450 | 451 | #calculate gene sets 452 | names(scoredf_core) <- c("core","core_dispersion") 453 | names(scoredf_edge) <- c("edge","edge_dispersion") 454 | 455 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge) 456 | 457 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL)) 458 | 459 | merged_data = survival_data %>% 460 | left_join(scores_df_singshot, by = "sample") %>% 461 | dplyr::select(sample,names(geneset_of_interest), DSS.time, DSS) %>% 462 | left_join(clinical_data, by = "sample") %>% 463 | dplyr::select(sample,names(geneset_of_interest), DSS.time, DSS,sample_type) %>% 464 | dplyr::rename(time = DSS.time, 465 | status = DSS) 466 | 467 | if (indication != "LAML") { 468 | merged_data = merged_data[merged_data$sample_type == "Primary Tumor",] 469 | } 470 | 471 | merged_data <- merged_data[complete.cases(merged_data), ] 472 | 473 | cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1) 474 | res.cat <- surv_categorize(cut) 475 | 476 | ##switch stuff here more 477 | for (gene in names(geneset_of_interest)) { 478 | 479 | res.cat$binary = res.cat[[gene]] 480 | 481 | row = df[df$gene == gene & df$indication == indication,] 482 | rownumber = row.names(df[df$gene == gene & df$indication == indication,]) 483 | 484 | #try high ref 485 | res.cat$binary <- factor(res.cat$binary, levels = c("high","low")) 486 | 487 | fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat) 488 | 489 | HR <- round(exp(coef(fit.coxph)), 2)[[1]] 490 | 491 | if (is.na(HR)) { 492 | row$hazard_ratio <- 1 493 | row$worse_outcome = "n/a" 494 | } else { 495 | if (coef(fit.coxph) > 0) { 496 | row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]] 497 | row$worse_outcome = "Low" 498 | } else { 499 | row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]] 500 | row$worse_outcome = "High" 501 | } 502 | } 503 | 504 | #get p val 505 | 506 | p_value = summary(fit.coxph)$waldtest[[3]] 507 | 508 | row$p_val = p_value 509 | 510 | df[rownumber,] = row 511 | } 512 | 513 | } 514 | 515 | df$sig = 1 516 | df[is.na(df)] = 1 517 | 518 | 519 | for (data_row in rownames(df)) { 520 | row = df[data_row,] 521 | if(row$p_val > 0.05) { 522 | row$sig = "ns" 523 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 524 | row$hazard_ratio = 1 525 | } 526 | } else { 527 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 528 | row$sig = row$worse_outcome 529 | row$hazard_ratio = 1 530 | } else { 531 | row$sig = row$worse_outcome 532 | } 533 | } 534 | df[data_row,] = row 535 | } 536 | 537 | 538 | dir.create("heatmap_tcga") 539 | setwd("heatmap_tcga") 540 | 541 | 542 | png("survival_dotplot_DSS.png", units="in", width=3, height=5, res=300, type = "cairo") 543 | ggplot(df, aes(x = geneset, y = indication, color = sig)) + 544 | geom_point(aes(size = hazard_ratio, fill = sig)) + 545 | scale_colour_manual(values = c("High" = "#d73027", 546 | "Low" = "#4575b4","ns" ="grey")) + 547 | labs(x="geneset", y="indication", 548 | title="Decreased DSS") + 549 | theme_minimal() 550 | dev.off() 551 | setwd("..") 552 | ### 553 | 554 | library(UCSCXenaTools) 555 | library(dplyr) 556 | library(survival) 557 | library(survminer) 558 | library(data.table) 559 | library(ggplot2) 560 | library(tibble) 561 | library(stringr) 562 | library(plyr) 563 | library(singscore) 564 | library(pROC) 565 | `%!in%` = Negate(`%in%`) 566 | 567 | 568 | replacement_dictionary <- c( 569 | "ADIRF" = "C10orf116", "CTSV" = 'CTSL2',"ERO1A" = "ERO1L","IL36G" = "IL1F9", 570 | "IL36RN" = "IL1F5","ATP5F1A" = "ATP5A1","ATP5F1B" = "ATP5B", "ATP5MF" = "ATP5J2", 571 | 'DDX39B' = "BAT1","HNRNPDL" = 'HNRPDL',"MZT2B" = "FAM128B","PKM" = 'PKM2', 572 | 'RACK1' = 'GNB2L1', 'SNHG29' = 'NCRNA00188','SRSF2' = "SFRS2" ,"TMA7" = 'CCDC72') 573 | 574 | genes_to_remove <- c("DEFB4B","PRR9","RNF223","SLURP2","MIR205HG","TMSB4X") 575 | 576 | 577 | core_genes <- readRDS("core_genes.RDS") 578 | 579 | for (gene in core_genes) { 580 | if (gene %in% names(replacement_dictionary)) { 581 | core_genes <- core_genes[core_genes != gene] 582 | core_genes <- append(core_genes, replacement_dictionary[[gene]]) 583 | } 584 | } 585 | 586 | core_genes <- core_genes[core_genes %!in% genes_to_remove] 587 | 588 | edge_genes <- readRDS("edge_genes.RDS") 589 | 590 | for (gene in edge_genes) { 591 | if (gene %in% names(replacement_dictionary)) { 592 | edge_genes <- edge_genes[edge_genes != gene] 593 | edge_genes <- append(edge_genes, replacement_dictionary[[gene]]) 594 | } 595 | } 596 | 597 | edge_genes <- edge_genes[edge_genes %!in% genes_to_remove] 598 | 599 | geneset_of_interest <- list(core_genes,edge_genes) 600 | names(geneset_of_interest) <- c("core","edge") 601 | 602 | all_genes <- union(core_genes,edge_genes) 603 | 604 | 605 | cancer_indications <- c("BRCA","LUAD","LUSC","LIHC","OSCC","COADREAD","PAAD","ACC", 606 | "BLCA","STAD","THCA","PRAD", 607 | "CESC","GBMLGG","KIRP","MESO", 608 | "KIRC","OV","SARC","SKCM") 609 | 610 | #make a df row 1 is gene, row 2 indication, row 3 p value, row 4 low or high 611 | df <- data.frame (geneset = rep(names(geneset_of_interest),length(cancer_indications)), 612 | indication = rep(cancer_indications,each = length(geneset_of_interest)), 613 | p_val = 1,worse_outcome = 0, hazard_ratio = 0 614 | ) 615 | 616 | for (indication in cancer_indications) { 617 | #initialize object 618 | if (indication != "OSCC") { 619 | Xena_cohort = XenaData %>% 620 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 621 | XenaScan(indication) # select cohort 622 | 623 | #download clinical and survival datasets 624 | cli_query = Xena_cohort %>% 625 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 626 | XenaGenerate() %>% # generate a XenaHub object 627 | XenaQuery() %>% 628 | XenaDownload() 629 | 630 | if (indication != "OV") { 631 | cli = XenaPrepare(cli_query) 632 | survival_data = cli[[2]] 633 | clinical_data = cli[[1]] 634 | clinical_data$sample = clinical_data$sampleID 635 | } else { 636 | cli = XenaPrepare(cli_query) 637 | survival_data = cli[[3]] 638 | clinical_data = cli[[1]] 639 | clinical_data$sample = clinical_data$sampleID 640 | } 641 | 642 | if ("xena_sample" %in% colnames(survival_data)) { 643 | survival_data$sample = survival_data$xena_sample 644 | survival_data$xena_sample = NULL 645 | } 646 | 647 | if (indication == "OV") { 648 | ge <- XenaData %>% 649 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 650 | XenaScan(indication) %>% 651 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 652 | } else { 653 | if (indication == "STAD") { 654 | ge <- XenaData %>% 655 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 656 | XenaScan(indication) %>% 657 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq UNC", Unit == "log2(norm_count+1)") 658 | ge <- ge[2,] 659 | } else { 660 | ge <- XenaData %>% 661 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 662 | XenaScan(indication) %>% 663 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 664 | } 665 | } 666 | 667 | if (indication == "SARC") { 668 | ge = ge[1,] 669 | } } else { 670 | sample_ids <- read.csv(file = "TCGA_OSCC_HPV_neg_CODES.csv") 671 | tcga_oscc_codes <- sample_ids$TCGA_codes 672 | tcga_oscc_sampleID <- lapply(tcga_oscc_codes, function(x) paste(x, "-01", sep = "")) 673 | 674 | #initialize object 675 | Xena_cohort = XenaData %>% 676 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 677 | XenaScan("HNSC") # select cohort 678 | 679 | #download clinical and survival datasets 680 | cli_query = Xena_cohort %>% 681 | filter(DataSubtype == "phenotype") %>% # select clinical dataset 682 | XenaGenerate() %>% # generate a XenaHub object 683 | XenaQuery() %>% 684 | XenaDownload() 685 | 686 | cli = XenaPrepare(cli_query) 687 | survival_data = cli[[2]] 688 | clinical_data = cli[[1]] 689 | clinical_data$sample = clinical_data$sampleID 690 | 691 | if ("xena_sample" %in% colnames(survival_data)) { 692 | survival_data$sample = survival_data$xena_sample 693 | survival_data$xena_sample = NULL 694 | } 695 | 696 | clinical_data <- clinical_data[clinical_data$sampleID %in% tcga_oscc_sampleID, ] 697 | survival_data <- survival_data[survival_data$sample %in% tcga_oscc_sampleID, ] 698 | 699 | ge <- XenaData %>% 700 | filter(XenaHostNames == "tcgaHub") %>% # select TCGA Hub 701 | XenaScan("HNSC") %>% 702 | filter(DataSubtype == "gene expression RNAseq", Label == "IlluminaHiSeq", Unit == "log2(norm_count+1)") 703 | } 704 | 705 | gene_info_km <- data.frame() 706 | 707 | samples <- fetch_dataset_samples( unique(ge$XenaHosts), ge$XenaDatasets, limit = NULL) 708 | 709 | gene_info_km <- fetch_dense_values(host = unique(ge$XenaHosts), 710 | dataset = ge$XenaDatasets, 711 | samples = samples, 712 | use_probeMap = F, 713 | check = FALSE, 714 | time_limit = 10000) 715 | 716 | gene_info_km <- as.data.frame((gene_info_km)) 717 | 718 | 719 | #calculate gene sets 720 | 721 | rankData <- rankGenes(gene_info_km) 722 | 723 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = FALSE) 724 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = FALSE) 725 | 726 | #calculate gene sets 727 | names(scoredf_core) <- c("core","core_dispersion") 728 | names(scoredf_edge) <- c("edge","edge_dispersion") 729 | 730 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge) 731 | 732 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL)) 733 | 734 | merged_data = survival_data %>% 735 | left_join(scores_df_singshot, by = "sample") %>% 736 | dplyr::select(sample,names(geneset_of_interest), PFI.time, PFI) %>% 737 | left_join(clinical_data, by = "sample") %>% 738 | dplyr::select(sample,names(geneset_of_interest), PFI.time, PFI,sample_type) %>% 739 | dplyr::rename(time = PFI.time, 740 | status = PFI) 741 | 742 | if (indication != "LAML") { 743 | merged_data = merged_data[merged_data$sample_type == "Primary Tumor",] 744 | } 745 | 746 | merged_data <- merged_data[complete.cases(merged_data), ] 747 | 748 | cut = surv_cutpoint(data = merged_data, time = "time", event = "status", variables = names(geneset_of_interest), minprop = 0.1) 749 | res.cat <- surv_categorize(cut) 750 | 751 | ##switch stuff here more 752 | for (gene in names(geneset_of_interest)) { 753 | 754 | res.cat$binary = res.cat[[gene]] 755 | 756 | row = df[df$gene == gene & df$indication == indication,] 757 | rownumber = row.names(df[df$gene == gene & df$indication == indication,]) 758 | 759 | #try high ref 760 | res.cat$binary <- factor(res.cat$binary, levels = c("high","low")) 761 | 762 | fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat) 763 | 764 | HR <- round(exp(coef(fit.coxph)), 2)[[1]] 765 | 766 | if (is.na(HR)) { 767 | row$hazard_ratio <- 1 768 | row$worse_outcome = "n/a" 769 | } else { 770 | if (coef(fit.coxph) > 0) { 771 | row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]] 772 | row$worse_outcome = "Low" 773 | } else { 774 | row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]] 775 | row$worse_outcome = "High" 776 | } 777 | } 778 | 779 | #get p val 780 | 781 | p_value = summary(fit.coxph)$waldtest[[3]] 782 | 783 | row$p_val = p_value 784 | 785 | df[rownumber,] = row 786 | } 787 | 788 | } 789 | 790 | df$sig = 1 791 | df[is.na(df)] = 1 792 | 793 | 794 | for (data_row in rownames(df)) { 795 | row = df[data_row,] 796 | if(row$p_val > 0.05) { 797 | row$sig = "ns" 798 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 799 | row$hazard_ratio = 1 800 | } 801 | } else { 802 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 803 | row$sig = row$worse_outcome 804 | row$hazard_ratio = 1 805 | } else { 806 | row$sig = row$worse_outcome 807 | } 808 | } 809 | df[data_row,] = row 810 | } 811 | 812 | 813 | dir.create("heatmap_tcga") 814 | setwd("heatmap_tcga") 815 | 816 | 817 | png("survival_dotplot_PFI.png", units="in", width=3, height=5, res=300, type = "cairo") 818 | ggplot(df, aes(x = geneset, y = indication, color = sig)) + 819 | geom_point(aes(size = hazard_ratio, fill = sig)) + 820 | scale_colour_manual(values = c("High" = "#d73027", 821 | "Low" = "#4575b4","ns" ="grey")) + 822 | labs(x="geneset", y="indication", 823 | title="Decreased PFI") + 824 | theme_minimal() 825 | dev.off() 826 | setwd("..") 827 | -------------------------------------------------------------------------------- /Figure 5/Validation/GSE41613_survival.R: -------------------------------------------------------------------------------- 1 | library(GEOquery) 2 | library(tidyr) 3 | GSE41613 <- GEOquery::getGEO(GEO = "GSE41613") 4 | 5 | #process this dataset using limma package 6 | 7 | GSE41613_meta <- GSE41613$GSE41613_series_matrix.txt.gz@phenoData@data 8 | 9 | GSE41613_survival <- GSE41613_meta[,colnames(GSE41613_meta)%in% c("characteristics_ch1.4","characteristics_ch1.5")] 10 | names(GSE41613_survival) <- c("survival","fu_time") 11 | 12 | GSE41613_expr <- GSE41613$GSE41613_series_matrix.txt.gz@assayData$exprs 13 | GSE41613_ex_info <- GSE41613$GSE41613_series_matrix.txt.gz@featureData@data 14 | library(limma) 15 | 16 | # perform background correction on fluorescent intensities 17 | #GSE41613_expr <- backgroundCorrect(GSE41613_expr, method = 'normexp',offset=0) 18 | GSE41613_expr <- avereps(GSE41613_expr,ID = GSE41613_ex_info$`Gene Symbol`) 19 | rownames(GSE41613_expr) <- lapply(rownames(GSE41613_expr), function(x) stringr::str_split(x,"///")[[1]][1]) 20 | rownames(GSE41613_expr) <- lapply(rownames(GSE41613_expr), function(x) gsub(" ", "", x)) 21 | 22 | 23 | GSE41613_survival$time <- lapply(GSE41613_survival$fu_time, function(x) gsub("fu time:", "", x) ) 24 | 25 | GSE41613_survival$time <- as.numeric(GSE41613_survival$time) 26 | 27 | GSE41613_survival$os <- ifelse(GSE41613_survival$survival == "vital: Alive", 0,1) 28 | GSE41613_survival$dss <- ifelse(GSE41613_survival$survival == "vital: Alive", 0,ifelse( 29 | GSE41613_survival$survival == "vital: Dead-non OC",NA,1 30 | )) 31 | 32 | 33 | ### 34 | library(singscore) 35 | indication <- "HNSC" 36 | 37 | core_genes <- readRDS("core_genes.RDS") 38 | core_genes <- c(core_genes) 39 | 40 | edge_genes <- readRDS("edge_genes.RDS") 41 | edge_genes <- c(edge_genes) 42 | 43 | geneset_of_interest <- list(core_genes,edge_genes) 44 | names(geneset_of_interest) <- c("core","edge") 45 | 46 | all_genes <- union(core_genes,edge_genes) 47 | 48 | rankData <- rankGenes(as.data.frame(GSE41613_expr)) 49 | 50 | #add edge and croe scores 51 | scoredf_core <- simpleScore(rankData, upSet = core_genes , knownDirection = F) 52 | scoredf_edge <- simpleScore(rankData, upSet = edge_genes , knownDirection = F) 53 | 54 | #calculate gene sets 55 | names(scoredf_core) <- c("core","core_dispersion") 56 | names(scoredf_edge) <- c("edge","edge_dispersion") 57 | 58 | scores_df_singshot <- cbind(scoredf_core,scoredf_edge) 59 | 60 | scores_df_singshot <- cbind(sample = rownames(scores_df_singshot), data.frame(scores_df_singshot, row.names=NULL)) 61 | rownames(scores_df_singshot) = scores_df_singshot$sample 62 | 63 | merged_data = scores_df_singshot %>% 64 | merge(GSE41613_survival, by = 0) %>% 65 | dplyr::select(Row.names,names(geneset_of_interest), time, os,dss) 66 | 67 | #survival testing 68 | library(survminer) 69 | library(survival) 70 | df <- data.frame (geneset = rep(names(geneset_of_interest),2), 71 | indication = rep(c("os","dss"),each = 2), 72 | p_val = 1,worse_outcome = 0, hazard_ratio = 0 73 | ) 74 | 75 | for (survival_method in c("os","dss")) { 76 | ##switch stuff here more 77 | for (gene in names(geneset_of_interest)) { 78 | 79 | cut = surv_cutpoint(data = merged_data, time = "time", event = survival_method, variables = names(geneset_of_interest), 80 | minprop = 0.1) 81 | res.cat <- surv_categorize(cut) 82 | 83 | res.cat$binary = res.cat[[gene]] 84 | 85 | row = df[df$gene == gene & df$indication == survival_method,] 86 | rownumber = row.names(df[df$gene == gene & df$indication == survival_method,]) 87 | 88 | res.cat$binary <- factor(res.cat$binary, levels = c("high","low")) 89 | res.cat$status <- res.cat[[survival_method]] 90 | fit.coxph <- coxph(Surv(time, status) ~ binary, data = res.cat) 91 | 92 | HR <- round(exp(coef(fit.coxph)), 2)[[1]] 93 | 94 | if (is.na(HR)) { 95 | row$hazard_ratio <- 1 96 | row$worse_outcome = "n/a" 97 | } else { 98 | if (coef(fit.coxph) > 0) { 99 | row$hazard_ratio <- round(exp(coef(fit.coxph)), 2)[[1]] 100 | row$worse_outcome = "Low" 101 | } else { 102 | row$hazard_ratio <- 1/round(exp(coef(fit.coxph)), 2)[[1]] 103 | row$worse_outcome = "High" 104 | } 105 | } 106 | 107 | #get p val 108 | 109 | p_value = summary(fit.coxph)$waldtest[[3]] 110 | 111 | row$p_val = p_value 112 | 113 | df[rownumber,] = row 114 | 115 | } 116 | } 117 | 118 | 119 | df$sig = 1 120 | df[is.na(df)] = 1 121 | 122 | 123 | for (data_row in rownames(df)) { 124 | row = df[data_row,] 125 | if(row$p_val > 0.05) { 126 | row$sig = "ns" 127 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 128 | row$hazard_ratio = 1 129 | } 130 | } else { 131 | if(row$hazard_ratio == "Inf" || row$hazard_ratio > 100) { 132 | row$sig = row$worse_outcome 133 | row$hazard_ratio = 1 134 | } else { 135 | row$sig = row$worse_outcome 136 | } 137 | } 138 | df[data_row,] = row 139 | } 140 | 141 | dir.create("heatmap_GSE41613") 142 | setwd("heatmap_GSE41613") 143 | 144 | png("survival_dotplot.png", units="cm", width=9, height=7, res=300, type = "cairo") 145 | ggplot(df, aes(x = geneset, y = indication, color = sig)) + 146 | geom_point(aes(size = hazard_ratio)) + 147 | scale_size(range = c(7, 10))+ 148 | scale_colour_manual(values = c("High" = "#d73027", 149 | "Low" = "#4575b4", 150 | "ns" = "lightgray")) + 151 | labs(x="geneset", y="survival type", 152 | title="GSE41613") + 153 | theme_minimal() 154 | dev.off() 155 | setwd("..") 156 | 157 | 158 | -------------------------------------------------------------------------------- /Figure 6/Dynamo/dynamo.R: -------------------------------------------------------------------------------- 1 | scores_df <- read.csv("all_200_rescaled_vf.csv") 2 | library(ggplot2) 3 | library(ggpubr) 4 | library(dplyr) 5 | library(tidyr) 6 | scores_df <- scores_df %>% mutate(rank=dense_rank(desc(-AUC_mean))) 7 | 8 | scores_df$median_group <- ifelse(scores_df$AUC_mean > median(scores_df$AUC_mean), "High", "Low") 9 | 10 | PRISM <- readRDS("./targeted_survival/files/PSet_PRISM.rds") 11 | PRISM <- updateObject(PRISM) 12 | treatment <- PRISM@treatment[,colnames(PRISM@treatment) %in% c("moa","treatmentid", "cid", "smiles", "phase","indication")] 13 | 14 | find_closest_match <- function(drug, treatment_table, threshold = 3) { 15 | treatmentid <- treatment_table$treatmentid 16 | distances <- adist(drug, treatmentid) 17 | 18 | closest_index <- which.min(distances) 19 | min_distance <- min(distances) 20 | 21 | if (min_distance <= threshold) { 22 | return(treatment_table[closest_index, ]) 23 | } else { 24 | return(data.frame(moa = NA, treatmentid = NA, cid = NA, smiles = NA, phase = NA, indication = NA)) 25 | } 26 | } 27 | 28 | result <- scores_df %>% 29 | rowwise() %>% 30 | mutate(closest_treatment = list(find_closest_match(drugs, treatment))) %>% 31 | unnest(cols = closest_treatment) %>% 32 | select(-treatmentid) 33 | 34 | ##edge outgoing 35 | png("dyn_plots/edge_outgoing_bar.png", units="in", width=4, height=6, res=300, type = "cairo") 36 | ggplot(scores_df, aes(x = median_group, y = edge_outgoing, fill = median_group)) + 37 | geom_boxplot(alpha = 0.6) + 38 | geom_jitter(width = 0.1, alpha = 0.6, size = 2) + 39 | stat_compare_means( 40 | method = "wilcox.test", 41 | label = "p.format", 42 | hide.ns = TRUE, 43 | bracket.size = 0.5, 44 | label.y = max(scores_df$edge_outgoing) * 0.95, 45 | label.x = 1.5 46 | ) + 47 | labs( 48 | x = "AAC Score", 49 | y = "Edge Outgoing Vector Field Score" 50 | ) + 51 | theme_bw() + 52 | theme( 53 | text = element_text(face = "bold"), 54 | axis.text = element_text(face = "bold"), 55 | legend.position = "none" 56 | ) + 57 | scale_fill_manual(values = c("Low" = "#1b9e77", "High" = "#7570b3")) + 58 | scale_x_discrete(labels = c("High (n = 35)","Low (n = 35)")) 59 | dev.off() 60 | 61 | 62 | ##core incoming 63 | png("dyn_plots/core_incoming_bar.png", units="in", width=4, height=6, res=300, type = "cairo") 64 | ggplot(scores_df, aes(x = median_group, y = core_incoming, fill = median_group)) + 65 | geom_boxplot(alpha = 0.6) + 66 | geom_jitter(width = 0.1, alpha = 0.6, size = 2) + 67 | stat_compare_means( 68 | method = "wilcox.test", 69 | label = "p.format", 70 | hide.ns = TRUE, 71 | bracket.size = 0.5, 72 | label.y = max(scores_df$edge_outgoing) * 0.95, 73 | label.x = 1.5 74 | ) + 75 | labs( 76 | x = "AAC Score", 77 | y = "Core Incoming Vector Field Score" 78 | ) + 79 | theme_bw() + 80 | theme( 81 | text = element_text(face = "bold"), 82 | axis.text = element_text(face = "bold"), 83 | legend.position = "none" 84 | ) + 85 | scale_fill_manual(values = c("Low" = "#1b9e77", "High" = "#7570b3")) + 86 | scale_x_discrete(labels = c("High (n = 35)","Low (n = 35)")) 87 | dev.off() 88 | 89 | ###drug class analysis 90 | 91 | #filter for only classes with multiple drugs 92 | filtered_result <- result %>% 93 | group_by(moa) %>% 94 | filter(n() > 1) %>% 95 | ungroup() 96 | 97 | filtered_result <- filtered_result[!is.na(filtered_result$moa),] 98 | 99 | png("dyn_plots/edge_outgoing_drug_class.png", units="in", width=8, height=6, res=300, type = "cairo") 100 | ggplot(filtered_result, aes(x = moa, y = edge_outgoing, fill = moa)) + 101 | geom_boxplot(alpha = 0.6) + 102 | geom_jitter(width = 0.1, alpha = 0.6, size = 2) + 103 | stat_compare_means( 104 | method = "kruskal.test", 105 | label = "p.format", 106 | hide.ns = TRUE, 107 | bracket.size = 0.5, 108 | label.y = max(scores_df$edge_outgoing) * 0.95, 109 | label.x = 1.5 110 | ) + 111 | labs( 112 | x = "Drug MOA", 113 | y = "Edge Outgoing Vector Field Score" 114 | ) + 115 | theme_bw() + 116 | theme( 117 | text = element_text(face = "bold"), 118 | axis.text = element_text(face = "bold"), 119 | axis.text.x = element_text(angle = 45, hjust = 1), 120 | legend.position = "none" 121 | ) 122 | dev.off() 123 | 124 | png("dyn_plots/core_incoming_drug_class.png", units="in", width=8, height=6, res=300, type = "cairo") 125 | 126 | ggplot(filtered_result, aes(x = moa, y = core_incoming, fill = moa)) + 127 | geom_boxplot(alpha = 0.6) + 128 | geom_jitter(width = 0.1, alpha = 0.6, size = 2) + 129 | stat_compare_means( 130 | method = "kruskal.test", 131 | label = "p.format", 132 | hide.ns = TRUE, 133 | bracket.size = 0.5, 134 | label.y = max(scores_df$core_incoming) * 0.95, 135 | label.x = 1.5 136 | ) + 137 | labs( 138 | x = "Drug MOA", 139 | y = "Core Incoming Vector Field Score" 140 | ) + 141 | theme_bw() + 142 | theme( 143 | text = element_text(face = "bold"), 144 | axis.text = element_text(face = "bold"), 145 | axis.text.x = element_text(angle = 45, hjust = 1), 146 | legend.position = "none" 147 | ) 148 | dev.off() 149 | 150 | write.csv(result, file = "Dyanmo_Vf_Pertrub.csv") 151 | -------------------------------------------------------------------------------- /Figure 6/PharmacoDB and DGIdb/phamacogx.R: -------------------------------------------------------------------------------- 1 | library(PharmacoGx) 2 | '%!in%' <- function(x,y)!('%in%'(x,y)) 3 | 4 | CCLE <- readRDS("files/CCLE.rds") 5 | CCLE <- updateObject(CCLE) 6 | 7 | CTRPv2 <- readRDS("files/PSet_CTRPv2.rds") 8 | CTRPv2 <- updateObject(CTRPv2) 9 | 10 | PRISM <- readRDS("files/PSet_PRISM.rds") 11 | PRISM <- updateObject(PRISM) 12 | 13 | GDSC2 <- readRDS("files/PSet_GDSC2020.rds") 14 | GDSC2 <- updateObject(GDSC2) 15 | 16 | gCSI <- readRDS("files/PSet_gCSI2019.rds") 17 | gCSI <- updateObject(gCSI) 18 | 19 | ##ccle 20 | CCLE_cell_line_id <- CCLE@sample[CCLE@sample$tissueid %in% c("Head and Neck") ,]$sampleid 21 | 22 | CTRPv2_cell_line_id <- CTRPv2@sample[CTRPv2@sample$tissueid %in% c("Head and Neck") ,]$sampleid 23 | 24 | PRISM_cell_line_id <- PRISM@sample[PRISM@sample$tissueid %in% c("Head and Neck") ,]$sampleid 25 | 26 | gCSI_cell_line_id <- gCSI@sample[gCSI@sample$tissueid %in% c("Head and Neck") ,]$sampleid 27 | 28 | GDSC2_cell_line_id <- GDSC2@sample[GDSC2@sample$tissueid %in% c("Head and Neck") ,]$sampleid 29 | 30 | 31 | #64 HNSC cell lines currently 32 | all_cell_lines <- union(union(union(union(CCLE_cell_line_id,gCSI_cell_line_id),GDSC2_cell_line_id),CTRPv2_cell_line_id),PRISM_cell_line_id) 33 | 34 | HNSC_cl_class <- read.csv(file = "HNSC_cl_class.csv") 35 | HNSC_cl_class_pos <- HNSC_cl_class[HNSC_cl_class$HPV.Status == "HPV-negative",]$HNSC.Cell.Line 36 | 37 | all_cell_lines = HNSC_cl_class_pos 38 | 39 | 40 | i = 1 41 | 42 | treat_response_studies <- c(CTRPv2,PRISM,GDSC2,CCLE,gCSI) 43 | 44 | CCLE_drugs<- unique(CCLE@treatmentResponse$info$treatmentid) 45 | CTRPv2_drugs<- unique(CTRPv2@treatmentResponse$info$treatmentid) 46 | PRISM_drugs<- unique(PRISM@treatmentResponse$info$treatmentid) 47 | gCSI_drugs <- unique(gCSI@treatmentResponse$info$treatmentid) 48 | GDSC2_drugs <- unique(GDSC2@treatmentResponse$info$treatmentid) 49 | 50 | all_drugs <- union(union(union(union(gCSI_drugs,GDSC2_drugs),CTRPv2_drugs),gCSI_drugs),PRISM_drugs) 51 | 52 | auc_df = data.frame(matrix(nrow = length(all_drugs), ncol = length(all_cell_lines))) 53 | colnames(auc_df) <- all_cell_lines 54 | rownames(auc_df) <- all_drugs 55 | 56 | for (treat_response in treat_response_studies){ 57 | 58 | response_to_treatment <- PharmacoGx::summarizeSensitivityProfiles(treat_response, 59 | sensitivity.measure='aac_recomputed', 60 | summary.stat="mean", 61 | verbose=FALSE) 62 | response_to_treatment <- as.data.frame(response_to_treatment) 63 | response_to_treatment <- response_to_treatment[,colnames(response_to_treatment) %in% all_cell_lines] 64 | response_to_treatment <- response_to_treatment[,colSums(is.na(response_to_treatment))% mutate(rank=dense_rank(mean)) 109 | 110 | write.csv(auc_df_reduced, file = "HNSC_hpvneg_drug_AAC.csv") 111 | 112 | # 113 | drug_ic50summarized <- read.csv("HNSC_hpvneg_drug_AAC.csv") 114 | drug_names <- drug_ic50summarized$X 115 | 116 | library(rDGIdb) 117 | library(httr) 118 | library(jsonlite) 119 | 120 | body <- list(drugs = paste(drug_names, collapse = ",")) 121 | 122 | url <- "https://dgidb.org/api/v2/interactions.json" 123 | body <- body[!sapply(body, is.null)] 124 | httr::verbose() 125 | postRequest <- POST(url = url, body = body, encode = 'multipart') 126 | 127 | text <- content(postRequest, as = "text") 128 | result <- fromJSON(text, simplifyVector = TRUE) 129 | 130 | #create a dataframe of #drugname, #IC50 value, #genes up- or down-regulated 131 | 132 | dgidb_ic50 <- data.frame(drugs = drug_ic50summarized$X, IC50_value <- drug_ic50summarized$mean, dgidb_drug_name = 0 , 133 | genes_up = "", 134 | genes_down = "") 135 | 136 | names(dgidb_ic50) <- c("drugs","AUC_mean","dgidb_drug_name","genes_up","genes_down") 137 | dgidb_ic50$genes_up <- as.list(dgidb_ic50$genes_up) 138 | dgidb_ic50$genes_down <- as.list(dgidb_ic50$genes_down) 139 | 140 | matched_terms <- result$matchedTerms 141 | matched_terms <- as.data.frame(matched_terms) 142 | typeof(matched_terms) 143 | 144 | #upregulated gene terms 145 | is_enriched <- c("activator", "agonist","inducer","partial agonist","positive modulator","potentiator","stimulator") 146 | 147 | #downregulated gene terms 148 | is_depeleted <- c("inhibitor", "antagonist","partial antagonist","blocker","inverse agonist","negative modulator","suppressor") 149 | 150 | truefunc <- function(value) { 151 | if(length(value > 0)) { 152 | return(all(value)) 153 | } else { 154 | return(FALSE) 155 | } 156 | } 157 | 158 | for (row in rownames(matched_terms)) { 159 | row_vals <- matched_terms[row,] 160 | 161 | dgidb_name <- row_vals$drugName 162 | interactions <- as.data.frame(row_vals$interactions) 163 | 164 | interactions_deplete <- lapply(interactions$interactionTypes, function(x) any(x %in% is_depeleted)) 165 | interactions_deplete <- lapply(interactions_deplete, truefunc) 166 | genes_deplete <- interactions[unlist(interactions_deplete),]$geneName 167 | 168 | interactions_enrich <- lapply(interactions$interactionTypes, function(x) any(x %in% is_enriched)) 169 | interactions_enrich <- lapply(interactions_enrich, truefunc) 170 | genes_enrich <- interactions[unlist(interactions_enrich),]$geneName 171 | 172 | dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$dgidb_drug_name <- dgidb_name 173 | if (length(genes_enrich) != 0 ) { 174 | dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$genes_up<- purrr::reduce(genes_enrich, paste) 175 | } 176 | if (length(genes_deplete) != 0 ) { 177 | dgidb_ic50[toupper(dgidb_ic50$drugs) == row_vals$searchTerm,]$genes_down <- purrr::reduce(genes_deplete, paste) 178 | } 179 | } 180 | 181 | dgidb_AUC_values <- dgidb_ic50 182 | 183 | dgidb_AUC_values$genes_up <- lapply(dgidb_AUC_values$genes_up, plyr::ldply) 184 | 185 | dgidb_AUC_values$genes_up <- lapply(dgidb_AUC_values$genes_up, `[[`, 1) 186 | 187 | dgidb_AUC_values$genes_down <- lapply(dgidb_AUC_values$genes_down, plyr::ldply) 188 | 189 | dgidb_AUC_values$genes_down <- lapply(dgidb_AUC_values$genes_down, `[[`, 1) 190 | 191 | dgidb_AUC_values$genes_up <- as.character(dgidb_AUC_values$genes_up) 192 | 193 | dgidb_AUC_values$genes_down <- as.character(dgidb_AUC_values$genes_down) 194 | 195 | #exact number and AAC mean of drugs identified may be slightly variable based on version of pharmacogx datasets used 196 | 197 | write.csv(dgidb_AUC_values, file = "dgidb_AUC_values.csv") 198 | 199 | -------------------------------------------------------------------------------- /Figure 6/Velocity/differential_splicing.R: -------------------------------------------------------------------------------- 1 | diff_kinetics <- read.csv(file = "diff_kinetics.csv") 2 | 3 | top_genes_diff_splice <- read.csv(file = "top_genes_diff_splice.csv") 4 | 5 | GOI <- head(top_genes_diff_splice,10000) 6 | 7 | GOI <- right_join(GOI,diff_kinetics, by = "X") 8 | 9 | GOI <- head(GOI,10000) 10 | 11 | write.csv(GOI, "diff_spliced_genes.csv") 12 | 13 | GOI <- GOI[GOI$fit_diff_kinetics %in% c("core","edge","transitory"),] 14 | 15 | GOI <- head(GOI,15) 16 | 17 | png("diff_spliced_genes.png", units="in", width=6, height=4, res=300, type = "cairo") 18 | ggplot(GOI, aes(x = factor(X, level = c(X)), 19 | y = fit_likelihood, fill = fit_diff_kinetics)) + 20 | geom_bar(stat="identity", color = "black", size = 0.25) + theme_minimal() + 21 | labs(x = "", y = "Splicing fit likelihood", fill = "Sample") + 22 | scale_fill_manual( values = c("core" = "#4DBBD5FF", "edge" = "#E64B35FF","transitory" = "#F9E076"))+ 23 | ggpubr::rotate_x_text()+ theme(axis.text = element_text(face="bold", size = 11), legend.text = element_text(face="bold"), 24 | axis.text.y = element_text(face="bold"), axis.title.y = element_text(face="bold")) + ggpubr::rotate_y_text() 25 | dev.off() 26 | -------------------------------------------------------------------------------- /Figure 6/Velocity/tissue_position_splitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pathlib import Path, PurePath\n", 10 | "import csv\n", 11 | "from h5py import File\n", 12 | "import os" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 11, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "barcode_dict = {1:\"sample_1\",2:\"sample_2\",3:\"sample_3\",4:\"sample_4\",5:\"sample_5\",\n", 22 | " 6:\"sample_6\",9:\"sample_7\",10:\"sample_8\",\n", 23 | " 11:\"sample_9\", 12:\"sample_10\",13:\"sample_11\",14:\"sample_12\"}" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 18, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "#split inital positions csv\n", 33 | "with open('coordinates.csv') as fin: \n", 34 | " csvin = csv.DictReader(fin)\n", 35 | " # Category -> open file lookup\n", 36 | " outputs = {}\n", 37 | " for row in csvin:\n", 38 | " cat = row['Barcodes'].split(\"-\")[1]\n", 39 | "# Open a new file and write the header\n", 40 | " if cat not in outputs:\n", 41 | " fout = open(f\"positions_scvelo.csv\", 'a')\n", 42 | " dw = csv.DictWriter(fout, fieldnames=csvin.fieldnames)\n", 43 | " dw.writeheader()\n", 44 | " outputs[cat] = fout, dw\n", 45 | " # Always write the row\n", 46 | " # switch this up etc.\n", 47 | " if barcode_dict[int(cat)] == \"sample_1\":\n", 48 | " row[\"Barcodes\"] = \"possorted_genome_bam_WJFV6:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 49 | " elif barcode_dict[int(cat)] == \"sample_2\":\n", 50 | " row[\"Barcodes\"] = \"possorted_genome_bam_IBODB:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 51 | " elif barcode_dict[int(cat)] == \"sample_3\":\n", 52 | " row[\"Barcodes\"] = \"possorted_genome_bam_IE2VP:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 53 | " elif barcode_dict[int(cat)] == \"sample_4\":\n", 54 | " row[\"Barcodes\"] = \"possorted_genome_bam_4RUJG:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 55 | " elif barcode_dict[int(cat)] == \"sample_5\":\n", 56 | " row[\"Barcodes\"] = \"possorted_genome_bam_NOEMP:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 57 | " elif barcode_dict[int(cat)] == \"sample_6\":\n", 58 | " row[\"Barcodes\"] = \"possorted_genome_bam_I4FGA:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 59 | " elif barcode_dict[int(cat)] == \"sample_7\":\n", 60 | " row[\"Barcodes\"] = \"possorted_genome_bam_30XDV:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 61 | " elif barcode_dict[int(cat)] == \"sample_8\":\n", 62 | " row[\"Barcodes\"] = \"possorted_genome_bam_U0GTQ:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 63 | " elif barcode_dict[int(cat)] == \"sample_9\":\n", 64 | " row[\"Barcodes\"] = \"possorted_genome_bam_W7L9N:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 65 | " elif barcode_dict[int(cat)] == \"sample_10\":\n", 66 | " row[\"Barcodes\"] = \"possorted_genome_bam_A0TQE:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 67 | " elif barcode_dict[int(cat)] == \"sample_11\":\n", 68 | " row[\"Barcodes\"] = \"possorted_genome_bam_EZTZD:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 69 | " elif barcode_dict[int(cat)] == \"sample_12\":\n", 70 | " row[\"Barcodes\"] = \"possorted_genome_bam_DD1CT:\" + row[\"Barcodes\"].split(\"-\")[0]+\"x\"\n", 71 | " outputs[cat][1].writerow(row)\n", 72 | " # Close all the files\n", 73 | " for fout, _ in outputs.values():\n", 74 | " fout.close()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 19, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "barcode_dict = {1:\"sample_1\",2:\"sample_2\",3:\"sample_3\",4:\"sample_4\",5:\"sample_5\",\n", 84 | " 6:\"sample_6\",9:\"sample_7\",10:\"sample_8\",\n", 85 | " 11:\"sample_9\", 12:\"sample_10\",13:\"sample_11\",14:\"sample_12\"}" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 20, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd\n", 95 | "import scvelo as scv\n", 96 | "import numpy as np" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 21, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "with open(\"meta_data.csv\") as fin: \n", 106 | " csvin = csv.DictReader(fin)\n", 107 | " # Category -> open file lookup\n", 108 | " outputs = {}\n", 109 | " for row in csvin:\n", 110 | " cat = row['Barcode'].split(\"-\")[1]\n", 111 | "# Open a new file and write the header\n", 112 | " if cat not in outputs:\n", 113 | " fout = open(\"scvelo_metadata_headers.csv\", 'a')\n", 114 | " dw = csv.DictWriter(fout, fieldnames=csvin.fieldnames)\n", 115 | " dw.writeheader()\n", 116 | " outputs[cat] = fout, dw\n", 117 | " # Always write the row\n", 118 | " # switch this up etc.\n", 119 | " if barcode_dict[int(cat)] == \"sample_1\":\n", 120 | " row[\"Barcode\"] = \"possorted_genome_bam_WJFV6:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 121 | " elif barcode_dict[int(cat)] == \"sample_2\":\n", 122 | " row[\"Barcode\"] = \"possorted_genome_bam_IBODB:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 123 | " elif barcode_dict[int(cat)] == \"sample_3\":\n", 124 | " row[\"Barcode\"] = \"possorted_genome_bam_IE2VP:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 125 | " elif barcode_dict[int(cat)] == \"sample_4\":\n", 126 | " row[\"Barcode\"] = \"possorted_genome_bam_4RUJG:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 127 | " elif barcode_dict[int(cat)] == \"sample_5\":\n", 128 | " row[\"Barcode\"] = \"possorted_genome_bam_NOEMP:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 129 | " elif barcode_dict[int(cat)] == \"sample_6\":\n", 130 | " row[\"Barcode\"] = \"possorted_genome_bam_I4FGA:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 131 | " elif barcode_dict[int(cat)] == \"sample_7\":\n", 132 | " row[\"Barcode\"] = \"possorted_genome_bam_30XDV:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 133 | " elif barcode_dict[int(cat)] == \"sample_8\":\n", 134 | " row[\"Barcode\"] = \"possorted_genome_bam_U0GTQ:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 135 | " elif barcode_dict[int(cat)] == \"sample_9\":\n", 136 | " row[\"Barcode\"] = \"possorted_genome_bam_W7L9N:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 137 | " elif barcode_dict[int(cat)] == \"sample_10\":\n", 138 | " row[\"Barcode\"] = \"possorted_genome_bam_A0TQE:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 139 | " elif barcode_dict[int(cat)] == \"sample_11\":\n", 140 | " row[\"Barcode\"] = \"possorted_genome_bam_EZTZD:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 141 | " elif barcode_dict[int(cat)] == \"sample_12\":\n", 142 | " row[\"Barcode\"] = \"possorted_genome_bam_DD1CT:\" + row[\"Barcode\"].split(\"-\")[0]+\"x\"\n", 143 | " outputs[cat][1].writerow(row)\n", 144 | " # Close all the files\n", 145 | " for fout, _ in outputs.values():\n", 146 | " fout.close()" 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3 (ipykernel)", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.9.12" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OSCC Spatial Transcriptomics 2 | Code from Arora and Cao et al, 2023. for the analysis of oral cancer spatial transcriptomics data. 3 | 4 | Please separately cite this code base as: [![DOI](https://zenodo.org/badge/434734032.svg)](https://zenodo.org/badge/latestdoi/434734032) 5 | --------------------------------------------------------------------------------