├── 3CA_v2 ├── MP_analysis_new_batch.R ├── README.md ├── cc_analysis.R ├── cc_analysis_consensus_comparison.R ├── ct_specificity.R ├── gene_analysis.R ├── mp_new.R └── pseudobulks.R ├── ITH_hallmarks ├── Generating_MPs │ ├── Generate_Meta_Programs.R │ ├── Genes_nmf_w_basis_example.RDS │ ├── custom_magma.R │ └── robust_nmf_programs.R ├── MPs_distribution │ ├── MP_distribution.R │ ├── MP_list.RDS.gz │ ├── My_study.RDS.gz │ └── heatCols.RDS.gz ├── README.md └── TCGA_analysis │ ├── functions.R │ ├── pdac_classical_main.R │ ├── preprocess_clinical_data.R │ ├── preprocess_mutations.R │ ├── preprocess_tcga_data.R │ ├── study_contribution_per_MP.RDS │ ├── study_tcga_map.R │ ├── tcga_clinical.R │ ├── tcga_clinical_deconv.R │ ├── tcga_clinical_meta.R │ ├── tcga_mp_scores.R │ ├── tcga_mp_scores_deconv.R │ ├── tcga_mutations_new.R │ └── tcga_subtypes_data.R ├── LICENSE ├── README.md ├── aliases_pubmed.R ├── aliases_table.R ├── bins.R ├── canonical_markers.R ├── cc_sigs_consensus.R ├── cna_mat_prep.R ├── functions.R ├── gene_ave.R ├── gene_mp_cor.R ├── gene_mp_cor_all_web.R ├── gene_plots_cell_types.R ├── gene_plots_data_all_web.R ├── gene_plots_render_web.R ├── gene_plots_rmd_web.R ├── mp_scores_all.R ├── mp_scores_prep.R ├── study_plots_cc.R ├── study_plots_cna.R ├── study_plots_ct_comp.R ├── study_plots_ct_umap.R ├── study_plots_data_cc.R ├── study_plots_data_cc_consensus.R ├── study_plots_data_cc_thresh.R ├── study_plots_data_cc_thresh_consensus.R ├── study_plots_data_cna.R ├── study_plots_data_ct_comp.R ├── study_plots_data_ct_umap.R ├── study_plots_data_dist.R └── study_plots_dist.R /3CA_v2/README.md: -------------------------------------------------------------------------------- 1 | # Analysis with 3CA version 2 2 | 3 | The 3CA_v2 directory contains code for analyses conducted on the expanded version of 3CA, including: cell cycle analysis; measurement of gene specificity and sensitivity; quantification of cell type specificity; and an updated characterisation of recurrent expression programs (meta-programs/MPs). All analyses were run using R version 4.1.1. 4 | -------------------------------------------------------------------------------- /3CA_v2/cc_analysis_consensus_comparison.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(ggplot2) 3 | library(magrittr) 4 | library(Matrix) 5 | library(stringr) 6 | library(plyr) 7 | library(cowplot) 8 | library(RColorBrewer) 9 | library(scales) 10 | library(readxl) 11 | library(ggpubr) 12 | library(ggtext) 13 | library(matkot) 14 | 15 | try(library(randomcoloR), silent = TRUE) 16 | 17 | source('functions.R') 18 | 19 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 20 | 21 | 22 | 23 | 24 | 25 | diseases <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) { 26 | cat(r, '\n') 27 | samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv') 28 | if(file.exists(samples_path)) { 29 | samples <- fread(samples_path)[!is.na(sample)] # Setting na.strings = '' doesn't seem to work here... 30 | samples[, sample := as.character(sample)] 31 | samples[cancer_type == '', cancer_type := NA] 32 | setkey(samples, sample) 33 | } else { 34 | warning("Samples file doesn't exist for ", r[1], ", ", r[2]) 35 | return(NULL) 36 | } 37 | if('cancer_type' %in% names(samples)) { 38 | return(samples[, .(study = r[1], cancer_type = r[2], sample = sample, disease = cancer_type)]) 39 | } else { 40 | warning("Samples file for ", r[1], ", ", r[2], " doesn't contain cancer_type column") 41 | return(NULL) 42 | } 43 | }) %>% rbindlist 44 | to_exclude <- list(c('Chen et al. 2020', 'Head and Neck'), c('Sun et al. 2021', 'Liver/Biliary')) %>% 45 | transpose %>% as.data.table %>% setNames(c('study', 'cancer_type')) 46 | setkey(diseases, study, cancer_type) 47 | diseases_to_include <- diseases[!to_exclude][ 48 | !is.na(disease) & !(disease %in% c('Normal', 'Premalignant')), 49 | .SD[, .(n_sample = .N, n_study = nrow(unique(.SD))), by = disease, .SDcols = c('study', 'cancer_type')][ 50 | (n_study >= 2 & n_sample >= 10) | (n_sample >= 20), 51 | disease 52 | ] 53 | ] 54 | to_include <- diseases[!to_exclude][disease %in% diseases_to_include, .(study, cancer_type)] %>% unique 55 | 56 | 57 | 58 | 59 | 60 | set.seed(1852) 61 | cc_prop <- lapply(c('data_cc_consensus.rds', 'data_cc.RDS'), function(fn) rbindlist(lapply(transpose(as.list(to_include)), function(r) { 62 | 63 | cat(r, '\n') 64 | 65 | # Only difference is to change 'data_cc_consensus.rds' to 'data_cc.RDS' in the following two lines: 66 | if(!(fn %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 67 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/', fn)) 68 | 69 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 70 | if(all(nullcond)) return(NULL) 71 | 72 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 73 | 74 | samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv') 75 | if(file.exists(samples_path)) { 76 | samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '')[!is.na(sample) & cancer_type %in% diseases_to_include] 77 | setkey(samples, sample) 78 | } 79 | 80 | # Split HNSCC samples into HPV pos and neg: 81 | if(all(r == c('Puram et al. 2017', 'Head and Neck'))) { 82 | samples <- samples[sample %in% c('5', '6', '16', '17', '18', '20', '22', '25', '26', '28')][, cancer_type := 'HNSCC (HPV-neg.)'] 83 | } else if(all(r == c('Cillo et al. 2020', 'Head and Neck'))) { 84 | samples[, cancer_type := ifelse(p16_status == 'p16+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')] 85 | } else if(all(r == c('Kürten et al. 2021', 'Head and Neck'))) { 86 | samples[, cancer_type := ifelse(additional_tumor_characterisics == 'HPV Positive', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')] 87 | } else if(all(r == c('Unpublished HNSCC HTAN', 'Head and Neck'))) { 88 | samples[, cancer_type := ifelse(hpv == 'HPV+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')] 89 | } 90 | 91 | # Split CRC and Gastric into MSS and MSI: 92 | if(all(r == c('Lee et al. 2020', 'Colorectal'))) { 93 | samples[, cancer_type := ifelse(grepl('mss', genetic_hormonal_features), 'CRC (MSS)', 'CRC (MSI)')] 94 | } else if(all(r == c('Pelka et al. 2021', 'Colorectal'))) { 95 | samples[, cancer_type := ifelse(MSIStatus == 'MSS', 'CRC (MSS)', 'CRC (MSI)')] 96 | } else if(all(r == c('Zhang et al. 2018', 'Colorectal'))) { 97 | samples[, cancer_type := ifelse(msi_status == 'MSS', 'CRC (MSS)', 'CRC (MSI)')] 98 | } else if(all(r == c('Kumar et al. 2022', 'Gastric'))) { 99 | samples[, cancer_type := ifelse(is.na(subtype) | subtype == '', NA, ifelse(subtype == 'MSI', 'Gastric (MSI)', 'Gastric (MSS)'))] 100 | } else if(all(r == c('Zhang et al. 2019', 'Gastric'))) { 101 | samples[, cancer_type := NA] # There's only 1 true gastric cancer sample in this dataset, and it's not annotated with MSI status. 102 | } 103 | 104 | samples <- samples[!is.na(cancer_type)]; if(nrow(samples) == 0) return(NULL) 105 | 106 | rout <- lapply(which(!nullcond), function(i) { 107 | 108 | dt <- plot_data[[i]]$ccdata 109 | dt <- dt[sample %in% samples$sample][, c('disease', 'tech') := do.call(`[`, list(samples, sample))[, .(cancer_type, technology)]] 110 | if(!(nrow(dt) >= 100 & 'cell_type' %in% names(dt))) return(NULL) 111 | dt <- dt[, .(cell_name, cell_type, sample, disease, tech, phase)] 112 | if(any(c('Myeloid', 'Monocyte') %in% dt$cell_type)) dt[cell_type %in% c('Myeloid', 'Monocyte'), cell_type := 'Macrophage'] 113 | 114 | # Add complexity column to dt: 115 | cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = '', colClasses = c(cell_name = 'character', sample = 'character'))) 116 | setkey(cells, cell_name) 117 | if('complexity' %in% names(cells)) dt[, n_gene := do.call(`[`, list(cells, cell_name, 'complexity'))] else dt[, n_gene := as.numeric(NA)] 118 | return(dt) 119 | 120 | }) %>% rbindlist 121 | 122 | if(is.null(rout) || nrow(rout) == 0) return(NULL) 123 | 124 | # Cell cycle proportions across all cells in a dataset (per disease and tech): 125 | rout_all <- rout[, # Require at least 100 cells altogether, across samples 126 | if(.N >= 100) .(sample = 'all', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'), 127 | n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)), 128 | by = .(cell_type, disease, tech) 129 | ] 130 | 131 | # As above but imposing bounds on the number of cells per sample, so that samples with very many cells don't skew the average: 132 | rout_all_b <- rout[, 133 | if(.N >= 100) { # Require at least 100 cells altogether, across samples 134 | bounds <- .SD[, .(N = .N), by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))] 135 | .SD[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .SD else .SD[sample(1:.N, bounds[1])]}, by = sample][, 136 | if(.N >= 100) .(sample = 'all_b', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'), 137 | n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)) 138 | ] 139 | }, 140 | by = .(cell_type, disease, tech) 141 | ] 142 | 143 | # Cell cycle proportions per sample: 144 | rout_smpl <- rout[, # Require at least 100 cells per sample 145 | if(.N >= 100) .(n_sample = NA, n_cell = .N, n_g1s = sum(phase == 'G1/S'), n_g2m = sum(phase == 'G2/M'), 146 | n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)), 147 | by = .(cell_type, disease, tech, sample) 148 | ] 149 | 150 | rout <- rbindlist(list(rout_all, rout_all_b, rout_smpl)[c(nrow(rout_all) > 0, nrow(rout_all_b) > 0, nrow(rout_smpl) > 0)]) 151 | 152 | if(!is.null(rout) && nrow(rout) > 0) { 153 | rout <- unique(rout) 154 | rout[, c('cancer_type', 'study') := .(r[2], r[1])] 155 | setcolorder(rout, c('cancer_type', 'study')) 156 | return(rout) 157 | } 158 | 159 | }))) %>% setNames(c('cons', 'bes')) 160 | 161 | for(x in c('cons', 'bes')) { 162 | cc_prop[[x]] <- cc_prop[[x]][!(cancer_type == 'Brain' & cell_type == 'Fibroblast')][, disease := mapvalues( 163 | gsub(' Cancer', '', disease), 164 | c('Acute Myeloid Leukemia', 'Chronic Myeloid Leukemia', 'Clear Cell Renal Cell Carcinoma', 'Colorectal', 'Cutaneous Basal Cell Carcinoma', 165 | 'Cutaneous Squamous Cell Carcinoma', 'Diffuse Large B Cell Lymphoma', 'Glioblastoma', 'Hepatocellular Carcinoma', 'Lung Adenocarcinoma', 166 | 'Lung Squamous Cell Carcinoma', 'Multiple Myeloma', 'Neuroendocrine Tumor', 'Pancreatic Ductal Adenocarcinoma', 'Small Cell Lung'), 167 | c('AML', 'CML', 'ccRCC', 'CRC', 'Skin BCC', 'Skin SCC', 'DLBCL', 'GBM', 'HCC', 'Lung Adeno.', 'Lung Squamous', 'MM', 'NET', 'PDAC', 'SCLC'), 168 | warn_missing = FALSE 169 | )] 170 | } 171 | 172 | common_cts <- cc_prop$cons[sample == 'all_b', .(n = nrow(unique(.SD))), by = cell_type, .SDcols = c('study', 'disease')][n >= 10, cell_type] 173 | 174 | 175 | 176 | 177 | 178 | cor_data <- do.call(merge, lapply(names(cc_prop), function(x) rbind( 179 | cc_prop[[x]][ 180 | sample == 'all_b' & cell_type %in% common_cts, 181 | setNames(.('Cell cycle proportion', weighted.mean((n_g1s + n_g2m + n_int)/n_cell, n_sample)), c('v', paste0('val_', x))), 182 | by = .(cell_type, study, disease) 183 | ], 184 | cc_prop[[x]][ 185 | sample == 'all_b' & cell_type %in% common_cts & n_g1s + n_g2m + n_int >= 50, 186 | setNames(.('Phase bias', weighted.mean((n_g1s - n_g2m)/(n_g1s + n_g2m + n_int), n_sample)), c('v', paste0('val_', x))), 187 | by = .(cell_type, study, disease) 188 | ] 189 | ))) 190 | 191 | ggplot(cor_data, aes(x = val_cons, y = val_bes)) + 192 | geom_point(size = 3, shape = 21, fill = 'lightblue') + 193 | facet_wrap(vars(v), nrow = 1, ncol = 2, scales = 'free') + 194 | geom_text( 195 | aes(x = x, y = y, label = l), 196 | cor_data[, 197 | .(x = min(val_cons) + 0.18*diff(range(val_cons)), y = min(val_bes) + 0.9*diff(range(val_bes)), 198 | l = paste('r =', signif(cor(val_bes, val_cons), 2))), 199 | by = v 200 | ], 201 | size = 8 202 | ) + 203 | geom_abline(slope = 1, intercept = 0, colour = 'tomato', linewidth = 1) + 204 | theme_bw() + 205 | theme( 206 | axis.text = element_text(size = 14), 207 | axis.title = element_text(size = 16), 208 | plot.title = element_text(size = 18), 209 | strip.text = element_text(size = 16), 210 | strip.background = element_rect(fill = 'grey95'), 211 | aspect.ratio = 1 212 | ) + 213 | labs(x = 'Consensus signature', y = 'Bespoke signature', title = 'Cell cycle quantification with bespoke and consensus gene signatures') 214 | -------------------------------------------------------------------------------- /3CA_v2/pseudobulks.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(plyr) 11 | library(stringr) 12 | library(Matrix) 13 | library(matkot) 14 | 15 | source('functions.R') 16 | 17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 18 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group) 19 | 20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol') 21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 22 | alias_table <- make_alias_table(hgnc_complete_set) 23 | 24 | genes_all <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)] 25 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell') 26 | rabbr <- paste(gsub(' | et al. ', '', r[1]), gsub(' |/', '-', r[2]), sep = '_') 27 | 28 | 29 | 30 | 31 | 32 | pbulks <- lapply(paths$group, function(g) { 33 | 34 | cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character'))) 35 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL) 36 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 37 | genes <- fread(paths[g, genes], header = FALSE)$V1 38 | expmat <- readMM(paths[g, expmat]) 39 | expmat <- expmat[genes != '', ] # In case any genes are empty strings 40 | genes <- genes[genes != ''] 41 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names 42 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 43 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols 44 | rownames(expmat) <- genes 45 | colnames(expmat) <- cells$cell_name 46 | 47 | genes <- genes[genes %in% genes_all & genes %in% names(table(genes))[table(genes) == 1]] 48 | 49 | cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells 50 | setkey(cells, sample, cell_type) 51 | cells <- cells[cells[, .N, by = .(sample, cell_type)][N >= 30, -'N']] 52 | 53 | if(nrow(cells) < 30) return(NULL) 54 | expmat <- round(1e+05*to_frac(expmat[genes, cells$cell_name]), 4) # Normalise to TPM/10 55 | 56 | Reduce(merge, apply(unique(cells[, .(sample, cell_type)]), 1, function(sct) { 57 | out <- as.data.table(rowMeans(expmat[, cells[as.list(sct), cell_name]]), keep.rownames = TRUE) 58 | names(out) <- c('gene', paste(rabbr, g, sct[1], gsub('_', '', sct[2]), sep = '_')) 59 | return(out) 60 | }, simplify = FALSE)) 61 | 62 | }) 63 | 64 | pbulks <- Reduce(function(x, y) merge(x, y, by = 'gene', all = TRUE), pbulks[!sapply(pbulks, is.null)]) 65 | 66 | if(!is.null(pbulks)) fwrite(pbulks, paste0('../data/pseudobulks/', rabbr, '.csv')) 67 | -------------------------------------------------------------------------------- /ITH_hallmarks/Generating_MPs/Generate_Meta_Programs.R: -------------------------------------------------------------------------------- 1 | 2 | # ---------------------------------------------------------------------------------------------------- 3 | # Load packages and functions 4 | # ---------------------------------------------------------------------------------------------------- 5 | 6 | library(reshape2) 7 | library(NMF) 8 | library(ggplot2) 9 | library(scales) 10 | source("custom_magma.R") 11 | source("robust_nmf_programs.R") 12 | 13 | 14 | ## Input: 15 | # Genes_nmf_w_basis is a list in which each entry contains NMF gene-scores of a single sample. In our study we ran NMF using ranks 4-9 on the top 7000 genes in each sample. Hence each entry in Genes_nmf_w_basis is a matrix with 7000 rows (genes) X 39 columns (NMF programs) 16 | # For the code below to run smoothly, please use the following nomenclature: 17 | # 1) End entry names in Genes_nmf_w_basis (i.e. each sample name) with '_rank4_9_nruns10.RDS' 18 | # 2) End each matrix column name with an extension that represents the NMF rank and program index. for example '_rank4_9_nrun10.RDS.4.1' to represent the first NMF program obtained using rank=4, or '_rank4_9_nrun10.RDS.6.5' to represent the fifth NMF program obtained using rank=6 19 | # See Genes_nmf_w_basis_example.RDS for an example 20 | # We define MPs in 2 steps: 21 | # 1) The function robust_nmf_programs.R performs filtering, so that programs selected for defining MPs are: 22 | # i) Robust - recur in more that one rank within the sample 23 | # ii) Non-redundant - once a NMF program is selected, other programs within the sample that are similar to it are removed 24 | # iii) Not sample specific - has similarity to NMF programs in other samples 25 | # ** Please see https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs 26 | # 2) Selected NMFs are then clustered iteratively, as described in Figure S1. At the end of the process, each cluster generates a list of the 50 genes (i.e. the MP) that represent the NMF programs that contributed to the cluster. Notably, not all initially selected NMFs end up participating in a cluster 27 | 28 | 29 | # ---------------------------------------------------------------------------------------------------- 30 | # Select NMF programs 31 | # ---------------------------------------------------------------------------------------------------- 32 | 33 | 34 | ## Parameters 35 | intra_min_parameter <- 35 36 | intra_max_parameter <- 10 37 | inter_min_parameter <- 10 38 | 39 | 40 | # get top 50 genes for each NMF program 41 | nmf_programs <- lapply(Genes_nmf_w_basis, function(x) apply(x, 2, function(y) names(sort(y, decreasing = T))[1:50])) 42 | nmf_programs <- lapply(nmf_programs,toupper) ## convert all genes to uppercase 43 | 44 | # for each sample, select robust NMF programs (i.e. observed using different ranks in the same sample), remove redundancy due to multiple ranks, and apply a filter based on the similarity to programs from other samples. 45 | nmf_filter_ccle <- robust_nmf_programs(nmf_programs, intra_min = intra_min_parameter, intra_max = intra_max_parameter, inter_filter=T, inter_min = inter_min_parameter) 46 | nmf_programs <- lapply(nmf_programs, function(x) x[, is.element(colnames(x), nmf_filter_ccle),drop=F]) 47 | nmf_programs <- do.call(cbind, nmf_programs) 48 | 49 | # calculate similarity between programs 50 | nmf_intersect <- apply(nmf_programs , 2, function(x) apply(nmf_programs , 2, function(y) length(intersect(x,y)))) 51 | 52 | # hierarchical clustering of the similarity matrix 53 | nmf_intersect_hc <- hclust(as.dist(50-nmf_intersect), method="average") 54 | nmf_intersect_hc <- reorder(as.dendrogram(nmf_intersect_hc), colMeans(nmf_intersect)) 55 | nmf_intersect <- nmf_intersect[order.dendrogram(nmf_intersect_hc), order.dendrogram(nmf_intersect_hc)] 56 | 57 | 58 | 59 | # ---------------------------------------------------------------------------------------------------- 60 | # Cluster selected NMF programs to generate MPs 61 | # ---------------------------------------------------------------------------------------------------- 62 | 63 | 64 | ### Parameters for clustering 65 | Min_intersect_initial <- 10 # the minimal intersection cutoff for defining the first NMF program in a cluster 66 | Min_intersect_cluster <- 10 # the minimal intersection cutoff for adding a new NMF to the forming cluster 67 | Min_group_size <- 5 # the minimal group size to consider for defining the first NMF program in a cluster 68 | 69 | Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE) 70 | 71 | Cluster_list <- list() ### Every entry contains the NMFs of a chosen cluster 72 | MP_list <- list() 73 | k <- 1 74 | Curr_cluster <- c() 75 | 76 | nmf_intersect_original <- nmf_intersect 77 | 78 | while (Sorted_intersection[1]>Min_group_size) { 79 | 80 | Curr_cluster <- c(Curr_cluster , names(Sorted_intersection[1])) 81 | 82 | ### intersection between all remaining NMFs and Genes in MP 83 | Genes_MP <- nmf_programs[,names(Sorted_intersection[1])] # Genes in the forming MP are first chosen to be those in the first NMF. Genes_MP always has only 50 genes and evolves during the formation of the cluster 84 | nmf_programs <- nmf_programs[,-match(names(Sorted_intersection[1]) , colnames(nmf_programs))] # remove selected NMF 85 | Intersection_with_Genes_MP <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP 86 | NMF_history <- Genes_MP # has genes in all NMFs in the current cluster, for redefining Genes_MP after adding a new NMF 87 | 88 | ### Create gene list is composed of intersecting genes (in descending order by frequency). When the number of genes with a given frequency span bewond the 50th genes, they are sorted according to their NMF score. 89 | while ( Intersection_with_Genes_MP[1] >= Min_intersect_cluster) { 90 | 91 | Curr_cluster <- c(Curr_cluster , names(Intersection_with_Genes_MP)[1]) 92 | 93 | Genes_MP_temp <- sort(table(c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]])), decreasing = TRUE) ## Genes_MP is newly defined each time according to all NMFs in the current cluster 94 | Genes_at_border <- Genes_MP_temp[which(Genes_MP_temp == Genes_MP_temp[50])] ### genes with overlap equal to the 50th gene 95 | 96 | if (length(Genes_at_border)>1){ 97 | ### Sort last genes in Genes_at_border according to maximal NMF gene scores 98 | ### Run across all NMF programs in Curr_cluster and extract NMF scores for each gene 99 | Genes_curr_NMF_score <- c() 100 | for (i in Curr_cluster) { 101 | curr_study <- paste( strsplit(i , "[.]")[[1]][1 : which(strsplit(i , "[.]")[[1]] == "RDS")] , collapse = "." ) 102 | Q <- Genes_nmf_w_basis[[curr_study]][ match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]])))[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))] ,i] 103 | names(Q) <- names(Genes_at_border[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))]) ### sometimes when adding genes the names do not appear 104 | Genes_curr_NMF_score <- c(Genes_curr_NMF_score, Q ) 105 | } 106 | Genes_curr_NMF_score_sort <- sort(Genes_curr_NMF_score , decreasing = TRUE) 107 | Genes_curr_NMF_score_sort <- Genes_curr_NMF_score_sort[unique(names(Genes_curr_NMF_score_sort))] 108 | 109 | Genes_MP_temp <- c(names(Genes_MP_temp[which(Genes_MP_temp > Genes_MP_temp[50])]) , names(Genes_curr_NMF_score_sort)) 110 | 111 | } else { 112 | Genes_MP_temp <- names(Genes_MP_temp)[1:50] 113 | } 114 | 115 | NMF_history <- c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]]) 116 | Genes_MP <- Genes_MP_temp[1:50] 117 | 118 | nmf_programs <- nmf_programs[,-match(names(Intersection_with_Genes_MP)[1] , colnames(nmf_programs))] # remove selected NMF 119 | 120 | Intersection_with_Genes_MP <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP 121 | 122 | } 123 | 124 | Cluster_list[[paste0("Cluster_",k)]] <- Curr_cluster 125 | MP_list[[paste0("MP_",k)]] <- Genes_MP 126 | k <- k+1 127 | 128 | nmf_intersect <- nmf_intersect[-match(Curr_cluster,rownames(nmf_intersect) ) , -match(Curr_cluster,colnames(nmf_intersect) ) ] # Remove current chosen cluster 129 | 130 | Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE) # Sort intersection of remaining NMFs not included in any of the previous clusters 131 | 132 | Curr_cluster <- c() 133 | print(dim(nmf_intersect)[2]) 134 | } 135 | 136 | 137 | 138 | #### Sort Jaccard similarity plot according to new clusters: 139 | 140 | inds_sorted <- c() 141 | 142 | for (j in 1:length(Cluster_list)){ 143 | 144 | inds_sorted <- c(inds_sorted , match(Cluster_list[[j]] , colnames(nmf_intersect_original))) 145 | 146 | } 147 | inds_new <- c(inds_sorted , which(is.na( match(1:dim(nmf_intersect_original)[2],inds_sorted)))) ### clustered NMFs will appear first, and the latter are the NMFs that were not clustered 148 | 149 | nmf_intersect_meltI_NEW <- reshape2::melt(nmf_intersect_original[inds_new,inds_new]) 150 | 151 | ggplot(data = nmf_intersect_meltI_NEW, aes(x=Var1, y=Var2, fill=100*value/(100-value), color=100*value/(100-value))) + 152 | geom_tile() + 153 | scale_color_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") + 154 | scale_fill_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") + 155 | theme( axis.ticks = element_blank(), panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 11), axis.title = element_text(size = 12), legend.title = element_text(size=11), legend.text = element_text(size = 10), legend.text.align = 0.5, legend.justification = "bottom") + 156 | theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + 157 | theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + 158 | guides(fill = guide_colourbar(barheight = 4, barwidth = 1)) 159 | 160 | 161 | MP_list <- do.call(cbind, MP_list) 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS -------------------------------------------------------------------------------- /ITH_hallmarks/Generating_MPs/custom_magma.R: -------------------------------------------------------------------------------- 1 | # Custom color palette 2 | library(RColorBrewer) 3 | library(viridis) 4 | custom_magma <- c(colorRampPalette(c("white", rev(magma(323, begin = 0.15))[1]))(10), rev(magma(323, begin = 0.18))) -------------------------------------------------------------------------------- /ITH_hallmarks/Generating_MPs/robust_nmf_programs.R: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------- 2 | # Function for selecting robust nonnegative matrix factorization (NMF) programs 3 | # ------------------------------------------------------------------------------------------- 4 | 5 | # - nmf_programs = a list; each element contains a matrix with NMF programs (top 50 genes) generated for a specific cell line using different NMF factorization ranks. 6 | # - intra_min = minimum overlap with a program from the same cell line (for selecting robust programs) 7 | # - intra_max = maximum overlap with a program from the same cell line (for removing redundant programs) 8 | # - inter_filter = logical; indicates whether programs should be filtered based on their similarity to programs of other cell lines 9 | # - inter_min = minimum overlap with a program from another cell line 10 | 11 | # Returns a character vector with the NMF programs selected 12 | 13 | robust_nmf_programs <- function(nmf_programs, intra_min = 35, intra_max = 10, inter_filter=T, inter_min = 10) { 14 | 15 | # Select NMF programs based on the minimum overlap with other NMF programs from the same cell line 16 | intra_intersect <- lapply(nmf_programs, function(z) apply(z, 2, function(x) apply(z, 2, function(y) length(intersect(x,y))))) 17 | intra_intersect_max <- lapply(intra_intersect, function(x) apply(x, 2, function(y) sort(y, decreasing = T)[2])) 18 | nmf_sel <- lapply(names(nmf_programs), function(x) nmf_programs[[x]][,intra_intersect_max[[x]]>=intra_min]) 19 | names(nmf_sel) <- names(nmf_programs) 20 | 21 | # Select NMF programs based on i) the maximum overlap with other NMF programs from the same cell line and 22 | # ii) the minimum overlap with programs from another cell line 23 | nmf_sel_unlist <- do.call(cbind, nmf_sel) 24 | inter_intersect <- apply(nmf_sel_unlist , 2, function(x) apply(nmf_sel_unlist , 2, function(y) length(intersect(x,y)))) ## calculating intersection between all programs 25 | 26 | final_filter <- NULL 27 | for(i in names(nmf_sel)) { 28 | a <- inter_intersect[grep(i, colnames(inter_intersect), invert = T),grep(i, colnames(inter_intersect))] 29 | b <- sort(apply(a, 2, max), decreasing = T) # for each cell line, ranks programs based on their maximum overlap with programs of other cell lines 30 | if(inter_filter==T) b <- b[b>=inter_min] # selects programs with a maximum intersection of at least 10 31 | if(length(b) > 1) { 32 | c <- names(b[1]) 33 | for(y in 2:length(b)) { 34 | if(max(inter_intersect[c,names(b[y])]) <= intra_max) c <- c(c,names(b[y])) # selects programs iteratively from top-down. Only selects programs that have a intersection smaller than 10 with a previously selected programs 35 | } 36 | final_filter <- c(final_filter, c) 37 | } else { 38 | final_filter <- c(final_filter, names(b)) 39 | } 40 | } 41 | return(final_filter) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /ITH_hallmarks/MPs_distribution/MP_distribution.R: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------- 2 | # Program for depicting the MP distribution of a given cell types across and within samples 3 | # ------------------------------------------------------------------------------------------- 4 | 5 | # My_study = a list in which each element represents a sample in the study; the elements are expression matrices of a given cell type (i.e. cell_type below) in CPM or TPM units. Row names in each matrix should be gene symbols, and column names the cell IDs. 6 | # cell_type = one of the following cell types : "Cancer" , "Endothelial" , "Epithelial" , "Fibroblasts" , "Macrophages" , "CD4_T_cells", "CD8_T_cells" , "B_cells". Here "Cancer" represents malignant cells 7 | # MinGenes = the min number of MP genes that should exist in the study (default is 25) 8 | # MinScore = the minimal score for assigning a cell to a MP (default is 1). A cell is assigned to the MP with the maximal score, given that it exceeded MinScore. If the maximal score is below MinScore, the cell is unassigned. 9 | # MinCells = the minimal % of cells that should be assigned to a MP in order to account for the MP (default is 0.05) 10 | # MP_list = a list of the MPs for each cell_type 11 | 12 | ### Output: 13 | # (a) Pie chart of the MP distribution across the study 14 | # (b) Bar plot per tumor with the % of cells per MP 15 | # (c) Expression heatmap per tumor 16 | 17 | 18 | 19 | # ------------------------------------------------------------------------------------------- 20 | 21 | 22 | library(ggplot2) 23 | library(scalop) # see https://rdrr.io/github/jlaffy/scalop/man/sigScores.html 24 | library(gridExtra) 25 | library(ggpubr) 26 | 27 | 28 | 29 | ### Define the following: 30 | # My_study = 31 | # cell_type = 32 | # MP_list = readRDS("MP_list.RDS") # can be found in the Github repository 33 | # heatCols = readRDS("heatCols.RDS") # can be found in the Github repository 34 | 35 | 36 | Score_cells <- function(L, 37 | cell_type, 38 | MinGenes = 25, 39 | MinScore = 1, 40 | MinCells = 0.05, 41 | MP_list 42 | ) 43 | { 44 | 45 | MP_list <- MP_list[[cell_type]] 46 | 47 | L <- lapply(L, function(x) log2((x/10) + 1) ) # Log normalize before scoring 48 | 49 | MP_scores_per_sample <- lapply(L, function(x) scalop::sigScores(x, sigs = MP_list, conserved.genes = MinGenes/50 ) ) 50 | 51 | remove_cells <- function(x,MinScore){ # remove cells whose max score was below MinScore 52 | max_score <- apply(x, 1, function(y) max(y)) 53 | cells_rm <- which(max_score < MinScore) 54 | if (length(cells_rm) > 0 ){ 55 | x <- x[-cells_rm , ] 56 | } 57 | return(x) 58 | } 59 | MP_scores_per_sample <- lapply(MP_scores_per_sample, function(x) remove_cells(x,MinScore)) 60 | 61 | Assign_MP_per_cell <- lapply(MP_scores_per_sample, function(x) apply(x, 1, function(y) colnames(x)[which(y==max(y))] ) ) 62 | 63 | filter_cells <- function(x,MinCells){ # remove MP that were assassin to less than MinCells in each sample 64 | MP_frequency <- as.numeric(ave(x, x, FUN = length)) 65 | MP_rm <- which(MP_frequency/length(x) < MinCells) # MPs to be removed 66 | if (length(MP_rm)>0){ 67 | x <- x[-MP_rm] 68 | } 69 | return(x) 70 | } 71 | Assign_MP_per_cell_filtered <- lapply(Assign_MP_per_cell, function(x) filter_cells(x,MinCells)) 72 | 73 | return(Assign_MP_per_cell_filtered) 74 | 75 | } 76 | 77 | 78 | Assign_MP_per_cell_filtered <- Score_cells(L = My_study , cell_type = "Cancer" , MP_list=MP_list) 79 | 80 | 81 | 82 | #### Part A: pie chart of MPs across the whole study 83 | MinCells <- 0.05 84 | df <- data.frame(MPs = names(table(unlist(Assign_MP_per_cell_filtered))) , frequency = as.vector(table(unlist(Assign_MP_per_cell_filtered))) ) 85 | MPs_rm <- which(df$frequency/sum(df$frequency) < MinCells) ## remove also MPs that were assassin to less than MinCells from the total 86 | if (length(MPs_rm > 0)){ 87 | df$MPs[MPs_rm] <- paste0("Other MPs (<", MinCells*100, "%)") 88 | } 89 | 90 | 91 | blank_theme <- theme_minimal()+ 92 | theme( 93 | axis.title.x = element_blank(), 94 | axis.title.y = element_blank(), 95 | panel.border = element_blank(), 96 | panel.grid=element_blank(), 97 | axis.ticks = element_blank(), 98 | plot.title=element_text(size=14, face="bold") 99 | ) 100 | 101 | 102 | pie <- ggplot(df, aes(x="", y=frequency, fill=MPs)) + 103 | geom_bar(width = 1, stat = "identity") + 104 | coord_polar("y", start=0)+ 105 | blank_theme + 106 | theme(axis.text.x=element_blank()) 107 | 108 | 109 | pie 110 | 111 | 112 | #### Part B: Bar plot with MP distribution per sample 113 | 114 | sample_num <- length(Assign_MP_per_cell_filtered) 115 | nCol <- floor(sqrt(sample_num)) 116 | MPs <- unique(unlist(Assign_MP_per_cell_filtered)) 117 | 118 | ### For arranging the full MP names as legend in the plot 119 | MP_list <- MP_list[[cell_type]] 120 | MPs <- MPs[sort(match(MPs, names(MP_list)),index.return = T)$ix] 121 | 122 | Assign_MP_per_cell_filtered_abbrev <- lapply(Assign_MP_per_cell_filtered, function(x) apply(as.data.frame(x) , 1 , function(x) strsplit(x, "[ ]")[[1]][1]) ) 123 | 124 | df_list <- lapply(Assign_MP_per_cell_filtered_abbrev, function(x) data.frame(MPs = names(table(x)) , frequency = as.vector(table(x))/sum(as.vector(table(x))) )) 125 | df_list2 <- lapply(Assign_MP_per_cell_filtered, function(x) data.frame(MPs = names(table(x)) , frequency = as.vector(table(x))/sum(as.vector(table(x))) )) 126 | 127 | P <- lapply(seq_along(df_list), function(I) 128 | ggplot(data=df_list[[I]], aes(x=reorder(MPs,-frequency), y=frequency)) + 129 | geom_bar(stat="identity")+ 130 | theme(axis.title.x=element_blank())+ 131 | ggtitle(names(df_list)[I]) 132 | ) 133 | 134 | 135 | P 136 | 137 | #### Part C: an expression heatmap for each sample 138 | 139 | df_list_ordered <- lapply(df_list, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,]) ## order MPs in each sample as in the bar plots, in decreasing order 140 | df_list_ordered2 <- lapply(df_list2, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,]) ## order MPs in each sample as in the bar plots, in decreasing order 141 | 142 | 143 | L1 <- lapply(seq_along(My_study), function(I) My_study[[I]][ , names(Assign_MP_per_cell_filtered[[I]])]) ## extract relevant cells that were assigned to an MP (at least 5%) 144 | names(L1) <- names(My_study) 145 | 146 | L1 <- lapply(L1, function(x) log2((x/10) + 1) ) ## Log normalize 147 | L1 <- lapply(L1, function(x) (x - rowMeans(x)) ) ## center 148 | 149 | L2 <- lapply(seq_along(L1), function(I) L1[[I]][ , sort(match(Assign_MP_per_cell_filtered_abbrev[[I]],df_list_ordered[[I]]$MPs), index.return = T)$ix] ) # sort cells according to MPs in df_list_ordered 150 | names(L2) <- names(L1) 151 | 152 | MP_genes_per_sample <- lapply(df_list_ordered2, function(x) unlist(MP_list[x$MPs])) ### MP genes sorted by the MP order in df_list_ordered2 153 | 154 | L_plot <- lapply(seq_along(L2), function(I) L2[[I]][match(MP_genes_per_sample[[I]] , rownames(L2[[I]]))[!is.na(match(MP_genes_per_sample[[I]] , rownames(L2[[I]])))] , ]) 155 | names(L_plot) <- names(L2) 156 | 157 | color.scheme <- colorRampPalette(c(heatCols))(n=333) 158 | 159 | Plot_heatmap <- function(M){ 160 | 161 | M_new2 <- M 162 | M_new2 <- apply(M_new2, 2, rev) 163 | M_meltII <- reshape2::melt(t(M_new2)) 164 | M_meltII$Var2 <- factor(M_meltII$Var2) 165 | 166 | G <- ggplot(data = M_meltII, aes(x=Var1, y=Var2, fill=value, color=value)) + 167 | geom_tile() + 168 | scale_color_gradient2(limits=c(-4,4), low=color.scheme[1:111], mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL) + 169 | scale_fill_gradient2(limits=c(-4,4), low=color.scheme[1:111], mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL) + 170 | theme( panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 8), axis.title = element_text(size = 8), legend.title = element_text(size=8), legend.text = element_text(size = 8), legend.text.align = 0.5, legend.justification = "bottom" ) + 171 | theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + 172 | theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank() ) 173 | 174 | return(G) 175 | } 176 | 177 | P1 <- lapply(L_plot, function(x) Plot_heatmap(x)) 178 | 179 | P_plot <- c() 180 | for (i in 1:length(P)){ 181 | P_plot <- c(P_plot , P[i] , P1[i]) 182 | } 183 | 184 | 185 | P_plot <- do.call("ggarrange", c(P_plot, ncol=2, nrow = 3)) 186 | 187 | lapply(P_plot, function(x) 188 | 189 | annotate_figure(x, left = text_grob(paste(MPs , collapse = "\n"), 190 | color = "red", face = "bold", size = 10)) 191 | 192 | ) 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /ITH_hallmarks/MPs_distribution/MP_list.RDS.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/MP_list.RDS.gz -------------------------------------------------------------------------------- /ITH_hallmarks/MPs_distribution/My_study.RDS.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/My_study.RDS.gz -------------------------------------------------------------------------------- /ITH_hallmarks/MPs_distribution/heatCols.RDS.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/heatCols.RDS.gz -------------------------------------------------------------------------------- /ITH_hallmarks/README.md: -------------------------------------------------------------------------------- 1 | # Source code for Gavish et al. 2 | 3 | The ITH_hallmarks folder contains code for reproducing the analyses in the Gavish et al. This code is arranged into 3 folders. 4 | 5 | ## Generating_MPs 6 | - Generate_Meta_Programs.R generates MPs from NMFs programs that were calculated for each sample using different ranks. The NMF programs were calculated per sample using the ‘NMF’ R package: 7 | NMFs_per_sample = nmf(x = Expression_matrix, rank = 4:9, method="snmf/r", nrun = 10) 8 | NMF programs are listed in Genes_nmf_w_basis, where each entry contains NMF gene-scores of a single sample. 9 | Using the function robust_nmf_programs.R (described below) and a custom written clustering method, MPs are derived from the NMF programs. 10 | - Genes_nmf_w_basis_example.RDS is an example for how the NMF output is arranged. Note that each element name in the list ends with '_rank4_9_nruns10.RDS', and each matrix column ends with an extension that represents the NMF rank and program index. 11 | - robust_nmf_programs.R filters outs redundant and non-robust NMF programs for each sample. The remaining NMF programs are those that appeared in more than one NMF rank within the sample and have similarity to a NMF program in a different sample. Please see also https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs. 12 | 13 | ## MPs_distribution 14 | - MP_distribution.R depicts the meta-program distribution of a certain cell type within individual samples and across the whole study. It requires a list of scRNAseq expression matrices of different samples in a study (of a certain cell type). 15 | - MP_list.RDS is a list of meta-programs for each cell type required for inferring the distributions in MP_distribution.R 16 | - heatCols.RDS is the colormap used for plotting heatmaps in MP_distribution.R 17 | - My_study.RDS can be used for an example 18 | 19 | ## TCGA_analysis 20 | These scripts are for analysing expression of the MPs in TCGA data. A few notes on the data used here: 21 | - Some of the external datasets used are annotated with their source (e.g. URL to paper) via comments in the code. 22 | - The 'hgnc_complete_set.txt' file can be downloaded from the HGNC website: . 23 | - Other datasets used are available on request. 24 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/functions.R: -------------------------------------------------------------------------------- 1 | make_alias_table <- function(hgnc_complete_set) { 2 | out <- hgnc_complete_set[, 3 | rbind( 4 | data.table(symbol_alt = str_split(alias_symbol, '\\|')[[1]], type = 'alias'), 5 | data.table(symbol_alt = str_split(prev_symbol, '\\|')[[1]], type = 'prev') 6 | ), 7 | by = symbol 8 | ][symbol_alt == '', symbol_alt := NA] 9 | setkey(out, symbol_alt) 10 | return(out) 11 | } 12 | 13 | update_symbols_fast <- function(symbols, alias_table) { 14 | if(length(symbols) != length(unique(symbols))) warning('Not all symbols are unique!') 15 | setkey(alias_table, symbol_alt) 16 | symbols[grep('C[0-9]+ORF[0-9]+', symbols)] <- gsub('ORF', 'orf', symbols[grep('C[0-9]+ORF[0-9]+', symbols)]) 17 | cond <- !(symbols %in% alias_table$symbol) 18 | # The following addition to ensures two different symbols don't map to the same "prev" symbol... 19 | cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt]) 20 | # ...And the following makes sure we have all unique prev symbols: 21 | cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]]) 22 | if(any(cond)) { 23 | # alias_table[type == 'prev'][symbols[cond]] 24 | symbols[cond] <- alias_table[type == 'prev'][symbols[cond]][, 25 | .(newsymb = { # Match symbols that have at least one "prev" symbol that isn't already in symbols: 26 | cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)] 27 | ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such "prev" symbol 28 | }), 29 | by = symbol_alt 30 | ]$newsymb 31 | } 32 | cond <- !(symbols %in% alias_table$symbol) 33 | # As above, the following additions to ensure two different symbols don't map to the same "alias", and we have all unique aliases: 34 | cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt]) 35 | cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]]) 36 | if(any(cond)) { 37 | symbols[cond] <- alias_table[type == 'alias'][symbols[cond]][, 38 | .(newsymb = { # Match symbols that have at least one alias that isn't already in symbols: 39 | cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)] 40 | ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such alias 41 | }), 42 | by = symbol_alt 43 | ]$newsymb 44 | } 45 | return(symbols) 46 | } 47 | 48 | dirr <- function(x) dir(x, recursive = TRUE) 49 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/pdac_classical_main.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(readxl) 3 | library(magrittr) 4 | library(stringr) 5 | library(plyr) 6 | library(ggplot2) 7 | library(RColorBrewer) 8 | library(scales) 9 | library(cowplot) 10 | library(randomcoloR) 11 | library(matkot) 12 | 13 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP', 14 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary', 15 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM') 16 | 17 | cancer_genes <- as.data.table(read_xlsx('/home/labs/tirosh/tyler/pan_cancer/cancer5000.xlsx', skip = 1, n_max = 260))[`Cancer5000-S (219)` == 1, gene] 18 | 19 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[ 20 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) 21 | ][!grepl('unplaced', location), chrarm := str_extract(location, '[0-9]*[XY]*[pq]')][!is.na(chrarm)][ 22 | order(as.numeric(mapvalues(gsub('[pq]', '', chrarm), c('X', 'Y'), c(23, 24))), str_extract(chrarm, '[pq]')) 23 | ] 24 | 25 | 26 | 27 | 28 | 29 | meta_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Cells.csv', key = 'sample_id', na.strings = '') 30 | scores_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Scores.csv')[grep('MP17 Interferon|PDAC-classical', meta_program)] 31 | mut_data_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Mutation_data.csv') 32 | mut_data_hnsc <- mut_data_hnsc[gene %in% cancer_genes] 33 | setkey(mut_data_hnsc, gene, Variant_Classification) 34 | mut_data_hnsc <- rbind( 35 | mut_data_hnsc[Variant_Classification %in% c('Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Nonstop_Mutation', 36 | 'Translation_Start_Site')], 37 | mut_data_hnsc[ 38 | mut_data_hnsc[ 39 | Variant_Classification %in% c('Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins'), 40 | .(n = length(unique(patient_id))), 41 | by = .(gene, Variant_Classification) 42 | ][n >= 2, -'n'] 43 | ] 44 | ) 45 | pscores_hnsc <- copy(scores_hnsc)[, patient_id := do.call(`[`, args = list(meta_hnsc, sample_id))$patient_id][, -'sample_id'] 46 | setcolorder(pscores_hnsc, 'patient_id') 47 | setkey(pscores_hnsc, patient_id) 48 | pscores_hnsc <- pscores_hnsc[patient_id %in% mut_data_hnsc$patient_id] 49 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pscores_hnsc$patient_id] 50 | pids_tab <- pscores_hnsc[meta_program == meta_program[1], table(patient_id)] 51 | pids <- names(pids_tab)[pids_tab == 1] 52 | pscores_hnsc <- pscores_hnsc[patient_id %in% pids] 53 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pids] 54 | 55 | pscores_hnsc <- dcast(pscores_hnsc[, .(patient_id, meta_program, score)], patient_id ~ meta_program, value.var = 'score') 56 | genes <- c('CASP8', 'NSD1') 57 | pscores_hnsc[, (genes) := lapply(genes, function(g) patient_id %in% mut_data_hnsc[gene == g, patient_id])] 58 | 59 | # See how scores for this MP match with the trinity of CASP8-mut, HRAS-mut and TP53-WT: 60 | pscores_hnsc[, 61 | categ3 := patient_id %in% mut_data_hnsc[gene == 'CASP8', patient_id] & 62 | patient_id %in% mut_data_hnsc[gene == 'HRAS', patient_id] & 63 | !(patient_id %in% mut_data_hnsc[gene == 'TP53', patient_id]), 64 | by = patient_id 65 | ] 66 | 67 | bdata_hnsc_ifn <- melt( 68 | pscores_hnsc[, .(patient_id, `MP17 Interferon/MHC-II (I)`, CASP8, categ3)], 69 | id.vars = c('patient_id', 'MP17 Interferon/MHC-II (I)'), 70 | variable.name = 'event', 71 | value.name = 'event_pres' 72 | )[, event := mapvalues(event, c('CASP8', 'categ3'), c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt'))] 73 | box_hnsc_ifn <- ggplot(bdata_hnsc_ifn) + 74 | geom_boxplot(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), outlier.shape = NA) + 75 | geom_point(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), position = position_jitter(width = 0.3), shape = 21, fill = 'black', 76 | stroke = 0.3, alpha = 0.3) + 77 | geom_text( 78 | data = data.table( 79 | x = c(1.5, 1.5), 80 | y = c(3.1, 3.1), 81 | label = pscores_hnsc[, paste('p =', c( 82 | signif(t.test(`MP17 Interferon/MHC-II (I)` ~ CASP8)$p.value, 2), 83 | signif(t.test(`MP17 Interferon/MHC-II (I)` ~ categ3)$p.value, 2) 84 | ))], 85 | event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt') 86 | ), 87 | aes(x = x, y = y, label = label) 88 | ) + 89 | geom_segment( 90 | data = data.table(x = c(1, 1, 2, 1, 1, 2), xend = c(2, 1, 2, 2, 1, 2), y = rep(2.8, 6), yend = c(2.8, 2.7, 2.7, 2.8, 2.7, 2.7), 91 | event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt')), 92 | aes(x = x, xend = xend, y = y, yend = yend) 93 | ) + 94 | facet_wrap(vars(event), strip.position = 'bottom') + 95 | theme_test() + 96 | theme(strip.placement = 'outside', strip.background = element_rect(fill = NA, colour = NA), strip.text = element_text(size = 11, vjust = 1)) + 97 | labs(x = NULL, y = 'Interferon/MHC-II (I) score', title = 'Interferon/MHC-II (I) and mutations in HNSC') 98 | 99 | subtypes_hnsc <- fread('../../TCGA_data/HNSC/subtypes.csv', na.strings = '', key = 'patient_id') 100 | pscores_hnsc[, subtype := do.call(`[`, list(subtypes_hnsc, patient_id))$subtype] 101 | pscores_hnsc[, subtype_classical := subtype == 'Classical'] 102 | # pscores_hnsc[, t.test(subtype_classical ~ pdac_high)] # No enrichment of the Classical subtype in the PDAC-high population. 103 | 104 | clin_hnsc <- fread('../../TCGA_data/HNSC/Clinical.csv', key = 'patient_id') 105 | pscores_hnsc[, site := do.call(`[`, list(clin_hnsc, patient_id))$anatomic_neoplasm_subdivision] 106 | pscores_hnsc[, larynx := (site == 'larynx')] 107 | # pscores_hnsc[, t.test(larynx ~ pdac_high)] # Larynx is enriched in the PDAC-high population. 108 | 109 | pscores_hnsc[, larynx_nice := factor(ifelse(larynx, 'Larynx', 'Non-larynx'), levels = c('Non-larynx', 'Larynx'))] 110 | pscores_hnsc[, nsd1_nice := factor(ifelse(NSD1, 'NSD1-mut', 'NSD1-wt'))] 111 | bdata_hnsc_pdac_larynx <- rbind( 112 | pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = ifelse(larynx, 'Larynx', 'Non-larynx'))], 113 | pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = 'All tumors')] 114 | ) 115 | bdata_hnsc_pdac_larynx[, nsd1 := factor(ifelse(nsd1, 'NSD1-mut', 'NSD1-wt'), levels = c('NSD1-wt', 'NSD1-mut'))] 116 | bdata_hnsc_pdac_larynx[, categ := factor(categ, levels = c('All tumors', 'Non-larynx', 'Larynx'))] 117 | 118 | # Just the "All tumors" boxes: 119 | box_hnsc <- ggplot(bdata_hnsc_pdac_larynx[categ == 'All tumors']) + 120 | geom_boxplot(aes(x = nsd1, y = score), outlier.shape = NA, width = 0.8) + 121 | geom_point(aes(x = nsd1, y = score), fill = 'grey80', size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3)) + 122 | geom_text( 123 | data = data.table(x = 1.5, y = 2.9, label = paste('p =', signif(c(pscores_hnsc[, t.test(`MP30 PDAC-classical` ~ NSD1)$p.value]), 2))), 124 | aes(x = x, y = y, label = label) 125 | ) + 126 | geom_segment( 127 | data = data.table(x = c(1, 1, 2), xend = c(2, 1, 2), y = rep(2.7, 3), yend = c(2.7, 2.65, 2.65)), 128 | aes(x = x, xend = xend, y = y, yend = yend) 129 | ) + 130 | theme_test() + 131 | theme(axis.text.x = element_text(size = 11), plot.title = element_text(hjust = 0.5)) + 132 | labs(title = 'PDAC-classical and\nNSD1 mutation in HNSC', x = NULL, y = 'PDAC-classical score') 133 | 134 | 135 | 136 | 137 | 138 | library(seriation) 139 | 140 | scores_patient_mut_all <- fread('../data/scores_patient_mut.csv') 141 | 142 | exp_data <- slapply(c('LUAD', 'PAAD'), function(ct) { 143 | expmat <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1'] 144 | meta <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Cells.csv'), na.strings = '')[ 145 | sample_type != 'normal' & patient_id %in% scores_patient_mut_all[cancer_type == ct, patient_id] 146 | ] 147 | expmat <- expmat[, meta$sample_id] 148 | colnames(expmat) <- meta$patient_id 149 | return(list(expmat = expmat, meta = meta)) 150 | }) 151 | top_genes <- slapply( 152 | names(exp_data), 153 | function(ct) head(names(sort(apply(exp_data[[ct]]$expmat, 1, function(x) median(abs(x - median(x)))), decreasing = TRUE)), 2500) 154 | ) 155 | cormat <- slapply(names(exp_data), function(ct) cor(t(apply(exp_data[[ct]]$expmat[top_genes[[ct]], ], 1, function(x) x - median(x))))) 156 | clust <- slapply(names(exp_data), function(ct) seriate(dist(cormat[[ct]]), method = 'HC_average')[[1]]) 157 | pdac_clust <- labels(as.dendrogram(clust$LUAD)[[1]][[2]][[1]]) 158 | 159 | luad_clinical <- fread('/home/labs/tirosh/tyler/TCGA_data/LUAD/Clinical.csv', key = 'patient_id') 160 | hist_data <- copy(scores_patient_mut_all[cancer_type == 'LUAD' & grepl('PDAC Classical', meta_program)])[, 161 | c('pdac_high', 'hist_type') := .((score > 1.5), do.call(function(x) luad_clinical[x, histological_type], list(x = patient_id))) 162 | ][, mucinous := grepl('^mucinous| mucinous', hist_type)] # This regex ensures "nonmucinous" is not counted 163 | setkey(hist_data, patient_id) 164 | 165 | top_genes_int <- intersect(top_genes$LUAD, top_genes$PAAD) 166 | cor_luad_paad <- cor(exp_data$LUAD$expmat[top_genes_int, ], exp_data$PAAD$expmat[top_genes_int, ]) 167 | bxplt_data <- data.table(patient_id = rownames(cor_luad_paad), mean_cor = rowMeans(cor_luad_paad))[, pdac_clust := (patient_id %in% pdac_clust)] 168 | setkey(scores_patient_mut_all, patient_id) 169 | bxplt_data[, pdac_high := do.call(function(x) scores_patient_mut_all[grepl('PDAC Classical', meta_program)][x, score > 1.5], list(x = patient_id))] 170 | bxplt_data[, mucinous := hist_data$mucinous] # Checked that this gives TRUE: all(bxplt_data$patient_id == hist_data$patient_id) 171 | 172 | box_luad <- ggplot(bxplt_data, aes(x = mapvalues(pdac_high, c(TRUE, FALSE), c('PDAC score\n\u2265 1.5', 'PDAC score\n< 1.5')), y = mean_cor)) + 173 | geom_boxplot(outlier.shape = NA) + 174 | geom_point( 175 | aes(fill = factor(mapvalues(mucinous, c(TRUE, FALSE), c('Mucinous', 'Non-mucinous')), levels = c('Non-mucinous', 'Mucinous'))), 176 | size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3) 177 | ) + 178 | scale_fill_manual(values = c('Non-mucinous' = 'grey80', 'Mucinous' = 'deeppink2')) + 179 | theme_test() + 180 | theme(axis.text.x = element_text(size = 11)) + 181 | labs(x = NULL, y = 'Mean correlation with PDAC samples', title = NULL, fill = 'LUAD subtype') 182 | 183 | 184 | 185 | 186 | 187 | box_hnsc_grob <- ggplotGrob(box_hnsc) 188 | box_luad_grob <- ggplotGrob(box_luad) 189 | widths_hnsc <- c(0.3, 0, 0.5, 0.4, 8, 0, 0, 0, 0.3) 190 | widths_luad <- c(0.3, 0, 0.5, 0.7, 8, 0, 0, 0.3, 3, 0, 0.3) 191 | heights_hnsc <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3) 192 | heights_luad <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3) 193 | box_hnsc_grob$widths <- unit(widths_hnsc, 'cm') 194 | box_luad_grob$widths <- unit(widths_luad, 'cm') 195 | box_hnsc_grob$heights <- unit(heights_hnsc, 'cm') 196 | box_luad_grob$heights <- unit(heights_luad, 'cm') 197 | 198 | cairo_pdf('../data/pdac_classical_main.pdf', height = sum(heights_luad)/2.54, width = sum(c(widths_luad, widths_hnsc))/2.54) 199 | plot_grid(box_luad_grob, box_hnsc_grob, nrow = 1, ncol = 2, rel_widths = c(sum(widths_luad), sum(widths_hnsc))) 200 | dev.off() 201 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/preprocess_clinical_data.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(matkot) 4 | 5 | # source('functions.R') 6 | 7 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 8 | 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM') 9 | 10 | for(ct in cancer_types) { 11 | 12 | cat(ct, '\b...') 13 | 14 | if(ct %in% dir('~/TCGA_data') && 'All_CDEs.txt' %in% dir(paste0('~/TCGA_data/', ct))) { 15 | clinical_data <- fread(paste0('~/TCGA_data/', ct, '/All_CDEs.txt'), showProgress = FALSE) 16 | } else { 17 | 18 | download.file( 19 | paste0( 20 | 'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/', 21 | ct, 22 | '/20160128/gdac.broadinstitute.org_', 23 | ct, 24 | '.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz' 25 | ), 26 | destfile = 'tmp.tar.gz', 27 | quiet = TRUE 28 | ) 29 | 30 | file_names <- untar('tmp.tar.gz', list = TRUE) 31 | untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'All_CDEs.txt')], exdir = 'tmp') 32 | file.remove('tmp.tar.gz') 33 | 34 | # Read in the data: 35 | clinical_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'All_CDEs.txt')]), showProgress = FALSE) 36 | 37 | # Remove the created directory: 38 | unlink('tmp', recursive = TRUE) 39 | 40 | } 41 | 42 | clinical_data <- transpose(clinical_data, make.names = 'bcr_patient_barcode', keep.names = 'bcr_patient_barcode') 43 | clinical_data <- clinical_data[, -'patient_id'] 44 | setnames(clinical_data, 'bcr_patient_barcode', 'patient_id') 45 | clinical_data[, c('patient_id', 'cancer_type') := .(toupper(gsub('-', '\\.', patient_id)), ct)] 46 | clinical_data <- clinical_data[order(patient_id)] 47 | setcolorder(clinical_data, c('patient_id', 'cancer_type')) 48 | 49 | fwrite(clinical_data, paste0('~/TCGA_data/', ct, '/Clinical.csv')) 50 | 51 | cat('Done!\n') 52 | 53 | } 54 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/preprocess_mutations.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(limma) 4 | library(stringr) 5 | library(plyr) 6 | library(RCurl) 7 | library(matkot) 8 | 9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC', 10 | 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM') 11 | 12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[ 13 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]), 14 | .(symbol = symbol, alias = alias_symbol, entrez_id = entrez_id) 15 | ][, entrez_id := as.character(entrez_id)] 16 | 17 | 18 | 19 | 20 | 21 | for(ct in cancer_types) { 22 | 23 | cat(ct, '\n') 24 | 25 | # The CNV file URLs use one of three suffixes, so we need to choose whichever one exists: 26 | data_url <- sapply( 27 | c('-TP', '-TB', '-TM'), 28 | function(suff) paste0( 'http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/', ct, suff, '/20160128/gdac.broadinstitute.org_', ct, 29 | suff, '.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz'), 30 | USE.NAMES = FALSE 31 | ) 32 | data_url <- data_url[sapply(data_url, url.exists)] 33 | 34 | download.file(data_url, destfile = 'tmp.tar.gz', quiet = TRUE) 35 | 36 | file_names <- untar('tmp.tar.gz', list = TRUE) 37 | untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')], exdir = 'tmp') 38 | file.remove('tmp.tar.gz') 39 | 40 | # Read in the data: 41 | cnv_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')])) 42 | 43 | # Remove the created directory: 44 | unlink('tmp', recursive = TRUE) 45 | 46 | cnv_data[, 47 | c('Gene Symbol', 'Locus ID', 'Cytoband') := .( 48 | do.call( 49 | function(gl) { 50 | gl[grep('C[0-9]+ORF[0-9]+', gl)] <- gsub('ORF', 'orf', gl[grep('C[0-9]+ORF[0-9]+', gl)]) 51 | gl_mapped <- alias2SymbolTable(gl) 52 | gl_log <- gl %in% hgnc_complete_set$symbol 53 | gl_mapped_log <- gl_mapped %in% hgnc_complete_set$symbol 54 | out <- gl 55 | out[!gl_log & gl_mapped_log] <- gl_mapped[!gl_log & gl_mapped_log] 56 | out[!gl_log & !gl_mapped_log] <- sapply( 57 | gl_mapped[!gl_log & !gl_mapped_log], 58 | function(g) hgnc_complete_set[ 59 | grepl(paste0('^', g, '\\||\\|', g, '\\||\\|', g, '$'), alias), 60 | switch((.N == 1) + 1, NA, symbol) 61 | ] 62 | ) 63 | return(out) 64 | }, 65 | args = list(gl = cnv_data$`Gene Symbol`) 66 | ), 67 | NULL, 68 | NULL 69 | ) 70 | ] 71 | 72 | cnv_data <- cnv_data[!is.na(`Gene Symbol`)] 73 | cnv_data <- cnv_data[, set_rownames(as.matrix(.SD), `Gene Symbol`), .SDcols = -'Gene Symbol'] 74 | colnames(cnv_data) <- gsub('-', '\\.', colnames(cnv_data)) 75 | 76 | fwrite(as.data.frame(cnv_data), paste0('~/TCGA_data/', ct, '/CNV_GISTIC_data.csv'), row.names = TRUE) 77 | 78 | } 79 | 80 | 81 | 82 | 83 | 84 | # Mutations annotations files were downloaded from GDC Data Portal, filtering by MAF, open access, Masked Somatic Mutation, MuTect2. 85 | 86 | # These files have full sample IDs, but since they don't match the sample IDs in the expression data, I'll extract just the patient IDs. 87 | 88 | setkey(hgnc_complete_set, entrez_id) 89 | 90 | for(ct in cancer_types) { 91 | cat(ct, '\n') 92 | cont <- dir(paste0('~/TCGA_data/mut_data/', ct)) 93 | mut_data <- fread(paste0('~/TCGA_data/mut_data/', ct, '/', cont[grep(ct, cont)]))[, 94 | c('Entrez_Gene_Id', 'patient_id', 'sample_type') := .( 95 | as.character(Entrez_Gene_Id), 96 | apply(str_split_fixed(Tumor_Sample_Barcode, '-', 4)[, 1:3], 1, paste, collapse = '.'), 97 | mapvalues( 98 | gsub('[A-Z]', '', str_split_fixed(Tumor_Sample_Barcode, '-', 5)[, 4]), 99 | c('01', '02', '03', '05', '06', '07', '11'), 100 | c('primary', 'recurrent', 'primary', 'primary_additional', 'metastatic', 'metastatic_additional', 'normal'), 101 | warn_missing = FALSE 102 | ) 103 | ) 104 | ][, 105 | gene := hgnc_complete_set[Entrez_Gene_Id, symbol] 106 | ][ 107 | !is.na(gene), 108 | .(gene, patient_id, sample_type, Chromosome, Start_Position, End_Position, Strand, Variant_Classification, Variant_Type, 109 | Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2, Mutation_Status) 110 | ] 111 | fwrite(mut_data, paste0('~/TCGA_data/', ct, '/Mutation_data.csv')) 112 | } 113 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/study_tcga_map.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | 4 | study_tcga_map <- data.table()[, 5 | c('study', 'cancer_type') := rbindlist(.( 6 | .('ALL_Ebinger_2016', NA), 7 | .('AML_Galen_2019', 'LAML'), 8 | .('AML_Wu_2020', 'LAML'), 9 | .('AUTONOMIC_GANGLIA_Kinker_2020', NA), 10 | .('BILIARY_TRACT_Kinker_2020', 'LIHC'), 11 | .('BONE_Kinker_2020', NA), 12 | .('Breast_Chung_2017', 'BRCA'), 13 | .('Breast_Gao_2021', 'BRCA'), 14 | .('Breast_Jordan_2016', 'BRCA'), 15 | .('Breast_Karaayvas_2018', 'BRCA'), 16 | .('BREAST_Kinker_2020', 'BRCA'), 17 | .('Breast_Pal_2021', 'BRCA'), 18 | .('Breast_Qian_2020', 'BRCA'), 19 | .('CENTRAL_NERVOUS_SYSTEM_Kinker_2020', NA), 20 | .('CML_Giustacchini_2017', NA), 21 | .('Colon_Li_2017', 'COAD'), 22 | .('Colon_Li_2017', 'READ'), 23 | .('CRC_Lee_2020', 'COAD'), 24 | .('CRC_Lee_2020', 'READ'), 25 | .('ENDOMETRIUM_Kinker_2020', 'UCEC'), 26 | .('Ependymoma_Gojo_2020', NA), 27 | .('ESCC_Yao_2020', 'ESCA_ESCC'), 28 | .('Ewing_CellLines_Aynaud_2020', NA), 29 | .('Ewing_PDX_Aynaud_2020', NA), 30 | .('GBM_cell_lines', 'GBM_IDH-WT'), 31 | .('GBM_mouse_Unpublished', 'GBM_IDH-WT'), 32 | .('Glioblastoma_Couturier_2020', 'GBM_IDH-WT'), 33 | .('Glioblastoma_Darmanis_2017', 'GBM_IDH-WT'), 34 | .('Glioblastoma_Wang_2019', 'GBM_IDH-WT'), 35 | .('Glioma_Filbin_2018', NA), # H3-K27M mutant - a kind of HGG that's not GBM 36 | .('Glioma_Neftel_2019', 'GBM_IDH-WT'), 37 | .('Glioma_Tirosh_2016', 'LGG_oligo'), 38 | .('Glioma_Venteicher_2017', 'LGG_astro'), 39 | .('Glioma_Yuan_2018', 'GBM_IDH-WT'), # 7 GBMs and 1 grade III astrocytoma (TCGA LGG does include plenty of Grade 3 LGGs) 40 | .('Glioma_Yuan_2018', 'LGG_astro'), 41 | .('HCC_Sun_2021', 'LIHC'), 42 | .('HGSOC_Izar_2020', 'OV'), 43 | .('HNSCC_Puram_2017', 'HNSC'), 44 | .('IPMN_Franses_2017', NA), # CTCs 45 | .('KIDNEY_Kinker_2020', 'KICH'), 46 | .('KIDNEY_Kinker_2020', 'KIRC'), 47 | .('KIDNEY_Kinker_2020', 'KIRP'), 48 | .('LARGE_INTESTINE_Kinker_2020', 'COAD'), 49 | .('LARGE_INTESTINE_Kinker_2020', 'READ'), 50 | .('Leukemia_Caron_2020', NA), # ALL 51 | .('LIVER_Kinker_2020', 'LIHC'), 52 | .('Liver_Ma_2019', 'LIHC'), 53 | .('Liver_Ma_2019', 'CHOL'), 54 | .('LiverMets_Massalha_2020', NA), # these are either CRC metastases to the liver or intra-hepatic cholangiocarcinoma 55 | .('LSCC_Unpublished', 'HNSC'), 56 | .('Lung_Kim_2020', 'LUAD'), 57 | .('LUNG_Kinker_2020', 'LUAD'), 58 | .('LUNG_Kinker_2020', 'LUSC'), 59 | .('Lung_Laughney_2020', 'LUAD'), 60 | .('Lung_Qian_2020', 'LUAD'), 61 | .('Lung_Qian_2020', 'LUSC'), # This study contains about half LUAD and half LUSC 62 | .('Lung_Xing_2021', 'LUAD'), 63 | .('lungcancer_Antonella_2020', 'LUAD'), 64 | .('lungcancer_Maynard_2020', 'LUAD'), 65 | .('lungcancer_Maynard_2020', 'LUSC'), # 45 LUAD and 1 LUSC 66 | .('Medulloblastoma_Hovestadt_2019', NA), 67 | .('Melanoma_JerbyArnon_2018', 'SKCM_metastatic'), 68 | .('Melanoma_JerbyArnon_2018', 'SKCM_primary'), 69 | .('Melanoma_Tirosh_2016', 'SKCM_metastatic'), 70 | .('Melanoma_Tirosh_2016', 'SKCM_primary'), 71 | .('MM_Cohen_2021', NA), 72 | .('MM_Ledergor_2018', NA), 73 | .('MM_Liu_2021', NA), 74 | .('Myeloproliferative_Nam_2019', NA), 75 | .('NasopharyngealCarcinoma_Chen_2020', 'HNSC'), 76 | .('NET_Unpublished', NA), 77 | .('Neuroblastoma_CellLine_Jansky_2021', NA), 78 | .('Neuroblastoma_Dong_2020', NA), 79 | .('Neuroblastoma_Jansky_2021', NA), 80 | .('Neuroblastoma_Kildisiute_2021', NA), 81 | .('Neuroblastoma_Mercatelli_2021', NA), 82 | .('OESOPHAGUS_Kinker_2020', 'ESCA_AC'), 83 | .('OESOPHAGUS_Kinker_2020', 'ESCA_ESCC'), 84 | .('OPSCC_Unpublished', 'HNSC'), 85 | .('Osteosarcoma_Zhou_2020', NA), 86 | .('Ovarian_Nath_2021', 'OV'), 87 | .('Ovarian_Olalekan_2021', 'OV'), 88 | .('Ovarian_Qian_2020', 'OV'), 89 | .('OVARY_Kinker_2020', 'OV'), 90 | .('PANCREAS_Kinker_2020', 'PAAD'), 91 | .('PDAC_Franses_2020', NA), # CTCs 92 | .('PDAC_Ligorio_2019', 'PAAD'), 93 | .('PDAC_Lin_2020', 'PAAD'), 94 | .('PDAC_Moncada_2020', 'PAAD'), 95 | .('PDAC_Peng_2019', 'PAAD'), 96 | .('PDAC_Steele_2020', 'PAAD'), 97 | .('PDAC_Yu_2012', 'PAAD'), 98 | .('PDAC_Yue_2020', 'PAAD'), 99 | .('Pituitary_Cui_2021', NA), 100 | .('PLEURA_Kinker_2020', NA), 101 | .('Prostate_Chen_2021', 'PRAD'), 102 | .('Prostate_Dong_2020', 'PRAD'), 103 | .('PROSTATE_Kinker_2020', 'PRAD'), 104 | .('Prostate_Miyamoto_2015', 'PRAD'), 105 | .('RCC_Bi_2021', 'KIRC'), # 7 KIRC and 1 KIRP 106 | .('RCC_Bi_2021', 'KIRP'), 107 | .('RCC_Obradovic_2021', 'KIRC'), 108 | .('RCC_Zhang_2021', 'KICH'), # 7 KIRC and 1 KICH 109 | .('RCC_Zhang_2021', 'KIRC'), 110 | .('SCHW_Unpublished', NA), 111 | .('SCLC_HumanMet_Ireland_2020', NA), 112 | .('SCLC_Mouse_Ireland_2020', NA), 113 | .('SKIN_Kinker_2020', 'SKCM_metastatic'), 114 | .('SKIN_Kinker_2020', 'SKCM_primary'), 115 | .('SkinSCC_Ji_2020', 'SKCM_primary'), 116 | .('SOFT_TISSUE_Kinker_2020', 'SARC'), 117 | .('STOMACH_Kinker_2020', 'STAD'), 118 | .('SyS_JerbyArnon_2021', 'SARC'), 119 | .('Thyroid_Gao_2021', 'THCA'), 120 | .('THYROID_Kinker_2020', 'THCA'), 121 | .('UPPER_AERODIGESTIVE_TRACT_Kinker_2020', 'HNSC'), 122 | .('URINARY_TRACT_Kinker_2020', 'BLCA'), 123 | .('WilmsTumor_Young_2018', NA) 124 | )) 125 | ] 126 | 127 | fwrite(study_tcga_map, '../data/study_tcga_map.csv') 128 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/tcga_clinical.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(ggplot2) 4 | library(stringdist) 5 | library(stringr) 6 | library(survival) 7 | library(readxl) 8 | library(plyr) 9 | library(matkot) 10 | 11 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP', 12 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary', 13 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM') 14 | 15 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS') 16 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib)) 17 | names(study_contrib)[41] <- 'MP41 Unassigned' 18 | 19 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'cancer_type') 20 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type] 21 | 22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052): 23 | liu <- as.data.table( 24 | read_xlsx( 25 | '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx', 26 | sheet = 1, 27 | na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A') 28 | ) 29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI, 30 | stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))] 31 | setkey(liu, patient_id) 32 | 33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id, 34 | cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success), 35 | ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)] 36 | 37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE) 38 | 39 | meta_all <- lapply( 40 | cancer_types, 41 | function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)] 42 | ) %>% rbindlist %>% unique 43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]] 44 | setkey(meta_all, patient_id) 45 | clin <- clin[patient_id %in% meta_all$patient_id] 46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC' 47 | 48 | clin_test <- lapply(cancer_types, function(ct) { 49 | 50 | cat(ct, '\n') 51 | 52 | if('Scores.csv' %in% dir(paste0('~/TCGA_data/', ct))) { 53 | scores <- fread(paste0('~/TCGA_data/', ct, '/Scores.csv'), key = 'sample_id') 54 | } else return(NULL) 55 | 56 | # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types: 57 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type', na.strings = '')[ 58 | sample_type != 'normal', 59 | .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1], 60 | keyby = patient_id 61 | ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)] 62 | setkey(meta, sample_id) 63 | 64 | # Subset scores table with the above IDs: 65 | scores <- scores[sample_id %in% meta$sample_id] 66 | scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))] 67 | 68 | # Subset clinical data table with the above IDs: 69 | clin_ct <- clin[patient_id %in% scores$patient_id] 70 | 71 | # Define variable categories: 72 | 73 | # M stage: 74 | clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))] 75 | 76 | # Therapy resistance: 77 | for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))] 78 | clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA}, 79 | by = patient_id] 80 | clin_ct[, ther := 'a'] 81 | clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id] 82 | clin_ct[, c('ther1', 'ther2', 'ther3') := NULL] 83 | 84 | # Lymphatic spread: 85 | clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')] 86 | clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))] 87 | clin_ct[, ln := 'a'] 88 | clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id] 89 | clin_ct[, c('ln_n', 'n_stage') := NULL] 90 | 91 | # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10: 92 | 93 | ints <- list() 94 | 95 | # Grade: 96 | clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'), 97 | warn_missing = FALSE)] 98 | ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade] 99 | 100 | # Stage: 101 | clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'), 102 | c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)] 103 | ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage] 104 | 105 | # T stage: 106 | clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'), 107 | warn_missing = FALSE)] 108 | ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage] 109 | 110 | for(v in names(ints)) { 111 | if(length(ints[[v]]) > 0) { 112 | sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)] 113 | if(!any(sums[names(sums) != 'High'] >= 10)) { 114 | clin_ct[, (v) := NA] 115 | } else { 116 | clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low'] 117 | clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High'] 118 | } 119 | } 120 | } 121 | 122 | scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])] 123 | scores[, cancer_type := ct] 124 | setcolorder(scores, c('patient_id', 'cancer_type')) 125 | 126 | surv <- lapply(unique(scores$meta_program), function(mp) { 127 | lapply(c('OS', 'PFI'), function(v) { 128 | sdata <- scores[meta_program == mp & !is.na(get(v))] 129 | if(nrow(sdata) < 20) return(NULL) 130 | coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata) 131 | data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients), 132 | pval = summary(coxmod)$logtest['pvalue']) 133 | }) %>% rbindlist 134 | }) %>% rbindlist 135 | 136 | categ <- lapply(unique(scores$meta_program), function(mp) { 137 | lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) { 138 | sdata <- scores[meta_program == mp & !is.na(get(v))] 139 | if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL) 140 | data.table(cancer_type = ct, var_name = v, meta_program = mp, 141 | eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]], 142 | pval = t.test(score ~ get(v), sdata)$p.value) 143 | }) %>% rbindlist 144 | }) %>% rbindlist 145 | 146 | return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct)) 147 | 148 | }) 149 | 150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res')) 151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin')) 152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores')) 153 | 154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')] 155 | setkey(clin_test_res, cancer_type, meta_program) 156 | 157 | fwrite(clin_test_res, '../data/clin_test_res.csv') 158 | fwrite(clin_test_data, '../data/clin_test_data.csv') 159 | fwrite(clin_test_scores, '../data/clin_test_scores.csv') 160 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/tcga_clinical_deconv.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(ggplot2) 4 | library(stringdist) 5 | library(stringr) 6 | library(survival) 7 | library(readxl) 8 | library(plyr) 9 | library(matkot) 10 | 11 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD', 12 | 'THCA', 'UCEC') 13 | 14 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS') 15 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib)) 16 | names(study_contrib)[41] <- 'MP41 Unassigned' 17 | 18 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '') 19 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type] 20 | setkey(study_tcga_map, cancer_type) 21 | 22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052): 23 | liu <- as.data.table( 24 | read_xlsx( 25 | '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx', 26 | sheet = 1, 27 | na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A') 28 | ) 29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI, 30 | stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))] 31 | setkey(liu, patient_id) 32 | 33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id, 34 | cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success), 35 | ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)] 36 | 37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE) 38 | 39 | meta_all <- lapply( 40 | cancer_types, 41 | function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)] 42 | ) %>% rbindlist %>% unique 43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]] 44 | setkey(meta_all, patient_id) 45 | clin <- clin[patient_id %in% meta_all$patient_id] 46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC' 47 | 48 | clin_test <- lapply(cancer_types, function(ct) { 49 | 50 | cat(ct, '\n') 51 | 52 | if('Scores_deconv.csv' %in% dir(paste0('~/TCGA_data/', ct))) { 53 | scores <- fread(paste0('~/TCGA_data/', ct, '/Scores_deconv.csv'), key = 'sample_id') 54 | } else return(NULL) 55 | 56 | # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types: 57 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type')[ 58 | sample_type != 'normal', 59 | .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1], 60 | keyby = patient_id 61 | ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)] 62 | setkey(meta, sample_id) 63 | 64 | # Subset scores table with the above IDs: 65 | scores <- scores[sample_id %in% meta$sample_id] 66 | scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))] 67 | 68 | # Subset clinical data table with the above IDs: 69 | clin_ct <- clin[patient_id %in% scores$patient_id] 70 | 71 | # Define variable categories: 72 | 73 | # M stage: 74 | clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))] 75 | 76 | # Therapy resistance: 77 | for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))] 78 | clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA}, 79 | by = patient_id] 80 | clin_ct[, ther := 'a'] 81 | clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id] 82 | clin_ct[, c('ther1', 'ther2', 'ther3') := NULL] 83 | 84 | # Lymphatic spread: 85 | clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')] 86 | clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))] 87 | clin_ct[, ln := 'a'] 88 | clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id] 89 | clin_ct[, c('ln_n', 'n_stage') := NULL] 90 | 91 | # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10: 92 | 93 | ints <- list() 94 | 95 | # Grade: 96 | clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'), 97 | warn_missing = FALSE)] 98 | ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade] 99 | 100 | # Stage: 101 | clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'), 102 | c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)] 103 | ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage] 104 | 105 | # T stage: 106 | clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'), 107 | warn_missing = FALSE)] 108 | ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage] 109 | 110 | for(v in names(ints)) { 111 | if(length(ints[[v]]) > 0) { 112 | sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)] 113 | if(!any(sums[names(sums) != 'High'] >= 10)) { 114 | clin_ct[, (v) := NA] 115 | } else { 116 | clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low'] 117 | clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High'] 118 | } 119 | } 120 | } 121 | 122 | scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])] 123 | scores[, cancer_type := ct] 124 | setcolorder(scores, c('patient_id', 'cancer_type')) 125 | 126 | surv <- lapply(unique(scores$meta_program), function(mp) { 127 | lapply(c('OS', 'PFI'), function(v) { 128 | sdata <- scores[meta_program == mp & !is.na(get(v))] 129 | if(nrow(sdata) < 20) return(NULL) 130 | coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata) 131 | data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients), 132 | pval = summary(coxmod)$logtest['pvalue']) 133 | }) %>% rbindlist 134 | }) %>% rbindlist 135 | 136 | categ <- lapply(unique(scores$meta_program), function(mp) { 137 | lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) { 138 | sdata <- scores[meta_program == mp & !is.na(get(v))] 139 | if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL) 140 | data.table(cancer_type = ct, var_name = v, meta_program = mp, 141 | eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]], 142 | pval = t.test(score ~ get(v), sdata)$p.value) 143 | }) %>% rbindlist 144 | }) %>% rbindlist 145 | 146 | return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct)) 147 | 148 | }) 149 | 150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res')) 151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin')) 152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores')) 153 | 154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')] 155 | setkey(clin_test_res, cancer_type, meta_program) 156 | 157 | fwrite(clin_test_res, '../data/clin_test_res_deconv.csv') 158 | fwrite(clin_test_data, '../data/clin_test_data_deconv.csv') 159 | fwrite(clin_test_scores, '../data/clin_test_scores_deconv.csv') 160 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/tcga_clinical_meta.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(ggplot2) 4 | library(stringr) 5 | library(randomcoloR) 6 | library(plyr) 7 | library(scales) 8 | library(latex2exp) 9 | library(RColorBrewer) 10 | library(cowplot) 11 | library(ggrepel) 12 | library(matkot) 13 | 14 | 15 | 16 | 17 | 18 | # Combine results with and without deconvolution into the same heatmaps: 19 | 20 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')]) 21 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')] 22 | 23 | setkey(clin_test_res, cancer_type, meta_program) 24 | cancer_types <- sort(unique(clin_test_res$cancer_type)) 25 | mps <- unique(clin_test_res$meta_program) 26 | 27 | pdata <- slapply( 28 | c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), 29 | function(v) { 30 | htmp_data <- clin_test_res[var_name == v][CJ(cancer_types, mps), -c('var_name', 'pval')] 31 | htmp_data[is.na(eff), eff := 0] 32 | htmp_data[is.na(pval_adj), pval_adj := 1] 33 | clust_data <- dcast(htmp_data[, -'pval_adj'], meta_program ~ cancer_type)[, 34 | set_rownames(as.matrix(.SD), meta_program), 35 | .SDcols = -'meta_program' 36 | ] 37 | clust_data <- clust_data[apply(clust_data, 1, function(x) !all(x == 0)), apply(clust_data, 2, function(x) !all(x == 0))] 38 | clust_x <- colSums(abs(clust_data)) 39 | clust_x <- data.table(ctd = names(clust_x), ct = gsub(' \\(d\\)', '', names(clust_x)), vald = clust_x)[, val := mean(vald), by = ct] 40 | clust_x_temp <- clust_x[, unique(.SD)[, .(cancer_type = ct, r = order(order(-val))/.N)], .SDcols = c('ct', 'val')] 41 | setkey(clust_x_temp, cancer_type) 42 | clust_x[, r := clust_x_temp[ct, r]] 43 | clust_y <- list(labels = rownames(clust_data), order = order(-rowSums(abs(clust_data)))) 44 | htmp_data <- htmp_data[cancer_type %in% colnames(clust_data) & meta_program %in% rownames(clust_data)] 45 | return(list(data = htmp_data, clust_x = clust_x[, .(ctd, ct, r)], clust_y = clust_y)) 46 | } 47 | ) 48 | 49 | mps_data <- rbindlist(lapply(names(pdata), function(v) with(pdata[[v]]$clust_y, data.table(v = v, mp = labels, r = order(order)/length(order))))) 50 | setkey(mps_data, v, mp) 51 | mps_data <- mps_data[CJ(c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), mps)] 52 | mps_data[, rmean := mean(r[!is.na(r)]), by = mp] 53 | mps_ord <- unique(mps_data[, .(mp, rmean)])[is.finite(rmean), mp[order(rmean)]] 54 | 55 | cts_data <- rbindlist(lapply(pdata, `[[`, 'clust_x')) 56 | cts_data_temp <- cts_data[, .(r = mean(r)), keyby = ct] 57 | cts_ord <- unique(cts_data[, .(ctd = ctd, ct = ct, r = do.call(`[`, list(cts_data_temp, ct))$r)])[order(ctd), ctd[order(r)]] 58 | 59 | # Overall survival: 60 | os_data <- copy(pdata$OS)$data 61 | os_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))] 62 | os_data[, signif := ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), '')] 63 | htmp_os <- ggplot(os_data, aes(x = cancer_type, y = meta_program, fill = exp(eff))) + 64 | geom_tile(colour = 'grey90') + 65 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) + 66 | scale_x_discrete(expand = c(0, 0)) + 67 | scale_y_discrete(expand = c(0, 0)) + 68 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(0.5, 2.5), 69 | values = rescale(c(0, 1, 2, 3, 4, 7, 10, 13, 16), to = c(0, 1)), oob = squish, na.value = '#F7F7F7', name = 'Hazard\nratio', 70 | guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) + 71 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1), 72 | legend.title = element_text(size = 13)) + 73 | labs(title = 'Overall survival', x = 'Cancer type', y = NULL) 74 | 75 | # LN mets and therapy resistance: 76 | ln_data <- copy(pdata$ln)$data 77 | setkey(ln_data, cancer_type, meta_program) 78 | ther_data <- copy(pdata$ther)$data 79 | setkey(ther_data, cancer_type, meta_program) 80 | mps_ln_ther <- union(ln_data$meta_program, ther_data$meta_program) 81 | ln_data <- ln_data[CJ(unique(cancer_type), mps_ln_ther)] 82 | ln_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))] 83 | ln_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))] 84 | ther_data <- ther_data[CJ(unique(cancer_type), mps_ln_ther)] 85 | ther_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))] 86 | ther_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))] 87 | htmp_ln <- ggplot(ln_data, aes(x = cancer_type, y = meta_program, fill = eff)) + 88 | geom_tile(colour = 'grey90') + 89 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) + 90 | scale_x_discrete(expand = c(0, 0)) + 91 | scale_y_discrete(expand = c(0, 0)) + 92 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish, 93 | na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) + 94 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1), 95 | legend.title = element_text(size = 13)) + 96 | labs(title = 'Lymph node metastasis', x = 'Cancer type', y = NULL) 97 | htmp_ther <- ggplot(ther_data, aes(x = cancer_type, y = meta_program, fill = eff)) + 98 | geom_tile(colour = 'grey90') + 99 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) + 100 | scale_x_discrete(expand = c(0, 0)) + 101 | scale_y_discrete(expand = c(0, 0)) + 102 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish, 103 | na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) + 104 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1), 105 | legend.title = element_text(size = 13)) + 106 | labs(title = 'Therapy resistance', x = 'Cancer type', y = NULL) 107 | 108 | # Make grobs and fix widths and heights: 109 | htmp_os_grob <- ggplotGrob(htmp_os) 110 | htmp_os_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*os_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm') 111 | htmp_os_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*os_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm') 112 | htmp_ln_grob <- ggplotGrob(htmp_ln + theme(legend.position = 'none')) 113 | htmp_ln_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*ln_data[, length(unique(cancer_type))], 0, 0, 0, 0.3), 'cm') 114 | htmp_ln_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ln_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm') 115 | htmp_ther_grob <- ggplotGrob(htmp_ther + theme(axis.text.y = element_blank())) 116 | htmp_ther_grob$widths <- unit(c(0.3, 0, 0, 0, 0.5*ther_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm') 117 | htmp_ther_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ther_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm') 118 | 119 | pdf('../data/clin_htmps_sub.pdf', width = sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths))/2.54, 120 | height = sum(c(htmp_os_grob$heights, htmp_ln_grob$heights))/2.54) 121 | plot_grid( 122 | plot_grid( 123 | htmp_os_grob, 124 | ggplot() + theme_void(), 125 | nrow = 1, 126 | ncol = 2, 127 | rel_widths = c( 128 | as.numeric(sum(htmp_os_grob$widths)), 129 | sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths)) - as.numeric(sum(htmp_os_grob$widths)) 130 | ) 131 | ), 132 | plot_grid( 133 | htmp_ln_grob, 134 | htmp_ther_grob, 135 | nrow = 1, 136 | ncol = 2, 137 | rel_widths = c(sum(htmp_ln_grob$widths), sum(htmp_ther_grob$widths)) 138 | ), 139 | nrow = 2, 140 | ncol = 1, 141 | rel_heights = c(sum(htmp_os_grob$heights), sum(htmp_ln_grob$heights)) 142 | ) 143 | dev.off() 144 | 145 | 146 | 147 | 148 | 149 | # Summary showing pan-cancer consistency of most common MPs: 150 | 151 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')]) 152 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')] 153 | 154 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS') 155 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib)) 156 | names(study_contrib)[41] <- 'MP41 Unassigned' 157 | 158 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'study') 159 | 160 | mps_tab <- lapply( 161 | names(study_contrib), 162 | function(mp_name) study_tcga_map[study_contrib[[mp_name]], .(meta_program = mp_name, study = study, cancer_type = cancer_type)] 163 | ) %>% rbindlist 164 | 165 | common_mps <- mps_tab[, .(n_study = .N, n_ct = length(unique(cancer_type[!is.na(cancer_type)]))), by = meta_program][n_ct >= 5 & n_study >= 10] 166 | common_mps <- common_mps$meta_program 167 | 168 | pdata_scatter <- clin_test_res[meta_program %in% common_mps, .(s = sum(sign(eff)), s_sig = sum(sign(eff[pval_adj < 0.05]))), by = .(meta_program)] 169 | 170 | set.seed(8331) 171 | pdf('../data/clin_summary_scatter.pdf', width = 14/2.54, height = 11.5/2.54) 172 | ggplot(pdata_scatter, aes(x = s_sig, y = s)) + 173 | theme_test() + 174 | geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') + 175 | geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') + 176 | geom_point(shape = 21, fill = 'tomato', stroke = 0.3, size = 3) + 177 | geom_text_repel(data = pdata_scatter[grep('Cell Cycle|Stress \\(in vitro\\)|Proteasomal|Hypoxia', meta_program)], aes(label = meta_program), 178 | colour = 'steelblue') + 179 | geom_text_repel(data = pdata_scatter[grep('Stress$|PDAC', meta_program)], aes(label = meta_program), 180 | colour = 'steelblue', nudge_x = 14, nudge_y = 8) + 181 | geom_text_repel(data = pdata_scatter[grep('MHC-II \\(I\\)', meta_program)], aes(label = meta_program), 182 | colour = 'steelblue', nudge_x = 1, nudge_y = -1) + 183 | labs(x = expression(sum()*italic(sign)*'(significant effects)'), y = expression(sum()*italic(sign)*'(all effects)')) 184 | dev.off() 185 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/tcga_mp_scores.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(stringr) 4 | library(readxl) 5 | library(matkot) 6 | 7 | source('functions.R') 8 | 9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP', 10 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary', 11 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM') 12 | 13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[ 14 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) 15 | ] 16 | alias_table <- make_alias_table(hgnc_complete_set) 17 | 18 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS') 19 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number 20 | mps <- slapply(mps, update_symbols_fast, alias_table) 21 | 22 | # Make sure MP names are consistent: 23 | names(mps) <- gsub(' ', ' ', names(mps)) 24 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib)) 25 | names(study_contrib)[41] <- 'MP41 Unassigned' 26 | 27 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '') 28 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type] 29 | setkey(study_tcga_map, cancer_type) 30 | 31 | for(ct in cancer_types) { 32 | 33 | cat(ct, '\n') 34 | 35 | mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))] 36 | 37 | expmat <- fread(paste0('~/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1'] 38 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal'] 39 | expmat <- expmat[, meta$sample_id] 40 | 41 | mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)]) 42 | 43 | set.seed(140) 44 | scores <- lapply( 45 | names(mps_ct), 46 | function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = sig_score(expmat, mps_ct[[mp]], nbin = 50, n = 50))] 47 | ) %>% rbindlist 48 | 49 | fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores.csv')) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /ITH_hallmarks/TCGA_analysis/tcga_mp_scores_deconv.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(stringr) 4 | library(readxl) 5 | library(matkot) 6 | 7 | source('functions.R') 8 | 9 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD', 10 | 'THCA', 'UCEC') 11 | 12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[ 13 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) 14 | ] 15 | alias_table <- make_alias_table(hgnc_complete_set) 16 | 17 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS') 18 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number 19 | mps <- slapply(mps, update_symbols_fast, alias_table) 20 | 21 | # Make sure MP names are consistent: 22 | names(mps) <- gsub(' ', ' ', names(mps)) 23 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib)) 24 | names(study_contrib)[41] <- 'MP41 Unassigned' 25 | 26 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '') 27 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type] 28 | setkey(study_tcga_map, cancer_type) 29 | 30 | for(ct in cancer_types) { 31 | 32 | cat(ct, '\n') 33 | 34 | mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))] 35 | 36 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal'] 37 | 38 | if(startsWith(ct, 'ESCA')) { 39 | expmat <- fread('~/pan_cancer/data/TCGA_deconvolved_scaled/ESCA_Epithelial.cells.txt') 40 | } else expmat <- fread(paste0('~/pan_cancer/data/TCGA_deconvolved_scaled/', ct, '_Epithelial.cells.txt')) 41 | expmat <- expmat[, set_rownames(as.matrix(.SD), Gene), .SDcols = -'Gene'] 42 | meta <- meta[sample_id %in% colnames(expmat)] 43 | expmat <- expmat[, meta$sample_id] 44 | 45 | mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)]) 46 | 47 | set.seed(140) 48 | scores <- lapply( 49 | names(mps_ct), 50 | function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = colMeans(expmat[mps_ct[[mp]], ]))] 51 | ) %>% rbindlist 52 | 53 | fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores_deconv.csv')) 54 | 55 | } 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Tirosh Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 3ca 2 | This repository contains code relating to the Curated Cancer Cell Atlas (3CA), including code for reproducing analyses and figures on the [3CA website](https://www.weizmann.ac.il/sites/3CA), and code for reproducing the analysis in Gavish et al. "The hallmarks of transcriptional intra-tumor heterogeneity". 3 | 4 | ### Source code for Gavish et al. 5 | Code for reproducing the analyses in the Gavish et al. study is contained in the ITH_hallmarks directory. 6 | 7 | ### 3CA version 2 8 | The 3CA_v2 directory code for new analyses conducted on the expanded version of 3CA. 9 | -------------------------------------------------------------------------------- /aliases_pubmed.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(stringr) 3 | library(easyPubMed) 4 | 5 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[ 6 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) & locus_group == 'protein-coding gene' 7 | ] 8 | 9 | alias_table <- hgnc_complete_set[, .(symbol_alt = unique(c(str_split(alias_symbol, '\\|')[[1]], str_split(prev_symbol, '\\|')[[1]]))), by = symbol] 10 | alias_table <- alias_table[symbol_alt != ''] 11 | 12 | # Remove elements of that also occur in : 13 | alias_table <- alias_table[!(symbol_alt %in% symbol)] 14 | 15 | # Get number of PubMed search results for each alternative symbol: 16 | alias_table[, pubmed_freq := as.numeric(get_pubmed_ids(paste0(symbol_alt, '[Text Word]'))$Count), by = symbol_alt] 17 | 18 | # Adding combined searches, i.e. check for cases where the alias and the up-to-date symbol occur together: 19 | alias_table[, 20 | pubmed_comb_freq := as.numeric(get_pubmed_ids(paste0('(', symbol_alt, '[Text Word]) AND (', symbol, '[Text Word])'))$Count), 21 | by = .(symbol, symbol_alt) 22 | ] 23 | 24 | fwrite(alias_table, '../data/aliases_pubmed.csv') 25 | -------------------------------------------------------------------------------- /aliases_table.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(matkot) 4 | 5 | aliases_table <- fread('../data/aliases_pubmed.csv') 6 | aliases_table <- aliases_table[ 7 | pubmed_freq > 1000 & 8 | pubmed_comb_freq > 100 & 9 | !(symbol_alt %in% symbol) & 10 | str_count(symbol_alt, '[a-z]') <= 1 & 11 | str_count(symbol_alt, '.') > 1 & # Could put 2 here as some 2-letter aliases map poorly, but some are important like RB -> RB1 12 | !grepl('\\.', symbol_alt) 13 | ] 14 | aliases_table <- aliases_table[!(symbol_alt %in% aliases_table[, .(l = length(unique(symbol))), by = symbol_alt][l > 1, symbol_alt])] 15 | # Try capitalising and removing dashes. If this makes any of symbol_alt the same as symbol, remove these entries. Otherwise, if it makes any 16 | # elements of symbol_alt the same, take the one with the highest pubmed_freq. 17 | aliases_table[, symbol_alt_x := gsub('-', '', toupper(symbol_alt))] 18 | aliases_table <- aliases_table[symbol_alt_x != symbol] 19 | aliases_table <- aliases_table[, 20 | .(symbol_alt = rbindlist(lapply( 21 | names(table(symbol_alt_x)), 22 | function(symb) if(table(symbol_alt_x)[symb] == 1) { 23 | return(.SD[symbol_alt_x == symb, .(symbol_alt, pubmed_freq)]) 24 | } else return(.SD[symbol_alt_x == symb, .(symbol_alt = symbol_alt[which.max(pubmed_freq)], pubmed_freq = max(pubmed_freq))]) 25 | ))[order(-pubmed_freq), symbol_alt]), 26 | by = symbol 27 | ] 28 | 29 | fwrite(aliases_table, '../data/alias_table.csv') 30 | -------------------------------------------------------------------------------- /bins.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(Matrix) 4 | library(stringr) 5 | library(plyr) 6 | library(matkot) 7 | 8 | source('functions.R') 9 | 10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8') 11 | 12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol') 13 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 14 | alias_table <- make_alias_table(hgnc_complete_set) 15 | 16 | set.seed(763) 17 | data_genes_all <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) { 18 | cat(r[1], '-', r[2], '\n') 19 | rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]) 20 | if('gene_ave.csv' %in% dir(rdir)) gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)] else return(NULL) 21 | if(nrow(gene_ave) == 0) return(NULL) 22 | gene_ave <- gene_ave[!is.na(ave), if(unique(.SD[, .(group, sample, n_cell)])[, sum(n_cell) >= 10]) .SD[, 23 | .(study = r[1], cancer_type = r[2], ave = sum(ave*n_cell)/sum(n_cell)), 24 | by = symbol 25 | ], by = .(group = cell_type)] 26 | setcolorder(gene_ave, c('study', 'cancer_type')) 27 | setkey(gene_ave, study, cancer_type, group, symbol) 28 | # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically: 29 | gene_ave[, ave_rank := .SD[sample(1:.N), order(order(ave))[order(symbol)]]/.N, .SDcols = c('symbol', 'ave'), by = group] 30 | }) %>% rbindlist 31 | 32 | # Bins for non-malignant cell types: 33 | 34 | # Get the non-malignant cell types occuring in sufficiently many datasets: 35 | cell_types_nm <- data_genes_all[ 36 | !(group %in% c('', 'Unassigned', 'Malignant')) & 37 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') & 38 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'), 39 | unique(.SD), 40 | .SDcols = c('study', 'cancer_type', 'group') 41 | ][, .N, by = group][N >= 3, group] 42 | 43 | genes_nm <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')$symbol %>% unique 44 | 45 | bins_nm <- data_genes_all[ 46 | symbol %in% genes_nm & group %in% cell_types_nm & 47 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') & 48 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'), 49 | .(ave = mean(ave[!is.na(ave)]), ave_rank = mean(ave_rank[!is.na(ave_rank)])), 50 | by = .(symbol, group) 51 | ][, cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)), by = group] 52 | 53 | fwrite(bins_nm, '../data/bins_nm.csv') 54 | -------------------------------------------------------------------------------- /canonical_markers.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | canonical_markers <- list( 4 | Adipocyte = c('ADIPOQ', 'FABP4', 'LEP'), 5 | Astrocyte = c('GFAP', 'GJA1'), 6 | B_cell = c('MS4A1', 'CD79A', 'CD79B'), 7 | Basophil = c('CCR3', 'IL3RA'), 8 | Dendritic = c('CCR7', 'CD86', 'CLEC10A'), 9 | Endothelial = c('VWF', 'CDH5', 'CLEC14A', 'CLDN5', 'ADGRL4'), 10 | Epithelial = c('EPCAM', 'KRT19', 'KRT7', 'MUC1'), 11 | Erythrocyte = c('HBA1', 'HBA2', 'HBB'), 12 | Fibroblast = c('COL1A1', 'COL1A2', 'COL3A1', 'LUM'), 13 | Immune = 'PTPRC', 14 | Keratinocyte = c('FLG', 'IVL', 'LORICRIN'), 15 | Lymphovascular = c('CCL21', 'TFF3'), 16 | Macrophage = c('AIF1', 'CD14', 'CD163'), 17 | Mast = c('CPA3', 'MS4A2', 'TPSB2'), 18 | Melanocyte = c('DCT', 'MLANA', 'PMEL', 'TYR', 'TYRP1'), 19 | Monocyte = c('CCR2', 'CSF1R'), 20 | Myocyte = c('ACTA1', 'DES', 'MYL1', 'TTN', 'NEB'), 21 | Neuron = c('ENO2', 'RBFOX3'), 22 | Neutrophil = c('AZU1', 'CTSG', 'ELANE', 'MPO'), 23 | NK_cell = c('NKG7', 'KLRD1', 'GZMB', 'KLRF1'), 24 | Oligodendrocyte = c('TF', 'PLP1', 'CLDN11', 'MAG', 'MBP', 'MOG'), 25 | OPC = c('OLIG1', 'OLIG2', 'NEU4'), 26 | Pericyte = c('RGS5', 'ESAM', 'MEF2C'), 27 | Plasma = c('JCHAIN', 'MZB1', 'TNFRSF17'), 28 | T_cell = c('CD2', 'CD3D', 'CD3E') 29 | ) 30 | 31 | canonical_markers <- rbindlist(lapply(names(canonical_markers), function(ct) data.table(cell_type = ct, gene = canonical_markers[[ct]]))) 32 | 33 | fwrite(canonical_markers, '../data/canonical_markers.csv') 34 | -------------------------------------------------------------------------------- /cc_sigs_consensus.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(ggplot2) 3 | library(magrittr) 4 | library(matkot) 5 | 6 | source('functions.R') 7 | 8 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 9 | 10 | sigs <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) { 11 | cat(r, '\n') 12 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 13 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS')) 14 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 15 | if(all(nullcond)) return(NULL) 16 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 17 | rout <- lapply(which(!nullcond), function(i) list(r = r, group = i, g1s = plot_data[[i]]$g1s, g2m = plot_data[[i]]$g2m)) 18 | }) %>% unlist(recursive = FALSE) 19 | 20 | g1s_tab <- table(unlist(lapply(sigs, `[[`, 'g1s'))) 21 | g2m_tab <- table(unlist(lapply(sigs, `[[`, 'g2m'))) 22 | 23 | sigs_cons <- list(g1s = names(g1s_tab)[order(-g1s_tab)][1:50], g2m = names(g2m_tab)[order(-g2m_tab)][1:50]) 24 | 25 | saveRDS(sigs_cons, '../data/cc_sigs_consensus.rds') 26 | -------------------------------------------------------------------------------- /cna_mat_prep.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(matkot) 11 | 12 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 13 | 14 | d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]) 15 | if('data_cna.rds' %in% dir(d)) { 16 | cna <- readRDS(paste0(d, '/data_cna.rds')) # Genes are already in order 17 | cna_cond <- sapply(cna, function(x) is.null(x) | is.null(x$cna_data)) 18 | if(!all(cna_cond)) { 19 | for(i in (1:length(cna))[!cna_cond]) { # Indices where data is not NULL 20 | out <- cna[[i]]$cna_data[, .(gene, cell_name, cna = round(cna, 4))] 21 | out <- dcast(out, cell_name ~ gene)[, c('cell_name', out[, unique(gene)]), with = FALSE] 22 | if(sum(!cna_cond) > 1) { 23 | if(!('CNA matrix' %in% dir(d))) dir.create(paste0(d, '/CNA matrix')) 24 | suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)] 25 | fwrite(out, paste0(d, '/CNA matrix/CNA_matrix_', suff, '.csv')) 26 | } else fwrite(out, paste0(d, '/CNA matrix.csv')) 27 | } 28 | } else cat('No CNA data\n') 29 | } else cat('No CNA data\n') 30 | -------------------------------------------------------------------------------- /gene_ave.R: -------------------------------------------------------------------------------- 1 | r = commandArgs(trailingOnly = TRUE) 2 | 3 | 4 | 5 | 6 | 7 | library(data.table) 8 | library(magrittr) 9 | library(Matrix) 10 | library(stringr) 11 | library(plyr) 12 | library(matkot) 13 | 14 | source('functions.R') 15 | 16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 17 | 18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol') 19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 20 | alias_table <- make_alias_table(hgnc_complete_set) 21 | 22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group) 23 | 24 | out <- rbindlist(lapply(paths$group, function(grp) { 25 | 26 | cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character'))) 27 | 28 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[grp, expmat], 'mtx')) { 29 | return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(), symbol = character(), 30 | ave = numeric(), prop_pos = numeric(), n_cell = integer())) 31 | } 32 | 33 | genes <- fread(paths[grp, genes], header = FALSE)$V1 34 | expmat <- readMM(paths[grp, expmat]) 35 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] 36 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 37 | genes <- update_symbols_fast(genes, alias_table) 38 | rownames(expmat) <- genes 39 | colnames(expmat) <- cells$cell_name 40 | 41 | # Remove low-complexity cells and normalise to log TPM/10: 42 | cells <- cells[col_nnz(expmat) >= 1000] 43 | if(nrow(cells) < 10) return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(), 44 | symbol = character(), ave = numeric(), prop_pos = numeric(), n_cell = integer())) 45 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) 46 | 47 | pout <- cells[ 48 | !is.na(cell_type) & !is.na(sample), 49 | .(symbol = genes, ave = rowMeans(expmat[, cell_name, drop = FALSE]), prop_pos = row_nnz(expmat[, cell_name, drop = FALSE])/.N, n_cell = .N), 50 | by = .(cell_type, sample) 51 | ] 52 | pout[, c('group', 'group_name') := .(grp, paths[grp, group_name])] 53 | setcolorder(pout, c('group', 'group_name')) 54 | 55 | return(pout) 56 | 57 | }), use.names = TRUE) 58 | 59 | if(nrow(out) > 0) { 60 | ct <- gsub('/', '-', r[2]) 61 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))} 62 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))} 63 | fwrite(out, paste0('../data/study_plots/', ct, '/', r[1], '/gene_ave.csv')) 64 | } 65 | -------------------------------------------------------------------------------- /gene_mp_cor.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(Matrix) 11 | library(stringr) 12 | library(matkot) 13 | 14 | source('functions.R') 15 | 16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 17 | 18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol') 19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 20 | alias_table <- make_alias_table(hgnc_complete_set) 21 | 22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group) 23 | 24 | rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]) 25 | 26 | if('mp_scores_all.csv' %in% dir(rdir)) { 27 | scores <- fread(paste0(rdir, '/mp_scores_all.csv'), colClasses = c(cell_name = 'character')) 28 | if(nrow(scores) == 0 | !('score' %in% names(scores))) scores <- NULL 29 | } else scores <- NULL 30 | 31 | if(!is.null(scores)) { 32 | 33 | out <- lapply(paths$group, function(grp) { 34 | 35 | cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character'))) 36 | 37 | if( 38 | scores[group == grp, .N == 0 | all(is.na(score))] | 39 | !all(c('cell_type', 'sample') %in% names(cells)) | 40 | !endsWith(paths[grp, expmat], 'mtx') 41 | ) { 42 | return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(), 43 | gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer())) 44 | } 45 | 46 | genes <- fread(paths[grp, genes], header = FALSE)$V1 47 | expmat <- readMM(paths[grp, expmat]) 48 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] 49 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 50 | genes <- update_symbols_fast(genes, alias_table) 51 | rownames(expmat) <- genes 52 | colnames(expmat) <- cells$cell_name 53 | 54 | # Remove low-complexity cells and normalise to log TPM/10: 55 | cells <- cells[col_nnz(expmat) >= 1000] 56 | if(nrow(cells) < 10) return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(), 57 | gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer())) 58 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) 59 | 60 | setkey(cells, cell_name) 61 | scores[group == grp, c('cell_type', 'sample') := do.call(`[`, list(cells, cell_name))[, .(cell_type, sample)]] 62 | setkey(scores, sample, cell_name) 63 | # In the following, I'm using a threshold of 10, in a few places, for minimum number of cells. 64 | grp_out <- scores[group == grp][, # Chaining so we can use the key even if the same cell/sample names are present in multiple groups 65 | if(length(unique(cell_name)) >= 10) { 66 | cat(unique(cell_type), '\n') 67 | ids <- unique(.SD[, .(sample, cell_name)]) 68 | bounds <- ids[, .N, by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))] 69 | ids <- ids[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .(cell_name) else .(cell_name = sample(cell_name, bounds[1]))}, keyby = sample] 70 | ids_n <- ids[, .N, keyby = sample] 71 | expmat_sub <- expmat[, ids$cell_name] 72 | rm_all <- rowMeans(expmat_sub) 73 | # The following binds together matrices across samples, which might lead to memory issues for large datasets. But we need it to 74 | # compute the correlation across samples. 75 | expmat_sub <- t(Reduce(cbind, lapply(ids_n$sample, function(smpl) { 76 | if(ids_n[smpl, N] < 10) { 77 | apply(expmat_sub[, ids[smpl, cell_name], drop = FALSE], 2, function(x) x - rm_all) 78 | } else { 79 | rm_smpl <- rowMeans(expmat_sub[, ids[smpl, cell_name]]) 80 | apply(expmat_sub[, ids[smpl, cell_name]], 2, function(x) x - rm_smpl) 81 | } 82 | }))) 83 | setkey(ids, NULL) 84 | out <- .SD[ids, .(gene = rownames(expmat), corr = cor(score, expmat_sub)[1, ]), by = meta_program] 85 | out[, c('n_cell', 'n_sample', 'n_sample_thresh') := ids_n[, .(nrow(ids), .N, sum(N >= 10))]] 86 | out 87 | }, 88 | by = cell_type 89 | ] 90 | 91 | grp_out[, c('group', 'group_name') := .(grp, paths[grp, group_name])] 92 | setcolorder(grp_out, c('group', 'group_name')) 93 | 94 | return(grp_out) 95 | 96 | }) %>% rbindlist(use.names = TRUE) 97 | 98 | if(nrow(out) > 0) fwrite(out, paste0(rdir, '/gene_mp_cor.csv')) 99 | 100 | } 101 | -------------------------------------------------------------------------------- /gene_mp_cor_all_web.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(stringr) 4 | library(Matrix) 5 | library(matkot) 6 | 7 | source('functions.R') 8 | 9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8') 10 | 11 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol') 12 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 13 | 14 | genes <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)] 15 | 16 | to_include <- unique(paths_table[ 17 | cancer_type != 'Other/Models' & 18 | !grepl('Unpublished', study) & 19 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') & 20 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'), 21 | .(study, cancer_type) 22 | ]) 23 | 24 | gene_mp_cor <- lapply(transpose(as.list(to_include)), function(r) { 25 | 26 | cat(r, '\n') 27 | if(!('gene_mp_cor.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 28 | 29 | rout <- fread(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_mp_cor.csv')) 30 | rout <- rout[gene %in% genes] 31 | 32 | rout[, c('study', 'cancer_type') := .(r[1], r[2])] 33 | setcolorder(rout, c('cancer_type', 'study')) 34 | 35 | }) %>% rbindlist 36 | 37 | dtkey <- unique(gene_mp_cor[, .(gene, cell_type, cancer_type, meta_program)])[order(gene, cell_type, cancer_type, meta_program)] 38 | gene_mp_cor_ave <- gene_mp_cor[ 39 | !is.na(corr), 40 | if(sum(n_cell) >= 50 & sum(n_sample) >= 3) .(mean_corr = weighted.mean(corr, n_sample_thresh)), 41 | keyby = .(gene, cell_type, cancer_type, meta_program) 42 | ] 43 | gene_mp_cor_ave <- gene_mp_cor_ave[dtkey] 44 | 45 | fwrite(gene_mp_cor_ave, '../data/gene_mp_cor_all_web.csv') 46 | 47 | # Save tables for individual genes, to speed up the writing of the Rmd files: 48 | setkey(gene_mp_cor_ave, gene) 49 | for(g in sort(genes[genes %in% gene_mp_cor_ave$gene])) { 50 | cat(g, '\n') 51 | fwrite(gene_mp_cor_ave[g, -'gene'], paste0('../data/gene_plots/gene_mp_cor/', g, '.csv')) 52 | } 53 | -------------------------------------------------------------------------------- /gene_plots_cell_types.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(ggplot2) 4 | library(stringr) 5 | library(Matrix) 6 | library(matkot) 7 | 8 | source('functions.R') 9 | 10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8') 11 | 12 | 13 | 14 | 15 | 16 | # For each dataset, find cell types that constitute at least 1% of the data: 17 | cell_types_thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) { 18 | 19 | cat(r, '\n') 20 | 21 | cts <- lapply(paths_table[as.list(r), cells], function(p) { 22 | cells <- suppressWarnings(fread(p, na.strings = '')) 23 | if(!('cell_type' %in% names(cells))) return(NULL) 24 | cells <- cells[!is.na(cell_type) & cell_type != 'Unassigned'] 25 | out <- cells[, .(N = .N/nrow(cells)), by = cell_type][N >= 0.01, cell_type] 26 | # Check if 'Epithelial' cells are definitely normal epithelial cells: 27 | if( 28 | 'Epithelial' %in% out && ( 29 | ('malignant' %in% names(cells) && cells[cell_type == 'Epithelial' & malignant == 'no', .N/nrow(cells)] < 0.01) | 30 | !('Malignant' %in% cells$cell_type) 31 | ) 32 | ) {out <- out[out != 'Epithelial']} 33 | if(length(out) > 0) return(out) 34 | }) 35 | 36 | cts <- cts[!sapply(cts, is.null)] 37 | 38 | if(length(cts) > 0) return(Reduce(intersect, cts)) 39 | 40 | }) 41 | 42 | # Take cell types that pass the 1% threshold in at least 5 datasets: 43 | cell_types <- table(unlist(cell_types_thresh[!sapply(cell_types_thresh, is.null)])) 44 | cell_types <- names(cell_types)[cell_types >= 5 & names(cell_types) != ''] 45 | 46 | saveRDS(cell_types, '../data/gene_plots_cell_types.rds') 47 | -------------------------------------------------------------------------------- /gene_plots_data_all_web.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(stringr) 4 | library(Matrix) 5 | library(matkot) 6 | 7 | source('functions.R') 8 | 9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8') 10 | 11 | cell_types <- readRDS('../data/gene_plots_cell_types.rds') 12 | 13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol') 14 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 15 | 16 | # The following averages over studies for a given cancer type: 17 | 18 | to_include <- unique(paths_table[ 19 | cancer_type != 'Other/Models' & 20 | !grepl('Unpublished', study) & 21 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') & 22 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'), 23 | .(study, cancer_type) 24 | ]) 25 | 26 | gene_ave <- lapply(transpose(as.list(to_include)), function(r) { 27 | 28 | cat(r, '\n') 29 | if(!('gene_ave.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 30 | 31 | rout <- fread( 32 | paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_ave.csv'), 33 | colClasses = c(cell_type = 'character', symbol = 'character'), 34 | key = c('cell_type', 'symbol') 35 | )[cell_type %in% cell_types] 36 | rout[, c('study', 'cancer_type') := as.list(r)] 37 | 38 | samples_path <- paste0('/home/labs/tirosh/shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv') 39 | samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '') 40 | samples <- samples[!is.na(sample) & !is.na(cancer_type) & !(cancer_type %in% c('Normal', 'Premalignant'))] 41 | if(nrow(samples) == 0) return(NULL) 42 | 43 | rout <- rout[sample %in% samples$sample] # This excludes the sample == 'all' category 44 | 45 | if(all(r == c('Jerby-Arnon et al. 2021', 'Sarcoma'))) { # This dataset is unusual because it has the same sample names in 10x and SS2 datasets 46 | rout[, tech := group_name] 47 | } else { 48 | setkey(samples, sample) 49 | rout[, tech := do.call(`[`, list(samples, sample))$technology] 50 | } 51 | 52 | setcolorder(rout, c('cancer_type', 'study', 'group', 'group_name', 'tech')) 53 | 54 | return(rout) 55 | 56 | }) %>% rbindlist 57 | 58 | gene_ave[ 59 | cell_type %in% c('Macrophage', 'Myeloid', 'Monocyte'), 60 | c('cell_type', 'n_cell', 'ave', 'prop_pos') := .('Macrophage', sum(n_cell), sum(ave*n_cell)/sum(n_cell), sum(prop_pos*n_cell)/sum(n_cell)), 61 | by = .(cancer_type, study, tech, sample, symbol) 62 | ] 63 | gene_ave <- unique(gene_ave) 64 | 65 | gene_ave <- gene_ave[!(cancer_type == 'Brain' & cell_type == 'Fibroblast')] 66 | 67 | gene_ave <- gene_ave[symbol %in% hgnc_complete_set$symbol] 68 | 69 | 70 | 71 | 72 | 73 | # Retain genes that have at least one value in all but at most 3 cancer types: 74 | gene_ave_study <- gene_ave[symbol %in% gene_ave[, .(n = length(unique(cancer_type))), by = symbol][n >= max(n) - 3, symbol]] 75 | 76 | gene_ave_study <- gene_ave_study[, 77 | if(sum(n_cell) >= 10) .(ave = sum(ave*n_cell)/sum(n_cell), prop_pos = sum(prop_pos*n_cell)/sum(n_cell), n_cell = sum(n_cell), n_sample = .N, 78 | n_sample_thresh = sum(n_cell >= 10)), 79 | by = .(symbol, cell_type, cancer_type, study, tech) # Mean across samples for each study, cancer type and tech (require >=10 cells in each case) 80 | ][, 81 | .(ave = mean(ave), prop_pos = mean(prop_pos), n_cell = sum(n_cell), n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)), 82 | by = .(symbol, cell_type, cancer_type, study) # Mean across datasets of the same study and cancer type but different tech 83 | ] 84 | 85 | gene_ave_all <- gene_ave_study[, 86 | .(study = 'all', ave = weighted.mean(ave, n_sample_thresh + 1), prop_pos = weighted.mean(prop_pos, n_sample_thresh + 1), n_cell = sum(n_cell), 87 | n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)), 88 | by = .(symbol, cell_type, cancer_type) # Weighted mean across studies of the same disease, weighted by (number of samples with >= 10 cells) + 1 89 | ] 90 | 91 | gene_plots_data_all_web <- rbind(gene_ave_study, gene_ave_all, use.names = TRUE) 92 | 93 | fwrite(gene_plots_data_all_web, '../data/gene_plots_data_all_web.csv') 94 | 95 | 96 | 97 | 98 | 99 | unique_ct <- gene_plots_data_all_web[study == 'all', setNames(CJ(unique(cell_type), unique(cancer_type)), c('cell_type', 'cancer_type'))] 100 | unique_ct[, study := 'all'] 101 | unique_study <- gene_plots_data_all_web[ 102 | study != 'all', 103 | setNames(CJ(unique(cell_type), unique(paste(study, cancer_type, sep = ' - '))), c('cell_type', 'study')) 104 | ][, c('study', 'cancer_type') := as.data.table(str_split_fixed(study, ' - ', 2))] 105 | unique_dt <- rbind(unique_ct, unique_study, use.names = TRUE) 106 | 107 | # Save tables for individual genes, to speed up the writing of the Rmd files: 108 | setkey(gene_plots_data_all_web, cell_type, cancer_type, study) 109 | for(g in gene_plots_data_all_web[, sort(unique(symbol))]) { 110 | cat(g, '\n') 111 | # No gene names have underscores in them: sum(grepl('_', gene_plots_data_all_web[, sort(unique(symbol))])) 112 | # Deal with gene names with slashes in them by replacing with underscore: 113 | fwrite( 114 | gene_plots_data_all_web[symbol == g, -'symbol'][unique_dt], 115 | paste0('../data/gene_plots/gene_plots_data_all_web/', gsub('/', '_', g), '.csv') 116 | ) 117 | } 118 | -------------------------------------------------------------------------------- /gene_plots_render_web.R: -------------------------------------------------------------------------------- 1 | n = commandArgs(trailingOnly = TRUE) 2 | 3 | genes_n <- gsub('.csv$', '', dir('../data/gene_plots/gene_plots_data_all_web')) 4 | genes_n <- genes_n[cut(1:length(genes_n), 500, labels = FALSE) == n] 5 | genes_n <- genes_n[genes_n != 'backup'] 6 | for(g in genes_n) { 7 | if(paste0(g, '.Rmd') %in% dir('../data/gene_plots/rmds_web') & paste0(g, '.rds') %in% dir('../data/gene_plots/rds_plots_web')) { 8 | rmarkdown::render( 9 | paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/rmds_web/', g, '.Rmd'), 10 | intermediates_dir = paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web/tmp/', g), 11 | output_dir = '/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web' 12 | ) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /gene_plots_rmd_web.R: -------------------------------------------------------------------------------- 1 | n = commandArgs(trailingOnly = TRUE) 2 | 3 | 4 | 5 | 6 | 7 | library(data.table) 8 | library(magrittr) 9 | library(ggplot2) 10 | library(stringr) 11 | library(plyr) 12 | library(RColorBrewer) 13 | library(scales) 14 | library(cowplot) 15 | library(matkot) 16 | 17 | source('functions.R') 18 | 19 | aliases <- fread('../data/alias_table.csv') 20 | 21 | 22 | 23 | 24 | 25 | genes_n <- gsub('.csv$', '', gsub('_', '/', dir('../data/gene_plots/gene_plots_data_all_web'))) 26 | genes_n <- genes_n[cut(1:length(genes_n), 200, labels = FALSE) == n] 27 | genes_n <- genes_n[genes_n != 'backup'] 28 | for(g in genes_n) { 29 | 30 | cat(g, '\n') 31 | 32 | data_g <- fread(paste0('../data/gene_plots/gene_plots_data_all_web/', g, '.csv')) 33 | data_g[, cell_type := gsub('_', ' ', cell_type)] 34 | 35 | data_ct <- data_g[study == 'all', -'study'] 36 | data_study <- data_g[study != 'all'] 37 | 38 | data_ct[, facet_y := 'c'] 39 | data_ct <- rbind( 40 | data_ct, 41 | cbind( 42 | cancer_type = 'All cancers', 43 | data_ct[, .(ave = mean(ave[!is.na(ave)]), prop_pos = mean(prop_pos[!is.na(prop_pos)]), n_cell = sum(n_cell), n_sample = sum(n_sample), 44 | n_sample_thresh = sum(n_sample_thresh)), by = cell_type] 45 | )[, facet_y := 'a'], 46 | use.names = TRUE 47 | ) 48 | data_ct[, 49 | c('facet_y', 'cell_type', 'cancer_type') := .( 50 | factor(facet_y, levels = c('c', 'a')), 51 | factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma', 'Dendritic', 'Mast', 52 | 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte')), 53 | factor(cancer_type, levels = sort(unique(cancer_type), decreasing = TRUE)) 54 | ) 55 | ] 56 | 57 | data_study[, cell_type := factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma', 58 | 'Dendritic', 'Mast', 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte'))] 59 | data_study[, study := factor(study, levels = rev(sort(unique(study))))] # So that studies go alphabetically from the top 60 | 61 | # Cut cancer types into groups for separate pages: 62 | ct_groups <- list(character(0)) 63 | i <- 0 64 | j <- 1 65 | for(ct in data_study[, sort(unique(as.character(cancer_type)))]) { 66 | ns <- data_study[cancer_type == ct, length(unique(study))] 67 | if(30 - i >= ns) { 68 | ct_groups[[j]] <- c(ct_groups[[j]], ct) 69 | i <- i + ns 70 | } else { 71 | j <- j + 1 72 | ct_groups[[j]] <- ct 73 | i <- ns 74 | } 75 | } 76 | 77 | mp_cor_g <- fread(paste0('../data/gene_plots/gene_mp_cor/', g, '.csv')) 78 | mp_cor_g[, cell_type := gsub('_', ' ', cell_type)] 79 | mp_cor_cts <- mp_cor_g[, {uct <- unique(cell_type); if('Malignant' %in% uct) c('Malignant', sort(uct[uct != 'Malignant'])) else sort(uct)}] 80 | 81 | plots <- list( 82 | dotplot = ggplot(data_ct, aes(x = cell_type, y = cancer_type)) + 83 | geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') + 84 | facet_grid(rows = vars(facet_y), scales = 'free', space = 'free') + 85 | scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) + 86 | scale_y_discrete(labels = c('All cancers' = expression(bold('All cancers')), parse = TRUE)) + 87 | scale_fill_gradientn( 88 | colours = brewer.pal(9, 'YlOrRd'), 89 | limits = c(0, 8), 90 | breaks = c(0, 2, 4, 6, 8), 91 | labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'), 92 | oob = squish 93 | ) + 94 | scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) + 95 | theme_bw() + 96 | theme( 97 | axis.text.x = element_text(angle = 55, hjust = 1), 98 | axis.title.x = element_blank(), 99 | axis.title.y = element_text(margin = margin(r = 10)), 100 | panel.grid = element_line(linewidth = 0.4), 101 | strip.background = element_blank(), 102 | strip.text = element_blank() 103 | ) + 104 | guides(fill = guide_colourbar(frame.colour = 'black')) + 105 | labs(y = 'Cancer type', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells'), 106 | dotplot_stdy = lapply(ct_groups, function(grp) { 107 | pdata <- data_study[cancer_type %in% grp] 108 | ggplot(pdata, aes(x = cell_type, y = study)) + 109 | geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') + 110 | facet_grid(rows = vars(cancer_type), scales = 'free', space = 'free') + 111 | scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) + 112 | scale_fill_gradientn( 113 | colours = brewer.pal(9, 'YlOrRd'), 114 | limits = c(0, 8), 115 | breaks = c(0, 2, 4, 6, 8), 116 | labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'), 117 | oob = squish 118 | ) + 119 | scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) + 120 | theme_bw() + 121 | theme( 122 | axis.text.x = element_text(angle = 55, hjust = 1), 123 | axis.title.x = element_blank(), 124 | axis.title.y = element_text(margin = margin(r = 10)), 125 | panel.grid = element_line(linewidth = 0.4), 126 | strip.background = element_rect(colour = NA, fill = NA), 127 | strip.text = element_text(vjust = 0), 128 | legend.justification = ifelse(data_study[cancer_type %in% grp, length(unique(study)) <= 14], 'top', 'center') 129 | ) + 130 | guides(fill = guide_colourbar(frame.colour = 'black')) + 131 | labs(y = 'Study', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells') 132 | }), 133 | htmp = slapply(mp_cor_cts, function(ct) { 134 | ct_data <- mp_cor_g[cell_type == ct] 135 | setkey(ct_data, cancer_type, meta_program) 136 | ct_data <- ct_data[CJ(unique(cancer_type), unique(meta_program))] 137 | ct_data[, meta_program := factor(meta_program, levels = rev(sort(unique(meta_program))))] 138 | ct_title <- mapvalues(ct, mp_cor_cts, c('Malignant cells', 'B cells', 'Endothelial cells', 'Epithelial cells', 'Fibroblasts', 139 | 'Macrophages', 'T cells'), warn_missing = FALSE) 140 | ggplot(ct_data) + 141 | geom_tile(aes(x = cancer_type, y = meta_program, fill = as.numeric(mean_corr), colour = 'a')) + # as.numeric() in case all NA 142 | scale_x_discrete(expand = c(0, 0)) + 143 | scale_y_discrete(expand = c(0, 0)) + 144 | scale_colour_manual(name = NULL, values = c(a = 'grey90'), labels = c(a = 'NA')) + 145 | scale_fill_gradientn(colours = rev(brewer.pal(9, 'RdBu')), limits = c(-0.6, 0.6), breaks = c(-0.6, -0.3, 0, 0.3, 0.6), 146 | oob = squish, na.value = 'grey80') + 147 | guides(fill = guide_colourbar(order = 1, frame.colour = 'black'), 148 | colour = guide_legend(override.aes = list(fill = 'grey80'), order = 2)) + 149 | theme(panel.border = element_rect(fill = NA, colour = 'black'), axis.text.x = element_text(angle = 35, hjust = 1), 150 | axis.title.x = element_text(margin = margin(t = 10)), axis.title.y = element_text(margin = margin(r = 10))) + 151 | labs(x = 'Cancer type', y = 'Meta-program', fill = 'Average\ncorrelation', title = ct_title) 152 | }) 153 | ) 154 | 155 | saveRDS(plots, paste0('../data/gene_plots/rds_plots_web/', g, '.rds')) 156 | 157 | rmdlines <- c('---\n', 'title: "**', g, '**"\n') 158 | 159 | if(g %in% aliases$symbol) rmdlines <- c(rmdlines, 'subtitle: "', paste(aliases[symbol == g, symbol_alt], collapse = ', '), '"\n') 160 | 161 | rmdlines <- c( 162 | rmdlines, 163 | 'header-includes:\n', 164 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', 165 | ' \\pagenumbering{gobble}\n', 166 | 'geometry: margin=1cm\n', 167 | 'output: pdf_document\n', 168 | 'papersize: a4\n', 169 | '---\n', 170 | '\n', 171 | '```{r setup, include = FALSE}\n', 172 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n', 173 | '```\n', 174 | '\n', 175 | '```{r}\n', 176 | 'plots <- readRDS("../rds_plots_web/', g, '.rds")\n', # This path has to be relative to where the Rmd file will be 177 | '```\n', 178 | '\n', 179 | '## **A**\n', 180 | '\n', 181 | '```{r fig.align="center", fig.height=', 165/25.4, ', fig.width=', 180/25.4, '}\n', 182 | 'plots$dotplot\n', 183 | '```\n', 184 | '\n', 185 | '\\vspace*{\\fill}\n', 186 | '\n', 187 | '**A. Expression of ', g, ' per cell type and cancer type.** Plot showing the average expression level of ', g, ' and the percentage of ', 188 | 'cells expressing ', g, ' in each cancer type and each of the most common cell types, namely those cell types constituting at least 1% of ', 189 | 'at least 5 individual datasets. Expression levels are defined as log2(TPM/10). Average expression levels and percentages were measured ', 190 | 'per cell type within each dataset, then averaged across studies within each cancer type.\n' 191 | ) 192 | 193 | for(i in 1:length(plots$dotplot_stdy)) { 194 | plot_title <- ifelse(i == 1, '**B**', '**B (continued)**') 195 | plot_height <- 30 + 7.5*data_study[cancer_type %in% ct_groups[[i]], length(unique(study))] + 2*(length(ct_groups[[i]]) - 1) 196 | legend_space <- ifelse(i == length(plots$dotplot_stdy), 20, 0) 197 | rmdlines <- c( 198 | rmdlines, 199 | '\n', 200 | '\\newpage\n', 201 | '\n', 202 | '## ', plot_title, '\n', 203 | '\n', 204 | '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n', 205 | 'cowplot::plot_grid(plots$dotplot_stdy[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ', 206 | 267 - legend_space - plot_height, '))\n', 207 | '```\n' 208 | ) 209 | } 210 | 211 | rmdlines <- c( 212 | rmdlines, 213 | '\n', 214 | '\\vspace*{\\fill}\n', 215 | '\n', 216 | '**B. Expression of ', g, ' per cell type and study.** Plot showing the average expression level of ', g, ' and the percentage of cells ', 217 | 'expressing ', g, ' in each study and each of the cell types shown in A. Expression levels are defined as log2(TPM/10).\n' 218 | ) 219 | 220 | for(i in 1:length(plots$htmp)) { 221 | ct <- names(plots$htmp)[i] 222 | plot_title <- ifelse(i == 1, '**C**', '**C (continued)**') 223 | plot_height <- 30 + 5*mp_cor_g[cell_type == ct, length(unique(meta_program))] 224 | legend_space <- ifelse(i == length(plots$htmp), 20, 0) 225 | rmdlines <- c( 226 | rmdlines, 227 | '\n', 228 | '\\newpage\n', 229 | '\n', 230 | '## ', plot_title, '\n', 231 | '\n', 232 | '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n', 233 | 'cowplot::plot_grid(plots$htmp[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ', 234 | 267 - legend_space - plot_height, '))\n', 235 | '```\n' 236 | ) 237 | } 238 | 239 | rmdlines <- c( 240 | rmdlines, 241 | '\n', 242 | '\\vspace*{\\fill}\n', 243 | '\n', 244 | '**C. Correlation of ', g, ' with meta-programs.** Heatmaps showing, for each cell type, the average Pearson correlation of ', g, 245 | ' with meta-program scores in each cancer type. NAs indicate insufficient data or zero variance.\n' 246 | ) 247 | 248 | out_con <- file(paste0('../data/gene_plots/rmds_web/', g, '.Rmd')) 249 | writeLines(rmdlines, con = out_con, sep = '') 250 | close(out_con) 251 | 252 | } 253 | -------------------------------------------------------------------------------- /mp_scores_all.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(readxl) 11 | library(plyr) 12 | library(stringr) 13 | library(Matrix) 14 | library(matkot) 15 | 16 | source('functions.R') 17 | 18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 19 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group) 20 | 21 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol') 22 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 23 | alias_table <- make_alias_table(hgnc_complete_set) 24 | 25 | meta_programs <- lapply( 26 | c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'), 27 | function(ct) { 28 | dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct)) 29 | dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)] 30 | return(dt) 31 | } 32 | ) 33 | 34 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category: 35 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)]) 36 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)') 37 | names(meta_programs[[2]])[3] <- 'Cell Cycle' 38 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle') 39 | names(meta_programs[[4]])[5] <- 'Cell Cycle' 40 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism') 41 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response') 42 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]])) 43 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle') 44 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]])) 45 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock') 46 | 47 | # Merge CD4 and CD8 T cells: 48 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]]) 49 | meta_programs <- meta_programs[1:7] 50 | 51 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell') 52 | names(meta_programs) <- cts 53 | 54 | 55 | 56 | 57 | 58 | rdir <- paste0('../study_plots/', gsub('/', '-', r[2]), '/', r[1]) 59 | 60 | gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)] 61 | 62 | if(nrow(gene_ave) > 0) { 63 | bins_nm <- fread('../data/bins_nm.csv') 64 | gene_ave_list <- list(gene_ave[cell_type == 'Malignant' & n_cell >= 10, .(group, symbol, sample, cell_type, ave)]) 65 | if(unique(gene_ave[cell_type == 'Malignant', .(group, sample, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list( 66 | gene_ave[ 67 | cell_type == 'Malignant', 68 | .(group = 'all', sample = 'all', cell_type = 'Malignant', ave = sum(ave*n_cell)/sum(n_cell)), 69 | by = symbol 70 | ] 71 | )) 72 | gene_ave_list <- c(gene_ave_list, list(gene_ave[, 73 | if(unique(.SD[, .(cell_type, n_cell)])[, sum(n_cell) >= 10]) .SD[, .(cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol], 74 | by = .(group, sample) 75 | ])) 76 | if(unique(gene_ave[, .(sample, cell_type, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list( 77 | gene_ave[, .(group = 'all', sample = 'all', cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol] 78 | )) 79 | gene_ave <- rbindlist(gene_ave_list, use.names = TRUE) 80 | set.seed(7327) # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically: 81 | gene_ave[sample(1:.N), ave_rank := order(order(ave))/.N, by = .(group, sample, cell_type)] 82 | bins_m <- gene_ave[ 83 | cell_type == 'Malignant', 84 | cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)), 85 | by = .(group, sample), 86 | .SDcols = -'cell_type' 87 | ] 88 | bins_all <- gene_ave[ 89 | cell_type == 'all', 90 | cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)), 91 | by = .(group, sample), 92 | .SDcols = -'cell_type' 93 | ] 94 | } 95 | 96 | 97 | 98 | 99 | 100 | if(nrow(gene_ave) > 0) out <- lapply(paths$group, function(g) { 101 | 102 | cat('Preparing data\n') 103 | cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character'))) 104 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL) 105 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 106 | genes <- fread(paths[g, genes], header = FALSE)$V1 107 | expmat <- readMM(paths[g, expmat]) 108 | expmat <- expmat[genes != '', ] # In case any genes are empty strings 109 | genes <- genes[genes != ''] 110 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names 111 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 112 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols 113 | rownames(expmat) <- genes 114 | colnames(expmat) <- cells$cell_name 115 | cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells 116 | if(nrow(cells) < 30) return(NULL) 117 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10 118 | 119 | if(sum(unique(bins_nm$symbol) %in% genes) < 1000) bins_nm[, symbol := str_to_title(symbol)] # In case this is mouse data 120 | if(sum(unique(bins_nm$symbol) %in% genes) < 1000) return(NULL) 121 | bins_nm <- bins_nm[symbol %in% genes, .(group, symbol, bin = as.character(bin))] 122 | bins_m <- bins_m[symbol %in% genes & group %in% c(g, 'all'), .(group = mapvalues(sample, 'all', 'Malignant'), symbol, bin = as.character(bin))] 123 | bins_all <- bins_all[symbol %in% genes & group %in% c(g, 'all'), .(group = sample, symbol, bin = as.character(bin))] 124 | 125 | # Consolidate bins tables (and add column to , for easy matching with ): 126 | cat('Consolidating bins tables\n') 127 | if(all(unique(cells$sample) %in% bins_m$group)) { 128 | bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m[group != 'Malignant']) 129 | cells[cell_type %in% c(unique(bins_nm$group), 'Malignant'), group := ifelse(cell_type == 'Malignant', sample, cell_type)] 130 | } else { # Use "Malignant" group for samples with too few malignant cells: 131 | bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m) 132 | cells[ 133 | cell_type %in% c(unique(bins_nm$group), 'Malignant'), 134 | group := ifelse(cell_type != 'Malignant' | !(sample %in% bins_m$group), cell_type, sample) 135 | ] 136 | } 137 | if(!all(cells[cell_type != 'Malignant', unique(cell_type)] %in% bins_nm$group)) { 138 | bins <- rbind(bins, bins_all[group == 'all']) # Use "all" group for non-malignant cell types not listed in bins_nm 139 | cells[cell_type != 'Malignant' & !(cell_type %in% bins_nm$group), group := 'all'] 140 | } 141 | 142 | setkey(bins, group, symbol) 143 | bins_copy <- copy(bins); setkey(bins_copy, group, bin) 144 | 145 | cat('Computing bin averages\n') 146 | set.seed(9432) 147 | bin_aves <- cells[, { 148 | ct <- unique(cell_type) 149 | cat(ct, '\n') 150 | ct_genes <- sort(unique(unlist(meta_programs[[ct]]))) 151 | if(sum(ct_genes %in% genes) < 50) ct_genes <- str_to_title(ct_genes) # In case this is mouse data 152 | ct_genes <- ct_genes[ct_genes %in% genes] 153 | if(length(ct_genes) < 50) NULL else { 154 | .SD[, { 155 | grp <- unique(group) 156 | cat('\t', grp, '\n') 157 | sdout <- bins_copy[ 158 | .(grp, bins[.(grp, ct_genes), as.character(bin)]), 159 | if(.N >= 50) { 160 | if(.N <= 100) { 161 | .(gene = ct_genes[.GRP], bin_genes = symbol) 162 | } else .(gene = ct_genes[.GRP], bin_genes = sample(symbol, 100, replace = FALSE)) 163 | }, 164 | by = .EACHI 165 | ] 166 | expmat_sub <- expmat[unique(sdout$bin_genes), cell_name, drop = FALSE] 167 | sdout[, .(cell_name = cell_name, ave = colMeans(expmat_sub[bin_genes, , drop = FALSE])), by = gene] 168 | }, by = group] 169 | } 170 | }, by = cell_type] 171 | 172 | bin_ave_mat <- dcast(bin_aves[, .(gene, cell_name, ave)], gene ~ cell_name)[, set_rownames(as.matrix(.SD), gene), .SDcols = -'gene'] 173 | 174 | cat('Computing scores\n') 175 | scores <- cells[, { 176 | ct <- unique(cell_type) 177 | cat(ct, '\n') 178 | c( 179 | .(cell_name = cell_name), 180 | slapply(names(meta_programs[[ct]]), function(mp_name) { 181 | cat('\t', mp_name, '\n') 182 | mp <- meta_programs[[ct]][[mp_name]] 183 | if(sum(mp %in% genes & mp %in% bin_aves$gene) < 10) mp <- str_to_title(mp) 184 | mp <- mp[mp %in% genes & mp %in% bin_aves$gene] 185 | if(length(mp) >= 30) { 186 | xout <- expmat[mp, cell_name, drop = FALSE] - bin_ave_mat[mp, cell_name, drop = FALSE] 187 | xout[xout > 3] <- 3; xout[xout < -3] <- 3 # Cap relative expression levels to reduce influence of extreme values. 188 | colMeans(xout, na.rm = TRUE) # Vector of scores - values for some cells/groups may be NA/NaN. 189 | } 190 | }) 191 | ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE) 192 | }, by = cell_type] 193 | 194 | scores <- scores[, -'cell_type'] 195 | scores[, group := g] 196 | setcolorder(scores, 'group') 197 | return(scores) 198 | 199 | }) %>% rbindlist 200 | 201 | if(nrow(gene_ave) > 0) fwrite(out, paste0(rdir, '/mp_scores_all.csv')) 202 | -------------------------------------------------------------------------------- /mp_scores_prep.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(matkot) 4 | 5 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 6 | 7 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) tryCatch({ 8 | cat(r, '-') 9 | d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]) 10 | if('data_dist.rds' %in% dir(d)) { 11 | data_dist <- readRDS(paste0(d, '/data_dist.rds')) 12 | data_dist_cond <- sapply(data_dist, function(x) is.null(x) | is.null(x$scores)) 13 | if(!all(data_dist_cond)) { 14 | for(i in (1:length(data_dist))[!data_dist_cond]) { # Indices where data is not NULL 15 | out <- data_dist[[i]]$scores[, .(cell_name, sample, cell_type, meta_program, score = round(score, 4))] 16 | if(sum(!data_dist_cond) > 1) { 17 | if(!('MP scores' %in% dir(d))) dir.create(paste0(d, '/MP scores')) 18 | suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)] 19 | fwrite(out, paste0(d, '/MP scores/MP_scores_', suff, '.csv')) 20 | } else fwrite(out, paste0(d, '/MP scores.csv')) 21 | } 22 | } else cat('No MP score data') 23 | } else cat('No MP score data') 24 | cat('\n') 25 | }, error = function(e) print('ERROR\n')) 26 | -------------------------------------------------------------------------------- /study_plots_cc.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(ggplot2) 3 | library(magrittr) 4 | library(Matrix) 5 | library(stringr) 6 | library(plyr) 7 | library(cowplot) 8 | library(RColorBrewer) 9 | library(scales) 10 | library(gtable) 11 | library(grid) 12 | library(gridExtra) 13 | library(matkot) 14 | 15 | source('functions.R') 16 | 17 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 18 | 19 | ct_to_title <- function(ct) { 20 | if(grepl('ic$|al$|ar$|oid$|crine$|ant$', ct) | ct %in% c('Mast', 'Plasma', 'Immune', 'Stem', 'Schwann', 'Langerhans', 'Tuft', 'Stellate')) { 21 | return(paste0(ct, ' cells')) 22 | } else { 23 | return(paste0(gsub('_', ' ', ct), 's')) 24 | } 25 | } 26 | 27 | 28 | 29 | 30 | 31 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) { 32 | 33 | cat(r, '\n') 34 | 35 | if(all(c('data_cc.RDS', 'Cell cycle.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) { 36 | 37 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS')) 38 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 39 | 40 | if(!all(nullcond)) { 41 | 42 | suff <- paste(gsub(' et al. ', '', r[1]), gsub('/| ', '-', r[2]), sep = '_') 43 | 44 | out <- lapply(which(!nullcond), function(i) { 45 | 46 | # To use in plot titles: 47 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) { 48 | if(paths_table[as.list(r), .N] > 1) { 49 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name]) 50 | } else { 51 | title_tail <- r[1] 52 | } 53 | } else { 54 | if(paths_table[as.list(r), .N] > 1) { 55 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else 56 | group_name]) 57 | } else { 58 | title_tail <- paste0(r[1], ', ', r[2]) 59 | } 60 | } 61 | 62 | plot_out <- do.call(cc_plot, args = plot_data[[i]][c('ccdata', 'hdata', 'g1s', 'g2m')]) 63 | plot_title <- paste0('Cell cycle activity in ', title_tail) 64 | return(list(plot_out = plot_out, plot_title = plot_title)) 65 | 66 | }) 67 | 68 | if(!all(sapply(out, function(x) is.null(x$plot_out)))) { 69 | 70 | rmdlines <- c( 71 | '---\n', 72 | 'title: ""\n', 73 | 'header-includes:\n', 74 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs! 75 | ' \\pagenumbering{gobble}\n', 76 | 'geometry: margin=1cm\n', 77 | 'output: pdf_document\n', 78 | 'papersize: a4\n', 79 | '---\n', 80 | '\n', 81 | '```{r setup, include = FALSE}\n', 82 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n', 83 | '```\n' 84 | ) 85 | for(i in 1:length(out)) { 86 | if(is.null(out[[i]]$plot_out)) next 87 | saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '_', suff, '.rds')) 88 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n') 89 | rmdlines <- c( 90 | rmdlines, 91 | '\n', 92 | '```{r}\n', 93 | 'plots <- readRDS("comp_plots_', i, '_', suff, '.rds")\n', 94 | '```\n' 95 | ) 96 | if(names(out[[i]]$plot_out$scatter)[1] == 'data') { 97 | # In this case there's only one scatter plot ('data' is the 1st element of a ggplot object), and there will be a heatmap. 98 | rmdlines <- c( 99 | rmdlines, 100 | '\n', 101 | '### ', out[[i]]$plot_title, '\n', 102 | '\n', 103 | '### **A**\n', 104 | '\n', 105 | '```{r fig.align="center", fig.height=', 110/25.4, ', fig.width=', 170/25.4, ', out.width="70%", out.height="70%"}\n', 106 | 'plots$scatter\n', 107 | '```\n', 108 | '\n', 109 | '### **B**\n', 110 | '\n', 111 | '```{r fig.align="center", fig.height=', 175/25.4, ', fig.width=', 210/25.4, ', out.width="80%", out.height="80%"}\n', 112 | 'plots$heatmap\n', 113 | '```\n', 114 | '\n', 115 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page 116 | '\n', 117 | '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual cells ', 118 | '(points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which are ', 119 | 'assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in individual cells ', 120 | '(columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase (indicated by ', 121 | 'the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n' 122 | ) 123 | } else { # In this case every element is a separate scatter plot, corresponding to a cell type, possibly with no heatmap. 124 | cts <- names(out[[i]]$plot_out$scatter) 125 | if('Malignant' %in% cts) {cts <- c('Malignant', sort(cts[cts != 'Malignant']))} else {cts <- sort(cts)} 126 | for(j in 1:length(out[[i]]$plot_out$scatter)) { 127 | if(j > 1) rmdlines <- c(rmdlines, '\n\\newpage\n') 128 | ct <- cts[j] 129 | plot_title <- paste0(out[[i]]$plot_title, ' - ', ct_to_title(ct)) 130 | if(ct %in% names(out[[i]]$plot_out$heatmap)) { 131 | rmdlines <- c( 132 | rmdlines, 133 | '\n', 134 | '### ', plot_title, '\n', 135 | '\n', 136 | '### **A**\n', 137 | '\n', 138 | '```{r fig.align="center", fig.height=', 139 | 110/25.4, 140 | ', fig.width=', 141 | 170/25.4, 142 | ', out.width="70%", out.height="70%"}\n', 143 | 'plots$scatter[["', ct, '"]]\n', 144 | '```\n', 145 | '\n', 146 | '### **B**\n', 147 | '\n', 148 | '```{r fig.align="center", fig.height=', 149 | 175/25.4, 150 | ', fig.width=', 151 | 210/25.4, 152 | ', out.width="80%", out.height="80%"}\n', 153 | 'plots$heatmap[["', ct, '"]]\n', 154 | '```\n', 155 | '\n', 156 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page 157 | '\n', 158 | '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ', 159 | ct_to_title(ct), 160 | ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells ', 161 | 'which are assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in ', 162 | 'individual ', 163 | ct_to_title(ct), 164 | ' (columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase ', 165 | '(indicated by the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n' 166 | ) 167 | } else { 168 | rmdlines <- c( 169 | rmdlines, 170 | '\n', 171 | '### ', plot_title, '\n', 172 | '\n', 173 | '```{r fig.align = "center", fig.height = ', 120/25.4, ', fig.width = ', 180/25.4, 174 | ', out.width = "70%", out.height = "70%"}\n', 175 | 'plots$scatter[["', ct, '"]]\n', 176 | '```\n', 177 | '\n', 178 | 'Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ', 179 | ct_to_title(ct), 180 | ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which ', 181 | 'are assigned as either G1/S, G2/M or intermediate.\n' 182 | ) 183 | } 184 | } 185 | } 186 | } 187 | out_con <- file(paste0('temp_', suff, '.Rmd')) 188 | writeLines(rmdlines, con = out_con, sep = '') 189 | close(out_con) 190 | rmarkdown::render( 191 | paste0('temp_', suff, '.Rmd'), 192 | output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell cycle.pdf') 193 | ) 194 | file.remove(paste0('temp_', suff, '.Rmd')) 195 | for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '_', suff, '.rds')) 196 | 197 | } 198 | 199 | } 200 | 201 | } 202 | 203 | } 204 | -------------------------------------------------------------------------------- /study_plots_cna.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(ggplot2) 11 | library(RColorBrewer) 12 | library(randomcoloR) 13 | library(scales) 14 | library(cowplot) 15 | library(ggtext) 16 | library(matkot) 17 | 18 | source('functions.R') 19 | 20 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 21 | 22 | 23 | 24 | 25 | 26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 27 | 28 | if('data_cna.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]))) { 29 | 30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cna.rds')) 31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 32 | 33 | out <- lapply(which(!nullcond), function(i) { 34 | 35 | # To use in plot titles: 36 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) { 37 | if(paths_table[as.list(r), .N] > 1) { 38 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name]) 39 | } else { 40 | title_tail <- r[1] 41 | } 42 | } else { 43 | if(paths_table[as.list(r), .N] > 1) { 44 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name]) 45 | } else { 46 | title_tail <- paste0(r[1], ', ', r[2]) 47 | } 48 | } 49 | 50 | cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = '')) 51 | cells$cell_name <- as.character(cells$cell_name) 52 | setkey(cells, cell_name) 53 | 54 | pdata <- plot_data[[i]]$cna_data[, .(gene, cell_name, cna, sample, chr)] 55 | pdata[, cell_type := do.call(`[`, list(cells, cell_name))$cell_type] 56 | pdata <- pdata[!is.na(cell_type)] 57 | chr_n <- pdata[, .(n = length(unique(gene))), keyby = chr][n > 20] 58 | pdata <- pdata[chr %in% chr_n$chr] 59 | pdata[, malignant := ifelse(cell_type == 'Malignant', 'm', 'nm')] 60 | 61 | # Sample cells so that we have no more than 2000, of which between 60 and 85% are malignant: 62 | grp_n <- pdata[, .(n = length(unique(cell_name))), keyby = .(malignant, sample)] 63 | grp_tot <- grp_n[, min(max(sum(malignant == 'm')/.N, 0.6), 0.85)] 64 | grp_tot <- 2000*c(m = grp_tot, nm = 1 - grp_tot) 65 | grp_tot_true <- grp_n[, .(N = sum(n)), keyby = malignant]$N 66 | if(any(grp_tot > grp_tot_true)) {wm <- which.min(grp_tot_true - grp_tot); grp_tot <- grp_tot*grp_tot_true[wm]/grp_tot[wm]} 67 | grp_n[, n_to_sample := { # Returns number of cells to take from each sample 68 | tot <- grp_tot[unique(malignant)] # Total allowed for this group 69 | baseline <- pmin(n, min(20, floor(tot/length(unique(sample))))) # Number of cells to keep (samples with too few cells are untouched) 70 | N <- n - baseline 71 | to_lose <- sum(n) - tot # Number of cells from each sample that we can lose 72 | N <- N - floor(to_lose*N/sum(N)) # Take number of cells from each sample proportional to the total number in that sample 73 | to_lose <- sum(N + baseline) - tot # Any left over due to the floor() function 74 | sub_places <- order(order(-N)) %in% 1:to_lose # Take these off evenly from the samples having the most cells 75 | N[sub_places] <- N[sub_places] - 1 76 | N + baseline 77 | }, by = malignant] 78 | set.seed(1012) 79 | pdata <- pdata[, .SD[cell_name %in% sample( # The actual sampling 80 | unique(cell_name), 81 | do.call(`[`, list(grp_n, .(unique(malignant), unique(sample))))$n_to_sample, 82 | replace = FALSE 83 | )], by = .(malignant, sample)] 84 | pdata[, gene := factor(gene, levels = unique(gene))] 85 | pdata[, cell_name := factor(cell_name, levels = unique(cell_name[order(cell_type != 'Malignant', sample)]))] 86 | pdata[, cell_num := as.numeric(cell_name)] 87 | 88 | # Manually choose chromosome labels: 89 | chr_lab <- c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '14', '16', '17', '18', '19', '20', '22', 'X') 90 | chr_lab <- unique(pdata[, .(gene, chr)])[, 91 | {inds <- chr_n[, floor(cumsum(n) - n/2)[chr %in% chr_lab]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])} 92 | ] 93 | # To choose them automatically using some threshold for minimum number of genes (100 in this case): 94 | # chr_lab <- unique(pdata[, .(gene, chr)])[, 95 | # {inds <- chr_n[, floor(cumsum(n) - n/2)[n >= 100]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])} 96 | # ] 97 | 98 | cells_lab <- pdata[, .(n = length(unique(cell_num))), keyby = malignant][, floor(c(n[1]/2, n[1] + n[2]/2))] 99 | 100 | set.seed(1358) 101 | bar_smpl <- ggplot(pdata) + 102 | geom_raster(aes(x = 'a', y = cell_num, fill = sample)) + 103 | scale_fill_manual(name = 'Sample', values = pdata[, setNames(distinctColorPalette(length(unique(sample))), unique(sample))]) + 104 | scale_x_discrete(expand = c(0, 0)) + 105 | scale_y_continuous(expand = c(0, 0)) + 106 | theme( 107 | axis.text = element_blank(), 108 | axis.title = element_blank(), 109 | axis.ticks = element_blank(), 110 | axis.ticks.length = unit(0, 'pt'), 111 | legend.position = 'left', 112 | legend.title = element_text(size = 14), 113 | legend.text = element_text(size = 12), 114 | legend.justification = c(1, 0.5), 115 | panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5), 116 | plot.margin = unit(c(5.5, 2, 5.5, 5.5), 'pt') 117 | ) + 118 | guides(fill = guide_legend(keywidth = unit(10, 'pt'), keyheight = unit(15, 'pt'))) 119 | 120 | htmp <- ggplot(pdata) + 121 | geom_raster(aes(x = gene, y = cell_num, fill = cna)) + 122 | geom_vline(aes(xintercept = xi), data = data.table(xi = chr_n[1:(.N - 1), cumsum(n) + 0.5]), linewidth = 0.3) + 123 | geom_hline(aes(yintercept = yi), data = data.table(yi = pdata[cell_type == 'Malignant', length(unique(cell_name)) + 0.5]), 124 | linewidth = 0.5) + 125 | scale_fill_gradientn(colours = rev(brewer.pal(11, 'RdBu')), limits = c(-1, 1), breaks = c(-1, 0, 1), oob = squish) + 126 | scale_x_discrete(expand = c(0, 0), breaks = names(chr_lab), labels = chr_lab) + 127 | scale_y_continuous(expand = c(0, 0), sec.axis = dup_axis(breaks = cells_lab, labels = c('Malignant', 'TME'))) + 128 | theme( 129 | axis.text.x = element_text(size = 12), 130 | axis.text.y.left = element_blank(), 131 | axis.text.y.right = element_text(angle = -90, hjust = 0.5, size = 14), 132 | axis.ticks = element_blank(), 133 | axis.ticks.length = unit(0, 'pt'), 134 | axis.title.x = element_text(size = 14), 135 | axis.title.y.left = element_blank(), 136 | axis.title.y.right = element_text(size = 14), 137 | legend.title = element_text(size = 12, margin = margin(b = 6)), 138 | legend.text = element_text(size = 10), 139 | legend.justification = c(0.5, 0), 140 | panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5), 141 | plot.title = element_text(size = 14, face = 'bold'), 142 | plot.subtitle = element_text(size = 12), 143 | plot.margin = unit(c(5.5, 5.5, 5.5, 1), 'pt') 144 | ) + 145 | guides(fill = guide_colourbar(barwidth = unit(15, 'pt'), barheight = unit(70, 'pt'), ticks.colour = 'black', frame.colour = 'black')) + 146 | labs( 147 | x = 'Chromosome', 148 | y = 'Cells', 149 | fill = 'Inferred CNA\n(log2 ratio)', 150 | title = paste('Inferred CNAs in', title_tail), 151 | subtitle = paste( 152 | 'Reference cells:', 153 | paste(plot_data[[i]]$ref_cells[, .N, by = cell_type][order(-N), gsub('_', ' ', cell_type)], collapse = ', ') 154 | ) 155 | ) 156 | 157 | return(list(heatmap = htmp, bar = bar_smpl)) 158 | 159 | }) 160 | 161 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) caption <- r[1] else caption <- paste0(r[1], ' (', r[2], ')') 162 | if(length(out) == 1) { 163 | caption <- paste0('Heatmap of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position', 164 | ' for a representative subset of cells in ', caption, ', with colour bar (left) showing the corresponding samples.') 165 | } else { 166 | caption <- paste0('Heatmaps of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position', 167 | ' for representative subsets of cells in ', caption, ', with colour bars (left) showing the corresponding samples.') 168 | } 169 | caption <- ggplot() + 170 | theme(panel.background = element_rect(fill = NA), plot.caption.position = 'plot', 171 | plot.caption = element_textbox_simple(hjust = 0, size = 12)) + 172 | labs(caption = caption) 173 | 174 | leg <- lapply(out, function(x) get_legend(x$bar)) 175 | leg_width <- max(sapply(leg, function(x) as.numeric(x$widths[3]))) 176 | w <- c(leg_width + 0.4, 0.7, 26.7) 177 | 178 | # Open the PNG before using plot_grid(), because this function calls ggdraw(), which opens a graphic device, leading to X11 error on the server. 179 | png( 180 | paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/CNAs.png'), 181 | width = sum(w), height = length(out)*18 + (length(out) - 1)*4 + 4, units = 'cm', res = 300 182 | ) 183 | bar_grob <- lapply(out, function(x) ggplotGrob(x$bar + theme(legend.position = 'none'))) 184 | htmp_grob <- lapply(out, function(x) ggplotGrob(x$heatmap)) 185 | for(j in 1:length(bar_grob)) bar_grob[[j]]$heights[c(1, 7, 12)] <- unit(c(1.7, 14.9, 1.4), 'cm') 186 | for(j in 1:length(htmp_grob)) htmp_grob[[j]]$heights <- unit(c(0.2, 0, 0.8, 0.7, 0, 0, 14.9, 0.5, 0.7, 0, 0, 0.2), 'cm') 187 | plist <- lapply(1:length(out), function(j) { 188 | if(j == 1) { 189 | plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w) 190 | } else plot_grid( 191 | ggplot() + theme_void(), 192 | plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w), 193 | nrow = 2, ncol = 1, rel_heights = c(4, 18) 194 | ) 195 | }) 196 | if(length(plist) == 1) { 197 | print(plot_grid(plist[[1]], caption, nrow = 2, ncol = 1, rel_heights = c(18, 4))) 198 | } else { 199 | print(plot_grid(plotlist = c(plist, list(caption)), nrow = length(plist) + 1, ncol = 1, rel_heights = c(18, rep(22, length(plist) - 1), 4))) 200 | } 201 | dev.off() 202 | 203 | } 204 | -------------------------------------------------------------------------------- /study_plots_ct_comp.R: -------------------------------------------------------------------------------- 1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_comp.o -eo log/study_plots_ct_comp.e Rscript study_plots_ct_comp.R 2 | 3 | library(data.table) 4 | library(ggplot2) 5 | library(magrittr) 6 | library(Matrix) 7 | library(stringr) 8 | library(plyr) 9 | library(cowplot) 10 | library(RColorBrewer) 11 | library(scales) 12 | library(matkot) 13 | 14 | try(library(randomcoloR), silent = TRUE) 15 | 16 | source('functions.R') 17 | 18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 19 | 20 | 21 | 22 | 23 | 24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) { 25 | 26 | cat(r, '\n') 27 | 28 | if(all(c('data_ct_comp.RDS', 'Cell types.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) { 29 | 30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_comp.RDS')) 31 | 32 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 33 | if(all(nullcond)) next 34 | 35 | out <- lapply(which(!nullcond), function(i) { 36 | 37 | # To use in plot titles: 38 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) { 39 | if(paths_table[as.list(r), .N] > 1) { 40 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name]) 41 | } else { 42 | title_tail <- r[1] 43 | } 44 | } else { 45 | if(paths_table[as.list(r), .N] > 1) { 46 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else 47 | group_name]) 48 | } else { 49 | title_tail <- paste0(r[1], ', ', r[2]) 50 | } 51 | } 52 | 53 | if(!is.null(plot_data[[i]]$data)) { 54 | 55 | pdata <- plot_data[[i]]$data 56 | 57 | marker_cond <- 'marker_plot_data' %in% names(pdata) && 58 | pdata$marker_plot_data[, length(unique(gene)) > 1 & length(unique(cell_type)) > 1] 59 | if('marker_plot_data' %in% names(pdata) && pdata$marker_plot_data[, length(unique(gene)) < 10]) { 60 | leg_arr <- 'horizontal' 61 | } else {leg_arr <- 'vertical'} 62 | 63 | if('randomcoloR' %in% .packages(TRUE)) { 64 | set.seed(9728) 65 | plot_colours <- pdata$pie_data[, setNames(distinctColorPalette(.N), cell_type)] 66 | plot_out <- do.call( 67 | ct_comp_plot, 68 | args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(colours = plot_colours, legends_arrange = leg_arr)) 69 | ) 70 | } else { 71 | plot_out <- do.call( 72 | ct_comp_plot, 73 | args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(legends_arrange = leg_arr)) 74 | ) 75 | } 76 | 77 | if(marker_cond) { 78 | plot_title <- paste0('Cell type composition and marker gene expression in ', title_tail) 79 | } else {plot_title <- paste0('Cell type composition in ', title_tail)} 80 | 81 | return(list(plot_out = plot_out, plot_title = plot_title, leg_arr = leg_arr)) 82 | 83 | } 84 | 85 | }) 86 | 87 | # A4 page is 210x297mm. Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels. 88 | rmdlines <- c( 89 | '---\n', 90 | 'title: ""\n', 91 | 'header-includes:\n', 92 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs! 93 | ' \\pagenumbering{gobble}\n', 94 | 'geometry: margin=1cm\n', 95 | 'output: pdf_document\n', 96 | 'papersize: a4\n', 97 | '---\n', 98 | '\n', 99 | '```{r setup, include = FALSE}\n', 100 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n', 101 | '```\n' 102 | ) 103 | for(i in 1:length(out)) { 104 | if('marker_plot' %in% names(out[[i]]$plot_out)) { 105 | margin_x <- (ifelse(out[[i]]$leg_arr == 'vertical', 150, 125) - # Take 25mm off margin if legends are arranged horizontally 106 | 10*length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$cell_type)))/2 107 | n_genes <- length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$gene)) 108 | h <- (log2(24/n_genes) + 6)*n_genes + 30 109 | # The following top-aligns legends in case of too few genes, else the tops of the legends get cut off. It also depends on the length 110 | # of the x labels, as long labels squash the plot. 4 seems OK, but 5 is safer in case there's a longer label than "Oligodendrocyte". 111 | if(n_genes <= 5) out[[i]]$plot_out$marker_plot <- out[[i]]$plot_out$marker_plot + theme(legend.justification = 'top') 112 | } 113 | saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '.rds')) 114 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n') 115 | if('marker_plot' %in% names(out[[i]]$plot_out)) { 116 | rmdlines <- c( 117 | rmdlines, 118 | '\n', 119 | '### ', out[[i]]$plot_title, '\n', 120 | '\n', 121 | '```{r}\n', 122 | 'plots <- readRDS("comp_plots_', i, '.rds")\n', 123 | '```\n', 124 | '\n', 125 | '### **A**\n', 126 | '\n', 127 | '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n', 128 | 'plots$pie\n', 129 | '```\n', 130 | '\n', 131 | '### **B**\n', 132 | '\n', 133 | '```{r fig.align = "center", fig.height = ', h/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n', 134 | 'plots$marker_plot + theme(plot.margin = unit(c(0, ', margin_x, ', 0, ', margin_x, '), "mm"))\n', 135 | '```\n', 136 | '\n', 137 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page 138 | '\n', 139 | '**A.** Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance. **B.** Plot ', 140 | 'showing the average log2(TPM/10) expression levels of marker genes in each cell type and, for each gene, the percentage of ', 141 | 'cells of each type in which that gene was detected. Points are not shown for combinations where the percentage of expressing ', 142 | 'cells is below 50%. Only non-malignant cell types with at least 10 cells are shown, and these are ordered as in A.\n' 143 | ) 144 | } else { 145 | rmdlines <- c( 146 | rmdlines, 147 | '\n', 148 | '### ', out[[i]]$plot_title, '\n', 149 | '\n', 150 | '```{r}\n', 151 | 'plots <- readRDS("comp_plots_', i, '.rds")\n', 152 | '```\n', 153 | '\n', 154 | '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n', 155 | 'plots$pie\n', 156 | '```\n', 157 | '\n', 158 | 'Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance.\n' 159 | ) 160 | } 161 | } 162 | out_con <- file('temp.Rmd') 163 | writeLines(rmdlines, con = out_con, sep = '') 164 | close(out_con) 165 | rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell types.pdf')) 166 | file.remove('temp.Rmd') 167 | for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '.rds')) 168 | 169 | } 170 | 171 | } 172 | -------------------------------------------------------------------------------- /study_plots_ct_umap.R: -------------------------------------------------------------------------------- 1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_umap.o -eo log/study_plots_ct_umap.e Rscript study_plots_ct_umap.R 2 | 3 | library(data.table) 4 | library(ggplot2) 5 | library(magrittr) 6 | library(Matrix) 7 | library(stringr) 8 | library(plyr) 9 | library(cowplot) 10 | library(RColorBrewer) 11 | library(scales) 12 | library(matkot) 13 | 14 | try(library(randomcoloR), silent = TRUE) 15 | 16 | source('functions.R') 17 | 18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 19 | 20 | 21 | 22 | 23 | 24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) { 25 | 26 | cat(r, '\n') 27 | 28 | if(all(c('data_ct_umap.RDS', 'UMAP.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) { 29 | 30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_umap.RDS')) 31 | 32 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, is.null(x$data))) 33 | if(all(nullcond)) next 34 | 35 | if('randomcoloR' %in% .packages(TRUE)) { 36 | set.seed(9728) 37 | plot_colours <- lapply( 38 | which(!nullcond), 39 | function(i) { 40 | pdata <- plot_data[[i]]$data 41 | slapply(c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)], function(vn) unique(pdata[[vn]])) 42 | } 43 | ) 44 | plot_colours <- slapply( 45 | c('cell_type', 'sample'), 46 | function(vn) { 47 | vn_unique <- unique(unlist(lapply(plot_colours, function(x) if(vn %in% names(x)) x[[vn]]))) 48 | if(!is.null(vn_unique)) setNames(distinctColorPalette(length(vn_unique)), vn_unique) 49 | } 50 | ) 51 | } 52 | 53 | out <- lapply(which(!nullcond), function(i) { 54 | 55 | pdata <- plot_data[[i]]$data 56 | if(!any(c('cell_type', 'sample') %in% names(pdata))) return(NULL) 57 | 58 | # To use in plot titles: 59 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) { 60 | if(paths_table[as.list(r), .N] > 1) { 61 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name]) 62 | } else { 63 | title_tail <- r[1] 64 | } 65 | } else { 66 | if(paths_table[as.list(r), .N] > 1) { 67 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else 68 | group_name]) 69 | } else { 70 | title_tail <- paste0(r[1], ', ', r[2]) 71 | } 72 | } 73 | 74 | if('cell_type' %in% names(pdata)) { 75 | if('Malignant' %in% pdata$cell_type) { 76 | pdata[, cell_type := factor(cell_type, levels = c('Malignant', sort(unique(cell_type[cell_type != 'Malignant']))))] 77 | } else { 78 | pdata[, cell_type := factor(cell_type, levels = sort(unique(cell_type)))] 79 | } 80 | } 81 | if('sample' %in% names(pdata)) { 82 | if(!any(is.na(as.numeric(unique(pdata$sample))))) { 83 | pdata[, sample := factor(as.character(sample), levels = sort(as.numeric(unique(sample))))] 84 | } else pdata[, sample := factor(as.character(sample), levels = sort(as.character(unique(sample))))] 85 | } 86 | if('randomcoloR' %in% .packages(TRUE)) c(list( 87 | title_tail = title_tail, 88 | plots = slapply( 89 | c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)], 90 | function(vn) ct_umap_plot(pdata, vn, colours = plot_colours[[vn]][levels(pdata[[vn]])], 91 | legend_title = gsub('_', ' ', str_to_title(vn))) 92 | ) 93 | )) else c(list( 94 | title_tail = title_tail, 95 | plots = slapply( 96 | c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)], 97 | function(vn) ct_umap_plot(pdata, vn, legend_title = gsub('_', ' ', str_to_title(vn))) 98 | ) 99 | )) 100 | 101 | }) 102 | 103 | if(all(sapply(out, is.null))) next 104 | 105 | # A4 page is 210x297mm. Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels. 106 | rmdlines <- c( 107 | '---\n', 108 | 'title: ""\n', 109 | 'header-includes:\n', 110 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs! 111 | ' \\pagenumbering{gobble}\n', 112 | 'geometry: margin=1cm\n', 113 | 'output: pdf_document\n', 114 | 'papersize: a4\n', 115 | '---\n', 116 | '\n', 117 | '```{r setup, include = FALSE}\n', 118 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n', 119 | '```\n' 120 | ) 121 | for(i in 1:length(out)) { 122 | saveRDS(out[[i]]$plots, paste0('umap_plots_', i, '.rds')) 123 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n') 124 | if(length(out[[i]]$plots) == 2) { 125 | rmdlines <- c( 126 | rmdlines, 127 | '\n', 128 | '### UMAP in ', out[[i]]$title_tail, '\n', 129 | '\n', 130 | '```{r}\n', 131 | 'plots <- readRDS("umap_plots_', i, '.rds")\n', 132 | '```\n', 133 | '\n', 134 | '### **A**\n', 135 | '\n', 136 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n', 137 | 'plots$cell_type\n', 138 | '```\n', 139 | '\n', 140 | '### **B**\n', 141 | '\n', 142 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n', 143 | 'plots$sample\n', 144 | '```\n', 145 | '\n', 146 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page 147 | '\n', 148 | '**A.** UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by cell type. **B.** ', 149 | 'UMAP plot as in A, with cells colored by sample.\n' 150 | ) 151 | } else { 152 | rmdlines <- c( 153 | rmdlines, 154 | '\n', 155 | '### UMAP in ', out[[i]]$title_tail, '\n', 156 | '\n', 157 | '```{r}\n', 158 | 'plots <- readRDS("umap_plots_', i, '.rds")\n', 159 | '```\n', 160 | '\n', 161 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n', 162 | 'plots[[1]]\n', 163 | '```\n', 164 | '\n', 165 | 'UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by ', 166 | gsub('_', ' ', names(out[[i]]$plots)), '.\n' 167 | ) 168 | } 169 | } 170 | out_con <- file('temp.Rmd') 171 | writeLines(rmdlines, con = out_con, sep = '') 172 | close(out_con) 173 | rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/UMAP.pdf')) 174 | file.remove('temp.Rmd') 175 | for(i in 1:length(out)) file.remove(paste0('umap_plots_', i, '.rds')) 176 | 177 | } 178 | 179 | } 180 | -------------------------------------------------------------------------------- /study_plots_data_cc_thresh.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(matkot) 4 | 5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 6 | 7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) { 8 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 9 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS')) 10 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 11 | if(all(nullcond)) return(NULL) 12 | lapply(which(!nullcond), function(i) { 13 | if('cell_type' %in% names(plot_data[[i]]$ccdata)) { 14 | plot_data[[i]]$ccdata[, 15 | .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]), 16 | by = cell_type 17 | ] 18 | } else { 19 | plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])] 20 | } 21 | }) %>% rbindlist 22 | }) %>% rbindlist 23 | 24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7) 25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.65) 26 | 27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) { 28 | cat(r, '\n') 29 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next 30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS')) 31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 32 | if(all(nullcond)) next 33 | for(i in which(!nullcond)) { 34 | 35 | ccdata <- copy(plot_data[[i]]$ccdata) 36 | ccdata[, phase := 'Cycling'] 37 | ccdata[g1s_score < 0.7 & g2m_score < 0.65, phase := 'Not cycling'] 38 | 39 | # Define phases and intermediates by fold change threshold: 40 | ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate'] 41 | ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate'] 42 | ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name] 43 | 44 | # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test() 45 | 46 | setkey(ccdata, cell_name) 47 | plot_data[[i]]$ccdata <- ccdata 48 | 49 | hdata <- copy(plot_data[[i]]$hdata) 50 | if('data.table' %in% class(hdata)) { 51 | setkey(hdata, cell_name) 52 | hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase] 53 | annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase') 54 | hdata <- slapply( 55 | switch( 56 | ('cell_type' %in% names(ccdata)) + 1, 57 | list(ccdata$cell_name), 58 | slapply( 59 | ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type], 60 | function(ct) ccdata[cell_type == ct, cell_name] 61 | ) 62 | ), 63 | function(idvec) { 64 | # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data: 65 | cell_ids <- hdata[ 66 | idvec, 67 | .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))), 68 | by = .(p = phase) 69 | ]$id 70 | out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling'] 71 | # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division 72 | # lines in the heatmap: 73 | setkey(out_data, phase) 74 | const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase] 75 | const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))] 76 | # Get ordered cell ID list: 77 | cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][ 78 | !is.na(cell_name), 79 | .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]), 80 | by = phase 81 | ][, cell_name[cell_order]] 82 | # Melt heatmap data and order cells: 83 | out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level') 84 | out_data[, cell_name := factor(cell_name, levels = cell_ids)] 85 | # Order the genes: 86 | setkey(out_data, phase) 87 | if(all(c('G1/S', 'G2/M') %in% out_data$phase)) { 88 | out_data[, gene := factor(gene, levels = out_data[ 89 | c('G1/S', 'G2/M'), 90 | .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene], 91 | by = phase 92 | ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)] 93 | } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present 94 | out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][, 95 | Reduce(c, lapply( 96 | c('G1/S', 'G2/M'), 97 | function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene] 98 | )) 99 | ])] 100 | } 101 | return(list(data = out_data, vline = const)) 102 | } 103 | ) 104 | plot_data[[i]]$hdata <- hdata 105 | } else plot_data[[i]]$hdata <- NULL 106 | 107 | } 108 | saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS')) 109 | } 110 | -------------------------------------------------------------------------------- /study_plots_data_cc_thresh_consensus.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(magrittr) 3 | library(matkot) 4 | 5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 6 | 7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) { 8 | if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL) 9 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds')) 10 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 11 | if(all(nullcond)) return(NULL) 12 | lapply(which(!nullcond), function(i) { 13 | if('cell_type' %in% names(plot_data[[i]]$ccdata)) { 14 | plot_data[[i]]$ccdata[, 15 | .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]), 16 | by = cell_type 17 | ] 18 | } else { 19 | plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])] 20 | } 21 | }) %>% rbindlist 22 | }) %>% rbindlist 23 | 24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7) 25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.7) 26 | 27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) { 28 | cat(r, '\n') 29 | if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next 30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds')) 31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null)))) 32 | if(all(nullcond)) next 33 | for(i in which(!nullcond)) { 34 | 35 | ccdata <- copy(plot_data[[i]]$ccdata) 36 | ccdata[, phase := 'Cycling'] 37 | ccdata[g1s_score < 0.7 & g2m_score < 0.7, phase := 'Not cycling'] 38 | 39 | # Define phases and intermediates by fold change threshold: 40 | ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate'] 41 | ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate'] 42 | ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name] 43 | 44 | # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test() 45 | 46 | setkey(ccdata, cell_name) 47 | plot_data[[i]]$ccdata <- ccdata 48 | 49 | hdata <- copy(plot_data[[i]]$hdata) 50 | if('data.table' %in% class(hdata)) { 51 | setkey(hdata, cell_name) 52 | hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase] 53 | annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase') 54 | hdata <- slapply( 55 | switch( 56 | ('cell_type' %in% names(ccdata)) + 1, 57 | list(ccdata$cell_name), 58 | slapply( 59 | ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type], 60 | function(ct) ccdata[cell_type == ct, cell_name] 61 | ) 62 | ), 63 | function(idvec) { 64 | # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data: 65 | cell_ids <- hdata[ 66 | idvec, 67 | .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))), 68 | by = .(p = phase) 69 | ]$id 70 | out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling'] 71 | # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division lines in the 72 | # heatmap: 73 | setkey(out_data, phase) 74 | const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase] 75 | const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))] 76 | # Get ordered cell ID list: 77 | cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][ 78 | !is.na(cell_name), 79 | .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]), 80 | by = phase 81 | ][, cell_name[cell_order]] 82 | # Melt heatmap data and order cells: 83 | out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level') 84 | out_data[, cell_name := factor(cell_name, levels = cell_ids)] 85 | # Order the genes: 86 | setkey(out_data, phase) 87 | if(all(c('G1/S', 'G2/M') %in% out_data$phase)) { 88 | out_data[, gene := factor(gene, levels = out_data[ 89 | c('G1/S', 'G2/M'), 90 | .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene], 91 | by = phase 92 | ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)] 93 | } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present 94 | out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][, 95 | Reduce(c, lapply( 96 | c('G1/S', 'G2/M'), 97 | function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene] 98 | )) 99 | ])] 100 | } 101 | return(list(data = out_data, vline = const)) 102 | } 103 | ) 104 | plot_data[[i]]$hdata <- hdata 105 | } else plot_data[[i]]$hdata <- NULL 106 | 107 | } 108 | saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds')) 109 | } 110 | -------------------------------------------------------------------------------- /study_plots_data_cna.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(magrittr) 10 | library(Matrix) 11 | library(caTools) 12 | library(stringr) 13 | library(plyr) 14 | library(matkot) 15 | 16 | source('functions.R') 17 | 18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 19 | 20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'ensembl_gene_id') 21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 22 | alias_table <- make_alias_table(hgnc_complete_set) 23 | 24 | gene_positions <- fread('../data/gene_positions.csv') 25 | 26 | # This is how gene_positions was defined: 27 | # library(biomaRt) 28 | # ensembl_mart <- useMart('ensembl') 29 | # ensembl_data <- useDataset('hsapiens_gene_ensembl', mart = ensembl_mart) 30 | # gene_positions <- as.data.table(getBM(attributes = c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'), mart = ensembl_data)) 31 | # fwrite(gene_positions, '../data/gene_positions.csv') 32 | 33 | gene_positions[, c('symbol', 'location') := do.call(`[`, list(hgnc_complete_set, ensembl_gene_id))[, .(symbol, location_sortable)]] 34 | gene_positions <- gene_positions[!is.na(symbol) & chromosome_name %in% c(as.character(1:22), 'X', 'Y')] 35 | gene_positions[, chromosome_name := mapvalues(chromosome_name, as.character(1:9), paste0(0, 1:9))] 36 | setkey(gene_positions, symbol) 37 | 38 | 39 | 40 | 41 | 42 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 43 | ct <- gsub('/', '-', r[2]) 44 | out <- lapply(paths, function(p) { 45 | 46 | if(!endsWith(p$expmat, 'mtx')) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p)) 47 | 48 | cat('Preparing data\n') 49 | cells <- suppressWarnings(fread(p$cells, na.strings = '')) 50 | if(!('cell_type' %in% names(cells)) || length(unique(cells$cell_type)) == 1 || !('Malignant' %in% cells$cell_type)) { 51 | return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p)) 52 | } 53 | cells$cell_name <- as.character(cells$cell_name) # In case cell names are the same as row numbers 54 | if('sample' %in% names(cells)) cells$sample <- as.character(cells$sample) 55 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 56 | genes <- fread(p$genes, header = FALSE)$V1 57 | expmat <- readMM(p$expmat) 58 | expmat <- expmat[genes != '', ] 59 | genes <- genes[genes != ''] 60 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] 61 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 62 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols 63 | rownames(expmat) <- genes 64 | genes <- genes[genes %in% gene_positions$symbol]; expmat <- expmat[genes, ] 65 | colnames(expmat) <- cells$cell_name 66 | cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells 67 | if(nrow(cells) < 30) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p)) 68 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10 69 | 70 | # Get reference cells, prioritising Fibroblasts, then Endothelial, then Macrophage, then whatever is most abundant. 71 | cat('Defining reference\n') 72 | ref_cts_global <- cells[cell_type != 'Malignant', .(N = .N), keyby = cell_type][N >= 100] 73 | ref_cts_global <- rbind( 74 | ref_cts_global[c('Fibroblast', 'Endothelial', 'Macrophage')], 75 | ref_cts_global[!(cell_type %in% c('Fibroblast', 'Endothelial', 'Macrophage'))][order(-N)] 76 | )[!is.na(N), cell_type[complete.cases(.SD)][1:min(2, .N)]] 77 | if(length(ref_cts_global) < 2) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p)) 78 | ref_cells <- cells[cell_type %in% ref_cts_global, .(cell_name = cell_name, cell_type = cell_type, ref_type = 'global')] 79 | ref_cells_smpl <- cells[ 80 | cell_type != 'Malignant', 81 | { 82 | ref_cts_smpl <- .SD[, .(N = .N), keyby = cell_type][N >= 100] 83 | ref_cts_smpl <- ref_cts_smpl[c('Fibroblast', 'Endothelial', 'Macrophage'), cell_type[complete.cases(.SD)]] 84 | if(length(ref_cts_smpl) >= 2) .SD[cell_type %in% ref_cts_smpl, .(cell_name = cell_name, cell_type = cell_type)] 85 | }, 86 | by = .(ref_type = sample) 87 | ] 88 | if(nrow(ref_cells_smpl) > 0) ref_cells <- rbind(ref_cells, ref_cells_smpl, use.names = TRUE) 89 | rm(ref_cells_smpl) 90 | 91 | cat('Computing CNA values\n') 92 | rms <- rowMeans(expmat) 93 | genes_top <- head(names(rms)[order(-rms)], 5000) 94 | expmat <- expmat[genes_top, ]; rms <- rms[genes_top] 95 | expmat <- expmat - rms # Centre rows/genes 96 | expmat[expmat > 3] <- 3; expmat[expmat < -3] <- -3 # Restrict range to limit influence of extreme values 97 | expdt <- as.data.table(as.matrix(expmat), keep.rownames = 'gene') 98 | expdt <- melt(expdt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value', variable.factor = FALSE) 99 | expdt[, c('chr', 'start_pos') := gene_positions[gene, .(chromosome_name, start_position)]] 100 | expdt <- expdt[order(chr, start_pos)] 101 | expdt[, cna := log2(runmean(2^value, k = 100)), by = .(chr, cell_name)] # Computing the CNA values 102 | expdt[, cna := cna - median(cna), keyby = cell_name] 103 | ref_range <- ref_cells[, do.call(`[`, list(expdt, cell_name))[, .(mean_cna = mean(cna)), by = gene], by = .(ref_type, cell_type)] 104 | ref_range <- ref_range[, .(mn = min(mean_cna), mx = max(mean_cna)), keyby = .(ref_type, gene)] 105 | setkey(cells, cell_name) 106 | expdt[, sample := do.call(`[`, list(cells, cell_name))[, as.character(sample)]] 107 | 108 | cat('Correcting CNA values w.r.t. reference\n') 109 | expdt[, # Correcting the CNA values w.r.t. reference cells 110 | cna := { 111 | rt <- unique(sample) 112 | if(!(rt %in% ref_range$ref_type)) rt <- 'global' 113 | out <- .SD[, .(cna, do.call(`[`, list(ref_range, .(rt, gene)))[, .(mn, mx)])] 114 | out[, ifelse(cna > mx + 0.1, cna - mx - 0.1, ifelse(cna < mn - 0.1, cna - mn + 0.1, 0))] 115 | }, 116 | by = sample 117 | ] 118 | 119 | cat('Done!\n\n') 120 | return(list(cna_data = expdt, ref_cells = ref_cells, ref_range = ref_range, path = p)) 121 | 122 | }) 123 | 124 | if(!all(sapply(out, function(x) is.null(x$cna_data)))) { 125 | 126 | ct <- gsub('/', '-', r[2]) 127 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))} 128 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))} 129 | 130 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_cna.rds')) 131 | 132 | } 133 | -------------------------------------------------------------------------------- /study_plots_data_ct_comp.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(ggplot2) 10 | library(magrittr) 11 | library(Matrix) 12 | library(stringr) 13 | library(plyr) 14 | library(matkot) 15 | 16 | source('functions.R') 17 | 18 | canonical_markers <- fread('../data/canonical_markers.csv') 19 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 20 | 21 | 22 | 23 | 24 | 25 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 26 | 27 | out <- lapply( 28 | paths, 29 | function(p) { 30 | 31 | if(endsWith(p$expmat, 'mtx')) { 32 | 33 | cells <- suppressWarnings(fread(p$cells, na.strings = '')) 34 | 35 | cells$cell_name <- as.character(cells$cell_name) 36 | 37 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 38 | 39 | genes <- fread(p$genes, header = FALSE)$V1 40 | expmat <- readMM(p$expmat) 41 | rownames(expmat) <- genes 42 | colnames(expmat) <- cells$cell_name 43 | 44 | # Remove low-complexity cells and normalise to log TPM/10: 45 | cells <- cells[col_nnz(expmat) >= 1000] 46 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) 47 | 48 | ct_comp_data_out <- ct_comp_data(expmat, cells, markers = canonical_markers) 49 | 50 | return(list(data = ct_comp_data_out, path = p)) 51 | 52 | } else return(list(data = NULL, path = p)) 53 | 54 | } 55 | ) 56 | 57 | if(!all(sapply(out, function(x) is.null(x$data)))) { 58 | 59 | ct <- gsub('/', '-', r[2]) 60 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))} 61 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))} 62 | 63 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_comp.RDS')) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /study_plots_data_ct_umap.R: -------------------------------------------------------------------------------- 1 | # The command line arguments should supply study name and cancer type (in that order). 2 | r = commandArgs(trailingOnly = TRUE) 3 | 4 | 5 | 6 | 7 | 8 | library(data.table) 9 | library(ggplot2) 10 | library(magrittr) 11 | library(Matrix) 12 | library(stringr) 13 | library(plyr) 14 | library(irlba) 15 | library(uwot) 16 | library(matkot) 17 | 18 | source('functions.R') 19 | 20 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type')) 21 | 22 | 23 | 24 | 25 | 26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 27 | 28 | out <- lapply(paths, function(p) { 29 | 30 | if(endsWith(p$expmat, 'mtx')) { 31 | 32 | cells <- suppressWarnings(fread(p$cells, na.strings = '')) 33 | 34 | cells$cell_name <- as.character(cells$cell_name) 35 | 36 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 37 | 38 | genes <- fread(p$genes, header = FALSE)$V1 39 | expmat <- readMM(p$expmat) 40 | rownames(expmat) <- genes 41 | colnames(expmat) <- cells$cell_name 42 | 43 | # Remove low-complexity cells and normalise to log TPM/10: 44 | cells <- cells[complexity >= 1000] 45 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) 46 | 47 | # Filtered gene list, after removing lowly expressed genes: 48 | if('sample' %in% names(cells)) { 49 | filtered_genes <- cells[ 50 | sample %in% cells[, .(N = .N), by = sample][N >= 10, sample], 51 | .(gene = genes[rowMeans(log_transform(expmat[, cell_name], reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06]), 52 | by = sample 53 | ]$gene %>% unique 54 | } else { 55 | filtered_genes <- genes[rowMeans(log_transform(expmat, reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06] 56 | } 57 | 58 | ct_umap_data_out <- ct_umap_data(expmat[filtered_genes, ], cells) 59 | 60 | return(list(data = ct_umap_data_out, path = p)) 61 | 62 | } else return(NULL) 63 | 64 | }) 65 | 66 | if(!all(sapply(out, function(x) is.null(x$data)))) { 67 | 68 | ct <- gsub('/', '-', r[2]) 69 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))} 70 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))} 71 | 72 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_umap.RDS')) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /study_plots_data_dist.R: -------------------------------------------------------------------------------- 1 | r = commandArgs(trailingOnly = TRUE) 2 | 3 | 4 | 5 | 6 | 7 | library(data.table) 8 | library(magrittr) 9 | library(readxl) 10 | library(plyr) 11 | library(stringr) 12 | library(Matrix) 13 | library(matkot) 14 | 15 | source('functions.R') 16 | 17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type')) 18 | 19 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol') 20 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])] 21 | alias_table <- make_alias_table(hgnc_complete_set) 22 | 23 | meta_programs <- lapply( 24 | c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'), 25 | function(ct) { 26 | dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct)) 27 | dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)] 28 | return(dt) 29 | } 30 | ) 31 | 32 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category: 33 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)]) 34 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)') 35 | names(meta_programs[[2]])[3] <- 'Cell Cycle' 36 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle') 37 | names(meta_programs[[4]])[5] <- 'Cell Cycle' 38 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism') 39 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response') 40 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]])) 41 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle') 42 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]])) 43 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock') 44 | 45 | # Merge CD4 and CD8 T cells: 46 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]]) 47 | meta_programs <- meta_programs[1:7] 48 | 49 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell') 50 | names(meta_programs) <- cts 51 | 52 | 53 | 54 | 55 | 56 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE) 57 | 58 | out <- lapply(paths, function(p) { 59 | 60 | if(!endsWith(p$expmat, 'mtx')) return(NULL) 61 | 62 | cat('Preparing data\n') 63 | cells <- suppressWarnings(fread(p$cells, na.strings = '')) 64 | if(!all(c('cell_type', 'sample') %in% names(cells))) return(NULL) 65 | cells$cell_name <- as.character(cells$cell_name) 66 | cells$sample <- as.character(cells$sample) 67 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')] 68 | genes <- fread(p$genes, header = FALSE)$V1 69 | expmat <- readMM(p$expmat) 70 | expmat <- expmat[genes != '', ] 71 | genes <- genes[genes != ''] 72 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] 73 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]] 74 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols 75 | rownames(expmat) <- genes 76 | colnames(expmat) <- cells$cell_name 77 | cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells 78 | if(nrow(cells) < 30) return(NULL) 79 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10 80 | 81 | cat('Assigning MP to each cell\n') 82 | set.seed(4387) 83 | scores <- cells[ 84 | cell_type %in% cts, 85 | if(.N >= 100) c( # Cut-off of 100 cells of this cell type in this sample 86 | .(cell_name = cell_name), 87 | slapply( 88 | meta_programs[[unique(cell_type)]], 89 | function(mp) { 90 | if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp) 91 | if(sum(mp %in% rownames(expmat)) >= 30) sig_score(expmat[, cell_name], mp[mp %in% rownames(expmat)], nbin = 50, n = 50) 92 | } 93 | ) 94 | ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE), 95 | by = .(cell_type, sample) 96 | ] 97 | if(nrow(scores) == 0) return(NULL) 98 | # Assign MP to each cell (take the one with highest score): 99 | assignments <- scores[, .(max_score = max(score), meta_program = meta_program[which.max(score)]), by = .(cell_type, sample, cell_name)] 100 | assignments[, a_ct := ifelse(max_score < 1, as.character(NA), meta_program)] 101 | # In each cell type and sample, remove MPs that were assigned to less than 5% of those cells that were given an assignment (i.e. that had max 102 | # score >= 1): 103 | assignments[!is.na(a_ct), a_smpl := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.05], NA, a_ct)}, by = .(cell_type, sample)] 104 | # Similar but more lenient criterion for a_ct, to get rid of spurious assignments: 105 | assignments[!is.na(a_ct), a_ct := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.01], NA, a_ct)}, by = .(cell_type, sample)] 106 | setkey(assignments, cell_type, sample) 107 | 108 | cat('Constructing plot data\n') 109 | pie_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) { 110 | pie_out <- assignments[ct, {N_ct <- .N; .SD[, .(prop = .N/N_ct), by = a_ct]}][order(-prop)] 111 | pie_out[is.na(a_ct), a_ct := 'Unassigned'] 112 | pie_out[prop < 0.03, a_ct := 'Rare MPs (each <3%)'] 113 | pie_out[a_ct == 'Rare MPs (each <3%)', prop := sum(prop)] 114 | pie_out <- unique(pie_out) 115 | pie_out[, a_ct := factor(a_ct, levels = c(a_ct[!(a_ct %in% c('Rare MPs (each <3%)', 'Unassigned'))], 'Rare MPs (each <3%)', 'Unassigned'))] 116 | pie_out <- pie_out[order(a_ct)] 117 | pie_out[, cumprop := c(0, cumsum(prop)[-.N])] 118 | return(pie_out) 119 | }) 120 | bar_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) { 121 | slapply(assignments[ct, unique(sample)], function(smpl) { 122 | bar_out <- assignments[.(ct, as.character(smpl))][, {N_smpl <- .N; .SD[, .(prop = .N/N_smpl), by = a_smpl]}][!is.na(a_smpl)][order(prop)] 123 | if(nrow(bar_out) == 0) return(NULL) 124 | bar_out[, a_smpl := factor(a_smpl, levels = a_smpl)] 125 | return(bar_out) 126 | }) 127 | }) 128 | bar_data <- slapply(bar_data, function(x) x[!sapply(x, is.null)]) 129 | htmp_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) { 130 | slapply(assignments[ct, unique(sample)], function(smpl) { 131 | if(!(smpl %in% names(bar_data[[ct]]))) return(NULL) 132 | mps <- bar_data[[ct]][[as.character(smpl)]][, levels(a_smpl)] 133 | g <- slapply(meta_programs[[ct]][, ..mps], function(mp) { 134 | if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp) 135 | mp[mp %in% rownames(expmat)] 136 | }) 137 | asub <- assignments[!is.na(a_smpl)][.(ct, as.character(smpl))] 138 | if(nrow(asub) > 100) asub <- asub[, {n <- floor(100*.N/nrow(asub)); .SD[sample(1:.N, n, replace = FALSE)]}, by = a_smpl] 139 | htmp_out_list <- slapply(mps, function(mp_name) { 140 | mat <- expmat[g[[mp_name]], asub$cell_name, drop = FALSE] 141 | mat_sub <- mat[, asub[a_smpl == mp_name, cell_name], drop = FALSE] 142 | dt <- as.data.table(as.matrix(mat - rowMeans(mat)), keep.rownames = 'gene') # centre per gene and convert to data.table 143 | dt <- melt(dt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value')[, a_smpl := mp_name] 144 | return(list(data = dt, genes = names(sort(rowMeans(mat_sub))), cells = names(sort(colMeans(mat_sub))))) 145 | }) 146 | htmp_out <- rbindlist(lapply(htmp_out_list, `[[`, 'data')) 147 | htmp_out[, c('gene_a', 'a_smpl') := .(paste(gene, gsub(' |-|/|\\(|\\)', '', a_smpl), sep = '_'), factor(a_smpl, levels = rev(mps)))] 148 | cells_ord <- unlist(lapply(htmp_out_list[rev(mps)], `[[`, 'cells')) 149 | genes_ord <- unlist(lapply(mps, function(mp_name) paste(htmp_out_list[[mp_name]]$genes, gsub(' |-|/|\\(|\\)', '', mp_name), sep = '_'))) 150 | htmp_out[, c('gene_a', 'cell_name') := .(factor(gene_a, levels = genes_ord), factor(cell_name, levels = cells_ord))] 151 | htmp_out[, gene_num := as.numeric(gene_a)] 152 | return(htmp_out) 153 | }) 154 | }) 155 | htmp_data <- slapply(htmp_data, function(x) x[!sapply(x, is.null)]) 156 | 157 | cat('Done!\n') 158 | return(list(scores = scores, assignments = assignments, pie_data = pie_data, bar_data = bar_data, heatmap_data = htmp_data, path = p)) 159 | 160 | }) 161 | 162 | if(!all(sapply(out, function(x) is.null(x$assignments)))) { 163 | 164 | ct <- gsub('/', '-', r[2]) 165 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))} 166 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))} 167 | 168 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_dist.rds')) 169 | 170 | } 171 | --------------------------------------------------------------------------------