├── 3CA_v2
    ├── MP_analysis_new_batch.R
    ├── README.md
    ├── cc_analysis.R
    ├── cc_analysis_consensus_comparison.R
    ├── ct_specificity.R
    ├── gene_analysis.R
    ├── mp_new.R
    └── pseudobulks.R
├── ITH_hallmarks
    ├── Generating_MPs
    │   ├── Generate_Meta_Programs.R
    │   ├── Genes_nmf_w_basis_example.RDS
    │   ├── custom_magma.R
    │   └── robust_nmf_programs.R
    ├── MPs_distribution
    │   ├── MP_distribution.R
    │   ├── MP_list.RDS.gz
    │   ├── My_study.RDS.gz
    │   └── heatCols.RDS.gz
    ├── README.md
    └── TCGA_analysis
    │   ├── functions.R
    │   ├── pdac_classical_main.R
    │   ├── preprocess_clinical_data.R
    │   ├── preprocess_mutations.R
    │   ├── preprocess_tcga_data.R
    │   ├── study_contribution_per_MP.RDS
    │   ├── study_tcga_map.R
    │   ├── tcga_clinical.R
    │   ├── tcga_clinical_deconv.R
    │   ├── tcga_clinical_meta.R
    │   ├── tcga_mp_scores.R
    │   ├── tcga_mp_scores_deconv.R
    │   ├── tcga_mutations_new.R
    │   └── tcga_subtypes_data.R
├── LICENSE
├── README.md
├── aliases_pubmed.R
├── aliases_table.R
├── bins.R
├── canonical_markers.R
├── cc_sigs_consensus.R
├── cna_mat_prep.R
├── functions.R
├── gene_ave.R
├── gene_mp_cor.R
├── gene_mp_cor_all_web.R
├── gene_plots_cell_types.R
├── gene_plots_data_all_web.R
├── gene_plots_render_web.R
├── gene_plots_rmd_web.R
├── mp_scores_all.R
├── mp_scores_prep.R
├── study_plots_cc.R
├── study_plots_cna.R
├── study_plots_ct_comp.R
├── study_plots_ct_umap.R
├── study_plots_data_cc.R
├── study_plots_data_cc_consensus.R
├── study_plots_data_cc_thresh.R
├── study_plots_data_cc_thresh_consensus.R
├── study_plots_data_cna.R
├── study_plots_data_ct_comp.R
├── study_plots_data_ct_umap.R
├── study_plots_data_dist.R
└── study_plots_dist.R


/3CA_v2/README.md:
--------------------------------------------------------------------------------
1 | # Analysis with 3CA version 2
2 | 
3 | The 3CA_v2 directory contains code for analyses conducted on the expanded version of 3CA, including: cell cycle analysis; measurement of gene specificity and sensitivity; quantification of cell type specificity; and an updated characterisation of recurrent expression programs (meta-programs/MPs). All analyses were run using R version 4.1.1.
4 | 


--------------------------------------------------------------------------------
/3CA_v2/cc_analysis_consensus_comparison.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(ggplot2)
  3 | library(magrittr)
  4 | library(Matrix)
  5 | library(stringr)
  6 | library(plyr)
  7 | library(cowplot)
  8 | library(RColorBrewer)
  9 | library(scales)
 10 | library(readxl)
 11 | library(ggpubr)
 12 | library(ggtext)
 13 | library(matkot)
 14 | 
 15 | try(library(randomcoloR), silent = TRUE)
 16 | 
 17 | source('functions.R')
 18 | 
 19 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | diseases <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) {
 26 |     cat(r, '\n')
 27 |     samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
 28 |     if(file.exists(samples_path)) {
 29 |         samples <- fread(samples_path)[!is.na(sample)] # Setting na.strings = '' doesn't seem to work here...
 30 |         samples[, sample := as.character(sample)]
 31 |         samples[cancer_type == '', cancer_type := NA]
 32 |         setkey(samples, sample)
 33 |     } else {
 34 |         warning("Samples file doesn't exist for ", r[1], ", ", r[2])
 35 |         return(NULL)
 36 |     }
 37 |     if('cancer_type' %in% names(samples)) {
 38 |         return(samples[, .(study = r[1], cancer_type = r[2], sample = sample, disease = cancer_type)])
 39 |     } else {
 40 |         warning("Samples file for ", r[1], ", ", r[2], " doesn't contain cancer_type column")
 41 |         return(NULL)
 42 |     }
 43 | }) %>% rbindlist
 44 | to_exclude <- list(c('Chen et al. 2020', 'Head and Neck'), c('Sun et al. 2021', 'Liver/Biliary')) %>%
 45 |     transpose %>% as.data.table %>% setNames(c('study', 'cancer_type'))
 46 | setkey(diseases, study, cancer_type)
 47 | diseases_to_include <- diseases[!to_exclude][
 48 |     !is.na(disease) & !(disease %in% c('Normal', 'Premalignant')),
 49 |     .SD[, .(n_sample = .N, n_study = nrow(unique(.SD))), by = disease, .SDcols = c('study', 'cancer_type')][
 50 |         (n_study >= 2 & n_sample >= 10) | (n_sample >= 20),
 51 |         disease
 52 |     ]
 53 | ]
 54 | to_include <- diseases[!to_exclude][disease %in% diseases_to_include, .(study, cancer_type)] %>% unique
 55 | 
 56 | 
 57 | 
 58 | 
 59 | 
 60 | set.seed(1852)
 61 | cc_prop <- lapply(c('data_cc_consensus.rds', 'data_cc.RDS'), function(fn) rbindlist(lapply(transpose(as.list(to_include)), function(r) {
 62 |     
 63 |     cat(r, '\n')
 64 |     
 65 |     # Only difference is to change 'data_cc_consensus.rds' to 'data_cc.RDS' in the following two lines:
 66 |     if(!(fn %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
 67 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/', fn))
 68 |     
 69 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 70 |     if(all(nullcond)) return(NULL)
 71 |     
 72 |     paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
 73 |     
 74 |     samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
 75 |     if(file.exists(samples_path)) {
 76 |         samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '')[!is.na(sample) & cancer_type %in% diseases_to_include]
 77 |         setkey(samples, sample)
 78 |     }
 79 |     
 80 |     # Split HNSCC samples into HPV pos and neg:
 81 |     if(all(r == c('Puram et al. 2017', 'Head and Neck'))) {
 82 |         samples <- samples[sample %in% c('5', '6', '16', '17', '18', '20', '22', '25', '26', '28')][, cancer_type := 'HNSCC (HPV-neg.)']
 83 |     } else if(all(r == c('Cillo et al. 2020', 'Head and Neck'))) {
 84 |         samples[, cancer_type := ifelse(p16_status == 'p16+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
 85 |     } else if(all(r == c('Kürten et al. 2021', 'Head and Neck'))) {
 86 |         samples[, cancer_type := ifelse(additional_tumor_characterisics == 'HPV Positive', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
 87 |     } else if(all(r == c('Unpublished HNSCC HTAN', 'Head and Neck'))) {
 88 |         samples[, cancer_type := ifelse(hpv == 'HPV+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
 89 |     }
 90 |     
 91 |     # Split CRC and Gastric into MSS and MSI:
 92 |     if(all(r == c('Lee et al. 2020', 'Colorectal'))) {
 93 |         samples[, cancer_type := ifelse(grepl('mss', genetic_hormonal_features), 'CRC (MSS)', 'CRC (MSI)')]
 94 |     } else if(all(r == c('Pelka et al. 2021', 'Colorectal'))) {
 95 |         samples[, cancer_type := ifelse(MSIStatus == 'MSS', 'CRC (MSS)', 'CRC (MSI)')]
 96 |     } else if(all(r == c('Zhang et al. 2018', 'Colorectal'))) {
 97 |         samples[, cancer_type := ifelse(msi_status == 'MSS', 'CRC (MSS)', 'CRC (MSI)')]
 98 |     } else if(all(r == c('Kumar et al. 2022', 'Gastric'))) {
 99 |         samples[, cancer_type := ifelse(is.na(subtype) | subtype == '', NA, ifelse(subtype == 'MSI', 'Gastric (MSI)', 'Gastric (MSS)'))]
100 |     } else if(all(r == c('Zhang et al. 2019', 'Gastric'))) {
101 |         samples[, cancer_type := NA] # There's only 1 true gastric cancer sample in this dataset, and it's not annotated with MSI status.
102 |     }
103 |     
104 |     samples <- samples[!is.na(cancer_type)]; if(nrow(samples) == 0) return(NULL)
105 |     
106 |     rout <- lapply(which(!nullcond), function(i) {
107 |         
108 |         dt <- plot_data[[i]]$ccdata
109 |         dt <- dt[sample %in% samples$sample][, c('disease', 'tech') := do.call(`[`, list(samples, sample))[, .(cancer_type, technology)]]
110 |         if(!(nrow(dt) >= 100 & 'cell_type' %in% names(dt))) return(NULL)
111 |         dt <- dt[, .(cell_name, cell_type, sample, disease, tech, phase)]
112 |         if(any(c('Myeloid', 'Monocyte') %in% dt$cell_type)) dt[cell_type %in% c('Myeloid', 'Monocyte'), cell_type := 'Macrophage']
113 |         
114 |         # Add complexity column to dt:
115 |         cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
116 |         setkey(cells, cell_name)
117 |         if('complexity' %in% names(cells)) dt[, n_gene := do.call(`[`, list(cells, cell_name, 'complexity'))] else dt[, n_gene := as.numeric(NA)]
118 |         return(dt)
119 |         
120 |     }) %>% rbindlist
121 |     
122 |     if(is.null(rout) || nrow(rout) == 0) return(NULL)
123 |     
124 |     # Cell cycle proportions across all cells in a dataset (per disease and tech):
125 |     rout_all <- rout[, # Require at least 100 cells altogether, across samples
126 |         if(.N >= 100) .(sample = 'all', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'),
127 |             n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)),
128 |         by = .(cell_type, disease, tech)
129 |     ]
130 |     
131 |     # As above but imposing bounds on the number of cells per sample, so that samples with very many cells don't skew the average:
132 |     rout_all_b <- rout[,
133 |         if(.N >= 100) { # Require at least 100 cells altogether, across samples
134 |             bounds <- .SD[, .(N = .N), by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))]
135 |             .SD[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .SD else .SD[sample(1:.N, bounds[1])]}, by = sample][,
136 |                 if(.N >= 100) .(sample = 'all_b', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'),
137 |                     n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene))
138 |             ]
139 |         },
140 |         by = .(cell_type, disease, tech)
141 |     ]
142 |     
143 |     # Cell cycle proportions per sample:
144 |     rout_smpl <- rout[, # Require at least 100 cells per sample
145 |         if(.N >= 100) .(n_sample = NA, n_cell = .N, n_g1s = sum(phase == 'G1/S'), n_g2m = sum(phase == 'G2/M'),
146 |             n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)),
147 |         by = .(cell_type, disease, tech, sample)
148 |     ]
149 |     
150 |     rout <- rbindlist(list(rout_all, rout_all_b, rout_smpl)[c(nrow(rout_all) > 0, nrow(rout_all_b) > 0, nrow(rout_smpl) > 0)])
151 |     
152 |     if(!is.null(rout) && nrow(rout) > 0) {
153 |         rout <- unique(rout)
154 |         rout[, c('cancer_type', 'study') := .(r[2], r[1])]
155 |         setcolorder(rout, c('cancer_type', 'study'))
156 |         return(rout)
157 |     }
158 |     
159 | }))) %>% setNames(c('cons', 'bes'))
160 | 
161 | for(x in c('cons', 'bes')) {
162 |     cc_prop[[x]] <- cc_prop[[x]][!(cancer_type == 'Brain' & cell_type == 'Fibroblast')][, disease := mapvalues(
163 |         gsub(' Cancer', '', disease),
164 |         c('Acute Myeloid Leukemia', 'Chronic Myeloid Leukemia', 'Clear Cell Renal Cell Carcinoma', 'Colorectal', 'Cutaneous Basal Cell Carcinoma',
165 |             'Cutaneous Squamous Cell Carcinoma', 'Diffuse Large B Cell Lymphoma', 'Glioblastoma', 'Hepatocellular Carcinoma', 'Lung Adenocarcinoma',
166 |             'Lung Squamous Cell Carcinoma', 'Multiple Myeloma', 'Neuroendocrine Tumor', 'Pancreatic Ductal Adenocarcinoma', 'Small Cell Lung'),
167 |         c('AML', 'CML', 'ccRCC', 'CRC', 'Skin BCC', 'Skin SCC', 'DLBCL', 'GBM', 'HCC', 'Lung Adeno.', 'Lung Squamous', 'MM', 'NET', 'PDAC', 'SCLC'),
168 |         warn_missing = FALSE
169 |     )]
170 | }
171 | 
172 | common_cts <- cc_prop$cons[sample == 'all_b', .(n = nrow(unique(.SD))), by = cell_type, .SDcols = c('study', 'disease')][n >= 10, cell_type]
173 | 
174 | 
175 | 
176 | 
177 | 
178 | cor_data <- do.call(merge, lapply(names(cc_prop), function(x) rbind(
179 |     cc_prop[[x]][
180 |         sample == 'all_b' & cell_type %in% common_cts,
181 |         setNames(.('Cell cycle proportion', weighted.mean((n_g1s + n_g2m + n_int)/n_cell, n_sample)), c('v', paste0('val_', x))),
182 |         by = .(cell_type, study, disease)
183 |     ],
184 |     cc_prop[[x]][
185 |         sample == 'all_b' & cell_type %in% common_cts & n_g1s + n_g2m + n_int >= 50,
186 |         setNames(.('Phase bias', weighted.mean((n_g1s - n_g2m)/(n_g1s + n_g2m + n_int), n_sample)), c('v', paste0('val_', x))),
187 |         by = .(cell_type, study, disease)
188 |     ]
189 | )))
190 | 
191 | ggplot(cor_data, aes(x = val_cons, y = val_bes)) +
192 |     geom_point(size = 3, shape = 21, fill = 'lightblue') +
193 |     facet_wrap(vars(v), nrow = 1, ncol = 2, scales = 'free') +
194 |     geom_text(
195 |         aes(x = x, y = y, label = l),
196 |         cor_data[,
197 |             .(x = min(val_cons) + 0.18*diff(range(val_cons)), y = min(val_bes) + 0.9*diff(range(val_bes)),
198 |                 l = paste('r =', signif(cor(val_bes, val_cons), 2))),
199 |             by = v
200 |         ],
201 |         size = 8
202 |     ) +
203 |     geom_abline(slope = 1, intercept = 0, colour = 'tomato', linewidth = 1) +
204 |     theme_bw() +
205 |     theme(
206 |         axis.text = element_text(size = 14),
207 |         axis.title = element_text(size = 16),
208 |         plot.title = element_text(size = 18),
209 |         strip.text = element_text(size = 16),
210 |         strip.background = element_rect(fill = 'grey95'),
211 |         aspect.ratio = 1
212 |     ) +
213 |     labs(x = 'Consensus signature', y = 'Bespoke signature', title = 'Cell cycle quantification with bespoke and consensus gene signatures')
214 | 


--------------------------------------------------------------------------------
/3CA_v2/pseudobulks.R:
--------------------------------------------------------------------------------
 1 | # The command line arguments should supply study name and cancer type (in that order).
 2 | r = commandArgs(trailingOnly = TRUE)
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | library(data.table)
 9 | library(magrittr)
10 | library(plyr)
11 | library(stringr)
12 | library(Matrix)
13 | library(matkot)
14 | 
15 | source('functions.R')
16 | 
17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
18 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
19 | 
20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
22 | alias_table <- make_alias_table(hgnc_complete_set)
23 | 
24 | genes_all <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)]
25 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
26 | rabbr <- paste(gsub(' | et al. ', '', r[1]), gsub(' |/', '-', r[2]), sep = '_')
27 | 
28 | 
29 | 
30 | 
31 | 
32 | pbulks <- lapply(paths$group, function(g) {
33 |     
34 |     cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
35 |     if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL)
36 |     if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
37 |     genes <- fread(paths[g, genes], header = FALSE)$V1
38 |     expmat <- readMM(paths[g, expmat])
39 |     expmat <- expmat[genes != '', ] # In case any genes are empty strings
40 |     genes <- genes[genes != '']
41 |     expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names
42 |     genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
43 |     genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
44 |     rownames(expmat) <- genes
45 |     colnames(expmat) <- cells$cell_name
46 |     
47 |     genes <- genes[genes %in% genes_all & genes %in% names(table(genes))[table(genes) == 1]]
48 |     
49 |     cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells
50 |     setkey(cells, sample, cell_type)
51 |     cells <- cells[cells[, .N, by = .(sample, cell_type)][N >= 30, -'N']]
52 |     
53 |     if(nrow(cells) < 30) return(NULL)
54 |     expmat <- round(1e+05*to_frac(expmat[genes, cells$cell_name]), 4) # Normalise to TPM/10
55 |     
56 |     Reduce(merge, apply(unique(cells[, .(sample, cell_type)]), 1, function(sct) {
57 |         out <- as.data.table(rowMeans(expmat[, cells[as.list(sct), cell_name]]), keep.rownames = TRUE)
58 |         names(out) <- c('gene', paste(rabbr, g, sct[1], gsub('_', '', sct[2]), sep = '_'))
59 |         return(out)
60 |     }, simplify = FALSE))
61 |     
62 | })
63 | 
64 | pbulks <- Reduce(function(x, y) merge(x, y, by = 'gene', all = TRUE), pbulks[!sapply(pbulks, is.null)])
65 | 
66 | if(!is.null(pbulks)) fwrite(pbulks, paste0('../data/pseudobulks/', rabbr, '.csv'))
67 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/Generate_Meta_Programs.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # ----------------------------------------------------------------------------------------------------
  3 | # Load packages and functions  
  4 | # ----------------------------------------------------------------------------------------------------
  5 | 
  6 | library(reshape2)
  7 | library(NMF)
  8 | library(ggplot2)
  9 | library(scales)
 10 | source("custom_magma.R")
 11 | source("robust_nmf_programs.R")   
 12 | 
 13 | 
 14 | ## Input:
 15 | # Genes_nmf_w_basis is a list in which each entry contains NMF gene-scores of a single sample. In our study we ran NMF using ranks 4-9 on the top 7000 genes in each sample. Hence each entry in Genes_nmf_w_basis is a matrix with 7000 rows (genes) X 39 columns (NMF programs)  
 16 | # For the code below to run smoothly, please use the following nomenclature:
 17 |   # 1) End entry names in Genes_nmf_w_basis (i.e. each sample name) with '_rank4_9_nruns10.RDS' 
 18 |   # 2) End each matrix column name with an extension that represents the NMF rank and program index. for example '_rank4_9_nrun10.RDS.4.1' to represent the first NMF program obtained using rank=4, or '_rank4_9_nrun10.RDS.6.5' to represent the fifth NMF program obtained using rank=6    
 19 | # See Genes_nmf_w_basis_example.RDS for an example 
 20 | # We define MPs in 2 steps:
 21 |   # 1) The function robust_nmf_programs.R performs filtering, so that programs selected for defining MPs are:
 22 |   #    i) Robust                - recur in more that one rank within the sample 
 23 |   #    ii) Non-redundant        - once a NMF program is selected, other programs within the sample that are similar to it are removed
 24 |   #    iii) Not sample specific - has similarity to NMF programs in other samples 
 25 |   # ** Please see https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs 
 26 |   # 2) Selected NMFs are then clustered iteratively, as described in Figure S1. At the end of the process, each cluster generates a list of the 50 genes (i.e. the MP) that represent the NMF programs that contributed to the cluster. Notably, not all initially selected NMFs end up participating in a cluster  
 27 | 
 28 | 
 29 | # ----------------------------------------------------------------------------------------------------
 30 | # Select NMF programs
 31 | # ----------------------------------------------------------------------------------------------------
 32 | 
 33 | 
 34 | ## Parameters 
 35 | intra_min_parameter <- 35
 36 | intra_max_parameter <- 10
 37 | inter_min_parameter <- 10
 38 | 
 39 | 
 40 | # get top 50 genes for each NMF program 
 41 | nmf_programs          <- lapply(Genes_nmf_w_basis, function(x) apply(x, 2, function(y) names(sort(y, decreasing = T))[1:50]))
 42 | nmf_programs          <- lapply(nmf_programs,toupper) ## convert all genes to uppercase 
 43 | 
 44 | # for each sample, select robust NMF programs (i.e. observed using different ranks in the same sample), remove redundancy due to multiple ranks, and apply a filter based on the similarity to programs from other samples. 
 45 | nmf_filter_ccle       <- robust_nmf_programs(nmf_programs, intra_min = intra_min_parameter, intra_max = intra_max_parameter, inter_filter=T, inter_min = inter_min_parameter)  
 46 | nmf_programs          <- lapply(nmf_programs, function(x) x[, is.element(colnames(x), nmf_filter_ccle),drop=F])
 47 | nmf_programs          <- do.call(cbind, nmf_programs)
 48 | 
 49 | # calculate similarity between programs
 50 | nmf_intersect         <- apply(nmf_programs , 2, function(x) apply(nmf_programs , 2, function(y) length(intersect(x,y)))) 
 51 | 
 52 | # hierarchical clustering of the similarity matrix 
 53 | nmf_intersect_hc     <- hclust(as.dist(50-nmf_intersect), method="average") 
 54 | nmf_intersect_hc     <- reorder(as.dendrogram(nmf_intersect_hc), colMeans(nmf_intersect))
 55 | nmf_intersect        <- nmf_intersect[order.dendrogram(nmf_intersect_hc), order.dendrogram(nmf_intersect_hc)]
 56 | 
 57 | 
 58 | 
 59 | # ----------------------------------------------------------------------------------------------------
 60 | # Cluster selected NMF programs to generate MPs
 61 | # ----------------------------------------------------------------------------------------------------
 62 | 
 63 | 
 64 | ### Parameters for clustering
 65 | Min_intersect_initial <- 10    # the minimal intersection cutoff for defining the first NMF program in a cluster
 66 | Min_intersect_cluster <- 10    # the minimal intersection cutoff for adding a new NMF to the forming cluster 
 67 | Min_group_size        <- 5     # the minimal group size to consider for defining the first NMF program in a cluster 
 68 | 
 69 | Sorted_intersection       <-  sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1)  ) , decreasing = TRUE)
 70 | 
 71 | Cluster_list              <- list()   ### Every entry contains the NMFs of a chosen cluster
 72 | MP_list                   <- list()
 73 | k                         <- 1
 74 | Curr_cluster              <- c()
 75 | 
 76 | nmf_intersect_original    <- nmf_intersect
 77 | 
 78 | while (Sorted_intersection[1]>Min_group_size) {  
 79 |   
 80 |   Curr_cluster <- c(Curr_cluster , names(Sorted_intersection[1]))
 81 |   
 82 |   ### intersection between all remaining NMFs and Genes in MP 
 83 |   Genes_MP                    <- nmf_programs[,names(Sorted_intersection[1])] # Genes in the forming MP are first chosen to be those in the first NMF. Genes_MP always has only 50 genes and evolves during the formation of the cluster
 84 |   nmf_programs                <- nmf_programs[,-match(names(Sorted_intersection[1]) , colnames(nmf_programs))]  # remove selected NMF
 85 |   Intersection_with_Genes_MP  <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP  
 86 |   NMF_history                 <- Genes_MP  # has genes in all NMFs in the current cluster, for redefining Genes_MP after adding a new NMF 
 87 |   
 88 |   ### Create gene list is composed of intersecting genes (in descending order by frequency). When the number of genes with a given frequency span bewond the 50th genes, they are sorted according to their NMF score.    
 89 |   while ( Intersection_with_Genes_MP[1] >= Min_intersect_cluster) {  
 90 |     
 91 |     Curr_cluster  <- c(Curr_cluster , names(Intersection_with_Genes_MP)[1])
 92 |     
 93 |     Genes_MP_temp   <- sort(table(c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]])), decreasing = TRUE)   ## Genes_MP is newly defined each time according to all NMFs in the current cluster 
 94 |     Genes_at_border <- Genes_MP_temp[which(Genes_MP_temp == Genes_MP_temp[50])]   ### genes with overlap equal to the 50th gene
 95 |     
 96 |     if (length(Genes_at_border)>1){
 97 |       ### Sort last genes in Genes_at_border according to maximal NMF gene scores
 98 |       ### Run across all NMF programs in Curr_cluster and extract NMF scores for each gene
 99 |       Genes_curr_NMF_score <- c()
100 |       for (i in Curr_cluster) {
101 |         curr_study           <- paste( strsplit(i , "[.]")[[1]][1 : which(strsplit(i , "[.]")[[1]] == "RDS")]   , collapse = "."  )
102 |         Q                    <- Genes_nmf_w_basis[[curr_study]][ match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]])))[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))]   ,i] 
103 |         names(Q)             <- names(Genes_at_border[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))])  ### sometimes when adding genes the names do not appear 
104 |         Genes_curr_NMF_score <- c(Genes_curr_NMF_score,  Q )
105 |       }
106 |       Genes_curr_NMF_score_sort <- sort(Genes_curr_NMF_score , decreasing = TRUE)
107 |       Genes_curr_NMF_score_sort <- Genes_curr_NMF_score_sort[unique(names(Genes_curr_NMF_score_sort))]   
108 |       
109 |       Genes_MP_temp             <- c(names(Genes_MP_temp[which(Genes_MP_temp > Genes_MP_temp[50])]) , names(Genes_curr_NMF_score_sort))
110 |       
111 |     } else {
112 |       Genes_MP_temp <- names(Genes_MP_temp)[1:50] 
113 |     }
114 |     
115 |     NMF_history     <- c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]]) 
116 |     Genes_MP        <- Genes_MP_temp[1:50]
117 |     
118 |     nmf_programs    <- nmf_programs[,-match(names(Intersection_with_Genes_MP)[1] , colnames(nmf_programs))]  # remove selected NMF
119 |     
120 |     Intersection_with_Genes_MP <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP  
121 |     
122 |   }
123 |   
124 |   Cluster_list[[paste0("Cluster_",k)]] <- Curr_cluster
125 |   MP_list[[paste0("MP_",k)]]           <- Genes_MP
126 |   k <- k+1
127 |   
128 |   nmf_intersect             <- nmf_intersect[-match(Curr_cluster,rownames(nmf_intersect) ) , -match(Curr_cluster,colnames(nmf_intersect) ) ]  # Remove current chosen cluster
129 |   
130 |   Sorted_intersection       <-  sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1)  ) , decreasing = TRUE)   # Sort intersection of remaining NMFs not included in any of the previous clusters
131 |   
132 |   Curr_cluster <- c()
133 |   print(dim(nmf_intersect)[2])
134 | }
135 | 
136 | 
137 | 
138 | ####  Sort Jaccard similarity plot according to new clusters:
139 | 
140 | inds_sorted <- c()
141 | 
142 | for (j in 1:length(Cluster_list)){
143 |   
144 |   inds_sorted <- c(inds_sorted , match(Cluster_list[[j]] , colnames(nmf_intersect_original)))
145 |   
146 | }
147 | inds_new <- c(inds_sorted   ,   which(is.na( match(1:dim(nmf_intersect_original)[2],inds_sorted)))) ### clustered NMFs will appear first, and the latter are the NMFs that were not clustered
148 | 
149 | nmf_intersect_meltI_NEW <- reshape2::melt(nmf_intersect_original[inds_new,inds_new]) 
150 | 
151 | ggplot(data = nmf_intersect_meltI_NEW, aes(x=Var1, y=Var2, fill=100*value/(100-value), color=100*value/(100-value))) + 
152 |   geom_tile() + 
153 |   scale_color_gradient2(limits=c(2,25), low=custom_magma[1:111],  mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") +                                
154 |   scale_fill_gradient2(limits=c(2,25), low=custom_magma[1:111],  mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)")  +
155 |   theme( axis.ticks = element_blank(), panel.border = element_rect(fill=F), panel.background = element_blank(),  axis.line = element_blank(), axis.text = element_text(size = 11), axis.title = element_text(size = 12), legend.title = element_text(size=11), legend.text = element_text(size = 10), legend.text.align = 0.5, legend.justification = "bottom") + 
156 |   theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + 
157 |   theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + 
158 |   guides(fill = guide_colourbar(barheight = 4, barwidth = 1))
159 | 
160 | 
161 | MP_list <-  do.call(cbind, MP_list)
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS


--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/custom_magma.R:
--------------------------------------------------------------------------------
1 | # Custom color palette
2 | library(RColorBrewer)
3 | library(viridis)
4 | custom_magma <- c(colorRampPalette(c("white", rev(magma(323, begin = 0.15))[1]))(10), rev(magma(323, begin = 0.18)))


--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/robust_nmf_programs.R:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------------------------
 2 | # Function for selecting robust nonnegative matrix factorization (NMF) programs
 3 | # ------------------------------------------------------------------------------------------- 
 4 | 
 5 | # - nmf_programs = a list; each element contains a matrix with NMF programs (top 50 genes) generated for a specific cell line using different NMF factorization ranks. 
 6 | # - intra_min = minimum overlap with a program from the same cell line (for selecting robust programs)
 7 | # - intra_max = maximum overlap with a program from the same cell line (for removing redundant programs)
 8 | # - inter_filter = logical; indicates whether programs should be filtered based on their similarity to programs of other cell lines
 9 | # - inter_min = minimum overlap with a program from another cell line 
10 | 
11 | # Returns a character vector with the NMF programs selected
12 | 
13 | robust_nmf_programs <- function(nmf_programs, intra_min = 35, intra_max = 10, inter_filter=T, inter_min = 10) {
14 |   
15 |   # Select NMF programs based on the minimum overlap with other NMF programs from the same cell line
16 |   intra_intersect <- lapply(nmf_programs, function(z) apply(z, 2, function(x) apply(z, 2, function(y) length(intersect(x,y))))) 
17 |   intra_intersect_max <- lapply(intra_intersect, function(x) apply(x, 2, function(y) sort(y, decreasing = T)[2]))             
18 |   nmf_sel <- lapply(names(nmf_programs), function(x) nmf_programs[[x]][,intra_intersect_max[[x]]>=intra_min]) 
19 |   names(nmf_sel) <- names(nmf_programs)
20 |   
21 |   # Select NMF programs based on i) the maximum overlap with other NMF programs from the same cell line and
22 |   # ii) the minimum overlap with programs from another cell line
23 |   nmf_sel_unlist <- do.call(cbind, nmf_sel)
24 |   inter_intersect <- apply(nmf_sel_unlist , 2, function(x) apply(nmf_sel_unlist , 2, function(y) length(intersect(x,y)))) ## calculating intersection between all programs
25 |   
26 |   final_filter <- NULL 
27 |   for(i in names(nmf_sel)) {
28 |     a <- inter_intersect[grep(i, colnames(inter_intersect), invert = T),grep(i, colnames(inter_intersect))]
29 |     b <- sort(apply(a, 2, max), decreasing = T) # for each cell line, ranks programs based on their maximum overlap with programs of other cell lines
30 |     if(inter_filter==T) b <- b[b>=inter_min] # selects programs with a maximum intersection of at least 10
31 |     if(length(b) > 1) {
32 |       c <- names(b[1]) 
33 |       for(y in 2:length(b)) {
34 |         if(max(inter_intersect[c,names(b[y])]) <= intra_max) c <- c(c,names(b[y])) # selects programs iteratively from top-down. Only selects programs that have a intersection smaller than 10 with a previously selected programs
35 |       }
36 |       final_filter <- c(final_filter, c)
37 |     } else {
38 |       final_filter <- c(final_filter, names(b))
39 |     }
40 |   }
41 |   return(final_filter)                                                      
42 | }
43 | 
44 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/MP_distribution.R:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------------------------
  2 | # Program for depicting the MP distribution of a given cell types across and within samples 
  3 | # ------------------------------------------------------------------------------------------- 
  4 | 
  5 | # My_study  = a list in which each element represents a sample in the study; the elements are expression matrices of a given cell type (i.e. cell_type below) in CPM or TPM units. Row names in each matrix should be gene symbols, and column names the cell IDs. 
  6 | # cell_type = one of the following cell types : "Cancer" , "Endothelial" , "Epithelial" , "Fibroblasts" , "Macrophages" , "CD4_T_cells", "CD8_T_cells" , "B_cells". Here "Cancer" represents malignant cells
  7 | # MinGenes  = the min number of MP genes that should exist in the study (default is 25)
  8 | # MinScore  = the minimal score for assigning a cell to a MP (default is 1). A cell is assigned to the MP with the maximal score, given that it exceeded MinScore. If the maximal score is below MinScore, the cell is unassigned. 
  9 | # MinCells  = the minimal % of cells that should be assigned to a MP in order to account for the MP (default is 0.05)
 10 | # MP_list   = a list of the MPs for each cell_type
 11 | 
 12 | ### Output: 
 13 | # (a) Pie chart of the MP distribution across the study
 14 | # (b) Bar plot per tumor with the % of cells per MP
 15 | # (c) Expression heatmap per tumor 
 16 | 
 17 | 
 18 | 
 19 | # ------------------------------------------------------------------------------------------- 
 20 | 
 21 | 
 22 | library(ggplot2)
 23 | library(scalop)  # see https://rdrr.io/github/jlaffy/scalop/man/sigScores.html
 24 | library(gridExtra)
 25 | library(ggpubr)
 26 | 
 27 | 
 28 | 
 29 | ### Define the following:
 30 | # My_study  =
 31 | # cell_type = 
 32 | # MP_list   = readRDS("MP_list.RDS")   # can be found in the Github repository
 33 | # heatCols  = readRDS("heatCols.RDS")  # can be found in the Github repository
 34 | 
 35 | 
 36 | Score_cells <- function(L,
 37 |                         cell_type,
 38 |                         MinGenes = 25,
 39 |                         MinScore = 1,
 40 |                         MinCells = 0.05,
 41 |                         MP_list
 42 |                         ) 
 43 |   {
 44 |   
 45 |   MP_list <- MP_list[[cell_type]]
 46 |   
 47 |   L <- lapply(L, function(x) log2((x/10) + 1) ) # Log normalize before scoring 
 48 |   
 49 |   MP_scores_per_sample    <- lapply(L, function(x)  scalop::sigScores(x, sigs = MP_list, conserved.genes = MinGenes/50 )  )
 50 |   
 51 |   remove_cells <- function(x,MinScore){         # remove cells whose max score was below MinScore
 52 |     max_score <- apply(x, 1, function(y) max(y))
 53 |     cells_rm  <- which(max_score < MinScore)
 54 |     if (length(cells_rm) > 0 ){
 55 |       x <- x[-cells_rm , ]
 56 |     }
 57 |     return(x)
 58 |   }
 59 |   MP_scores_per_sample <- lapply(MP_scores_per_sample, function(x) remove_cells(x,MinScore))
 60 |   
 61 |   Assign_MP_per_cell   <- lapply(MP_scores_per_sample, function(x) apply(x, 1, function(y) colnames(x)[which(y==max(y))] ) ) 
 62 |   
 63 |   filter_cells <- function(x,MinCells){         # remove MP that were assassin to less than MinCells in each sample
 64 |     MP_frequency <- as.numeric(ave(x, x, FUN = length))
 65 |     MP_rm        <- which(MP_frequency/length(x) < MinCells)  # MPs to be removed
 66 |     if (length(MP_rm)>0){
 67 |       x <- x[-MP_rm]
 68 |     }
 69 |     return(x)
 70 |   }
 71 |   Assign_MP_per_cell_filtered <- lapply(Assign_MP_per_cell, function(x) filter_cells(x,MinCells)) 
 72 |   
 73 |   return(Assign_MP_per_cell_filtered)
 74 |   
 75 | }
 76 | 
 77 | 
 78 | Assign_MP_per_cell_filtered <- Score_cells(L = My_study , cell_type = "Cancer" , MP_list=MP_list)
 79 | 
 80 | 
 81 | 
 82 | #### Part A: pie chart of MPs across the whole study
 83 | MinCells <- 0.05 
 84 | df       <- data.frame(MPs = names(table(unlist(Assign_MP_per_cell_filtered)))  ,  frequency = as.vector(table(unlist(Assign_MP_per_cell_filtered))) )
 85 | MPs_rm   <- which(df$frequency/sum(df$frequency) < MinCells)  ## remove also MPs that were assassin to less than MinCells from the total  
 86 | if (length(MPs_rm > 0)){
 87 |   df$MPs[MPs_rm] <- paste0("Other MPs (<", MinCells*100, "%)")
 88 | }
 89 | 
 90 | 
 91 | blank_theme <- theme_minimal()+
 92 |   theme(
 93 |     axis.title.x = element_blank(),
 94 |     axis.title.y = element_blank(),
 95 |     panel.border = element_blank(),
 96 |     panel.grid=element_blank(),
 97 |     axis.ticks = element_blank(),
 98 |     plot.title=element_text(size=14, face="bold")
 99 |   )
100 | 
101 | 
102 | pie <- ggplot(df, aes(x="", y=frequency, fill=MPs)) + 
103 |        geom_bar(width = 1, stat = "identity") + 
104 |        coord_polar("y", start=0)+
105 |        blank_theme +
106 |        theme(axis.text.x=element_blank())
107 |   
108 | 
109 | pie
110 | 
111 | 
112 | #### Part B: Bar plot with MP distribution per sample
113 | 
114 | sample_num <- length(Assign_MP_per_cell_filtered)
115 | nCol       <- floor(sqrt(sample_num))
116 | MPs        <- unique(unlist(Assign_MP_per_cell_filtered))    
117 | 
118 | ### For arranging the full MP names as legend in the plot
119 | MP_list <- MP_list[[cell_type]]
120 | MPs     <- MPs[sort(match(MPs, names(MP_list)),index.return = T)$ix]
121 | 
122 | Assign_MP_per_cell_filtered_abbrev <- lapply(Assign_MP_per_cell_filtered, function(x) apply(as.data.frame(x) , 1 , function(x)  strsplit(x, "[ ]")[[1]][1]) )
123 | 
124 | df_list    <- lapply(Assign_MP_per_cell_filtered_abbrev, function(x) data.frame(MPs = names(table(x))  ,  frequency = as.vector(table(x))/sum(as.vector(table(x))) ))
125 | df_list2   <- lapply(Assign_MP_per_cell_filtered, function(x) data.frame(MPs = names(table(x))  ,  frequency = as.vector(table(x))/sum(as.vector(table(x))) ))
126 | 
127 | P <- lapply(seq_along(df_list), function(I) 
128 |      ggplot(data=df_list[[I]], aes(x=reorder(MPs,-frequency), y=frequency)) +
129 |      geom_bar(stat="identity")+
130 |      theme(axis.title.x=element_blank())+
131 |      ggtitle(names(df_list)[I])
132 |   )
133 | 
134 | 
135 | P
136 | 
137 | #### Part C: an expression heatmap for each sample
138 | 
139 | df_list_ordered  <- lapply(df_list, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,])  ## order MPs in each sample as in the bar plots, in decreasing order
140 | df_list_ordered2 <- lapply(df_list2, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,])  ## order MPs in each sample as in the bar plots, in decreasing order
141 | 
142 | 
143 | L1        <- lapply(seq_along(My_study), function(I) My_study[[I]][ , names(Assign_MP_per_cell_filtered[[I]])]) ## extract relevant cells that were assigned to an MP (at least 5%)
144 | names(L1) <- names(My_study)
145 | 
146 | L1 <- lapply(L1, function(x) log2((x/10) + 1) ) ## Log normalize 
147 | L1 <- lapply(L1, function(x) (x - rowMeans(x)) ) ## center
148 | 
149 | L2 <- lapply(seq_along(L1), function(I) L1[[I]][ , sort(match(Assign_MP_per_cell_filtered_abbrev[[I]],df_list_ordered[[I]]$MPs), index.return = T)$ix] )  # sort cells according to MPs in df_list_ordered
150 | names(L2) <- names(L1)
151 | 
152 | MP_genes_per_sample <- lapply(df_list_ordered2, function(x) unlist(MP_list[x$MPs]))  ### MP genes sorted by the MP order in df_list_ordered2
153 |   
154 | L_plot        <- lapply(seq_along(L2), function(I) L2[[I]][match(MP_genes_per_sample[[I]] , rownames(L2[[I]]))[!is.na(match(MP_genes_per_sample[[I]] , rownames(L2[[I]])))] , ])
155 | names(L_plot) <- names(L2)
156 | 
157 | color.scheme <- colorRampPalette(c(heatCols))(n=333)
158 | 
159 | Plot_heatmap <- function(M){ 
160 |   
161 |   M_new2        <- M
162 |   M_new2        <- apply(M_new2, 2, rev)
163 |   M_meltII      <-  reshape2::melt(t(M_new2)) 
164 |   M_meltII$Var2 <- factor(M_meltII$Var2)
165 |   
166 |   G <- ggplot(data = M_meltII, aes(x=Var1, y=Var2, fill=value, color=value)) + 
167 |     geom_tile() + 
168 |     scale_color_gradient2(limits=c(-4,4), low=color.scheme[1:111],  mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL) +                                
169 |     scale_fill_gradient2(limits=c(-4,4), low=color.scheme[1:111],  mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL)  +
170 |     theme(  panel.border = element_rect(fill=F), panel.background = element_blank(),  axis.line = element_blank(), axis.text = element_text(size = 8), axis.title = element_text(size = 8), legend.title = element_text(size=8), legend.text = element_text(size = 8), legend.text.align = 0.5, legend.justification = "bottom" ) + 
171 |     theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
172 |     theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank() )
173 |   
174 |   return(G)
175 | }
176 | 
177 | P1 <- lapply(L_plot, function(x) Plot_heatmap(x))
178 | 
179 | P_plot <- c()
180 | for (i in 1:length(P)){
181 |   P_plot <- c(P_plot , P[i] , P1[i])
182 | }
183 | 
184 | 
185 | P_plot <- do.call("ggarrange", c(P_plot, ncol=2, nrow = 3))
186 | 
187 | lapply(P_plot, function(x) 
188 |   
189 |   annotate_figure(x, left   = text_grob(paste(MPs , collapse = "\n"), 
190 |                                         color = "red", face = "bold", size = 10))
191 |   
192 | )
193 | 
194 | 
195 | 
196 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/MP_list.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/MP_list.RDS.gz


--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/My_study.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/My_study.RDS.gz


--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/heatCols.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/heatCols.RDS.gz


--------------------------------------------------------------------------------
/ITH_hallmarks/README.md:
--------------------------------------------------------------------------------
 1 | # Source code for Gavish et al.
 2 | 
 3 | The ITH_hallmarks folder contains code for reproducing the analyses in the Gavish et al. This code is arranged into 3 folders.
 4 | 
 5 | ## Generating_MPs
 6 | - Generate_Meta_Programs.R generates MPs from NMFs programs that were calculated for each sample using different ranks. The NMF programs were calculated per sample using the ‘NMF’ R package:  
 7 | NMFs_per_sample = nmf(x = Expression_matrix, rank = 4:9, method="snmf/r", nrun = 10)  
 8 | NMF programs are listed in Genes_nmf_w_basis, where each entry contains NMF gene-scores of a single sample.  
 9 | Using the function robust_nmf_programs.R (described below) and a custom written clustering method, MPs are derived from the NMF programs.
10 | - Genes_nmf_w_basis_example.RDS is an example for how the NMF output is arranged. Note that each element name in the list ends with '_rank4_9_nruns10.RDS', and each matrix column ends with an extension that represents the NMF rank and program index. 
11 | - robust_nmf_programs.R filters outs redundant and non-robust NMF programs for each sample. The remaining NMF programs are those that appeared in more than one NMF rank within the sample and have similarity to a NMF program in a different sample. Please see also https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs.
12 | 
13 | ## MPs_distribution
14 | - MP_distribution.R depicts the meta-program distribution of a certain cell type within individual samples and across the whole study. It requires a list of scRNAseq expression matrices of different samples in a study (of a certain cell type).
15 | - MP_list.RDS is a list of meta-programs for each cell type required for inferring the distributions in MP_distribution.R
16 | - heatCols.RDS is the colormap used for plotting heatmaps in MP_distribution.R
17 | - My_study.RDS can be used for an example
18 | 
19 | ## TCGA_analysis
20 | These scripts are for analysing expression of the MPs in TCGA data. A few notes on the data used here:
21 | - Some of the external datasets used are annotated with their source (e.g. URL to paper) via comments in the code.
22 | - The 'hgnc_complete_set.txt' file can be downloaded from the HGNC website: <https://www.genenames.org/download/archive/>.
23 | - Other datasets used are available on request.
24 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/functions.R:
--------------------------------------------------------------------------------
 1 | make_alias_table <- function(hgnc_complete_set) {
 2 |     out <- hgnc_complete_set[,
 3 |         rbind(
 4 |             data.table(symbol_alt = str_split(alias_symbol, '\\|')[[1]], type = 'alias'),
 5 |             data.table(symbol_alt = str_split(prev_symbol, '\\|')[[1]], type = 'prev')
 6 |         ),
 7 |         by = symbol
 8 |     ][symbol_alt == '', symbol_alt := NA]
 9 |     setkey(out, symbol_alt)
10 |     return(out)
11 | }
12 | 
13 | update_symbols_fast <- function(symbols, alias_table) {
14 |     if(length(symbols) != length(unique(symbols))) warning('Not all symbols are unique!')
15 |     setkey(alias_table, symbol_alt)
16 |     symbols[grep('C[0-9]+ORF[0-9]+', symbols)] <- gsub('ORF', 'orf', symbols[grep('C[0-9]+ORF[0-9]+', symbols)])
17 |     cond <- !(symbols %in% alias_table$symbol)
18 |     # The following addition to <cond> ensures two different symbols don't map to the same "prev" symbol...
19 |     cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt])
20 |     # ...And the following makes sure we have all unique prev symbols:
21 |     cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]])
22 |     if(any(cond)) {
23 |         # alias_table[type == 'prev'][symbols[cond]]
24 |         symbols[cond] <- alias_table[type == 'prev'][symbols[cond]][,
25 |             .(newsymb = { # Match symbols that have at least one "prev" symbol that isn't already in symbols:
26 |                 cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)]
27 |                 ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such "prev" symbol
28 |             }),
29 |             by = symbol_alt
30 |         ]$newsymb
31 |     }
32 |     cond <- !(symbols %in% alias_table$symbol)
33 |     # As above, the following additions to <cond> ensure two different symbols don't map to the same "alias", and we have all unique aliases:
34 |     cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt])
35 |     cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]])
36 |     if(any(cond)) {
37 |         symbols[cond] <- alias_table[type == 'alias'][symbols[cond]][,
38 |             .(newsymb = { # Match symbols that have at least one alias that isn't already in symbols:
39 |                 cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)]
40 |                 ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such alias
41 |             }),
42 |             by = symbol_alt
43 |         ]$newsymb
44 |     }
45 |     return(symbols)
46 | }
47 | 
48 | dirr <- function(x) dir(x, recursive = TRUE)
49 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/pdac_classical_main.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(readxl)
  3 | library(magrittr)
  4 | library(stringr)
  5 | library(plyr)
  6 | library(ggplot2)
  7 | library(RColorBrewer)
  8 | library(scales)
  9 | library(cowplot)
 10 | library(randomcoloR)
 11 | library(matkot)
 12 | 
 13 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
 14 |     'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
 15 |     'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
 16 | 
 17 | cancer_genes <- as.data.table(read_xlsx('/home/labs/tirosh/tyler/pan_cancer/cancer5000.xlsx', skip = 1, n_max = 260))[`Cancer5000-S (219)` == 1, gene]
 18 | 
 19 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[
 20 |     !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
 21 | ][!grepl('unplaced', location), chrarm := str_extract(location, '[0-9]*[XY]*[pq]')][!is.na(chrarm)][
 22 |     order(as.numeric(mapvalues(gsub('[pq]', '', chrarm), c('X', 'Y'), c(23, 24))), str_extract(chrarm, '[pq]'))
 23 | ]
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 
 29 | meta_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Cells.csv', key = 'sample_id', na.strings = '')
 30 | scores_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Scores.csv')[grep('MP17 Interferon|PDAC-classical', meta_program)]
 31 | mut_data_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Mutation_data.csv')
 32 | mut_data_hnsc <- mut_data_hnsc[gene %in% cancer_genes]
 33 | setkey(mut_data_hnsc, gene, Variant_Classification)
 34 | mut_data_hnsc <- rbind(
 35 |     mut_data_hnsc[Variant_Classification %in% c('Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Nonstop_Mutation',
 36 |         'Translation_Start_Site')],
 37 |     mut_data_hnsc[
 38 |         mut_data_hnsc[
 39 |             Variant_Classification %in% c('Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins'),
 40 |             .(n = length(unique(patient_id))),
 41 |             by = .(gene, Variant_Classification)
 42 |         ][n >= 2, -'n']
 43 |     ]
 44 | )
 45 | pscores_hnsc <- copy(scores_hnsc)[, patient_id := do.call(`[`, args = list(meta_hnsc, sample_id))$patient_id][, -'sample_id']
 46 | setcolorder(pscores_hnsc, 'patient_id')
 47 | setkey(pscores_hnsc, patient_id)
 48 | pscores_hnsc <- pscores_hnsc[patient_id %in% mut_data_hnsc$patient_id]
 49 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pscores_hnsc$patient_id]
 50 | pids_tab <- pscores_hnsc[meta_program == meta_program[1], table(patient_id)]
 51 | pids <- names(pids_tab)[pids_tab == 1]
 52 | pscores_hnsc <- pscores_hnsc[patient_id %in% pids]
 53 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pids]
 54 | 
 55 | pscores_hnsc <- dcast(pscores_hnsc[, .(patient_id, meta_program, score)], patient_id ~ meta_program, value.var = 'score')
 56 | genes <- c('CASP8', 'NSD1')
 57 | pscores_hnsc[, (genes) := lapply(genes, function(g) patient_id %in% mut_data_hnsc[gene == g, patient_id])]
 58 | 
 59 | # See how scores for this MP match with the trinity of CASP8-mut, HRAS-mut and TP53-WT:
 60 | pscores_hnsc[,
 61 |     categ3 := patient_id %in% mut_data_hnsc[gene == 'CASP8', patient_id] &
 62 |         patient_id %in% mut_data_hnsc[gene == 'HRAS', patient_id] &
 63 |         !(patient_id %in% mut_data_hnsc[gene == 'TP53', patient_id]),
 64 |     by = patient_id
 65 | ]
 66 | 
 67 | bdata_hnsc_ifn <- melt(
 68 |     pscores_hnsc[, .(patient_id, `MP17 Interferon/MHC-II (I)`, CASP8, categ3)],
 69 |     id.vars = c('patient_id', 'MP17 Interferon/MHC-II (I)'),
 70 |     variable.name = 'event',
 71 |     value.name = 'event_pres'
 72 | )[, event := mapvalues(event, c('CASP8', 'categ3'), c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt'))]
 73 | box_hnsc_ifn <- ggplot(bdata_hnsc_ifn) +
 74 |     geom_boxplot(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), outlier.shape = NA) +
 75 |     geom_point(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), position = position_jitter(width = 0.3), shape = 21, fill = 'black',
 76 |         stroke = 0.3, alpha = 0.3) +
 77 |     geom_text(
 78 |         data = data.table(
 79 |             x = c(1.5, 1.5),
 80 |             y = c(3.1, 3.1),
 81 |             label = pscores_hnsc[, paste('p =', c(
 82 |                 signif(t.test(`MP17 Interferon/MHC-II (I)` ~ CASP8)$p.value, 2),
 83 |                 signif(t.test(`MP17 Interferon/MHC-II (I)` ~ categ3)$p.value, 2)
 84 |             ))],
 85 |             event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt')
 86 |         ),
 87 |         aes(x = x, y = y, label = label)
 88 |     ) +
 89 |     geom_segment(
 90 |         data = data.table(x = c(1, 1, 2, 1, 1, 2), xend = c(2, 1, 2, 2, 1, 2), y = rep(2.8, 6), yend = c(2.8, 2.7, 2.7, 2.8, 2.7, 2.7),
 91 |             event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt')),
 92 |         aes(x = x, xend = xend, y = y, yend = yend)
 93 |     ) +
 94 |     facet_wrap(vars(event), strip.position = 'bottom') +
 95 |     theme_test() +
 96 |     theme(strip.placement = 'outside', strip.background = element_rect(fill = NA, colour = NA), strip.text = element_text(size = 11, vjust = 1)) +
 97 |     labs(x = NULL, y = 'Interferon/MHC-II (I) score', title = 'Interferon/MHC-II (I) and mutations in HNSC')
 98 | 
 99 | subtypes_hnsc <- fread('../../TCGA_data/HNSC/subtypes.csv', na.strings = '', key = 'patient_id')
100 | pscores_hnsc[, subtype := do.call(`[`, list(subtypes_hnsc, patient_id))$subtype]
101 | pscores_hnsc[, subtype_classical := subtype == 'Classical']
102 | # pscores_hnsc[, t.test(subtype_classical ~ pdac_high)] # No enrichment of the Classical subtype in the PDAC-high population.
103 | 
104 | clin_hnsc <- fread('../../TCGA_data/HNSC/Clinical.csv', key = 'patient_id')
105 | pscores_hnsc[, site := do.call(`[`, list(clin_hnsc, patient_id))$anatomic_neoplasm_subdivision]
106 | pscores_hnsc[, larynx := (site == 'larynx')]
107 | # pscores_hnsc[, t.test(larynx ~ pdac_high)] # Larynx is enriched in the PDAC-high population.
108 | 
109 | pscores_hnsc[, larynx_nice := factor(ifelse(larynx, 'Larynx', 'Non-larynx'), levels = c('Non-larynx', 'Larynx'))]
110 | pscores_hnsc[, nsd1_nice := factor(ifelse(NSD1, 'NSD1-mut', 'NSD1-wt'))]
111 | bdata_hnsc_pdac_larynx <- rbind(
112 |     pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = ifelse(larynx, 'Larynx', 'Non-larynx'))],
113 |     pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = 'All tumors')]
114 | )
115 | bdata_hnsc_pdac_larynx[, nsd1 := factor(ifelse(nsd1, 'NSD1-mut', 'NSD1-wt'), levels = c('NSD1-wt', 'NSD1-mut'))]
116 | bdata_hnsc_pdac_larynx[, categ := factor(categ, levels = c('All tumors', 'Non-larynx', 'Larynx'))]
117 | 
118 | # Just the "All tumors" boxes:
119 | box_hnsc <- ggplot(bdata_hnsc_pdac_larynx[categ == 'All tumors']) +
120 |     geom_boxplot(aes(x = nsd1, y = score), outlier.shape = NA, width = 0.8) +
121 |     geom_point(aes(x = nsd1, y = score), fill = 'grey80', size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3)) +
122 |     geom_text(
123 |         data = data.table(x = 1.5, y = 2.9, label = paste('p =', signif(c(pscores_hnsc[, t.test(`MP30 PDAC-classical` ~ NSD1)$p.value]), 2))),
124 |         aes(x = x, y = y, label = label)
125 |     ) +
126 |     geom_segment(
127 |         data = data.table(x = c(1, 1, 2), xend = c(2, 1, 2), y = rep(2.7, 3), yend = c(2.7, 2.65, 2.65)),
128 |         aes(x = x, xend = xend, y = y, yend = yend)
129 |     ) +
130 |     theme_test() +
131 |     theme(axis.text.x = element_text(size = 11), plot.title = element_text(hjust = 0.5)) +
132 |     labs(title = 'PDAC-classical and\nNSD1 mutation in HNSC', x = NULL, y = 'PDAC-classical score')
133 | 
134 | 
135 | 
136 | 
137 | 
138 | library(seriation)
139 | 
140 | scores_patient_mut_all <- fread('../data/scores_patient_mut.csv')
141 | 
142 | exp_data <- slapply(c('LUAD', 'PAAD'), function(ct) {
143 |     expmat <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1']
144 |     meta <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Cells.csv'), na.strings = '')[
145 |         sample_type != 'normal' & patient_id %in% scores_patient_mut_all[cancer_type == ct, patient_id]
146 |     ]
147 |     expmat <- expmat[, meta$sample_id]
148 |     colnames(expmat) <- meta$patient_id
149 |     return(list(expmat = expmat, meta = meta))
150 | })
151 | top_genes <- slapply(
152 |     names(exp_data),
153 |     function(ct) head(names(sort(apply(exp_data[[ct]]$expmat, 1, function(x) median(abs(x - median(x)))), decreasing = TRUE)), 2500)
154 | )
155 | cormat <- slapply(names(exp_data), function(ct) cor(t(apply(exp_data[[ct]]$expmat[top_genes[[ct]], ], 1, function(x) x - median(x)))))
156 | clust <- slapply(names(exp_data), function(ct) seriate(dist(cormat[[ct]]), method = 'HC_average')[[1]])
157 | pdac_clust <- labels(as.dendrogram(clust$LUAD)[[1]][[2]][[1]])
158 | 
159 | luad_clinical <- fread('/home/labs/tirosh/tyler/TCGA_data/LUAD/Clinical.csv', key = 'patient_id')
160 | hist_data <- copy(scores_patient_mut_all[cancer_type == 'LUAD' & grepl('PDAC Classical', meta_program)])[,
161 |     c('pdac_high', 'hist_type') := .((score > 1.5), do.call(function(x) luad_clinical[x, histological_type], list(x = patient_id)))
162 | ][, mucinous := grepl('^mucinous| mucinous', hist_type)] # This regex ensures "nonmucinous" is not counted
163 | setkey(hist_data, patient_id)
164 | 
165 | top_genes_int <- intersect(top_genes$LUAD, top_genes$PAAD)
166 | cor_luad_paad <- cor(exp_data$LUAD$expmat[top_genes_int, ], exp_data$PAAD$expmat[top_genes_int, ])
167 | bxplt_data <- data.table(patient_id = rownames(cor_luad_paad), mean_cor = rowMeans(cor_luad_paad))[, pdac_clust := (patient_id %in% pdac_clust)]
168 | setkey(scores_patient_mut_all, patient_id)
169 | bxplt_data[, pdac_high := do.call(function(x) scores_patient_mut_all[grepl('PDAC Classical', meta_program)][x, score > 1.5], list(x = patient_id))]
170 | bxplt_data[, mucinous := hist_data$mucinous] # Checked that this gives TRUE: all(bxplt_data$patient_id == hist_data$patient_id)
171 | 
172 | box_luad <- ggplot(bxplt_data, aes(x = mapvalues(pdac_high, c(TRUE, FALSE), c('PDAC score\n\u2265 1.5', 'PDAC score\n< 1.5')), y = mean_cor)) +
173 |     geom_boxplot(outlier.shape = NA) +
174 |     geom_point(
175 |         aes(fill = factor(mapvalues(mucinous, c(TRUE, FALSE), c('Mucinous', 'Non-mucinous')), levels = c('Non-mucinous', 'Mucinous'))),
176 |         size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3)
177 |     ) +
178 |     scale_fill_manual(values = c('Non-mucinous' = 'grey80', 'Mucinous' = 'deeppink2')) +
179 |     theme_test() +
180 |     theme(axis.text.x = element_text(size = 11)) +
181 |     labs(x = NULL, y = 'Mean correlation with PDAC samples', title = NULL, fill = 'LUAD subtype')
182 | 
183 | 
184 | 
185 | 
186 | 
187 | box_hnsc_grob <- ggplotGrob(box_hnsc)
188 | box_luad_grob <- ggplotGrob(box_luad)
189 | widths_hnsc <- c(0.3, 0, 0.5, 0.4, 8, 0, 0, 0, 0.3)
190 | widths_luad <- c(0.3, 0, 0.5, 0.7, 8, 0, 0, 0.3, 3, 0, 0.3)
191 | heights_hnsc <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3)
192 | heights_luad <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3)
193 | box_hnsc_grob$widths <- unit(widths_hnsc, 'cm')
194 | box_luad_grob$widths <- unit(widths_luad, 'cm')
195 | box_hnsc_grob$heights <- unit(heights_hnsc, 'cm')
196 | box_luad_grob$heights <- unit(heights_luad, 'cm')
197 | 
198 | cairo_pdf('../data/pdac_classical_main.pdf', height = sum(heights_luad)/2.54, width = sum(c(widths_luad, widths_hnsc))/2.54)
199 | plot_grid(box_luad_grob, box_hnsc_grob, nrow = 1, ncol = 2, rel_widths = c(sum(widths_luad), sum(widths_hnsc)))
200 | dev.off()
201 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/preprocess_clinical_data.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(matkot)
 4 | 
 5 | # source('functions.R')
 6 | 
 7 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC',
 8 |     'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
 9 | 
10 | for(ct in cancer_types) {
11 |     
12 |     cat(ct, '\b...')
13 |     
14 |     if(ct %in% dir('~/TCGA_data') && 'All_CDEs.txt' %in% dir(paste0('~/TCGA_data/', ct))) {
15 |         clinical_data <- fread(paste0('~/TCGA_data/', ct, '/All_CDEs.txt'), showProgress = FALSE)
16 |     } else {
17 |         
18 |         download.file(
19 |             paste0(
20 |                 'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/',
21 |                 ct,
22 |                 '/20160128/gdac.broadinstitute.org_',
23 |                 ct,
24 |                 '.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz'
25 |             ),
26 |             destfile = 'tmp.tar.gz',
27 |             quiet = TRUE
28 |         )
29 |         
30 |         file_names <- untar('tmp.tar.gz', list = TRUE)
31 |         untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'All_CDEs.txt')], exdir = 'tmp')
32 |         file.remove('tmp.tar.gz')
33 |         
34 |         # Read in the data:
35 |         clinical_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'All_CDEs.txt')]), showProgress = FALSE)
36 |         
37 |         # Remove the created directory:
38 |         unlink('tmp', recursive = TRUE)
39 |         
40 |     }
41 |     
42 |     clinical_data <- transpose(clinical_data, make.names = 'bcr_patient_barcode', keep.names = 'bcr_patient_barcode')
43 |     clinical_data <- clinical_data[, -'patient_id']
44 |     setnames(clinical_data, 'bcr_patient_barcode', 'patient_id')
45 |     clinical_data[, c('patient_id', 'cancer_type') := .(toupper(gsub('-', '\\.', patient_id)), ct)]
46 |     clinical_data <- clinical_data[order(patient_id)]
47 |     setcolorder(clinical_data, c('patient_id', 'cancer_type'))
48 |     
49 |     fwrite(clinical_data, paste0('~/TCGA_data/', ct, '/Clinical.csv'))
50 |     
51 |     cat('Done!\n')
52 |     
53 | }
54 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/preprocess_mutations.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(limma)
  4 | library(stringr)
  5 | library(plyr)
  6 | library(RCurl)
  7 | library(matkot)
  8 | 
  9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC',
 10 |     'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
 11 | 
 12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
 13 |     !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]),
 14 |     .(symbol = symbol, alias = alias_symbol, entrez_id = entrez_id)
 15 | ][, entrez_id := as.character(entrez_id)]
 16 | 
 17 | 
 18 | 
 19 | 
 20 | 
 21 | for(ct in cancer_types) {
 22 |     
 23 |     cat(ct, '\n')
 24 |     
 25 |     # The CNV file URLs use one of three suffixes, so we need to choose whichever one exists:
 26 |     data_url <- sapply(
 27 |         c('-TP', '-TB', '-TM'),
 28 |         function(suff) paste0( 'http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/', ct, suff, '/20160128/gdac.broadinstitute.org_', ct,
 29 |             suff, '.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz'),
 30 |         USE.NAMES = FALSE
 31 |     )
 32 |     data_url <- data_url[sapply(data_url, url.exists)]
 33 |     
 34 |     download.file(data_url, destfile = 'tmp.tar.gz', quiet = TRUE)
 35 |     
 36 |     file_names <- untar('tmp.tar.gz', list = TRUE)
 37 |     untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')], exdir = 'tmp')
 38 |     file.remove('tmp.tar.gz')
 39 |     
 40 |     # Read in the data:
 41 |     cnv_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')]))
 42 |     
 43 |     # Remove the created directory:
 44 |     unlink('tmp', recursive = TRUE)
 45 |     
 46 |     cnv_data[,
 47 |         c('Gene Symbol', 'Locus ID', 'Cytoband') := .(
 48 |             do.call(
 49 |                 function(gl) {
 50 |                     gl[grep('C[0-9]+ORF[0-9]+', gl)] <- gsub('ORF', 'orf', gl[grep('C[0-9]+ORF[0-9]+', gl)])
 51 |                     gl_mapped <- alias2SymbolTable(gl)
 52 |                     gl_log <- gl %in% hgnc_complete_set$symbol
 53 |                     gl_mapped_log <- gl_mapped %in% hgnc_complete_set$symbol
 54 |                     out <- gl
 55 |                     out[!gl_log & gl_mapped_log] <- gl_mapped[!gl_log & gl_mapped_log]
 56 |                     out[!gl_log & !gl_mapped_log] <- sapply(
 57 |                         gl_mapped[!gl_log & !gl_mapped_log],
 58 |                         function(g) hgnc_complete_set[
 59 |                             grepl(paste0('^', g, '\\||\\|', g, '\\||\\|', g, '$'), alias),
 60 |                             switch((.N == 1) + 1, NA, symbol)
 61 |                         ]
 62 |                     )
 63 |                     return(out)
 64 |                 },
 65 |                 args = list(gl = cnv_data$`Gene Symbol`)
 66 |             ),
 67 |             NULL,
 68 |             NULL
 69 |         )
 70 |     ]
 71 |     
 72 |     cnv_data <- cnv_data[!is.na(`Gene Symbol`)]
 73 |     cnv_data <- cnv_data[, set_rownames(as.matrix(.SD), `Gene Symbol`), .SDcols = -'Gene Symbol']
 74 |     colnames(cnv_data) <- gsub('-', '\\.', colnames(cnv_data))
 75 |     
 76 |     fwrite(as.data.frame(cnv_data), paste0('~/TCGA_data/', ct, '/CNV_GISTIC_data.csv'), row.names = TRUE)
 77 |     
 78 | }
 79 | 
 80 | 
 81 | 
 82 | 
 83 | 
 84 | # Mutations annotations files were downloaded from GDC Data Portal, filtering by MAF, open access, Masked Somatic Mutation, MuTect2.
 85 | 
 86 | # These files have full sample IDs, but since they don't match the sample IDs in the expression data, I'll extract just the patient IDs.
 87 | 
 88 | setkey(hgnc_complete_set, entrez_id)
 89 | 
 90 | for(ct in cancer_types) {
 91 |     cat(ct, '\n')
 92 |     cont <- dir(paste0('~/TCGA_data/mut_data/', ct))
 93 |     mut_data <- fread(paste0('~/TCGA_data/mut_data/', ct, '/', cont[grep(ct, cont)]))[,
 94 |         c('Entrez_Gene_Id', 'patient_id', 'sample_type') := .(
 95 |             as.character(Entrez_Gene_Id),
 96 |             apply(str_split_fixed(Tumor_Sample_Barcode, '-', 4)[, 1:3], 1, paste, collapse = '.'),
 97 |             mapvalues(
 98 |                 gsub('[A-Z]', '', str_split_fixed(Tumor_Sample_Barcode, '-', 5)[, 4]),
 99 |                 c('01', '02', '03', '05', '06', '07', '11'),
100 |                 c('primary', 'recurrent', 'primary', 'primary_additional', 'metastatic', 'metastatic_additional', 'normal'),
101 |                 warn_missing = FALSE
102 |             )
103 |         )
104 |     ][,
105 |         gene := hgnc_complete_set[Entrez_Gene_Id, symbol]
106 |     ][
107 |         !is.na(gene),
108 |         .(gene, patient_id, sample_type, Chromosome, Start_Position, End_Position, Strand, Variant_Classification, Variant_Type,
109 |             Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2, Mutation_Status)
110 |     ]
111 |     fwrite(mut_data, paste0('~/TCGA_data/', ct, '/Mutation_data.csv'))
112 | }
113 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/study_tcga_map.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | 
  4 | study_tcga_map <- data.table()[,
  5 |     c('study', 'cancer_type') := rbindlist(.(
  6 |         .('ALL_Ebinger_2016', NA),
  7 |         .('AML_Galen_2019', 'LAML'),
  8 |         .('AML_Wu_2020', 'LAML'),
  9 |         .('AUTONOMIC_GANGLIA_Kinker_2020', NA),
 10 |         .('BILIARY_TRACT_Kinker_2020', 'LIHC'),
 11 |         .('BONE_Kinker_2020', NA),
 12 |         .('Breast_Chung_2017', 'BRCA'),
 13 |         .('Breast_Gao_2021', 'BRCA'),
 14 |         .('Breast_Jordan_2016', 'BRCA'),
 15 |         .('Breast_Karaayvas_2018', 'BRCA'),
 16 |         .('BREAST_Kinker_2020', 'BRCA'),
 17 |         .('Breast_Pal_2021', 'BRCA'),
 18 |         .('Breast_Qian_2020', 'BRCA'),
 19 |         .('CENTRAL_NERVOUS_SYSTEM_Kinker_2020', NA),
 20 |         .('CML_Giustacchini_2017', NA),
 21 |         .('Colon_Li_2017', 'COAD'),
 22 |         .('Colon_Li_2017', 'READ'),
 23 |         .('CRC_Lee_2020', 'COAD'),
 24 |         .('CRC_Lee_2020', 'READ'),
 25 |         .('ENDOMETRIUM_Kinker_2020', 'UCEC'),
 26 |         .('Ependymoma_Gojo_2020', NA),
 27 |         .('ESCC_Yao_2020', 'ESCA_ESCC'),
 28 |         .('Ewing_CellLines_Aynaud_2020', NA),
 29 |         .('Ewing_PDX_Aynaud_2020', NA),
 30 |         .('GBM_cell_lines', 'GBM_IDH-WT'),
 31 |         .('GBM_mouse_Unpublished', 'GBM_IDH-WT'),
 32 |         .('Glioblastoma_Couturier_2020', 'GBM_IDH-WT'),
 33 |         .('Glioblastoma_Darmanis_2017', 'GBM_IDH-WT'),
 34 |         .('Glioblastoma_Wang_2019', 'GBM_IDH-WT'),
 35 |         .('Glioma_Filbin_2018', NA), # H3-K27M mutant - a kind of HGG that's not GBM
 36 |         .('Glioma_Neftel_2019', 'GBM_IDH-WT'),
 37 |         .('Glioma_Tirosh_2016', 'LGG_oligo'),
 38 |         .('Glioma_Venteicher_2017', 'LGG_astro'),
 39 |         .('Glioma_Yuan_2018', 'GBM_IDH-WT'), # 7 GBMs and 1 grade III astrocytoma (TCGA LGG does include plenty of Grade 3 LGGs)
 40 |         .('Glioma_Yuan_2018', 'LGG_astro'),
 41 |         .('HCC_Sun_2021', 'LIHC'),
 42 |         .('HGSOC_Izar_2020', 'OV'),
 43 |         .('HNSCC_Puram_2017', 'HNSC'),
 44 |         .('IPMN_Franses_2017', NA), # CTCs
 45 |         .('KIDNEY_Kinker_2020', 'KICH'),
 46 |         .('KIDNEY_Kinker_2020', 'KIRC'),
 47 |         .('KIDNEY_Kinker_2020', 'KIRP'),
 48 |         .('LARGE_INTESTINE_Kinker_2020', 'COAD'),
 49 |         .('LARGE_INTESTINE_Kinker_2020', 'READ'),
 50 |         .('Leukemia_Caron_2020', NA), # ALL
 51 |         .('LIVER_Kinker_2020', 'LIHC'),
 52 |         .('Liver_Ma_2019', 'LIHC'),
 53 |         .('Liver_Ma_2019', 'CHOL'),
 54 |         .('LiverMets_Massalha_2020', NA), # these are either CRC metastases to the liver or intra-hepatic cholangiocarcinoma
 55 |         .('LSCC_Unpublished', 'HNSC'),
 56 |         .('Lung_Kim_2020', 'LUAD'),
 57 |         .('LUNG_Kinker_2020', 'LUAD'),
 58 |         .('LUNG_Kinker_2020', 'LUSC'),
 59 |         .('Lung_Laughney_2020', 'LUAD'),
 60 |         .('Lung_Qian_2020', 'LUAD'),
 61 |         .('Lung_Qian_2020', 'LUSC'), # This study contains about half LUAD and half LUSC
 62 |         .('Lung_Xing_2021', 'LUAD'),
 63 |         .('lungcancer_Antonella_2020', 'LUAD'),
 64 |         .('lungcancer_Maynard_2020', 'LUAD'),
 65 |         .('lungcancer_Maynard_2020', 'LUSC'), # 45 LUAD and 1 LUSC
 66 |         .('Medulloblastoma_Hovestadt_2019', NA),
 67 |         .('Melanoma_JerbyArnon_2018', 'SKCM_metastatic'),
 68 |         .('Melanoma_JerbyArnon_2018', 'SKCM_primary'),
 69 |         .('Melanoma_Tirosh_2016', 'SKCM_metastatic'),
 70 |         .('Melanoma_Tirosh_2016', 'SKCM_primary'),
 71 |         .('MM_Cohen_2021', NA),
 72 |         .('MM_Ledergor_2018', NA),
 73 |         .('MM_Liu_2021', NA),
 74 |         .('Myeloproliferative_Nam_2019', NA),
 75 |         .('NasopharyngealCarcinoma_Chen_2020', 'HNSC'),
 76 |         .('NET_Unpublished', NA),
 77 |         .('Neuroblastoma_CellLine_Jansky_2021', NA),
 78 |         .('Neuroblastoma_Dong_2020', NA),
 79 |         .('Neuroblastoma_Jansky_2021', NA),
 80 |         .('Neuroblastoma_Kildisiute_2021', NA),
 81 |         .('Neuroblastoma_Mercatelli_2021', NA),
 82 |         .('OESOPHAGUS_Kinker_2020', 'ESCA_AC'),
 83 |         .('OESOPHAGUS_Kinker_2020', 'ESCA_ESCC'),
 84 |         .('OPSCC_Unpublished', 'HNSC'),
 85 |         .('Osteosarcoma_Zhou_2020', NA),
 86 |         .('Ovarian_Nath_2021', 'OV'),
 87 |         .('Ovarian_Olalekan_2021', 'OV'),
 88 |         .('Ovarian_Qian_2020', 'OV'),
 89 |         .('OVARY_Kinker_2020', 'OV'),
 90 |         .('PANCREAS_Kinker_2020', 'PAAD'),
 91 |         .('PDAC_Franses_2020', NA), # CTCs
 92 |         .('PDAC_Ligorio_2019', 'PAAD'),
 93 |         .('PDAC_Lin_2020', 'PAAD'),
 94 |         .('PDAC_Moncada_2020', 'PAAD'),
 95 |         .('PDAC_Peng_2019', 'PAAD'),
 96 |         .('PDAC_Steele_2020', 'PAAD'),
 97 |         .('PDAC_Yu_2012', 'PAAD'),
 98 |         .('PDAC_Yue_2020', 'PAAD'),
 99 |         .('Pituitary_Cui_2021', NA),
100 |         .('PLEURA_Kinker_2020', NA),
101 |         .('Prostate_Chen_2021', 'PRAD'),
102 |         .('Prostate_Dong_2020', 'PRAD'),
103 |         .('PROSTATE_Kinker_2020', 'PRAD'),
104 |         .('Prostate_Miyamoto_2015', 'PRAD'),
105 |         .('RCC_Bi_2021', 'KIRC'), # 7 KIRC and 1 KIRP
106 |         .('RCC_Bi_2021', 'KIRP'),
107 |         .('RCC_Obradovic_2021', 'KIRC'),
108 |         .('RCC_Zhang_2021', 'KICH'), # 7 KIRC and 1 KICH
109 |         .('RCC_Zhang_2021', 'KIRC'),
110 |         .('SCHW_Unpublished', NA),
111 |         .('SCLC_HumanMet_Ireland_2020', NA),
112 |         .('SCLC_Mouse_Ireland_2020', NA),
113 |         .('SKIN_Kinker_2020', 'SKCM_metastatic'),
114 |         .('SKIN_Kinker_2020', 'SKCM_primary'),
115 |         .('SkinSCC_Ji_2020', 'SKCM_primary'),
116 |         .('SOFT_TISSUE_Kinker_2020', 'SARC'),
117 |         .('STOMACH_Kinker_2020', 'STAD'),
118 |         .('SyS_JerbyArnon_2021', 'SARC'),
119 |         .('Thyroid_Gao_2021', 'THCA'),
120 |         .('THYROID_Kinker_2020', 'THCA'),
121 |         .('UPPER_AERODIGESTIVE_TRACT_Kinker_2020', 'HNSC'),
122 |         .('URINARY_TRACT_Kinker_2020', 'BLCA'),
123 |         .('WilmsTumor_Young_2018', NA)
124 |     ))
125 | ]
126 | 
127 | fwrite(study_tcga_map, '../data/study_tcga_map.csv')
128 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(ggplot2)
  4 | library(stringdist)
  5 | library(stringr)
  6 | library(survival)
  7 | library(readxl)
  8 | library(plyr)
  9 | library(matkot)
 10 | 
 11 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
 12 |     'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
 13 |     'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
 14 | 
 15 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
 16 | names(study_contrib) <- gsub('  ', ' ', names(study_contrib))
 17 | names(study_contrib)[41] <- 'MP41 Unassigned'
 18 | 
 19 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'cancer_type')
 20 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
 21 | 
 22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052):
 23 | liu <- as.data.table(
 24 |     read_xlsx(
 25 |         '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx',
 26 |         sheet = 1,
 27 |         na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A')
 28 |     )
 29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI,
 30 |     stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))]
 31 | setkey(liu, patient_id)
 32 | 
 33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id,
 34 |     cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success),
 35 |     ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)]
 36 | 
 37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE)
 38 | 
 39 | meta_all <- lapply(
 40 |     cancer_types,
 41 |     function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)]
 42 | ) %>% rbindlist %>% unique
 43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]]
 44 | setkey(meta_all, patient_id)
 45 | clin <- clin[patient_id %in% meta_all$patient_id]
 46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC'
 47 | 
 48 | clin_test <- lapply(cancer_types, function(ct) {
 49 |     
 50 |     cat(ct, '\n')
 51 |     
 52 |     if('Scores.csv' %in% dir(paste0('~/TCGA_data/', ct))) {
 53 |         scores <- fread(paste0('~/TCGA_data/', ct, '/Scores.csv'), key = 'sample_id')
 54 |     } else return(NULL)
 55 |     
 56 |     # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types:
 57 |     meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type', na.strings = '')[
 58 |         sample_type != 'normal',
 59 |         .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1],
 60 |         keyby = patient_id
 61 |     ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)]
 62 |     setkey(meta, sample_id)
 63 |     
 64 |     # Subset scores table with the above IDs:
 65 |     scores <- scores[sample_id %in% meta$sample_id]
 66 |     scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))]
 67 |     
 68 |     # Subset clinical data table with the above IDs:
 69 |     clin_ct <- clin[patient_id %in% scores$patient_id]
 70 |     
 71 |     # Define variable categories:
 72 |     
 73 |     # M stage:
 74 |     clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))]
 75 |     
 76 |     # Therapy resistance:
 77 |     for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))]
 78 |     clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA},
 79 |         by = patient_id]
 80 |     clin_ct[, ther := 'a']
 81 |     clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id]
 82 |     clin_ct[, c('ther1', 'ther2', 'ther3') := NULL]
 83 |     
 84 |     # Lymphatic spread:
 85 |     clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')]
 86 |     clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))]
 87 |     clin_ct[, ln := 'a']
 88 |     clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id]
 89 |     clin_ct[, c('ln_n', 'n_stage') := NULL]
 90 |     
 91 |     # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10:
 92 |     
 93 |     ints <- list()
 94 |     
 95 |     # Grade:
 96 |     clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'),
 97 |         warn_missing = FALSE)]
 98 |     ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade]
 99 |     
100 |     # Stage:
101 |     clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'),
102 |         c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)]
103 |     ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage]
104 |     
105 |     # T stage:
106 |     clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'),
107 |         warn_missing = FALSE)]
108 |     ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage]
109 |     
110 |     for(v in names(ints)) {
111 |         if(length(ints[[v]]) > 0) {
112 |             sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)]
113 |             if(!any(sums[names(sums) != 'High'] >= 10)) {
114 |                 clin_ct[, (v) := NA]
115 |             } else {
116 |                 clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low']
117 |                 clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High']
118 |             }
119 |         }
120 |     }
121 |     
122 |     scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])]
123 |     scores[, cancer_type := ct]
124 |     setcolorder(scores, c('patient_id', 'cancer_type'))
125 |     
126 |     surv <- lapply(unique(scores$meta_program), function(mp) {
127 |         lapply(c('OS', 'PFI'), function(v) {
128 |             sdata <- scores[meta_program == mp & !is.na(get(v))]
129 |             if(nrow(sdata) < 20) return(NULL)
130 |             coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata)
131 |             data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients),
132 |                 pval = summary(coxmod)$logtest['pvalue'])
133 |         }) %>% rbindlist
134 |     }) %>% rbindlist
135 |     
136 |     categ <- lapply(unique(scores$meta_program), function(mp) {
137 |         lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) {
138 |             sdata <- scores[meta_program == mp & !is.na(get(v))]
139 |             if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL)
140 |             data.table(cancer_type = ct, var_name = v, meta_program = mp,
141 |                 eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]],
142 |                 pval = t.test(score ~ get(v), sdata)$p.value)
143 |         }) %>% rbindlist
144 |     }) %>% rbindlist
145 |     
146 |     return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct))
147 |     
148 | })
149 | 
150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res'))
151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin'))
152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores'))
153 | 
154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
155 | setkey(clin_test_res, cancer_type, meta_program)
156 | 
157 | fwrite(clin_test_res, '../data/clin_test_res.csv')
158 | fwrite(clin_test_data, '../data/clin_test_data.csv')
159 | fwrite(clin_test_scores, '../data/clin_test_scores.csv')
160 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical_deconv.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(ggplot2)
  4 | library(stringdist)
  5 | library(stringr)
  6 | library(survival)
  7 | library(readxl)
  8 | library(plyr)
  9 | library(matkot)
 10 | 
 11 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD',
 12 |     'THCA', 'UCEC')
 13 | 
 14 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
 15 | names(study_contrib) <- gsub('  ', ' ', names(study_contrib))
 16 | names(study_contrib)[41] <- 'MP41 Unassigned'
 17 | 
 18 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
 19 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
 20 | setkey(study_tcga_map, cancer_type)
 21 | 
 22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052):
 23 | liu <- as.data.table(
 24 |     read_xlsx(
 25 |         '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx',
 26 |         sheet = 1,
 27 |         na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A')
 28 |     )
 29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI,
 30 |     stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))]
 31 | setkey(liu, patient_id)
 32 | 
 33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id,
 34 |     cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success),
 35 |     ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)]
 36 | 
 37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE)
 38 | 
 39 | meta_all <- lapply(
 40 |     cancer_types,
 41 |     function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)]
 42 | ) %>% rbindlist %>% unique
 43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]]
 44 | setkey(meta_all, patient_id)
 45 | clin <- clin[patient_id %in% meta_all$patient_id]
 46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC'
 47 | 
 48 | clin_test <- lapply(cancer_types, function(ct) {
 49 |     
 50 |     cat(ct, '\n')
 51 |     
 52 |     if('Scores_deconv.csv' %in% dir(paste0('~/TCGA_data/', ct))) {
 53 |         scores <- fread(paste0('~/TCGA_data/', ct, '/Scores_deconv.csv'), key = 'sample_id')
 54 |     } else return(NULL)
 55 |     
 56 |     # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types:
 57 |     meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type')[
 58 |         sample_type != 'normal',
 59 |         .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1],
 60 |         keyby = patient_id
 61 |     ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)]
 62 |     setkey(meta, sample_id)
 63 |     
 64 |     # Subset scores table with the above IDs:
 65 |     scores <- scores[sample_id %in% meta$sample_id]
 66 |     scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))]
 67 |     
 68 |     # Subset clinical data table with the above IDs:
 69 |     clin_ct <- clin[patient_id %in% scores$patient_id]
 70 |     
 71 |     # Define variable categories:
 72 |     
 73 |     # M stage:
 74 |     clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))]
 75 |     
 76 |     # Therapy resistance:
 77 |     for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))]
 78 |     clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA},
 79 |         by = patient_id]
 80 |     clin_ct[, ther := 'a']
 81 |     clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id]
 82 |     clin_ct[, c('ther1', 'ther2', 'ther3') := NULL]
 83 |     
 84 |     # Lymphatic spread:
 85 |     clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')]
 86 |     clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))]
 87 |     clin_ct[, ln := 'a']
 88 |     clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id]
 89 |     clin_ct[, c('ln_n', 'n_stage') := NULL]
 90 |     
 91 |     # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10:
 92 |     
 93 |     ints <- list()
 94 |     
 95 |     # Grade:
 96 |     clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'),
 97 |         warn_missing = FALSE)]
 98 |     ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade]
 99 |     
100 |     # Stage:
101 |     clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'),
102 |         c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)]
103 |     ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage]
104 |     
105 |     # T stage:
106 |     clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'),
107 |         warn_missing = FALSE)]
108 |     ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage]
109 |     
110 |     for(v in names(ints)) {
111 |         if(length(ints[[v]]) > 0) {
112 |             sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)]
113 |             if(!any(sums[names(sums) != 'High'] >= 10)) {
114 |                 clin_ct[, (v) := NA]
115 |             } else {
116 |                 clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low']
117 |                 clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High']
118 |             }
119 |         }
120 |     }
121 |     
122 |     scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])]
123 |     scores[, cancer_type := ct]
124 |     setcolorder(scores, c('patient_id', 'cancer_type'))
125 |     
126 |     surv <- lapply(unique(scores$meta_program), function(mp) {
127 |         lapply(c('OS', 'PFI'), function(v) {
128 |             sdata <- scores[meta_program == mp & !is.na(get(v))]
129 |             if(nrow(sdata) < 20) return(NULL)
130 |             coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata)
131 |             data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients),
132 |                 pval = summary(coxmod)$logtest['pvalue'])
133 |         }) %>% rbindlist
134 |     }) %>% rbindlist
135 |     
136 |     categ <- lapply(unique(scores$meta_program), function(mp) {
137 |         lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) {
138 |             sdata <- scores[meta_program == mp & !is.na(get(v))]
139 |             if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL)
140 |             data.table(cancer_type = ct, var_name = v, meta_program = mp,
141 |                 eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]],
142 |                 pval = t.test(score ~ get(v), sdata)$p.value)
143 |         }) %>% rbindlist
144 |     }) %>% rbindlist
145 |     
146 |     return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct))
147 |     
148 | })
149 | 
150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res'))
151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin'))
152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores'))
153 | 
154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
155 | setkey(clin_test_res, cancer_type, meta_program)
156 | 
157 | fwrite(clin_test_res, '../data/clin_test_res_deconv.csv')
158 | fwrite(clin_test_data, '../data/clin_test_data_deconv.csv')
159 | fwrite(clin_test_scores, '../data/clin_test_scores_deconv.csv')
160 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical_meta.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(ggplot2)
  4 | library(stringr)
  5 | library(randomcoloR)
  6 | library(plyr)
  7 | library(scales)
  8 | library(latex2exp)
  9 | library(RColorBrewer)
 10 | library(cowplot)
 11 | library(ggrepel)
 12 | library(matkot)
 13 | 
 14 | 
 15 | 
 16 | 
 17 | 
 18 | # Combine results with and without deconvolution into the same heatmaps:
 19 | 
 20 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')])
 21 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
 22 | 
 23 | setkey(clin_test_res, cancer_type, meta_program)
 24 | cancer_types <- sort(unique(clin_test_res$cancer_type))
 25 | mps <- unique(clin_test_res$meta_program)
 26 | 
 27 | pdata <- slapply(
 28 |     c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'),
 29 |     function(v) {
 30 |         htmp_data <- clin_test_res[var_name == v][CJ(cancer_types, mps), -c('var_name', 'pval')]
 31 |         htmp_data[is.na(eff), eff := 0]
 32 |         htmp_data[is.na(pval_adj), pval_adj := 1]
 33 |         clust_data <- dcast(htmp_data[, -'pval_adj'], meta_program ~ cancer_type)[,
 34 |             set_rownames(as.matrix(.SD), meta_program),
 35 |             .SDcols = -'meta_program'
 36 |         ]
 37 |         clust_data <- clust_data[apply(clust_data, 1, function(x) !all(x == 0)), apply(clust_data, 2, function(x) !all(x == 0))]
 38 |         clust_x <- colSums(abs(clust_data))
 39 |         clust_x <- data.table(ctd = names(clust_x), ct = gsub(' \\(d\\)', '', names(clust_x)), vald = clust_x)[, val := mean(vald), by = ct]
 40 |         clust_x_temp <- clust_x[, unique(.SD)[, .(cancer_type = ct, r = order(order(-val))/.N)], .SDcols = c('ct', 'val')]
 41 |         setkey(clust_x_temp, cancer_type)
 42 |         clust_x[, r := clust_x_temp[ct, r]]
 43 |         clust_y <- list(labels = rownames(clust_data), order = order(-rowSums(abs(clust_data))))
 44 |         htmp_data <- htmp_data[cancer_type %in% colnames(clust_data) & meta_program %in% rownames(clust_data)]
 45 |         return(list(data = htmp_data, clust_x = clust_x[, .(ctd, ct, r)], clust_y = clust_y))
 46 |     }
 47 | )
 48 | 
 49 | mps_data <- rbindlist(lapply(names(pdata), function(v) with(pdata[[v]]$clust_y, data.table(v = v, mp = labels, r = order(order)/length(order)))))
 50 | setkey(mps_data, v, mp)
 51 | mps_data <- mps_data[CJ(c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), mps)]
 52 | mps_data[, rmean := mean(r[!is.na(r)]), by = mp]
 53 | mps_ord <- unique(mps_data[, .(mp, rmean)])[is.finite(rmean), mp[order(rmean)]]
 54 | 
 55 | cts_data <- rbindlist(lapply(pdata, `[[`, 'clust_x'))
 56 | cts_data_temp <- cts_data[, .(r = mean(r)), keyby = ct]
 57 | cts_ord <- unique(cts_data[, .(ctd = ctd, ct = ct, r = do.call(`[`, list(cts_data_temp, ct))$r)])[order(ctd), ctd[order(r)]]
 58 | 
 59 | # Overall survival:
 60 | os_data <- copy(pdata$OS)$data
 61 | os_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
 62 | os_data[, signif := ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), '')]
 63 | htmp_os <- ggplot(os_data, aes(x = cancer_type, y = meta_program, fill = exp(eff))) +
 64 |     geom_tile(colour = 'grey90') +
 65 |     geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
 66 |     scale_x_discrete(expand = c(0, 0)) +
 67 |     scale_y_discrete(expand = c(0, 0)) +
 68 |     scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(0.5, 2.5),
 69 |         values = rescale(c(0, 1, 2, 3, 4, 7, 10, 13, 16), to = c(0, 1)), oob = squish, na.value = '#F7F7F7', name = 'Hazard\nratio',
 70 |         guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
 71 |     theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
 72 |         legend.title = element_text(size = 13)) +
 73 |     labs(title = 'Overall survival', x = 'Cancer type', y = NULL)
 74 | 
 75 | # LN mets and therapy resistance:
 76 | ln_data <- copy(pdata$ln)$data
 77 | setkey(ln_data, cancer_type, meta_program)
 78 | ther_data <- copy(pdata$ther)$data
 79 | setkey(ther_data, cancer_type, meta_program)
 80 | mps_ln_ther <- union(ln_data$meta_program, ther_data$meta_program)
 81 | ln_data <- ln_data[CJ(unique(cancer_type), mps_ln_ther)]
 82 | ln_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
 83 | ln_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))]
 84 | ther_data <- ther_data[CJ(unique(cancer_type), mps_ln_ther)]
 85 | ther_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
 86 | ther_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))]
 87 | htmp_ln <- ggplot(ln_data, aes(x = cancer_type, y = meta_program, fill = eff)) +
 88 |     geom_tile(colour = 'grey90') +
 89 |     geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
 90 |     scale_x_discrete(expand = c(0, 0)) +
 91 |     scale_y_discrete(expand = c(0, 0)) +
 92 |     scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish,
 93 |         na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
 94 |     theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
 95 |         legend.title = element_text(size = 13)) +
 96 |     labs(title = 'Lymph node metastasis', x = 'Cancer type', y = NULL)
 97 | htmp_ther <- ggplot(ther_data, aes(x = cancer_type, y = meta_program, fill = eff)) +
 98 |     geom_tile(colour = 'grey90') +
 99 |     geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
100 |     scale_x_discrete(expand = c(0, 0)) +
101 |     scale_y_discrete(expand = c(0, 0)) +
102 |     scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish,
103 |         na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
104 |     theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
105 |         legend.title = element_text(size = 13)) +
106 |     labs(title = 'Therapy resistance', x = 'Cancer type', y = NULL)
107 | 
108 | # Make grobs and fix widths and heights:
109 | htmp_os_grob <- ggplotGrob(htmp_os)
110 | htmp_os_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*os_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm')
111 | htmp_os_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*os_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
112 | htmp_ln_grob <- ggplotGrob(htmp_ln + theme(legend.position = 'none'))
113 | htmp_ln_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*ln_data[, length(unique(cancer_type))], 0, 0, 0, 0.3), 'cm')
114 | htmp_ln_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ln_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
115 | htmp_ther_grob <- ggplotGrob(htmp_ther + theme(axis.text.y = element_blank()))
116 | htmp_ther_grob$widths <- unit(c(0.3, 0, 0, 0, 0.5*ther_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm')
117 | htmp_ther_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ther_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
118 | 
119 | pdf('../data/clin_htmps_sub.pdf', width = sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths))/2.54,
120 |     height = sum(c(htmp_os_grob$heights, htmp_ln_grob$heights))/2.54)
121 | plot_grid(
122 |     plot_grid(
123 |         htmp_os_grob,
124 |         ggplot() + theme_void(),
125 |         nrow = 1,
126 |         ncol = 2,
127 |         rel_widths = c(
128 |             as.numeric(sum(htmp_os_grob$widths)),
129 |             sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths)) - as.numeric(sum(htmp_os_grob$widths))
130 |         )
131 |     ),
132 |     plot_grid(
133 |         htmp_ln_grob,
134 |         htmp_ther_grob,
135 |         nrow = 1,
136 |         ncol = 2,
137 |         rel_widths = c(sum(htmp_ln_grob$widths), sum(htmp_ther_grob$widths))
138 |     ),
139 |     nrow = 2,
140 |     ncol = 1,
141 |     rel_heights = c(sum(htmp_os_grob$heights), sum(htmp_ln_grob$heights))
142 | )
143 | dev.off()
144 | 
145 | 
146 | 
147 | 
148 | 
149 | # Summary showing pan-cancer consistency of most common MPs:
150 | 
151 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')])
152 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
153 | 
154 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
155 | names(study_contrib) <- gsub('  ', ' ', names(study_contrib))
156 | names(study_contrib)[41] <- 'MP41 Unassigned'
157 | 
158 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'study')
159 | 
160 | mps_tab <- lapply(
161 |     names(study_contrib),
162 |     function(mp_name) study_tcga_map[study_contrib[[mp_name]], .(meta_program = mp_name, study = study, cancer_type = cancer_type)]
163 | ) %>% rbindlist
164 | 
165 | common_mps <- mps_tab[, .(n_study = .N, n_ct = length(unique(cancer_type[!is.na(cancer_type)]))), by = meta_program][n_ct >= 5 & n_study >= 10]
166 | common_mps <- common_mps$meta_program
167 | 
168 | pdata_scatter <- clin_test_res[meta_program %in% common_mps, .(s = sum(sign(eff)), s_sig = sum(sign(eff[pval_adj < 0.05]))), by = .(meta_program)]
169 | 
170 | set.seed(8331)
171 | pdf('../data/clin_summary_scatter.pdf', width = 14/2.54, height = 11.5/2.54)
172 | ggplot(pdata_scatter, aes(x = s_sig, y = s)) +
173 |     theme_test() +
174 |     geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') +
175 |     geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') +
176 |     geom_point(shape = 21, fill = 'tomato', stroke = 0.3, size = 3) +
177 |     geom_text_repel(data = pdata_scatter[grep('Cell Cycle|Stress \\(in vitro\\)|Proteasomal|Hypoxia', meta_program)], aes(label = meta_program),
178 |         colour = 'steelblue') +
179 |     geom_text_repel(data = pdata_scatter[grep('Stress$|PDAC', meta_program)], aes(label = meta_program),
180 |         colour = 'steelblue', nudge_x = 14, nudge_y = 8) +
181 |     geom_text_repel(data = pdata_scatter[grep('MHC-II \\(I\\)', meta_program)], aes(label = meta_program),
182 |         colour = 'steelblue', nudge_x = 1, nudge_y = -1) +
183 |     labs(x = expression(sum()*italic(sign)*'(significant effects)'), y = expression(sum()*italic(sign)*'(all effects)'))
184 | dev.off()
185 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_mp_scores.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(stringr)
 4 | library(readxl)
 5 | library(matkot)
 6 | 
 7 | source('functions.R')
 8 | 
 9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
10 |     'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
11 |     'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
12 | 
13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
14 |     !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
15 | ]
16 | alias_table <- make_alias_table(hgnc_complete_set)
17 | 
18 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
19 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number
20 | mps <- slapply(mps, update_symbols_fast, alias_table)
21 | 
22 | # Make sure MP names are consistent:
23 | names(mps) <- gsub('  ', ' ', names(mps))
24 | names(study_contrib) <- gsub('  ', ' ', names(study_contrib))
25 | names(study_contrib)[41] <- 'MP41 Unassigned'
26 | 
27 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
28 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
29 | setkey(study_tcga_map, cancer_type)
30 | 
31 | for(ct in cancer_types) {
32 |     
33 |     cat(ct, '\n')
34 |     
35 |     mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))]
36 |     
37 |     expmat <- fread(paste0('~/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1']
38 |     meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal']
39 |     expmat <- expmat[, meta$sample_id]
40 |     
41 |     mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)])
42 |     
43 |     set.seed(140)
44 |     scores <- lapply(
45 |         names(mps_ct),
46 |         function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = sig_score(expmat, mps_ct[[mp]], nbin = 50, n = 50))]
47 |     ) %>% rbindlist
48 |     
49 |     fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores.csv'))
50 |     
51 | }
52 | 


--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_mp_scores_deconv.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(stringr)
 4 | library(readxl)
 5 | library(matkot)
 6 | 
 7 | source('functions.R')
 8 | 
 9 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD',
10 |     'THCA', 'UCEC')
11 | 
12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
13 |     !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
14 | ]
15 | alias_table <- make_alias_table(hgnc_complete_set)
16 | 
17 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
18 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number
19 | mps <- slapply(mps, update_symbols_fast, alias_table)
20 | 
21 | # Make sure MP names are consistent:
22 | names(mps) <- gsub('  ', ' ', names(mps))
23 | names(study_contrib) <- gsub('  ', ' ', names(study_contrib))
24 | names(study_contrib)[41] <- 'MP41 Unassigned'
25 | 
26 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
27 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
28 | setkey(study_tcga_map, cancer_type)
29 | 
30 | for(ct in cancer_types) {
31 |     
32 |     cat(ct, '\n')
33 |     
34 |     mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))]
35 |     
36 |     meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal']
37 |     
38 |     if(startsWith(ct, 'ESCA')) {
39 |         expmat <- fread('~/pan_cancer/data/TCGA_deconvolved_scaled/ESCA_Epithelial.cells.txt')
40 |     } else expmat <- fread(paste0('~/pan_cancer/data/TCGA_deconvolved_scaled/', ct, '_Epithelial.cells.txt'))
41 |     expmat <- expmat[, set_rownames(as.matrix(.SD), Gene), .SDcols = -'Gene']
42 |     meta <- meta[sample_id %in% colnames(expmat)]
43 |     expmat <- expmat[, meta$sample_id]
44 |     
45 |     mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)])
46 |     
47 |     set.seed(140)
48 |     scores <- lapply(
49 |         names(mps_ct),
50 |         function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = colMeans(expmat[mps_ct[[mp]], ]))]
51 |     ) %>% rbindlist
52 |     
53 |     fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores_deconv.csv'))
54 |     
55 | }
56 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Tirosh Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 3ca
2 | This repository contains code relating to the Curated Cancer Cell Atlas (3CA), including code for reproducing analyses and figures on the [3CA website](https://www.weizmann.ac.il/sites/3CA), and code for reproducing the analysis in Gavish et al. "The hallmarks of transcriptional intra-tumor heterogeneity".
3 | 
4 | ### Source code for Gavish et al.
5 | Code for reproducing the analyses in the Gavish et al. study is contained in the ITH_hallmarks directory.
6 | 
7 | ### 3CA version 2
8 | The 3CA_v2 directory code for new analyses conducted on the expanded version of 3CA.
9 | 


--------------------------------------------------------------------------------
/aliases_pubmed.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(stringr)
 3 | library(easyPubMed)
 4 | 
 5 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[
 6 |     !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) & locus_group == 'protein-coding gene'
 7 | ]
 8 | 
 9 | alias_table <- hgnc_complete_set[, .(symbol_alt = unique(c(str_split(alias_symbol, '\\|')[[1]], str_split(prev_symbol, '\\|')[[1]]))), by = symbol]
10 | alias_table <- alias_table[symbol_alt != '']
11 | 
12 | # Remove elements of <symbol_alt> that also occur in <symbol>:
13 | alias_table <- alias_table[!(symbol_alt %in% symbol)]
14 | 
15 | # Get number of PubMed search results for each alternative symbol:
16 | alias_table[, pubmed_freq := as.numeric(get_pubmed_ids(paste0(symbol_alt, '[Text Word]'))$Count), by = symbol_alt]
17 | 
18 | # Adding combined searches, i.e. check for cases where the alias and the up-to-date symbol occur together:
19 | alias_table[,
20 |     pubmed_comb_freq := as.numeric(get_pubmed_ids(paste0('(', symbol_alt, '[Text Word]) AND (', symbol, '[Text Word])'))$Count),
21 |     by = .(symbol, symbol_alt)
22 | ]
23 | 
24 | fwrite(alias_table, '../data/aliases_pubmed.csv')
25 | 


--------------------------------------------------------------------------------
/aliases_table.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(matkot)
 4 | 
 5 | aliases_table <- fread('../data/aliases_pubmed.csv')
 6 | aliases_table <- aliases_table[
 7 |     pubmed_freq > 1000 &
 8 |         pubmed_comb_freq > 100 &
 9 |         !(symbol_alt %in% symbol) &
10 |         str_count(symbol_alt, '[a-z]') <= 1 &
11 |         str_count(symbol_alt, '.') > 1 & # Could put 2 here as some 2-letter aliases map poorly, but some are important like RB -> RB1
12 |         !grepl('\\.', symbol_alt)
13 | ]
14 | aliases_table <- aliases_table[!(symbol_alt %in% aliases_table[, .(l = length(unique(symbol))), by = symbol_alt][l > 1, symbol_alt])]
15 | # Try capitalising and removing dashes. If this makes any of symbol_alt the same as symbol, remove these entries. Otherwise, if it makes any
16 | # elements of symbol_alt the same, take the one with the highest pubmed_freq.
17 | aliases_table[, symbol_alt_x := gsub('-', '', toupper(symbol_alt))]
18 | aliases_table <- aliases_table[symbol_alt_x != symbol]
19 | aliases_table <- aliases_table[,
20 |     .(symbol_alt = rbindlist(lapply(
21 |         names(table(symbol_alt_x)),
22 |         function(symb) if(table(symbol_alt_x)[symb] == 1) {
23 |             return(.SD[symbol_alt_x == symb, .(symbol_alt, pubmed_freq)])
24 |         } else return(.SD[symbol_alt_x == symb, .(symbol_alt = symbol_alt[which.max(pubmed_freq)], pubmed_freq = max(pubmed_freq))])
25 |     ))[order(-pubmed_freq), symbol_alt]),
26 |     by = symbol
27 | ]
28 | 
29 | fwrite(aliases_table, '../data/alias_table.csv')
30 | 


--------------------------------------------------------------------------------
/bins.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(Matrix)
 4 | library(stringr)
 5 | library(plyr)
 6 | library(matkot)
 7 | 
 8 | source('functions.R')
 9 | 
10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
11 | 
12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
13 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
14 | alias_table <- make_alias_table(hgnc_complete_set)
15 | 
16 | set.seed(763)
17 | data_genes_all <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
18 |     cat(r[1], '-', r[2], '\n')
19 |     rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
20 |     if('gene_ave.csv' %in% dir(rdir)) gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)] else return(NULL)
21 |     if(nrow(gene_ave) == 0) return(NULL)
22 |     gene_ave <- gene_ave[!is.na(ave), if(unique(.SD[, .(group, sample, n_cell)])[, sum(n_cell) >= 10]) .SD[,
23 |         .(study = r[1], cancer_type = r[2], ave = sum(ave*n_cell)/sum(n_cell)),
24 |         by = symbol
25 |     ], by = .(group = cell_type)]
26 |     setcolorder(gene_ave, c('study', 'cancer_type'))
27 |     setkey(gene_ave, study, cancer_type, group, symbol)
28 |     # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically:
29 |     gene_ave[, ave_rank := .SD[sample(1:.N), order(order(ave))[order(symbol)]]/.N, .SDcols = c('symbol', 'ave'), by = group]
30 | }) %>% rbindlist
31 | 
32 | # Bins for non-malignant cell types:
33 | 
34 | # Get the non-malignant cell types occuring in sufficiently many datasets:
35 | cell_types_nm <- data_genes_all[
36 |     !(group %in% c('', 'Unassigned', 'Malignant')) &
37 |         !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
38 |         !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
39 |     unique(.SD),
40 |     .SDcols = c('study', 'cancer_type', 'group')
41 | ][, .N, by = group][N >= 3, group]
42 | 
43 | genes_nm <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')$symbol %>% unique
44 | 
45 | bins_nm <- data_genes_all[
46 |     symbol %in% genes_nm & group %in% cell_types_nm &
47 |         !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
48 |         !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
49 |     .(ave = mean(ave[!is.na(ave)]), ave_rank = mean(ave_rank[!is.na(ave_rank)])),
50 |     by = .(symbol, group)
51 | ][, cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)), by = group]
52 | 
53 | fwrite(bins_nm, '../data/bins_nm.csv')
54 | 


--------------------------------------------------------------------------------
/canonical_markers.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | 
 3 | canonical_markers <- list(
 4 |     Adipocyte = c('ADIPOQ', 'FABP4', 'LEP'),
 5 |     Astrocyte = c('GFAP', 'GJA1'),
 6 |     B_cell = c('MS4A1', 'CD79A', 'CD79B'),
 7 |     Basophil = c('CCR3', 'IL3RA'),
 8 |     Dendritic = c('CCR7', 'CD86', 'CLEC10A'),
 9 |     Endothelial = c('VWF', 'CDH5', 'CLEC14A', 'CLDN5', 'ADGRL4'),
10 |     Epithelial = c('EPCAM', 'KRT19', 'KRT7', 'MUC1'),
11 |     Erythrocyte = c('HBA1', 'HBA2', 'HBB'),
12 |     Fibroblast = c('COL1A1', 'COL1A2', 'COL3A1', 'LUM'),
13 |     Immune = 'PTPRC',
14 |     Keratinocyte = c('FLG', 'IVL', 'LORICRIN'),
15 |     Lymphovascular = c('CCL21', 'TFF3'),
16 |     Macrophage = c('AIF1', 'CD14', 'CD163'),
17 |     Mast = c('CPA3', 'MS4A2', 'TPSB2'),
18 |     Melanocyte = c('DCT', 'MLANA', 'PMEL', 'TYR', 'TYRP1'),
19 |     Monocyte = c('CCR2', 'CSF1R'),
20 |     Myocyte = c('ACTA1', 'DES', 'MYL1', 'TTN', 'NEB'),
21 |     Neuron = c('ENO2', 'RBFOX3'),
22 |     Neutrophil = c('AZU1', 'CTSG', 'ELANE', 'MPO'),
23 |     NK_cell = c('NKG7', 'KLRD1', 'GZMB', 'KLRF1'),
24 |     Oligodendrocyte = c('TF', 'PLP1', 'CLDN11', 'MAG', 'MBP', 'MOG'),
25 |     OPC = c('OLIG1', 'OLIG2', 'NEU4'),
26 |     Pericyte = c('RGS5', 'ESAM', 'MEF2C'),
27 |     Plasma = c('JCHAIN', 'MZB1', 'TNFRSF17'),
28 |     T_cell = c('CD2', 'CD3D', 'CD3E')
29 | )
30 | 
31 | canonical_markers <- rbindlist(lapply(names(canonical_markers), function(ct) data.table(cell_type = ct, gene = canonical_markers[[ct]])))
32 | 
33 | fwrite(canonical_markers, '../data/canonical_markers.csv')
34 | 


--------------------------------------------------------------------------------
/cc_sigs_consensus.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(ggplot2)
 3 | library(magrittr)
 4 | library(matkot)
 5 | 
 6 | source('functions.R')
 7 | 
 8 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 9 | 
10 | sigs <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) {
11 |     cat(r, '\n')
12 |     if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
13 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
14 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
15 |     if(all(nullcond)) return(NULL)
16 |     paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
17 |     rout <- lapply(which(!nullcond), function(i) list(r = r, group = i, g1s = plot_data[[i]]$g1s, g2m = plot_data[[i]]$g2m))
18 | }) %>% unlist(recursive = FALSE)
19 | 
20 | g1s_tab <- table(unlist(lapply(sigs, `[[`, 'g1s')))
21 | g2m_tab <- table(unlist(lapply(sigs, `[[`, 'g2m')))
22 | 
23 | sigs_cons <- list(g1s = names(g1s_tab)[order(-g1s_tab)][1:50], g2m = names(g2m_tab)[order(-g2m_tab)][1:50])
24 | 
25 | saveRDS(sigs_cons, '../data/cc_sigs_consensus.rds')
26 | 


--------------------------------------------------------------------------------
/cna_mat_prep.R:
--------------------------------------------------------------------------------
 1 | # The command line arguments should supply study name and cancer type (in that order).
 2 | r = commandArgs(trailingOnly = TRUE)
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | library(data.table)
 9 | library(magrittr)
10 | library(matkot)
11 | 
12 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
13 | 
14 | d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
15 | if('data_cna.rds' %in% dir(d)) {
16 |     cna <- readRDS(paste0(d, '/data_cna.rds')) # Genes are already in order
17 |     cna_cond <- sapply(cna, function(x) is.null(x) | is.null(x$cna_data))
18 |     if(!all(cna_cond)) {
19 |         for(i in (1:length(cna))[!cna_cond]) { # Indices where data is not NULL
20 |             out <- cna[[i]]$cna_data[, .(gene, cell_name, cna = round(cna, 4))]
21 |             out <- dcast(out, cell_name ~ gene)[, c('cell_name', out[, unique(gene)]), with = FALSE]
22 |             if(sum(!cna_cond) > 1) {
23 |                 if(!('CNA matrix' %in% dir(d))) dir.create(paste0(d, '/CNA matrix'))
24 |                 suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)]
25 |                 fwrite(out, paste0(d, '/CNA matrix/CNA_matrix_', suff, '.csv'))
26 |             } else fwrite(out, paste0(d, '/CNA matrix.csv'))
27 |         }
28 |     } else cat('No CNA data\n')
29 | } else cat('No CNA data\n')
30 | 


--------------------------------------------------------------------------------
/gene_ave.R:
--------------------------------------------------------------------------------
 1 | r = commandArgs(trailingOnly = TRUE)
 2 | 
 3 | 
 4 | 
 5 | 
 6 | 
 7 | library(data.table)
 8 | library(magrittr)
 9 | library(Matrix)
10 | library(stringr)
11 | library(plyr)
12 | library(matkot)
13 | 
14 | source('functions.R')
15 | 
16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
17 | 
18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
20 | alias_table <- make_alias_table(hgnc_complete_set)
21 | 
22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
23 | 
24 | out <- rbindlist(lapply(paths$group, function(grp) {
25 |     
26 |     cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
27 |     
28 |     if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[grp, expmat], 'mtx')) {
29 |         return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(), symbol = character(),
30 |             ave = numeric(), prop_pos = numeric(), n_cell = integer()))
31 |     }
32 |     
33 |     genes <- fread(paths[grp, genes], header = FALSE)$V1
34 |     expmat <- readMM(paths[grp, expmat])
35 |     expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
36 |     genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
37 |     genes <- update_symbols_fast(genes, alias_table)
38 |     rownames(expmat) <- genes
39 |     colnames(expmat) <- cells$cell_name
40 |     
41 |     # Remove low-complexity cells and normalise to log TPM/10:
42 |     cells <- cells[col_nnz(expmat) >= 1000]
43 |     if(nrow(cells) < 10) return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(),
44 |         symbol = character(), ave = numeric(), prop_pos = numeric(), n_cell = integer()))
45 |     expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
46 |     
47 |     pout <- cells[
48 |         !is.na(cell_type) & !is.na(sample),
49 |         .(symbol = genes, ave = rowMeans(expmat[, cell_name, drop = FALSE]), prop_pos = row_nnz(expmat[, cell_name, drop = FALSE])/.N, n_cell = .N),
50 |         by = .(cell_type, sample)
51 |     ]
52 |     pout[, c('group', 'group_name') := .(grp, paths[grp, group_name])]
53 |     setcolorder(pout, c('group', 'group_name'))
54 |     
55 |     return(pout)
56 |     
57 | }), use.names = TRUE)
58 | 
59 | if(nrow(out) > 0) {
60 |     ct <- gsub('/', '-', r[2])
61 |     if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
62 |     if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
63 |     fwrite(out, paste0('../data/study_plots/', ct, '/', r[1], '/gene_ave.csv'))
64 | }
65 | 


--------------------------------------------------------------------------------
/gene_mp_cor.R:
--------------------------------------------------------------------------------
  1 | # The command line arguments should supply study name and cancer type (in that order).
  2 | r = commandArgs(trailingOnly = TRUE)
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | library(data.table)
  9 | library(magrittr)
 10 | library(Matrix)
 11 | library(stringr)
 12 | library(matkot)
 13 | 
 14 | source('functions.R')
 15 | 
 16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 17 | 
 18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
 19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
 20 | alias_table <- make_alias_table(hgnc_complete_set)
 21 | 
 22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
 23 | 
 24 | rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
 25 | 
 26 | if('mp_scores_all.csv' %in% dir(rdir)) {
 27 |     scores <- fread(paste0(rdir, '/mp_scores_all.csv'), colClasses = c(cell_name = 'character'))
 28 |     if(nrow(scores) == 0 | !('score' %in% names(scores))) scores <- NULL
 29 | } else scores <- NULL
 30 | 
 31 | if(!is.null(scores)) {
 32 |     
 33 |     out <- lapply(paths$group, function(grp) {
 34 |         
 35 |         cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
 36 |         
 37 |         if(
 38 |             scores[group == grp, .N == 0 | all(is.na(score))] |
 39 |                 !all(c('cell_type', 'sample') %in% names(cells)) |
 40 |                 !endsWith(paths[grp, expmat], 'mtx')
 41 |         ) {
 42 |             return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(),
 43 |                 gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer()))
 44 |         }
 45 |         
 46 |         genes <- fread(paths[grp, genes], header = FALSE)$V1
 47 |         expmat <- readMM(paths[grp, expmat])
 48 |         expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
 49 |         genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
 50 |         genes <- update_symbols_fast(genes, alias_table)
 51 |         rownames(expmat) <- genes
 52 |         colnames(expmat) <- cells$cell_name
 53 |         
 54 |         # Remove low-complexity cells and normalise to log TPM/10:
 55 |         cells <- cells[col_nnz(expmat) >= 1000]
 56 |         if(nrow(cells) < 10) return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(),
 57 |             gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer()))
 58 |         expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
 59 |         
 60 |         setkey(cells, cell_name)
 61 |         scores[group == grp, c('cell_type', 'sample') := do.call(`[`, list(cells, cell_name))[, .(cell_type, sample)]]
 62 |         setkey(scores, sample, cell_name)
 63 |         # In the following, I'm using a threshold of 10, in a few places, for minimum number of cells.
 64 |         grp_out <- scores[group == grp][, # Chaining so we can use the key even if the same cell/sample names are present in multiple groups
 65 |             if(length(unique(cell_name)) >= 10) {
 66 |                 cat(unique(cell_type), '\n')
 67 |                 ids <- unique(.SD[, .(sample, cell_name)])
 68 |                 bounds <- ids[, .N, by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))]
 69 |                 ids <- ids[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .(cell_name) else .(cell_name = sample(cell_name, bounds[1]))}, keyby = sample]
 70 |                 ids_n <- ids[, .N, keyby = sample]
 71 |                 expmat_sub <- expmat[, ids$cell_name]
 72 |                 rm_all <- rowMeans(expmat_sub)
 73 |                 # The following binds together matrices across samples, which might lead to memory issues for large datasets. But we need it to
 74 |                 # compute the correlation across samples.
 75 |                 expmat_sub <- t(Reduce(cbind, lapply(ids_n$sample, function(smpl) {
 76 |                     if(ids_n[smpl, N] < 10) {
 77 |                         apply(expmat_sub[, ids[smpl, cell_name], drop = FALSE], 2, function(x) x - rm_all)
 78 |                     } else {
 79 |                         rm_smpl <- rowMeans(expmat_sub[, ids[smpl, cell_name]])
 80 |                         apply(expmat_sub[, ids[smpl, cell_name]], 2, function(x) x - rm_smpl)
 81 |                     }
 82 |                 })))
 83 |                 setkey(ids, NULL)
 84 |                 out <- .SD[ids, .(gene = rownames(expmat), corr = cor(score, expmat_sub)[1, ]), by = meta_program]
 85 |                 out[, c('n_cell', 'n_sample', 'n_sample_thresh') := ids_n[, .(nrow(ids), .N, sum(N >= 10))]]
 86 |                 out
 87 |             },
 88 |             by = cell_type
 89 |         ]
 90 |         
 91 |         grp_out[, c('group', 'group_name') := .(grp, paths[grp, group_name])]
 92 |         setcolorder(grp_out, c('group', 'group_name'))
 93 |         
 94 |         return(grp_out)
 95 |         
 96 |     }) %>% rbindlist(use.names = TRUE)
 97 |     
 98 |     if(nrow(out) > 0) fwrite(out, paste0(rdir, '/gene_mp_cor.csv'))
 99 |     
100 | }
101 | 


--------------------------------------------------------------------------------
/gene_mp_cor_all_web.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(stringr)
 4 | library(Matrix)
 5 | library(matkot)
 6 | 
 7 | source('functions.R')
 8 | 
 9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
10 | 
11 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
12 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
13 | 
14 | genes <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)]
15 | 
16 | to_include <- unique(paths_table[
17 |     cancer_type != 'Other/Models' &
18 |         !grepl('Unpublished', study) &
19 |         !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
20 |         !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
21 |     .(study, cancer_type)
22 | ])
23 | 
24 | gene_mp_cor <- lapply(transpose(as.list(to_include)), function(r) {
25 |     
26 |     cat(r, '\n')
27 |     if(!('gene_mp_cor.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
28 |     
29 |     rout <- fread(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_mp_cor.csv'))
30 |     rout <- rout[gene %in% genes]
31 |         
32 |     rout[, c('study', 'cancer_type') := .(r[1], r[2])]
33 |     setcolorder(rout, c('cancer_type', 'study'))
34 |     
35 | }) %>% rbindlist
36 | 
37 | dtkey <- unique(gene_mp_cor[, .(gene, cell_type, cancer_type, meta_program)])[order(gene, cell_type, cancer_type, meta_program)]
38 | gene_mp_cor_ave <- gene_mp_cor[
39 |     !is.na(corr),
40 |     if(sum(n_cell) >= 50 & sum(n_sample) >= 3) .(mean_corr = weighted.mean(corr, n_sample_thresh)),
41 |     keyby = .(gene, cell_type, cancer_type, meta_program)
42 | ]
43 | gene_mp_cor_ave <- gene_mp_cor_ave[dtkey]
44 | 
45 | fwrite(gene_mp_cor_ave, '../data/gene_mp_cor_all_web.csv')
46 | 
47 | # Save tables for individual genes, to speed up the writing of the Rmd files:
48 | setkey(gene_mp_cor_ave, gene)
49 | for(g in sort(genes[genes %in% gene_mp_cor_ave$gene])) {
50 |     cat(g, '\n')
51 |     fwrite(gene_mp_cor_ave[g, -'gene'], paste0('../data/gene_plots/gene_mp_cor/', g, '.csv'))
52 | }
53 | 


--------------------------------------------------------------------------------
/gene_plots_cell_types.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(ggplot2)
 4 | library(stringr)
 5 | library(Matrix)
 6 | library(matkot)
 7 | 
 8 | source('functions.R')
 9 | 
10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
11 | 
12 | 
13 | 
14 | 
15 | 
16 | # For each dataset, find cell types that constitute at least 1% of the data:
17 | cell_types_thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
18 |         
19 |     cat(r, '\n')
20 |     
21 |     cts <- lapply(paths_table[as.list(r), cells], function(p) {
22 |         cells <- suppressWarnings(fread(p, na.strings = ''))
23 |         if(!('cell_type' %in% names(cells))) return(NULL)
24 |         cells <- cells[!is.na(cell_type) & cell_type != 'Unassigned']
25 |         out <- cells[, .(N = .N/nrow(cells)), by = cell_type][N >= 0.01, cell_type]
26 |         # Check if 'Epithelial' cells are definitely normal epithelial cells:
27 |         if(
28 |             'Epithelial' %in% out && (
29 |                 ('malignant' %in% names(cells) && cells[cell_type == 'Epithelial' & malignant == 'no', .N/nrow(cells)] < 0.01) |
30 |                     !('Malignant' %in% cells$cell_type)
31 |             )
32 |         ) {out <- out[out != 'Epithelial']}
33 |         if(length(out) > 0) return(out)
34 |     })
35 |     
36 |     cts <- cts[!sapply(cts, is.null)]
37 |     
38 |     if(length(cts) > 0) return(Reduce(intersect, cts))
39 |     
40 | })
41 | 
42 | # Take cell types that pass the 1% threshold in at least 5 datasets:
43 | cell_types <- table(unlist(cell_types_thresh[!sapply(cell_types_thresh, is.null)]))
44 | cell_types <- names(cell_types)[cell_types >= 5 & names(cell_types) != '']
45 | 
46 | saveRDS(cell_types, '../data/gene_plots_cell_types.rds')
47 | 


--------------------------------------------------------------------------------
/gene_plots_data_all_web.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(stringr)
  4 | library(Matrix)
  5 | library(matkot)
  6 | 
  7 | source('functions.R')
  8 | 
  9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
 10 | 
 11 | cell_types <- readRDS('../data/gene_plots_cell_types.rds')
 12 | 
 13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
 14 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
 15 | 
 16 | # The following averages over studies for a given cancer type:
 17 | 
 18 | to_include <- unique(paths_table[
 19 |     cancer_type != 'Other/Models' &
 20 |         !grepl('Unpublished', study) &
 21 |         !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
 22 |         !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
 23 |     .(study, cancer_type)
 24 | ])
 25 | 
 26 | gene_ave <- lapply(transpose(as.list(to_include)), function(r) {
 27 |     
 28 |     cat(r, '\n')
 29 |     if(!('gene_ave.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
 30 |     
 31 |     rout <- fread(
 32 |         paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_ave.csv'),
 33 |         colClasses = c(cell_type = 'character', symbol = 'character'),
 34 |         key = c('cell_type', 'symbol')
 35 |     )[cell_type %in% cell_types]
 36 |     rout[, c('study', 'cancer_type') := as.list(r)]
 37 |     
 38 |     samples_path <- paste0('/home/labs/tirosh/shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
 39 |     samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '')
 40 |     samples <- samples[!is.na(sample) & !is.na(cancer_type) & !(cancer_type %in% c('Normal', 'Premalignant'))]
 41 |     if(nrow(samples) == 0) return(NULL)
 42 |     
 43 |     rout <- rout[sample %in% samples$sample] # This excludes the sample == 'all' category
 44 |     
 45 |     if(all(r == c('Jerby-Arnon et al. 2021', 'Sarcoma'))) { # This dataset is unusual because it has the same sample names in 10x and SS2 datasets
 46 |         rout[, tech := group_name]
 47 |     } else {
 48 |         setkey(samples, sample)
 49 |         rout[, tech := do.call(`[`, list(samples, sample))$technology]
 50 |     }
 51 |     
 52 |     setcolorder(rout, c('cancer_type', 'study', 'group', 'group_name', 'tech'))
 53 |     
 54 |     return(rout)
 55 |     
 56 | }) %>% rbindlist
 57 | 
 58 | gene_ave[
 59 |     cell_type %in% c('Macrophage', 'Myeloid', 'Monocyte'),
 60 |     c('cell_type', 'n_cell', 'ave', 'prop_pos') := .('Macrophage', sum(n_cell), sum(ave*n_cell)/sum(n_cell), sum(prop_pos*n_cell)/sum(n_cell)),
 61 |     by = .(cancer_type, study, tech, sample, symbol)
 62 | ]
 63 | gene_ave <- unique(gene_ave)
 64 | 
 65 | gene_ave <- gene_ave[!(cancer_type == 'Brain' & cell_type == 'Fibroblast')]
 66 | 
 67 | gene_ave <- gene_ave[symbol %in% hgnc_complete_set$symbol]
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | # Retain genes that have at least one value in all but at most 3 cancer types:
 74 | gene_ave_study <- gene_ave[symbol %in% gene_ave[, .(n = length(unique(cancer_type))), by = symbol][n >= max(n) - 3, symbol]]
 75 | 
 76 | gene_ave_study <- gene_ave_study[,
 77 |     if(sum(n_cell) >= 10) .(ave = sum(ave*n_cell)/sum(n_cell), prop_pos = sum(prop_pos*n_cell)/sum(n_cell), n_cell = sum(n_cell), n_sample = .N,
 78 |         n_sample_thresh = sum(n_cell >= 10)),
 79 |     by = .(symbol, cell_type, cancer_type, study, tech) # Mean across samples for each study, cancer type and tech (require >=10 cells in each case)
 80 | ][,
 81 |     .(ave = mean(ave), prop_pos = mean(prop_pos), n_cell = sum(n_cell), n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)),
 82 |     by = .(symbol, cell_type, cancer_type, study) # Mean across datasets of the same study and cancer type but different tech
 83 | ]
 84 | 
 85 | gene_ave_all <- gene_ave_study[,
 86 |     .(study = 'all', ave = weighted.mean(ave, n_sample_thresh + 1), prop_pos = weighted.mean(prop_pos, n_sample_thresh + 1), n_cell = sum(n_cell),
 87 |         n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)),
 88 |     by = .(symbol, cell_type, cancer_type) # Weighted mean across studies of the same disease, weighted by (number of samples with >= 10 cells) + 1
 89 | ]
 90 | 
 91 | gene_plots_data_all_web <- rbind(gene_ave_study, gene_ave_all, use.names = TRUE)
 92 | 
 93 | fwrite(gene_plots_data_all_web, '../data/gene_plots_data_all_web.csv')
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | unique_ct <- gene_plots_data_all_web[study == 'all', setNames(CJ(unique(cell_type), unique(cancer_type)), c('cell_type', 'cancer_type'))]
100 | unique_ct[, study := 'all']
101 | unique_study <- gene_plots_data_all_web[
102 |     study != 'all',
103 |     setNames(CJ(unique(cell_type), unique(paste(study, cancer_type, sep = ' - '))), c('cell_type', 'study'))
104 | ][, c('study', 'cancer_type') := as.data.table(str_split_fixed(study, ' - ', 2))]
105 | unique_dt <- rbind(unique_ct, unique_study, use.names = TRUE)
106 | 
107 | # Save tables for individual genes, to speed up the writing of the Rmd files:
108 | setkey(gene_plots_data_all_web, cell_type, cancer_type, study)
109 | for(g in gene_plots_data_all_web[, sort(unique(symbol))]) {
110 |     cat(g, '\n')
111 |     # No gene names have underscores in them: sum(grepl('_', gene_plots_data_all_web[, sort(unique(symbol))]))
112 |     # Deal with gene names with slashes in them by replacing with underscore:
113 |     fwrite(
114 |         gene_plots_data_all_web[symbol == g, -'symbol'][unique_dt],
115 |         paste0('../data/gene_plots/gene_plots_data_all_web/', gsub('/', '_', g), '.csv')
116 |     )
117 | }
118 | 


--------------------------------------------------------------------------------
/gene_plots_render_web.R:
--------------------------------------------------------------------------------
 1 | n = commandArgs(trailingOnly = TRUE)
 2 | 
 3 | genes_n <- gsub('.csv$', '', dir('../data/gene_plots/gene_plots_data_all_web'))
 4 | genes_n <- genes_n[cut(1:length(genes_n), 500, labels = FALSE) == n]
 5 | genes_n <- genes_n[genes_n != 'backup']
 6 | for(g in genes_n) {
 7 |     if(paste0(g, '.Rmd') %in% dir('../data/gene_plots/rmds_web') & paste0(g, '.rds') %in% dir('../data/gene_plots/rds_plots_web')) {
 8 |         rmarkdown::render(
 9 |             paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/rmds_web/', g, '.Rmd'),
10 |             intermediates_dir = paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web/tmp/', g),
11 |             output_dir = '/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web'
12 |         )
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/gene_plots_rmd_web.R:
--------------------------------------------------------------------------------
  1 | n = commandArgs(trailingOnly = TRUE)
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | library(data.table)
  8 | library(magrittr)
  9 | library(ggplot2)
 10 | library(stringr)
 11 | library(plyr)
 12 | library(RColorBrewer)
 13 | library(scales)
 14 | library(cowplot)
 15 | library(matkot)
 16 | 
 17 | source('functions.R')
 18 | 
 19 | aliases <- fread('../data/alias_table.csv')
 20 | 
 21 | 
 22 | 
 23 | 
 24 | 
 25 | genes_n <- gsub('.csv$', '', gsub('_', '/', dir('../data/gene_plots/gene_plots_data_all_web')))
 26 | genes_n <- genes_n[cut(1:length(genes_n), 200, labels = FALSE) == n]
 27 | genes_n <- genes_n[genes_n != 'backup']
 28 | for(g in genes_n) {
 29 |     
 30 |     cat(g, '\n')
 31 |     
 32 |     data_g <- fread(paste0('../data/gene_plots/gene_plots_data_all_web/', g, '.csv'))
 33 |     data_g[, cell_type := gsub('_', ' ', cell_type)]
 34 |     
 35 |     data_ct <- data_g[study == 'all', -'study']
 36 |     data_study <- data_g[study != 'all']
 37 |     
 38 |     data_ct[, facet_y := 'c']
 39 |     data_ct <- rbind(
 40 |         data_ct,
 41 |         cbind(
 42 |             cancer_type = 'All cancers',
 43 |             data_ct[, .(ave = mean(ave[!is.na(ave)]), prop_pos = mean(prop_pos[!is.na(prop_pos)]), n_cell = sum(n_cell), n_sample = sum(n_sample),
 44 |                 n_sample_thresh = sum(n_sample_thresh)), by = cell_type]
 45 |         )[, facet_y := 'a'],
 46 |         use.names = TRUE
 47 |     )
 48 |     data_ct[,
 49 |         c('facet_y', 'cell_type', 'cancer_type') := .(
 50 |             factor(facet_y, levels = c('c', 'a')),
 51 |             factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma', 'Dendritic', 'Mast',
 52 |                 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte')),
 53 |             factor(cancer_type, levels = sort(unique(cancer_type), decreasing = TRUE))
 54 |         )
 55 |     ]
 56 |     
 57 |     data_study[, cell_type := factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma',
 58 |         'Dendritic', 'Mast', 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte'))]
 59 |     data_study[, study := factor(study, levels = rev(sort(unique(study))))] # So that studies go alphabetically from the top
 60 |     
 61 |     # Cut cancer types into groups for separate pages:
 62 |     ct_groups <- list(character(0))
 63 |     i <- 0
 64 |     j <- 1
 65 |     for(ct in data_study[, sort(unique(as.character(cancer_type)))]) {
 66 |         ns <- data_study[cancer_type == ct, length(unique(study))]
 67 |         if(30 - i >= ns) {
 68 |             ct_groups[[j]] <- c(ct_groups[[j]], ct)
 69 |             i <- i + ns
 70 |         } else {
 71 |             j <- j + 1
 72 |             ct_groups[[j]] <- ct
 73 |             i <- ns
 74 |         }
 75 |     }
 76 |     
 77 |     mp_cor_g <- fread(paste0('../data/gene_plots/gene_mp_cor/', g, '.csv'))
 78 |     mp_cor_g[, cell_type := gsub('_', ' ', cell_type)]
 79 |     mp_cor_cts <- mp_cor_g[, {uct <- unique(cell_type); if('Malignant' %in% uct) c('Malignant', sort(uct[uct != 'Malignant'])) else sort(uct)}]
 80 |     
 81 |     plots <- list(
 82 |         dotplot = ggplot(data_ct, aes(x = cell_type, y = cancer_type)) +
 83 |             geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') +
 84 |             facet_grid(rows = vars(facet_y), scales = 'free', space = 'free') +
 85 |             scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) +
 86 |             scale_y_discrete(labels = c('All cancers' = expression(bold('All cancers')), parse = TRUE)) +
 87 |             scale_fill_gradientn(
 88 |                 colours = brewer.pal(9, 'YlOrRd'),
 89 |                 limits = c(0, 8),
 90 |                 breaks = c(0, 2, 4, 6, 8),
 91 |                 labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'),
 92 |                 oob = squish
 93 |             ) +
 94 |             scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) +
 95 |             theme_bw() +
 96 |             theme(
 97 |                 axis.text.x = element_text(angle = 55, hjust = 1),
 98 |                 axis.title.x = element_blank(),
 99 |                 axis.title.y = element_text(margin = margin(r = 10)),
100 |                 panel.grid = element_line(linewidth = 0.4),
101 |                 strip.background = element_blank(),
102 |                 strip.text = element_blank()
103 |             ) +
104 |             guides(fill = guide_colourbar(frame.colour = 'black')) +
105 |             labs(y = 'Cancer type', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells'),
106 |         dotplot_stdy = lapply(ct_groups, function(grp) {
107 |             pdata <- data_study[cancer_type %in% grp]
108 |             ggplot(pdata, aes(x = cell_type, y = study)) +
109 |                 geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') +
110 |                 facet_grid(rows = vars(cancer_type), scales = 'free', space = 'free') +
111 |                 scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) +
112 |                 scale_fill_gradientn(
113 |                     colours = brewer.pal(9, 'YlOrRd'),
114 |                     limits = c(0, 8),
115 |                     breaks = c(0, 2, 4, 6, 8),
116 |                     labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'),
117 |                     oob = squish
118 |                 ) +
119 |                 scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) +
120 |                 theme_bw() +
121 |                 theme(
122 |                     axis.text.x = element_text(angle = 55, hjust = 1),
123 |                     axis.title.x = element_blank(),
124 |                     axis.title.y = element_text(margin = margin(r = 10)),
125 |                     panel.grid = element_line(linewidth = 0.4),
126 |                     strip.background = element_rect(colour = NA, fill = NA),
127 |                     strip.text = element_text(vjust = 0),
128 |                     legend.justification = ifelse(data_study[cancer_type %in% grp, length(unique(study)) <= 14], 'top', 'center')
129 |                 ) +
130 |                 guides(fill = guide_colourbar(frame.colour = 'black')) +
131 |                 labs(y = 'Study', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells')
132 |         }),
133 |         htmp = slapply(mp_cor_cts, function(ct) {
134 |             ct_data <- mp_cor_g[cell_type == ct]
135 |             setkey(ct_data, cancer_type, meta_program)
136 |             ct_data <- ct_data[CJ(unique(cancer_type), unique(meta_program))]
137 |             ct_data[, meta_program := factor(meta_program, levels = rev(sort(unique(meta_program))))]
138 |             ct_title <- mapvalues(ct, mp_cor_cts, c('Malignant cells', 'B cells', 'Endothelial cells', 'Epithelial cells', 'Fibroblasts',
139 |                 'Macrophages', 'T cells'), warn_missing = FALSE)
140 |             ggplot(ct_data) +
141 |                 geom_tile(aes(x = cancer_type, y = meta_program, fill = as.numeric(mean_corr), colour = 'a')) + # as.numeric() in case all NA
142 |                 scale_x_discrete(expand = c(0, 0)) +
143 |                 scale_y_discrete(expand = c(0, 0)) +
144 |                 scale_colour_manual(name = NULL, values = c(a = 'grey90'), labels = c(a = 'NA')) +
145 |                 scale_fill_gradientn(colours = rev(brewer.pal(9, 'RdBu')), limits = c(-0.6, 0.6), breaks = c(-0.6, -0.3, 0, 0.3, 0.6),
146 |                     oob = squish, na.value = 'grey80') +
147 |                 guides(fill = guide_colourbar(order = 1, frame.colour = 'black'),
148 |                     colour = guide_legend(override.aes = list(fill = 'grey80'), order = 2)) +
149 |                 theme(panel.border = element_rect(fill = NA, colour = 'black'), axis.text.x = element_text(angle = 35, hjust = 1),
150 |                     axis.title.x = element_text(margin = margin(t = 10)), axis.title.y = element_text(margin = margin(r = 10))) +
151 |                 labs(x = 'Cancer type', y = 'Meta-program', fill = 'Average\ncorrelation', title = ct_title)
152 |         })
153 |     )
154 |     
155 |     saveRDS(plots, paste0('../data/gene_plots/rds_plots_web/', g, '.rds'))
156 |     
157 |     rmdlines <- c('---\n', 'title: "**', g, '**"\n')
158 |     
159 |     if(g %in% aliases$symbol) rmdlines <- c(rmdlines, 'subtitle: "', paste(aliases[symbol == g, symbol_alt], collapse = ', '), '"\n')
160 |     
161 |     rmdlines <- c(
162 |         rmdlines,
163 |         'header-includes:\n',
164 |         ' \\renewcommand{\\familydefault}{\\sfdefault}\n',
165 |         ' \\pagenumbering{gobble}\n',
166 |         'geometry: margin=1cm\n',
167 |         'output: pdf_document\n',
168 |         'papersize: a4\n',
169 |         '---\n',
170 |         '\n',
171 |         '```{r setup, include = FALSE}\n',
172 |         'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
173 |         '```\n',
174 |         '\n',
175 |         '```{r}\n',
176 |         'plots <- readRDS("../rds_plots_web/', g, '.rds")\n', # This path has to be relative to where the Rmd file will be
177 |         '```\n',
178 |         '\n',
179 |         '## **A**\n',
180 |         '\n',
181 |         '```{r fig.align="center", fig.height=', 165/25.4, ', fig.width=', 180/25.4, '}\n',
182 |         'plots$dotplot\n',
183 |         '```\n',
184 |         '\n',
185 |         '\\vspace*{\\fill}\n',
186 |         '\n',
187 |         '**A. Expression of ', g, ' per cell type and cancer type.** Plot showing the average expression level of ', g, ' and the percentage of ',
188 |         'cells expressing ', g, ' in each cancer type and each of the most common cell types, namely those cell types constituting at least 1% of ',
189 |         'at least 5 individual datasets. Expression levels are defined as log2(TPM/10). Average expression levels and percentages were measured ',
190 |         'per cell type within each dataset, then averaged across studies within each cancer type.\n'
191 |     )
192 |     
193 |     for(i in 1:length(plots$dotplot_stdy)) {
194 |         plot_title <- ifelse(i == 1, '**B**', '**B (continued)**')
195 |         plot_height <- 30 + 7.5*data_study[cancer_type %in% ct_groups[[i]], length(unique(study))] + 2*(length(ct_groups[[i]]) - 1)
196 |         legend_space <- ifelse(i == length(plots$dotplot_stdy), 20, 0)
197 |         rmdlines <- c(
198 |             rmdlines,
199 |             '\n',
200 |             '\\newpage\n',
201 |             '\n',
202 |             '## ', plot_title, '\n',
203 |             '\n',
204 |             '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n',
205 |             'cowplot::plot_grid(plots$dotplot_stdy[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ',
206 |             267 - legend_space -  plot_height, '))\n',
207 |             '```\n'
208 |         )
209 |     }
210 |     
211 |     rmdlines <- c(
212 |         rmdlines,
213 |         '\n',
214 |         '\\vspace*{\\fill}\n',
215 |         '\n',
216 |         '**B. Expression of ', g, ' per cell type and study.** Plot showing the average expression level of ', g, ' and the percentage of cells ',
217 |         'expressing ', g, ' in each study and each of the cell types shown in A. Expression levels are defined as log2(TPM/10).\n'
218 |     )
219 |     
220 |     for(i in 1:length(plots$htmp)) {
221 |         ct <- names(plots$htmp)[i]
222 |         plot_title <- ifelse(i == 1, '**C**', '**C (continued)**')
223 |         plot_height <- 30 + 5*mp_cor_g[cell_type == ct, length(unique(meta_program))]
224 |         legend_space <- ifelse(i == length(plots$htmp), 20, 0)
225 |         rmdlines <- c(
226 |             rmdlines,
227 |             '\n',
228 |             '\\newpage\n',
229 |             '\n',
230 |             '## ', plot_title, '\n',
231 |             '\n',
232 |             '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n',
233 |             'cowplot::plot_grid(plots$htmp[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ',
234 |             267 - legend_space -  plot_height, '))\n',
235 |             '```\n'
236 |         )
237 |     }
238 |     
239 |     rmdlines <- c(
240 |         rmdlines,
241 |         '\n',
242 |         '\\vspace*{\\fill}\n',
243 |         '\n',
244 |         '**C. Correlation of ', g, ' with meta-programs.** Heatmaps showing, for each cell type, the average Pearson correlation of ', g,
245 |         ' with meta-program scores in each cancer type. NAs indicate insufficient data or zero variance.\n'
246 |     )
247 |     
248 |     out_con <- file(paste0('../data/gene_plots/rmds_web/', g, '.Rmd'))
249 |     writeLines(rmdlines, con = out_con, sep = '')
250 |     close(out_con)
251 |     
252 | }
253 | 


--------------------------------------------------------------------------------
/mp_scores_all.R:
--------------------------------------------------------------------------------
  1 | # The command line arguments should supply study name and cancer type (in that order).
  2 | r = commandArgs(trailingOnly = TRUE)
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | library(data.table)
  9 | library(magrittr)
 10 | library(readxl)
 11 | library(plyr)
 12 | library(stringr)
 13 | library(Matrix)
 14 | library(matkot)
 15 | 
 16 | source('functions.R')
 17 | 
 18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 19 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
 20 | 
 21 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
 22 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
 23 | alias_table <- make_alias_table(hgnc_complete_set)
 24 | 
 25 | meta_programs <- lapply(
 26 |     c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'),
 27 |     function(ct) {
 28 |         dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct))
 29 |         dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)]
 30 |         return(dt)
 31 |     }
 32 | )
 33 | 
 34 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category:
 35 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)])
 36 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)')
 37 | names(meta_programs[[2]])[3] <- 'Cell Cycle'
 38 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle')
 39 | names(meta_programs[[4]])[5] <- 'Cell Cycle'
 40 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism')
 41 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response')
 42 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]]))
 43 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle')
 44 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]]))
 45 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock')
 46 | 
 47 | # Merge CD4 and CD8 T cells:
 48 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]])
 49 | meta_programs <- meta_programs[1:7]
 50 | 
 51 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
 52 | names(meta_programs) <- cts
 53 | 
 54 | 
 55 | 
 56 | 
 57 | 
 58 | rdir <- paste0('../study_plots/', gsub('/', '-', r[2]), '/', r[1])
 59 | 
 60 | gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)]
 61 | 
 62 | if(nrow(gene_ave) > 0) {
 63 |     bins_nm <- fread('../data/bins_nm.csv')
 64 |     gene_ave_list <- list(gene_ave[cell_type == 'Malignant' & n_cell >= 10, .(group, symbol, sample, cell_type, ave)])
 65 |     if(unique(gene_ave[cell_type == 'Malignant', .(group, sample, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list(
 66 |         gene_ave[
 67 |             cell_type == 'Malignant',
 68 |             .(group = 'all', sample = 'all', cell_type = 'Malignant', ave = sum(ave*n_cell)/sum(n_cell)),
 69 |             by = symbol
 70 |         ]
 71 |     ))
 72 |     gene_ave_list <- c(gene_ave_list, list(gene_ave[,
 73 |         if(unique(.SD[, .(cell_type, n_cell)])[, sum(n_cell) >= 10]) .SD[, .(cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol],
 74 |         by = .(group, sample)
 75 |     ]))
 76 |     if(unique(gene_ave[, .(sample, cell_type, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list(
 77 |         gene_ave[, .(group = 'all', sample = 'all', cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol]
 78 |     ))
 79 |     gene_ave <- rbindlist(gene_ave_list, use.names = TRUE)
 80 |     set.seed(7327) # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically:
 81 |     gene_ave[sample(1:.N), ave_rank := order(order(ave))/.N, by = .(group, sample, cell_type)]
 82 |     bins_m <- gene_ave[
 83 |         cell_type == 'Malignant',
 84 |         cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)),
 85 |         by = .(group, sample),
 86 |         .SDcols = -'cell_type'
 87 |     ]
 88 |     bins_all <- gene_ave[
 89 |         cell_type == 'all',
 90 |         cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)),
 91 |         by = .(group, sample),
 92 |         .SDcols = -'cell_type'
 93 |     ]
 94 | }
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | if(nrow(gene_ave) > 0) out <- lapply(paths$group, function(g) {
101 |     
102 |     cat('Preparing data\n')
103 |     cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
104 |     if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL)
105 |     if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
106 |     genes <- fread(paths[g, genes], header = FALSE)$V1
107 |     expmat <- readMM(paths[g, expmat])
108 |     expmat <- expmat[genes != '', ] # In case any genes are empty strings
109 |     genes <- genes[genes != '']
110 |     expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names
111 |     genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
112 |     genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
113 |     rownames(expmat) <- genes
114 |     colnames(expmat) <- cells$cell_name
115 |     cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells
116 |     if(nrow(cells) < 30) return(NULL)
117 |     expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
118 |     
119 |     if(sum(unique(bins_nm$symbol) %in% genes) < 1000) bins_nm[, symbol := str_to_title(symbol)] # In case this is mouse data
120 |     if(sum(unique(bins_nm$symbol) %in% genes) < 1000) return(NULL)
121 |     bins_nm <- bins_nm[symbol %in% genes, .(group, symbol, bin = as.character(bin))]
122 |     bins_m <- bins_m[symbol %in% genes & group %in% c(g, 'all'), .(group = mapvalues(sample, 'all', 'Malignant'), symbol, bin = as.character(bin))]
123 |     bins_all <- bins_all[symbol %in% genes & group %in% c(g, 'all'), .(group = sample, symbol, bin = as.character(bin))]
124 |     
125 |     # Consolidate bins tables (and add <group> column to <cells>, for easy matching with <bins>):
126 |     cat('Consolidating bins tables\n')
127 |     if(all(unique(cells$sample) %in% bins_m$group)) {
128 |         bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m[group != 'Malignant'])
129 |         cells[cell_type %in% c(unique(bins_nm$group), 'Malignant'), group := ifelse(cell_type == 'Malignant', sample, cell_type)]
130 |     } else { # Use "Malignant" group for samples with too few malignant cells:
131 |         bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m)
132 |         cells[
133 |             cell_type %in% c(unique(bins_nm$group), 'Malignant'),
134 |             group := ifelse(cell_type != 'Malignant' | !(sample %in% bins_m$group), cell_type, sample)
135 |         ]
136 |     }
137 |     if(!all(cells[cell_type != 'Malignant', unique(cell_type)] %in% bins_nm$group)) {
138 |         bins <- rbind(bins, bins_all[group == 'all']) # Use "all" group for non-malignant cell types not listed in bins_nm
139 |         cells[cell_type != 'Malignant' & !(cell_type %in% bins_nm$group), group := 'all']
140 |     }
141 |     
142 |     setkey(bins, group, symbol)
143 |     bins_copy <- copy(bins); setkey(bins_copy, group, bin)
144 |     
145 |     cat('Computing bin averages\n')
146 |     set.seed(9432)
147 |     bin_aves <- cells[, {
148 |         ct <- unique(cell_type)
149 |         cat(ct, '\n')
150 |         ct_genes <- sort(unique(unlist(meta_programs[[ct]])))
151 |         if(sum(ct_genes %in% genes) < 50) ct_genes <- str_to_title(ct_genes) # In case this is mouse data
152 |         ct_genes <- ct_genes[ct_genes %in% genes]
153 |         if(length(ct_genes) < 50) NULL else {
154 |             .SD[, {
155 |                 grp <- unique(group)
156 |                 cat('\t', grp, '\n')
157 |                 sdout <- bins_copy[
158 |                     .(grp, bins[.(grp, ct_genes), as.character(bin)]),
159 |                     if(.N >= 50) {
160 |                         if(.N <= 100) {
161 |                             .(gene = ct_genes[.GRP], bin_genes = symbol)
162 |                         } else .(gene = ct_genes[.GRP], bin_genes = sample(symbol, 100, replace = FALSE))
163 |                     },
164 |                     by = .EACHI
165 |                 ]
166 |                 expmat_sub <- expmat[unique(sdout$bin_genes), cell_name, drop = FALSE]
167 |                 sdout[, .(cell_name = cell_name, ave = colMeans(expmat_sub[bin_genes, , drop = FALSE])), by = gene]
168 |             }, by = group]
169 |         }
170 |     }, by = cell_type]
171 |     
172 |     bin_ave_mat <- dcast(bin_aves[, .(gene, cell_name, ave)], gene ~ cell_name)[, set_rownames(as.matrix(.SD), gene), .SDcols = -'gene']
173 |     
174 |     cat('Computing scores\n')
175 |     scores <- cells[, {
176 |         ct <- unique(cell_type)
177 |         cat(ct, '\n')
178 |         c(
179 |             .(cell_name = cell_name),
180 |             slapply(names(meta_programs[[ct]]), function(mp_name) {
181 |                 cat('\t', mp_name, '\n')
182 |                 mp <- meta_programs[[ct]][[mp_name]]
183 |                 if(sum(mp %in% genes & mp %in% bin_aves$gene) < 10) mp <- str_to_title(mp)
184 |                 mp <- mp[mp %in% genes & mp %in% bin_aves$gene]
185 |                 if(length(mp) >= 30) {
186 |                     xout <- expmat[mp, cell_name, drop = FALSE] - bin_ave_mat[mp, cell_name, drop = FALSE]
187 |                     xout[xout > 3] <- 3; xout[xout < -3] <- 3 # Cap relative expression levels to reduce influence of extreme values.
188 |                     colMeans(xout, na.rm = TRUE) # Vector of scores - values for some cells/groups may be NA/NaN.
189 |                 }
190 |             })
191 |         ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE)
192 |     }, by = cell_type]
193 |     
194 |     scores <- scores[, -'cell_type']
195 |     scores[, group := g]
196 |     setcolorder(scores, 'group')
197 |     return(scores)
198 |     
199 | }) %>% rbindlist
200 | 
201 | if(nrow(gene_ave) > 0) fwrite(out, paste0(rdir, '/mp_scores_all.csv'))
202 | 


--------------------------------------------------------------------------------
/mp_scores_prep.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(magrittr)
 3 | library(matkot)
 4 | 
 5 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
 6 | 
 7 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) tryCatch({
 8 |     cat(r, '-')
 9 |     d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
10 |     if('data_dist.rds' %in% dir(d)) {
11 |         data_dist <- readRDS(paste0(d, '/data_dist.rds'))
12 |         data_dist_cond <- sapply(data_dist, function(x) is.null(x) | is.null(x$scores))
13 |         if(!all(data_dist_cond)) {
14 |             for(i in (1:length(data_dist))[!data_dist_cond]) { # Indices where data is not NULL
15 |                 out <- data_dist[[i]]$scores[, .(cell_name, sample, cell_type, meta_program, score = round(score, 4))]
16 |                 if(sum(!data_dist_cond) > 1) {
17 |                     if(!('MP scores' %in% dir(d))) dir.create(paste0(d, '/MP scores'))
18 |                     suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)]
19 |                     fwrite(out, paste0(d, '/MP scores/MP_scores_', suff, '.csv'))
20 |                 } else fwrite(out, paste0(d, '/MP scores.csv'))
21 |             }
22 |         } else cat('No MP score data')
23 |     } else cat('No MP score data')
24 |     cat('\n')
25 | }, error = function(e) print('ERROR\n'))
26 | 


--------------------------------------------------------------------------------
/study_plots_cc.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(ggplot2)
  3 | library(magrittr)
  4 | library(Matrix)
  5 | library(stringr)
  6 | library(plyr)
  7 | library(cowplot)
  8 | library(RColorBrewer)
  9 | library(scales)
 10 | library(gtable)
 11 | library(grid)
 12 | library(gridExtra)
 13 | library(matkot)
 14 | 
 15 | source('functions.R')
 16 | 
 17 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
 18 | 
 19 | ct_to_title <- function(ct) {
 20 |     if(grepl('ic$|al$|ar$|oid$|crine$|ant$', ct) | ct %in% c('Mast', 'Plasma', 'Immune', 'Stem', 'Schwann', 'Langerhans', 'Tuft', 'Stellate')) {
 21 |         return(paste0(ct, ' cells'))
 22 |     } else {
 23 |         return(paste0(gsub('_', ' ', ct), 's'))
 24 |     }
 25 | }
 26 | 
 27 | 
 28 | 
 29 | 
 30 | 
 31 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
 32 |     
 33 |     cat(r, '\n')
 34 |     
 35 |     if(all(c('data_cc.RDS', 'Cell cycle.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
 36 |         
 37 |         plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
 38 |         nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 39 |         
 40 |         if(!all(nullcond)) {
 41 |             
 42 |             suff <- paste(gsub(' et al. ', '', r[1]), gsub('/| ', '-', r[2]), sep = '_')
 43 |             
 44 |             out <- lapply(which(!nullcond), function(i) {
 45 |                 
 46 |                 # To use in plot titles:
 47 |                 if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
 48 |                     if(paths_table[as.list(r), .N] > 1) {
 49 |                         title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
 50 |                     } else {
 51 |                         title_tail <- r[1]
 52 |                     }
 53 |                 } else {
 54 |                     if(paths_table[as.list(r), .N] > 1) {
 55 |                         title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
 56 |                             group_name])
 57 |                     } else {
 58 |                         title_tail <- paste0(r[1], ', ', r[2])
 59 |                     }
 60 |                 }
 61 |                 
 62 |                 plot_out <- do.call(cc_plot, args = plot_data[[i]][c('ccdata', 'hdata', 'g1s', 'g2m')])
 63 |                 plot_title <- paste0('Cell cycle activity in ', title_tail)
 64 |                 return(list(plot_out = plot_out, plot_title = plot_title))
 65 |                 
 66 |             })
 67 |             
 68 |             if(!all(sapply(out, function(x) is.null(x$plot_out)))) {
 69 |                 
 70 |                 rmdlines <- c(
 71 |                     '---\n',
 72 |                     'title: ""\n',
 73 |                     'header-includes:\n',
 74 |                     ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
 75 |                     ' \\pagenumbering{gobble}\n',
 76 |                     'geometry: margin=1cm\n',
 77 |                     'output: pdf_document\n',
 78 |                     'papersize: a4\n',
 79 |                     '---\n',
 80 |                     '\n',
 81 |                     '```{r setup, include = FALSE}\n',
 82 |                     'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
 83 |                     '```\n'
 84 |                 )
 85 |                 for(i in 1:length(out)) {
 86 |                     if(is.null(out[[i]]$plot_out)) next
 87 |                     saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '_', suff, '.rds'))
 88 |                     if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
 89 |                     rmdlines <- c(
 90 |                         rmdlines,
 91 |                         '\n',
 92 |                         '```{r}\n',
 93 |                         'plots <- readRDS("comp_plots_', i, '_', suff, '.rds")\n',
 94 |                         '```\n'
 95 |                     )
 96 |                     if(names(out[[i]]$plot_out$scatter)[1] == 'data') {
 97 |                         # In this case there's only one scatter plot ('data' is the 1st element of a ggplot object), and there will be a heatmap.
 98 |                         rmdlines <- c(
 99 |                             rmdlines,
100 |                             '\n',
101 |                             '### ', out[[i]]$plot_title, '\n',
102 |                             '\n',
103 |                             '### **A**\n',
104 |                             '\n',
105 |                             '```{r fig.align="center", fig.height=', 110/25.4, ', fig.width=', 170/25.4, ', out.width="70%", out.height="70%"}\n',
106 |                             'plots$scatter\n',
107 |                             '```\n',
108 |                             '\n',
109 |                             '### **B**\n',
110 |                             '\n',
111 |                             '```{r fig.align="center", fig.height=', 175/25.4, ', fig.width=', 210/25.4, ', out.width="80%", out.height="80%"}\n',
112 |                             'plots$heatmap\n',
113 |                             '```\n',
114 |                             '\n',
115 |                             '\\vspace*{\\fill}\n', # Aligns text with bottom of page
116 |                             '\n',
117 |                             '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual cells ',
118 |                             '(points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which are ',
119 |                             'assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in individual cells ',
120 |                             '(columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase (indicated by ',
121 |                             'the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n'
122 |                         )
123 |                     } else { # In this case every element is a separate scatter plot, corresponding to a cell type, possibly with no heatmap.
124 |                         cts <- names(out[[i]]$plot_out$scatter)
125 |                         if('Malignant' %in% cts) {cts <- c('Malignant', sort(cts[cts != 'Malignant']))} else {cts <- sort(cts)}
126 |                         for(j in 1:length(out[[i]]$plot_out$scatter)) {
127 |                             if(j > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
128 |                             ct <- cts[j]
129 |                             plot_title <- paste0(out[[i]]$plot_title, ' - ', ct_to_title(ct))
130 |                             if(ct %in% names(out[[i]]$plot_out$heatmap)) {
131 |                                 rmdlines <- c(
132 |                                     rmdlines,
133 |                                     '\n',
134 |                                     '### ', plot_title, '\n',
135 |                                     '\n',
136 |                                     '### **A**\n',
137 |                                     '\n',
138 |                                     '```{r fig.align="center", fig.height=',
139 |                                     110/25.4,
140 |                                     ', fig.width=',
141 |                                     170/25.4,
142 |                                     ', out.width="70%", out.height="70%"}\n',
143 |                                     'plots$scatter[["', ct, '"]]\n',
144 |                                     '```\n',
145 |                                     '\n',
146 |                                     '### **B**\n',
147 |                                     '\n',
148 |                                     '```{r fig.align="center", fig.height=',
149 |                                     175/25.4,
150 |                                     ', fig.width=',
151 |                                     210/25.4,
152 |                                     ', out.width="80%", out.height="80%"}\n',
153 |                                     'plots$heatmap[["', ct, '"]]\n',
154 |                                     '```\n',
155 |                                     '\n',
156 |                                     '\\vspace*{\\fill}\n', # Aligns text with bottom of page
157 |                                     '\n',
158 |                                     '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ',
159 |                                     ct_to_title(ct),
160 |                                     ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells ',
161 |                                     'which are assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in ',
162 |                                     'individual ',
163 |                                     ct_to_title(ct),
164 |                                     ' (columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase ',
165 |                                     '(indicated by the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n'
166 |                                 )
167 |                             } else {
168 |                                 rmdlines <- c(
169 |                                     rmdlines,
170 |                                     '\n',
171 |                                     '### ', plot_title, '\n',
172 |                                     '\n',
173 |                                     '```{r fig.align = "center", fig.height = ', 120/25.4, ', fig.width = ', 180/25.4,
174 |                                     ', out.width = "70%", out.height = "70%"}\n',
175 |                                     'plots$scatter[["', ct, '"]]\n',
176 |                                     '```\n',
177 |                                     '\n',
178 |                                     'Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ',
179 |                                     ct_to_title(ct),
180 |                                     ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which ',
181 |                                     'are assigned as either G1/S, G2/M or intermediate.\n'
182 |                                 )
183 |                             }
184 |                         }
185 |                     }
186 |                 }
187 |                 out_con <- file(paste0('temp_', suff, '.Rmd'))
188 |                 writeLines(rmdlines, con = out_con, sep = '')
189 |                 close(out_con)
190 |                 rmarkdown::render(
191 |                     paste0('temp_', suff, '.Rmd'),
192 |                     output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell cycle.pdf')
193 |                 )
194 |                 file.remove(paste0('temp_', suff, '.Rmd'))
195 |                 for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '_', suff, '.rds'))
196 |                 
197 |             }
198 |             
199 |         }
200 |         
201 |     }
202 |     
203 | }
204 | 


--------------------------------------------------------------------------------
/study_plots_cna.R:
--------------------------------------------------------------------------------
  1 | # The command line arguments should supply study name and cancer type (in that order).
  2 | r = commandArgs(trailingOnly = TRUE)
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | library(data.table)
  9 | library(magrittr)
 10 | library(ggplot2)
 11 | library(RColorBrewer)
 12 | library(randomcoloR)
 13 | library(scales)
 14 | library(cowplot)
 15 | library(ggtext)
 16 | library(matkot)
 17 | 
 18 | source('functions.R')
 19 | 
 20 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
 27 | 
 28 | if('data_cna.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]))) {
 29 |     
 30 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cna.rds'))
 31 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 32 |     
 33 |     out <- lapply(which(!nullcond), function(i) {
 34 |         
 35 |         # To use in plot titles:
 36 |         if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
 37 |             if(paths_table[as.list(r), .N] > 1) {
 38 |                 title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
 39 |             } else {
 40 |                 title_tail <- r[1]
 41 |             }
 42 |         } else {
 43 |             if(paths_table[as.list(r), .N] > 1) {
 44 |                 title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
 45 |             } else {
 46 |                 title_tail <- paste0(r[1], ', ', r[2])
 47 |             }
 48 |         }
 49 |         
 50 |         cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = ''))
 51 |         cells$cell_name <- as.character(cells$cell_name)
 52 |         setkey(cells, cell_name)
 53 |         
 54 |         pdata <- plot_data[[i]]$cna_data[, .(gene, cell_name, cna, sample, chr)]
 55 |         pdata[, cell_type := do.call(`[`, list(cells, cell_name))$cell_type]
 56 |         pdata <- pdata[!is.na(cell_type)]
 57 |         chr_n <- pdata[, .(n = length(unique(gene))), keyby = chr][n > 20]
 58 |         pdata <- pdata[chr %in% chr_n$chr]
 59 |         pdata[, malignant := ifelse(cell_type == 'Malignant', 'm', 'nm')]
 60 |         
 61 |         # Sample cells so that we have no more than 2000, of which between 60 and 85% are malignant:
 62 |         grp_n <- pdata[, .(n = length(unique(cell_name))), keyby = .(malignant, sample)]
 63 |         grp_tot <- grp_n[, min(max(sum(malignant == 'm')/.N, 0.6), 0.85)]
 64 |         grp_tot <- 2000*c(m = grp_tot, nm = 1 - grp_tot)
 65 |         grp_tot_true <- grp_n[, .(N = sum(n)), keyby = malignant]$N
 66 |         if(any(grp_tot > grp_tot_true)) {wm <- which.min(grp_tot_true - grp_tot); grp_tot <- grp_tot*grp_tot_true[wm]/grp_tot[wm]}
 67 |         grp_n[, n_to_sample := { # Returns number of cells to take from each sample
 68 |             tot <- grp_tot[unique(malignant)] # Total allowed for this group
 69 |             baseline <- pmin(n, min(20, floor(tot/length(unique(sample))))) # Number of cells to keep (samples with too few cells are untouched)
 70 |             N <- n - baseline
 71 |             to_lose <- sum(n) - tot # Number of cells from each sample that we can lose
 72 |             N <- N - floor(to_lose*N/sum(N)) # Take number of cells from each sample proportional to the total number in that sample
 73 |             to_lose <- sum(N + baseline) - tot # Any left over due to the floor() function
 74 |             sub_places <- order(order(-N)) %in% 1:to_lose # Take these off evenly from the samples having the most cells
 75 |             N[sub_places] <- N[sub_places] - 1
 76 |             N + baseline
 77 |         }, by = malignant]
 78 |         set.seed(1012)
 79 |         pdata <- pdata[, .SD[cell_name %in% sample( # The actual sampling
 80 |             unique(cell_name),
 81 |             do.call(`[`, list(grp_n, .(unique(malignant), unique(sample))))$n_to_sample,
 82 |             replace = FALSE
 83 |         )], by = .(malignant, sample)]
 84 |         pdata[, gene := factor(gene, levels = unique(gene))]
 85 |         pdata[, cell_name := factor(cell_name, levels = unique(cell_name[order(cell_type != 'Malignant', sample)]))]
 86 |         pdata[, cell_num := as.numeric(cell_name)]
 87 |         
 88 |         # Manually choose chromosome labels:
 89 |         chr_lab <- c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '14', '16', '17', '18', '19', '20', '22', 'X')
 90 |         chr_lab <- unique(pdata[, .(gene, chr)])[,
 91 |             {inds <- chr_n[, floor(cumsum(n) - n/2)[chr %in% chr_lab]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])}
 92 |         ]
 93 |         # To choose them automatically using some threshold for minimum number of genes (100 in this case):
 94 |         # chr_lab <- unique(pdata[, .(gene, chr)])[,
 95 |         #     {inds <- chr_n[, floor(cumsum(n) - n/2)[n >= 100]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])}
 96 |         # ]
 97 |         
 98 |         cells_lab <- pdata[, .(n = length(unique(cell_num))), keyby = malignant][, floor(c(n[1]/2, n[1] + n[2]/2))]
 99 |         
100 |         set.seed(1358)
101 |         bar_smpl <- ggplot(pdata) +
102 |             geom_raster(aes(x = 'a', y = cell_num, fill = sample)) +
103 |             scale_fill_manual(name = 'Sample', values = pdata[, setNames(distinctColorPalette(length(unique(sample))), unique(sample))]) +
104 |             scale_x_discrete(expand = c(0, 0)) +
105 |             scale_y_continuous(expand = c(0, 0)) +
106 |             theme(
107 |                 axis.text = element_blank(),
108 |                 axis.title = element_blank(),
109 |                 axis.ticks = element_blank(),
110 |                 axis.ticks.length = unit(0, 'pt'),
111 |                 legend.position = 'left',
112 |                 legend.title = element_text(size = 14),
113 |                 legend.text = element_text(size = 12),
114 |                 legend.justification = c(1, 0.5),
115 |                 panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5),
116 |                 plot.margin = unit(c(5.5, 2, 5.5, 5.5), 'pt')
117 |             ) +
118 |             guides(fill = guide_legend(keywidth = unit(10, 'pt'), keyheight = unit(15, 'pt')))
119 |         
120 |         htmp <- ggplot(pdata) +
121 |             geom_raster(aes(x = gene, y = cell_num, fill = cna)) +
122 |             geom_vline(aes(xintercept = xi), data = data.table(xi = chr_n[1:(.N - 1), cumsum(n) + 0.5]), linewidth = 0.3) +
123 |             geom_hline(aes(yintercept = yi), data = data.table(yi = pdata[cell_type == 'Malignant', length(unique(cell_name)) + 0.5]),
124 |                 linewidth = 0.5) +
125 |             scale_fill_gradientn(colours = rev(brewer.pal(11, 'RdBu')), limits = c(-1, 1), breaks = c(-1, 0, 1), oob = squish) +
126 |             scale_x_discrete(expand = c(0, 0), breaks = names(chr_lab), labels = chr_lab) +
127 |             scale_y_continuous(expand = c(0, 0), sec.axis = dup_axis(breaks = cells_lab, labels = c('Malignant', 'TME'))) +
128 |             theme(
129 |                 axis.text.x = element_text(size = 12),
130 |                 axis.text.y.left = element_blank(),
131 |                 axis.text.y.right = element_text(angle = -90, hjust = 0.5, size = 14),
132 |                 axis.ticks = element_blank(),
133 |                 axis.ticks.length = unit(0, 'pt'),
134 |                 axis.title.x = element_text(size = 14),
135 |                 axis.title.y.left = element_blank(),
136 |                 axis.title.y.right = element_text(size = 14),
137 |                 legend.title = element_text(size = 12, margin = margin(b = 6)),
138 |                 legend.text = element_text(size = 10),
139 |                 legend.justification = c(0.5, 0),
140 |                 panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5),
141 |                 plot.title = element_text(size = 14, face = 'bold'),
142 |                 plot.subtitle = element_text(size = 12),
143 |                 plot.margin = unit(c(5.5, 5.5, 5.5, 1), 'pt')
144 |             ) +
145 |             guides(fill = guide_colourbar(barwidth = unit(15, 'pt'), barheight = unit(70, 'pt'), ticks.colour = 'black', frame.colour = 'black')) +
146 |             labs(
147 |                 x = 'Chromosome',
148 |                 y = 'Cells',
149 |                 fill = 'Inferred CNA\n(log2 ratio)',
150 |                 title = paste('Inferred CNAs in', title_tail),
151 |                 subtitle = paste(
152 |                     'Reference cells:',
153 |                     paste(plot_data[[i]]$ref_cells[, .N, by = cell_type][order(-N), gsub('_', ' ', cell_type)], collapse = ', ')
154 |                 )
155 |             )
156 |         
157 |         return(list(heatmap = htmp, bar = bar_smpl))
158 |         
159 |     })
160 |     
161 |     if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) caption <- r[1] else caption <- paste0(r[1], ' (', r[2], ')')
162 |     if(length(out) == 1) {
163 |         caption <- paste0('Heatmap of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position',
164 |             ' for a representative subset of cells in ', caption, ', with colour bar (left) showing the corresponding samples.')
165 |     } else {
166 |         caption <- paste0('Heatmaps of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position',
167 |             ' for representative subsets of cells in ', caption, ', with colour bars (left) showing the corresponding samples.')
168 |     }
169 |     caption <- ggplot() +
170 |         theme(panel.background = element_rect(fill = NA), plot.caption.position = 'plot',
171 |             plot.caption = element_textbox_simple(hjust = 0, size = 12)) +
172 |         labs(caption = caption)
173 |     
174 |     leg <- lapply(out, function(x) get_legend(x$bar))
175 |     leg_width <- max(sapply(leg, function(x) as.numeric(x$widths[3])))
176 |     w <- c(leg_width + 0.4, 0.7, 26.7)
177 |     
178 |     # Open the PNG before using plot_grid(), because this function calls ggdraw(), which opens a graphic device, leading to X11 error on the server.
179 |     png(
180 |         paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/CNAs.png'),
181 |         width = sum(w), height = length(out)*18 + (length(out) - 1)*4 + 4, units = 'cm', res = 300
182 |     )
183 |     bar_grob <- lapply(out, function(x) ggplotGrob(x$bar + theme(legend.position = 'none')))
184 |     htmp_grob <- lapply(out, function(x) ggplotGrob(x$heatmap))
185 |     for(j in 1:length(bar_grob)) bar_grob[[j]]$heights[c(1, 7, 12)] <- unit(c(1.7, 14.9, 1.4), 'cm')
186 |     for(j in 1:length(htmp_grob)) htmp_grob[[j]]$heights <- unit(c(0.2, 0, 0.8, 0.7, 0, 0, 14.9, 0.5, 0.7, 0, 0, 0.2), 'cm')
187 |     plist <- lapply(1:length(out), function(j) {
188 |         if(j == 1) {
189 |             plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w)
190 |         } else plot_grid(
191 |             ggplot() + theme_void(),
192 |             plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w),
193 |             nrow = 2, ncol = 1, rel_heights = c(4, 18)
194 |         )
195 |     })
196 |     if(length(plist) == 1) {
197 |         print(plot_grid(plist[[1]], caption, nrow = 2, ncol = 1, rel_heights = c(18, 4)))
198 |     } else {
199 |         print(plot_grid(plotlist = c(plist, list(caption)), nrow = length(plist) + 1, ncol = 1, rel_heights = c(18, rep(22, length(plist) - 1), 4)))
200 |     }
201 |     dev.off()
202 |     
203 | }
204 | 


--------------------------------------------------------------------------------
/study_plots_ct_comp.R:
--------------------------------------------------------------------------------
  1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_comp.o -eo log/study_plots_ct_comp.e Rscript study_plots_ct_comp.R
  2 | 
  3 | library(data.table)
  4 | library(ggplot2)
  5 | library(magrittr)
  6 | library(Matrix)
  7 | library(stringr)
  8 | library(plyr)
  9 | library(cowplot)
 10 | library(RColorBrewer)
 11 | library(scales)
 12 | library(matkot)
 13 | 
 14 | try(library(randomcoloR), silent = TRUE)
 15 | 
 16 | source('functions.R')
 17 | 
 18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
 25 |     
 26 |     cat(r, '\n')
 27 |     
 28 |     if(all(c('data_ct_comp.RDS', 'Cell types.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
 29 |         
 30 |         plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_comp.RDS'))
 31 |         
 32 |         nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 33 |         if(all(nullcond)) next
 34 |         
 35 |         out <- lapply(which(!nullcond), function(i) {
 36 |             
 37 |             # To use in plot titles:
 38 |             if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
 39 |                 if(paths_table[as.list(r), .N] > 1) {
 40 |                     title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
 41 |                 } else {
 42 |                     title_tail <- r[1]
 43 |                 }
 44 |             } else {
 45 |                 if(paths_table[as.list(r), .N] > 1) {
 46 |                     title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
 47 |                         group_name])
 48 |                 } else {
 49 |                     title_tail <- paste0(r[1], ', ', r[2])
 50 |                 }
 51 |             }
 52 |             
 53 |             if(!is.null(plot_data[[i]]$data)) {
 54 |                 
 55 |                 pdata <- plot_data[[i]]$data
 56 |                 
 57 |                 marker_cond <- 'marker_plot_data' %in% names(pdata) &&
 58 |                     pdata$marker_plot_data[, length(unique(gene)) > 1 & length(unique(cell_type)) > 1]
 59 |                 if('marker_plot_data' %in% names(pdata) && pdata$marker_plot_data[, length(unique(gene)) < 10]) {
 60 |                     leg_arr <- 'horizontal'
 61 |                 } else {leg_arr <- 'vertical'}
 62 |                 
 63 |                 if('randomcoloR' %in% .packages(TRUE)) {
 64 |                     set.seed(9728)
 65 |                     plot_colours <- pdata$pie_data[, setNames(distinctColorPalette(.N), cell_type)]
 66 |                     plot_out <- do.call(
 67 |                         ct_comp_plot,
 68 |                         args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(colours = plot_colours, legends_arrange = leg_arr))
 69 |                     )
 70 |                 } else {
 71 |                     plot_out <- do.call(
 72 |                         ct_comp_plot,
 73 |                         args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(legends_arrange = leg_arr))
 74 |                     )
 75 |                 }
 76 |                 
 77 |                 if(marker_cond) {
 78 |                     plot_title <- paste0('Cell type composition and marker gene expression in ', title_tail)
 79 |                 } else {plot_title <- paste0('Cell type composition in ', title_tail)}
 80 |                 
 81 |                 return(list(plot_out = plot_out, plot_title = plot_title, leg_arr = leg_arr))
 82 |                 
 83 |             }
 84 |             
 85 |         })
 86 |         
 87 |         # A4 page is 210x297mm.  Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels.
 88 |         rmdlines <- c(
 89 |             '---\n',
 90 |             'title: ""\n',
 91 |             'header-includes:\n',
 92 |             ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
 93 |             ' \\pagenumbering{gobble}\n',
 94 |             'geometry: margin=1cm\n',
 95 |             'output: pdf_document\n',
 96 |             'papersize: a4\n',
 97 |             '---\n',
 98 |             '\n',
 99 |             '```{r setup, include = FALSE}\n',
100 |             'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
101 |             '```\n'
102 |         )
103 |         for(i in 1:length(out)) {
104 |             if('marker_plot' %in% names(out[[i]]$plot_out)) {
105 |                 margin_x <- (ifelse(out[[i]]$leg_arr == 'vertical', 150, 125) - # Take 25mm off margin if legends are arranged horizontally
106 |                         10*length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$cell_type)))/2
107 |                 n_genes <- length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$gene))
108 |                 h <- (log2(24/n_genes) + 6)*n_genes + 30
109 |                 # The following top-aligns legends in case of too few genes, else the tops of the legends get cut off.  It also depends on the length
110 |                 # of the x labels, as long labels squash the plot.  4 seems OK, but 5 is safer in case there's a longer label than "Oligodendrocyte".
111 |                 if(n_genes <= 5) out[[i]]$plot_out$marker_plot <- out[[i]]$plot_out$marker_plot + theme(legend.justification = 'top')
112 |             }
113 |             saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '.rds'))
114 |             if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
115 |             if('marker_plot' %in% names(out[[i]]$plot_out)) {
116 |                 rmdlines <- c(
117 |                     rmdlines,
118 |                     '\n',
119 |                     '### ', out[[i]]$plot_title, '\n',
120 |                     '\n',
121 |                     '```{r}\n',
122 |                     'plots <- readRDS("comp_plots_', i, '.rds")\n',
123 |                     '```\n',
124 |                     '\n',
125 |                     '### **A**\n',
126 |                     '\n',
127 |                     '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
128 |                     'plots$pie\n',
129 |                     '```\n',
130 |                     '\n',
131 |                     '### **B**\n',
132 |                     '\n',
133 |                     '```{r fig.align = "center", fig.height = ', h/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
134 |                     'plots$marker_plot + theme(plot.margin = unit(c(0, ', margin_x, ', 0, ', margin_x, '), "mm"))\n',
135 |                     '```\n',
136 |                     '\n',
137 |                     '\\vspace*{\\fill}\n', # Aligns text with bottom of page
138 |                     '\n',
139 |                     '**A.** Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance. **B.** Plot ',
140 |                     'showing the average log2(TPM/10) expression levels of marker genes in each cell type and, for each gene, the percentage of ',
141 |                     'cells of each type in which that gene was detected. Points are not shown for combinations where the percentage of expressing ',
142 |                     'cells is below 50%. Only non-malignant cell types with at least 10 cells are shown, and these are ordered as in A.\n'
143 |                 )
144 |             } else {
145 |                 rmdlines <- c(
146 |                     rmdlines,
147 |                     '\n',
148 |                     '### ', out[[i]]$plot_title, '\n',
149 |                     '\n',
150 |                     '```{r}\n',
151 |                     'plots <- readRDS("comp_plots_', i, '.rds")\n',
152 |                     '```\n',
153 |                     '\n',
154 |                     '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
155 |                     'plots$pie\n',
156 |                     '```\n',
157 |                     '\n',
158 |                     'Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance.\n'
159 |                 )
160 |             }
161 |         }
162 |         out_con <- file('temp.Rmd')
163 |         writeLines(rmdlines, con = out_con, sep = '')
164 |         close(out_con)
165 |         rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell types.pdf'))
166 |         file.remove('temp.Rmd')
167 |         for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '.rds'))
168 |         
169 |     }
170 |     
171 | }
172 | 


--------------------------------------------------------------------------------
/study_plots_ct_umap.R:
--------------------------------------------------------------------------------
  1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_umap.o -eo log/study_plots_ct_umap.e Rscript study_plots_ct_umap.R
  2 | 
  3 | library(data.table)
  4 | library(ggplot2)
  5 | library(magrittr)
  6 | library(Matrix)
  7 | library(stringr)
  8 | library(plyr)
  9 | library(cowplot)
 10 | library(RColorBrewer)
 11 | library(scales)
 12 | library(matkot)
 13 | 
 14 | try(library(randomcoloR), silent = TRUE)
 15 | 
 16 | source('functions.R')
 17 | 
 18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
 25 |     
 26 |     cat(r, '\n')
 27 |     
 28 |     if(all(c('data_ct_umap.RDS', 'UMAP.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
 29 |         
 30 |         plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_umap.RDS'))
 31 |         
 32 |         nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, is.null(x$data)))
 33 |         if(all(nullcond)) next
 34 |         
 35 |         if('randomcoloR' %in% .packages(TRUE)) {
 36 |             set.seed(9728)
 37 |             plot_colours <- lapply(
 38 |                 which(!nullcond),
 39 |                 function(i) {
 40 |                     pdata <- plot_data[[i]]$data
 41 |                     slapply(c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)], function(vn) unique(pdata[[vn]]))
 42 |                 }
 43 |             )
 44 |             plot_colours <- slapply(
 45 |                 c('cell_type', 'sample'),
 46 |                 function(vn) {
 47 |                     vn_unique <- unique(unlist(lapply(plot_colours, function(x) if(vn %in% names(x)) x[[vn]])))
 48 |                     if(!is.null(vn_unique)) setNames(distinctColorPalette(length(vn_unique)), vn_unique)
 49 |                 }
 50 |             )
 51 |         }
 52 |         
 53 |         out <- lapply(which(!nullcond), function(i) {
 54 |             
 55 |             pdata <- plot_data[[i]]$data
 56 |             if(!any(c('cell_type', 'sample') %in% names(pdata))) return(NULL)
 57 |             
 58 |             # To use in plot titles:
 59 |             if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
 60 |                 if(paths_table[as.list(r), .N] > 1) {
 61 |                     title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
 62 |                 } else {
 63 |                     title_tail <- r[1]
 64 |                 }
 65 |             } else {
 66 |                 if(paths_table[as.list(r), .N] > 1) {
 67 |                     title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
 68 |                         group_name])
 69 |                 } else {
 70 |                     title_tail <- paste0(r[1], ', ', r[2])
 71 |                 }
 72 |             }
 73 |             
 74 |             if('cell_type' %in% names(pdata)) {
 75 |                 if('Malignant' %in% pdata$cell_type) {
 76 |                     pdata[, cell_type := factor(cell_type, levels = c('Malignant', sort(unique(cell_type[cell_type != 'Malignant']))))]
 77 |                 } else {
 78 |                     pdata[, cell_type := factor(cell_type, levels = sort(unique(cell_type)))]
 79 |                 }
 80 |             }
 81 |             if('sample' %in% names(pdata)) {
 82 |                 if(!any(is.na(as.numeric(unique(pdata$sample))))) {
 83 |                     pdata[, sample := factor(as.character(sample), levels = sort(as.numeric(unique(sample))))]
 84 |                 } else pdata[, sample := factor(as.character(sample), levels = sort(as.character(unique(sample))))]
 85 |             }
 86 |             if('randomcoloR' %in% .packages(TRUE)) c(list(
 87 |                 title_tail = title_tail,
 88 |                 plots = slapply(
 89 |                     c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)],
 90 |                     function(vn) ct_umap_plot(pdata, vn, colours = plot_colours[[vn]][levels(pdata[[vn]])],
 91 |                         legend_title = gsub('_', ' ', str_to_title(vn)))
 92 |                 )
 93 |             )) else c(list(
 94 |                 title_tail = title_tail,
 95 |                 plots = slapply(
 96 |                     c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)],
 97 |                     function(vn) ct_umap_plot(pdata, vn, legend_title = gsub('_', ' ', str_to_title(vn)))
 98 |                 )
 99 |             ))
100 |             
101 |         })
102 |         
103 |         if(all(sapply(out, is.null))) next
104 |         
105 |         # A4 page is 210x297mm.  Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels.
106 |         rmdlines <- c(
107 |             '---\n',
108 |             'title: ""\n',
109 |             'header-includes:\n',
110 |             ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
111 |             ' \\pagenumbering{gobble}\n',
112 |             'geometry: margin=1cm\n',
113 |             'output: pdf_document\n',
114 |             'papersize: a4\n',
115 |             '---\n',
116 |             '\n',
117 |             '```{r setup, include = FALSE}\n',
118 |             'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
119 |             '```\n'
120 |         )
121 |         for(i in 1:length(out)) {
122 |             saveRDS(out[[i]]$plots, paste0('umap_plots_', i, '.rds'))
123 |             if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
124 |             if(length(out[[i]]$plots) == 2) {
125 |                 rmdlines <- c(
126 |                     rmdlines,
127 |                     '\n',
128 |                     '### UMAP in ', out[[i]]$title_tail, '\n',
129 |                     '\n',
130 |                     '```{r}\n',
131 |                     'plots <- readRDS("umap_plots_', i, '.rds")\n',
132 |                     '```\n',
133 |                     '\n',
134 |                     '### **A**\n',
135 |                     '\n',
136 |                     '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
137 |                     'plots$cell_type\n',
138 |                     '```\n',
139 |                     '\n',
140 |                     '### **B**\n',
141 |                     '\n',
142 |                     '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
143 |                     'plots$sample\n',
144 |                     '```\n',
145 |                     '\n',
146 |                     '\\vspace*{\\fill}\n', # Aligns text with bottom of page
147 |                     '\n',
148 |                     '**A.** UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by cell type. **B.** ',
149 |                     'UMAP plot as in A, with cells colored by sample.\n'
150 |                 )
151 |             } else {
152 |                 rmdlines <- c(
153 |                     rmdlines,
154 |                     '\n',
155 |                     '### UMAP in ', out[[i]]$title_tail, '\n',
156 |                     '\n',
157 |                     '```{r}\n',
158 |                     'plots <- readRDS("umap_plots_', i, '.rds")\n',
159 |                     '```\n',
160 |                     '\n',
161 |                     '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
162 |                     'plots[[1]]\n',
163 |                     '```\n',
164 |                     '\n',
165 |                     'UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by ',
166 |                     gsub('_', ' ', names(out[[i]]$plots)), '.\n'
167 |                 )
168 |             }
169 |         }
170 |         out_con <- file('temp.Rmd')
171 |         writeLines(rmdlines, con = out_con, sep = '')
172 |         close(out_con)
173 |         rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/UMAP.pdf'))
174 |         file.remove('temp.Rmd')
175 |         for(i in 1:length(out)) file.remove(paste0('umap_plots_', i, '.rds'))
176 |         
177 |     }
178 |     
179 | }
180 | 


--------------------------------------------------------------------------------
/study_plots_data_cc_thresh.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(matkot)
  4 | 
  5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
  6 | 
  7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
  8 |     if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
  9 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
 10 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 11 |     if(all(nullcond)) return(NULL)
 12 |     lapply(which(!nullcond), function(i) {
 13 |         if('cell_type' %in% names(plot_data[[i]]$ccdata)) {
 14 |             plot_data[[i]]$ccdata[,
 15 |                 .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]),
 16 |                 by = cell_type
 17 |             ]
 18 |         } else {
 19 |             plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])]
 20 |         }
 21 |     }) %>% rbindlist
 22 | }) %>% rbindlist
 23 | 
 24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7)
 25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.65)
 26 | 
 27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
 28 |     cat(r, '\n')
 29 |     if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next
 30 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
 31 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 32 |     if(all(nullcond)) next
 33 |     for(i in which(!nullcond)) {
 34 |         
 35 |         ccdata <- copy(plot_data[[i]]$ccdata)
 36 |         ccdata[, phase := 'Cycling']
 37 |         ccdata[g1s_score < 0.7 & g2m_score < 0.65, phase := 'Not cycling']
 38 |         
 39 |         # Define phases and intermediates by fold change threshold:
 40 |         ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate']
 41 |         ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate']
 42 |         ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name]
 43 |         
 44 |         # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test()
 45 |         
 46 |         setkey(ccdata, cell_name)
 47 |         plot_data[[i]]$ccdata <- ccdata
 48 |         
 49 |         hdata <- copy(plot_data[[i]]$hdata)
 50 |         if('data.table' %in% class(hdata)) {
 51 |             setkey(hdata, cell_name)
 52 |             hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase]
 53 |             annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase')
 54 |             hdata <- slapply(
 55 |                 switch(
 56 |                     ('cell_type' %in% names(ccdata)) + 1,
 57 |                     list(ccdata$cell_name),
 58 |                     slapply(
 59 |                         ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type],
 60 |                         function(ct) ccdata[cell_type == ct, cell_name]
 61 |                     )
 62 |                 ),
 63 |                 function(idvec) {
 64 |                     # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data:
 65 |                     cell_ids <- hdata[
 66 |                         idvec,
 67 |                         .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))),
 68 |                         by = .(p = phase)
 69 |                     ]$id
 70 |                     out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling']
 71 |                     # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division
 72 |                     # lines in the heatmap:
 73 |                     setkey(out_data, phase)
 74 |                     const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase]
 75 |                     const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))]
 76 |                     # Get ordered cell ID list:
 77 |                     cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][
 78 |                         !is.na(cell_name),
 79 |                         .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]),
 80 |                         by = phase
 81 |                     ][, cell_name[cell_order]]
 82 |                     # Melt heatmap data and order cells:
 83 |                     out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level')
 84 |                     out_data[, cell_name := factor(cell_name, levels = cell_ids)]
 85 |                     # Order the genes:
 86 |                     setkey(out_data, phase)
 87 |                     if(all(c('G1/S', 'G2/M') %in% out_data$phase)) {
 88 |                         out_data[, gene := factor(gene, levels = out_data[
 89 |                             c('G1/S', 'G2/M'),
 90 |                             .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene],
 91 |                             by = phase
 92 |                         ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)]
 93 |                     } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present
 94 |                         out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][,
 95 |                             Reduce(c, lapply(
 96 |                                 c('G1/S', 'G2/M'),
 97 |                                 function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene]
 98 |                             ))
 99 |                         ])]
100 |                     }
101 |                     return(list(data = out_data, vline = const))
102 |                 }
103 |             )
104 |             plot_data[[i]]$hdata <- hdata
105 |         } else plot_data[[i]]$hdata <- NULL
106 |         
107 |     }
108 |     saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
109 | }
110 | 


--------------------------------------------------------------------------------
/study_plots_data_cc_thresh_consensus.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(magrittr)
  3 | library(matkot)
  4 | 
  5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
  6 | 
  7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
  8 |     if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
  9 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
 10 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 11 |     if(all(nullcond)) return(NULL)
 12 |     lapply(which(!nullcond), function(i) {
 13 |         if('cell_type' %in% names(plot_data[[i]]$ccdata)) {
 14 |             plot_data[[i]]$ccdata[,
 15 |                 .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]),
 16 |                 by = cell_type
 17 |             ]
 18 |         } else {
 19 |             plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])]
 20 |         }
 21 |     }) %>% rbindlist
 22 | }) %>% rbindlist
 23 | 
 24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7)
 25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.7)
 26 | 
 27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
 28 |     cat(r, '\n')
 29 |     if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next
 30 |     plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
 31 |     nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
 32 |     if(all(nullcond)) next
 33 |     for(i in which(!nullcond)) {
 34 |         
 35 |         ccdata <- copy(plot_data[[i]]$ccdata)
 36 |         ccdata[, phase := 'Cycling']
 37 |         ccdata[g1s_score < 0.7 & g2m_score < 0.7, phase := 'Not cycling']
 38 |         
 39 |         # Define phases and intermediates by fold change threshold:
 40 |         ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate']
 41 |         ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate']
 42 |         ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name]
 43 |         
 44 |         # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test()
 45 |         
 46 |         setkey(ccdata, cell_name)
 47 |         plot_data[[i]]$ccdata <- ccdata
 48 |         
 49 |         hdata <- copy(plot_data[[i]]$hdata)
 50 |         if('data.table' %in% class(hdata)) {
 51 |             setkey(hdata, cell_name)
 52 |             hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase]
 53 |             annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase')
 54 |             hdata <- slapply(
 55 |                 switch(
 56 |                     ('cell_type' %in% names(ccdata)) + 1,
 57 |                     list(ccdata$cell_name),
 58 |                     slapply(
 59 |                         ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type],
 60 |                         function(ct) ccdata[cell_type == ct, cell_name]
 61 |                     )
 62 |                 ),
 63 |                 function(idvec) {
 64 |                     # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data:
 65 |                     cell_ids <- hdata[
 66 |                         idvec,
 67 |                         .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))),
 68 |                         by = .(p = phase)
 69 |                     ]$id
 70 |                     out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling']
 71 |                     # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division lines in the
 72 |                     # heatmap:
 73 |                     setkey(out_data, phase)
 74 |                     const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase]
 75 |                     const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))]
 76 |                     # Get ordered cell ID list:
 77 |                     cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][
 78 |                         !is.na(cell_name),
 79 |                         .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]),
 80 |                         by = phase
 81 |                     ][, cell_name[cell_order]]
 82 |                     # Melt heatmap data and order cells:
 83 |                     out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level')
 84 |                     out_data[, cell_name := factor(cell_name, levels = cell_ids)]
 85 |                     # Order the genes:
 86 |                     setkey(out_data, phase)
 87 |                     if(all(c('G1/S', 'G2/M') %in% out_data$phase)) {
 88 |                         out_data[, gene := factor(gene, levels = out_data[
 89 |                             c('G1/S', 'G2/M'),
 90 |                             .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene],
 91 |                             by = phase
 92 |                         ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)]
 93 |                     } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present
 94 |                         out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][,
 95 |                             Reduce(c, lapply(
 96 |                                 c('G1/S', 'G2/M'),
 97 |                                 function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene]
 98 |                             ))
 99 |                         ])]
100 |                     }
101 |                     return(list(data = out_data, vline = const))
102 |                 }
103 |             )
104 |             plot_data[[i]]$hdata <- hdata
105 |         } else plot_data[[i]]$hdata <- NULL
106 |         
107 |     }
108 |     saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
109 | }
110 | 


--------------------------------------------------------------------------------
/study_plots_data_cna.R:
--------------------------------------------------------------------------------
  1 | # The command line arguments should supply study name and cancer type (in that order).
  2 | r = commandArgs(trailingOnly = TRUE)
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | library(data.table)
  9 | library(magrittr)
 10 | library(Matrix)
 11 | library(caTools)
 12 | library(stringr)
 13 | library(plyr)
 14 | library(matkot)
 15 | 
 16 | source('functions.R')
 17 | 
 18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 19 | 
 20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'ensembl_gene_id')
 21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
 22 | alias_table <- make_alias_table(hgnc_complete_set)
 23 | 
 24 | gene_positions <- fread('../data/gene_positions.csv')
 25 | 
 26 | # This is how gene_positions was defined:
 27 | # library(biomaRt)
 28 | # ensembl_mart <- useMart('ensembl')
 29 | # ensembl_data <- useDataset('hsapiens_gene_ensembl', mart = ensembl_mart)
 30 | # gene_positions <- as.data.table(getBM(attributes = c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'), mart = ensembl_data))
 31 | # fwrite(gene_positions, '../data/gene_positions.csv')
 32 | 
 33 | gene_positions[, c('symbol', 'location') := do.call(`[`, list(hgnc_complete_set, ensembl_gene_id))[, .(symbol, location_sortable)]]
 34 | gene_positions <- gene_positions[!is.na(symbol) & chromosome_name %in% c(as.character(1:22), 'X', 'Y')]
 35 | gene_positions[, chromosome_name := mapvalues(chromosome_name, as.character(1:9), paste0(0, 1:9))]
 36 | setkey(gene_positions, symbol)
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
 43 | ct <- gsub('/', '-', r[2])
 44 | out <- lapply(paths, function(p) {
 45 |     
 46 |     if(!endsWith(p$expmat, 'mtx')) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
 47 |     
 48 |     cat('Preparing data\n')
 49 |     cells <- suppressWarnings(fread(p$cells, na.strings = ''))
 50 |     if(!('cell_type' %in% names(cells)) || length(unique(cells$cell_type)) == 1 || !('Malignant' %in% cells$cell_type)) {
 51 |         return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
 52 |     }
 53 |     cells$cell_name <- as.character(cells$cell_name) # In case cell names are the same as row numbers
 54 |     if('sample' %in% names(cells)) cells$sample <- as.character(cells$sample)
 55 |     if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
 56 |     genes <- fread(p$genes, header = FALSE)$V1
 57 |     expmat <- readMM(p$expmat)
 58 |     expmat <- expmat[genes != '', ]
 59 |     genes <- genes[genes != '']
 60 |     expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
 61 |     genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
 62 |     genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
 63 |     rownames(expmat) <- genes
 64 |     genes <- genes[genes %in% gene_positions$symbol]; expmat <- expmat[genes, ]
 65 |     colnames(expmat) <- cells$cell_name
 66 |     cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells
 67 |     if(nrow(cells) < 30) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
 68 |     expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
 69 |     
 70 |     # Get reference cells, prioritising Fibroblasts, then Endothelial, then Macrophage, then whatever is most abundant.
 71 |     cat('Defining reference\n')
 72 |     ref_cts_global <- cells[cell_type != 'Malignant', .(N = .N), keyby = cell_type][N >= 100]
 73 |     ref_cts_global <- rbind(
 74 |         ref_cts_global[c('Fibroblast', 'Endothelial', 'Macrophage')],
 75 |         ref_cts_global[!(cell_type %in% c('Fibroblast', 'Endothelial', 'Macrophage'))][order(-N)]
 76 |     )[!is.na(N), cell_type[complete.cases(.SD)][1:min(2, .N)]]
 77 |     if(length(ref_cts_global) < 2) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
 78 |     ref_cells <- cells[cell_type %in% ref_cts_global, .(cell_name = cell_name, cell_type = cell_type, ref_type = 'global')]
 79 |     ref_cells_smpl <- cells[
 80 |         cell_type != 'Malignant',
 81 |         {
 82 |             ref_cts_smpl <- .SD[, .(N = .N), keyby = cell_type][N >= 100]
 83 |             ref_cts_smpl <- ref_cts_smpl[c('Fibroblast', 'Endothelial', 'Macrophage'), cell_type[complete.cases(.SD)]]
 84 |             if(length(ref_cts_smpl) >= 2) .SD[cell_type %in% ref_cts_smpl, .(cell_name = cell_name, cell_type = cell_type)]
 85 |         },
 86 |         by = .(ref_type = sample)
 87 |     ]
 88 |     if(nrow(ref_cells_smpl) > 0) ref_cells <- rbind(ref_cells, ref_cells_smpl, use.names = TRUE)
 89 |     rm(ref_cells_smpl)
 90 | 
 91 |     cat('Computing CNA values\n')
 92 |     rms <- rowMeans(expmat)
 93 |     genes_top <- head(names(rms)[order(-rms)], 5000)
 94 |     expmat <- expmat[genes_top, ]; rms <- rms[genes_top]
 95 |     expmat <- expmat - rms # Centre rows/genes
 96 |     expmat[expmat > 3] <- 3; expmat[expmat < -3] <- -3 # Restrict range to limit influence of extreme values
 97 |     expdt <- as.data.table(as.matrix(expmat), keep.rownames = 'gene')
 98 |     expdt <- melt(expdt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value', variable.factor = FALSE)
 99 |     expdt[, c('chr', 'start_pos') := gene_positions[gene, .(chromosome_name, start_position)]]
100 |     expdt <- expdt[order(chr, start_pos)]
101 |     expdt[, cna := log2(runmean(2^value, k = 100)), by = .(chr, cell_name)] # Computing the CNA values
102 |     expdt[, cna := cna - median(cna), keyby = cell_name]
103 |     ref_range <- ref_cells[, do.call(`[`, list(expdt, cell_name))[, .(mean_cna = mean(cna)), by = gene], by = .(ref_type, cell_type)]
104 |     ref_range <- ref_range[, .(mn = min(mean_cna), mx = max(mean_cna)), keyby = .(ref_type, gene)]
105 |     setkey(cells, cell_name)
106 |     expdt[, sample := do.call(`[`, list(cells, cell_name))[, as.character(sample)]]
107 | 
108 |     cat('Correcting CNA values w.r.t. reference\n')
109 |     expdt[, # Correcting the CNA values w.r.t. reference cells
110 |         cna := {
111 |             rt <- unique(sample)
112 |             if(!(rt %in% ref_range$ref_type)) rt <- 'global'
113 |             out <- .SD[, .(cna, do.call(`[`, list(ref_range, .(rt, gene)))[, .(mn, mx)])]
114 |             out[, ifelse(cna > mx + 0.1, cna - mx - 0.1, ifelse(cna < mn - 0.1, cna - mn + 0.1, 0))]
115 |         },
116 |         by = sample
117 |     ]
118 |     
119 |     cat('Done!\n\n')
120 |     return(list(cna_data = expdt, ref_cells = ref_cells, ref_range = ref_range, path = p))
121 |     
122 | })
123 | 
124 | if(!all(sapply(out, function(x) is.null(x$cna_data)))) {
125 | 
126 |     ct <- gsub('/', '-', r[2])
127 |     if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
128 |     if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
129 | 
130 |     saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_cna.rds'))
131 | 
132 | }
133 | 


--------------------------------------------------------------------------------
/study_plots_data_ct_comp.R:
--------------------------------------------------------------------------------
 1 | # The command line arguments should supply study name and cancer type (in that order).
 2 | r = commandArgs(trailingOnly = TRUE)
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | library(data.table)
 9 | library(ggplot2)
10 | library(magrittr)
11 | library(Matrix)
12 | library(stringr)
13 | library(plyr)
14 | library(matkot)
15 | 
16 | source('functions.R')
17 | 
18 | canonical_markers <- fread('../data/canonical_markers.csv')
19 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
20 | 
21 | 
22 | 
23 | 
24 | 
25 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
26 | 
27 | out <- lapply(
28 |     paths,
29 |     function(p) {
30 |         
31 |         if(endsWith(p$expmat, 'mtx')) {
32 |             
33 |             cells <- suppressWarnings(fread(p$cells, na.strings = ''))
34 |             
35 |             cells$cell_name <- as.character(cells$cell_name)
36 |             
37 |             if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
38 |             
39 |             genes <- fread(p$genes, header = FALSE)$V1
40 |             expmat <- readMM(p$expmat)
41 |             rownames(expmat) <- genes
42 |             colnames(expmat) <- cells$cell_name
43 |             
44 |             # Remove low-complexity cells and normalise to log TPM/10:
45 |             cells <- cells[col_nnz(expmat) >= 1000]
46 |             expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
47 |             
48 |             ct_comp_data_out <- ct_comp_data(expmat, cells, markers = canonical_markers)
49 |             
50 |             return(list(data = ct_comp_data_out, path = p))
51 |             
52 |         } else return(list(data = NULL, path = p))
53 |         
54 |     }
55 | )
56 | 
57 | if(!all(sapply(out, function(x) is.null(x$data)))) {
58 |     
59 |     ct <- gsub('/', '-', r[2])
60 |     if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
61 |     if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
62 |     
63 |     saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_comp.RDS'))
64 |     
65 | }
66 | 


--------------------------------------------------------------------------------
/study_plots_data_ct_umap.R:
--------------------------------------------------------------------------------
 1 | # The command line arguments should supply study name and cancer type (in that order).
 2 | r = commandArgs(trailingOnly = TRUE)
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | library(data.table)
 9 | library(ggplot2)
10 | library(magrittr)
11 | library(Matrix)
12 | library(stringr)
13 | library(plyr)
14 | library(irlba)
15 | library(uwot)
16 | library(matkot)
17 | 
18 | source('functions.R')
19 | 
20 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
21 | 
22 | 
23 | 
24 | 
25 | 
26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
27 | 
28 | out <- lapply(paths, function(p) {
29 |     
30 |     if(endsWith(p$expmat, 'mtx')) {
31 |         
32 |         cells <- suppressWarnings(fread(p$cells, na.strings = ''))
33 |         
34 |         cells$cell_name <- as.character(cells$cell_name)
35 |         
36 |         if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
37 |         
38 |         genes <- fread(p$genes, header = FALSE)$V1
39 |         expmat <- readMM(p$expmat)
40 |         rownames(expmat) <- genes
41 |         colnames(expmat) <- cells$cell_name
42 |         
43 |         # Remove low-complexity cells and normalise to log TPM/10:
44 |         cells <- cells[complexity >= 1000]
45 |         expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
46 |         
47 |         # Filtered gene list, after removing lowly expressed genes:
48 |         if('sample' %in% names(cells)) {
49 |             filtered_genes <- cells[
50 |                 sample %in% cells[, .(N = .N), by = sample][N >= 10, sample],
51 |                 .(gene = genes[rowMeans(log_transform(expmat[, cell_name], reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06]),
52 |                 by = sample
53 |             ]$gene %>% unique
54 |         } else {
55 |             filtered_genes <- genes[rowMeans(log_transform(expmat, reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06]
56 |         }
57 |         
58 |         ct_umap_data_out <- ct_umap_data(expmat[filtered_genes, ], cells)
59 |         
60 |         return(list(data = ct_umap_data_out, path = p))
61 |         
62 |     } else return(NULL)
63 |     
64 | })
65 | 
66 | if(!all(sapply(out, function(x) is.null(x$data)))) {
67 |     
68 |     ct <- gsub('/', '-', r[2])
69 |     if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
70 |     if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
71 |     
72 |     saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_umap.RDS'))
73 |     
74 | }
75 | 


--------------------------------------------------------------------------------
/study_plots_data_dist.R:
--------------------------------------------------------------------------------
  1 | r = commandArgs(trailingOnly = TRUE)
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | library(data.table)
  8 | library(magrittr)
  9 | library(readxl)
 10 | library(plyr)
 11 | library(stringr)
 12 | library(Matrix)
 13 | library(matkot)
 14 | 
 15 | source('functions.R')
 16 | 
 17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
 18 | 
 19 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
 20 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
 21 | alias_table <- make_alias_table(hgnc_complete_set)
 22 | 
 23 | meta_programs <- lapply(
 24 |     c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'),
 25 |     function(ct) {
 26 |         dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct))
 27 |         dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)]
 28 |         return(dt)
 29 |     }
 30 | )
 31 | 
 32 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category:
 33 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)])
 34 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)')
 35 | names(meta_programs[[2]])[3] <- 'Cell Cycle'
 36 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle')
 37 | names(meta_programs[[4]])[5] <- 'Cell Cycle'
 38 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism')
 39 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response')
 40 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]]))
 41 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle')
 42 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]]))
 43 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock')
 44 | 
 45 | # Merge CD4 and CD8 T cells:
 46 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]])
 47 | meta_programs <- meta_programs[1:7]
 48 | 
 49 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
 50 | names(meta_programs) <- cts
 51 | 
 52 | 
 53 | 
 54 | 
 55 | 
 56 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
 57 | 
 58 | out <- lapply(paths, function(p) {
 59 |     
 60 |     if(!endsWith(p$expmat, 'mtx')) return(NULL)
 61 |     
 62 |     cat('Preparing data\n')
 63 |     cells <- suppressWarnings(fread(p$cells, na.strings = ''))
 64 |     if(!all(c('cell_type', 'sample') %in% names(cells))) return(NULL)
 65 |     cells$cell_name <- as.character(cells$cell_name)
 66 |     cells$sample <- as.character(cells$sample)
 67 |     if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
 68 |     genes <- fread(p$genes, header = FALSE)$V1
 69 |     expmat <- readMM(p$expmat)
 70 |     expmat <- expmat[genes != '', ]
 71 |     genes <- genes[genes != '']
 72 |     expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
 73 |     genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
 74 |     genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
 75 |     rownames(expmat) <- genes
 76 |     colnames(expmat) <- cells$cell_name
 77 |     cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells
 78 |     if(nrow(cells) < 30) return(NULL)
 79 |     expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
 80 |     
 81 |     cat('Assigning MP to each cell\n')
 82 |     set.seed(4387)
 83 |     scores <- cells[
 84 |         cell_type %in% cts,
 85 |         if(.N >= 100) c( # Cut-off of 100 cells of this cell type in this sample
 86 |             .(cell_name = cell_name),
 87 |             slapply(
 88 |                 meta_programs[[unique(cell_type)]],
 89 |                 function(mp) {
 90 |                     if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp)
 91 |                     if(sum(mp %in% rownames(expmat)) >= 30) sig_score(expmat[, cell_name], mp[mp %in% rownames(expmat)], nbin = 50, n = 50)
 92 |                 }
 93 |             )
 94 |         ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE),
 95 |         by = .(cell_type, sample)
 96 |     ]
 97 |     if(nrow(scores) == 0) return(NULL)
 98 |     # Assign MP to each cell (take the one with highest score):
 99 |     assignments <- scores[, .(max_score = max(score), meta_program = meta_program[which.max(score)]), by = .(cell_type, sample, cell_name)]
100 |     assignments[, a_ct := ifelse(max_score < 1, as.character(NA), meta_program)]
101 |     # In each cell type and sample, remove MPs that were assigned to less than 5% of those cells that were given an assignment (i.e. that had max
102 |     # score >= 1):
103 |     assignments[!is.na(a_ct), a_smpl := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.05], NA, a_ct)}, by = .(cell_type, sample)]
104 |     # Similar but more lenient criterion for a_ct, to get rid of spurious assignments:
105 |     assignments[!is.na(a_ct), a_ct := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.01], NA, a_ct)}, by = .(cell_type, sample)]
106 |     setkey(assignments, cell_type, sample)
107 |     
108 |     cat('Constructing plot data\n')
109 |     pie_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
110 |         pie_out <- assignments[ct, {N_ct <- .N; .SD[, .(prop = .N/N_ct), by = a_ct]}][order(-prop)]
111 |         pie_out[is.na(a_ct), a_ct := 'Unassigned']
112 |         pie_out[prop < 0.03, a_ct := 'Rare MPs (each <3%)']
113 |         pie_out[a_ct == 'Rare MPs (each <3%)', prop := sum(prop)]
114 |         pie_out <- unique(pie_out)
115 |         pie_out[, a_ct := factor(a_ct, levels = c(a_ct[!(a_ct %in% c('Rare MPs (each <3%)', 'Unassigned'))], 'Rare MPs (each <3%)', 'Unassigned'))]
116 |         pie_out <- pie_out[order(a_ct)]
117 |         pie_out[, cumprop := c(0, cumsum(prop)[-.N])]
118 |         return(pie_out)
119 |     })
120 |     bar_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
121 |         slapply(assignments[ct, unique(sample)], function(smpl) {
122 |             bar_out <- assignments[.(ct, as.character(smpl))][, {N_smpl <- .N; .SD[, .(prop = .N/N_smpl), by = a_smpl]}][!is.na(a_smpl)][order(prop)]
123 |             if(nrow(bar_out) == 0) return(NULL)
124 |             bar_out[, a_smpl := factor(a_smpl, levels = a_smpl)]
125 |             return(bar_out)
126 |         })
127 |     })
128 |     bar_data <- slapply(bar_data, function(x) x[!sapply(x, is.null)])
129 |     htmp_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
130 |         slapply(assignments[ct, unique(sample)], function(smpl) {
131 |             if(!(smpl %in% names(bar_data[[ct]]))) return(NULL)
132 |             mps <- bar_data[[ct]][[as.character(smpl)]][, levels(a_smpl)]
133 |             g <- slapply(meta_programs[[ct]][, ..mps], function(mp) {
134 |                 if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp)
135 |                 mp[mp %in% rownames(expmat)]
136 |             })
137 |             asub <- assignments[!is.na(a_smpl)][.(ct, as.character(smpl))]
138 |             if(nrow(asub) > 100) asub <- asub[, {n <- floor(100*.N/nrow(asub)); .SD[sample(1:.N, n, replace = FALSE)]}, by = a_smpl]
139 |             htmp_out_list <- slapply(mps, function(mp_name) {
140 |                 mat <- expmat[g[[mp_name]], asub$cell_name, drop = FALSE]
141 |                 mat_sub <- mat[, asub[a_smpl == mp_name, cell_name], drop = FALSE]
142 |                 dt <- as.data.table(as.matrix(mat - rowMeans(mat)), keep.rownames = 'gene') # centre per gene and convert to data.table
143 |                 dt <- melt(dt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value')[, a_smpl := mp_name]
144 |                 return(list(data = dt, genes = names(sort(rowMeans(mat_sub))), cells = names(sort(colMeans(mat_sub)))))
145 |             })
146 |             htmp_out <- rbindlist(lapply(htmp_out_list, `[[`, 'data'))
147 |             htmp_out[, c('gene_a', 'a_smpl') := .(paste(gene, gsub(' |-|/|\\(|\\)', '', a_smpl), sep = '_'), factor(a_smpl, levels = rev(mps)))]
148 |             cells_ord <- unlist(lapply(htmp_out_list[rev(mps)], `[[`, 'cells'))
149 |             genes_ord <- unlist(lapply(mps, function(mp_name) paste(htmp_out_list[[mp_name]]$genes, gsub(' |-|/|\\(|\\)', '', mp_name), sep = '_')))
150 |             htmp_out[, c('gene_a', 'cell_name') := .(factor(gene_a, levels = genes_ord), factor(cell_name, levels = cells_ord))]
151 |             htmp_out[, gene_num := as.numeric(gene_a)]
152 |             return(htmp_out)
153 |         })
154 |     })
155 |     htmp_data <- slapply(htmp_data, function(x) x[!sapply(x, is.null)])
156 |     
157 |     cat('Done!\n')
158 |     return(list(scores = scores, assignments = assignments, pie_data = pie_data, bar_data = bar_data, heatmap_data = htmp_data, path = p))
159 |     
160 | })
161 | 
162 | if(!all(sapply(out, function(x) is.null(x$assignments)))) {
163 |     
164 |     ct <- gsub('/', '-', r[2])
165 |     if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
166 |     if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
167 |     
168 |     saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_dist.rds'))
169 |     
170 | }
171 | 


--------------------------------------------------------------------------------