├── 3CA_v2
├── MP_analysis_new_batch.R
├── README.md
├── cc_analysis.R
├── cc_analysis_consensus_comparison.R
├── ct_specificity.R
├── gene_analysis.R
├── mp_new.R
└── pseudobulks.R
├── ITH_hallmarks
├── Generating_MPs
│ ├── Generate_Meta_Programs.R
│ ├── Genes_nmf_w_basis_example.RDS
│ ├── custom_magma.R
│ └── robust_nmf_programs.R
├── MPs_distribution
│ ├── MP_distribution.R
│ ├── MP_list.RDS.gz
│ ├── My_study.RDS.gz
│ └── heatCols.RDS.gz
├── README.md
└── TCGA_analysis
│ ├── functions.R
│ ├── pdac_classical_main.R
│ ├── preprocess_clinical_data.R
│ ├── preprocess_mutations.R
│ ├── preprocess_tcga_data.R
│ ├── study_contribution_per_MP.RDS
│ ├── study_tcga_map.R
│ ├── tcga_clinical.R
│ ├── tcga_clinical_deconv.R
│ ├── tcga_clinical_meta.R
│ ├── tcga_mp_scores.R
│ ├── tcga_mp_scores_deconv.R
│ ├── tcga_mutations_new.R
│ └── tcga_subtypes_data.R
├── LICENSE
├── README.md
├── aliases_pubmed.R
├── aliases_table.R
├── bins.R
├── canonical_markers.R
├── cc_sigs_consensus.R
├── cna_mat_prep.R
├── functions.R
├── gene_ave.R
├── gene_mp_cor.R
├── gene_mp_cor_all_web.R
├── gene_plots_cell_types.R
├── gene_plots_data_all_web.R
├── gene_plots_render_web.R
├── gene_plots_rmd_web.R
├── mp_scores_all.R
├── mp_scores_prep.R
├── study_plots_cc.R
├── study_plots_cna.R
├── study_plots_ct_comp.R
├── study_plots_ct_umap.R
├── study_plots_data_cc.R
├── study_plots_data_cc_consensus.R
├── study_plots_data_cc_thresh.R
├── study_plots_data_cc_thresh_consensus.R
├── study_plots_data_cna.R
├── study_plots_data_ct_comp.R
├── study_plots_data_ct_umap.R
├── study_plots_data_dist.R
└── study_plots_dist.R
/3CA_v2/README.md:
--------------------------------------------------------------------------------
1 | # Analysis with 3CA version 2
2 |
3 | The 3CA_v2 directory contains code for analyses conducted on the expanded version of 3CA, including: cell cycle analysis; measurement of gene specificity and sensitivity; quantification of cell type specificity; and an updated characterisation of recurrent expression programs (meta-programs/MPs). All analyses were run using R version 4.1.1.
4 |
--------------------------------------------------------------------------------
/3CA_v2/cc_analysis_consensus_comparison.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(ggplot2)
3 | library(magrittr)
4 | library(Matrix)
5 | library(stringr)
6 | library(plyr)
7 | library(cowplot)
8 | library(RColorBrewer)
9 | library(scales)
10 | library(readxl)
11 | library(ggpubr)
12 | library(ggtext)
13 | library(matkot)
14 |
15 | try(library(randomcoloR), silent = TRUE)
16 |
17 | source('functions.R')
18 |
19 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
20 |
21 |
22 |
23 |
24 |
25 | diseases <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) {
26 | cat(r, '\n')
27 | samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
28 | if(file.exists(samples_path)) {
29 | samples <- fread(samples_path)[!is.na(sample)] # Setting na.strings = '' doesn't seem to work here...
30 | samples[, sample := as.character(sample)]
31 | samples[cancer_type == '', cancer_type := NA]
32 | setkey(samples, sample)
33 | } else {
34 | warning("Samples file doesn't exist for ", r[1], ", ", r[2])
35 | return(NULL)
36 | }
37 | if('cancer_type' %in% names(samples)) {
38 | return(samples[, .(study = r[1], cancer_type = r[2], sample = sample, disease = cancer_type)])
39 | } else {
40 | warning("Samples file for ", r[1], ", ", r[2], " doesn't contain cancer_type column")
41 | return(NULL)
42 | }
43 | }) %>% rbindlist
44 | to_exclude <- list(c('Chen et al. 2020', 'Head and Neck'), c('Sun et al. 2021', 'Liver/Biliary')) %>%
45 | transpose %>% as.data.table %>% setNames(c('study', 'cancer_type'))
46 | setkey(diseases, study, cancer_type)
47 | diseases_to_include <- diseases[!to_exclude][
48 | !is.na(disease) & !(disease %in% c('Normal', 'Premalignant')),
49 | .SD[, .(n_sample = .N, n_study = nrow(unique(.SD))), by = disease, .SDcols = c('study', 'cancer_type')][
50 | (n_study >= 2 & n_sample >= 10) | (n_sample >= 20),
51 | disease
52 | ]
53 | ]
54 | to_include <- diseases[!to_exclude][disease %in% diseases_to_include, .(study, cancer_type)] %>% unique
55 |
56 |
57 |
58 |
59 |
60 | set.seed(1852)
61 | cc_prop <- lapply(c('data_cc_consensus.rds', 'data_cc.RDS'), function(fn) rbindlist(lapply(transpose(as.list(to_include)), function(r) {
62 |
63 | cat(r, '\n')
64 |
65 | # Only difference is to change 'data_cc_consensus.rds' to 'data_cc.RDS' in the following two lines:
66 | if(!(fn %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
67 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/', fn))
68 |
69 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
70 | if(all(nullcond)) return(NULL)
71 |
72 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
73 |
74 | samples_path <- paste0('~/../shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
75 | if(file.exists(samples_path)) {
76 | samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '')[!is.na(sample) & cancer_type %in% diseases_to_include]
77 | setkey(samples, sample)
78 | }
79 |
80 | # Split HNSCC samples into HPV pos and neg:
81 | if(all(r == c('Puram et al. 2017', 'Head and Neck'))) {
82 | samples <- samples[sample %in% c('5', '6', '16', '17', '18', '20', '22', '25', '26', '28')][, cancer_type := 'HNSCC (HPV-neg.)']
83 | } else if(all(r == c('Cillo et al. 2020', 'Head and Neck'))) {
84 | samples[, cancer_type := ifelse(p16_status == 'p16+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
85 | } else if(all(r == c('Kürten et al. 2021', 'Head and Neck'))) {
86 | samples[, cancer_type := ifelse(additional_tumor_characterisics == 'HPV Positive', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
87 | } else if(all(r == c('Unpublished HNSCC HTAN', 'Head and Neck'))) {
88 | samples[, cancer_type := ifelse(hpv == 'HPV+', 'HNSCC (HPV-pos.)', 'HNSCC (HPV-neg.)')]
89 | }
90 |
91 | # Split CRC and Gastric into MSS and MSI:
92 | if(all(r == c('Lee et al. 2020', 'Colorectal'))) {
93 | samples[, cancer_type := ifelse(grepl('mss', genetic_hormonal_features), 'CRC (MSS)', 'CRC (MSI)')]
94 | } else if(all(r == c('Pelka et al. 2021', 'Colorectal'))) {
95 | samples[, cancer_type := ifelse(MSIStatus == 'MSS', 'CRC (MSS)', 'CRC (MSI)')]
96 | } else if(all(r == c('Zhang et al. 2018', 'Colorectal'))) {
97 | samples[, cancer_type := ifelse(msi_status == 'MSS', 'CRC (MSS)', 'CRC (MSI)')]
98 | } else if(all(r == c('Kumar et al. 2022', 'Gastric'))) {
99 | samples[, cancer_type := ifelse(is.na(subtype) | subtype == '', NA, ifelse(subtype == 'MSI', 'Gastric (MSI)', 'Gastric (MSS)'))]
100 | } else if(all(r == c('Zhang et al. 2019', 'Gastric'))) {
101 | samples[, cancer_type := NA] # There's only 1 true gastric cancer sample in this dataset, and it's not annotated with MSI status.
102 | }
103 |
104 | samples <- samples[!is.na(cancer_type)]; if(nrow(samples) == 0) return(NULL)
105 |
106 | rout <- lapply(which(!nullcond), function(i) {
107 |
108 | dt <- plot_data[[i]]$ccdata
109 | dt <- dt[sample %in% samples$sample][, c('disease', 'tech') := do.call(`[`, list(samples, sample))[, .(cancer_type, technology)]]
110 | if(!(nrow(dt) >= 100 & 'cell_type' %in% names(dt))) return(NULL)
111 | dt <- dt[, .(cell_name, cell_type, sample, disease, tech, phase)]
112 | if(any(c('Myeloid', 'Monocyte') %in% dt$cell_type)) dt[cell_type %in% c('Myeloid', 'Monocyte'), cell_type := 'Macrophage']
113 |
114 | # Add complexity column to dt:
115 | cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
116 | setkey(cells, cell_name)
117 | if('complexity' %in% names(cells)) dt[, n_gene := do.call(`[`, list(cells, cell_name, 'complexity'))] else dt[, n_gene := as.numeric(NA)]
118 | return(dt)
119 |
120 | }) %>% rbindlist
121 |
122 | if(is.null(rout) || nrow(rout) == 0) return(NULL)
123 |
124 | # Cell cycle proportions across all cells in a dataset (per disease and tech):
125 | rout_all <- rout[, # Require at least 100 cells altogether, across samples
126 | if(.N >= 100) .(sample = 'all', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'),
127 | n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)),
128 | by = .(cell_type, disease, tech)
129 | ]
130 |
131 | # As above but imposing bounds on the number of cells per sample, so that samples with very many cells don't skew the average:
132 | rout_all_b <- rout[,
133 | if(.N >= 100) { # Require at least 100 cells altogether, across samples
134 | bounds <- .SD[, .(N = .N), by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))]
135 | .SD[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .SD else .SD[sample(1:.N, bounds[1])]}, by = sample][,
136 | if(.N >= 100) .(sample = 'all_b', n_sample = length(unique(sample)), n_cell = .N, n_g1s = sum(phase == 'G1/S'),
137 | n_g2m = sum(phase == 'G2/M'), n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene))
138 | ]
139 | },
140 | by = .(cell_type, disease, tech)
141 | ]
142 |
143 | # Cell cycle proportions per sample:
144 | rout_smpl <- rout[, # Require at least 100 cells per sample
145 | if(.N >= 100) .(n_sample = NA, n_cell = .N, n_g1s = sum(phase == 'G1/S'), n_g2m = sum(phase == 'G2/M'),
146 | n_int = sum(phase == 'Intermediate'), n_gene = mean(n_gene)),
147 | by = .(cell_type, disease, tech, sample)
148 | ]
149 |
150 | rout <- rbindlist(list(rout_all, rout_all_b, rout_smpl)[c(nrow(rout_all) > 0, nrow(rout_all_b) > 0, nrow(rout_smpl) > 0)])
151 |
152 | if(!is.null(rout) && nrow(rout) > 0) {
153 | rout <- unique(rout)
154 | rout[, c('cancer_type', 'study') := .(r[2], r[1])]
155 | setcolorder(rout, c('cancer_type', 'study'))
156 | return(rout)
157 | }
158 |
159 | }))) %>% setNames(c('cons', 'bes'))
160 |
161 | for(x in c('cons', 'bes')) {
162 | cc_prop[[x]] <- cc_prop[[x]][!(cancer_type == 'Brain' & cell_type == 'Fibroblast')][, disease := mapvalues(
163 | gsub(' Cancer', '', disease),
164 | c('Acute Myeloid Leukemia', 'Chronic Myeloid Leukemia', 'Clear Cell Renal Cell Carcinoma', 'Colorectal', 'Cutaneous Basal Cell Carcinoma',
165 | 'Cutaneous Squamous Cell Carcinoma', 'Diffuse Large B Cell Lymphoma', 'Glioblastoma', 'Hepatocellular Carcinoma', 'Lung Adenocarcinoma',
166 | 'Lung Squamous Cell Carcinoma', 'Multiple Myeloma', 'Neuroendocrine Tumor', 'Pancreatic Ductal Adenocarcinoma', 'Small Cell Lung'),
167 | c('AML', 'CML', 'ccRCC', 'CRC', 'Skin BCC', 'Skin SCC', 'DLBCL', 'GBM', 'HCC', 'Lung Adeno.', 'Lung Squamous', 'MM', 'NET', 'PDAC', 'SCLC'),
168 | warn_missing = FALSE
169 | )]
170 | }
171 |
172 | common_cts <- cc_prop$cons[sample == 'all_b', .(n = nrow(unique(.SD))), by = cell_type, .SDcols = c('study', 'disease')][n >= 10, cell_type]
173 |
174 |
175 |
176 |
177 |
178 | cor_data <- do.call(merge, lapply(names(cc_prop), function(x) rbind(
179 | cc_prop[[x]][
180 | sample == 'all_b' & cell_type %in% common_cts,
181 | setNames(.('Cell cycle proportion', weighted.mean((n_g1s + n_g2m + n_int)/n_cell, n_sample)), c('v', paste0('val_', x))),
182 | by = .(cell_type, study, disease)
183 | ],
184 | cc_prop[[x]][
185 | sample == 'all_b' & cell_type %in% common_cts & n_g1s + n_g2m + n_int >= 50,
186 | setNames(.('Phase bias', weighted.mean((n_g1s - n_g2m)/(n_g1s + n_g2m + n_int), n_sample)), c('v', paste0('val_', x))),
187 | by = .(cell_type, study, disease)
188 | ]
189 | )))
190 |
191 | ggplot(cor_data, aes(x = val_cons, y = val_bes)) +
192 | geom_point(size = 3, shape = 21, fill = 'lightblue') +
193 | facet_wrap(vars(v), nrow = 1, ncol = 2, scales = 'free') +
194 | geom_text(
195 | aes(x = x, y = y, label = l),
196 | cor_data[,
197 | .(x = min(val_cons) + 0.18*diff(range(val_cons)), y = min(val_bes) + 0.9*diff(range(val_bes)),
198 | l = paste('r =', signif(cor(val_bes, val_cons), 2))),
199 | by = v
200 | ],
201 | size = 8
202 | ) +
203 | geom_abline(slope = 1, intercept = 0, colour = 'tomato', linewidth = 1) +
204 | theme_bw() +
205 | theme(
206 | axis.text = element_text(size = 14),
207 | axis.title = element_text(size = 16),
208 | plot.title = element_text(size = 18),
209 | strip.text = element_text(size = 16),
210 | strip.background = element_rect(fill = 'grey95'),
211 | aspect.ratio = 1
212 | ) +
213 | labs(x = 'Consensus signature', y = 'Bespoke signature', title = 'Cell cycle quantification with bespoke and consensus gene signatures')
214 |
--------------------------------------------------------------------------------
/3CA_v2/pseudobulks.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(plyr)
11 | library(stringr)
12 | library(Matrix)
13 | library(matkot)
14 |
15 | source('functions.R')
16 |
17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
18 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
19 |
20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
22 | alias_table <- make_alias_table(hgnc_complete_set)
23 |
24 | genes_all <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)]
25 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
26 | rabbr <- paste(gsub(' | et al. ', '', r[1]), gsub(' |/', '-', r[2]), sep = '_')
27 |
28 |
29 |
30 |
31 |
32 | pbulks <- lapply(paths$group, function(g) {
33 |
34 | cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
35 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL)
36 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
37 | genes <- fread(paths[g, genes], header = FALSE)$V1
38 | expmat <- readMM(paths[g, expmat])
39 | expmat <- expmat[genes != '', ] # In case any genes are empty strings
40 | genes <- genes[genes != '']
41 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names
42 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
43 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
44 | rownames(expmat) <- genes
45 | colnames(expmat) <- cells$cell_name
46 |
47 | genes <- genes[genes %in% genes_all & genes %in% names(table(genes))[table(genes) == 1]]
48 |
49 | cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells
50 | setkey(cells, sample, cell_type)
51 | cells <- cells[cells[, .N, by = .(sample, cell_type)][N >= 30, -'N']]
52 |
53 | if(nrow(cells) < 30) return(NULL)
54 | expmat <- round(1e+05*to_frac(expmat[genes, cells$cell_name]), 4) # Normalise to TPM/10
55 |
56 | Reduce(merge, apply(unique(cells[, .(sample, cell_type)]), 1, function(sct) {
57 | out <- as.data.table(rowMeans(expmat[, cells[as.list(sct), cell_name]]), keep.rownames = TRUE)
58 | names(out) <- c('gene', paste(rabbr, g, sct[1], gsub('_', '', sct[2]), sep = '_'))
59 | return(out)
60 | }, simplify = FALSE))
61 |
62 | })
63 |
64 | pbulks <- Reduce(function(x, y) merge(x, y, by = 'gene', all = TRUE), pbulks[!sapply(pbulks, is.null)])
65 |
66 | if(!is.null(pbulks)) fwrite(pbulks, paste0('../data/pseudobulks/', rabbr, '.csv'))
67 |
--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/Generate_Meta_Programs.R:
--------------------------------------------------------------------------------
1 |
2 | # ----------------------------------------------------------------------------------------------------
3 | # Load packages and functions
4 | # ----------------------------------------------------------------------------------------------------
5 |
6 | library(reshape2)
7 | library(NMF)
8 | library(ggplot2)
9 | library(scales)
10 | source("custom_magma.R")
11 | source("robust_nmf_programs.R")
12 |
13 |
14 | ## Input:
15 | # Genes_nmf_w_basis is a list in which each entry contains NMF gene-scores of a single sample. In our study we ran NMF using ranks 4-9 on the top 7000 genes in each sample. Hence each entry in Genes_nmf_w_basis is a matrix with 7000 rows (genes) X 39 columns (NMF programs)
16 | # For the code below to run smoothly, please use the following nomenclature:
17 | # 1) End entry names in Genes_nmf_w_basis (i.e. each sample name) with '_rank4_9_nruns10.RDS'
18 | # 2) End each matrix column name with an extension that represents the NMF rank and program index. for example '_rank4_9_nrun10.RDS.4.1' to represent the first NMF program obtained using rank=4, or '_rank4_9_nrun10.RDS.6.5' to represent the fifth NMF program obtained using rank=6
19 | # See Genes_nmf_w_basis_example.RDS for an example
20 | # We define MPs in 2 steps:
21 | # 1) The function robust_nmf_programs.R performs filtering, so that programs selected for defining MPs are:
22 | # i) Robust - recur in more that one rank within the sample
23 | # ii) Non-redundant - once a NMF program is selected, other programs within the sample that are similar to it are removed
24 | # iii) Not sample specific - has similarity to NMF programs in other samples
25 | # ** Please see https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs
26 | # 2) Selected NMFs are then clustered iteratively, as described in Figure S1. At the end of the process, each cluster generates a list of the 50 genes (i.e. the MP) that represent the NMF programs that contributed to the cluster. Notably, not all initially selected NMFs end up participating in a cluster
27 |
28 |
29 | # ----------------------------------------------------------------------------------------------------
30 | # Select NMF programs
31 | # ----------------------------------------------------------------------------------------------------
32 |
33 |
34 | ## Parameters
35 | intra_min_parameter <- 35
36 | intra_max_parameter <- 10
37 | inter_min_parameter <- 10
38 |
39 |
40 | # get top 50 genes for each NMF program
41 | nmf_programs <- lapply(Genes_nmf_w_basis, function(x) apply(x, 2, function(y) names(sort(y, decreasing = T))[1:50]))
42 | nmf_programs <- lapply(nmf_programs,toupper) ## convert all genes to uppercase
43 |
44 | # for each sample, select robust NMF programs (i.e. observed using different ranks in the same sample), remove redundancy due to multiple ranks, and apply a filter based on the similarity to programs from other samples.
45 | nmf_filter_ccle <- robust_nmf_programs(nmf_programs, intra_min = intra_min_parameter, intra_max = intra_max_parameter, inter_filter=T, inter_min = inter_min_parameter)
46 | nmf_programs <- lapply(nmf_programs, function(x) x[, is.element(colnames(x), nmf_filter_ccle),drop=F])
47 | nmf_programs <- do.call(cbind, nmf_programs)
48 |
49 | # calculate similarity between programs
50 | nmf_intersect <- apply(nmf_programs , 2, function(x) apply(nmf_programs , 2, function(y) length(intersect(x,y))))
51 |
52 | # hierarchical clustering of the similarity matrix
53 | nmf_intersect_hc <- hclust(as.dist(50-nmf_intersect), method="average")
54 | nmf_intersect_hc <- reorder(as.dendrogram(nmf_intersect_hc), colMeans(nmf_intersect))
55 | nmf_intersect <- nmf_intersect[order.dendrogram(nmf_intersect_hc), order.dendrogram(nmf_intersect_hc)]
56 |
57 |
58 |
59 | # ----------------------------------------------------------------------------------------------------
60 | # Cluster selected NMF programs to generate MPs
61 | # ----------------------------------------------------------------------------------------------------
62 |
63 |
64 | ### Parameters for clustering
65 | Min_intersect_initial <- 10 # the minimal intersection cutoff for defining the first NMF program in a cluster
66 | Min_intersect_cluster <- 10 # the minimal intersection cutoff for adding a new NMF to the forming cluster
67 | Min_group_size <- 5 # the minimal group size to consider for defining the first NMF program in a cluster
68 |
69 | Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE)
70 |
71 | Cluster_list <- list() ### Every entry contains the NMFs of a chosen cluster
72 | MP_list <- list()
73 | k <- 1
74 | Curr_cluster <- c()
75 |
76 | nmf_intersect_original <- nmf_intersect
77 |
78 | while (Sorted_intersection[1]>Min_group_size) {
79 |
80 | Curr_cluster <- c(Curr_cluster , names(Sorted_intersection[1]))
81 |
82 | ### intersection between all remaining NMFs and Genes in MP
83 | Genes_MP <- nmf_programs[,names(Sorted_intersection[1])] # Genes in the forming MP are first chosen to be those in the first NMF. Genes_MP always has only 50 genes and evolves during the formation of the cluster
84 | nmf_programs <- nmf_programs[,-match(names(Sorted_intersection[1]) , colnames(nmf_programs))] # remove selected NMF
85 | Intersection_with_Genes_MP <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP
86 | NMF_history <- Genes_MP # has genes in all NMFs in the current cluster, for redefining Genes_MP after adding a new NMF
87 |
88 | ### Create gene list is composed of intersecting genes (in descending order by frequency). When the number of genes with a given frequency span bewond the 50th genes, they are sorted according to their NMF score.
89 | while ( Intersection_with_Genes_MP[1] >= Min_intersect_cluster) {
90 |
91 | Curr_cluster <- c(Curr_cluster , names(Intersection_with_Genes_MP)[1])
92 |
93 | Genes_MP_temp <- sort(table(c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]])), decreasing = TRUE) ## Genes_MP is newly defined each time according to all NMFs in the current cluster
94 | Genes_at_border <- Genes_MP_temp[which(Genes_MP_temp == Genes_MP_temp[50])] ### genes with overlap equal to the 50th gene
95 |
96 | if (length(Genes_at_border)>1){
97 | ### Sort last genes in Genes_at_border according to maximal NMF gene scores
98 | ### Run across all NMF programs in Curr_cluster and extract NMF scores for each gene
99 | Genes_curr_NMF_score <- c()
100 | for (i in Curr_cluster) {
101 | curr_study <- paste( strsplit(i , "[.]")[[1]][1 : which(strsplit(i , "[.]")[[1]] == "RDS")] , collapse = "." )
102 | Q <- Genes_nmf_w_basis[[curr_study]][ match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]])))[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))] ,i]
103 | names(Q) <- names(Genes_at_border[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))]) ### sometimes when adding genes the names do not appear
104 | Genes_curr_NMF_score <- c(Genes_curr_NMF_score, Q )
105 | }
106 | Genes_curr_NMF_score_sort <- sort(Genes_curr_NMF_score , decreasing = TRUE)
107 | Genes_curr_NMF_score_sort <- Genes_curr_NMF_score_sort[unique(names(Genes_curr_NMF_score_sort))]
108 |
109 | Genes_MP_temp <- c(names(Genes_MP_temp[which(Genes_MP_temp > Genes_MP_temp[50])]) , names(Genes_curr_NMF_score_sort))
110 |
111 | } else {
112 | Genes_MP_temp <- names(Genes_MP_temp)[1:50]
113 | }
114 |
115 | NMF_history <- c(NMF_history , nmf_programs[,names(Intersection_with_Genes_MP)[1]])
116 | Genes_MP <- Genes_MP_temp[1:50]
117 |
118 | nmf_programs <- nmf_programs[,-match(names(Intersection_with_Genes_MP)[1] , colnames(nmf_programs))] # remove selected NMF
119 |
120 | Intersection_with_Genes_MP <- sort(apply(nmf_programs, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP
121 |
122 | }
123 |
124 | Cluster_list[[paste0("Cluster_",k)]] <- Curr_cluster
125 | MP_list[[paste0("MP_",k)]] <- Genes_MP
126 | k <- k+1
127 |
128 | nmf_intersect <- nmf_intersect[-match(Curr_cluster,rownames(nmf_intersect) ) , -match(Curr_cluster,colnames(nmf_intersect) ) ] # Remove current chosen cluster
129 |
130 | Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE) # Sort intersection of remaining NMFs not included in any of the previous clusters
131 |
132 | Curr_cluster <- c()
133 | print(dim(nmf_intersect)[2])
134 | }
135 |
136 |
137 |
138 | #### Sort Jaccard similarity plot according to new clusters:
139 |
140 | inds_sorted <- c()
141 |
142 | for (j in 1:length(Cluster_list)){
143 |
144 | inds_sorted <- c(inds_sorted , match(Cluster_list[[j]] , colnames(nmf_intersect_original)))
145 |
146 | }
147 | inds_new <- c(inds_sorted , which(is.na( match(1:dim(nmf_intersect_original)[2],inds_sorted)))) ### clustered NMFs will appear first, and the latter are the NMFs that were not clustered
148 |
149 | nmf_intersect_meltI_NEW <- reshape2::melt(nmf_intersect_original[inds_new,inds_new])
150 |
151 | ggplot(data = nmf_intersect_meltI_NEW, aes(x=Var1, y=Var2, fill=100*value/(100-value), color=100*value/(100-value))) +
152 | geom_tile() +
153 | scale_color_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") +
154 | scale_fill_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") +
155 | theme( axis.ticks = element_blank(), panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 11), axis.title = element_text(size = 12), legend.title = element_text(size=11), legend.text = element_text(size = 10), legend.text.align = 0.5, legend.justification = "bottom") +
156 | theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
157 | theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
158 | guides(fill = guide_colourbar(barheight = 4, barwidth = 1))
159 |
160 |
161 | MP_list <- do.call(cbind, MP_list)
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/Generating_MPs/Genes_nmf_w_basis_example.RDS
--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/custom_magma.R:
--------------------------------------------------------------------------------
1 | # Custom color palette
2 | library(RColorBrewer)
3 | library(viridis)
4 | custom_magma <- c(colorRampPalette(c("white", rev(magma(323, begin = 0.15))[1]))(10), rev(magma(323, begin = 0.18)))
--------------------------------------------------------------------------------
/ITH_hallmarks/Generating_MPs/robust_nmf_programs.R:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------------------------
2 | # Function for selecting robust nonnegative matrix factorization (NMF) programs
3 | # -------------------------------------------------------------------------------------------
4 |
5 | # - nmf_programs = a list; each element contains a matrix with NMF programs (top 50 genes) generated for a specific cell line using different NMF factorization ranks.
6 | # - intra_min = minimum overlap with a program from the same cell line (for selecting robust programs)
7 | # - intra_max = maximum overlap with a program from the same cell line (for removing redundant programs)
8 | # - inter_filter = logical; indicates whether programs should be filtered based on their similarity to programs of other cell lines
9 | # - inter_min = minimum overlap with a program from another cell line
10 |
11 | # Returns a character vector with the NMF programs selected
12 |
13 | robust_nmf_programs <- function(nmf_programs, intra_min = 35, intra_max = 10, inter_filter=T, inter_min = 10) {
14 |
15 | # Select NMF programs based on the minimum overlap with other NMF programs from the same cell line
16 | intra_intersect <- lapply(nmf_programs, function(z) apply(z, 2, function(x) apply(z, 2, function(y) length(intersect(x,y)))))
17 | intra_intersect_max <- lapply(intra_intersect, function(x) apply(x, 2, function(y) sort(y, decreasing = T)[2]))
18 | nmf_sel <- lapply(names(nmf_programs), function(x) nmf_programs[[x]][,intra_intersect_max[[x]]>=intra_min])
19 | names(nmf_sel) <- names(nmf_programs)
20 |
21 | # Select NMF programs based on i) the maximum overlap with other NMF programs from the same cell line and
22 | # ii) the minimum overlap with programs from another cell line
23 | nmf_sel_unlist <- do.call(cbind, nmf_sel)
24 | inter_intersect <- apply(nmf_sel_unlist , 2, function(x) apply(nmf_sel_unlist , 2, function(y) length(intersect(x,y)))) ## calculating intersection between all programs
25 |
26 | final_filter <- NULL
27 | for(i in names(nmf_sel)) {
28 | a <- inter_intersect[grep(i, colnames(inter_intersect), invert = T),grep(i, colnames(inter_intersect))]
29 | b <- sort(apply(a, 2, max), decreasing = T) # for each cell line, ranks programs based on their maximum overlap with programs of other cell lines
30 | if(inter_filter==T) b <- b[b>=inter_min] # selects programs with a maximum intersection of at least 10
31 | if(length(b) > 1) {
32 | c <- names(b[1])
33 | for(y in 2:length(b)) {
34 | if(max(inter_intersect[c,names(b[y])]) <= intra_max) c <- c(c,names(b[y])) # selects programs iteratively from top-down. Only selects programs that have a intersection smaller than 10 with a previously selected programs
35 | }
36 | final_filter <- c(final_filter, c)
37 | } else {
38 | final_filter <- c(final_filter, names(b))
39 | }
40 | }
41 | return(final_filter)
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/MP_distribution.R:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------------------------
2 | # Program for depicting the MP distribution of a given cell types across and within samples
3 | # -------------------------------------------------------------------------------------------
4 |
5 | # My_study = a list in which each element represents a sample in the study; the elements are expression matrices of a given cell type (i.e. cell_type below) in CPM or TPM units. Row names in each matrix should be gene symbols, and column names the cell IDs.
6 | # cell_type = one of the following cell types : "Cancer" , "Endothelial" , "Epithelial" , "Fibroblasts" , "Macrophages" , "CD4_T_cells", "CD8_T_cells" , "B_cells". Here "Cancer" represents malignant cells
7 | # MinGenes = the min number of MP genes that should exist in the study (default is 25)
8 | # MinScore = the minimal score for assigning a cell to a MP (default is 1). A cell is assigned to the MP with the maximal score, given that it exceeded MinScore. If the maximal score is below MinScore, the cell is unassigned.
9 | # MinCells = the minimal % of cells that should be assigned to a MP in order to account for the MP (default is 0.05)
10 | # MP_list = a list of the MPs for each cell_type
11 |
12 | ### Output:
13 | # (a) Pie chart of the MP distribution across the study
14 | # (b) Bar plot per tumor with the % of cells per MP
15 | # (c) Expression heatmap per tumor
16 |
17 |
18 |
19 | # -------------------------------------------------------------------------------------------
20 |
21 |
22 | library(ggplot2)
23 | library(scalop) # see https://rdrr.io/github/jlaffy/scalop/man/sigScores.html
24 | library(gridExtra)
25 | library(ggpubr)
26 |
27 |
28 |
29 | ### Define the following:
30 | # My_study =
31 | # cell_type =
32 | # MP_list = readRDS("MP_list.RDS") # can be found in the Github repository
33 | # heatCols = readRDS("heatCols.RDS") # can be found in the Github repository
34 |
35 |
36 | Score_cells <- function(L,
37 | cell_type,
38 | MinGenes = 25,
39 | MinScore = 1,
40 | MinCells = 0.05,
41 | MP_list
42 | )
43 | {
44 |
45 | MP_list <- MP_list[[cell_type]]
46 |
47 | L <- lapply(L, function(x) log2((x/10) + 1) ) # Log normalize before scoring
48 |
49 | MP_scores_per_sample <- lapply(L, function(x) scalop::sigScores(x, sigs = MP_list, conserved.genes = MinGenes/50 ) )
50 |
51 | remove_cells <- function(x,MinScore){ # remove cells whose max score was below MinScore
52 | max_score <- apply(x, 1, function(y) max(y))
53 | cells_rm <- which(max_score < MinScore)
54 | if (length(cells_rm) > 0 ){
55 | x <- x[-cells_rm , ]
56 | }
57 | return(x)
58 | }
59 | MP_scores_per_sample <- lapply(MP_scores_per_sample, function(x) remove_cells(x,MinScore))
60 |
61 | Assign_MP_per_cell <- lapply(MP_scores_per_sample, function(x) apply(x, 1, function(y) colnames(x)[which(y==max(y))] ) )
62 |
63 | filter_cells <- function(x,MinCells){ # remove MP that were assassin to less than MinCells in each sample
64 | MP_frequency <- as.numeric(ave(x, x, FUN = length))
65 | MP_rm <- which(MP_frequency/length(x) < MinCells) # MPs to be removed
66 | if (length(MP_rm)>0){
67 | x <- x[-MP_rm]
68 | }
69 | return(x)
70 | }
71 | Assign_MP_per_cell_filtered <- lapply(Assign_MP_per_cell, function(x) filter_cells(x,MinCells))
72 |
73 | return(Assign_MP_per_cell_filtered)
74 |
75 | }
76 |
77 |
78 | Assign_MP_per_cell_filtered <- Score_cells(L = My_study , cell_type = "Cancer" , MP_list=MP_list)
79 |
80 |
81 |
82 | #### Part A: pie chart of MPs across the whole study
83 | MinCells <- 0.05
84 | df <- data.frame(MPs = names(table(unlist(Assign_MP_per_cell_filtered))) , frequency = as.vector(table(unlist(Assign_MP_per_cell_filtered))) )
85 | MPs_rm <- which(df$frequency/sum(df$frequency) < MinCells) ## remove also MPs that were assassin to less than MinCells from the total
86 | if (length(MPs_rm > 0)){
87 | df$MPs[MPs_rm] <- paste0("Other MPs (<", MinCells*100, "%)")
88 | }
89 |
90 |
91 | blank_theme <- theme_minimal()+
92 | theme(
93 | axis.title.x = element_blank(),
94 | axis.title.y = element_blank(),
95 | panel.border = element_blank(),
96 | panel.grid=element_blank(),
97 | axis.ticks = element_blank(),
98 | plot.title=element_text(size=14, face="bold")
99 | )
100 |
101 |
102 | pie <- ggplot(df, aes(x="", y=frequency, fill=MPs)) +
103 | geom_bar(width = 1, stat = "identity") +
104 | coord_polar("y", start=0)+
105 | blank_theme +
106 | theme(axis.text.x=element_blank())
107 |
108 |
109 | pie
110 |
111 |
112 | #### Part B: Bar plot with MP distribution per sample
113 |
114 | sample_num <- length(Assign_MP_per_cell_filtered)
115 | nCol <- floor(sqrt(sample_num))
116 | MPs <- unique(unlist(Assign_MP_per_cell_filtered))
117 |
118 | ### For arranging the full MP names as legend in the plot
119 | MP_list <- MP_list[[cell_type]]
120 | MPs <- MPs[sort(match(MPs, names(MP_list)),index.return = T)$ix]
121 |
122 | Assign_MP_per_cell_filtered_abbrev <- lapply(Assign_MP_per_cell_filtered, function(x) apply(as.data.frame(x) , 1 , function(x) strsplit(x, "[ ]")[[1]][1]) )
123 |
124 | df_list <- lapply(Assign_MP_per_cell_filtered_abbrev, function(x) data.frame(MPs = names(table(x)) , frequency = as.vector(table(x))/sum(as.vector(table(x))) ))
125 | df_list2 <- lapply(Assign_MP_per_cell_filtered, function(x) data.frame(MPs = names(table(x)) , frequency = as.vector(table(x))/sum(as.vector(table(x))) ))
126 |
127 | P <- lapply(seq_along(df_list), function(I)
128 | ggplot(data=df_list[[I]], aes(x=reorder(MPs,-frequency), y=frequency)) +
129 | geom_bar(stat="identity")+
130 | theme(axis.title.x=element_blank())+
131 | ggtitle(names(df_list)[I])
132 | )
133 |
134 |
135 | P
136 |
137 | #### Part C: an expression heatmap for each sample
138 |
139 | df_list_ordered <- lapply(df_list, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,]) ## order MPs in each sample as in the bar plots, in decreasing order
140 | df_list_ordered2 <- lapply(df_list2, function(x) x[sort(x$frequency,decreasing = T , index.return = T)$ix,]) ## order MPs in each sample as in the bar plots, in decreasing order
141 |
142 |
143 | L1 <- lapply(seq_along(My_study), function(I) My_study[[I]][ , names(Assign_MP_per_cell_filtered[[I]])]) ## extract relevant cells that were assigned to an MP (at least 5%)
144 | names(L1) <- names(My_study)
145 |
146 | L1 <- lapply(L1, function(x) log2((x/10) + 1) ) ## Log normalize
147 | L1 <- lapply(L1, function(x) (x - rowMeans(x)) ) ## center
148 |
149 | L2 <- lapply(seq_along(L1), function(I) L1[[I]][ , sort(match(Assign_MP_per_cell_filtered_abbrev[[I]],df_list_ordered[[I]]$MPs), index.return = T)$ix] ) # sort cells according to MPs in df_list_ordered
150 | names(L2) <- names(L1)
151 |
152 | MP_genes_per_sample <- lapply(df_list_ordered2, function(x) unlist(MP_list[x$MPs])) ### MP genes sorted by the MP order in df_list_ordered2
153 |
154 | L_plot <- lapply(seq_along(L2), function(I) L2[[I]][match(MP_genes_per_sample[[I]] , rownames(L2[[I]]))[!is.na(match(MP_genes_per_sample[[I]] , rownames(L2[[I]])))] , ])
155 | names(L_plot) <- names(L2)
156 |
157 | color.scheme <- colorRampPalette(c(heatCols))(n=333)
158 |
159 | Plot_heatmap <- function(M){
160 |
161 | M_new2 <- M
162 | M_new2 <- apply(M_new2, 2, rev)
163 | M_meltII <- reshape2::melt(t(M_new2))
164 | M_meltII$Var2 <- factor(M_meltII$Var2)
165 |
166 | G <- ggplot(data = M_meltII, aes(x=Var1, y=Var2, fill=value, color=value)) +
167 | geom_tile() +
168 | scale_color_gradient2(limits=c(-4,4), low=color.scheme[1:111], mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL) +
169 | scale_fill_gradient2(limits=c(-4,4), low=color.scheme[1:111], mid =color.scheme[112:222], high = color.scheme[223:333], midpoint = 0, oob=squish, name=NULL) +
170 | theme( panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 8), axis.title = element_text(size = 8), legend.title = element_text(size=8), legend.text = element_text(size = 8), legend.text.align = 0.5, legend.justification = "bottom" ) +
171 | theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
172 | theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank() )
173 |
174 | return(G)
175 | }
176 |
177 | P1 <- lapply(L_plot, function(x) Plot_heatmap(x))
178 |
179 | P_plot <- c()
180 | for (i in 1:length(P)){
181 | P_plot <- c(P_plot , P[i] , P1[i])
182 | }
183 |
184 |
185 | P_plot <- do.call("ggarrange", c(P_plot, ncol=2, nrow = 3))
186 |
187 | lapply(P_plot, function(x)
188 |
189 | annotate_figure(x, left = text_grob(paste(MPs , collapse = "\n"),
190 | color = "red", face = "bold", size = 10))
191 |
192 | )
193 |
194 |
195 |
196 |
--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/MP_list.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/MP_list.RDS.gz
--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/My_study.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/My_study.RDS.gz
--------------------------------------------------------------------------------
/ITH_hallmarks/MPs_distribution/heatCols.RDS.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/MPs_distribution/heatCols.RDS.gz
--------------------------------------------------------------------------------
/ITH_hallmarks/README.md:
--------------------------------------------------------------------------------
1 | # Source code for Gavish et al.
2 |
3 | The ITH_hallmarks folder contains code for reproducing the analyses in the Gavish et al. This code is arranged into 3 folders.
4 |
5 | ## Generating_MPs
6 | - Generate_Meta_Programs.R generates MPs from NMFs programs that were calculated for each sample using different ranks. The NMF programs were calculated per sample using the ‘NMF’ R package:
7 | NMFs_per_sample = nmf(x = Expression_matrix, rank = 4:9, method="snmf/r", nrun = 10)
8 | NMF programs are listed in Genes_nmf_w_basis, where each entry contains NMF gene-scores of a single sample.
9 | Using the function robust_nmf_programs.R (described below) and a custom written clustering method, MPs are derived from the NMF programs.
10 | - Genes_nmf_w_basis_example.RDS is an example for how the NMF output is arranged. Note that each element name in the list ends with '_rank4_9_nruns10.RDS', and each matrix column ends with an extension that represents the NMF rank and program index.
11 | - robust_nmf_programs.R filters outs redundant and non-robust NMF programs for each sample. The remaining NMF programs are those that appeared in more than one NMF rank within the sample and have similarity to a NMF program in a different sample. Please see also https://github.com/gabrielakinker/CCLE_heterogeneity for more details on how to define robust NMF programs.
12 |
13 | ## MPs_distribution
14 | - MP_distribution.R depicts the meta-program distribution of a certain cell type within individual samples and across the whole study. It requires a list of scRNAseq expression matrices of different samples in a study (of a certain cell type).
15 | - MP_list.RDS is a list of meta-programs for each cell type required for inferring the distributions in MP_distribution.R
16 | - heatCols.RDS is the colormap used for plotting heatmaps in MP_distribution.R
17 | - My_study.RDS can be used for an example
18 |
19 | ## TCGA_analysis
20 | These scripts are for analysing expression of the MPs in TCGA data. A few notes on the data used here:
21 | - Some of the external datasets used are annotated with their source (e.g. URL to paper) via comments in the code.
22 | - The 'hgnc_complete_set.txt' file can be downloaded from the HGNC website: .
23 | - Other datasets used are available on request.
24 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/functions.R:
--------------------------------------------------------------------------------
1 | make_alias_table <- function(hgnc_complete_set) {
2 | out <- hgnc_complete_set[,
3 | rbind(
4 | data.table(symbol_alt = str_split(alias_symbol, '\\|')[[1]], type = 'alias'),
5 | data.table(symbol_alt = str_split(prev_symbol, '\\|')[[1]], type = 'prev')
6 | ),
7 | by = symbol
8 | ][symbol_alt == '', symbol_alt := NA]
9 | setkey(out, symbol_alt)
10 | return(out)
11 | }
12 |
13 | update_symbols_fast <- function(symbols, alias_table) {
14 | if(length(symbols) != length(unique(symbols))) warning('Not all symbols are unique!')
15 | setkey(alias_table, symbol_alt)
16 | symbols[grep('C[0-9]+ORF[0-9]+', symbols)] <- gsub('ORF', 'orf', symbols[grep('C[0-9]+ORF[0-9]+', symbols)])
17 | cond <- !(symbols %in% alias_table$symbol)
18 | # The following addition to ensures two different symbols don't map to the same "prev" symbol...
19 | cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt])
20 | # ...And the following makes sure we have all unique prev symbols:
21 | cond <- cond & !(symbols %in% alias_table[type == 'prev'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]])
22 | if(any(cond)) {
23 | # alias_table[type == 'prev'][symbols[cond]]
24 | symbols[cond] <- alias_table[type == 'prev'][symbols[cond]][,
25 | .(newsymb = { # Match symbols that have at least one "prev" symbol that isn't already in symbols:
26 | cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)]
27 | ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such "prev" symbol
28 | }),
29 | by = symbol_alt
30 | ]$newsymb
31 | }
32 | cond <- !(symbols %in% alias_table$symbol)
33 | # As above, the following additions to ensure two different symbols don't map to the same "alias", and we have all unique aliases:
34 | cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond]][symbol %in% names(table(symbol))[table(symbol) > 1], symbol_alt])
35 | cond <- cond & !(symbols %in% alias_table[type == 'alias'][symbols[cond], names(table(symbol_alt))[table(symbol_alt) > 1]])
36 | if(any(cond)) {
37 | symbols[cond] <- alias_table[type == 'alias'][symbols[cond]][,
38 | .(newsymb = { # Match symbols that have at least one alias that isn't already in symbols:
39 | cands <- symbol[!is.na(symbol) & !(symbol %in% symbols)]
40 | ifelse(length(cands) == 1, cands, symbol_alt) # Convert only those symbols that have exactly one such alias
41 | }),
42 | by = symbol_alt
43 | ]$newsymb
44 | }
45 | return(symbols)
46 | }
47 |
48 | dirr <- function(x) dir(x, recursive = TRUE)
49 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/pdac_classical_main.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(readxl)
3 | library(magrittr)
4 | library(stringr)
5 | library(plyr)
6 | library(ggplot2)
7 | library(RColorBrewer)
8 | library(scales)
9 | library(cowplot)
10 | library(randomcoloR)
11 | library(matkot)
12 |
13 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
14 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
15 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
16 |
17 | cancer_genes <- as.data.table(read_xlsx('/home/labs/tirosh/tyler/pan_cancer/cancer5000.xlsx', skip = 1, n_max = 260))[`Cancer5000-S (219)` == 1, gene]
18 |
19 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[
20 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
21 | ][!grepl('unplaced', location), chrarm := str_extract(location, '[0-9]*[XY]*[pq]')][!is.na(chrarm)][
22 | order(as.numeric(mapvalues(gsub('[pq]', '', chrarm), c('X', 'Y'), c(23, 24))), str_extract(chrarm, '[pq]'))
23 | ]
24 |
25 |
26 |
27 |
28 |
29 | meta_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Cells.csv', key = 'sample_id', na.strings = '')
30 | scores_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Scores.csv')[grep('MP17 Interferon|PDAC-classical', meta_program)]
31 | mut_data_hnsc <- fread('/home/labs/tirosh/tyler/TCGA_data/HNSC/Mutation_data.csv')
32 | mut_data_hnsc <- mut_data_hnsc[gene %in% cancer_genes]
33 | setkey(mut_data_hnsc, gene, Variant_Classification)
34 | mut_data_hnsc <- rbind(
35 | mut_data_hnsc[Variant_Classification %in% c('Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Nonstop_Mutation',
36 | 'Translation_Start_Site')],
37 | mut_data_hnsc[
38 | mut_data_hnsc[
39 | Variant_Classification %in% c('Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins'),
40 | .(n = length(unique(patient_id))),
41 | by = .(gene, Variant_Classification)
42 | ][n >= 2, -'n']
43 | ]
44 | )
45 | pscores_hnsc <- copy(scores_hnsc)[, patient_id := do.call(`[`, args = list(meta_hnsc, sample_id))$patient_id][, -'sample_id']
46 | setcolorder(pscores_hnsc, 'patient_id')
47 | setkey(pscores_hnsc, patient_id)
48 | pscores_hnsc <- pscores_hnsc[patient_id %in% mut_data_hnsc$patient_id]
49 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pscores_hnsc$patient_id]
50 | pids_tab <- pscores_hnsc[meta_program == meta_program[1], table(patient_id)]
51 | pids <- names(pids_tab)[pids_tab == 1]
52 | pscores_hnsc <- pscores_hnsc[patient_id %in% pids]
53 | mut_data_hnsc <- mut_data_hnsc[patient_id %in% pids]
54 |
55 | pscores_hnsc <- dcast(pscores_hnsc[, .(patient_id, meta_program, score)], patient_id ~ meta_program, value.var = 'score')
56 | genes <- c('CASP8', 'NSD1')
57 | pscores_hnsc[, (genes) := lapply(genes, function(g) patient_id %in% mut_data_hnsc[gene == g, patient_id])]
58 |
59 | # See how scores for this MP match with the trinity of CASP8-mut, HRAS-mut and TP53-WT:
60 | pscores_hnsc[,
61 | categ3 := patient_id %in% mut_data_hnsc[gene == 'CASP8', patient_id] &
62 | patient_id %in% mut_data_hnsc[gene == 'HRAS', patient_id] &
63 | !(patient_id %in% mut_data_hnsc[gene == 'TP53', patient_id]),
64 | by = patient_id
65 | ]
66 |
67 | bdata_hnsc_ifn <- melt(
68 | pscores_hnsc[, .(patient_id, `MP17 Interferon/MHC-II (I)`, CASP8, categ3)],
69 | id.vars = c('patient_id', 'MP17 Interferon/MHC-II (I)'),
70 | variable.name = 'event',
71 | value.name = 'event_pres'
72 | )[, event := mapvalues(event, c('CASP8', 'categ3'), c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt'))]
73 | box_hnsc_ifn <- ggplot(bdata_hnsc_ifn) +
74 | geom_boxplot(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), outlier.shape = NA) +
75 | geom_point(aes(x = event_pres, y = `MP17 Interferon/MHC-II (I)`), position = position_jitter(width = 0.3), shape = 21, fill = 'black',
76 | stroke = 0.3, alpha = 0.3) +
77 | geom_text(
78 | data = data.table(
79 | x = c(1.5, 1.5),
80 | y = c(3.1, 3.1),
81 | label = pscores_hnsc[, paste('p =', c(
82 | signif(t.test(`MP17 Interferon/MHC-II (I)` ~ CASP8)$p.value, 2),
83 | signif(t.test(`MP17 Interferon/MHC-II (I)` ~ categ3)$p.value, 2)
84 | ))],
85 | event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt')
86 | ),
87 | aes(x = x, y = y, label = label)
88 | ) +
89 | geom_segment(
90 | data = data.table(x = c(1, 1, 2, 1, 1, 2), xend = c(2, 1, 2, 2, 1, 2), y = rep(2.8, 6), yend = c(2.8, 2.7, 2.7, 2.8, 2.7, 2.7),
91 | event = c('CASP8-mut', 'CASP8-mut, HRAS-mut,\nTP53-wt')),
92 | aes(x = x, xend = xend, y = y, yend = yend)
93 | ) +
94 | facet_wrap(vars(event), strip.position = 'bottom') +
95 | theme_test() +
96 | theme(strip.placement = 'outside', strip.background = element_rect(fill = NA, colour = NA), strip.text = element_text(size = 11, vjust = 1)) +
97 | labs(x = NULL, y = 'Interferon/MHC-II (I) score', title = 'Interferon/MHC-II (I) and mutations in HNSC')
98 |
99 | subtypes_hnsc <- fread('../../TCGA_data/HNSC/subtypes.csv', na.strings = '', key = 'patient_id')
100 | pscores_hnsc[, subtype := do.call(`[`, list(subtypes_hnsc, patient_id))$subtype]
101 | pscores_hnsc[, subtype_classical := subtype == 'Classical']
102 | # pscores_hnsc[, t.test(subtype_classical ~ pdac_high)] # No enrichment of the Classical subtype in the PDAC-high population.
103 |
104 | clin_hnsc <- fread('../../TCGA_data/HNSC/Clinical.csv', key = 'patient_id')
105 | pscores_hnsc[, site := do.call(`[`, list(clin_hnsc, patient_id))$anatomic_neoplasm_subdivision]
106 | pscores_hnsc[, larynx := (site == 'larynx')]
107 | # pscores_hnsc[, t.test(larynx ~ pdac_high)] # Larynx is enriched in the PDAC-high population.
108 |
109 | pscores_hnsc[, larynx_nice := factor(ifelse(larynx, 'Larynx', 'Non-larynx'), levels = c('Non-larynx', 'Larynx'))]
110 | pscores_hnsc[, nsd1_nice := factor(ifelse(NSD1, 'NSD1-mut', 'NSD1-wt'))]
111 | bdata_hnsc_pdac_larynx <- rbind(
112 | pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = ifelse(larynx, 'Larynx', 'Non-larynx'))],
113 | pscores_hnsc[, .(id = patient_id, score = `MP30 PDAC-classical`, nsd1 = NSD1, categ = 'All tumors')]
114 | )
115 | bdata_hnsc_pdac_larynx[, nsd1 := factor(ifelse(nsd1, 'NSD1-mut', 'NSD1-wt'), levels = c('NSD1-wt', 'NSD1-mut'))]
116 | bdata_hnsc_pdac_larynx[, categ := factor(categ, levels = c('All tumors', 'Non-larynx', 'Larynx'))]
117 |
118 | # Just the "All tumors" boxes:
119 | box_hnsc <- ggplot(bdata_hnsc_pdac_larynx[categ == 'All tumors']) +
120 | geom_boxplot(aes(x = nsd1, y = score), outlier.shape = NA, width = 0.8) +
121 | geom_point(aes(x = nsd1, y = score), fill = 'grey80', size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3)) +
122 | geom_text(
123 | data = data.table(x = 1.5, y = 2.9, label = paste('p =', signif(c(pscores_hnsc[, t.test(`MP30 PDAC-classical` ~ NSD1)$p.value]), 2))),
124 | aes(x = x, y = y, label = label)
125 | ) +
126 | geom_segment(
127 | data = data.table(x = c(1, 1, 2), xend = c(2, 1, 2), y = rep(2.7, 3), yend = c(2.7, 2.65, 2.65)),
128 | aes(x = x, xend = xend, y = y, yend = yend)
129 | ) +
130 | theme_test() +
131 | theme(axis.text.x = element_text(size = 11), plot.title = element_text(hjust = 0.5)) +
132 | labs(title = 'PDAC-classical and\nNSD1 mutation in HNSC', x = NULL, y = 'PDAC-classical score')
133 |
134 |
135 |
136 |
137 |
138 | library(seriation)
139 |
140 | scores_patient_mut_all <- fread('../data/scores_patient_mut.csv')
141 |
142 | exp_data <- slapply(c('LUAD', 'PAAD'), function(ct) {
143 | expmat <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1']
144 | meta <- fread(paste0('/home/labs/tirosh/tyler/TCGA_data/', ct, '/Cells.csv'), na.strings = '')[
145 | sample_type != 'normal' & patient_id %in% scores_patient_mut_all[cancer_type == ct, patient_id]
146 | ]
147 | expmat <- expmat[, meta$sample_id]
148 | colnames(expmat) <- meta$patient_id
149 | return(list(expmat = expmat, meta = meta))
150 | })
151 | top_genes <- slapply(
152 | names(exp_data),
153 | function(ct) head(names(sort(apply(exp_data[[ct]]$expmat, 1, function(x) median(abs(x - median(x)))), decreasing = TRUE)), 2500)
154 | )
155 | cormat <- slapply(names(exp_data), function(ct) cor(t(apply(exp_data[[ct]]$expmat[top_genes[[ct]], ], 1, function(x) x - median(x)))))
156 | clust <- slapply(names(exp_data), function(ct) seriate(dist(cormat[[ct]]), method = 'HC_average')[[1]])
157 | pdac_clust <- labels(as.dendrogram(clust$LUAD)[[1]][[2]][[1]])
158 |
159 | luad_clinical <- fread('/home/labs/tirosh/tyler/TCGA_data/LUAD/Clinical.csv', key = 'patient_id')
160 | hist_data <- copy(scores_patient_mut_all[cancer_type == 'LUAD' & grepl('PDAC Classical', meta_program)])[,
161 | c('pdac_high', 'hist_type') := .((score > 1.5), do.call(function(x) luad_clinical[x, histological_type], list(x = patient_id)))
162 | ][, mucinous := grepl('^mucinous| mucinous', hist_type)] # This regex ensures "nonmucinous" is not counted
163 | setkey(hist_data, patient_id)
164 |
165 | top_genes_int <- intersect(top_genes$LUAD, top_genes$PAAD)
166 | cor_luad_paad <- cor(exp_data$LUAD$expmat[top_genes_int, ], exp_data$PAAD$expmat[top_genes_int, ])
167 | bxplt_data <- data.table(patient_id = rownames(cor_luad_paad), mean_cor = rowMeans(cor_luad_paad))[, pdac_clust := (patient_id %in% pdac_clust)]
168 | setkey(scores_patient_mut_all, patient_id)
169 | bxplt_data[, pdac_high := do.call(function(x) scores_patient_mut_all[grepl('PDAC Classical', meta_program)][x, score > 1.5], list(x = patient_id))]
170 | bxplt_data[, mucinous := hist_data$mucinous] # Checked that this gives TRUE: all(bxplt_data$patient_id == hist_data$patient_id)
171 |
172 | box_luad <- ggplot(bxplt_data, aes(x = mapvalues(pdac_high, c(TRUE, FALSE), c('PDAC score\n\u2265 1.5', 'PDAC score\n< 1.5')), y = mean_cor)) +
173 | geom_boxplot(outlier.shape = NA) +
174 | geom_point(
175 | aes(fill = factor(mapvalues(mucinous, c(TRUE, FALSE), c('Mucinous', 'Non-mucinous')), levels = c('Non-mucinous', 'Mucinous'))),
176 | size = 2, shape = 21, stroke = 0.3, position = position_jitter(width = 0.3)
177 | ) +
178 | scale_fill_manual(values = c('Non-mucinous' = 'grey80', 'Mucinous' = 'deeppink2')) +
179 | theme_test() +
180 | theme(axis.text.x = element_text(size = 11)) +
181 | labs(x = NULL, y = 'Mean correlation with PDAC samples', title = NULL, fill = 'LUAD subtype')
182 |
183 |
184 |
185 |
186 |
187 | box_hnsc_grob <- ggplotGrob(box_hnsc)
188 | box_luad_grob <- ggplotGrob(box_luad)
189 | widths_hnsc <- c(0.3, 0, 0.5, 0.4, 8, 0, 0, 0, 0.3)
190 | widths_luad <- c(0.3, 0, 0.5, 0.7, 8, 0, 0, 0.3, 3, 0, 0.3)
191 | heights_hnsc <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3)
192 | heights_luad <- c(0.3, 0, 1, 0, 0, 0, 10, 0.8, 0, 0, 0, 0.3)
193 | box_hnsc_grob$widths <- unit(widths_hnsc, 'cm')
194 | box_luad_grob$widths <- unit(widths_luad, 'cm')
195 | box_hnsc_grob$heights <- unit(heights_hnsc, 'cm')
196 | box_luad_grob$heights <- unit(heights_luad, 'cm')
197 |
198 | cairo_pdf('../data/pdac_classical_main.pdf', height = sum(heights_luad)/2.54, width = sum(c(widths_luad, widths_hnsc))/2.54)
199 | plot_grid(box_luad_grob, box_hnsc_grob, nrow = 1, ncol = 2, rel_widths = c(sum(widths_luad), sum(widths_hnsc)))
200 | dev.off()
201 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/preprocess_clinical_data.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(matkot)
4 |
5 | # source('functions.R')
6 |
7 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC',
8 | 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
9 |
10 | for(ct in cancer_types) {
11 |
12 | cat(ct, '\b...')
13 |
14 | if(ct %in% dir('~/TCGA_data') && 'All_CDEs.txt' %in% dir(paste0('~/TCGA_data/', ct))) {
15 | clinical_data <- fread(paste0('~/TCGA_data/', ct, '/All_CDEs.txt'), showProgress = FALSE)
16 | } else {
17 |
18 | download.file(
19 | paste0(
20 | 'http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/',
21 | ct,
22 | '/20160128/gdac.broadinstitute.org_',
23 | ct,
24 | '.Clinical_Pick_Tier1.Level_4.2016012800.0.0.tar.gz'
25 | ),
26 | destfile = 'tmp.tar.gz',
27 | quiet = TRUE
28 | )
29 |
30 | file_names <- untar('tmp.tar.gz', list = TRUE)
31 | untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'All_CDEs.txt')], exdir = 'tmp')
32 | file.remove('tmp.tar.gz')
33 |
34 | # Read in the data:
35 | clinical_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'All_CDEs.txt')]), showProgress = FALSE)
36 |
37 | # Remove the created directory:
38 | unlink('tmp', recursive = TRUE)
39 |
40 | }
41 |
42 | clinical_data <- transpose(clinical_data, make.names = 'bcr_patient_barcode', keep.names = 'bcr_patient_barcode')
43 | clinical_data <- clinical_data[, -'patient_id']
44 | setnames(clinical_data, 'bcr_patient_barcode', 'patient_id')
45 | clinical_data[, c('patient_id', 'cancer_type') := .(toupper(gsub('-', '\\.', patient_id)), ct)]
46 | clinical_data <- clinical_data[order(patient_id)]
47 | setcolorder(clinical_data, c('patient_id', 'cancer_type'))
48 |
49 | fwrite(clinical_data, paste0('~/TCGA_data/', ct, '/Clinical.csv'))
50 |
51 | cat('Done!\n')
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/preprocess_mutations.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(limma)
4 | library(stringr)
5 | library(plyr)
6 | library(RCurl)
7 | library(matkot)
8 |
9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LAML', 'LGG', 'LIHC',
10 | 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
11 |
12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
13 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]),
14 | .(symbol = symbol, alias = alias_symbol, entrez_id = entrez_id)
15 | ][, entrez_id := as.character(entrez_id)]
16 |
17 |
18 |
19 |
20 |
21 | for(ct in cancer_types) {
22 |
23 | cat(ct, '\n')
24 |
25 | # The CNV file URLs use one of three suffixes, so we need to choose whichever one exists:
26 | data_url <- sapply(
27 | c('-TP', '-TB', '-TM'),
28 | function(suff) paste0( 'http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/', ct, suff, '/20160128/gdac.broadinstitute.org_', ct,
29 | suff, '.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz'),
30 | USE.NAMES = FALSE
31 | )
32 | data_url <- data_url[sapply(data_url, url.exists)]
33 |
34 | download.file(data_url, destfile = 'tmp.tar.gz', quiet = TRUE)
35 |
36 | file_names <- untar('tmp.tar.gz', list = TRUE)
37 | untar('tmp.tar.gz', files = file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')], exdir = 'tmp')
38 | file.remove('tmp.tar.gz')
39 |
40 | # Read in the data:
41 | cnv_data <- fread(paste0('tmp/', file_names[endsWith(file_names, 'all_thresholded.by_genes.txt')]))
42 |
43 | # Remove the created directory:
44 | unlink('tmp', recursive = TRUE)
45 |
46 | cnv_data[,
47 | c('Gene Symbol', 'Locus ID', 'Cytoband') := .(
48 | do.call(
49 | function(gl) {
50 | gl[grep('C[0-9]+ORF[0-9]+', gl)] <- gsub('ORF', 'orf', gl[grep('C[0-9]+ORF[0-9]+', gl)])
51 | gl_mapped <- alias2SymbolTable(gl)
52 | gl_log <- gl %in% hgnc_complete_set$symbol
53 | gl_mapped_log <- gl_mapped %in% hgnc_complete_set$symbol
54 | out <- gl
55 | out[!gl_log & gl_mapped_log] <- gl_mapped[!gl_log & gl_mapped_log]
56 | out[!gl_log & !gl_mapped_log] <- sapply(
57 | gl_mapped[!gl_log & !gl_mapped_log],
58 | function(g) hgnc_complete_set[
59 | grepl(paste0('^', g, '\\||\\|', g, '\\||\\|', g, '$'), alias),
60 | switch((.N == 1) + 1, NA, symbol)
61 | ]
62 | )
63 | return(out)
64 | },
65 | args = list(gl = cnv_data$`Gene Symbol`)
66 | ),
67 | NULL,
68 | NULL
69 | )
70 | ]
71 |
72 | cnv_data <- cnv_data[!is.na(`Gene Symbol`)]
73 | cnv_data <- cnv_data[, set_rownames(as.matrix(.SD), `Gene Symbol`), .SDcols = -'Gene Symbol']
74 | colnames(cnv_data) <- gsub('-', '\\.', colnames(cnv_data))
75 |
76 | fwrite(as.data.frame(cnv_data), paste0('~/TCGA_data/', ct, '/CNV_GISTIC_data.csv'), row.names = TRUE)
77 |
78 | }
79 |
80 |
81 |
82 |
83 |
84 | # Mutations annotations files were downloaded from GDC Data Portal, filtering by MAF, open access, Masked Somatic Mutation, MuTect2.
85 |
86 | # These files have full sample IDs, but since they don't match the sample IDs in the expression data, I'll extract just the patient IDs.
87 |
88 | setkey(hgnc_complete_set, entrez_id)
89 |
90 | for(ct in cancer_types) {
91 | cat(ct, '\n')
92 | cont <- dir(paste0('~/TCGA_data/mut_data/', ct))
93 | mut_data <- fread(paste0('~/TCGA_data/mut_data/', ct, '/', cont[grep(ct, cont)]))[,
94 | c('Entrez_Gene_Id', 'patient_id', 'sample_type') := .(
95 | as.character(Entrez_Gene_Id),
96 | apply(str_split_fixed(Tumor_Sample_Barcode, '-', 4)[, 1:3], 1, paste, collapse = '.'),
97 | mapvalues(
98 | gsub('[A-Z]', '', str_split_fixed(Tumor_Sample_Barcode, '-', 5)[, 4]),
99 | c('01', '02', '03', '05', '06', '07', '11'),
100 | c('primary', 'recurrent', 'primary', 'primary_additional', 'metastatic', 'metastatic_additional', 'normal'),
101 | warn_missing = FALSE
102 | )
103 | )
104 | ][,
105 | gene := hgnc_complete_set[Entrez_Gene_Id, symbol]
106 | ][
107 | !is.na(gene),
108 | .(gene, patient_id, sample_type, Chromosome, Start_Position, End_Position, Strand, Variant_Classification, Variant_Type,
109 | Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2, Mutation_Status)
110 | ]
111 | fwrite(mut_data, paste0('~/TCGA_data/', ct, '/Mutation_data.csv'))
112 | }
113 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiroshlab/3ca/21a784103528cdf15008761f9669e2025159d6d2/ITH_hallmarks/TCGA_analysis/study_contribution_per_MP.RDS
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/study_tcga_map.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 |
4 | study_tcga_map <- data.table()[,
5 | c('study', 'cancer_type') := rbindlist(.(
6 | .('ALL_Ebinger_2016', NA),
7 | .('AML_Galen_2019', 'LAML'),
8 | .('AML_Wu_2020', 'LAML'),
9 | .('AUTONOMIC_GANGLIA_Kinker_2020', NA),
10 | .('BILIARY_TRACT_Kinker_2020', 'LIHC'),
11 | .('BONE_Kinker_2020', NA),
12 | .('Breast_Chung_2017', 'BRCA'),
13 | .('Breast_Gao_2021', 'BRCA'),
14 | .('Breast_Jordan_2016', 'BRCA'),
15 | .('Breast_Karaayvas_2018', 'BRCA'),
16 | .('BREAST_Kinker_2020', 'BRCA'),
17 | .('Breast_Pal_2021', 'BRCA'),
18 | .('Breast_Qian_2020', 'BRCA'),
19 | .('CENTRAL_NERVOUS_SYSTEM_Kinker_2020', NA),
20 | .('CML_Giustacchini_2017', NA),
21 | .('Colon_Li_2017', 'COAD'),
22 | .('Colon_Li_2017', 'READ'),
23 | .('CRC_Lee_2020', 'COAD'),
24 | .('CRC_Lee_2020', 'READ'),
25 | .('ENDOMETRIUM_Kinker_2020', 'UCEC'),
26 | .('Ependymoma_Gojo_2020', NA),
27 | .('ESCC_Yao_2020', 'ESCA_ESCC'),
28 | .('Ewing_CellLines_Aynaud_2020', NA),
29 | .('Ewing_PDX_Aynaud_2020', NA),
30 | .('GBM_cell_lines', 'GBM_IDH-WT'),
31 | .('GBM_mouse_Unpublished', 'GBM_IDH-WT'),
32 | .('Glioblastoma_Couturier_2020', 'GBM_IDH-WT'),
33 | .('Glioblastoma_Darmanis_2017', 'GBM_IDH-WT'),
34 | .('Glioblastoma_Wang_2019', 'GBM_IDH-WT'),
35 | .('Glioma_Filbin_2018', NA), # H3-K27M mutant - a kind of HGG that's not GBM
36 | .('Glioma_Neftel_2019', 'GBM_IDH-WT'),
37 | .('Glioma_Tirosh_2016', 'LGG_oligo'),
38 | .('Glioma_Venteicher_2017', 'LGG_astro'),
39 | .('Glioma_Yuan_2018', 'GBM_IDH-WT'), # 7 GBMs and 1 grade III astrocytoma (TCGA LGG does include plenty of Grade 3 LGGs)
40 | .('Glioma_Yuan_2018', 'LGG_astro'),
41 | .('HCC_Sun_2021', 'LIHC'),
42 | .('HGSOC_Izar_2020', 'OV'),
43 | .('HNSCC_Puram_2017', 'HNSC'),
44 | .('IPMN_Franses_2017', NA), # CTCs
45 | .('KIDNEY_Kinker_2020', 'KICH'),
46 | .('KIDNEY_Kinker_2020', 'KIRC'),
47 | .('KIDNEY_Kinker_2020', 'KIRP'),
48 | .('LARGE_INTESTINE_Kinker_2020', 'COAD'),
49 | .('LARGE_INTESTINE_Kinker_2020', 'READ'),
50 | .('Leukemia_Caron_2020', NA), # ALL
51 | .('LIVER_Kinker_2020', 'LIHC'),
52 | .('Liver_Ma_2019', 'LIHC'),
53 | .('Liver_Ma_2019', 'CHOL'),
54 | .('LiverMets_Massalha_2020', NA), # these are either CRC metastases to the liver or intra-hepatic cholangiocarcinoma
55 | .('LSCC_Unpublished', 'HNSC'),
56 | .('Lung_Kim_2020', 'LUAD'),
57 | .('LUNG_Kinker_2020', 'LUAD'),
58 | .('LUNG_Kinker_2020', 'LUSC'),
59 | .('Lung_Laughney_2020', 'LUAD'),
60 | .('Lung_Qian_2020', 'LUAD'),
61 | .('Lung_Qian_2020', 'LUSC'), # This study contains about half LUAD and half LUSC
62 | .('Lung_Xing_2021', 'LUAD'),
63 | .('lungcancer_Antonella_2020', 'LUAD'),
64 | .('lungcancer_Maynard_2020', 'LUAD'),
65 | .('lungcancer_Maynard_2020', 'LUSC'), # 45 LUAD and 1 LUSC
66 | .('Medulloblastoma_Hovestadt_2019', NA),
67 | .('Melanoma_JerbyArnon_2018', 'SKCM_metastatic'),
68 | .('Melanoma_JerbyArnon_2018', 'SKCM_primary'),
69 | .('Melanoma_Tirosh_2016', 'SKCM_metastatic'),
70 | .('Melanoma_Tirosh_2016', 'SKCM_primary'),
71 | .('MM_Cohen_2021', NA),
72 | .('MM_Ledergor_2018', NA),
73 | .('MM_Liu_2021', NA),
74 | .('Myeloproliferative_Nam_2019', NA),
75 | .('NasopharyngealCarcinoma_Chen_2020', 'HNSC'),
76 | .('NET_Unpublished', NA),
77 | .('Neuroblastoma_CellLine_Jansky_2021', NA),
78 | .('Neuroblastoma_Dong_2020', NA),
79 | .('Neuroblastoma_Jansky_2021', NA),
80 | .('Neuroblastoma_Kildisiute_2021', NA),
81 | .('Neuroblastoma_Mercatelli_2021', NA),
82 | .('OESOPHAGUS_Kinker_2020', 'ESCA_AC'),
83 | .('OESOPHAGUS_Kinker_2020', 'ESCA_ESCC'),
84 | .('OPSCC_Unpublished', 'HNSC'),
85 | .('Osteosarcoma_Zhou_2020', NA),
86 | .('Ovarian_Nath_2021', 'OV'),
87 | .('Ovarian_Olalekan_2021', 'OV'),
88 | .('Ovarian_Qian_2020', 'OV'),
89 | .('OVARY_Kinker_2020', 'OV'),
90 | .('PANCREAS_Kinker_2020', 'PAAD'),
91 | .('PDAC_Franses_2020', NA), # CTCs
92 | .('PDAC_Ligorio_2019', 'PAAD'),
93 | .('PDAC_Lin_2020', 'PAAD'),
94 | .('PDAC_Moncada_2020', 'PAAD'),
95 | .('PDAC_Peng_2019', 'PAAD'),
96 | .('PDAC_Steele_2020', 'PAAD'),
97 | .('PDAC_Yu_2012', 'PAAD'),
98 | .('PDAC_Yue_2020', 'PAAD'),
99 | .('Pituitary_Cui_2021', NA),
100 | .('PLEURA_Kinker_2020', NA),
101 | .('Prostate_Chen_2021', 'PRAD'),
102 | .('Prostate_Dong_2020', 'PRAD'),
103 | .('PROSTATE_Kinker_2020', 'PRAD'),
104 | .('Prostate_Miyamoto_2015', 'PRAD'),
105 | .('RCC_Bi_2021', 'KIRC'), # 7 KIRC and 1 KIRP
106 | .('RCC_Bi_2021', 'KIRP'),
107 | .('RCC_Obradovic_2021', 'KIRC'),
108 | .('RCC_Zhang_2021', 'KICH'), # 7 KIRC and 1 KICH
109 | .('RCC_Zhang_2021', 'KIRC'),
110 | .('SCHW_Unpublished', NA),
111 | .('SCLC_HumanMet_Ireland_2020', NA),
112 | .('SCLC_Mouse_Ireland_2020', NA),
113 | .('SKIN_Kinker_2020', 'SKCM_metastatic'),
114 | .('SKIN_Kinker_2020', 'SKCM_primary'),
115 | .('SkinSCC_Ji_2020', 'SKCM_primary'),
116 | .('SOFT_TISSUE_Kinker_2020', 'SARC'),
117 | .('STOMACH_Kinker_2020', 'STAD'),
118 | .('SyS_JerbyArnon_2021', 'SARC'),
119 | .('Thyroid_Gao_2021', 'THCA'),
120 | .('THYROID_Kinker_2020', 'THCA'),
121 | .('UPPER_AERODIGESTIVE_TRACT_Kinker_2020', 'HNSC'),
122 | .('URINARY_TRACT_Kinker_2020', 'BLCA'),
123 | .('WilmsTumor_Young_2018', NA)
124 | ))
125 | ]
126 |
127 | fwrite(study_tcga_map, '../data/study_tcga_map.csv')
128 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(ggplot2)
4 | library(stringdist)
5 | library(stringr)
6 | library(survival)
7 | library(readxl)
8 | library(plyr)
9 | library(matkot)
10 |
11 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
12 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
13 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
14 |
15 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
16 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib))
17 | names(study_contrib)[41] <- 'MP41 Unassigned'
18 |
19 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'cancer_type')
20 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
21 |
22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052):
23 | liu <- as.data.table(
24 | read_xlsx(
25 | '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx',
26 | sheet = 1,
27 | na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A')
28 | )
29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI,
30 | stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))]
31 | setkey(liu, patient_id)
32 |
33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id,
34 | cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success),
35 | ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)]
36 |
37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE)
38 |
39 | meta_all <- lapply(
40 | cancer_types,
41 | function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)]
42 | ) %>% rbindlist %>% unique
43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]]
44 | setkey(meta_all, patient_id)
45 | clin <- clin[patient_id %in% meta_all$patient_id]
46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC'
47 |
48 | clin_test <- lapply(cancer_types, function(ct) {
49 |
50 | cat(ct, '\n')
51 |
52 | if('Scores.csv' %in% dir(paste0('~/TCGA_data/', ct))) {
53 | scores <- fread(paste0('~/TCGA_data/', ct, '/Scores.csv'), key = 'sample_id')
54 | } else return(NULL)
55 |
56 | # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types:
57 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type', na.strings = '')[
58 | sample_type != 'normal',
59 | .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1],
60 | keyby = patient_id
61 | ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)]
62 | setkey(meta, sample_id)
63 |
64 | # Subset scores table with the above IDs:
65 | scores <- scores[sample_id %in% meta$sample_id]
66 | scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))]
67 |
68 | # Subset clinical data table with the above IDs:
69 | clin_ct <- clin[patient_id %in% scores$patient_id]
70 |
71 | # Define variable categories:
72 |
73 | # M stage:
74 | clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))]
75 |
76 | # Therapy resistance:
77 | for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))]
78 | clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA},
79 | by = patient_id]
80 | clin_ct[, ther := 'a']
81 | clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id]
82 | clin_ct[, c('ther1', 'ther2', 'ther3') := NULL]
83 |
84 | # Lymphatic spread:
85 | clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')]
86 | clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))]
87 | clin_ct[, ln := 'a']
88 | clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id]
89 | clin_ct[, c('ln_n', 'n_stage') := NULL]
90 |
91 | # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10:
92 |
93 | ints <- list()
94 |
95 | # Grade:
96 | clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'),
97 | warn_missing = FALSE)]
98 | ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade]
99 |
100 | # Stage:
101 | clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'),
102 | c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)]
103 | ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage]
104 |
105 | # T stage:
106 | clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'),
107 | warn_missing = FALSE)]
108 | ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage]
109 |
110 | for(v in names(ints)) {
111 | if(length(ints[[v]]) > 0) {
112 | sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)]
113 | if(!any(sums[names(sums) != 'High'] >= 10)) {
114 | clin_ct[, (v) := NA]
115 | } else {
116 | clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low']
117 | clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High']
118 | }
119 | }
120 | }
121 |
122 | scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])]
123 | scores[, cancer_type := ct]
124 | setcolorder(scores, c('patient_id', 'cancer_type'))
125 |
126 | surv <- lapply(unique(scores$meta_program), function(mp) {
127 | lapply(c('OS', 'PFI'), function(v) {
128 | sdata <- scores[meta_program == mp & !is.na(get(v))]
129 | if(nrow(sdata) < 20) return(NULL)
130 | coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata)
131 | data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients),
132 | pval = summary(coxmod)$logtest['pvalue'])
133 | }) %>% rbindlist
134 | }) %>% rbindlist
135 |
136 | categ <- lapply(unique(scores$meta_program), function(mp) {
137 | lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) {
138 | sdata <- scores[meta_program == mp & !is.na(get(v))]
139 | if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL)
140 | data.table(cancer_type = ct, var_name = v, meta_program = mp,
141 | eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]],
142 | pval = t.test(score ~ get(v), sdata)$p.value)
143 | }) %>% rbindlist
144 | }) %>% rbindlist
145 |
146 | return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct))
147 |
148 | })
149 |
150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res'))
151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin'))
152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores'))
153 |
154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
155 | setkey(clin_test_res, cancer_type, meta_program)
156 |
157 | fwrite(clin_test_res, '../data/clin_test_res.csv')
158 | fwrite(clin_test_data, '../data/clin_test_data.csv')
159 | fwrite(clin_test_scores, '../data/clin_test_scores.csv')
160 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical_deconv.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(ggplot2)
4 | library(stringdist)
5 | library(stringr)
6 | library(survival)
7 | library(readxl)
8 | library(plyr)
9 | library(matkot)
10 |
11 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD',
12 | 'THCA', 'UCEC')
13 |
14 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
15 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib))
16 | names(study_contrib)[41] <- 'MP41 Unassigned'
17 |
18 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
19 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
20 | setkey(study_tcga_map, cancer_type)
21 |
22 | # Data from Liu et al. 2018 (https://doi.org/10.1016/j.cell.2018.02.052):
23 | liu <- as.data.table(
24 | read_xlsx(
25 | '~/TCGA_data/Liu_1-s2.0-S0092867418302290-mmc1.xlsx',
26 | sheet = 1,
27 | na = c('[Unknown]', '[Not Evaluated]', '[Not Applicable]', '[Not Available]', '[Discrepancy]', '#N/A')
28 | )
29 | )[, .(patient_id = gsub('-', '\\.', bcr_patient_barcode), cancer_type = type, OS = OS.time, OSc = OS, PFI = PFI.time, PFIc = PFI,
30 | stage = ajcc_pathologic_tumor_stage, grade = histological_grade, ther1 = str_to_title(treatment_outcome_first_course))]
31 | setkey(liu, patient_id)
32 |
33 | firehose <- fread('~/TCGA_data/clinical.csv', na.strings = '', key = 'patient_id')[, .(patient_id = patient_id,
34 | cancer_type = cancer_type, ln_n = as.character(number_of_lymphnodes_positive_by_he), ther2 = str_to_title(primary_therapy_outcome_success),
35 | ther3 = str_to_title(followup_treatment_success), t_stage = pathologic_t, n_stage = pathologic_n, m_stage = pathologic_m)]
36 |
37 | clin <- merge(liu, firehose, by = c('patient_id', 'cancer_type'), all = TRUE)
38 |
39 | meta_all <- lapply(
40 | cancer_types,
41 | function(ct) fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[, .(patient_id = patient_id, cancer_type = ct)]
42 | ) %>% rbindlist %>% unique
43 | meta_all <- meta_all[patient_id %in% names(table(patient_id))[table(patient_id) == 1]]
44 | setkey(meta_all, patient_id)
45 | clin <- clin[patient_id %in% meta_all$patient_id]
46 | clin[, cancer_type := do.call(`[`, list(meta_all, patient_id))$cancer_type] # Convert from TCGA label where relevant, e.g. 'ESCA' --> 'ESCA_AC'
47 |
48 | clin_test <- lapply(cancer_types, function(ct) {
49 |
50 | cat(ct, '\n')
51 |
52 | if('Scores_deconv.csv' %in% dir(paste0('~/TCGA_data/', ct))) {
53 | scores <- fread(paste0('~/TCGA_data/', ct, '/Scores_deconv.csv'), key = 'sample_id')
54 | } else return(NULL)
55 |
56 | # Read in tumour metadata file and extract one sample ID per patient ID, according to order of priority of sample types:
57 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'), key = 'sample_type')[
58 | sample_type != 'normal',
59 | .SD[c('primary', 'primary_additional', 'recurrent', 'metastatic', 'metastatic_additional')][!is.na(sample_id)][1],
60 | keyby = patient_id
61 | ][clin[cancer_type == ct, patient_id]][!is.na(sample_id)]
62 | setkey(meta, sample_id)
63 |
64 | # Subset scores table with the above IDs:
65 | scores <- scores[sample_id %in% meta$sample_id]
66 | scores <- scores[, c(do.call(`[`, list(meta, sample_id))[, .(patient_id, purity)], .(meta_program = meta_program, score = score))]
67 |
68 | # Subset clinical data table with the above IDs:
69 | clin_ct <- clin[patient_id %in% scores$patient_id]
70 |
71 | # Define variable categories:
72 |
73 | # M stage:
74 | clin_ct[, m_stage := ifelse(grepl('m0', m_stage), 'Low', ifelse(grepl('m1', m_stage), 'High', NA))]
75 |
76 | # Therapy resistance:
77 | for(v in c('ther1', 'ther2', 'ther3')) clin_ct[, (v) := ifelse(grepl('^C|^No ', get(v)), 'Low', ifelse(is.na(get(v)), NA, 'High'))]
78 | clin_ct[, ther1 := {if(is.na(ther1)) {if(is.na(ther2)) NA else ther2} else if(is.na(ther2)) ther1 else if(ther1 == ther2) ther1 else NA},
79 | by = patient_id]
80 | clin_ct[, ther := 'a']
81 | clin_ct[, ther := if(any(c(ther1, ther3) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ther1, ther3)))) NA else 'Low', by = patient_id]
82 | clin_ct[, c('ther1', 'ther2', 'ther3') := NULL]
83 |
84 | # Lymphatic spread:
85 | clin_ct[!is.na(ln_n), ln_n := ifelse(ln_n == '0', 'Low', 'High')]
86 | clin_ct[, n_stage := ifelse(grepl('n0', n_stage), 'Low', ifelse(grepl('n1|n2|n3', n_stage), 'High', NA))]
87 | clin_ct[, ln := 'a']
88 | clin_ct[, ln := if(any(c(ln_n, n_stage) == 'High', na.rm = TRUE)) 'High' else if(all(is.na(c(ln_n, n_stage)))) NA else 'Low', by = patient_id]
89 | clin_ct[, c('ln_n', 'n_stage') := NULL]
90 |
91 | # In cases where we have intermediate levels, define boundary as wherever the cumulative sum of sample numbers passes 10:
92 |
93 | ints <- list()
94 |
95 | # Grade:
96 | clin_ct[, grade := mapvalues(grade, c('G1', 'G4', 'GX', 'GB', 'High Grade', 'Low Grade'), c('Low', 'High', NA, 'Low', 'High', 'Low'),
97 | warn_missing = FALSE)]
98 | ints$grade <- c('G2', 'G3')[c('G2', 'G3') %in% clin_ct$grade]
99 |
100 | # Stage:
101 | clin_ct[, stage := mapvalues(gsub('[ABC]$', '', stage), c('Stage 0', 'Stage I', 'Stage IV', 'IS', 'I/II NOS', 'Stage X'),
102 | c('Low', 'Low', 'High', 'Low', NA, NA), warn_missing = FALSE)]
103 | ints$stage <- c('Stage II', 'Stage III')[c('Stage II', 'Stage III') %in% clin_ct$stage]
104 |
105 | # T stage:
106 | clin_ct[, t_stage := mapvalues(str_extract(t_stage, 't[0-4]|tis'), c('tis', 't0', 't1', 't4'), c('Low', 'Low', 'Low', 'High'),
107 | warn_missing = FALSE)]
108 | ints$t_stage <- c('t2', 't3')[c('t2', 't3') %in% clin_ct$t_stage]
109 |
110 | for(v in names(ints)) {
111 | if(length(ints[[v]]) > 0) {
112 | sums <- clin_ct[!is.na(get(v)), .(N = .N), keyby = .(kv = get(v))][c('Low', ints[[v]], 'High')][!is.na(N), setNames(cumsum(N), kv)]
113 | if(!any(sums[names(sums) != 'High'] >= 10)) {
114 | clin_ct[, (v) := NA]
115 | } else {
116 | clin_ct[get(v) %in% names(sums)[1:min(which(sums >= 10))], (v) := 'Low']
117 | clin_ct[get(v) %in% names(sums)[(min(which(sums >= 10)) + 1):length(sums)], (v) := 'High']
118 | }
119 | }
120 | }
121 |
122 | scores <- scores[, cbind(.SD, do.call(`[`, list(clin_ct, patient_id))[, -c('patient_id', 'cancer_type')])]
123 | scores[, cancer_type := ct]
124 | setcolorder(scores, c('patient_id', 'cancer_type'))
125 |
126 | surv <- lapply(unique(scores$meta_program), function(mp) {
127 | lapply(c('OS', 'PFI'), function(v) {
128 | sdata <- scores[meta_program == mp & !is.na(get(v))]
129 | if(nrow(sdata) < 20) return(NULL)
130 | coxmod <- coxph(Surv(get(v), get(paste0(v, 'c'))) ~ score, data = sdata)
131 | data.table(cancer_type = ct, var_name = v, meta_program = mp, eff = coxmod$coefficients,# hr = exp(coxmod$coefficients),
132 | pval = summary(coxmod)$logtest['pvalue'])
133 | }) %>% rbindlist
134 | }) %>% rbindlist
135 |
136 | categ <- lapply(unique(scores$meta_program), function(mp) {
137 | lapply(c('stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), function(v) {
138 | sdata <- scores[meta_program == mp & !is.na(get(v))]
139 | if(nrow(sdata) < 20) return(NULL) else if(sdata[, sum(get(v) == 'Low') < 10 | sum(get(v) == 'High') < 10]) return(NULL)
140 | data.table(cancer_type = ct, var_name = v, meta_program = mp,
141 | eff = sdata[, .SD[get(v) == 'High', mean(score)] - .SD[get(v) == 'Low', mean(score)]],
142 | pval = t.test(score ~ get(v), sdata)$p.value)
143 | }) %>% rbindlist
144 | }) %>% rbindlist
145 |
146 | return(list(res = rbind(surv, categ), scores = scores, clin = clin_ct))
147 |
148 | })
149 |
150 | clin_test_res <- rbindlist(lapply(clin_test, `[[`, 'res'))
151 | clin_test_data <- rbindlist(lapply(clin_test, `[[`, 'clin'))
152 | clin_test_scores <- rbindlist(lapply(clin_test, `[[`, 'scores'))
153 |
154 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
155 | setkey(clin_test_res, cancer_type, meta_program)
156 |
157 | fwrite(clin_test_res, '../data/clin_test_res_deconv.csv')
158 | fwrite(clin_test_data, '../data/clin_test_data_deconv.csv')
159 | fwrite(clin_test_scores, '../data/clin_test_scores_deconv.csv')
160 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_clinical_meta.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(ggplot2)
4 | library(stringr)
5 | library(randomcoloR)
6 | library(plyr)
7 | library(scales)
8 | library(latex2exp)
9 | library(RColorBrewer)
10 | library(cowplot)
11 | library(ggrepel)
12 | library(matkot)
13 |
14 |
15 |
16 |
17 |
18 | # Combine results with and without deconvolution into the same heatmaps:
19 |
20 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')])
21 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
22 |
23 | setkey(clin_test_res, cancer_type, meta_program)
24 | cancer_types <- sort(unique(clin_test_res$cancer_type))
25 | mps <- unique(clin_test_res$meta_program)
26 |
27 | pdata <- slapply(
28 | c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'),
29 | function(v) {
30 | htmp_data <- clin_test_res[var_name == v][CJ(cancer_types, mps), -c('var_name', 'pval')]
31 | htmp_data[is.na(eff), eff := 0]
32 | htmp_data[is.na(pval_adj), pval_adj := 1]
33 | clust_data <- dcast(htmp_data[, -'pval_adj'], meta_program ~ cancer_type)[,
34 | set_rownames(as.matrix(.SD), meta_program),
35 | .SDcols = -'meta_program'
36 | ]
37 | clust_data <- clust_data[apply(clust_data, 1, function(x) !all(x == 0)), apply(clust_data, 2, function(x) !all(x == 0))]
38 | clust_x <- colSums(abs(clust_data))
39 | clust_x <- data.table(ctd = names(clust_x), ct = gsub(' \\(d\\)', '', names(clust_x)), vald = clust_x)[, val := mean(vald), by = ct]
40 | clust_x_temp <- clust_x[, unique(.SD)[, .(cancer_type = ct, r = order(order(-val))/.N)], .SDcols = c('ct', 'val')]
41 | setkey(clust_x_temp, cancer_type)
42 | clust_x[, r := clust_x_temp[ct, r]]
43 | clust_y <- list(labels = rownames(clust_data), order = order(-rowSums(abs(clust_data))))
44 | htmp_data <- htmp_data[cancer_type %in% colnames(clust_data) & meta_program %in% rownames(clust_data)]
45 | return(list(data = htmp_data, clust_x = clust_x[, .(ctd, ct, r)], clust_y = clust_y))
46 | }
47 | )
48 |
49 | mps_data <- rbindlist(lapply(names(pdata), function(v) with(pdata[[v]]$clust_y, data.table(v = v, mp = labels, r = order(order)/length(order)))))
50 | setkey(mps_data, v, mp)
51 | mps_data <- mps_data[CJ(c('OS', 'PFI', 'stage', 'grade', 't_stage', 'm_stage', 'ln', 'ther'), mps)]
52 | mps_data[, rmean := mean(r[!is.na(r)]), by = mp]
53 | mps_ord <- unique(mps_data[, .(mp, rmean)])[is.finite(rmean), mp[order(rmean)]]
54 |
55 | cts_data <- rbindlist(lapply(pdata, `[[`, 'clust_x'))
56 | cts_data_temp <- cts_data[, .(r = mean(r)), keyby = ct]
57 | cts_ord <- unique(cts_data[, .(ctd = ctd, ct = ct, r = do.call(`[`, list(cts_data_temp, ct))$r)])[order(ctd), ctd[order(r)]]
58 |
59 | # Overall survival:
60 | os_data <- copy(pdata$OS)$data
61 | os_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
62 | os_data[, signif := ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), '')]
63 | htmp_os <- ggplot(os_data, aes(x = cancer_type, y = meta_program, fill = exp(eff))) +
64 | geom_tile(colour = 'grey90') +
65 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
66 | scale_x_discrete(expand = c(0, 0)) +
67 | scale_y_discrete(expand = c(0, 0)) +
68 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(0.5, 2.5),
69 | values = rescale(c(0, 1, 2, 3, 4, 7, 10, 13, 16), to = c(0, 1)), oob = squish, na.value = '#F7F7F7', name = 'Hazard\nratio',
70 | guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
71 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
72 | legend.title = element_text(size = 13)) +
73 | labs(title = 'Overall survival', x = 'Cancer type', y = NULL)
74 |
75 | # LN mets and therapy resistance:
76 | ln_data <- copy(pdata$ln)$data
77 | setkey(ln_data, cancer_type, meta_program)
78 | ther_data <- copy(pdata$ther)$data
79 | setkey(ther_data, cancer_type, meta_program)
80 | mps_ln_ther <- union(ln_data$meta_program, ther_data$meta_program)
81 | ln_data <- ln_data[CJ(unique(cancer_type), mps_ln_ther)]
82 | ln_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
83 | ln_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))]
84 | ther_data <- ther_data[CJ(unique(cancer_type), mps_ln_ther)]
85 | ther_data[, c('cancer_type', 'meta_program') := .(factor(cancer_type, levels = cts_ord), factor(meta_program, levels = mps_ord))]
86 | ther_data[, signif := ifelse(is.na(pval_adj), '', ifelse(pval_adj < 0.05, ifelse(pval_adj < 0.01, ifelse(pval_adj < 0.001, '***', '**'), '*'), ''))]
87 | htmp_ln <- ggplot(ln_data, aes(x = cancer_type, y = meta_program, fill = eff)) +
88 | geom_tile(colour = 'grey90') +
89 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
90 | scale_x_discrete(expand = c(0, 0)) +
91 | scale_y_discrete(expand = c(0, 0)) +
92 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish,
93 | na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
94 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
95 | legend.title = element_text(size = 13)) +
96 | labs(title = 'Lymph node metastasis', x = 'Cancer type', y = NULL)
97 | htmp_ther <- ggplot(ther_data, aes(x = cancer_type, y = meta_program, fill = eff)) +
98 | geom_tile(colour = 'grey90') +
99 | geom_text(aes(label = signif), hjust = 0.5, vjust = 0.5, size = 3) +
100 | scale_x_discrete(expand = c(0, 0)) +
101 | scale_y_discrete(expand = c(0, 0)) +
102 | scale_fill_gradientn(colours = rev(brewer.pal(9, "PiYG")), limits = c(-0.5, 0.5), breaks = c('-0.5' = -0.5, '0' = 0, '0.5' = 0.5), oob = squish,
103 | na.value = '#F7F7F7', name = expression(Delta*'(score)'), guide = guide_colourbar(frame.colour = 'black', ticks.colour = 'black')) +
104 | theme(panel.border = element_rect(size = 0.5, fill = NA), axis.text.x = element_text(angle = 55, hjust = 1),
105 | legend.title = element_text(size = 13)) +
106 | labs(title = 'Therapy resistance', x = 'Cancer type', y = NULL)
107 |
108 | # Make grobs and fix widths and heights:
109 | htmp_os_grob <- ggplotGrob(htmp_os)
110 | htmp_os_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*os_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm')
111 | htmp_os_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*os_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
112 | htmp_ln_grob <- ggplotGrob(htmp_ln + theme(legend.position = 'none'))
113 | htmp_ln_grob$widths <- unit(c(0.3, 0, 0, 4.5, 0.5*ln_data[, length(unique(cancer_type))], 0, 0, 0, 0.3), 'cm')
114 | htmp_ln_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ln_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
115 | htmp_ther_grob <- ggplotGrob(htmp_ther + theme(axis.text.y = element_blank()))
116 | htmp_ther_grob$widths <- unit(c(0.3, 0, 0, 0, 0.5*ther_data[, length(unique(cancer_type))], 0, 0, 0.5, 2, 0, 0.3), 'cm')
117 | htmp_ther_grob$heights <- unit(c(0.3, 0, 0.8, 0, 0, 0, 0.4*ther_data[, length(unique(meta_program))], 2.5, 0.6, 0, 0, 0.3), 'cm')
118 |
119 | pdf('../data/clin_htmps_sub.pdf', width = sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths))/2.54,
120 | height = sum(c(htmp_os_grob$heights, htmp_ln_grob$heights))/2.54)
121 | plot_grid(
122 | plot_grid(
123 | htmp_os_grob,
124 | ggplot() + theme_void(),
125 | nrow = 1,
126 | ncol = 2,
127 | rel_widths = c(
128 | as.numeric(sum(htmp_os_grob$widths)),
129 | sum(c(htmp_ln_grob$widths, htmp_ther_grob$widths)) - as.numeric(sum(htmp_os_grob$widths))
130 | )
131 | ),
132 | plot_grid(
133 | htmp_ln_grob,
134 | htmp_ther_grob,
135 | nrow = 1,
136 | ncol = 2,
137 | rel_widths = c(sum(htmp_ln_grob$widths), sum(htmp_ther_grob$widths))
138 | ),
139 | nrow = 2,
140 | ncol = 1,
141 | rel_heights = c(sum(htmp_os_grob$heights), sum(htmp_ln_grob$heights))
142 | )
143 | dev.off()
144 |
145 |
146 |
147 |
148 |
149 | # Summary showing pan-cancer consistency of most common MPs:
150 |
151 | clin_test_res <- rbind(fread('../data/clin_test_res.csv'), fread('../data/clin_test_res_deconv.csv')[, cancer_type := paste(cancer_type, '(d)')])
152 | clin_test_res[, pval_adj := p.adjust(pval, method = 'BH')]
153 |
154 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
155 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib))
156 | names(study_contrib)[41] <- 'MP41 Unassigned'
157 |
158 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '', key = 'study')
159 |
160 | mps_tab <- lapply(
161 | names(study_contrib),
162 | function(mp_name) study_tcga_map[study_contrib[[mp_name]], .(meta_program = mp_name, study = study, cancer_type = cancer_type)]
163 | ) %>% rbindlist
164 |
165 | common_mps <- mps_tab[, .(n_study = .N, n_ct = length(unique(cancer_type[!is.na(cancer_type)]))), by = meta_program][n_ct >= 5 & n_study >= 10]
166 | common_mps <- common_mps$meta_program
167 |
168 | pdata_scatter <- clin_test_res[meta_program %in% common_mps, .(s = sum(sign(eff)), s_sig = sum(sign(eff[pval_adj < 0.05]))), by = .(meta_program)]
169 |
170 | set.seed(8331)
171 | pdf('../data/clin_summary_scatter.pdf', width = 14/2.54, height = 11.5/2.54)
172 | ggplot(pdata_scatter, aes(x = s_sig, y = s)) +
173 | theme_test() +
174 | geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') +
175 | geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5, colour = 'lightgrey') +
176 | geom_point(shape = 21, fill = 'tomato', stroke = 0.3, size = 3) +
177 | geom_text_repel(data = pdata_scatter[grep('Cell Cycle|Stress \\(in vitro\\)|Proteasomal|Hypoxia', meta_program)], aes(label = meta_program),
178 | colour = 'steelblue') +
179 | geom_text_repel(data = pdata_scatter[grep('Stress$|PDAC', meta_program)], aes(label = meta_program),
180 | colour = 'steelblue', nudge_x = 14, nudge_y = 8) +
181 | geom_text_repel(data = pdata_scatter[grep('MHC-II \\(I\\)', meta_program)], aes(label = meta_program),
182 | colour = 'steelblue', nudge_x = 1, nudge_y = -1) +
183 | labs(x = expression(sum()*italic(sign)*'(significant effects)'), y = expression(sum()*italic(sign)*'(all effects)'))
184 | dev.off()
185 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_mp_scores.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(stringr)
4 | library(readxl)
5 | library(matkot)
6 |
7 | source('functions.R')
8 |
9 | cancer_types <- c('ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA_AC', 'ESCA_ESCC', 'GBM_IDH-WT', 'HNSC', 'KICH', 'KIRC', 'KIRP',
10 | 'LAML', 'LGG_astro', 'LGG_IDH-WT', 'LGG_oligo', 'LIHC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC', 'SKCM_primary',
11 | 'SKCM_metastatic', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM')
12 |
13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
14 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
15 | ]
16 | alias_table <- make_alias_table(hgnc_complete_set)
17 |
18 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
19 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number
20 | mps <- slapply(mps, update_symbols_fast, alias_table)
21 |
22 | # Make sure MP names are consistent:
23 | names(mps) <- gsub(' ', ' ', names(mps))
24 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib))
25 | names(study_contrib)[41] <- 'MP41 Unassigned'
26 |
27 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
28 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
29 | setkey(study_tcga_map, cancer_type)
30 |
31 | for(ct in cancer_types) {
32 |
33 | cat(ct, '\n')
34 |
35 | mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))]
36 |
37 | expmat <- fread(paste0('~/TCGA_data/', ct, '/Exp_data_TPM.csv'))[, set_rownames(as.matrix(.SD), V1), .SDcols = -'V1']
38 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal']
39 | expmat <- expmat[, meta$sample_id]
40 |
41 | mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)])
42 |
43 | set.seed(140)
44 | scores <- lapply(
45 | names(mps_ct),
46 | function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = sig_score(expmat, mps_ct[[mp]], nbin = 50, n = 50))]
47 | ) %>% rbindlist
48 |
49 | fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores.csv'))
50 |
51 | }
52 |
--------------------------------------------------------------------------------
/ITH_hallmarks/TCGA_analysis/tcga_mp_scores_deconv.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(stringr)
4 | library(readxl)
5 | library(matkot)
6 |
7 | source('functions.R')
8 |
9 | cancer_types <- c('BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'ESCA_AC', 'ESCA_ESCC', 'HNSC', 'LUAD', 'LUSC', 'OV', 'PAAD', 'PRAD', 'READ', 'STAD',
10 | 'THCA', 'UCEC')
11 |
12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'ensembl_gene_id')[
13 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])
14 | ]
15 | alias_table <- make_alias_table(hgnc_complete_set)
16 |
17 | study_contrib <- readRDS('../data/study_contribution_per_MP.RDS')
18 | mps <- as.list(read_xlsx('../data/mps_malignant.xlsx')) # Also already ordered by MP number
19 | mps <- slapply(mps, update_symbols_fast, alias_table)
20 |
21 | # Make sure MP names are consistent:
22 | names(mps) <- gsub(' ', ' ', names(mps))
23 | names(study_contrib) <- gsub(' ', ' ', names(study_contrib))
24 | names(study_contrib)[41] <- 'MP41 Unassigned'
25 |
26 | study_tcga_map <- fread('../data/study_tcga_map.csv', na.strings = '')
27 | cancer_types <- cancer_types[cancer_types %in% study_tcga_map$cancer_type]
28 | setkey(study_tcga_map, cancer_type)
29 |
30 | for(ct in cancer_types) {
31 |
32 | cat(ct, '\n')
33 |
34 | mps_ct <- mps[sapply(study_contrib, function(x) any(x %in% study_tcga_map[ct, study]))]
35 |
36 | meta <- fread(paste0('~/TCGA_data/', ct, '/Cells.csv'))[sample_type != 'normal']
37 |
38 | if(startsWith(ct, 'ESCA')) {
39 | expmat <- fread('~/pan_cancer/data/TCGA_deconvolved_scaled/ESCA_Epithelial.cells.txt')
40 | } else expmat <- fread(paste0('~/pan_cancer/data/TCGA_deconvolved_scaled/', ct, '_Epithelial.cells.txt'))
41 | expmat <- expmat[, set_rownames(as.matrix(.SD), Gene), .SDcols = -'Gene']
42 | meta <- meta[sample_id %in% colnames(expmat)]
43 | expmat <- expmat[, meta$sample_id]
44 |
45 | mps_ct <- slapply(mps_ct, function(x) x[x %in% rownames(expmat)])
46 |
47 | set.seed(140)
48 | scores <- lapply(
49 | names(mps_ct),
50 | function(mp) meta[, .(sample_id = sample_id, meta_program = mp, score = colMeans(expmat[mps_ct[[mp]], ]))]
51 | ) %>% rbindlist
52 |
53 | fwrite(scores, paste0('~/TCGA_data/', ct, '/Scores_deconv.csv'))
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Tirosh Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 3ca
2 | This repository contains code relating to the Curated Cancer Cell Atlas (3CA), including code for reproducing analyses and figures on the [3CA website](https://www.weizmann.ac.il/sites/3CA), and code for reproducing the analysis in Gavish et al. "The hallmarks of transcriptional intra-tumor heterogeneity".
3 |
4 | ### Source code for Gavish et al.
5 | Code for reproducing the analyses in the Gavish et al. study is contained in the ITH_hallmarks directory.
6 |
7 | ### 3CA version 2
8 | The 3CA_v2 directory code for new analyses conducted on the expanded version of 3CA.
9 |
--------------------------------------------------------------------------------
/aliases_pubmed.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(stringr)
3 | library(easyPubMed)
4 |
5 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')[
6 | !(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1]) & locus_group == 'protein-coding gene'
7 | ]
8 |
9 | alias_table <- hgnc_complete_set[, .(symbol_alt = unique(c(str_split(alias_symbol, '\\|')[[1]], str_split(prev_symbol, '\\|')[[1]]))), by = symbol]
10 | alias_table <- alias_table[symbol_alt != '']
11 |
12 | # Remove elements of that also occur in :
13 | alias_table <- alias_table[!(symbol_alt %in% symbol)]
14 |
15 | # Get number of PubMed search results for each alternative symbol:
16 | alias_table[, pubmed_freq := as.numeric(get_pubmed_ids(paste0(symbol_alt, '[Text Word]'))$Count), by = symbol_alt]
17 |
18 | # Adding combined searches, i.e. check for cases where the alias and the up-to-date symbol occur together:
19 | alias_table[,
20 | pubmed_comb_freq := as.numeric(get_pubmed_ids(paste0('(', symbol_alt, '[Text Word]) AND (', symbol, '[Text Word])'))$Count),
21 | by = .(symbol, symbol_alt)
22 | ]
23 |
24 | fwrite(alias_table, '../data/aliases_pubmed.csv')
25 |
--------------------------------------------------------------------------------
/aliases_table.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(matkot)
4 |
5 | aliases_table <- fread('../data/aliases_pubmed.csv')
6 | aliases_table <- aliases_table[
7 | pubmed_freq > 1000 &
8 | pubmed_comb_freq > 100 &
9 | !(symbol_alt %in% symbol) &
10 | str_count(symbol_alt, '[a-z]') <= 1 &
11 | str_count(symbol_alt, '.') > 1 & # Could put 2 here as some 2-letter aliases map poorly, but some are important like RB -> RB1
12 | !grepl('\\.', symbol_alt)
13 | ]
14 | aliases_table <- aliases_table[!(symbol_alt %in% aliases_table[, .(l = length(unique(symbol))), by = symbol_alt][l > 1, symbol_alt])]
15 | # Try capitalising and removing dashes. If this makes any of symbol_alt the same as symbol, remove these entries. Otherwise, if it makes any
16 | # elements of symbol_alt the same, take the one with the highest pubmed_freq.
17 | aliases_table[, symbol_alt_x := gsub('-', '', toupper(symbol_alt))]
18 | aliases_table <- aliases_table[symbol_alt_x != symbol]
19 | aliases_table <- aliases_table[,
20 | .(symbol_alt = rbindlist(lapply(
21 | names(table(symbol_alt_x)),
22 | function(symb) if(table(symbol_alt_x)[symb] == 1) {
23 | return(.SD[symbol_alt_x == symb, .(symbol_alt, pubmed_freq)])
24 | } else return(.SD[symbol_alt_x == symb, .(symbol_alt = symbol_alt[which.max(pubmed_freq)], pubmed_freq = max(pubmed_freq))])
25 | ))[order(-pubmed_freq), symbol_alt]),
26 | by = symbol
27 | ]
28 |
29 | fwrite(aliases_table, '../data/alias_table.csv')
30 |
--------------------------------------------------------------------------------
/bins.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(Matrix)
4 | library(stringr)
5 | library(plyr)
6 | library(matkot)
7 |
8 | source('functions.R')
9 |
10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
11 |
12 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
13 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
14 | alias_table <- make_alias_table(hgnc_complete_set)
15 |
16 | set.seed(763)
17 | data_genes_all <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
18 | cat(r[1], '-', r[2], '\n')
19 | rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
20 | if('gene_ave.csv' %in% dir(rdir)) gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)] else return(NULL)
21 | if(nrow(gene_ave) == 0) return(NULL)
22 | gene_ave <- gene_ave[!is.na(ave), if(unique(.SD[, .(group, sample, n_cell)])[, sum(n_cell) >= 10]) .SD[,
23 | .(study = r[1], cancer_type = r[2], ave = sum(ave*n_cell)/sum(n_cell)),
24 | by = symbol
25 | ], by = .(group = cell_type)]
26 | setcolorder(gene_ave, c('study', 'cancer_type'))
27 | setkey(gene_ave, study, cancer_type, group, symbol)
28 | # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically:
29 | gene_ave[, ave_rank := .SD[sample(1:.N), order(order(ave))[order(symbol)]]/.N, .SDcols = c('symbol', 'ave'), by = group]
30 | }) %>% rbindlist
31 |
32 | # Bins for non-malignant cell types:
33 |
34 | # Get the non-malignant cell types occuring in sufficiently many datasets:
35 | cell_types_nm <- data_genes_all[
36 | !(group %in% c('', 'Unassigned', 'Malignant')) &
37 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
38 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
39 | unique(.SD),
40 | .SDcols = c('study', 'cancer_type', 'group')
41 | ][, .N, by = group][N >= 3, group]
42 |
43 | genes_nm <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')$symbol %>% unique
44 |
45 | bins_nm <- data_genes_all[
46 | symbol %in% genes_nm & group %in% cell_types_nm &
47 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
48 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
49 | .(ave = mean(ave[!is.na(ave)]), ave_rank = mean(ave_rank[!is.na(ave_rank)])),
50 | by = .(symbol, group)
51 | ][, cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)), by = group]
52 |
53 | fwrite(bins_nm, '../data/bins_nm.csv')
54 |
--------------------------------------------------------------------------------
/canonical_markers.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 |
3 | canonical_markers <- list(
4 | Adipocyte = c('ADIPOQ', 'FABP4', 'LEP'),
5 | Astrocyte = c('GFAP', 'GJA1'),
6 | B_cell = c('MS4A1', 'CD79A', 'CD79B'),
7 | Basophil = c('CCR3', 'IL3RA'),
8 | Dendritic = c('CCR7', 'CD86', 'CLEC10A'),
9 | Endothelial = c('VWF', 'CDH5', 'CLEC14A', 'CLDN5', 'ADGRL4'),
10 | Epithelial = c('EPCAM', 'KRT19', 'KRT7', 'MUC1'),
11 | Erythrocyte = c('HBA1', 'HBA2', 'HBB'),
12 | Fibroblast = c('COL1A1', 'COL1A2', 'COL3A1', 'LUM'),
13 | Immune = 'PTPRC',
14 | Keratinocyte = c('FLG', 'IVL', 'LORICRIN'),
15 | Lymphovascular = c('CCL21', 'TFF3'),
16 | Macrophage = c('AIF1', 'CD14', 'CD163'),
17 | Mast = c('CPA3', 'MS4A2', 'TPSB2'),
18 | Melanocyte = c('DCT', 'MLANA', 'PMEL', 'TYR', 'TYRP1'),
19 | Monocyte = c('CCR2', 'CSF1R'),
20 | Myocyte = c('ACTA1', 'DES', 'MYL1', 'TTN', 'NEB'),
21 | Neuron = c('ENO2', 'RBFOX3'),
22 | Neutrophil = c('AZU1', 'CTSG', 'ELANE', 'MPO'),
23 | NK_cell = c('NKG7', 'KLRD1', 'GZMB', 'KLRF1'),
24 | Oligodendrocyte = c('TF', 'PLP1', 'CLDN11', 'MAG', 'MBP', 'MOG'),
25 | OPC = c('OLIG1', 'OLIG2', 'NEU4'),
26 | Pericyte = c('RGS5', 'ESAM', 'MEF2C'),
27 | Plasma = c('JCHAIN', 'MZB1', 'TNFRSF17'),
28 | T_cell = c('CD2', 'CD3D', 'CD3E')
29 | )
30 |
31 | canonical_markers <- rbindlist(lapply(names(canonical_markers), function(ct) data.table(cell_type = ct, gene = canonical_markers[[ct]])))
32 |
33 | fwrite(canonical_markers, '../data/canonical_markers.csv')
34 |
--------------------------------------------------------------------------------
/cc_sigs_consensus.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(ggplot2)
3 | library(magrittr)
4 | library(matkot)
5 |
6 | source('functions.R')
7 |
8 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
9 |
10 | sigs <- lapply(transpose(as.list(unique(paths_table[cancer_type != 'Other/Models', .(study, cancer_type)]))), function(r) {
11 | cat(r, '\n')
12 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
13 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
14 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
15 | if(all(nullcond)) return(NULL)
16 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
17 | rout <- lapply(which(!nullcond), function(i) list(r = r, group = i, g1s = plot_data[[i]]$g1s, g2m = plot_data[[i]]$g2m))
18 | }) %>% unlist(recursive = FALSE)
19 |
20 | g1s_tab <- table(unlist(lapply(sigs, `[[`, 'g1s')))
21 | g2m_tab <- table(unlist(lapply(sigs, `[[`, 'g2m')))
22 |
23 | sigs_cons <- list(g1s = names(g1s_tab)[order(-g1s_tab)][1:50], g2m = names(g2m_tab)[order(-g2m_tab)][1:50])
24 |
25 | saveRDS(sigs_cons, '../data/cc_sigs_consensus.rds')
26 |
--------------------------------------------------------------------------------
/cna_mat_prep.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(matkot)
11 |
12 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
13 |
14 | d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
15 | if('data_cna.rds' %in% dir(d)) {
16 | cna <- readRDS(paste0(d, '/data_cna.rds')) # Genes are already in order
17 | cna_cond <- sapply(cna, function(x) is.null(x) | is.null(x$cna_data))
18 | if(!all(cna_cond)) {
19 | for(i in (1:length(cna))[!cna_cond]) { # Indices where data is not NULL
20 | out <- cna[[i]]$cna_data[, .(gene, cell_name, cna = round(cna, 4))]
21 | out <- dcast(out, cell_name ~ gene)[, c('cell_name', out[, unique(gene)]), with = FALSE]
22 | if(sum(!cna_cond) > 1) {
23 | if(!('CNA matrix' %in% dir(d))) dir.create(paste0(d, '/CNA matrix'))
24 | suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)]
25 | fwrite(out, paste0(d, '/CNA matrix/CNA_matrix_', suff, '.csv'))
26 | } else fwrite(out, paste0(d, '/CNA matrix.csv'))
27 | }
28 | } else cat('No CNA data\n')
29 | } else cat('No CNA data\n')
30 |
--------------------------------------------------------------------------------
/gene_ave.R:
--------------------------------------------------------------------------------
1 | r = commandArgs(trailingOnly = TRUE)
2 |
3 |
4 |
5 |
6 |
7 | library(data.table)
8 | library(magrittr)
9 | library(Matrix)
10 | library(stringr)
11 | library(plyr)
12 | library(matkot)
13 |
14 | source('functions.R')
15 |
16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
17 |
18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
20 | alias_table <- make_alias_table(hgnc_complete_set)
21 |
22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
23 |
24 | out <- rbindlist(lapply(paths$group, function(grp) {
25 |
26 | cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
27 |
28 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[grp, expmat], 'mtx')) {
29 | return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(), symbol = character(),
30 | ave = numeric(), prop_pos = numeric(), n_cell = integer()))
31 | }
32 |
33 | genes <- fread(paths[grp, genes], header = FALSE)$V1
34 | expmat <- readMM(paths[grp, expmat])
35 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
36 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
37 | genes <- update_symbols_fast(genes, alias_table)
38 | rownames(expmat) <- genes
39 | colnames(expmat) <- cells$cell_name
40 |
41 | # Remove low-complexity cells and normalise to log TPM/10:
42 | cells <- cells[col_nnz(expmat) >= 1000]
43 | if(nrow(cells) < 10) return(data.table(group = grp, group_name = paths[grp, group_name], cell_type = character(), sample = character(),
44 | symbol = character(), ave = numeric(), prop_pos = numeric(), n_cell = integer()))
45 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
46 |
47 | pout <- cells[
48 | !is.na(cell_type) & !is.na(sample),
49 | .(symbol = genes, ave = rowMeans(expmat[, cell_name, drop = FALSE]), prop_pos = row_nnz(expmat[, cell_name, drop = FALSE])/.N, n_cell = .N),
50 | by = .(cell_type, sample)
51 | ]
52 | pout[, c('group', 'group_name') := .(grp, paths[grp, group_name])]
53 | setcolorder(pout, c('group', 'group_name'))
54 |
55 | return(pout)
56 |
57 | }), use.names = TRUE)
58 |
59 | if(nrow(out) > 0) {
60 | ct <- gsub('/', '-', r[2])
61 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
62 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
63 | fwrite(out, paste0('../data/study_plots/', ct, '/', r[1], '/gene_ave.csv'))
64 | }
65 |
--------------------------------------------------------------------------------
/gene_mp_cor.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(Matrix)
11 | library(stringr)
12 | library(matkot)
13 |
14 | source('functions.R')
15 |
16 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
17 |
18 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
19 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
20 | alias_table <- make_alias_table(hgnc_complete_set)
21 |
22 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
23 |
24 | rdir <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
25 |
26 | if('mp_scores_all.csv' %in% dir(rdir)) {
27 | scores <- fread(paste0(rdir, '/mp_scores_all.csv'), colClasses = c(cell_name = 'character'))
28 | if(nrow(scores) == 0 | !('score' %in% names(scores))) scores <- NULL
29 | } else scores <- NULL
30 |
31 | if(!is.null(scores)) {
32 |
33 | out <- lapply(paths$group, function(grp) {
34 |
35 | cells <- suppressWarnings(fread(paths[grp, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
36 |
37 | if(
38 | scores[group == grp, .N == 0 | all(is.na(score))] |
39 | !all(c('cell_type', 'sample') %in% names(cells)) |
40 | !endsWith(paths[grp, expmat], 'mtx')
41 | ) {
42 | return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(),
43 | gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer()))
44 | }
45 |
46 | genes <- fread(paths[grp, genes], header = FALSE)$V1
47 | expmat <- readMM(paths[grp, expmat])
48 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
49 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
50 | genes <- update_symbols_fast(genes, alias_table)
51 | rownames(expmat) <- genes
52 | colnames(expmat) <- cells$cell_name
53 |
54 | # Remove low-complexity cells and normalise to log TPM/10:
55 | cells <- cells[col_nnz(expmat) >= 1000]
56 | if(nrow(cells) < 10) return(data.table(group = numeric(), group_name = character(), cell_type = character(), meta_program = character(),
57 | gene = character(), corr = numeric(), n_cell = integer(), n_sample = integer(), n_sample_thresh = integer()))
58 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
59 |
60 | setkey(cells, cell_name)
61 | scores[group == grp, c('cell_type', 'sample') := do.call(`[`, list(cells, cell_name))[, .(cell_type, sample)]]
62 | setkey(scores, sample, cell_name)
63 | # In the following, I'm using a threshold of 10, in a few places, for minimum number of cells.
64 | grp_out <- scores[group == grp][, # Chaining so we can use the key even if the same cell/sample names are present in multiple groups
65 | if(length(unique(cell_name)) >= 10) {
66 | cat(unique(cell_type), '\n')
67 | ids <- unique(.SD[, .(sample, cell_name)])
68 | bounds <- ids[, .N, by = sample][, quantile(N, c(0.75, 0.25)) + c(1.5, -1.5)*floor(IQR(N))]
69 | ids <- ids[, if(.N >= bounds[2]) {if(.N <= bounds[1]) .(cell_name) else .(cell_name = sample(cell_name, bounds[1]))}, keyby = sample]
70 | ids_n <- ids[, .N, keyby = sample]
71 | expmat_sub <- expmat[, ids$cell_name]
72 | rm_all <- rowMeans(expmat_sub)
73 | # The following binds together matrices across samples, which might lead to memory issues for large datasets. But we need it to
74 | # compute the correlation across samples.
75 | expmat_sub <- t(Reduce(cbind, lapply(ids_n$sample, function(smpl) {
76 | if(ids_n[smpl, N] < 10) {
77 | apply(expmat_sub[, ids[smpl, cell_name], drop = FALSE], 2, function(x) x - rm_all)
78 | } else {
79 | rm_smpl <- rowMeans(expmat_sub[, ids[smpl, cell_name]])
80 | apply(expmat_sub[, ids[smpl, cell_name]], 2, function(x) x - rm_smpl)
81 | }
82 | })))
83 | setkey(ids, NULL)
84 | out <- .SD[ids, .(gene = rownames(expmat), corr = cor(score, expmat_sub)[1, ]), by = meta_program]
85 | out[, c('n_cell', 'n_sample', 'n_sample_thresh') := ids_n[, .(nrow(ids), .N, sum(N >= 10))]]
86 | out
87 | },
88 | by = cell_type
89 | ]
90 |
91 | grp_out[, c('group', 'group_name') := .(grp, paths[grp, group_name])]
92 | setcolorder(grp_out, c('group', 'group_name'))
93 |
94 | return(grp_out)
95 |
96 | }) %>% rbindlist(use.names = TRUE)
97 |
98 | if(nrow(out) > 0) fwrite(out, paste0(rdir, '/gene_mp_cor.csv'))
99 |
100 | }
101 |
--------------------------------------------------------------------------------
/gene_mp_cor_all_web.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(stringr)
4 | library(Matrix)
5 | library(matkot)
6 |
7 | source('functions.R')
8 |
9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
10 |
11 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
12 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
13 |
14 | genes <- fread('../data/gene_plots_data_all_web.csv', select = 'symbol')[symbol %in% hgnc_complete_set$symbol, unique(symbol)]
15 |
16 | to_include <- unique(paths_table[
17 | cancer_type != 'Other/Models' &
18 | !grepl('Unpublished', study) &
19 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
20 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
21 | .(study, cancer_type)
22 | ])
23 |
24 | gene_mp_cor <- lapply(transpose(as.list(to_include)), function(r) {
25 |
26 | cat(r, '\n')
27 | if(!('gene_mp_cor.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
28 |
29 | rout <- fread(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_mp_cor.csv'))
30 | rout <- rout[gene %in% genes]
31 |
32 | rout[, c('study', 'cancer_type') := .(r[1], r[2])]
33 | setcolorder(rout, c('cancer_type', 'study'))
34 |
35 | }) %>% rbindlist
36 |
37 | dtkey <- unique(gene_mp_cor[, .(gene, cell_type, cancer_type, meta_program)])[order(gene, cell_type, cancer_type, meta_program)]
38 | gene_mp_cor_ave <- gene_mp_cor[
39 | !is.na(corr),
40 | if(sum(n_cell) >= 50 & sum(n_sample) >= 3) .(mean_corr = weighted.mean(corr, n_sample_thresh)),
41 | keyby = .(gene, cell_type, cancer_type, meta_program)
42 | ]
43 | gene_mp_cor_ave <- gene_mp_cor_ave[dtkey]
44 |
45 | fwrite(gene_mp_cor_ave, '../data/gene_mp_cor_all_web.csv')
46 |
47 | # Save tables for individual genes, to speed up the writing of the Rmd files:
48 | setkey(gene_mp_cor_ave, gene)
49 | for(g in sort(genes[genes %in% gene_mp_cor_ave$gene])) {
50 | cat(g, '\n')
51 | fwrite(gene_mp_cor_ave[g, -'gene'], paste0('../data/gene_plots/gene_mp_cor/', g, '.csv'))
52 | }
53 |
--------------------------------------------------------------------------------
/gene_plots_cell_types.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(ggplot2)
4 | library(stringr)
5 | library(Matrix)
6 | library(matkot)
7 |
8 | source('functions.R')
9 |
10 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
11 |
12 |
13 |
14 |
15 |
16 | # For each dataset, find cell types that constitute at least 1% of the data:
17 | cell_types_thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
18 |
19 | cat(r, '\n')
20 |
21 | cts <- lapply(paths_table[as.list(r), cells], function(p) {
22 | cells <- suppressWarnings(fread(p, na.strings = ''))
23 | if(!('cell_type' %in% names(cells))) return(NULL)
24 | cells <- cells[!is.na(cell_type) & cell_type != 'Unassigned']
25 | out <- cells[, .(N = .N/nrow(cells)), by = cell_type][N >= 0.01, cell_type]
26 | # Check if 'Epithelial' cells are definitely normal epithelial cells:
27 | if(
28 | 'Epithelial' %in% out && (
29 | ('malignant' %in% names(cells) && cells[cell_type == 'Epithelial' & malignant == 'no', .N/nrow(cells)] < 0.01) |
30 | !('Malignant' %in% cells$cell_type)
31 | )
32 | ) {out <- out[out != 'Epithelial']}
33 | if(length(out) > 0) return(out)
34 | })
35 |
36 | cts <- cts[!sapply(cts, is.null)]
37 |
38 | if(length(cts) > 0) return(Reduce(intersect, cts))
39 |
40 | })
41 |
42 | # Take cell types that pass the 1% threshold in at least 5 datasets:
43 | cell_types <- table(unlist(cell_types_thresh[!sapply(cell_types_thresh, is.null)]))
44 | cell_types <- names(cell_types)[cell_types >= 5 & names(cell_types) != '']
45 |
46 | saveRDS(cell_types, '../data/gene_plots_cell_types.rds')
47 |
--------------------------------------------------------------------------------
/gene_plots_data_all_web.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(stringr)
4 | library(Matrix)
5 | library(matkot)
6 |
7 | source('functions.R')
8 |
9 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'), encoding = 'UTF-8')
10 |
11 | cell_types <- readRDS('../data/gene_plots_cell_types.rds')
12 |
13 | hgnc_complete_set <- fread('../data/hgnc_complete_set.txt', key = 'symbol')
14 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
15 |
16 | # The following averages over studies for a given cancer type:
17 |
18 | to_include <- unique(paths_table[
19 | cancer_type != 'Other/Models' &
20 | !grepl('Unpublished', study) &
21 | !(study == 'Chen et al. 2020' & cancer_type == 'Head and Neck') &
22 | !(study == 'Sun et al. 2021' & cancer_type == 'Liver/Biliary'),
23 | .(study, cancer_type)
24 | ])
25 |
26 | gene_ave <- lapply(transpose(as.list(to_include)), function(r) {
27 |
28 | cat(r, '\n')
29 | if(!('gene_ave.csv' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
30 |
31 | rout <- fread(
32 | paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/gene_ave.csv'),
33 | colClasses = c(cell_type = 'character', symbol = 'character'),
34 | key = c('cell_type', 'symbol')
35 | )[cell_type %in% cell_types]
36 | rout[, c('study', 'cancer_type') := as.list(r)]
37 |
38 | samples_path <- paste0('/home/labs/tirosh/shared/pan_cancer_datasets/', paths_table[as.list(r), directory[1]], '/samples.csv')
39 | samples <- fread(samples_path, colClasses = c(sample = 'character'), na.strings = '')
40 | samples <- samples[!is.na(sample) & !is.na(cancer_type) & !(cancer_type %in% c('Normal', 'Premalignant'))]
41 | if(nrow(samples) == 0) return(NULL)
42 |
43 | rout <- rout[sample %in% samples$sample] # This excludes the sample == 'all' category
44 |
45 | if(all(r == c('Jerby-Arnon et al. 2021', 'Sarcoma'))) { # This dataset is unusual because it has the same sample names in 10x and SS2 datasets
46 | rout[, tech := group_name]
47 | } else {
48 | setkey(samples, sample)
49 | rout[, tech := do.call(`[`, list(samples, sample))$technology]
50 | }
51 |
52 | setcolorder(rout, c('cancer_type', 'study', 'group', 'group_name', 'tech'))
53 |
54 | return(rout)
55 |
56 | }) %>% rbindlist
57 |
58 | gene_ave[
59 | cell_type %in% c('Macrophage', 'Myeloid', 'Monocyte'),
60 | c('cell_type', 'n_cell', 'ave', 'prop_pos') := .('Macrophage', sum(n_cell), sum(ave*n_cell)/sum(n_cell), sum(prop_pos*n_cell)/sum(n_cell)),
61 | by = .(cancer_type, study, tech, sample, symbol)
62 | ]
63 | gene_ave <- unique(gene_ave)
64 |
65 | gene_ave <- gene_ave[!(cancer_type == 'Brain' & cell_type == 'Fibroblast')]
66 |
67 | gene_ave <- gene_ave[symbol %in% hgnc_complete_set$symbol]
68 |
69 |
70 |
71 |
72 |
73 | # Retain genes that have at least one value in all but at most 3 cancer types:
74 | gene_ave_study <- gene_ave[symbol %in% gene_ave[, .(n = length(unique(cancer_type))), by = symbol][n >= max(n) - 3, symbol]]
75 |
76 | gene_ave_study <- gene_ave_study[,
77 | if(sum(n_cell) >= 10) .(ave = sum(ave*n_cell)/sum(n_cell), prop_pos = sum(prop_pos*n_cell)/sum(n_cell), n_cell = sum(n_cell), n_sample = .N,
78 | n_sample_thresh = sum(n_cell >= 10)),
79 | by = .(symbol, cell_type, cancer_type, study, tech) # Mean across samples for each study, cancer type and tech (require >=10 cells in each case)
80 | ][,
81 | .(ave = mean(ave), prop_pos = mean(prop_pos), n_cell = sum(n_cell), n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)),
82 | by = .(symbol, cell_type, cancer_type, study) # Mean across datasets of the same study and cancer type but different tech
83 | ]
84 |
85 | gene_ave_all <- gene_ave_study[,
86 | .(study = 'all', ave = weighted.mean(ave, n_sample_thresh + 1), prop_pos = weighted.mean(prop_pos, n_sample_thresh + 1), n_cell = sum(n_cell),
87 | n_sample = sum(n_sample), n_sample_thresh = sum(n_sample_thresh)),
88 | by = .(symbol, cell_type, cancer_type) # Weighted mean across studies of the same disease, weighted by (number of samples with >= 10 cells) + 1
89 | ]
90 |
91 | gene_plots_data_all_web <- rbind(gene_ave_study, gene_ave_all, use.names = TRUE)
92 |
93 | fwrite(gene_plots_data_all_web, '../data/gene_plots_data_all_web.csv')
94 |
95 |
96 |
97 |
98 |
99 | unique_ct <- gene_plots_data_all_web[study == 'all', setNames(CJ(unique(cell_type), unique(cancer_type)), c('cell_type', 'cancer_type'))]
100 | unique_ct[, study := 'all']
101 | unique_study <- gene_plots_data_all_web[
102 | study != 'all',
103 | setNames(CJ(unique(cell_type), unique(paste(study, cancer_type, sep = ' - '))), c('cell_type', 'study'))
104 | ][, c('study', 'cancer_type') := as.data.table(str_split_fixed(study, ' - ', 2))]
105 | unique_dt <- rbind(unique_ct, unique_study, use.names = TRUE)
106 |
107 | # Save tables for individual genes, to speed up the writing of the Rmd files:
108 | setkey(gene_plots_data_all_web, cell_type, cancer_type, study)
109 | for(g in gene_plots_data_all_web[, sort(unique(symbol))]) {
110 | cat(g, '\n')
111 | # No gene names have underscores in them: sum(grepl('_', gene_plots_data_all_web[, sort(unique(symbol))]))
112 | # Deal with gene names with slashes in them by replacing with underscore:
113 | fwrite(
114 | gene_plots_data_all_web[symbol == g, -'symbol'][unique_dt],
115 | paste0('../data/gene_plots/gene_plots_data_all_web/', gsub('/', '_', g), '.csv')
116 | )
117 | }
118 |
--------------------------------------------------------------------------------
/gene_plots_render_web.R:
--------------------------------------------------------------------------------
1 | n = commandArgs(trailingOnly = TRUE)
2 |
3 | genes_n <- gsub('.csv$', '', dir('../data/gene_plots/gene_plots_data_all_web'))
4 | genes_n <- genes_n[cut(1:length(genes_n), 500, labels = FALSE) == n]
5 | genes_n <- genes_n[genes_n != 'backup']
6 | for(g in genes_n) {
7 | if(paste0(g, '.Rmd') %in% dir('../data/gene_plots/rmds_web') & paste0(g, '.rds') %in% dir('../data/gene_plots/rds_plots_web')) {
8 | rmarkdown::render(
9 | paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/rmds_web/', g, '.Rmd'),
10 | intermediates_dir = paste0('/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web/tmp/', g),
11 | output_dir = '/home/labs/tirosh/tyler/pan_cancer/data/gene_plots/pdfs_web'
12 | )
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/gene_plots_rmd_web.R:
--------------------------------------------------------------------------------
1 | n = commandArgs(trailingOnly = TRUE)
2 |
3 |
4 |
5 |
6 |
7 | library(data.table)
8 | library(magrittr)
9 | library(ggplot2)
10 | library(stringr)
11 | library(plyr)
12 | library(RColorBrewer)
13 | library(scales)
14 | library(cowplot)
15 | library(matkot)
16 |
17 | source('functions.R')
18 |
19 | aliases <- fread('../data/alias_table.csv')
20 |
21 |
22 |
23 |
24 |
25 | genes_n <- gsub('.csv$', '', gsub('_', '/', dir('../data/gene_plots/gene_plots_data_all_web')))
26 | genes_n <- genes_n[cut(1:length(genes_n), 200, labels = FALSE) == n]
27 | genes_n <- genes_n[genes_n != 'backup']
28 | for(g in genes_n) {
29 |
30 | cat(g, '\n')
31 |
32 | data_g <- fread(paste0('../data/gene_plots/gene_plots_data_all_web/', g, '.csv'))
33 | data_g[, cell_type := gsub('_', ' ', cell_type)]
34 |
35 | data_ct <- data_g[study == 'all', -'study']
36 | data_study <- data_g[study != 'all']
37 |
38 | data_ct[, facet_y := 'c']
39 | data_ct <- rbind(
40 | data_ct,
41 | cbind(
42 | cancer_type = 'All cancers',
43 | data_ct[, .(ave = mean(ave[!is.na(ave)]), prop_pos = mean(prop_pos[!is.na(prop_pos)]), n_cell = sum(n_cell), n_sample = sum(n_sample),
44 | n_sample_thresh = sum(n_sample_thresh)), by = cell_type]
45 | )[, facet_y := 'a'],
46 | use.names = TRUE
47 | )
48 | data_ct[,
49 | c('facet_y', 'cell_type', 'cancer_type') := .(
50 | factor(facet_y, levels = c('c', 'a')),
51 | factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma', 'Dendritic', 'Mast',
52 | 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte')),
53 | factor(cancer_type, levels = sort(unique(cancer_type), decreasing = TRUE))
54 | )
55 | ]
56 |
57 | data_study[, cell_type := factor(cell_type, levels = c('Malignant', 'Macrophage', 'T cell', 'NK cell', 'B cell', 'Plasma',
58 | 'Dendritic', 'Mast', 'Fibroblast', 'Pericyte', 'Endothelial', 'Epithelial', 'Oligodendrocyte'))]
59 | data_study[, study := factor(study, levels = rev(sort(unique(study))))] # So that studies go alphabetically from the top
60 |
61 | # Cut cancer types into groups for separate pages:
62 | ct_groups <- list(character(0))
63 | i <- 0
64 | j <- 1
65 | for(ct in data_study[, sort(unique(as.character(cancer_type)))]) {
66 | ns <- data_study[cancer_type == ct, length(unique(study))]
67 | if(30 - i >= ns) {
68 | ct_groups[[j]] <- c(ct_groups[[j]], ct)
69 | i <- i + ns
70 | } else {
71 | j <- j + 1
72 | ct_groups[[j]] <- ct
73 | i <- ns
74 | }
75 | }
76 |
77 | mp_cor_g <- fread(paste0('../data/gene_plots/gene_mp_cor/', g, '.csv'))
78 | mp_cor_g[, cell_type := gsub('_', ' ', cell_type)]
79 | mp_cor_cts <- mp_cor_g[, {uct <- unique(cell_type); if('Malignant' %in% uct) c('Malignant', sort(uct[uct != 'Malignant'])) else sort(uct)}]
80 |
81 | plots <- list(
82 | dotplot = ggplot(data_ct, aes(x = cell_type, y = cancer_type)) +
83 | geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') +
84 | facet_grid(rows = vars(facet_y), scales = 'free', space = 'free') +
85 | scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) +
86 | scale_y_discrete(labels = c('All cancers' = expression(bold('All cancers')), parse = TRUE)) +
87 | scale_fill_gradientn(
88 | colours = brewer.pal(9, 'YlOrRd'),
89 | limits = c(0, 8),
90 | breaks = c(0, 2, 4, 6, 8),
91 | labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'),
92 | oob = squish
93 | ) +
94 | scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) +
95 | theme_bw() +
96 | theme(
97 | axis.text.x = element_text(angle = 55, hjust = 1),
98 | axis.title.x = element_blank(),
99 | axis.title.y = element_text(margin = margin(r = 10)),
100 | panel.grid = element_line(linewidth = 0.4),
101 | strip.background = element_blank(),
102 | strip.text = element_blank()
103 | ) +
104 | guides(fill = guide_colourbar(frame.colour = 'black')) +
105 | labs(y = 'Cancer type', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells'),
106 | dotplot_stdy = lapply(ct_groups, function(grp) {
107 | pdata <- data_study[cancer_type %in% grp]
108 | ggplot(pdata, aes(x = cell_type, y = study)) +
109 | geom_point(aes(fill = ave, size = 100*prop_pos), shape = 21, colour = 'black') +
110 | facet_grid(rows = vars(cancer_type), scales = 'free', space = 'free') +
111 | scale_x_discrete(labels = c('Malignant' = expression(bold('Malignant')), parse = TRUE)) +
112 | scale_fill_gradientn(
113 | colours = brewer.pal(9, 'YlOrRd'),
114 | limits = c(0, 8),
115 | breaks = c(0, 2, 4, 6, 8),
116 | labels = c('0' = '0', '2' = '2', '4' = '4', '6' = '6', '8' = '\u2265 8'),
117 | oob = squish
118 | ) +
119 | scale_radius(limits = c(0, 100), range = c(1, 8), breaks = c(0, 20, 40, 60, 80, 100)) +
120 | theme_bw() +
121 | theme(
122 | axis.text.x = element_text(angle = 55, hjust = 1),
123 | axis.title.x = element_blank(),
124 | axis.title.y = element_text(margin = margin(r = 10)),
125 | panel.grid = element_line(linewidth = 0.4),
126 | strip.background = element_rect(colour = NA, fill = NA),
127 | strip.text = element_text(vjust = 0),
128 | legend.justification = ifelse(data_study[cancer_type %in% grp, length(unique(study)) <= 14], 'top', 'center')
129 | ) +
130 | guides(fill = guide_colourbar(frame.colour = 'black')) +
131 | labs(y = 'Study', fill = 'Mean log2\nexpression\nlevel', size = '% expressing\ncells')
132 | }),
133 | htmp = slapply(mp_cor_cts, function(ct) {
134 | ct_data <- mp_cor_g[cell_type == ct]
135 | setkey(ct_data, cancer_type, meta_program)
136 | ct_data <- ct_data[CJ(unique(cancer_type), unique(meta_program))]
137 | ct_data[, meta_program := factor(meta_program, levels = rev(sort(unique(meta_program))))]
138 | ct_title <- mapvalues(ct, mp_cor_cts, c('Malignant cells', 'B cells', 'Endothelial cells', 'Epithelial cells', 'Fibroblasts',
139 | 'Macrophages', 'T cells'), warn_missing = FALSE)
140 | ggplot(ct_data) +
141 | geom_tile(aes(x = cancer_type, y = meta_program, fill = as.numeric(mean_corr), colour = 'a')) + # as.numeric() in case all NA
142 | scale_x_discrete(expand = c(0, 0)) +
143 | scale_y_discrete(expand = c(0, 0)) +
144 | scale_colour_manual(name = NULL, values = c(a = 'grey90'), labels = c(a = 'NA')) +
145 | scale_fill_gradientn(colours = rev(brewer.pal(9, 'RdBu')), limits = c(-0.6, 0.6), breaks = c(-0.6, -0.3, 0, 0.3, 0.6),
146 | oob = squish, na.value = 'grey80') +
147 | guides(fill = guide_colourbar(order = 1, frame.colour = 'black'),
148 | colour = guide_legend(override.aes = list(fill = 'grey80'), order = 2)) +
149 | theme(panel.border = element_rect(fill = NA, colour = 'black'), axis.text.x = element_text(angle = 35, hjust = 1),
150 | axis.title.x = element_text(margin = margin(t = 10)), axis.title.y = element_text(margin = margin(r = 10))) +
151 | labs(x = 'Cancer type', y = 'Meta-program', fill = 'Average\ncorrelation', title = ct_title)
152 | })
153 | )
154 |
155 | saveRDS(plots, paste0('../data/gene_plots/rds_plots_web/', g, '.rds'))
156 |
157 | rmdlines <- c('---\n', 'title: "**', g, '**"\n')
158 |
159 | if(g %in% aliases$symbol) rmdlines <- c(rmdlines, 'subtitle: "', paste(aliases[symbol == g, symbol_alt], collapse = ', '), '"\n')
160 |
161 | rmdlines <- c(
162 | rmdlines,
163 | 'header-includes:\n',
164 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n',
165 | ' \\pagenumbering{gobble}\n',
166 | 'geometry: margin=1cm\n',
167 | 'output: pdf_document\n',
168 | 'papersize: a4\n',
169 | '---\n',
170 | '\n',
171 | '```{r setup, include = FALSE}\n',
172 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
173 | '```\n',
174 | '\n',
175 | '```{r}\n',
176 | 'plots <- readRDS("../rds_plots_web/', g, '.rds")\n', # This path has to be relative to where the Rmd file will be
177 | '```\n',
178 | '\n',
179 | '## **A**\n',
180 | '\n',
181 | '```{r fig.align="center", fig.height=', 165/25.4, ', fig.width=', 180/25.4, '}\n',
182 | 'plots$dotplot\n',
183 | '```\n',
184 | '\n',
185 | '\\vspace*{\\fill}\n',
186 | '\n',
187 | '**A. Expression of ', g, ' per cell type and cancer type.** Plot showing the average expression level of ', g, ' and the percentage of ',
188 | 'cells expressing ', g, ' in each cancer type and each of the most common cell types, namely those cell types constituting at least 1% of ',
189 | 'at least 5 individual datasets. Expression levels are defined as log2(TPM/10). Average expression levels and percentages were measured ',
190 | 'per cell type within each dataset, then averaged across studies within each cancer type.\n'
191 | )
192 |
193 | for(i in 1:length(plots$dotplot_stdy)) {
194 | plot_title <- ifelse(i == 1, '**B**', '**B (continued)**')
195 | plot_height <- 30 + 7.5*data_study[cancer_type %in% ct_groups[[i]], length(unique(study))] + 2*(length(ct_groups[[i]]) - 1)
196 | legend_space <- ifelse(i == length(plots$dotplot_stdy), 20, 0)
197 | rmdlines <- c(
198 | rmdlines,
199 | '\n',
200 | '\\newpage\n',
201 | '\n',
202 | '## ', plot_title, '\n',
203 | '\n',
204 | '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n',
205 | 'cowplot::plot_grid(plots$dotplot_stdy[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ',
206 | 267 - legend_space - plot_height, '))\n',
207 | '```\n'
208 | )
209 | }
210 |
211 | rmdlines <- c(
212 | rmdlines,
213 | '\n',
214 | '\\vspace*{\\fill}\n',
215 | '\n',
216 | '**B. Expression of ', g, ' per cell type and study.** Plot showing the average expression level of ', g, ' and the percentage of cells ',
217 | 'expressing ', g, ' in each study and each of the cell types shown in A. Expression levels are defined as log2(TPM/10).\n'
218 | )
219 |
220 | for(i in 1:length(plots$htmp)) {
221 | ct <- names(plots$htmp)[i]
222 | plot_title <- ifelse(i == 1, '**C**', '**C (continued)**')
223 | plot_height <- 30 + 5*mp_cor_g[cell_type == ct, length(unique(meta_program))]
224 | legend_space <- ifelse(i == length(plots$htmp), 20, 0)
225 | rmdlines <- c(
226 | rmdlines,
227 | '\n',
228 | '\\newpage\n',
229 | '\n',
230 | '## ', plot_title, '\n',
231 | '\n',
232 | '```{r fig.align="center", fig.height=', (267 - legend_space)/25.4, ', fig.width=', 190/25.4, '}\n',
233 | 'cowplot::plot_grid(plots$htmp[[', i, ']], nrow = 2, ncol = 1, rel_heights = c(', plot_height, ', ',
234 | 267 - legend_space - plot_height, '))\n',
235 | '```\n'
236 | )
237 | }
238 |
239 | rmdlines <- c(
240 | rmdlines,
241 | '\n',
242 | '\\vspace*{\\fill}\n',
243 | '\n',
244 | '**C. Correlation of ', g, ' with meta-programs.** Heatmaps showing, for each cell type, the average Pearson correlation of ', g,
245 | ' with meta-program scores in each cancer type. NAs indicate insufficient data or zero variance.\n'
246 | )
247 |
248 | out_con <- file(paste0('../data/gene_plots/rmds_web/', g, '.Rmd'))
249 | writeLines(rmdlines, con = out_con, sep = '')
250 | close(out_con)
251 |
252 | }
253 |
--------------------------------------------------------------------------------
/mp_scores_all.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(readxl)
11 | library(plyr)
12 | library(stringr)
13 | library(Matrix)
14 | library(matkot)
15 |
16 | source('functions.R')
17 |
18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
19 | paths <- copy(paths_table[as.list(r)]); setkey(paths, group)
20 |
21 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
22 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
23 | alias_table <- make_alias_table(hgnc_complete_set)
24 |
25 | meta_programs <- lapply(
26 | c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'),
27 | function(ct) {
28 | dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct))
29 | dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)]
30 | return(dt)
31 | }
32 | )
33 |
34 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category:
35 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)])
36 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)')
37 | names(meta_programs[[2]])[3] <- 'Cell Cycle'
38 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle')
39 | names(meta_programs[[4]])[5] <- 'Cell Cycle'
40 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism')
41 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response')
42 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]]))
43 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle')
44 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]]))
45 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock')
46 |
47 | # Merge CD4 and CD8 T cells:
48 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]])
49 | meta_programs <- meta_programs[1:7]
50 |
51 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
52 | names(meta_programs) <- cts
53 |
54 |
55 |
56 |
57 |
58 | rdir <- paste0('../study_plots/', gsub('/', '-', r[2]), '/', r[1])
59 |
60 | gene_ave <- fread(paste0(rdir, '/gene_ave.csv'))[!is.na(ave)]
61 |
62 | if(nrow(gene_ave) > 0) {
63 | bins_nm <- fread('../data/bins_nm.csv')
64 | gene_ave_list <- list(gene_ave[cell_type == 'Malignant' & n_cell >= 10, .(group, symbol, sample, cell_type, ave)])
65 | if(unique(gene_ave[cell_type == 'Malignant', .(group, sample, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list(
66 | gene_ave[
67 | cell_type == 'Malignant',
68 | .(group = 'all', sample = 'all', cell_type = 'Malignant', ave = sum(ave*n_cell)/sum(n_cell)),
69 | by = symbol
70 | ]
71 | ))
72 | gene_ave_list <- c(gene_ave_list, list(gene_ave[,
73 | if(unique(.SD[, .(cell_type, n_cell)])[, sum(n_cell) >= 10]) .SD[, .(cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol],
74 | by = .(group, sample)
75 | ]))
76 | if(unique(gene_ave[, .(sample, cell_type, n_cell)])[, sum(n_cell) >= 10]) gene_ave_list <- c(gene_ave_list, list(
77 | gene_ave[, .(group = 'all', sample = 'all', cell_type = 'all', ave = sum(ave*n_cell)/sum(n_cell)), by = symbol]
78 | ))
79 | gene_ave <- rbindlist(gene_ave_list, use.names = TRUE)
80 | set.seed(7327) # Shuffle the genes before ranking so that the zero genes don't simply get ordered alphabetically:
81 | gene_ave[sample(1:.N), ave_rank := order(order(ave))/.N, by = .(group, sample, cell_type)]
82 | bins_m <- gene_ave[
83 | cell_type == 'Malignant',
84 | cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)),
85 | by = .(group, sample),
86 | .SDcols = -'cell_type'
87 | ]
88 | bins_all <- gene_ave[
89 | cell_type == 'all',
90 | cbind(.SD[order(ave_rank)], bin = cut(1:.N, 15, labels = FALSE)),
91 | by = .(group, sample),
92 | .SDcols = -'cell_type'
93 | ]
94 | }
95 |
96 |
97 |
98 |
99 |
100 | if(nrow(gene_ave) > 0) out <- lapply(paths$group, function(g) {
101 |
102 | cat('Preparing data\n')
103 | cells <- suppressWarnings(fread(paths[g, cells], na.strings = '', colClasses = c(cell_name = 'character', sample = 'character')))
104 | if(!all(c('cell_type', 'sample') %in% names(cells)) | !endsWith(paths[g, expmat], 'mtx')) return(NULL)
105 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
106 | genes <- fread(paths[g, genes], header = FALSE)$V1
107 | expmat <- readMM(paths[g, expmat])
108 | expmat <- expmat[genes != '', ] # In case any genes are empty strings
109 | genes <- genes[genes != '']
110 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ] # Removes repeated gene names
111 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
112 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
113 | rownames(expmat) <- genes
114 | colnames(expmat) <- cells$cell_name
115 | cells <- cells[cell_type %in% cts & col_nnz(expmat) >= 1000] # Filter cell types and remove low-complexity cells
116 | if(nrow(cells) < 30) return(NULL)
117 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
118 |
119 | if(sum(unique(bins_nm$symbol) %in% genes) < 1000) bins_nm[, symbol := str_to_title(symbol)] # In case this is mouse data
120 | if(sum(unique(bins_nm$symbol) %in% genes) < 1000) return(NULL)
121 | bins_nm <- bins_nm[symbol %in% genes, .(group, symbol, bin = as.character(bin))]
122 | bins_m <- bins_m[symbol %in% genes & group %in% c(g, 'all'), .(group = mapvalues(sample, 'all', 'Malignant'), symbol, bin = as.character(bin))]
123 | bins_all <- bins_all[symbol %in% genes & group %in% c(g, 'all'), .(group = sample, symbol, bin = as.character(bin))]
124 |
125 | # Consolidate bins tables (and add column to , for easy matching with ):
126 | cat('Consolidating bins tables\n')
127 | if(all(unique(cells$sample) %in% bins_m$group)) {
128 | bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m[group != 'Malignant'])
129 | cells[cell_type %in% c(unique(bins_nm$group), 'Malignant'), group := ifelse(cell_type == 'Malignant', sample, cell_type)]
130 | } else { # Use "Malignant" group for samples with too few malignant cells:
131 | bins <- rbind(bins_nm[group %in% cells$cell_type], bins_m)
132 | cells[
133 | cell_type %in% c(unique(bins_nm$group), 'Malignant'),
134 | group := ifelse(cell_type != 'Malignant' | !(sample %in% bins_m$group), cell_type, sample)
135 | ]
136 | }
137 | if(!all(cells[cell_type != 'Malignant', unique(cell_type)] %in% bins_nm$group)) {
138 | bins <- rbind(bins, bins_all[group == 'all']) # Use "all" group for non-malignant cell types not listed in bins_nm
139 | cells[cell_type != 'Malignant' & !(cell_type %in% bins_nm$group), group := 'all']
140 | }
141 |
142 | setkey(bins, group, symbol)
143 | bins_copy <- copy(bins); setkey(bins_copy, group, bin)
144 |
145 | cat('Computing bin averages\n')
146 | set.seed(9432)
147 | bin_aves <- cells[, {
148 | ct <- unique(cell_type)
149 | cat(ct, '\n')
150 | ct_genes <- sort(unique(unlist(meta_programs[[ct]])))
151 | if(sum(ct_genes %in% genes) < 50) ct_genes <- str_to_title(ct_genes) # In case this is mouse data
152 | ct_genes <- ct_genes[ct_genes %in% genes]
153 | if(length(ct_genes) < 50) NULL else {
154 | .SD[, {
155 | grp <- unique(group)
156 | cat('\t', grp, '\n')
157 | sdout <- bins_copy[
158 | .(grp, bins[.(grp, ct_genes), as.character(bin)]),
159 | if(.N >= 50) {
160 | if(.N <= 100) {
161 | .(gene = ct_genes[.GRP], bin_genes = symbol)
162 | } else .(gene = ct_genes[.GRP], bin_genes = sample(symbol, 100, replace = FALSE))
163 | },
164 | by = .EACHI
165 | ]
166 | expmat_sub <- expmat[unique(sdout$bin_genes), cell_name, drop = FALSE]
167 | sdout[, .(cell_name = cell_name, ave = colMeans(expmat_sub[bin_genes, , drop = FALSE])), by = gene]
168 | }, by = group]
169 | }
170 | }, by = cell_type]
171 |
172 | bin_ave_mat <- dcast(bin_aves[, .(gene, cell_name, ave)], gene ~ cell_name)[, set_rownames(as.matrix(.SD), gene), .SDcols = -'gene']
173 |
174 | cat('Computing scores\n')
175 | scores <- cells[, {
176 | ct <- unique(cell_type)
177 | cat(ct, '\n')
178 | c(
179 | .(cell_name = cell_name),
180 | slapply(names(meta_programs[[ct]]), function(mp_name) {
181 | cat('\t', mp_name, '\n')
182 | mp <- meta_programs[[ct]][[mp_name]]
183 | if(sum(mp %in% genes & mp %in% bin_aves$gene) < 10) mp <- str_to_title(mp)
184 | mp <- mp[mp %in% genes & mp %in% bin_aves$gene]
185 | if(length(mp) >= 30) {
186 | xout <- expmat[mp, cell_name, drop = FALSE] - bin_ave_mat[mp, cell_name, drop = FALSE]
187 | xout[xout > 3] <- 3; xout[xout < -3] <- 3 # Cap relative expression levels to reduce influence of extreme values.
188 | colMeans(xout, na.rm = TRUE) # Vector of scores - values for some cells/groups may be NA/NaN.
189 | }
190 | })
191 | ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE)
192 | }, by = cell_type]
193 |
194 | scores <- scores[, -'cell_type']
195 | scores[, group := g]
196 | setcolorder(scores, 'group')
197 | return(scores)
198 |
199 | }) %>% rbindlist
200 |
201 | if(nrow(gene_ave) > 0) fwrite(out, paste0(rdir, '/mp_scores_all.csv'))
202 |
--------------------------------------------------------------------------------
/mp_scores_prep.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(matkot)
4 |
5 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
6 |
7 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) tryCatch({
8 | cat(r, '-')
9 | d <- paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])
10 | if('data_dist.rds' %in% dir(d)) {
11 | data_dist <- readRDS(paste0(d, '/data_dist.rds'))
12 | data_dist_cond <- sapply(data_dist, function(x) is.null(x) | is.null(x$scores))
13 | if(!all(data_dist_cond)) {
14 | for(i in (1:length(data_dist))[!data_dist_cond]) { # Indices where data is not NULL
15 | out <- data_dist[[i]]$scores[, .(cell_name, sample, cell_type, meta_program, score = round(score, 4))]
16 | if(sum(!data_dist_cond) > 1) {
17 | if(!('MP scores' %in% dir(d))) dir.create(paste0(d, '/MP scores'))
18 | suff <- paths_table[as.list(r)][i, if(group_name != '') group_name else paste0('group', i)]
19 | fwrite(out, paste0(d, '/MP scores/MP_scores_', suff, '.csv'))
20 | } else fwrite(out, paste0(d, '/MP scores.csv'))
21 | }
22 | } else cat('No MP score data')
23 | } else cat('No MP score data')
24 | cat('\n')
25 | }, error = function(e) print('ERROR\n'))
26 |
--------------------------------------------------------------------------------
/study_plots_cc.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(ggplot2)
3 | library(magrittr)
4 | library(Matrix)
5 | library(stringr)
6 | library(plyr)
7 | library(cowplot)
8 | library(RColorBrewer)
9 | library(scales)
10 | library(gtable)
11 | library(grid)
12 | library(gridExtra)
13 | library(matkot)
14 |
15 | source('functions.R')
16 |
17 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
18 |
19 | ct_to_title <- function(ct) {
20 | if(grepl('ic$|al$|ar$|oid$|crine$|ant$', ct) | ct %in% c('Mast', 'Plasma', 'Immune', 'Stem', 'Schwann', 'Langerhans', 'Tuft', 'Stellate')) {
21 | return(paste0(ct, ' cells'))
22 | } else {
23 | return(paste0(gsub('_', ' ', ct), 's'))
24 | }
25 | }
26 |
27 |
28 |
29 |
30 |
31 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
32 |
33 | cat(r, '\n')
34 |
35 | if(all(c('data_cc.RDS', 'Cell cycle.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
36 |
37 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
38 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
39 |
40 | if(!all(nullcond)) {
41 |
42 | suff <- paste(gsub(' et al. ', '', r[1]), gsub('/| ', '-', r[2]), sep = '_')
43 |
44 | out <- lapply(which(!nullcond), function(i) {
45 |
46 | # To use in plot titles:
47 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
48 | if(paths_table[as.list(r), .N] > 1) {
49 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
50 | } else {
51 | title_tail <- r[1]
52 | }
53 | } else {
54 | if(paths_table[as.list(r), .N] > 1) {
55 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
56 | group_name])
57 | } else {
58 | title_tail <- paste0(r[1], ', ', r[2])
59 | }
60 | }
61 |
62 | plot_out <- do.call(cc_plot, args = plot_data[[i]][c('ccdata', 'hdata', 'g1s', 'g2m')])
63 | plot_title <- paste0('Cell cycle activity in ', title_tail)
64 | return(list(plot_out = plot_out, plot_title = plot_title))
65 |
66 | })
67 |
68 | if(!all(sapply(out, function(x) is.null(x$plot_out)))) {
69 |
70 | rmdlines <- c(
71 | '---\n',
72 | 'title: ""\n',
73 | 'header-includes:\n',
74 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
75 | ' \\pagenumbering{gobble}\n',
76 | 'geometry: margin=1cm\n',
77 | 'output: pdf_document\n',
78 | 'papersize: a4\n',
79 | '---\n',
80 | '\n',
81 | '```{r setup, include = FALSE}\n',
82 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
83 | '```\n'
84 | )
85 | for(i in 1:length(out)) {
86 | if(is.null(out[[i]]$plot_out)) next
87 | saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '_', suff, '.rds'))
88 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
89 | rmdlines <- c(
90 | rmdlines,
91 | '\n',
92 | '```{r}\n',
93 | 'plots <- readRDS("comp_plots_', i, '_', suff, '.rds")\n',
94 | '```\n'
95 | )
96 | if(names(out[[i]]$plot_out$scatter)[1] == 'data') {
97 | # In this case there's only one scatter plot ('data' is the 1st element of a ggplot object), and there will be a heatmap.
98 | rmdlines <- c(
99 | rmdlines,
100 | '\n',
101 | '### ', out[[i]]$plot_title, '\n',
102 | '\n',
103 | '### **A**\n',
104 | '\n',
105 | '```{r fig.align="center", fig.height=', 110/25.4, ', fig.width=', 170/25.4, ', out.width="70%", out.height="70%"}\n',
106 | 'plots$scatter\n',
107 | '```\n',
108 | '\n',
109 | '### **B**\n',
110 | '\n',
111 | '```{r fig.align="center", fig.height=', 175/25.4, ', fig.width=', 210/25.4, ', out.width="80%", out.height="80%"}\n',
112 | 'plots$heatmap\n',
113 | '```\n',
114 | '\n',
115 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page
116 | '\n',
117 | '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual cells ',
118 | '(points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which are ',
119 | 'assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in individual cells ',
120 | '(columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase (indicated by ',
121 | 'the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n'
122 | )
123 | } else { # In this case every element is a separate scatter plot, corresponding to a cell type, possibly with no heatmap.
124 | cts <- names(out[[i]]$plot_out$scatter)
125 | if('Malignant' %in% cts) {cts <- c('Malignant', sort(cts[cts != 'Malignant']))} else {cts <- sort(cts)}
126 | for(j in 1:length(out[[i]]$plot_out$scatter)) {
127 | if(j > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
128 | ct <- cts[j]
129 | plot_title <- paste0(out[[i]]$plot_title, ' - ', ct_to_title(ct))
130 | if(ct %in% names(out[[i]]$plot_out$heatmap)) {
131 | rmdlines <- c(
132 | rmdlines,
133 | '\n',
134 | '### ', plot_title, '\n',
135 | '\n',
136 | '### **A**\n',
137 | '\n',
138 | '```{r fig.align="center", fig.height=',
139 | 110/25.4,
140 | ', fig.width=',
141 | 170/25.4,
142 | ', out.width="70%", out.height="70%"}\n',
143 | 'plots$scatter[["', ct, '"]]\n',
144 | '```\n',
145 | '\n',
146 | '### **B**\n',
147 | '\n',
148 | '```{r fig.align="center", fig.height=',
149 | 175/25.4,
150 | ', fig.width=',
151 | 210/25.4,
152 | ', out.width="80%", out.height="80%"}\n',
153 | 'plots$heatmap[["', ct, '"]]\n',
154 | '```\n',
155 | '\n',
156 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page
157 | '\n',
158 | '**A.** Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ',
159 | ct_to_title(ct),
160 | ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells ',
161 | 'which are assigned as either G1/S, G2/M or intermediate. **B.** Heatmap of relative expression levels in ',
162 | 'individual ',
163 | ct_to_title(ct),
164 | ' (columns) of the genes (rows) best distinguishing G1/S and G2/M phases. Cells are ordered by phase ',
165 | '(indicated by the color bar, top) and, within each phase, by the maximum of the G1/S and G2/M scores.\n'
166 | )
167 | } else {
168 | rmdlines <- c(
169 | rmdlines,
170 | '\n',
171 | '### ', plot_title, '\n',
172 | '\n',
173 | '```{r fig.align = "center", fig.height = ', 120/25.4, ', fig.width = ', 180/25.4,
174 | ', out.width = "70%", out.height = "70%"}\n',
175 | 'plots$scatter[["', ct, '"]]\n',
176 | '```\n',
177 | '\n',
178 | 'Scatter plot showing the scores for G1/S (x axis) and G2/M (y axis) cell cycle phases for individual ',
179 | ct_to_title(ct),
180 | ' (points). The color indicates the phase assigned to each cell, and the percentage refers to all cells which ',
181 | 'are assigned as either G1/S, G2/M or intermediate.\n'
182 | )
183 | }
184 | }
185 | }
186 | }
187 | out_con <- file(paste0('temp_', suff, '.Rmd'))
188 | writeLines(rmdlines, con = out_con, sep = '')
189 | close(out_con)
190 | rmarkdown::render(
191 | paste0('temp_', suff, '.Rmd'),
192 | output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell cycle.pdf')
193 | )
194 | file.remove(paste0('temp_', suff, '.Rmd'))
195 | for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '_', suff, '.rds'))
196 |
197 | }
198 |
199 | }
200 |
201 | }
202 |
203 | }
204 |
--------------------------------------------------------------------------------
/study_plots_cna.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(ggplot2)
11 | library(RColorBrewer)
12 | library(randomcoloR)
13 | library(scales)
14 | library(cowplot)
15 | library(ggtext)
16 | library(matkot)
17 |
18 | source('functions.R')
19 |
20 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
21 |
22 |
23 |
24 |
25 |
26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
27 |
28 | if('data_cna.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1]))) {
29 |
30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cna.rds'))
31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
32 |
33 | out <- lapply(which(!nullcond), function(i) {
34 |
35 | # To use in plot titles:
36 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
37 | if(paths_table[as.list(r), .N] > 1) {
38 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
39 | } else {
40 | title_tail <- r[1]
41 | }
42 | } else {
43 | if(paths_table[as.list(r), .N] > 1) {
44 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
45 | } else {
46 | title_tail <- paste0(r[1], ', ', r[2])
47 | }
48 | }
49 |
50 | cells <- suppressWarnings(fread(paths[[i]]$cells, na.strings = ''))
51 | cells$cell_name <- as.character(cells$cell_name)
52 | setkey(cells, cell_name)
53 |
54 | pdata <- plot_data[[i]]$cna_data[, .(gene, cell_name, cna, sample, chr)]
55 | pdata[, cell_type := do.call(`[`, list(cells, cell_name))$cell_type]
56 | pdata <- pdata[!is.na(cell_type)]
57 | chr_n <- pdata[, .(n = length(unique(gene))), keyby = chr][n > 20]
58 | pdata <- pdata[chr %in% chr_n$chr]
59 | pdata[, malignant := ifelse(cell_type == 'Malignant', 'm', 'nm')]
60 |
61 | # Sample cells so that we have no more than 2000, of which between 60 and 85% are malignant:
62 | grp_n <- pdata[, .(n = length(unique(cell_name))), keyby = .(malignant, sample)]
63 | grp_tot <- grp_n[, min(max(sum(malignant == 'm')/.N, 0.6), 0.85)]
64 | grp_tot <- 2000*c(m = grp_tot, nm = 1 - grp_tot)
65 | grp_tot_true <- grp_n[, .(N = sum(n)), keyby = malignant]$N
66 | if(any(grp_tot > grp_tot_true)) {wm <- which.min(grp_tot_true - grp_tot); grp_tot <- grp_tot*grp_tot_true[wm]/grp_tot[wm]}
67 | grp_n[, n_to_sample := { # Returns number of cells to take from each sample
68 | tot <- grp_tot[unique(malignant)] # Total allowed for this group
69 | baseline <- pmin(n, min(20, floor(tot/length(unique(sample))))) # Number of cells to keep (samples with too few cells are untouched)
70 | N <- n - baseline
71 | to_lose <- sum(n) - tot # Number of cells from each sample that we can lose
72 | N <- N - floor(to_lose*N/sum(N)) # Take number of cells from each sample proportional to the total number in that sample
73 | to_lose <- sum(N + baseline) - tot # Any left over due to the floor() function
74 | sub_places <- order(order(-N)) %in% 1:to_lose # Take these off evenly from the samples having the most cells
75 | N[sub_places] <- N[sub_places] - 1
76 | N + baseline
77 | }, by = malignant]
78 | set.seed(1012)
79 | pdata <- pdata[, .SD[cell_name %in% sample( # The actual sampling
80 | unique(cell_name),
81 | do.call(`[`, list(grp_n, .(unique(malignant), unique(sample))))$n_to_sample,
82 | replace = FALSE
83 | )], by = .(malignant, sample)]
84 | pdata[, gene := factor(gene, levels = unique(gene))]
85 | pdata[, cell_name := factor(cell_name, levels = unique(cell_name[order(cell_type != 'Malignant', sample)]))]
86 | pdata[, cell_num := as.numeric(cell_name)]
87 |
88 | # Manually choose chromosome labels:
89 | chr_lab <- c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '14', '16', '17', '18', '19', '20', '22', 'X')
90 | chr_lab <- unique(pdata[, .(gene, chr)])[,
91 | {inds <- chr_n[, floor(cumsum(n) - n/2)[chr %in% chr_lab]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])}
92 | ]
93 | # To choose them automatically using some threshold for minimum number of genes (100 in this case):
94 | # chr_lab <- unique(pdata[, .(gene, chr)])[,
95 | # {inds <- chr_n[, floor(cumsum(n) - n/2)[n >= 100]]; setNames(gsub('^0', '', chr[inds]), as.character(gene)[inds])}
96 | # ]
97 |
98 | cells_lab <- pdata[, .(n = length(unique(cell_num))), keyby = malignant][, floor(c(n[1]/2, n[1] + n[2]/2))]
99 |
100 | set.seed(1358)
101 | bar_smpl <- ggplot(pdata) +
102 | geom_raster(aes(x = 'a', y = cell_num, fill = sample)) +
103 | scale_fill_manual(name = 'Sample', values = pdata[, setNames(distinctColorPalette(length(unique(sample))), unique(sample))]) +
104 | scale_x_discrete(expand = c(0, 0)) +
105 | scale_y_continuous(expand = c(0, 0)) +
106 | theme(
107 | axis.text = element_blank(),
108 | axis.title = element_blank(),
109 | axis.ticks = element_blank(),
110 | axis.ticks.length = unit(0, 'pt'),
111 | legend.position = 'left',
112 | legend.title = element_text(size = 14),
113 | legend.text = element_text(size = 12),
114 | legend.justification = c(1, 0.5),
115 | panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5),
116 | plot.margin = unit(c(5.5, 2, 5.5, 5.5), 'pt')
117 | ) +
118 | guides(fill = guide_legend(keywidth = unit(10, 'pt'), keyheight = unit(15, 'pt')))
119 |
120 | htmp <- ggplot(pdata) +
121 | geom_raster(aes(x = gene, y = cell_num, fill = cna)) +
122 | geom_vline(aes(xintercept = xi), data = data.table(xi = chr_n[1:(.N - 1), cumsum(n) + 0.5]), linewidth = 0.3) +
123 | geom_hline(aes(yintercept = yi), data = data.table(yi = pdata[cell_type == 'Malignant', length(unique(cell_name)) + 0.5]),
124 | linewidth = 0.5) +
125 | scale_fill_gradientn(colours = rev(brewer.pal(11, 'RdBu')), limits = c(-1, 1), breaks = c(-1, 0, 1), oob = squish) +
126 | scale_x_discrete(expand = c(0, 0), breaks = names(chr_lab), labels = chr_lab) +
127 | scale_y_continuous(expand = c(0, 0), sec.axis = dup_axis(breaks = cells_lab, labels = c('Malignant', 'TME'))) +
128 | theme(
129 | axis.text.x = element_text(size = 12),
130 | axis.text.y.left = element_blank(),
131 | axis.text.y.right = element_text(angle = -90, hjust = 0.5, size = 14),
132 | axis.ticks = element_blank(),
133 | axis.ticks.length = unit(0, 'pt'),
134 | axis.title.x = element_text(size = 14),
135 | axis.title.y.left = element_blank(),
136 | axis.title.y.right = element_text(size = 14),
137 | legend.title = element_text(size = 12, margin = margin(b = 6)),
138 | legend.text = element_text(size = 10),
139 | legend.justification = c(0.5, 0),
140 | panel.border = element_rect(fill = NA, colour = 'black', linewidth = 0.5),
141 | plot.title = element_text(size = 14, face = 'bold'),
142 | plot.subtitle = element_text(size = 12),
143 | plot.margin = unit(c(5.5, 5.5, 5.5, 1), 'pt')
144 | ) +
145 | guides(fill = guide_colourbar(barwidth = unit(15, 'pt'), barheight = unit(70, 'pt'), ticks.colour = 'black', frame.colour = 'black')) +
146 | labs(
147 | x = 'Chromosome',
148 | y = 'Cells',
149 | fill = 'Inferred CNA\n(log2 ratio)',
150 | title = paste('Inferred CNAs in', title_tail),
151 | subtitle = paste(
152 | 'Reference cells:',
153 | paste(plot_data[[i]]$ref_cells[, .N, by = cell_type][order(-N), gsub('_', ' ', cell_type)], collapse = ', ')
154 | )
155 | )
156 |
157 | return(list(heatmap = htmp, bar = bar_smpl))
158 |
159 | })
160 |
161 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) caption <- r[1] else caption <- paste0(r[1], ' (', r[2], ')')
162 | if(length(out) == 1) {
163 | caption <- paste0('Heatmap of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position',
164 | ' for a representative subset of cells in ', caption, ', with colour bar (left) showing the corresponding samples.')
165 | } else {
166 | caption <- paste0('Heatmaps of inferred copy number alteration (CNA) values (quantified as log2 ratio) at each chromosomal position',
167 | ' for representative subsets of cells in ', caption, ', with colour bars (left) showing the corresponding samples.')
168 | }
169 | caption <- ggplot() +
170 | theme(panel.background = element_rect(fill = NA), plot.caption.position = 'plot',
171 | plot.caption = element_textbox_simple(hjust = 0, size = 12)) +
172 | labs(caption = caption)
173 |
174 | leg <- lapply(out, function(x) get_legend(x$bar))
175 | leg_width <- max(sapply(leg, function(x) as.numeric(x$widths[3])))
176 | w <- c(leg_width + 0.4, 0.7, 26.7)
177 |
178 | # Open the PNG before using plot_grid(), because this function calls ggdraw(), which opens a graphic device, leading to X11 error on the server.
179 | png(
180 | paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/CNAs.png'),
181 | width = sum(w), height = length(out)*18 + (length(out) - 1)*4 + 4, units = 'cm', res = 300
182 | )
183 | bar_grob <- lapply(out, function(x) ggplotGrob(x$bar + theme(legend.position = 'none')))
184 | htmp_grob <- lapply(out, function(x) ggplotGrob(x$heatmap))
185 | for(j in 1:length(bar_grob)) bar_grob[[j]]$heights[c(1, 7, 12)] <- unit(c(1.7, 14.9, 1.4), 'cm')
186 | for(j in 1:length(htmp_grob)) htmp_grob[[j]]$heights <- unit(c(0.2, 0, 0.8, 0.7, 0, 0, 14.9, 0.5, 0.7, 0, 0, 0.2), 'cm')
187 | plist <- lapply(1:length(out), function(j) {
188 | if(j == 1) {
189 | plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w)
190 | } else plot_grid(
191 | ggplot() + theme_void(),
192 | plot_grid(leg[[j]], bar_grob[[j]], htmp_grob[[j]], nrow = 1, ncol = 3, rel_widths = w),
193 | nrow = 2, ncol = 1, rel_heights = c(4, 18)
194 | )
195 | })
196 | if(length(plist) == 1) {
197 | print(plot_grid(plist[[1]], caption, nrow = 2, ncol = 1, rel_heights = c(18, 4)))
198 | } else {
199 | print(plot_grid(plotlist = c(plist, list(caption)), nrow = length(plist) + 1, ncol = 1, rel_heights = c(18, rep(22, length(plist) - 1), 4)))
200 | }
201 | dev.off()
202 |
203 | }
204 |
--------------------------------------------------------------------------------
/study_plots_ct_comp.R:
--------------------------------------------------------------------------------
1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_comp.o -eo log/study_plots_ct_comp.e Rscript study_plots_ct_comp.R
2 |
3 | library(data.table)
4 | library(ggplot2)
5 | library(magrittr)
6 | library(Matrix)
7 | library(stringr)
8 | library(plyr)
9 | library(cowplot)
10 | library(RColorBrewer)
11 | library(scales)
12 | library(matkot)
13 |
14 | try(library(randomcoloR), silent = TRUE)
15 |
16 | source('functions.R')
17 |
18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
19 |
20 |
21 |
22 |
23 |
24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
25 |
26 | cat(r, '\n')
27 |
28 | if(all(c('data_ct_comp.RDS', 'Cell types.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
29 |
30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_comp.RDS'))
31 |
32 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
33 | if(all(nullcond)) next
34 |
35 | out <- lapply(which(!nullcond), function(i) {
36 |
37 | # To use in plot titles:
38 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
39 | if(paths_table[as.list(r), .N] > 1) {
40 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
41 | } else {
42 | title_tail <- r[1]
43 | }
44 | } else {
45 | if(paths_table[as.list(r), .N] > 1) {
46 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
47 | group_name])
48 | } else {
49 | title_tail <- paste0(r[1], ', ', r[2])
50 | }
51 | }
52 |
53 | if(!is.null(plot_data[[i]]$data)) {
54 |
55 | pdata <- plot_data[[i]]$data
56 |
57 | marker_cond <- 'marker_plot_data' %in% names(pdata) &&
58 | pdata$marker_plot_data[, length(unique(gene)) > 1 & length(unique(cell_type)) > 1]
59 | if('marker_plot_data' %in% names(pdata) && pdata$marker_plot_data[, length(unique(gene)) < 10]) {
60 | leg_arr <- 'horizontal'
61 | } else {leg_arr <- 'vertical'}
62 |
63 | if('randomcoloR' %in% .packages(TRUE)) {
64 | set.seed(9728)
65 | plot_colours <- pdata$pie_data[, setNames(distinctColorPalette(.N), cell_type)]
66 | plot_out <- do.call(
67 | ct_comp_plot,
68 | args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(colours = plot_colours, legends_arrange = leg_arr))
69 | )
70 | } else {
71 | plot_out <- do.call(
72 | ct_comp_plot,
73 | args = c(switch(marker_cond + 1, pdata['pie_data'], pdata), list(legends_arrange = leg_arr))
74 | )
75 | }
76 |
77 | if(marker_cond) {
78 | plot_title <- paste0('Cell type composition and marker gene expression in ', title_tail)
79 | } else {plot_title <- paste0('Cell type composition in ', title_tail)}
80 |
81 | return(list(plot_out = plot_out, plot_title = plot_title, leg_arr = leg_arr))
82 |
83 | }
84 |
85 | })
86 |
87 | # A4 page is 210x297mm. Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels.
88 | rmdlines <- c(
89 | '---\n',
90 | 'title: ""\n',
91 | 'header-includes:\n',
92 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
93 | ' \\pagenumbering{gobble}\n',
94 | 'geometry: margin=1cm\n',
95 | 'output: pdf_document\n',
96 | 'papersize: a4\n',
97 | '---\n',
98 | '\n',
99 | '```{r setup, include = FALSE}\n',
100 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
101 | '```\n'
102 | )
103 | for(i in 1:length(out)) {
104 | if('marker_plot' %in% names(out[[i]]$plot_out)) {
105 | margin_x <- (ifelse(out[[i]]$leg_arr == 'vertical', 150, 125) - # Take 25mm off margin if legends are arranged horizontally
106 | 10*length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$cell_type)))/2
107 | n_genes <- length(unique(plot_data[!nullcond][[i]]$data$marker_plot_data$gene))
108 | h <- (log2(24/n_genes) + 6)*n_genes + 30
109 | # The following top-aligns legends in case of too few genes, else the tops of the legends get cut off. It also depends on the length
110 | # of the x labels, as long labels squash the plot. 4 seems OK, but 5 is safer in case there's a longer label than "Oligodendrocyte".
111 | if(n_genes <= 5) out[[i]]$plot_out$marker_plot <- out[[i]]$plot_out$marker_plot + theme(legend.justification = 'top')
112 | }
113 | saveRDS(out[[i]]$plot_out, paste0('comp_plots_', i, '.rds'))
114 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
115 | if('marker_plot' %in% names(out[[i]]$plot_out)) {
116 | rmdlines <- c(
117 | rmdlines,
118 | '\n',
119 | '### ', out[[i]]$plot_title, '\n',
120 | '\n',
121 | '```{r}\n',
122 | 'plots <- readRDS("comp_plots_', i, '.rds")\n',
123 | '```\n',
124 | '\n',
125 | '### **A**\n',
126 | '\n',
127 | '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
128 | 'plots$pie\n',
129 | '```\n',
130 | '\n',
131 | '### **B**\n',
132 | '\n',
133 | '```{r fig.align = "center", fig.height = ', h/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
134 | 'plots$marker_plot + theme(plot.margin = unit(c(0, ', margin_x, ', 0, ', margin_x, '), "mm"))\n',
135 | '```\n',
136 | '\n',
137 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page
138 | '\n',
139 | '**A.** Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance. **B.** Plot ',
140 | 'showing the average log2(TPM/10) expression levels of marker genes in each cell type and, for each gene, the percentage of ',
141 | 'cells of each type in which that gene was detected. Points are not shown for combinations where the percentage of expressing ',
142 | 'cells is below 50%. Only non-malignant cell types with at least 10 cells are shown, and these are ordered as in A.\n'
143 | )
144 | } else {
145 | rmdlines <- c(
146 | rmdlines,
147 | '\n',
148 | '### ', out[[i]]$plot_title, '\n',
149 | '\n',
150 | '```{r}\n',
151 | 'plots <- readRDS("comp_plots_', i, '.rds")\n',
152 | '```\n',
153 | '\n',
154 | '```{r fig.align = "center", fig.height = ', 117/25.4, ', fig.width = ', 210/25.4, ', out.width = "80%", out.height = "80%"}\n',
155 | 'plots$pie\n',
156 | '```\n',
157 | '\n',
158 | 'Pie chart showing proportions of cell types after quality control. Cell types are ordered by abundance.\n'
159 | )
160 | }
161 | }
162 | out_con <- file('temp.Rmd')
163 | writeLines(rmdlines, con = out_con, sep = '')
164 | close(out_con)
165 | rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/Cell types.pdf'))
166 | file.remove('temp.Rmd')
167 | for(i in 1:length(out)) file.remove(paste0('comp_plots_', i, '.rds'))
168 |
169 | }
170 |
171 | }
172 |
--------------------------------------------------------------------------------
/study_plots_ct_umap.R:
--------------------------------------------------------------------------------
1 | # bsub -q new-medium -R "select[model=Intel_Skylake] rusage[mem=32000]" -oo log/study_plots_ct_umap.o -eo log/study_plots_ct_umap.e Rscript study_plots_ct_umap.R
2 |
3 | library(data.table)
4 | library(ggplot2)
5 | library(magrittr)
6 | library(Matrix)
7 | library(stringr)
8 | library(plyr)
9 | library(cowplot)
10 | library(RColorBrewer)
11 | library(scales)
12 | library(matkot)
13 |
14 | try(library(randomcoloR), silent = TRUE)
15 |
16 | source('functions.R')
17 |
18 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
19 |
20 |
21 |
22 |
23 |
24 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
25 |
26 | cat(r, '\n')
27 |
28 | if(all(c('data_ct_umap.RDS', 'UMAP.pdf') %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])) == c(TRUE, FALSE))) {
29 |
30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_ct_umap.RDS'))
31 |
32 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, is.null(x$data)))
33 | if(all(nullcond)) next
34 |
35 | if('randomcoloR' %in% .packages(TRUE)) {
36 | set.seed(9728)
37 | plot_colours <- lapply(
38 | which(!nullcond),
39 | function(i) {
40 | pdata <- plot_data[[i]]$data
41 | slapply(c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)], function(vn) unique(pdata[[vn]]))
42 | }
43 | )
44 | plot_colours <- slapply(
45 | c('cell_type', 'sample'),
46 | function(vn) {
47 | vn_unique <- unique(unlist(lapply(plot_colours, function(x) if(vn %in% names(x)) x[[vn]])))
48 | if(!is.null(vn_unique)) setNames(distinctColorPalette(length(vn_unique)), vn_unique)
49 | }
50 | )
51 | }
52 |
53 | out <- lapply(which(!nullcond), function(i) {
54 |
55 | pdata <- plot_data[[i]]$data
56 | if(!any(c('cell_type', 'sample') %in% names(pdata))) return(NULL)
57 |
58 | # To use in plot titles:
59 | if(sum(unique(paths_table[, .(study, cancer_type)])$study == r[1]) == 1) {
60 | if(paths_table[as.list(r), .N] > 1) {
61 | title_tail <- paste0(r[1], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else group_name])
62 | } else {
63 | title_tail <- r[1]
64 | }
65 | } else {
66 | if(paths_table[as.list(r), .N] > 1) {
67 | title_tail <- paste0(r[1], ', ', r[2], ' - ', paths_table[as.list(r)][i, if(group_name == '') paste('Group', group) else
68 | group_name])
69 | } else {
70 | title_tail <- paste0(r[1], ', ', r[2])
71 | }
72 | }
73 |
74 | if('cell_type' %in% names(pdata)) {
75 | if('Malignant' %in% pdata$cell_type) {
76 | pdata[, cell_type := factor(cell_type, levels = c('Malignant', sort(unique(cell_type[cell_type != 'Malignant']))))]
77 | } else {
78 | pdata[, cell_type := factor(cell_type, levels = sort(unique(cell_type)))]
79 | }
80 | }
81 | if('sample' %in% names(pdata)) {
82 | if(!any(is.na(as.numeric(unique(pdata$sample))))) {
83 | pdata[, sample := factor(as.character(sample), levels = sort(as.numeric(unique(sample))))]
84 | } else pdata[, sample := factor(as.character(sample), levels = sort(as.character(unique(sample))))]
85 | }
86 | if('randomcoloR' %in% .packages(TRUE)) c(list(
87 | title_tail = title_tail,
88 | plots = slapply(
89 | c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)],
90 | function(vn) ct_umap_plot(pdata, vn, colours = plot_colours[[vn]][levels(pdata[[vn]])],
91 | legend_title = gsub('_', ' ', str_to_title(vn)))
92 | )
93 | )) else c(list(
94 | title_tail = title_tail,
95 | plots = slapply(
96 | c('cell_type', 'sample')[c('cell_type', 'sample') %in% names(pdata)],
97 | function(vn) ct_umap_plot(pdata, vn, legend_title = gsub('_', ' ', str_to_title(vn)))
98 | )
99 | ))
100 |
101 | })
102 |
103 | if(all(sapply(out, is.null))) next
104 |
105 | # A4 page is 210x297mm. Here, I'm allowing 20mm for the title and 15mm each for the "A" and "B" labels.
106 | rmdlines <- c(
107 | '---\n',
108 | 'title: ""\n',
109 | 'header-includes:\n',
110 | ' \\renewcommand{\\familydefault}{\\sfdefault}\n', # Indent is important but must use spaces, NOT tabs!
111 | ' \\pagenumbering{gobble}\n',
112 | 'geometry: margin=1cm\n',
113 | 'output: pdf_document\n',
114 | 'papersize: a4\n',
115 | '---\n',
116 | '\n',
117 | '```{r setup, include = FALSE}\n',
118 | 'knitr::opts_chunk$set(echo = FALSE, warning = FALSE, error = FALSE, dev = "cairo_pdf")\n',
119 | '```\n'
120 | )
121 | for(i in 1:length(out)) {
122 | saveRDS(out[[i]]$plots, paste0('umap_plots_', i, '.rds'))
123 | if(i > 1) rmdlines <- c(rmdlines, '\n\\newpage\n')
124 | if(length(out[[i]]$plots) == 2) {
125 | rmdlines <- c(
126 | rmdlines,
127 | '\n',
128 | '### UMAP in ', out[[i]]$title_tail, '\n',
129 | '\n',
130 | '```{r}\n',
131 | 'plots <- readRDS("umap_plots_', i, '.rds")\n',
132 | '```\n',
133 | '\n',
134 | '### **A**\n',
135 | '\n',
136 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
137 | 'plots$cell_type\n',
138 | '```\n',
139 | '\n',
140 | '### **B**\n',
141 | '\n',
142 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
143 | 'plots$sample\n',
144 | '```\n',
145 | '\n',
146 | '\\vspace*{\\fill}\n', # Aligns text with bottom of page
147 | '\n',
148 | '**A.** UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by cell type. **B.** ',
149 | 'UMAP plot as in A, with cells colored by sample.\n'
150 | )
151 | } else {
152 | rmdlines <- c(
153 | rmdlines,
154 | '\n',
155 | '### UMAP in ', out[[i]]$title_tail, '\n',
156 | '\n',
157 | '```{r}\n',
158 | 'plots <- readRDS("umap_plots_', i, '.rds")\n',
159 | '```\n',
160 | '\n',
161 | '```{r fig.align = "center", fig.height = ', 130/25.4, ', fig.width = ', 180/25.4, ', out.width = "80%", out.height = "80%"}\n',
162 | 'plots[[1]]\n',
163 | '```\n',
164 | '\n',
165 | 'UMAP plot of single cells in ', out[[i]]$title_tail, ', after quality control. Cells are colored by ',
166 | gsub('_', ' ', names(out[[i]]$plots)), '.\n'
167 | )
168 | }
169 | }
170 | out_con <- file('temp.Rmd')
171 | writeLines(rmdlines, con = out_con, sep = '')
172 | close(out_con)
173 | rmarkdown::render('temp.Rmd', output_file = paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/UMAP.pdf'))
174 | file.remove('temp.Rmd')
175 | for(i in 1:length(out)) file.remove(paste0('umap_plots_', i, '.rds'))
176 |
177 | }
178 |
179 | }
180 |
--------------------------------------------------------------------------------
/study_plots_data_cc_thresh.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(matkot)
4 |
5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
6 |
7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
8 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
9 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
10 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
11 | if(all(nullcond)) return(NULL)
12 | lapply(which(!nullcond), function(i) {
13 | if('cell_type' %in% names(plot_data[[i]]$ccdata)) {
14 | plot_data[[i]]$ccdata[,
15 | .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]),
16 | by = cell_type
17 | ]
18 | } else {
19 | plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])]
20 | }
21 | }) %>% rbindlist
22 | }) %>% rbindlist
23 |
24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7)
25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.65)
26 |
27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
28 | cat(r, '\n')
29 | if(!('data_cc.RDS' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next
30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
32 | if(all(nullcond)) next
33 | for(i in which(!nullcond)) {
34 |
35 | ccdata <- copy(plot_data[[i]]$ccdata)
36 | ccdata[, phase := 'Cycling']
37 | ccdata[g1s_score < 0.7 & g2m_score < 0.65, phase := 'Not cycling']
38 |
39 | # Define phases and intermediates by fold change threshold:
40 | ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate']
41 | ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate']
42 | ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name]
43 |
44 | # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test()
45 |
46 | setkey(ccdata, cell_name)
47 | plot_data[[i]]$ccdata <- ccdata
48 |
49 | hdata <- copy(plot_data[[i]]$hdata)
50 | if('data.table' %in% class(hdata)) {
51 | setkey(hdata, cell_name)
52 | hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase]
53 | annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase')
54 | hdata <- slapply(
55 | switch(
56 | ('cell_type' %in% names(ccdata)) + 1,
57 | list(ccdata$cell_name),
58 | slapply(
59 | ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type],
60 | function(ct) ccdata[cell_type == ct, cell_name]
61 | )
62 | ),
63 | function(idvec) {
64 | # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data:
65 | cell_ids <- hdata[
66 | idvec,
67 | .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))),
68 | by = .(p = phase)
69 | ]$id
70 | out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling']
71 | # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division
72 | # lines in the heatmap:
73 | setkey(out_data, phase)
74 | const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase]
75 | const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))]
76 | # Get ordered cell ID list:
77 | cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][
78 | !is.na(cell_name),
79 | .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]),
80 | by = phase
81 | ][, cell_name[cell_order]]
82 | # Melt heatmap data and order cells:
83 | out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level')
84 | out_data[, cell_name := factor(cell_name, levels = cell_ids)]
85 | # Order the genes:
86 | setkey(out_data, phase)
87 | if(all(c('G1/S', 'G2/M') %in% out_data$phase)) {
88 | out_data[, gene := factor(gene, levels = out_data[
89 | c('G1/S', 'G2/M'),
90 | .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene],
91 | by = phase
92 | ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)]
93 | } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present
94 | out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][,
95 | Reduce(c, lapply(
96 | c('G1/S', 'G2/M'),
97 | function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene]
98 | ))
99 | ])]
100 | }
101 | return(list(data = out_data, vline = const))
102 | }
103 | )
104 | plot_data[[i]]$hdata <- hdata
105 | } else plot_data[[i]]$hdata <- NULL
106 |
107 | }
108 | saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc.RDS'))
109 | }
110 |
--------------------------------------------------------------------------------
/study_plots_data_cc_thresh_consensus.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(magrittr)
3 | library(matkot)
4 |
5 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
6 |
7 | thresh <- lapply(transpose(as.list(unique(paths_table[, .(study, cancer_type)]))), function(r) {
8 | if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) return(NULL)
9 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
10 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
11 | if(all(nullcond)) return(NULL)
12 | lapply(which(!nullcond), function(i) {
13 | if('cell_type' %in% names(plot_data[[i]]$ccdata)) {
14 | plot_data[[i]]$ccdata[,
15 | .(g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2]),
16 | by = cell_type
17 | ]
18 | } else {
19 | plot_data[[i]]$ccdata[, .(cell_type = NA, g1s = min(g1s_score[lr_g1s]), g2m = min(g2m_score[lr_g2m]), study = r[1], cancer_type = r[2])]
20 | }
21 | }) %>% rbindlist
22 | }) %>% rbindlist
23 |
24 | # plot(thresh[is.finite(g1s), sort(g1s)]); abline(h = 0.7)
25 | # plot(thresh[is.finite(g2m), sort(g2m)]); abline(h = 0.7)
26 |
27 | for(r in transpose(as.list(unique(paths_table[, .(study, cancer_type)])))) {
28 | cat(r, '\n')
29 | if(!('data_cc_consensus.rds' %in% dir(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1])))) next
30 | plot_data <- readRDS(paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
31 | nullcond <- sapply(plot_data, function(x) ifelse(is.null(x), TRUE, all(sapply(x[names(x) != 'path'], is.null))))
32 | if(all(nullcond)) next
33 | for(i in which(!nullcond)) {
34 |
35 | ccdata <- copy(plot_data[[i]]$ccdata)
36 | ccdata[, phase := 'Cycling']
37 | ccdata[g1s_score < 0.7 & g2m_score < 0.7, phase := 'Not cycling']
38 |
39 | # Define phases and intermediates by fold change threshold:
40 | ccdata[phase == 'Cycling' & g1s_score > g2m_score & g1s_score - 0.5 < 2*(g2m_score - 0.5), phase := 'Intermediate']
41 | ccdata[phase == 'Cycling' & g2m_score > g1s_score & g2m_score - 0.5 < 2*(g1s_score - 0.5), phase := 'Intermediate']
42 | ccdata[phase == 'Cycling', phase := c('G1/S', 'G2/M')[which.max(.(g1s_score, g2m_score))], by = cell_name]
43 |
44 | # ggplot(ccdata[cell_type == 'Malignant'], aes(x = g1s_score, y = g2m_score)) + geom_point(aes(colour = phase)) + theme_test()
45 |
46 | setkey(ccdata, cell_name)
47 | plot_data[[i]]$ccdata <- ccdata
48 |
49 | hdata <- copy(plot_data[[i]]$hdata)
50 | if('data.table' %in% class(hdata)) {
51 | setkey(hdata, cell_name)
52 | hdata[, phase := do.call(`[`, list(ccdata, cell_name))$phase]
53 | annot_cols <- c('cell_name', 'g1s_score', 'g2m_score', 'phase')
54 | hdata <- slapply(
55 | switch(
56 | ('cell_type' %in% names(ccdata)) + 1,
57 | list(ccdata$cell_name),
58 | slapply(
59 | ccdata[phase != 'Not cycling', .(N = .N), by = cell_type][N >= 50, cell_type],
60 | function(ct) ccdata[cell_type == ct, cell_name]
61 | )
62 | ),
63 | function(idvec) {
64 | # Downsample cell IDs if one phase (probably non-cycling cells) takes up more than two thirds of the data:
65 | cell_ids <- hdata[
66 | idvec,
67 | .(id = switch((.N > hdata[idvec][phase != p, 2*.N]) + 1, cell_name, sample(cell_name, hdata[idvec][phase != p, 2*.N]))),
68 | by = .(p = phase)
69 | ]$id
70 | out_data <- copy(hdata)[cell_name %in% cell_ids, -'cycling']
71 | # Take cumulative sum of numbers of cells for each phase, to be used in ordering the cells and plotting division lines in the
72 | # heatmap:
73 | setkey(out_data, phase)
74 | const <- out_data[cell_name %in% cell_ids][c('G2/M', 'Intermediate', 'G1/S')][!is.na(cell_name), .(N = .N), by = phase]
75 | const <- const[, setNames(c(0, cumsum(N)), c(phase, 'Not cycling'))]
76 | # Get ordered cell ID list:
77 | cell_ids <- out_data[c('G2/M', 'Intermediate', 'G1/S', 'Not cycling')][
78 | !is.na(cell_name),
79 | .(cell_name = cell_name, cell_order = order(-pmax(g1s_score, g2m_score)) + const[phase]),
80 | by = phase
81 | ][, cell_name[cell_order]]
82 | # Melt heatmap data and order cells:
83 | out_data <- melt(out_data, id.vars = annot_cols, variable.name = 'gene', value.name = 'exp_level')
84 | out_data[, cell_name := factor(cell_name, levels = cell_ids)]
85 | # Order the genes:
86 | setkey(out_data, phase)
87 | if(all(c('G1/S', 'G2/M') %in% out_data$phase)) {
88 | out_data[, gene := factor(gene, levels = out_data[
89 | c('G1/S', 'G2/M'),
90 | .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', phase))]], .(ave_exp = mean(exp_level)), by = gene],
91 | by = phase
92 | ][, .(ordered_genes = gene[order(ave_exp)]), by = phase]$ordered_genes)]
93 | } else { # Already checked there are enough cycling cells in idvec, so can assume that 'G1/S' or 'G2/M' is present
94 | out_data[, gene := factor(gene, levels = out_data[phase %in% c('G1/S', 'G2/M'), .(ave_exp = mean(exp_level)), by = gene][,
95 | Reduce(c, lapply(
96 | c('G1/S', 'G2/M'),
97 | function(p) .SD[gene %in% plot_data[[i]][[tolower(gsub('/', '', p))]]][order(ave_exp), gene]
98 | ))
99 | ])]
100 | }
101 | return(list(data = out_data, vline = const))
102 | }
103 | )
104 | plot_data[[i]]$hdata <- hdata
105 | } else plot_data[[i]]$hdata <- NULL
106 |
107 | }
108 | saveRDS(plot_data, paste0('../data/study_plots/', gsub('/', '-', r[2]), '/', r[1], '/data_cc_consensus.rds'))
109 | }
110 |
--------------------------------------------------------------------------------
/study_plots_data_cna.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(magrittr)
10 | library(Matrix)
11 | library(caTools)
12 | library(stringr)
13 | library(plyr)
14 | library(matkot)
15 |
16 | source('functions.R')
17 |
18 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
19 |
20 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'ensembl_gene_id')
21 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
22 | alias_table <- make_alias_table(hgnc_complete_set)
23 |
24 | gene_positions <- fread('../data/gene_positions.csv')
25 |
26 | # This is how gene_positions was defined:
27 | # library(biomaRt)
28 | # ensembl_mart <- useMart('ensembl')
29 | # ensembl_data <- useDataset('hsapiens_gene_ensembl', mart = ensembl_mart)
30 | # gene_positions <- as.data.table(getBM(attributes = c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position'), mart = ensembl_data))
31 | # fwrite(gene_positions, '../data/gene_positions.csv')
32 |
33 | gene_positions[, c('symbol', 'location') := do.call(`[`, list(hgnc_complete_set, ensembl_gene_id))[, .(symbol, location_sortable)]]
34 | gene_positions <- gene_positions[!is.na(symbol) & chromosome_name %in% c(as.character(1:22), 'X', 'Y')]
35 | gene_positions[, chromosome_name := mapvalues(chromosome_name, as.character(1:9), paste0(0, 1:9))]
36 | setkey(gene_positions, symbol)
37 |
38 |
39 |
40 |
41 |
42 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
43 | ct <- gsub('/', '-', r[2])
44 | out <- lapply(paths, function(p) {
45 |
46 | if(!endsWith(p$expmat, 'mtx')) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
47 |
48 | cat('Preparing data\n')
49 | cells <- suppressWarnings(fread(p$cells, na.strings = ''))
50 | if(!('cell_type' %in% names(cells)) || length(unique(cells$cell_type)) == 1 || !('Malignant' %in% cells$cell_type)) {
51 | return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
52 | }
53 | cells$cell_name <- as.character(cells$cell_name) # In case cell names are the same as row numbers
54 | if('sample' %in% names(cells)) cells$sample <- as.character(cells$sample)
55 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
56 | genes <- fread(p$genes, header = FALSE)$V1
57 | expmat <- readMM(p$expmat)
58 | expmat <- expmat[genes != '', ]
59 | genes <- genes[genes != '']
60 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
61 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
62 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
63 | rownames(expmat) <- genes
64 | genes <- genes[genes %in% gene_positions$symbol]; expmat <- expmat[genes, ]
65 | colnames(expmat) <- cells$cell_name
66 | cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells
67 | if(nrow(cells) < 30) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
68 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
69 |
70 | # Get reference cells, prioritising Fibroblasts, then Endothelial, then Macrophage, then whatever is most abundant.
71 | cat('Defining reference\n')
72 | ref_cts_global <- cells[cell_type != 'Malignant', .(N = .N), keyby = cell_type][N >= 100]
73 | ref_cts_global <- rbind(
74 | ref_cts_global[c('Fibroblast', 'Endothelial', 'Macrophage')],
75 | ref_cts_global[!(cell_type %in% c('Fibroblast', 'Endothelial', 'Macrophage'))][order(-N)]
76 | )[!is.na(N), cell_type[complete.cases(.SD)][1:min(2, .N)]]
77 | if(length(ref_cts_global) < 2) return(list(cna_data = NULL, ref_cells = NULL, ref_range = NULL, path = p))
78 | ref_cells <- cells[cell_type %in% ref_cts_global, .(cell_name = cell_name, cell_type = cell_type, ref_type = 'global')]
79 | ref_cells_smpl <- cells[
80 | cell_type != 'Malignant',
81 | {
82 | ref_cts_smpl <- .SD[, .(N = .N), keyby = cell_type][N >= 100]
83 | ref_cts_smpl <- ref_cts_smpl[c('Fibroblast', 'Endothelial', 'Macrophage'), cell_type[complete.cases(.SD)]]
84 | if(length(ref_cts_smpl) >= 2) .SD[cell_type %in% ref_cts_smpl, .(cell_name = cell_name, cell_type = cell_type)]
85 | },
86 | by = .(ref_type = sample)
87 | ]
88 | if(nrow(ref_cells_smpl) > 0) ref_cells <- rbind(ref_cells, ref_cells_smpl, use.names = TRUE)
89 | rm(ref_cells_smpl)
90 |
91 | cat('Computing CNA values\n')
92 | rms <- rowMeans(expmat)
93 | genes_top <- head(names(rms)[order(-rms)], 5000)
94 | expmat <- expmat[genes_top, ]; rms <- rms[genes_top]
95 | expmat <- expmat - rms # Centre rows/genes
96 | expmat[expmat > 3] <- 3; expmat[expmat < -3] <- -3 # Restrict range to limit influence of extreme values
97 | expdt <- as.data.table(as.matrix(expmat), keep.rownames = 'gene')
98 | expdt <- melt(expdt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value', variable.factor = FALSE)
99 | expdt[, c('chr', 'start_pos') := gene_positions[gene, .(chromosome_name, start_position)]]
100 | expdt <- expdt[order(chr, start_pos)]
101 | expdt[, cna := log2(runmean(2^value, k = 100)), by = .(chr, cell_name)] # Computing the CNA values
102 | expdt[, cna := cna - median(cna), keyby = cell_name]
103 | ref_range <- ref_cells[, do.call(`[`, list(expdt, cell_name))[, .(mean_cna = mean(cna)), by = gene], by = .(ref_type, cell_type)]
104 | ref_range <- ref_range[, .(mn = min(mean_cna), mx = max(mean_cna)), keyby = .(ref_type, gene)]
105 | setkey(cells, cell_name)
106 | expdt[, sample := do.call(`[`, list(cells, cell_name))[, as.character(sample)]]
107 |
108 | cat('Correcting CNA values w.r.t. reference\n')
109 | expdt[, # Correcting the CNA values w.r.t. reference cells
110 | cna := {
111 | rt <- unique(sample)
112 | if(!(rt %in% ref_range$ref_type)) rt <- 'global'
113 | out <- .SD[, .(cna, do.call(`[`, list(ref_range, .(rt, gene)))[, .(mn, mx)])]
114 | out[, ifelse(cna > mx + 0.1, cna - mx - 0.1, ifelse(cna < mn - 0.1, cna - mn + 0.1, 0))]
115 | },
116 | by = sample
117 | ]
118 |
119 | cat('Done!\n\n')
120 | return(list(cna_data = expdt, ref_cells = ref_cells, ref_range = ref_range, path = p))
121 |
122 | })
123 |
124 | if(!all(sapply(out, function(x) is.null(x$cna_data)))) {
125 |
126 | ct <- gsub('/', '-', r[2])
127 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
128 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
129 |
130 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_cna.rds'))
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/study_plots_data_ct_comp.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(ggplot2)
10 | library(magrittr)
11 | library(Matrix)
12 | library(stringr)
13 | library(plyr)
14 | library(matkot)
15 |
16 | source('functions.R')
17 |
18 | canonical_markers <- fread('../data/canonical_markers.csv')
19 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
20 |
21 |
22 |
23 |
24 |
25 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
26 |
27 | out <- lapply(
28 | paths,
29 | function(p) {
30 |
31 | if(endsWith(p$expmat, 'mtx')) {
32 |
33 | cells <- suppressWarnings(fread(p$cells, na.strings = ''))
34 |
35 | cells$cell_name <- as.character(cells$cell_name)
36 |
37 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
38 |
39 | genes <- fread(p$genes, header = FALSE)$V1
40 | expmat <- readMM(p$expmat)
41 | rownames(expmat) <- genes
42 | colnames(expmat) <- cells$cell_name
43 |
44 | # Remove low-complexity cells and normalise to log TPM/10:
45 | cells <- cells[col_nnz(expmat) >= 1000]
46 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
47 |
48 | ct_comp_data_out <- ct_comp_data(expmat, cells, markers = canonical_markers)
49 |
50 | return(list(data = ct_comp_data_out, path = p))
51 |
52 | } else return(list(data = NULL, path = p))
53 |
54 | }
55 | )
56 |
57 | if(!all(sapply(out, function(x) is.null(x$data)))) {
58 |
59 | ct <- gsub('/', '-', r[2])
60 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
61 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
62 |
63 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_comp.RDS'))
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/study_plots_data_ct_umap.R:
--------------------------------------------------------------------------------
1 | # The command line arguments should supply study name and cancer type (in that order).
2 | r = commandArgs(trailingOnly = TRUE)
3 |
4 |
5 |
6 |
7 |
8 | library(data.table)
9 | library(ggplot2)
10 | library(magrittr)
11 | library(Matrix)
12 | library(stringr)
13 | library(plyr)
14 | library(irlba)
15 | library(uwot)
16 | library(matkot)
17 |
18 | source('functions.R')
19 |
20 | paths_table <- fread('../data/paths_table.csv', encoding = 'UTF-8', key = c('study', 'cancer_type'))
21 |
22 |
23 |
24 |
25 |
26 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
27 |
28 | out <- lapply(paths, function(p) {
29 |
30 | if(endsWith(p$expmat, 'mtx')) {
31 |
32 | cells <- suppressWarnings(fread(p$cells, na.strings = ''))
33 |
34 | cells$cell_name <- as.character(cells$cell_name)
35 |
36 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
37 |
38 | genes <- fread(p$genes, header = FALSE)$V1
39 | expmat <- readMM(p$expmat)
40 | rownames(expmat) <- genes
41 | colnames(expmat) <- cells$cell_name
42 |
43 | # Remove low-complexity cells and normalise to log TPM/10:
44 | cells <- cells[complexity >= 1000]
45 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4)
46 |
47 | # Filtered gene list, after removing lowly expressed genes:
48 | if('sample' %in% names(cells)) {
49 | filtered_genes <- cells[
50 | sample %in% cells[, .(N = .N), by = sample][N >= 10, sample],
51 | .(gene = genes[rowMeans(log_transform(expmat[, cell_name], reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06]),
52 | by = sample
53 | ]$gene %>% unique
54 | } else {
55 | filtered_genes <- genes[rowMeans(log_transform(expmat, reverse = TRUE)) >= 1e+05*(2^4 - 1)/1e+06]
56 | }
57 |
58 | ct_umap_data_out <- ct_umap_data(expmat[filtered_genes, ], cells)
59 |
60 | return(list(data = ct_umap_data_out, path = p))
61 |
62 | } else return(NULL)
63 |
64 | })
65 |
66 | if(!all(sapply(out, function(x) is.null(x$data)))) {
67 |
68 | ct <- gsub('/', '-', r[2])
69 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
70 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
71 |
72 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_ct_umap.RDS'))
73 |
74 | }
75 |
--------------------------------------------------------------------------------
/study_plots_data_dist.R:
--------------------------------------------------------------------------------
1 | r = commandArgs(trailingOnly = TRUE)
2 |
3 |
4 |
5 |
6 |
7 | library(data.table)
8 | library(magrittr)
9 | library(readxl)
10 | library(plyr)
11 | library(stringr)
12 | library(Matrix)
13 | library(matkot)
14 |
15 | source('functions.R')
16 |
17 | paths_table <- fread('../data/paths_table.csv', key = c('study', 'cancer_type'))
18 |
19 | hgnc_complete_set <- fread('../data/hgnc_complete_set_2023-04-13.txt', key = 'symbol')
20 | hgnc_complete_set <- hgnc_complete_set[!(ensembl_gene_id %in% names(table(ensembl_gene_id))[table(ensembl_gene_id) > 1])]
21 | alias_table <- make_alias_table(hgnc_complete_set)
22 |
23 | meta_programs <- lapply(
24 | c('Malignant', 'B cells', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'CD4 T cells', 'CD8 T cells'),
25 | function(ct) {
26 | dt <- as.data.table(read_xlsx('../data/meta_programs_2023-07-13.xlsx', sheet = ct))
27 | dt[, (names(dt)) := lapply(dt, update_symbols_fast, alias_table)]
28 | return(dt)
29 | }
30 | )
31 |
32 | # Change some MP names, especially for CD4 and CD8 T cells, so we can merge them into a single T cell category:
33 | names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)] <- gsub('-', ' ', names(meta_programs[[1]])[c(12:15, 30, 32, 34, 39)])
34 | names(meta_programs[[1]])[c(19, 35, 37, 41)] <- c('EpiSen', 'Hemato-related I', 'Hemato-related II', 'MP41 (Unassigned)')
35 | names(meta_programs[[2]])[3] <- 'Cell Cycle'
36 | names(meta_programs[[3]])[c(1, 6)] <- c('Notch signaling', 'Cell Cycle')
37 | names(meta_programs[[4]])[5] <- 'Cell Cycle'
38 | names(meta_programs[[5]])[c(6, 10, 15)] <- c('Cell Cycle', 'Metal response', 'Lipid metabolism')
39 | names(meta_programs[[6]])[c(3, 9, 12)] <- c('Cell Cycle', 'Proteasomal degradation', 'Unfolded protein response')
40 | names(meta_programs[[7]]) <- paste('CD4 -', names(meta_programs[[7]]))
41 | names(meta_programs[[7]])[c(1, 3)] <- c('CD4 - Treg', 'CD4 - Cell Cycle')
42 | names(meta_programs[[8]]) <- paste('CD8 -', names(meta_programs[[8]]))
43 | names(meta_programs[[8]])[c(2, 10)] <- c('CD8 - Cell Cycle', 'CD8 - Heat shock')
44 |
45 | # Merge CD4 and CD8 T cells:
46 | meta_programs[[7]] <- cbind(meta_programs[[7]], meta_programs[[8]])
47 | meta_programs <- meta_programs[1:7]
48 |
49 | cts <- c('Malignant', 'B_cell', 'Endothelial', 'Epithelial', 'Fibroblast', 'Macrophage', 'T_cell')
50 | names(meta_programs) <- cts
51 |
52 |
53 |
54 |
55 |
56 | paths <- apply(paths_table[as.list(r), .(cells, genes, expmat)], 1, as.list, simplify = FALSE)
57 |
58 | out <- lapply(paths, function(p) {
59 |
60 | if(!endsWith(p$expmat, 'mtx')) return(NULL)
61 |
62 | cat('Preparing data\n')
63 | cells <- suppressWarnings(fread(p$cells, na.strings = ''))
64 | if(!all(c('cell_type', 'sample') %in% names(cells))) return(NULL)
65 | cells$cell_name <- as.character(cells$cell_name)
66 | cells$sample <- as.character(cells$sample)
67 | if(cells[, .N > length(unique(cell_name))]) cells[, cell_name := paste(cell_name, .I, sep = '_')]
68 | genes <- fread(p$genes, header = FALSE)$V1
69 | expmat <- readMM(p$expmat)
70 | expmat <- expmat[genes != '', ]
71 | genes <- genes[genes != '']
72 | expmat <- expmat[genes %in% names(table(genes))[table(genes) == 1], ]
73 | genes <- genes[genes %in% names(table(genes))[table(genes) == 1]]
74 | genes <- update_symbols_fast(genes, alias_table) # Update gene symbols
75 | rownames(expmat) <- genes
76 | colnames(expmat) <- cells$cell_name
77 | cells <- cells[col_nnz(expmat) >= 1000] # Remove low-complexity cells
78 | if(nrow(cells) < 30) return(NULL)
79 | expmat <- round(log_transform(1e+05*to_frac(expmat[, cells$cell_name])), 4) # Normalise to log TPM/10
80 |
81 | cat('Assigning MP to each cell\n')
82 | set.seed(4387)
83 | scores <- cells[
84 | cell_type %in% cts,
85 | if(.N >= 100) c( # Cut-off of 100 cells of this cell type in this sample
86 | .(cell_name = cell_name),
87 | slapply(
88 | meta_programs[[unique(cell_type)]],
89 | function(mp) {
90 | if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp)
91 | if(sum(mp %in% rownames(expmat)) >= 30) sig_score(expmat[, cell_name], mp[mp %in% rownames(expmat)], nbin = 50, n = 50)
92 | }
93 | )
94 | ) %>% as.data.table %>% melt(id.vars = 'cell_name', variable.name = 'meta_program', value.name = 'score', variable.factor = FALSE),
95 | by = .(cell_type, sample)
96 | ]
97 | if(nrow(scores) == 0) return(NULL)
98 | # Assign MP to each cell (take the one with highest score):
99 | assignments <- scores[, .(max_score = max(score), meta_program = meta_program[which.max(score)]), by = .(cell_type, sample, cell_name)]
100 | assignments[, a_ct := ifelse(max_score < 1, as.character(NA), meta_program)]
101 | # In each cell type and sample, remove MPs that were assigned to less than 5% of those cells that were given an assignment (i.e. that had max
102 | # score >= 1):
103 | assignments[!is.na(a_ct), a_smpl := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.05], NA, a_ct)}, by = .(cell_type, sample)]
104 | # Similar but more lenient criterion for a_ct, to get rid of spurious assignments:
105 | assignments[!is.na(a_ct), a_ct := {a <- table(a_ct)/.N; ifelse(a_ct %in% names(a)[a < 0.01], NA, a_ct)}, by = .(cell_type, sample)]
106 | setkey(assignments, cell_type, sample)
107 |
108 | cat('Constructing plot data\n')
109 | pie_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
110 | pie_out <- assignments[ct, {N_ct <- .N; .SD[, .(prop = .N/N_ct), by = a_ct]}][order(-prop)]
111 | pie_out[is.na(a_ct), a_ct := 'Unassigned']
112 | pie_out[prop < 0.03, a_ct := 'Rare MPs (each <3%)']
113 | pie_out[a_ct == 'Rare MPs (each <3%)', prop := sum(prop)]
114 | pie_out <- unique(pie_out)
115 | pie_out[, a_ct := factor(a_ct, levels = c(a_ct[!(a_ct %in% c('Rare MPs (each <3%)', 'Unassigned'))], 'Rare MPs (each <3%)', 'Unassigned'))]
116 | pie_out <- pie_out[order(a_ct)]
117 | pie_out[, cumprop := c(0, cumsum(prop)[-.N])]
118 | return(pie_out)
119 | })
120 | bar_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
121 | slapply(assignments[ct, unique(sample)], function(smpl) {
122 | bar_out <- assignments[.(ct, as.character(smpl))][, {N_smpl <- .N; .SD[, .(prop = .N/N_smpl), by = a_smpl]}][!is.na(a_smpl)][order(prop)]
123 | if(nrow(bar_out) == 0) return(NULL)
124 | bar_out[, a_smpl := factor(a_smpl, levels = a_smpl)]
125 | return(bar_out)
126 | })
127 | })
128 | bar_data <- slapply(bar_data, function(x) x[!sapply(x, is.null)])
129 | htmp_data <- slapply(cts[cts %in% assignments$cell_type], function(ct) {
130 | slapply(assignments[ct, unique(sample)], function(smpl) {
131 | if(!(smpl %in% names(bar_data[[ct]]))) return(NULL)
132 | mps <- bar_data[[ct]][[as.character(smpl)]][, levels(a_smpl)]
133 | g <- slapply(meta_programs[[ct]][, ..mps], function(mp) {
134 | if(sum(mp %in% rownames(expmat)) < 10) mp <- str_to_title(mp)
135 | mp[mp %in% rownames(expmat)]
136 | })
137 | asub <- assignments[!is.na(a_smpl)][.(ct, as.character(smpl))]
138 | if(nrow(asub) > 100) asub <- asub[, {n <- floor(100*.N/nrow(asub)); .SD[sample(1:.N, n, replace = FALSE)]}, by = a_smpl]
139 | htmp_out_list <- slapply(mps, function(mp_name) {
140 | mat <- expmat[g[[mp_name]], asub$cell_name, drop = FALSE]
141 | mat_sub <- mat[, asub[a_smpl == mp_name, cell_name], drop = FALSE]
142 | dt <- as.data.table(as.matrix(mat - rowMeans(mat)), keep.rownames = 'gene') # centre per gene and convert to data.table
143 | dt <- melt(dt, id.vars = 'gene', variable.name = 'cell_name', value.name = 'value')[, a_smpl := mp_name]
144 | return(list(data = dt, genes = names(sort(rowMeans(mat_sub))), cells = names(sort(colMeans(mat_sub)))))
145 | })
146 | htmp_out <- rbindlist(lapply(htmp_out_list, `[[`, 'data'))
147 | htmp_out[, c('gene_a', 'a_smpl') := .(paste(gene, gsub(' |-|/|\\(|\\)', '', a_smpl), sep = '_'), factor(a_smpl, levels = rev(mps)))]
148 | cells_ord <- unlist(lapply(htmp_out_list[rev(mps)], `[[`, 'cells'))
149 | genes_ord <- unlist(lapply(mps, function(mp_name) paste(htmp_out_list[[mp_name]]$genes, gsub(' |-|/|\\(|\\)', '', mp_name), sep = '_')))
150 | htmp_out[, c('gene_a', 'cell_name') := .(factor(gene_a, levels = genes_ord), factor(cell_name, levels = cells_ord))]
151 | htmp_out[, gene_num := as.numeric(gene_a)]
152 | return(htmp_out)
153 | })
154 | })
155 | htmp_data <- slapply(htmp_data, function(x) x[!sapply(x, is.null)])
156 |
157 | cat('Done!\n')
158 | return(list(scores = scores, assignments = assignments, pie_data = pie_data, bar_data = bar_data, heatmap_data = htmp_data, path = p))
159 |
160 | })
161 |
162 | if(!all(sapply(out, function(x) is.null(x$assignments)))) {
163 |
164 | ct <- gsub('/', '-', r[2])
165 | if(!(ct %in% dir('../data/study_plots'))) {dir.create(paste0('../data/study_plots/', ct))}
166 | if(!(r[1] %in% dir(paste0('../data/study_plots/', ct)))) {dir.create(paste0('../data/study_plots/', ct, '/', r[1]))}
167 |
168 | saveRDS(out, paste0('../data/study_plots/', ct, '/', r[1], '/data_dist.rds'))
169 |
170 | }
171 |
--------------------------------------------------------------------------------