├── .gitignore ├── .here ├── 0-expression_data_overlap_and_split.R ├── 1-normalize_titrated_data.R ├── 1A-detect_differentially_expressed_genes.R ├── 2-train_test_category.R ├── 2A-small_n_differential_expression.R ├── 3-combine_category_kappa.R ├── 4-ica_pca_feature_reconstruction.R ├── 5-predict_category_reconstructed_data.R ├── 6-save_recon_error_kappa_data.R ├── 7-extract_plier_pathways.R ├── 8-PLIER_pathways_analysis.Rmd ├── 8-PLIER_pathways_analysis.nb.html ├── LICENSE ├── README.md ├── brca_data_urls.txt ├── check_installs.R ├── check_sums.tsv ├── classifier_repeat_wrapper.R ├── combine_clinical_data.R ├── data └── .empty ├── diagrams ├── RNA-seq_titration_ML_overview.png └── RNA-seq_titration_diff_expression_overview.png ├── docker ├── R-3.6.3 │ └── Dockerfile ├── R-4.1.2 │ └── Dockerfile └── README.md ├── download_TCGA_data.sh ├── gdc_legacy_archive_brca_manifest.txt ├── load_packages.R ├── models └── .empty ├── normalized_data ├── .empty └── reconstructed_data │ └── .empty ├── plots ├── .empty ├── data │ └── .empty ├── main │ └── .empty ├── scripts │ ├── .empty │ ├── 0-plot_predictor_category_distributions.R │ ├── 1A-plot_DEGs.R │ ├── 2A-plot_small_n_differential_expression.R │ ├── 3-plot_category_kappa.R │ ├── 6-plot_recon_error.R │ ├── 6-plot_recon_kappa.R │ ├── 7-plot_plier_pathways.R │ ├── recon_kappa_difference.R │ └── visualize_expression.R ├── supplementary │ └── .empty └── visualize_expression │ └── .empty ├── prepare_GBM_data.R ├── results ├── .empty ├── array_rnaseq_ratio │ ├── ratio.2022-02-18_18_50_01_UTC.tsv │ └── ratio.tracking.tsv ├── differential_expression │ └── .empty └── reconstructed_data │ └── .empty ├── retrieve_MC3_mutations.py ├── run_all_analyses_and_plots.sh ├── run_differential_expression_experiments.sh ├── run_experiments.R ├── run_machine_learning_experiments.sh ├── search_geo_arrayexpress.py ├── tcga_tss_codes.csv └── util ├── CrossNorm.R ├── ICA_PCA_reconstruction_functions.R ├── color_blind_friendly_palette.R ├── 
differential_expression_functions.R ├── normalization_functions.R ├── option_functions.R └── train_test_functions.R /.gitignore: -------------------------------------------------------------------------------- 1 | .*bash_history 2 | .config 3 | .local 4 | .refinebio.yaml 5 | .Rhistory 6 | .rstudio 7 | .wget-hsts 8 | data 9 | normalized_data 10 | models 11 | results 12 | plots/main/*.pdf 13 | plots/supplementary/*.pdf 14 | plots/visualize_expression/*.pdf 15 | .Rproj.user 16 | RNAseq_titration_results.Rproj 17 | ._RNAseq_titration_results.Rproj 18 | .DS_Store 19 | -------------------------------------------------------------------------------- /.here: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/.here -------------------------------------------------------------------------------- /0-expression_data_overlap_and_split.R: -------------------------------------------------------------------------------- 1 | # J. 
Taroni Jun 2016 2 | # The purpose of this script is to read in TCGA array and sequencing data, 3 | # to preprocess leaving only overlapping genes and samples with complete 4 | # category information, and to split the data into training and testing sets. 5 | # It should be run from the command line through the run_experiments.R script 6 | 7 | option_list <- list( 8 | optparse::make_option("--cancer_type", 9 | default = NA_character_, 10 | help = "Cancer type"), 11 | optparse::make_option("--predictor", 12 | default = NA_character_, 13 | help = "Predictor used"), 14 | optparse::make_option("--seed1", 15 | default = NA_integer_, 16 | help = "Random seed"), 17 | optparse::make_option("--null_model", 18 | action = "store_true", 19 | default = FALSE, 20 | help = "Permute dependent variable (within subtype if predictor is a gene)") 21 | ) 22 | 23 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 24 | source(here::here("util/option_functions.R")) 25 | check_options(opt) 26 | 27 | # load libraries 28 | suppressMessages(source(here::here("load_packages.R"))) 29 | 30 | # set options; file_identifier gets a "null" suffix when --null_model is given 31 | cancer_type <- opt$cancer_type 32 | predictor <- opt$predictor 33 | null_model <- opt$null_model 34 | file_identifier <- ifelse(null_model, 35 | str_c(cancer_type, predictor, "null", sep = "_"), 36 | str_c(cancer_type, predictor, sep = "_")) 37 | 38 | # set seed 39 | initial.seed <- as.integer(opt$seed1) 40 | set.seed(initial.seed) 41 | # draw the seed for splitting into train/test here, before null_model scramble 42 | split.seed <- sample(1:10000, 1) 43 | 44 | # define directories 45 | data.dir <- here::here("data") 46 | plot.dir <- here::here("plots") 47 | plot.data.dir <- file.path(plot.dir, "data") 48 | res.dir <- here::here("results") 49 | 50 | # name input files 51 | seq.exprs.filename <- paste0(cancer_type, "RNASeq.pcl") 52 | array.exprs.filename <- paste0(cancer_type, "array.pcl") 53 | clin.filename <- paste0("combined_clinical_data.", cancer_type, ".tsv") 54 | 
55 | # name output files 56 | category.distribtion.plot.data <- paste0(file_identifier, 57 | "_dist_split_stacked_bar_", 58 | initial.seed, ".tsv") 59 | train.test.labels <- paste0(file_identifier, 60 | "_matchedSamples_training_testing_split_labels_", 61 | initial.seed, ".tsv") 62 | 63 | #### read in expression and clinical data -------------------------------------- 64 | 65 | # read in expression data as data.frame 66 | seq.data <- fread(file.path(data.dir, seq.exprs.filename), 67 | data.table = FALSE) 68 | array.data <- fread(file.path(data.dir, array.exprs.filename), 69 | data.table = FALSE) 70 | clinical <- fread(file.path(data.dir, clin.filename), 71 | data.table = FALSE) 72 | 73 | # filter clinical data to keep tumor samples with complete data 74 | # if the predictor is subtype, category is a copy of subtype, so subtype is kept under both names 75 | # if the predictor is a gene, we select subtype and the gene (as category) 76 | # this ensures downstream mutation predictions will have subtype available as covariate 77 | clinical <- clinical %>% 78 | mutate(category = !!sym(predictor)) %>% 79 | select(Sample, Type, "subtype", "category") %>% 80 | filter(Type == "tumor") %>% 81 | tidyr::drop_na() 82 | 83 | # change first column name to "gene" 84 | colnames(array.data)[1] <- colnames(seq.data)[1] <- "gene" 85 | 86 | # tumor sample names (first 15 characters of Sample), used to remove tumor-adjacent samples from the array data set 87 | array.tumor.smpls <- clinical$Sample 88 | array.tumor.smpls <- substr(array.tumor.smpls, 1, 15) 89 | 90 | array.category <- clinical$category 91 | 92 | # filter array data only to include tumor samples 93 | array.data <- array.data[, c(1, which(colnames(array.data) %in% 94 | array.tumor.smpls))] 95 | 96 | # what are the overlapping sample names -- "matched" samples? 97 | # includes "gene" column 98 | sample.overlap <- intersect(colnames(array.data), colnames(seq.data)) 99 | 100 | # what are the overlapping genes between the two platforms? 
101 | gene.overlap <- intersect(array.data$gene, seq.data$gene) 102 | 103 | # filter the expression data for matched samples and overlapping genes 104 | array.matched <- array.data[which(array.data$gene %in% gene.overlap), 105 | sample.overlap] 106 | seq.matched <- seq.data[which(seq.data$gene %in% gene.overlap), 107 | sample.overlap] 108 | 109 | # reorder genes on both platforms 110 | array.matched <- array.matched[order(array.matched$gene), ] 111 | seq.matched <- seq.matched[order(seq.matched$gene), ] 112 | 113 | # reorder samples on both platforms (column 1 is "gene" and stays first) 114 | array.matched <- array.matched[, c(1, (order(colnames(array.matched)[-1]) + 1))] 115 | seq.matched <- seq.matched[, c(1, (order(colnames(seq.matched)[-1]) + 1))] 116 | 117 | # check reordering sample names worked as expected 118 | if (any(colnames(array.matched) != colnames(seq.matched))) { 119 | stop("Column name reordering did not work as expected in 0-expression_data_overlap_and_split.R") 120 | } 121 | 122 | # keep category labels for samples with expression data 123 | array.category <- as.factor(array.category[which(array.tumor.smpls %in% 124 | colnames(array.matched))]) 125 | 126 | array.tumor.smpls <- array.tumor.smpls[which(array.tumor.smpls %in% 127 | colnames(array.matched))] 128 | 129 | # remove "unmatched" / "raw" expression data 130 | rm(array.data, seq.data) 131 | 132 | # write matched only samples to pcl files (fixed = TRUE: ".pcl" is a literal suffix, not a regex) 133 | array.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", array.exprs.filename, fixed = TRUE) 134 | array.output.nm <- file.path(data.dir, array.output.nm) 135 | write.table(array.matched, file = array.output.nm, 136 | row.names = FALSE, quote = FALSE, sep = "\t") 137 | 138 | seq.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", seq.exprs.filename, fixed = TRUE) 139 | seq.output.nm <- file.path(data.dir, seq.output.nm) 140 | write.table(seq.matched, file = seq.output.nm, 141 | row.names = FALSE, quote = FALSE, sep = "\t") 142 | 143 | #### split data into balanced training and testing sets ------------------------ 
144 | 145 | # order array category to match the expression data order 146 | array.category <- array.category[order(array.tumor.smpls)] 147 | 148 | message(paste("\nRandom seed for splitting into testing and training:", 149 | split.seed), appendLF = TRUE) 150 | 151 | set.seed(split.seed) 152 | train.index <- unlist(createDataPartition(array.category, times = 1, p = (2/3))) 153 | 154 | #### write training/test labels to file ---------------------------------------- 155 | 156 | lbl <- rep("test", length(array.tumor.smpls)) 157 | lbl[train.index] <- "train" 158 | lbl.df <- tibble(sample = colnames(array.matched)[2:ncol(array.matched)], 159 | split = lbl, 160 | category = as.character(array.category)) 161 | 162 | # add back subtype -- NOTE(review): the join key is the untruncated clinical Sample, while sample holds expression column names matched on the first 15 characters; confirm both use the same ID format 163 | lbl.df <- lbl.df %>% 164 | left_join(clinical %>% 165 | select(Sample, subtype), 166 | by = c("sample" = "Sample")) 167 | 168 | #### permute category labels for null model ------------------------------------ 169 | # this comes after createDataPartition() to ensure same samples go to train/test 170 | # grouping by split keeps the permutation within the training samples; test labels are left unchanged 171 | # if null_model is specified and predicting subtype, permute subtype labels 172 | # if null_model is specified and predicting mutation status, 173 | # permute mutation labels WITHIN subtype 174 | 175 | if (null_model) { 176 | if (predictor == "subtype") { # here, subtype = category 177 | lbl.df <- lbl.df %>% 178 | group_by(split) %>% 179 | mutate(category = case_when(split == "train" ~ sample(category), 180 | split == "test" ~ category)) %>% 181 | ungroup() 182 | } else { # if predictor not subtype, then must be mutation 183 | lbl.df <- lbl.df %>% # subtype = subtype, category = TP53 or PIK3CA 0/1 184 | group_by(split, subtype) %>% # sample within subtype 185 | mutate(category = case_when(split == "train" ~ sample(category), 186 | split == "test" ~ category)) %>% 187 | ungroup() 188 | } 189 | } 190 | 191 | write.table(lbl.df, 192 | file = file.path(res.dir, 
train.test.labels), 193 | quote = FALSE, sep = "\t", row.names = FALSE) 194 | 195 | #### save plot data frame ------------------------------------------------------ 196 | 197 | plot.df <- lbl.df %>% 198 | mutate(split = case_when(split == "train" ~ "Train (2/3)", 199 | split == "test" ~ "Test (1/3)")) %>% 200 | bind_rows(lbl.df %>% mutate(split = "Whole")) %>% 201 | mutate(initial_seed = initial.seed) 202 | 203 | write.table(plot.df, 204 | file = file.path(plot.data.dir, 205 | category.distribtion.plot.data), 206 | quote = FALSE, sep = "\t", row.names = FALSE) 207 | -------------------------------------------------------------------------------- /1-normalize_titrated_data.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Jun 2016 2 | # The purpose of this script is to read in TCGA array and sequencing data, 3 | # already pre-processed to only include test tumor samples, 4 | # (output of 0-expression_data_overlap_and_split.R) and to normalize 5 | # the data. 
6 | # It should be run from the command line through the run_experiments.R script 7 | 8 | option_list <- list( 9 | optparse::make_option("--cancer_type", 10 | default = NA_character_, 11 | help = "Cancer type"), 12 | optparse::make_option("--predictor", 13 | default = NA_character_, 14 | help = "Predictor used"), 15 | optparse::make_option("--seed1", 16 | default = NA_integer_, 17 | help = "Random seed"), 18 | optparse::make_option("--seed2", 19 | default = NA_integer_, 20 | help = "Random seed"), 21 | optparse::make_option("--null_model", 22 | action = "store_true", 23 | default = FALSE, 24 | help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"), 25 | optparse::make_option("--ncores", 26 | default = NA_integer_, 27 | help = "Set the number of cores to use") 28 | ) 29 | 30 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 31 | source(here::here("util/option_functions.R")) 32 | check_options(opt) 33 | 34 | # load libraries 35 | suppressMessages(source(here::here("load_packages.R"))) 36 | source(here::here("util", "normalization_functions.R")) 37 | 38 | # set options; ncores is the smaller of (detected cores - 1) and --ncores, but never below 1 39 | cancer_type <- opt$cancer_type 40 | predictor <- opt$predictor 41 | null_model <- opt$null_model 42 | file_identifier <- ifelse(null_model, 43 | str_c(cancer_type, predictor, "null", sep = "_"), 44 | str_c(cancer_type, predictor, sep = "_")) 45 | ncores <- max(1, min(parallel::detectCores() - 1, 46 | opt$ncores, 47 | na.rm = TRUE)) # makeCluster() needs at least one worker, e.g. on a single-core machine 48 | 49 | # set seed (seed1 only identifies input/output file names; seed2 seeds this script) 50 | filename.seed <- as.integer(opt$seed1) 51 | initial.seed <- as.integer(opt$seed2) 52 | set.seed(initial.seed) 53 | 54 | # define directories 55 | data.dir <- here::here("data") 56 | norm.data.dir <- here::here("normalized_data") 57 | res.dir <- here::here("results") 58 | 59 | # name input files 60 | seq.file <- paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl") 61 | array.file <- paste0(cancer_type, "array_matchedOnly_ordered.pcl") 62 | train.test.file <- paste0(file_identifier, 
63 | "_matchedSamples_training_testing_split_labels_", 64 | filename.seed, ".tsv") 65 | 66 | # name output files 67 | norm.test.object <- paste0(file_identifier, 68 | "_array_seq_test_data_normalized_list_", 69 | filename.seed, ".RDS") 70 | norm.train.object <- paste0(file_identifier, 71 | "_array_seq_train_titrate_normalized_list_", 72 | filename.seed, ".RDS") 73 | 74 | #### read in data -------------------------------------------------------------- 75 | 76 | seq.data <- fread(file.path(data.dir, seq.file), data.table = FALSE) 77 | array.data <- fread(file.path(data.dir, array.file), data.table = FALSE) 78 | sample.train.test <- fread(file.path(res.dir, train.test.file), data.table = FALSE) 79 | 80 | #### split samples, titrate ---------------------------------------------------- 81 | 82 | train.sample.names <- as.character(sample.train.test$sample[ 83 | which(sample.train.test$split == "train")]) 84 | test.sample.names <- as.character(sample.train.test$sample[ 85 | which(sample.train.test$split == "test")]) 86 | 87 | # get samples for 'titration' (one set of names per level, 0% to 100% in steps of 10%) 88 | titration.seed <- sample(1:10000, 1) 89 | message(paste("Random seed for titration:", 90 | titration.seed), appendLF = TRUE) 91 | 92 | set.seed(titration.seed) 93 | titrate.sample.list <- lapply(seq(0, 1, by = 0.1), 94 | function(x) GetTitratedSampleNames(train.sample.names, 95 | x)) 96 | names(titrate.sample.list) <- as.character(seq(0, 100, by = 10)) 97 | 98 | # these samples will be the RNA-seq samples in any given 'titration' experiment 99 | # remove rows whose values are all the same (e.g. all ones) in any 10-100% combination or in the test data 100 | # z-score processing will not work on such rows 101 | seq.dt.list <- lapply(titrate.sample.list, 102 | function(x) seq.data[, c(1, which(colnames(seq.data) %in% x))]) 103 | seq.dt.list[["test"]] <- 104 | seq.data[, c(1, which(colnames(seq.data) %in% test.sample.names))] 105 | all.same.list <- lapply(seq.dt.list[2:12], 106 | function(x){ 107 | vals <- x[, 2:ncol(x)] 108 | indx <- 
which(apply(vals, 1, check_all_same)) 109 | return(indx) 110 | } ) 111 | all.same.indx <- unique(unlist(all.same.list)) 112 | # if no rows have all same value (in previous lapply), all.same.indx is integer(0) 113 | # subsetting data frames by -integer(0) results in no rows 114 | # so check that integer vector has length > 0 before subsetting 115 | if (length(all.same.indx) > 0) { 116 | array.data <- array.data[-all.same.indx, ] 117 | seq.data <- seq.data[-all.same.indx, ] 118 | } 119 | 120 | #### get datatables to mix ----------------------------------------------------- 121 | 122 | # get a list that contains an 123 | # array data.table and seq data.table for each level of 'titration' 124 | array.train <- 125 | data.table(array.data[, 126 | c(1, which(colnames(array.data) %in% train.sample.names))]) 127 | 128 | seq.train <- 129 | data.table(seq.data[, 130 | c(1, which(colnames(seq.data) %in% train.sample.names))]) 131 | 132 | titrate.mix.dt.list <- lapply(titrate.sample.list, 133 | function(x) GetDataTablesForMixing(array.train, 134 | seq.train, x)) 135 | 136 | #### normalize train data ------------------------------------------------------ 137 | 138 | # initialize the list to hold normalized data 139 | norm.titrate.list <- list() 140 | 141 | # single platform array normalization (0% RNA-seq) 142 | norm.titrate.list[["0"]] <- 143 | SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[1]]$array, 144 | platform = "array", 145 | add.untransformed = TRUE, 146 | add.qn.z = TRUE) 147 | 148 | # start parallel backend 149 | cl <- parallel::makeCluster(ncores) 150 | doParallel::registerDoParallel(cl) 151 | 152 | # 'mixed' both platform normalization (elements 2-10, i.e. 10-90% RNA-seq) 153 | norm.titrate.list[2:10] <- 154 | foreach(n = 2:10, .packages = "tidyverse") %dopar% { 155 | NormalizationWrapper(titrate.mix.dt.list[[n]]$array, 156 | titrate.mix.dt.list[[n]]$seq, 157 | add.untransformed = TRUE, 158 | add.qn.z = TRUE, 159 | add.cn = TRUE, 160 | add.seurat.training = TRUE) 161 | } 162 | 163 | # stop parallel backend 
164 | parallel::stopCluster(cl) 165 | # sort out names: label elements 2-10 with their %RNA-seq level (10-90) 166 | names(norm.titrate.list)[2:10] <- names(titrate.mix.dt.list)[2:10] 167 | 168 | # single platform seq normalization (100% RNA-seq) 169 | norm.titrate.list[["100"]] <- 170 | SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[11]]$seq, 171 | platform = "seq", 172 | add.untransformed = TRUE, 173 | add.qn.z = TRUE) 174 | 175 | #### normalize test data ------------------------------------------------------- 176 | array.test <- 177 | data.table(array.data[, 178 | c(1, which(colnames(array.data) %in% test.sample.names))]) 179 | seq.test <- 180 | data.table(seq.data[, c(1, which(colnames(seq.data) %in% test.sample.names))]) 181 | 182 | # array normalization 183 | array.test.norm.list <- 184 | SinglePlatformNormalizationWrapper(array.test, 185 | platform = "array", 186 | add.untransformed = TRUE, 187 | add.qn.z = TRUE, 188 | add.cn.test = TRUE, 189 | add.seurat.test = TRUE, 190 | training.list = norm.titrate.list) 191 | 192 | # seq normalization 193 | # initialize list to hold normalized seq data 194 | seq.test.norm.list <- list() 195 | 196 | # LOG normalization 197 | seq.test.norm.list[["log"]] <- LOGSeqOnly(seq.test) 198 | # NPN 199 | seq.test.norm.list[["npn"]] <- NPNSingleDT(seq.test) 200 | 201 | # start parallel backend 202 | cl <- parallel::makeCluster(ncores) 203 | doParallel::registerDoParallel(cl) 204 | 205 | # QN -- requires reference data 206 | # initialize list to hold QN data 207 | seq.qn.list <- list() 208 | 209 | # for 0% seq - use 0% LOG array data 210 | seq.qn.list[["0"]] <- QNSingleWithRef(ref.dt = norm.titrate.list$`0`$log, 211 | targ.dt = seq.test) 212 | 213 | # for 10-90% seq - use the "raw array" training data at each level of sequencing 214 | # data (this is LOG data, but only the array samples) 215 | seq.qn.list[2:10] <- 216 | foreach(i = 2:10) %dopar% { 217 | QNSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array, 218 | targ.dt = seq.test) 219 | } 220 | names(seq.qn.list)[2:10] <- 
names(norm.titrate.list)[2:10] 221 | 222 | # stop parallel backend 223 | parallel::stopCluster(cl) 224 | 225 | # QN 100% seq by itself (preprocessCore::normalize.quantiles) 226 | seq.qn.list[["100"]] <- QNSingleDT(seq.test) 227 | 228 | # add QN seq data to list of normalized test data 229 | seq.test.norm.list[["qn"]] <- seq.qn.list 230 | rm(seq.qn.list) 231 | 232 | # start parallel backend 233 | cl <- parallel::makeCluster(ncores) 234 | doParallel::registerDoParallel(cl) 235 | 236 | # QN-Z -- requires reference data 237 | # initialize list to hold QN-Z data 238 | seq.qnz.list <- list() 239 | 240 | # for 0% seq - use 0% LOG array data 241 | seq.qnz.list[["0"]] <- QNZSingleWithRef(ref.dt = norm.titrate.list$`0`$log, 242 | targ.dt = seq.test) 243 | 244 | # for 10-90% seq - use the "raw array" training data at each level of sequencing 245 | # data (this is LOG data, but only the array samples) 246 | seq.qnz.list[2:10] <- 247 | foreach(i = 2:10) %dopar% { 248 | QNZSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array, 249 | targ.dt = seq.test) 250 | } 251 | names(seq.qnz.list)[2:10] <- names(norm.titrate.list)[2:10] 252 | 253 | # stop parallel backend 254 | parallel::stopCluster(cl) 255 | 256 | # QNZ 100% seq by itself (preprocessCore::normalize.quantiles) 257 | seq.qnz.list[["100"]] <- QNZSingleDT(seq.test) 258 | 259 | # add QNZ seq data to list of normalized test data 260 | seq.test.norm.list[["qn-z"]] <- seq.qnz.list 261 | rm(seq.qnz.list) 262 | 263 | # start parallel backend 264 | cl <- parallel::makeCluster(ncores) 265 | doParallel::registerDoParallel(cl) 266 | 267 | # TDM normalization -- requires references 268 | # initialize list to hold TDM data 269 | seq.tdm.list <- list() 270 | 271 | # for 0% seq - use 0% LOG array data 272 | seq.tdm.list[["0"]] <- TDMSingleWithRef(ref.dt = norm.titrate.list$`0`$log, 273 | targ.dt = seq.test) 274 | # for 10-90% seq - use the "raw array" training data at each level of sequencing 275 | # data (this is LOG data, but only the 
array samples) 276 | seq.tdm.list[2:10] <- 277 | foreach(i = 2:10) %dopar% { 278 | TDMSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array, 279 | targ.dt = seq.test) 280 | } 281 | names(seq.tdm.list)[2:10] <- names(norm.titrate.list)[2:10] 282 | 283 | # stop parallel backend 284 | parallel::stopCluster(cl) 285 | 286 | # 100% is not applicable for TDM; assigning list(NULL) keeps a "100" entry whose value is NULL 287 | seq.tdm.list["100"] <- list(NULL) 288 | 289 | # add TDM seq data to list of normalized test data 290 | seq.test.norm.list[["tdm"]] <- seq.tdm.list 291 | rm(seq.tdm.list) 292 | 293 | # z-score seq test data 294 | seq.test.norm.list[["z"]] <- ZScoreSingleDT(seq.test) 295 | 296 | # untransformed seq test data 297 | seq.test.norm.list[["un"]] <- seq.test 298 | 299 | # CrossNorm RNA-seq test 300 | # Rescale each column, quantile normalize, then rescale each row 301 | seq.test.norm.list[["qn (cn)"]] <- rescale_datatable(seq.test, 302 | by_column = TRUE) %>% 303 | QNSingleDT(zero.to.one = TRUE) 304 | 305 | # Seurat RNA-seq test 306 | # for 10-90% seq - use the integrated training data at each %RNA-seq 307 | # (levels without a seurat_model, or whose projection raises an error, yield NULL) 308 | # start parallel backend 309 | cl <- parallel::makeCluster(ncores) 310 | doParallel::registerDoParallel(cl) 311 | 312 | seq.seurat.list <- foreach(i = 2:10, .packages = "tidyverse") %dopar% { # 2:10 corresponds to 10%-90% 313 | 314 | if (!is.null(norm.titrate.list[[i]][["seurat_model"]])) { 315 | 316 | tryCatch(SeuratProjectPCATestData(seq.test, 317 | norm.titrate.list[[i]][["seurat_model"]], 318 | vbose = TRUE), 319 | error = function(e) NULL) 320 | 321 | } else { 322 | NULL 323 | } 324 | 325 | } 326 | 327 | names(seq.seurat.list) <- names(norm.titrate.list)[2:10] # 2:10 corresponds to 10%-90% 328 | 329 | # stop parallel backend 330 | parallel::stopCluster(cl) 331 | 332 | # add Seurat RNA-seq test data to list of normalized test data 333 | seq.test.norm.list[["seurat"]] <- seq.seurat.list 334 | rm(seq.seurat.list) 335 | 336 | # combine array and seq test data into a list 337 | test.norm.list <- list(array = 
array.test.norm.list, 338 | seq = seq.test.norm.list) 339 | 340 | # save test data 341 | saveRDS(test.norm.list, file = file.path(norm.data.dir, norm.test.object)) 342 | 343 | # save train data after removing Seurat models (just keep Seurat-normed data) 344 | for (n in names(norm.titrate.list)) { 345 | if ("seurat_model" %in% names(norm.titrate.list[[n]])) { 346 | norm.titrate.list[[n]][["seurat_model"]] <- NULL 347 | } 348 | } 349 | 350 | saveRDS(norm.titrate.list, file = file.path(norm.data.dir, norm.train.object)) 351 | -------------------------------------------------------------------------------- /1A-detect_differentially_expressed_genes.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Jan 2017 2 | # The purpose of this analysis is to identify differentially expressed genes 3 | # between one subtype, specified by the user, and all 4 | # other subtypes (and between two user-specified subtypes) using the limma package for varying amounts of RNA-seq data 5 | # (0-100%, 10% added at a time; termed 'RNA-seq titration') and normalization 6 | # methods. It takes RNA-seq and microarray data from matched samples as input, 7 | # and performs RNA-seq titration and differential expression analysis. 8 | # 9 | # USAGE: Rscript 1A-detect_differentially_expressed_genes.R --cancer_type --subtype_vs_others --subtype_vs_subtype --seed --ncores 10 | 11 | option_list <- list( 12 | optparse::make_option("--cancer_type", 13 | default = NA_character_, 14 | help = "Cancer type"), 15 | optparse::make_option("--subtype_vs_others", 16 | default = NA_character_, 17 | help = "Subtype used for comparison against all others"), 18 | optparse::make_option("--subtype_vs_subtype", 19 | default = NA_character_, 20 | help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. 
Type1,Type2)"), 21 | optparse::make_option("--seed", 22 | default = 98, 23 | help = "Random seed [default: %default]"), 24 | optparse::make_option("--ncores", 25 | default = NA_integer_, 26 | help = "Set the number of cores to use") 27 | ) 28 | 29 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 30 | source(here::here("util/option_functions.R")) 31 | check_options(opt) 32 | 33 | # load libraries 34 | suppressMessages(source(here::here("load_packages.R"))) 35 | source(here::here("util", "normalization_functions.R")) 36 | source(here::here("util", "differential_expression_functions.R")) 37 | 38 | # set options; two_subtypes is --subtype_vs_subtype split on commas 39 | cancer_type <- opt$cancer_type 40 | subtype_vs_others <- opt$subtype_vs_others 41 | subtype_vs_subtype <- opt$subtype_vs_subtype 42 | two_subtypes <- as.vector(stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE)) 43 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here 44 | ncores <- max(1, min(parallel::detectCores() - 1, 45 | opt$ncores, 46 | na.rm = TRUE)) # makeCluster() needs at least one worker, e.g. on a single-core machine 47 | 48 | # set seed 49 | initial.seed <- opt$seed 50 | set.seed(initial.seed) 51 | message(paste("\nInitial seed set to:", initial.seed)) 52 | 53 | # define directories 54 | data.dir <- here::here("data") 55 | res.dir <- here::here("results") 56 | norm.dir <- here::here("normalized_data") 57 | deg.dir <- file.path(res.dir, "differential_expression") 58 | plot.data.dir <- here::here("plots/data") 59 | 60 | # define input files 61 | seq.file <- file.path(data.dir, 62 | paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl")) 63 | array.file <- file.path(data.dir, 64 | paste0(cancer_type, "array_matchedOnly_ordered.pcl")) 65 | smpl.file <- file.path(res.dir, 66 | list.files(res.dir, # this finds the first example of a subtypes file from cancer_type 67 | pattern = paste0(file_identifier, # and does not rely on knowing a seed 68 | "_matchedSamples_training_testing_split_labels_"))[1]) 69 | 70 | # define output 
files 71 | subtype_vs_others_lead <- paste0(file_identifier, 72 | "_titration_differential_exp_eBayes_fits_", 73 | subtype_vs_others, "vOther") 74 | two_subtypes_lead <- paste0(file_identifier, 75 | "_titration_differential_exp_eBayes_fits_", 76 | stringr::str_c(two_subtypes, collapse = "v")) 77 | 78 | subtype_vs_others.rds <- file.path(deg.dir, 79 | paste0(subtype_vs_others_lead, ".RDS")) 80 | two_subtypes.rds <- file.path(deg.dir, 81 | paste0(two_subtypes_lead, ".RDS")) 82 | 83 | subtype_vs_others.propDE_file <- file.path(plot.data.dir, 84 | paste0(subtype_vs_others_lead, 85 | ".propDE.tsv")) 86 | two_subtypes.propDE_file <- file.path(plot.data.dir, 87 | paste0(two_subtypes_lead, 88 | ".propDE.tsv")) 89 | 90 | subtype_vs_others.silver_file <- file.path(plot.data.dir, 91 | paste0(subtype_vs_others_lead, 92 | ".silver.tsv")) 93 | two_subtypes.silver_file <- file.path(plot.data.dir, 94 | paste0(two_subtypes_lead, 95 | ".silver.tsv")) 96 | 97 | norm.rds <- file.path(norm.dir, 98 | paste0(file_identifier, 99 | "_titration_no_ZTO_transform_with_UN.RDS")) 100 | 101 | #### read in data -------------------------------------------------------------- 102 | 103 | seq.data <- data.table::fread(seq.file, data.table = FALSE) 104 | array.data <- data.table::fread(array.file, data.table = FALSE) 105 | sample.df <- read.delim(smpl.file) 106 | 107 | # check that the requested subtypes appear in the category column of sample.df 108 | for(subtype in c(subtype_vs_others, two_subtypes)) { 109 | if (!(subtype %in% sample.df$category)) { 110 | stop(paste("Subtype", subtype, "not found in sample file", 111 | smpl.file, "in 1A-detect_differentially_expressed_genes.R.")) 112 | } 113 | } 114 | 115 | sample.names <- sample.df$sample 116 | 117 | #### RNA-seq 'titration' ------------------------------------------------------- 118 | 119 | titration.seed <- sample(1:10000, 1) 120 | message(paste("Random seed for titration:", 121 | titration.seed), appendLF=TRUE) 122 | 123 | set.seed(titration.seed) 124 | # these samples will be the RNA-seq 
samples in any given 'titration' experiment 125 | titrate.sample.list <- 126 | lapply(seq(0, 1, by = 0.1), 127 | function(x) GetTitratedSampleNames(sample.names, x)) 128 | 129 | # remove rows whose values are all the same (e.g. all ones) in sequencing data -- these are 130 | # essentially missing values and cause issues with z-transformation 131 | seq.dt.list <- 132 | lapply(titrate.sample.list, 133 | function(x) seq.data[, c(1, which(colnames(seq.data) %in% x))]) 134 | all.same.list <- lapply(seq.dt.list[2:11], 135 | function(x){ 136 | vals <- x[, 2:ncol(x)] 137 | indx <- which(apply(vals, 1, check_all_same)) 138 | return(indx) 139 | } ) 140 | all.same.indx <- unique(unlist(all.same.list)) 141 | # if no rows are all same (in previous lapply), all.same.indx is integer(0) 142 | # subsetting data frames by -integer(0) results in no rows 143 | # so check that integer vector has length > 0 before subsetting 144 | if (length(all.same.indx) > 0) { 145 | array.data <- array.data[-all.same.indx, ] 146 | seq.data <- seq.data[-all.same.indx, ] 147 | } 148 | 149 | # get a list that contains an array data.table and seq data.table for 150 | # each level of 'titration' 151 | titrate.mix.dt.list <- 152 | lapply(titrate.sample.list, 153 | function(x) GetDataTablesForMixing(data.table(array.data), 154 | data.table(seq.data), 155 | x)) 156 | names(titrate.mix.dt.list) <- as.character(seq(0, 100, by=10)) 157 | 158 | #### normalize data ------------------------------------------------------------ 159 | 160 | # initialize the list to hold normalized data 161 | norm.titrate.list <- list() 162 | 163 | # single platform array normalization (0% RNA-seq) -- NOTE(review): unlike the 10-100% calls, add.untransformed is not passed here; confirm this is intended 164 | norm.titrate.list[["0"]] <- 165 | SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[1]]$array, 166 | platform = "array", 167 | zto = FALSE, 168 | add.qn.z = TRUE) 169 | 170 | # start parallel backend 171 | cl <- parallel::makeCluster(ncores) 172 | doParallel::registerDoParallel(cl) 173 | 174 | # 'mixed' both platform normalization 175 | 
norm.titrate.list[2:10] <- 176 | foreach(n = 2:10) %dopar% { 177 | NormalizationWrapper(titrate.mix.dt.list[[n]]$array, 178 | titrate.mix.dt.list[[n]]$seq, 179 | zto = FALSE, 180 | add.untransformed = TRUE, 181 | add.qn.z = TRUE) 182 | } 183 | names(norm.titrate.list)[2:10] <- names(titrate.mix.dt.list)[2:10] 184 | 185 | # stop parallel backend 186 | parallel::stopCluster(cl) 187 | 188 | # single platform seq normalization (100% RNA-seq) 189 | norm.titrate.list[["100"]] <- 190 | SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[11]]$seq, 191 | platform = "seq", 192 | zto = FALSE, 193 | add.untransformed = TRUE, 194 | add.qn.z = TRUE) 195 | 196 | # save normalized data 197 | saveRDS(norm.titrate.list, file = norm.rds) 198 | 199 | #### Subtype v. Others -------------------------------------------------------- 200 | # design matrices 201 | design.mat.list <- GetDesignMatrixList(norm.titrate.list, sample.df, 202 | subtype = subtype_vs_others) 203 | # differential expression 204 | fit.results.list <- GetFiteBayesList(norm.list = norm.titrate.list, 205 | design.list = design.mat.list) 206 | # save fit results to RDS 207 | saveRDS(fit.results.list, file = subtype_vs_others.rds) 208 | 209 | # get top.table.list (adjust.method is reused below so both steps share one correction method) 210 | adjust.method <- "BH" 211 | subtype_vs_others.top.table.list <- 212 | lapply(fit.results.list, # for each level of % seq 213 | function(x) 214 | lapply(x, # for each normalization method 215 | function(y) GetAllGenesTopTable(y, adjust = adjust.method))) 216 | 217 | # write proportion DE to plot data directory 218 | subtype_vs_others.proportion_de <- GetDataProportionDE( 219 | subtype_vs_others.top.table.list, 220 | adjust.method = adjust.method, cutoff = 0.05) 221 | 222 | write.table(x = subtype_vs_others.proportion_de, 223 | file = subtype_vs_others.propDE_file, 224 | quote = FALSE, sep = "\t", row.names = FALSE) 225 | 226 | # write stats for comparison to silver standard to plot data directory 227 | subtype_vs_others.silver <- 
GetDataSilverStandardStats( 228 | subtype_vs_others.top.table.list, 229 | cutoff = 0.05) 230 | 231 | write.table(subtype_vs_others.silver, 232 | file = subtype_vs_others.silver_file, 233 | quote = FALSE, sep = "\t", row.names = FALSE) 234 | 235 | #### Subtype v. Subtype -------------------------------------------------------- 236 | # remove all samples that are not in these subtypes 237 | samples.to.keep <- 238 | sample.df$sample[which(sample.df$category %in% two_subtypes)] 239 | 240 | pruned.norm.list <- 241 | lapply(norm.titrate.list, 242 | function(x) lapply(x, 243 | function(y) y[, 244 | c(1, which(colnames(y) %in% 245 | samples.to.keep)), 246 | with = FALSE])) 247 | 248 | # get design matrices 249 | last_subtype.design.list <- GetDesignMatrixList(pruned.norm.list, 250 | sample.df, 251 | subtype = last(two_subtypes)) 252 | # differential expression 253 | last_subtype.fit.results.list <- GetFiteBayesList(norm.list = pruned.norm.list, 254 | design.list = last_subtype.design.list) 255 | 256 | # save fit results to file 257 | saveRDS(last_subtype.fit.results.list, 258 | file = two_subtypes.rds) 259 | 260 | # get top.table.list 261 | adjust.method <- "BH" 262 | two_subtypes.top.table.list <- 263 | lapply(last_subtype.fit.results.list, # for each level of % seq 264 | function(x) 265 | lapply(x, # for each normalization method 266 | function(y) GetAllGenesTopTable(y, adjust = adjust.method))) 267 | 268 | # write proportion DE to plot data directory 269 | two_subtypes.proportion_de <- GetDataProportionDE(two_subtypes.top.table.list, 270 | adjust.method = "BH", cutoff = 0.05) 271 | 272 | write.table(x = two_subtypes.proportion_de, 273 | file = two_subtypes.propDE_file, 274 | quote = FALSE, sep = "\t", row.names = FALSE) 275 | 276 | # write stats for comparison to silver standard to plot data directory 277 | two_subtypes.silver <- GetDataSilverStandardStats( 278 | two_subtypes.top.table.list, 279 | cutoff = 0.05) 280 | 281 | write.table(two_subtypes.silver, 282 | file = 
two_subtypes.silver_file,
            quote = FALSE, sep = "\t", row.names = FALSE)
--------------------------------------------------------------------------------
/2-train_test_category.R:
--------------------------------------------------------------------------------
# J. Taroni Jul 2016
# The purpose of this script is to train LASSO, linear SVM, and random forest
# predictive models on normalized and mixed array and RNA-seq data
# (output of 1-normalize_titrated_data.R) and then to perform predictions on
# normalized test data.
# It should be run from the command line through the run_experiments.R script
#
# Inputs (read from normalized_data/ and results/, keyed by --seed1):
#   - normalized training list, normalized test list, train/test label table
# Outputs:
#   - models/<id>_train_3_models_<seed1>.RDS
#   - results/<id>_train_3_models_{training_set_total,array,seq}_kappa_<seed1>.tsv

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--seed1",
                        default = NA_integer_,
                        help = "Random seed"),
  optparse::make_option("--seed3",
                        default = NA_integer_,
                        help = "Random seed"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"),
  optparse::make_option("--ncores",
                        default = NA_integer_,
                        help = "Set the number of cores to use")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(source(here::here("load_packages.R")))
source(here::here("util", "train_test_functions.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
# null models get their own file prefix so they never overwrite real models
if (null_model) {
  file_identifier <- str_c(cancer_type, predictor, "null", sep = "_")
} else {
  file_identifier <- str_c(cancer_type, predictor, sep = "_")
}
# leave one core free, honor --ncores, but never ask for fewer than one worker
# (detectCores() - 1 is 0 on a single-core machine and makeCluster(0) fails)
ncores <- max(1,
              min(parallel::detectCores() - 1,
                  opt$ncores,
                  na.rm = TRUE))

# set seed
filename.seed <- opt$seed1  # identifies which replicate's files to use
initial.seed <- opt$seed3   # seeds the RNG for this script
set.seed(initial.seed)

# define directories
norm.data.dir <- here::here("normalized_data")
mdl.dir <- here::here("models")
res.dir <- here::here("results")

# define input files
norm.test.object <- paste0(file_identifier,
                           "_array_seq_test_data_normalized_list_",
                           filename.seed, ".RDS")
norm.train.object <- paste0(file_identifier,
                            "_array_seq_train_titrate_normalized_list_",
                            filename.seed, ".RDS")
train.test.labels <- paste0(file_identifier,
                            "_matchedSamples_training_testing_split_labels_",
                            filename.seed, ".tsv")

# define output files
trained.models.object <- paste0(file_identifier,
                                "_train_3_models_",
                                filename.seed, ".RDS")
train.kappa.file <- file.path(res.dir,
                              paste0(file_identifier,
                                     "_train_3_models_training_set_total_kappa_",
                                     filename.seed, ".tsv"))
array.kappa.file <- file.path(res.dir,
                              paste0(file_identifier,
                                     "_train_3_models_array_kappa_",
                                     filename.seed, ".tsv"))
seq.kappa.file <- file.path(res.dir,
                            paste0(file_identifier,
                                   "_train_3_models_seq_kappa_",
                                   filename.seed, ".tsv"))

#### load data -----------------------------------------------------------------

sample.train.test <- fread(file.path(res.dir, train.test.labels),
                           data.table = FALSE)
norm.titrate.list <- readRDS(file.path(norm.data.dir, norm.train.object))
norm.test.list <- readRDS(file.path(norm.data.dir, norm.test.object))

# set each category as a factor
sample.train.test$category <- as.factor(sample.train.test$category)

# category levels for each perc of seq data
category.norm.list <- lapply(norm.titrate.list,
                             function(x) GetOrderedCategoryLabels(x$z,
                                                                  sample.train.test))

# restructure normalized list so that it's organized by normalization method
restr.train.list <- RestructureNormList(norm.titrate.list)
rm(norm.titrate.list)

#### training ------------------------------------------------------------------

folds.seed <- sample(1:10000, 1)
message(paste("Random seed for createFolds:", folds.seed), appendLF = TRUE)
set.seed(folds.seed)
folds.list <- lapply(category.norm.list, function(x) createFolds(x, k = 5))

# parallel backend
cl <- parallel::makeCluster(ncores)
registerDoParallel(cl)

resample.seed <- sample(1:10000, 1)
message(paste("Random seed for resampling:", resample.seed), appendLF = TRUE)

# outer loop is sequential (one normalization method at a time); the inner
# loop over % seq levels is what gets distributed to the workers
train.model.list <-
  foreach(n = seq_along(restr.train.list)) %do% {  # foreach norm method
    foreach(m = seq_along(category.norm.list)) %dopar% {  # foreach % seq level
      TrainThreeModels(dt = restr.train.list[[n]][[m]],
                       category = category.norm.list[[m]],
                       seed = resample.seed,
                       folds.list = folds.list[[m]])

    }
  }

# stop parallel backend
stopCluster(cl)

# get names
names(train.model.list) <- names(restr.train.list)
# SIMPLIFY must be FALSE: every per-method result has the same length, so
# SIMPLIFY = TRUE would collapse the nested list into a list-matrix and the
# `[["tdm"]]`-style lookups below would no longer work
train.model.list <- mapply(
  function(x, y) {
    names(x) <- names(y)
    return(x)
  }, x = train.model.list,
  y = restr.train.list,
  SIMPLIFY = FALSE)

# restructure trained model list so from top to bottom: norm method -> model
# type -> % seq level (0 - 100)
train.model.list <- RestructureTrainedList(train.model.list)

# save predictive models
saveRDS(train.model.list, file = file.path(mdl.dir, trained.models.object))

#### training kappa ---------------------------------------------------------
# get rid of 0, 100 tdm list, they're NULL
restr.train.list$tdm$`0` <- NULL
restr.train.list$tdm$`100` <- NULL

# get training kappa stats and write to file
train.kappa.df <- PredictWrapper(train.model.list = train.model.list,
                                 pred.list = restr.train.list,
                                 sample.df = sample.train.test,
                                 only.kap = TRUE)

write.table(train.kappa.df, file = train.kappa.file, sep = "\t",
            row.names = FALSE, quote = FALSE)

#### predictions - test data ---------------------------------------------------

# get predictions on array test data as a data frame
array.kappa.df <- PredictWrapper(train.model.list = train.model.list,
                                 pred.list = norm.test.list$array,
                                 sample.df = sample.train.test,
                                 only.kap = TRUE)

write.table(array.kappa.df, file = array.kappa.file, sep = "\t",
            row.names = FALSE, quote = FALSE)

# for the 0 perc seq level of the titration, the model trained on log
# transformed array data (100% array data) should be tested on the TDM
# transformed seq data
for (i in seq_along(train.model.list[["tdm"]])) {
  train.model.list[["tdm"]][[i]]$`0` <- train.model.list[["log"]][[i]]$`0`
  # the borrowed `0` model was appended last (position 10); move it to the front
  train.model.list[["tdm"]][[i]] <- train.model.list[["tdm"]][[i]][c(10, 1:9)]
}

# get rid of 100 tdm list, it's NULL
norm.test.list$seq$tdm$`100` <- NULL

# get predictions on RNA-seq test data as a data frame
seq.kappa.df <- PredictWrapper(train.model.list = train.model.list,
                               pred.list = norm.test.list$seq,
                               sample.df = sample.train.test,
                               only.kap = TRUE)

write.table(seq.kappa.df, file = seq.kappa.file, sep = "\t",
            row.names = FALSE, quote = FALSE)
--------------------------------------------------------------------------------
/2A-small_n_differential_expression.R:
--------------------------------------------------------------------------------
# J.
# Taroni Feb 2016
# The purpose of this analysis is to examine how normalization methods
# (quantile normalization or z-transformation) perform wrt differential
# expression when there are a small number of samples on each platform
#
# USAGE: Rscript 2A-small_n_differential_expression.R --cancer_type --subtype_vs_subtype --ncores
#
# Output: plots/data/<cancer_type>_subtype_small_n_<A>v<B>_results.tsv, a long
# table of silver-standard comparison statistics per platform, normalization
# method, sample size, trial iteration and % RNA-seq.

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--subtype_vs_subtype",
                        default = NA_character_,
                        help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"),
  optparse::make_option("--seed",
                        default = 3255,
                        help = "Random seed"),
  optparse::make_option("--ncores",
                        default = NA_integer_,
                        help = "Set the number of cores to use")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(source(here::here("load_packages.R")))
source(here::here("util", "normalization_functions.R"))
source(here::here("util", "differential_expression_functions.R"))
source(here::here("util", "color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
subtype_vs_subtype <- opt$subtype_vs_subtype
two_subtypes <- as.vector(stringr::str_split(subtype_vs_subtype,
                                             pattern = ",",
                                             simplify = TRUE))
# we are only working with subtype models here
file_identifier <- str_c(cancer_type, "subtype", sep = "_")
# leave one core free, honor --ncores, but never ask for fewer than one worker
# (detectCores() - 1 is 0 on a single-core machine and makeCluster(0) fails)
ncores <- max(1,
              min(parallel::detectCores() - 1,
                  opt$ncores,
                  na.rm = TRUE))

# set seed
initial.seed <- opt$seed
set.seed(initial.seed)

# set additional random seeds for reproducibility within foreach dopar loops
random_seeds <- sample(1:10000, size = 9)

message(paste("\nInitial seed set to:", initial.seed))

# define directories
data.dir <- here::here("data")
res.dir <- here::here("results")
deg.dir <- file.path(res.dir, "differential_expression")
plot.dir <- here::here("plots")
plot.data.dir <- file.path(plot.dir, "data")

# define input files
seq.file <- file.path(data.dir,
                      paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl"))
array.file <- file.path(data.dir,
                        paste0(cancer_type, "array_matchedOnly_ordered.pcl"))
# this finds the first example of a subtypes file from cancer_type
# and does not rely on knowing a seed
smpl.candidates <-
  list.files(res.dir,
             pattern = paste0(file_identifier,
                              "_matchedSamples_training_testing_split_labels_"))
# without this check a missing file would silently become "<res.dir>/NA" and
# only fail later with an unrelated-looking read error
if (length(smpl.candidates) == 0) {
  stop(paste("No sample label file found for", file_identifier, "in",
             res.dir, "in 2A-small_n_differential_expression.R."))
}
smpl.file <- file.path(res.dir, smpl.candidates[1])

#### read in data --------------------------------------------------------------

seq.data <- data.table::fread(seq.file, data.table = FALSE)
array.data <- data.table::fread(array.file, data.table = FALSE)
sample.df <- read.delim(smpl.file)

# check that subtypes are in sample.df
for (subtype in two_subtypes) {
  if (!(subtype %in% sample.df$category)) {
    stop(paste("Subtype", subtype, "not found in sample file",
               smpl.file, "in 2A-small_n_differential_expression.R."))
  }
}

sample.names <- sample.df$sample

#### main ----------------------------------------------------------------------

# leave only subtypes of interest to choose from & make data.table
# remove all samples that are not subtypes of interest
samples.to.keep <-
  sample.df$sample[which(sample.df$category %in% two_subtypes)]

# column 1 is kept alongside the retained sample columns
array.dt <- data.table(array.data[,
                                  c(1, which(colnames(array.data) %in%
                                               samples.to.keep))])
seq.dt <- data.table(seq.data[,
                              c(1, which(colnames(seq.data) %in%
                                           samples.to.keep))])
sample.df <- sample.df[which(sample.df$sample %in% samples.to.keep), ]

smaller_subtype_size <- min(table(as.character(sample.df$category)))

# different sizes of n to test
no.samples <- c(3, 4, 5, 6, 8, 10, 15, 25, 50)
no.samples <- no.samples[which(no.samples <= smaller_subtype_size)]

message(paste("Smaller subtype has", smaller_subtype_size, "samples,",
              "so using up to", max(no.samples), "samples in 2A-small_n_differential_expression.R"))

# initialize list to hold Jaccard, Rand, Spearman data from the 10 trials
stats.df.list <- list()

# Do this at 10-90% RNA-seq titration levels
# parallel backend
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)

# at each mixed titration level (10-90% RNA-seq)
stats.df.list[1:9] <- foreach(seq_prop = seq(0.1, .9, 0.1), .packages = c("tidyverse")) %dopar% {

  # random_seeds indexed by 1 through 9, corresponding to seq_prop 0.1 through 0.9
  # round() because seq_prop * 10 is a floating point product and R truncates
  # non-integer indices, so a value just under an integer would pick the
  # wrong seed
  set.seed(random_seeds[round(seq_prop * 10)])

  # we're going to repeat the small n experiment 10 times
  stats.df.iter_list <- list() # this is returned to stats.df.list each iteration
  for (trial.iter in 1:10) {

    # for each n (3...50), get the sample names that will be included in the
    # experiment and on each platform
    sample.list <-
      lapply(no.samples, # for each n (3...50)
             function(x) GetSamplesforMixingSmallN(x, sample.df,
                                                   subtype = data.table::last(two_subtypes),
                                                   seq_proportion = seq_prop))

    # initialize list to hold differential expression results (eBayes output)
    master.deg.list <- list()

    for (smpl.no.iter in seq_along(sample.list)) { # for each n (3...50)
      # normalize data
      n_array <- length(sample.list[[smpl.no.iter]]$array)
      n_seq <- length(sample.list[[smpl.no.iter]]$seq)

      # require at least three array and seq samples
      if (n_array >= 3 && n_seq >= 3) {
        norm.list <- SmallNNormWrapper(array.dt = array.dt,
                                       seq.dt = seq.dt,
                                       mix.list = sample.list[[smpl.no.iter]],
                                       zto = FALSE)
        # perform differential expression analysis
        master.deg.list[[as.character(no.samples[smpl.no.iter])]] <-
          SmallNDEGWrapper(norm.list = norm.list, sample.df = sample.df,
                           subtype = data.table::last(two_subtypes))
      }
    }

    top.table.list <-
      lapply(master.deg.list, # for each n (3...50)
             function(x) # for each normalization method
               lapply(x, function(y) GetAllGenesTopTable(y))) # extract DEGs

    # how do the (100-X)%/X% array/seq differentially expressed genes compared to
    # the platform-specific standards?
    # skipped when no sample size had >= 3 samples on both platforms
    if (length(top.table.list) > 0) {
      stats.df.iter_list[[trial.iter]] <- GetSmallNSilverStandardStats(top.table.list,
                                                                       cutoff = 0.1)
    }
  }
  stats.df.iter_list # return stats.df.iter_list to stats.df.list
}

# stop parallel backend
parallel::stopCluster(cl)

# renames list levels
names(stats.df.list)[1:9] <- as.character(seq(10, 90, 10))

# combine jaccard similarity data.frames into one data.frame
subtypes_combination <- stringr::str_c(two_subtypes, collapse = "v")

# melting the nested list (seq_prop -> iteration -> data.frame) appends one
# column per nesting level; they are renamed to iteration and seq_prop below
stats.df <- reshape2::melt(stats.df.list,
                           id.vars = c("platform", "normalization", "no.samples"))
names(stats.df) <- c("platform", "normalization", "no.samples", "measure", "value",
                     "iteration", "seq_prop")
stats.df <- stats.df %>%
  mutate(seq_prop = factor(str_c(seq_prop, "% RNA-seq"),
                           levels = str_c(seq(0, 100, 10), "% RNA-seq")))

write.table(stats.df,
            file = file.path(plot.data.dir,
                             paste0(file_identifier,
                                    "_small_n_",
                                    subtypes_combination,
                                    "_results.tsv")),
            sep = "\t", quote = FALSE, row.names = FALSE)
--------------------------------------------------------------------------------
/3-combine_category_kappa.R:
--------------------------------------------------------------------------------
# J. Taroni Jul 2016
# The purpose of this script is to combine and save Kappa statistics from category
# predictions on hold-out data. It should be run from the command line
# through the classifier_repeat_wrapper.R script or alternatively
# USAGE: Rscript 3-combine_category_kappa.R --cancer_type --predictor --null_model
#
# With --null_model, each replicate's kappa is reported as a delta against the
# matching null-model replicate (true kappa - null kappa) and the output file
# names carry "delta_kappa" instead of "kappa".

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Use null model as baseline for plotting delta kappa")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(source(here::here("load_packages.R")))
source(here::here("util", "color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- file.path(plot.dir, "data")
res.dir <- here::here("results")

# list array and seq files from results directory
lf <- list.files(res.dir, full.names = TRUE)
array.files <- lf[grepl(paste0(file_identifier,
                               "_train_3_models_array_kappa_"), lf)]
seq.files <- lf[grepl(paste0(file_identifier,
                             "_train_3_models_seq_kappa_"), lf)]

# array and seq tables are read and combined pairwise by position below, so
# there must be at least one replicate and the same number of each
if (length(array.files) == 0 || length(array.files) != length(seq.files)) {
  stop(paste("Expected the same non-zero number of array and seq kappa files for",
             file_identifier, "in", res.dir, "but found",
             length(array.files), "array and", length(seq.files), "seq files."))
}

if (null_model) {
  null_array.files <- lf[grepl(paste0(file_identifier,
                                      "_null_train_3_models_array_kappa_"), lf)]
  null_seq.files <- lf[grepl(paste0(file_identifier,
                                    "_null_train_3_models_seq_kappa_"), lf)]

  # check that we have ordered pairs of regular and null files for array and seq
  # (the seed is taken to be the four characters preceding the ".tsv" extension)
  array_seeds <- stringr::str_sub(array.files, -8, -5)
  null_array_seeds <- stringr::str_sub(null_array.files, -8, -5)
  seq_seeds <- stringr::str_sub(seq.files, -8, -5)
  null_seq_seeds <- stringr::str_sub(null_seq.files, -8, -5)
  # identical() also fails when the number of files differs, where an
  # elementwise == would silently recycle the shorter vector
  if (!(identical(array_seeds, null_array_seeds) &&
        identical(seq_seeds, null_seq_seeds))) {
    stop("Array or seq seeds do not match in delta kappa plotting script.")
  }

}

# define output files (delta or not delta in file name)
if (null_model) {
  test.df.filename <- file.path(plot.data.dir,
                                paste0(file_identifier,
                                       "_train_3_models_delta_kappa.tsv"))
  summary.df.filename <- file.path(plot.data.dir,
                                   paste0(file_identifier,
                                          "_train_3_models_delta_kappa_summary_table.tsv"))
} else {
  test.df.filename <- file.path(plot.data.dir,
                                paste0(file_identifier,
                                       "_train_3_models_kappa.tsv"))
  summary.df.filename <- file.path(plot.data.dir,
                                   paste0(file_identifier,
                                          "_train_3_models_kappa_summary_table.tsv"))
}

#### read in data --------------------------------------------------------------

# read in the tables that contain the kappa statistics for predictions on test
# data
array.list <- list() # initialize list that will hold all array tables
seq.list <- list() # initialize list that will hold all the RNA-seq tables
for (file_index in seq_along(array.files)) {
  array.list[[file_index]] <- fread(array.files[file_index], data.table = FALSE)
  seq.list[[file_index]] <- fread(seq.files[file_index], data.table = FALSE)
}

if (null_model) {
  null_array.list <- list() # initialize list that will hold null array tables
  null_seq.list <- list() # initialize list that will hold null RNA-seq tables
  for (null_file_index in seq_along(null_array.files)) {
    null_array.list[[null_file_index]] <- fread(null_array.files[null_file_index],
                                                data.table = FALSE)
    null_seq.list[[null_file_index]] <- fread(null_seq.files[null_file_index],
                                              data.table = FALSE)
  }

  # calculate delta kappa values
  delta_kappa_array.list <- list() # list for delta kappa array values
  delta_kappa_seq.list <- list() # list for delta kappa seq values
  for (pair_index in seq_along(array.files)) {

    # only kappa is differenced; auc, sensitivity and specificity are carried
    # over from the true (non-null) model
    delta_kappa_array.list[[pair_index]] <- array.list[[pair_index]] %>%
      left_join(null_array.list[[pair_index]],
                by = c("perc.seq", "classifier", "norm.method"),
                suffix = c(".true", ".null")) %>%
      mutate(delta_kappa = kappa.true - kappa.null) %>% # regular kappa - null kappa
      select(delta_kappa, auc.true, sensitivity.true, specificity.true, perc.seq, classifier, norm.method)

    delta_kappa_seq.list[[pair_index]] <- seq.list[[pair_index]] %>%
      left_join(null_seq.list[[pair_index]],
                by = c("perc.seq", "classifier", "norm.method"),
                suffix = c(".true", ".null")) %>%
      mutate(delta_kappa = kappa.true - kappa.null) %>% # regular kappa - null kappa
      select(delta_kappa, auc.true, sensitivity.true, specificity.true, perc.seq, classifier, norm.method)

  }

}

# combine all tables from each platform into a data.frame
# cannot use ifelse() because return value must be same dim as conditional test
if (null_model) {
  array.df <- data.table::rbindlist(delta_kappa_array.list)
  seq.df <- data.table::rbindlist(delta_kappa_seq.list)
} else {
  array.df <- data.table::rbindlist(array.list) %>%
    select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
  seq.df <- data.table::rbindlist(seq.list) %>%
    select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
}

#### save test set results -----------------------------------------------------

# bind all kappa stats together
test.df <- cbind(rbind(array.df, seq.df),
                 c(rep("Microarray", nrow(array.df)),
                   rep("RNA-seq", nrow(seq.df))))

# columns are renamed by position; the first column holds delta kappa rather
# than kappa when --null_model is set
colnames(test.df) <- c("Kappa", "AUC", "Sensitivity", "Specificity", "Perc.Seq", "Classifier",
                       "Normalization", "Platform")

# order %seq to display 0-100
test.df$Perc.Seq <- factor(test.df$Perc.Seq, levels = seq(0, 100, 10))

# recode model types
cls.recode.str <-
  "'glmnet' = 'LASSO'; 'rf' = 'Random Forest'; 'svm' = 'Linear SVM'"
test.df$Classifier <- car::recode(test.df$Classifier,
                                  recodes = cls.recode.str)

# capitalize norm methods
test.df$Normalization <- as.factor(toupper(test.df$Normalization))
test.df$Classifier <- as.factor(test.df$Classifier)

readr::write_tsv(test.df,
                 test.df.filename) # delta or not delta in file name

# get summary data.frame + write to file
summary.df <- test.df %>%
  dplyr::group_by(Classifier, Normalization, Platform, Perc.Seq) %>%
  dplyr::summarise(Median_Kappa = median(Kappa, na.rm = TRUE),
                   Mean_Kappa = mean(Kappa, na.rm = TRUE),
                   SD_Kappa = sd(Kappa, na.rm = TRUE),
                   Median_AUC = median(AUC, na.rm = TRUE),
                   Mean_AUC = mean(AUC, na.rm = TRUE),
                   SD_AUC = sd(AUC, na.rm = TRUE),
                   Median_Sensitivity = median(Sensitivity, na.rm = TRUE),
                   Mean_Sensitivity = mean(Sensitivity, na.rm = TRUE),
                   SD_Sensitivity = sd(Sensitivity, na.rm = TRUE),
                   Median_Specificity = median(Specificity, na.rm = TRUE),
                   Mean_Specificity = mean(Specificity, na.rm = TRUE),
                   SD_Specificity = sd(Specificity, na.rm = TRUE),
                   .groups = "drop")

readr::write_tsv(summary.df,
                 summary.df.filename) # delta or not delta in file name
--------------------------------------------------------------------------------
/4-ica_pca_feature_reconstruction.R:
--------------------------------------------------------------------------------
# J.
# Taroni Aug 2016
# The purpose of this script is to perform unsupervised learning on TCGA
# training data (output of 1-normalize_titrated_data.R), PCA or ICA,
# and to transform test data into the training data reduced dimensional space,
# and back out ('reconstruction') and to then calculate the 'reconstruction
# error' (MASE).
#
# It should be run from the command line.
# USAGE: Rscript 4-ica_pca_feature_reconstruction.R --cancer_type --predictor --n_components --seed --null_model
# n_components refers to the number of components (PC/IC) that should be used
# for reconstruction.
#
# For every replicate (identified by the seed embedded in the normalized data
# file names) this writes the fitted component objects to models/, the
# reconstructed holdout data to normalized_data/reconstructed_data/, and a
# MASE table to results/reconstructed_data/.

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--n_components",
                        default = 50,
                        help = "Number of compenents [default: %default]"),
  optparse::make_option("--seed",
                        default = 346,
                        help = "Random seed [default: %default]"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(source(here::here("load_packages.R")))
source(here::here("util", "train_test_functions.R"))
source(here::here("util", "ICA_PCA_reconstruction_functions.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
if (null_model) {
  file_identifier <- str_c(cancer_type, predictor, "null", sep = "_")
} else {
  file_identifier <- str_c(cancer_type, predictor, sep = "_")
}
n.comp <- as.integer(opt$n_components)

# set seed
initial.seed <- as.integer(opt$seed)
set.seed(initial.seed)
message(paste("\nInitial seed set to:", initial.seed))

# define directories
res.dir <- here::here("results")
norm.dir <- here::here("normalized_data")
mdl.dir <- here::here("models")
rcn.dir <- file.path(norm.dir, "reconstructed_data")
rcn.res.dir <- file.path(res.dir, "reconstructed_data")

# define input files
lf <- list.files(norm.dir, full.names = TRUE)
train.files <- lf[grepl(paste0(file_identifier,
                               "_array_seq_train_titrate_normalized_list_"), lf)]
test.files <- lf[grepl(paste0(file_identifier,
                              "_array_seq_test_data_normalized_list_"), lf)]

# parse filename seeds: the four characters preceding the ".RDS" extension
filename.seeds <- substr(train.files,
                         (nchar(train.files) - 7),
                         (nchar(train.files) - 4))

# define output files
df.file.lead <- paste0(file_identifier,
                       "_reconstruction_error_", n.comp, "_components_")
mdl.file.lead <- paste0(file_identifier,
                        "_array_seq_train_", n.comp, "_components_object_")
rcn.file.lead <- paste0(file_identifier,
                        "_reconstructed_data_", n.comp, "_components_")

#### main ----------------------------------------------------------------------
platforms <- c("array", "seq")
#recon.methods <- c("ICA", "PCA")
recon.methods <- c("PCA") # July 2021 update to no longer run ICA

for (rep.count in seq_along(filename.seeds)) {
  seed <- filename.seeds[rep.count]
  message(paste("\n\n#### RECONSTRUCTION ROUND",
                rep.count, "of", length(filename.seeds), "####\n\n"))

  #### read in data ####
  message("Reading in data...")
  # match on the exact "<seed>.RDS" suffix rather than a regex over the full
  # path, so digits elsewhere in the path (e.g. a directory name) can never
  # select the wrong file or more than one file
  seed.suffix <- paste0(seed, ".RDS")
  train.rds <- train.files[endsWith(train.files, seed.suffix)]
  test.rds <- test.files[endsWith(test.files, seed.suffix)]
  if (length(train.rds) != 1 || length(test.rds) != 1) {
    stop(paste("Expected exactly one training and one test file for",
               "filename.seed:", seed, "but found", length(train.rds),
               "training and", length(test.rds), "test files."))
  }
  train.data <- readRDS(train.rds)
  test.data <- readRDS(test.rds)
  train.data <- RestructureNormList(train.data)

  # remove Seurat data from this analysis because it is already in reduced space
  train.data$seurat <- NULL

  # get rid of TDM and QN (CN) null values
  train.data <- purrr::modify_depth(train.data,
                                    1, # work on the first level lists
                                    purrr::discard, is.null) # discard if null

  # for each method to be used for reconstruction
  for (rcn in recon.methods) {
    message(paste(" ", rcn, "on training set"))

    # perform reconstruction method on the training data
    train.comp.list <- TrainSetCompAnalysis(train.list = train.data,
                                            num.comp = n.comp,
                                            comp.method = rcn)

    # write the component objects to file in the models directory
    comp.rds.name <- paste0(mdl.file.lead, rcn, "_", seed, ".RDS")
    saveRDS(train.comp.list, file = file.path(mdl.dir, comp.rds.name))

    # reconstruction on the holdout data
    for (plt in platforms) { # for the two platforms -- microarray and RNA-seq
      message(paste("\t Performing", plt, "reconstruction"))
      if (plt == "seq" && is.null(train.comp.list$tdm$`0`)) {
        # At the 0% RNA-seq level, TDM RNA-seq test data is transformed using the
        # log-transformed 100% array data as the reference. So, use
        # log-transformed 100% array data as the training set for evaluating the
        # TDM method at 0% RNA-seq level.
        train.comp.list$tdm$`0` <- train.comp.list$log$`0`
        # the borrowed `0` entry was appended last (position 10); move it first
        train.comp.list$tdm <- train.comp.list$tdm[c(10, 1:9)]
      }

      # perform the reconstruction experiment, which will return reconstructed
      # holdout out data in data.table format suitable for category prediction
      # and calculate the reconstruction error (MASE) to be returned as a
      # data.frame
      results <- ReconstructionWrapper(train.list = train.comp.list,
                                       test.list = test.data[[plt]],
                                       num.comps = n.comp)

      # save recon objects
      message("\t Saving reconstructed holdout data")
      recon.rds <- paste0(rcn.file.lead, rcn, "_", plt, "_", seed, ".RDS")
      saveRDS(results$recon, file = file.path(rcn.dir, recon.rds))

      # write error data.frame to file
      error.df <- results$mase.df
      error.df <- cbind(error.df, rep(plt, nrow(error.df)))
      colnames(error.df)[ncol(error.df)] <- "platform"
      error.df.name <- paste0(df.file.lead, rcn, "_", plt, "_", seed, ".tsv")
      message("\t Saving MASE data.frame")
      write.table(error.df, file = file.path(rcn.res.dir, error.df.name),
                  quote = FALSE, row.names = FALSE, sep = "\t")

      # free the (large) per-platform results before the next iteration
      rm(results, error.df)
      gc()

    }

  }

  rm(train.data, test.data)
  gc()

}
--------------------------------------------------------------------------------
/5-predict_category_reconstructed_data.R:
--------------------------------------------------------------------------------
# J. Taroni Oct 2016
# The purpose of this script is to perform category prediction
# (from 2-train_test_brca_category.R) on test/holdout data that has been
# reconstructed using the components from PCA on training data (the
# output of 4-ica_pca_feature_reconstruction.R). It outputs a list of
# confusionMatrix objects and a data.frame of Kappa statistics from these
# predictions.
# It should be run from the command line.
9 | # USAGE: Rscript 5-predict_category_reconstructed_data.R --cancer_type --predictor --null_model 10 | 11 | option_list <- list( 12 | optparse::make_option("--cancer_type", 13 | default = NA_character_, 14 | help = "Cancer type"), 15 | optparse::make_option("--predictor", 16 | default = NA_character_, 17 | help = "Predictor used"), 18 | optparse::make_option("--null_model", 19 | action = "store_true", 20 | default = FALSE, 21 | help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)") 22 | ) 23 | 24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 25 | source(here::here("util/option_functions.R")) 26 | check_options(opt) 27 | 28 | # load libraries 29 | suppressMessages(source(here::here("load_packages.R"))) 30 | source(here::here("util", "train_test_functions.R")) 31 | 32 | # set options 33 | cancer_type <- opt$cancer_type 34 | predictor <- opt$predictor 35 | null_model <- opt$null_model 36 | file_identifier <- ifelse(null_model, 37 | str_c(cancer_type, predictor, "null", sep = "_"), 38 | str_c(cancer_type, predictor, sep = "_")) 39 | 40 | # define directories 41 | mdl.dir <- here::here("models") 42 | norm.dir <- here::here("normalized_data") 43 | res.dir <- here::here("results") 44 | rcn.dir <- file.path(norm.dir, "reconstructed_data") 45 | rcn.res.dir <- file.path(res.dir, "reconstructed_data") 46 | 47 | # define input files 48 | supervised.model.files <- list.files(mdl.dir, 49 | pattern = paste0(file_identifier, 50 | "_train_3_models"), 51 | full.names = TRUE) 52 | recon.files <- list.files(rcn.dir, 53 | pattern = paste0(file_identifier, 54 | "_reconstructed_data_"), 55 | full.names = TRUE) 56 | 57 | # get filename.seeds (identifiers for each replicate) 58 | # from the reconstructed data files 59 | filename.seeds <- unique(substr(recon.files, 60 | (nchar(recon.files)-7), 61 | (nchar(recon.files)-4))) 62 | 63 | # define output files 64 | cm.file.lead <- paste0(file_identifier, 65 | 
"_prediction_reconstructed_data_confusionMatrices_") 66 | kap.file.lead <- paste0(file_identifier, 67 | "_prediction_reconstructed_data_kappa_") 68 | 69 | #### main ---------------------------------------------------------------------- 70 | 71 | platforms <- c("array", "seq") 72 | recon.methods <- c("PCA") 73 | 74 | for (seed in filename.seeds) { 75 | 76 | # error-handling -- want to make sure there is a corresponding supervised 77 | # model file to current reconstructed data file (seed) 78 | check.model.file <- any(grepl(seed, supervised.model.files)) 79 | if (!check.model.file) { 80 | stop(paste("There is no corresponding supervised model file for 81 | filename.seed:", seed)) 82 | } 83 | 84 | rep.count <- grep(seed, filename.seeds) 85 | message(paste("\n\n#### CATEGORY PREDICTION", 86 | rep.count, "of", length(filename.seeds), "####\n\n")) 87 | 88 | # read in supervised models (LASSO, linear SVM, random forest) 89 | train.rds <- supervised.model.files[grep(seed, supervised.model.files)] 90 | train.list <- readRDS(train.rds) 91 | 92 | # remove Seurat data from this analysis because it is already in reduced space 93 | train.list$seurat <- NULL 94 | 95 | # need to read in corresponding sample.df 96 | sample.df.file <- 97 | file.path(res.dir, 98 | paste0(file_identifier, 99 | "_matchedSamples_training_testing_split_labels_", 100 | seed, ".tsv")) 101 | sample.df <- data.table::fread(sample.df.file, data.table = F) 102 | sample.df$category <- as.factor(sample.df$category) 103 | 104 | # initialize list to hold confusionMatrices & kappa statistics 105 | kappa.list <- list() 106 | cm.list <- list() 107 | 108 | for (plt in platforms) { 109 | plt.list <- list() 110 | plt.kap.list <- list() 111 | for (rcn in recon.methods) { 112 | 113 | # read in reconstructed data from current platform and reconstruction 114 | # method 115 | file.identifier <- paste(rcn, plt, seed, sep = "_") 116 | recon.rds <- recon.files[grep(file.identifier, recon.files)] 117 | recon.list <- 
readRDS(recon.rds) 118 | 119 | # return list of confusion matrix objects AND kappa statistics 120 | cm_kappa.list <- PredictWrapper(train.model.list = train.list, 121 | pred.list = recon.list, 122 | sample.df = sample.df, 123 | only.kap = FALSE, 124 | run.parallel = FALSE) 125 | 126 | # get confusionMatrix objects 127 | plt.list[[rcn]] <- cm_kappa.list$confusion_matrix_objects 128 | 129 | # get kappa statistics 130 | plt.kap.list[[rcn]] <- cm_kappa.list$kappa_statistics 131 | 132 | # remove reconstructed data and cm_kappa.list 133 | rm(recon.list, cm_kappa.list) 134 | gc() 135 | 136 | } 137 | 138 | cm.list[[plt]] <- plt.list 139 | kappa.list[[plt]] <- plt.kap.list 140 | 141 | } 142 | 143 | # save confusion matrices 144 | cm.file.name <- file.path(rcn.res.dir, paste0(cm.file.lead, seed, ".RDS")) 145 | saveRDS(cm.list, file = cm.file.name) 146 | 147 | # get kappa stats into data.frame from nested list and save as data.frame 148 | kappa.df <- reshape2::melt(kappa.list) 149 | colnames(kappa.df) <- c("Perc.seq", "Classifier", "Normalization", 150 | "Measure", "Kappa", "Reconstruction", "Platform") 151 | kap.file.name <- file.path(rcn.res.dir, paste0(kap.file.lead, seed, ".tsv")) 152 | write.table(kappa.df, file = kap.file.name, row.names = F, quote = F, 153 | sep = "\t") 154 | 155 | rm(train.list, kappa.list, cm.list) 156 | gc() 157 | 158 | } 159 | -------------------------------------------------------------------------------- /6-save_recon_error_kappa_data.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Oct 2016 2 | # This script combines reconstruction errors (MASE) from 3 | # 4-ica_pca_feature_reconstruction.R and the Kappa statistics associated with 4 | # predictions on reconstructed data from 5-predict_category_reconstructed_data.R 5 | # across replicates and saves them as tables for downstream plotting.
6 | # USAGE: Rscript 6-save_recon_error_kappa_data.R --cancer_type --predictor --null_model 7 | 8 | option_list <- list( 9 | optparse::make_option("--cancer_type", 10 | default = NA_character_, 11 | help = "Cancer type"), 12 | optparse::make_option("--predictor", 13 | default = NA_character_, 14 | help = "Predictor used"), 15 | optparse::make_option("--null_model", 16 | action = "store_true", 17 | default = FALSE, 18 | help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)") 19 | ) 20 | 21 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 22 | source(here::here("util/option_functions.R")) 23 | check_options(opt) 24 | 25 | # load libraries 26 | suppressMessages(library(tidyverse)) 27 | source(here::here("util", "color_blind_friendly_palette.R")) 28 | 29 | # set options 30 | cancer_type <- opt$cancer_type 31 | predictor <- opt$predictor 32 | null_model <- opt$null_model 33 | file_identifier <- ifelse(null_model, 34 | str_c(cancer_type, predictor, "null", sep = "_"), 35 | str_c(cancer_type, predictor, sep = "_")) 36 | 37 | # define directories 38 | plot.dir <- here::here("plots") 39 | plot.data.dir <- file.path(plot.dir, "data") 40 | rcn.res.dir <- here::here("results", "reconstructed_data") 41 | 42 | # define input files 43 | # a looser pattern such as "kappa" would also capture the kappa summary table this script writes, if it is rerun 44 | # pattern = "kappa_[0-9]+.tsv" captures only the intended per-seed filenames (seeds are between 1 and 10000) 45 | kappa.df.files <- list.files(rcn.res.dir, 46 | pattern = paste0(file_identifier, 47 | "_prediction_reconstructed_data_kappa_[0-9]+.tsv"), 48 | full.names = TRUE) 49 | error.files <- list.files(rcn.res.dir, 50 | pattern = paste0(file_identifier, 51 | "_reconstruction_error"), 52 | full.names = TRUE) 53 | 54 | # define output files 55 | kap.plot.file.lead <- file.path(plot.dir, paste0(file_identifier, "_kappa_reconstructed_data_")) 56 | err.plot.file.lead <- file.path(plot.dir, paste0(file_identifier,
"_reconstruction_error_")) 57 | kap.plot.data.file <- file.path(plot.data.dir, paste0(file_identifier, "_kappa_reconstructed_data.tsv")) 58 | err.plot.data.file <- file.path(plot.data.dir, paste0(file_identifier, "_reconstruction_error.tsv")) 59 | 60 | #### plot kappa stats ---------------------------------------------------------- 61 | 62 | # read in kappa data.frames from each replicate and bind -- line plot with 63 | # boxplot "confidence intervals" 64 | kappa.df.list <- list() 65 | fl.iter <- 1 66 | for (fl in kappa.df.files) { 67 | kappa.df.list[[fl.iter]] <- data.table::fread(fl, data.table = FALSE) 68 | fl.iter <- fl.iter + 1 69 | } 70 | kappa.master.df <- as.data.frame(data.table::rbindlist(kappa.df.list)) 71 | rm(kappa.df.list) 72 | 73 | # order Perc.seq so line plot displays 0-100 74 | kappa.master.df$Perc.seq <- factor(kappa.master.df$Perc.seq, 75 | levels = seq(0, 100, 10)) 76 | 77 | # rename classifiers 78 | 79 | cls.recode.str <- 80 | "'glmnet' = 'LASSO'; 'rf' = 'Random Forest'; 'svm' = 'Linear SVM'" 81 | kappa.master.df$Classifier <- car::recode(kappa.master.df$Classifier, 82 | recodes = cls.recode.str) 83 | kappa.master.df$Classifier <- as.factor(kappa.master.df$Classifier) 84 | 85 | # get norm and reconstruction methods as factors 86 | kappa.master.df$Normalization <- stringr::str_to_upper(kappa.master.df$Normalization) 87 | kappa.master.df$Normalization <- as.factor(kappa.master.df$Normalization) 88 | kappa.master.df$Reconstruction <- as.factor(kappa.master.df$Reconstruction) 89 | 90 | # rename platforms 91 | plt.recode.str <- 92 | "'array' = 'Microarray'; 'seq' = 'RNA-seq'" 93 | kappa.master.df$Platform <- car::recode(kappa.master.df$Platform, 94 | recodes = plt.recode.str) 95 | kappa.master.df$Platform <- as.factor(kappa.master.df$Platform) 96 | 97 | 98 | write.table(kappa.master.df, 99 | file = kap.plot.data.file, 100 | quote = FALSE, sep = "\t", row.names = FALSE) 101 | 102 | # get summary data.frame + write to file 103 | kappa.summary.df <- 
104 | kappa.master.df %>% 105 | dplyr::group_by(Classifier, Normalization, Platform, Perc.seq) %>% 106 | dplyr::summarise(Median = median(Kappa), 107 | Mean = mean(Kappa), 108 | SD = sd(Kappa), 109 | .groups = "drop") 110 | readr::write_tsv(kappa.summary.df, 111 | file.path(rcn.res.dir, 112 | paste0(file_identifier, 113 | "_kappa_reconstructed_data_summary_table.tsv"))) 114 | 115 | rm(kappa.master.df) 116 | 117 | #### plot error measures ------------------------------------------------------- 118 | 119 | # read in error measure data.frames from each replicate and bind -- violin plot 120 | error.df.list <- list() 121 | for(fl.iter in seq_along(error.files)){ 122 | error.df.list[[fl.iter]] <- data.table::fread(error.files[fl.iter], 123 | data.table = FALSE) 124 | } 125 | error.master.df <- as.data.frame(data.table::rbindlist(error.df.list)) 126 | rm(error.df.list) 127 | 128 | # order perc.seq so plot displays 0-100 129 | error.master.df$perc.seq <- factor(error.master.df$perc.seq, 130 | levels = seq(0, 100, by = 10)) 131 | 132 | # get norm and reconstruction methods as factors 133 | error.master.df$norm.method <- stringr::str_to_upper(error.master.df$norm.method) 134 | error.master.df$norm.method <- as.factor(error.master.df$norm.method) 135 | 136 | # rename platforms -- same as above for kappa data.frame 137 | error.master.df$platform <- car::recode(error.master.df$platform, 138 | recodes = plt.recode.str) 139 | error.master.df$platform <- as.factor(error.master.df$platform) 140 | 141 | # reconstruction method as factor 142 | error.master.df$comp.method <- as.factor(error.master.df$comp.method) 143 | 144 | # take the average of each gene's error across replicates 145 | error.mean.df <- error.master.df %>% 146 | dplyr::group_by(gene, perc.seq, norm.method, comp.method, platform) %>% 147 | dplyr::summarise(mean_mase = mean(MASE), 148 | .groups = "drop") 149 | rm(error.master.df) 150 | colnames(error.mean.df) <- c("Gene", "Perc.seq", "Normalization", 151 | "Method",
"Platform", "Mean_Value") 152 | 153 | write.table(error.mean.df, 154 | file = err.plot.data.file, 155 | quote = FALSE, sep = "\t", row.names = FALSE) 156 | -------------------------------------------------------------------------------- /8-PLIER_pathways_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PLIER pathways analysis" 3 | output: html_notebook 4 | author: "Steven Foltz" 5 | date: "December 2022" 6 | --- 7 | 8 | ```{r} 9 | meaningful_difference <- 0.2 10 | n_repeats <- 10 11 | ``` 12 | 13 | ### What additional pathways are identified by PLIER after doubling the sample size? 14 | 15 | Here we look for oncogenic pathways (defined by [GSEA MSigDB C6](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=C6)) that are found more frequently in the data with the full sample size than in data with half the sample size. 16 | By focusing on pathways that are reliably detected more often in the full sample size data, we can identify stable patterns that emerge when more data is available. 17 | We ran PLIER `r n_repeats` times using each combination of data normalization method and sample size (half or full). 18 | Across those `r n_repeats` runs, we computed the proportion of runs in which each oncogenic pathway was significantly associated with at least one latent variable. 19 | We set an arbitrary "meaningful difference" threshold of `r meaningful_difference` to detect when that proportion was meaningfully greater using the full sample size compared to half the sample size, and we require both array and RNA-seq data to satisfy the condition. 20 | We also arbitrarily limit results to those pathways found in over half of the runs using the full sample size data.
21 | 22 | ### Load packages and pathways 23 | 24 | ```{r} 25 | library(tidyverse) 26 | data("oncogenicPathways", package = "PLIER") 27 | ``` 28 | 29 | 30 | ### Set input file names 31 | ```{r} 32 | plots_data_dir <- here::here("plots", "data") 33 | brca_filename <- file.path(plots_data_dir, "BRCA_subtype_PLIER_pathways.tsv") 34 | gbm_filename <- file.path(plots_data_dir, "GBM_subtype_PLIER_pathways.tsv") 35 | ``` 36 | 37 | ### Read in data for each cancer type 38 | ```{r} 39 | pathways_df <- NULL 40 | 41 | if (file.exists(brca_filename)) { 42 | brca_pathways_df <- read_tsv(brca_filename) %>% 43 | mutate(cancer_type = "BRCA") 44 | pathways_df <- bind_rows(pathways_df, 45 | brca_pathways_df) 46 | } else { 47 | message(str_c("BRCA file ", brca_filename, " does not exist.")) 48 | } 49 | 50 | if (file.exists(gbm_filename)) { 51 | gbm_pathways_df <- read_tsv(gbm_filename) %>% 52 | mutate(cancer_type = "GBM") 53 | pathways_df <- bind_rows(pathways_df, 54 | gbm_pathways_df) 55 | } else { 56 | message(str_c("GBM file ", gbm_filename, " does not exist.")) 57 | } 58 | 59 | if (!file.exists(brca_filename) & !file.exists(gbm_filename)) { 60 | stop(str_c("Neither BRCA file ", brca_filename, 61 | " nor GBM file ", gbm_filename, 62 | " exists.")) 63 | } 64 | 65 | ``` 66 | 67 | ### Filter data to identify oncogenic pathways detected more frequently in full data 68 | ```{r} 69 | 70 | pathways_df %>% 71 | filter(FDR < 0.05, # require significant association with an LV 72 | pathway %in% colnames(oncogenicPathways)) %>% # require oncogenic pathways 73 | # for each combination of normalization method, %RNA-seq, pathway, and cancer type 74 | group_by(nmeth, pseq, pathway, cancer_type) %>% 75 | # summarize by finding the count and proportion of repeats in which that 76 | # pathway was significantly associated with at least one latent variable 77 | summarize(n_seeds = length(unique(seed_index)), 78 | prop_seeds = n_seeds/n_repeats, 79 | .groups = "drop") %>% 80 | # clean up normalization 
method strings (remove parentheses, spaces, and hyphens) 81 | mutate(nmeth = str_remove_all(nmeth, "[\\(\\)]")) %>% 82 | mutate(nmeth = str_replace_all(nmeth, c(" " = "_", "-" = "_"))) %>% 83 | # create new variable combining normalization method and %RNA-seq 84 | mutate(nmeth_pseq = str_c(nmeth, pseq, sep = "_")) %>% 85 | select(cancer_type, pathway, prop_seeds, nmeth_pseq) %>% 86 | # create new columns for each combination of norm method and %RNA-seq 87 | # each row corresponds to a single pathway from a cancer type 88 | pivot_wider(id_cols = c("cancer_type", "pathway"), 89 | names_from = nmeth_pseq, 90 | values_from = prop_seeds) %>% 91 | mutate_if(is.numeric, replace_na, 0) %>% # replace NAs with 0s 92 | # reduce to "meaningful" results 93 | # 1. difference in array only data must be meaningful 94 | # 2. difference in seq only data must be meaningful 95 | # 3. pathway must be detected in over half of full data sets (array only, RNA-seq only, and NPN 50%/50%) 96 | filter(log_0 - array_only_50 >= meaningful_difference, 97 | log_100 - seq_only_50 >= meaningful_difference, 98 | log_0 > 0.5, 99 | log_100 > 0.5, 100 | npn_50 > 0.5) %>% 101 | select(cancer_type, pathway, array_only_50, seq_only_50, log_0, log_100, npn_50) %>% 102 | arrange(cancer_type) %>% 103 | knitr::kable() 104 | ``` 105 | 106 | | variable_name | meaning | 107 | | --- | --- | 108 | | `array_only_50` | LOG transformed array data (half sample size) | 109 | | `seq_only_50` | LOG transformed RNA-seq data (half sample size) | 110 | | `log_0` | LOG transformed array data (full sample size) | 111 | | `log_100` | LOG transformed RNA-seq data (full sample size) | 112 | | `npn_50` | NPN transformed data, 50% array and 50% RNA-seq (full sample size) | 113 | 114 | ### Citation: 115 | Mao W, Zaslavsky E, Hartmann BM, Sealfon SC, Chikina M. Pathway-level information extractor (PLIER) for gene expression data. Nat Methods. 2019;16: 607–610. 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2016 Trustees of the University of Pennsylvania. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived 17 | from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 22 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 23 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 24 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 25 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 29 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cross-platform normalization enables machine learning model training on microarray and RNA-seq data simultaneously 2 | 3 | **Published article:** Foltz, S. M., Greene, C. S. & Taroni, J. N. Cross-platform normalization enables machine learning model training on microarray and RNA-seq data simultaneously. *Commun Biol* 6, 222 (2023). https://doi.org/10.1038/s42003-023-04588-6 4 | 5 | 6 | 7 | **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* 8 | 9 | - [Summary](#summary) 10 | - [Requirements](#requirements) 11 | - [Obtaining and running the Docker container](#obtaining-and-running-the-docker-container) 12 | - [Download data from The Cancer Genome Atlas (TCGA)](#download-data-from-the-cancer-genome-atlas-tcga) 13 | - [Recreate manuscript results](#recreate-manuscript-results) 14 | - [Methods](#methods) 15 | - [Machine Learning Pipeline](#machine-learning-pipeline) 16 | - [Running individual experiments](#running-individual-experiments) 17 | - [Machine learning](#machine-learning) 18 | - [Other scripts](#other-scripts) 19 | - [Manuscript versions](#manuscript-versions) 20 | - [Funding](#funding) 21 | 22 | 23 | 24 | ## Summary 25 | 26 | We performed a series of supervised and unsupervised machine learning 27 | evaluations, as well as differential expression and pathway analyses, to assess which 28 | normalization methods are best suited for combining data from microarray and 29 | RNA-seq platforms. 30 | 31 | We evaluated seven normalization approaches for all methods: 32 | 33 | 1. log-transformation (LOG) 34 | 2. [non-paranormal transformation](https://arxiv.org/abs/0903.0649) (NPN) 35 | 3. [quantile normalization](http://bmbolstad.com/misc/normalize/bolstad_norm_paper.pdf) (QN) 36 | 4. 
[quantile normalization via CrossNorm](https://www.nature.com/articles/srep18898) 37 | 5. quantile normalization followed by z-scoring (QN-Z) 38 | 6. [Training Distribution Matching](https://peerj.com/articles/1621/) (TDM) 39 | 7. z-scoring (Z) 40 | 41 | We also explored the use of [Seurat](https://satijalab.org/seurat/) to normalize array and RNA-seq data. 42 | Due to low sample numbers at the edges of our titration protocol, many experimental conditions could not be integrated. 43 | 44 | ## Requirements 45 | 46 | We recommend using the Docker image `envest/rnaseq_titration_results:R-4.1.2` to handle package and dependency installation. 47 | See `docker/R-4.1.2/Dockerfile` for more information. 48 | 49 | Our analysis ([v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3)) was run using 7 cores on an AWS instance with 16 cores, 128 GB memory, and an allocated 1 TB of space. 50 | 51 | ### Obtaining and running the Docker container 52 | 53 | Pull the Docker image using: 54 | 55 | ``` 56 | docker pull envest/rnaseq_titration_results:R-4.1.2 57 | ``` 58 | 59 | Then run the following command to start a container, replacing `[PASSWORD]` with your own password: 60 | 61 | ``` 62 | docker run --mount type=bind,target=/home/rstudio,source=$PWD -e PASSWORD=[PASSWORD] -p 8787:8787 envest/rnaseq_titration_results:R-4.1.2 63 | ``` 64 | 65 | Navigate to the RStudio server on port 8787 of the host (e.g. `http://localhost:8787` when running locally) and log in with the username `rstudio` and the password you set above. 66 | 67 | 68 | ## Download data from The Cancer Genome Atlas (TCGA) 69 | 70 | TCGA data from 520 breast cancer (BRCA) patients used for these analyses 71 | is [available at Zenodo](https://zenodo.org/record/58862). 72 | 73 | Data from 150 glioblastoma (GBM) patients is available from the [Genomic Data Commons PanCan Atlas](https://gdc.cancer.gov/about-data/publications/pancanatlas).
74 | 75 | To download data, run the data download script in the top directory: 76 | 77 | ``` 78 | bash download_TCGA_data.sh 79 | ``` 80 | 81 | ## Recreate manuscript results 82 | 83 | After data has been downloaded, running 84 | 85 | ``` 86 | bash run_all_analyses_and_plots.sh [cancer type] 87 | ``` 88 | where 89 | 90 | - `[cancer type]` is `both`, `BRCA` or `GBM` 91 | 92 | with [v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3) of this repository will reproduce the results presented in our manuscript. 93 | We recommend running all analyses within the project Docker container. 94 | 95 | ## Methods 96 | 97 | ### Machine Learning Pipeline 98 | 99 | Here's a schematic overview of our machine learning experiments: 100 | 101 | ![](diagrams/RNA-seq_titration_ML_overview.png) 102 | 103 | **Overview of supervised and unsupervised machine learning experiments.** 104 | 105 | 1. Matched samples run on both microarray and RNA-seq were split into a training (2/3) and holdout set (1/3). 106 | 2. RNA-seq samples were "titrated" into the training set, 10% at a time (0-100%), replacing their matched array samples, resulting in eleven training sets for each normalization method. 107 | 3. Machine learning applications: 108 | 109 | - _Supervised learning_: 110 | We trained three classifiers – LASSO, linear SVM, and Random Forest — on each training set and tested them on the microarray and RNA-seq holdout sets. 111 | The models were trained to predict tumor subtype (both cancer types have 5 subtypes) and the binary mutation status of _TP53_ and _PIK3CA_. 112 | 113 | - _Unsupervised learning_: 114 | We projected holdout sets onto and back out of the training set space using Principal Components Analysis to obtain reconstructed holdout sets. 115 | We then used the trained subtype classifiers to predict on the reconstructed holdout sets. 
116 | [PLIER](https://github.com/wgmao/PLIER) (Pathway-Level Information ExtractoR) identified coordinated sets of genes in each cancer type. 117 | 118 | ## Running individual experiments 119 | 120 | #### Machine learning 121 | 122 | To run the machine learning pipeline, run in top directory: 123 | 124 | ``` 125 | bash run_machine_learning_experiments.sh [cancer type] [prediction task] [n cores] 126 | ``` 127 | 128 | where 129 | 130 | - `[cancer type]` is `BRCA` or `GBM` 131 | - `[prediction task]` is `subtype`, `TP53`, or `PIK3CA` 132 | - `[n cores]` is the number of cores you want to run in parallel 133 | 134 | #### Other scripts 135 | 136 | To search for the number of publicly available microarray and RNA-seq samples from [GEO](https://www.ncbi.nlm.nih.gov/geo/) and [ArrayExpress](https://www.ebi.ac.uk/arrayexpress/), run 137 | 138 | ``` 139 | python3 search_geo_arrayexpress.py 140 | ``` 141 | and check the output in `results/array_rnaseq_ratio`. 142 | 143 | To compare PLIER pathways that are more frequently identified using the full sample size data compared to half sample size data, run 144 | ``` 145 | Rscript -e "rmarkdown::render('8-PLIER_pathways_analysis.Rmd', clean = TRUE)" 146 | ``` 147 | and examine the results in `8-PLIER_pathways_analysis.nb.html`. 
148 | 149 | ## Manuscript versions 150 | 151 | | Version | Relevant links | 152 | | :------ | :------------- | 153 | | [v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3) | [Published article](https://doi.org/10.1038/s42003-023-04588-6), [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v5), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v5) | 154 | | [v2.2](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.2) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v3), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v3) | 155 | | [v2.1](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.1) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v2), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v2) | 156 | | [v2.0](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.0) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v1), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v1) | 157 | | [v1.1](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v1.1) | [Figshare full results](https://doi.org/10.6084/m9.figshare.5035997.v2) | 158 | | [v1.0](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v1.0) | [Pre-print](https://doi.org/10.1101/118349) | 159 | 160 | ## Funding 161 | 162 | This work was supported by the Gordon and Betty Moore Foundation [GBMF 4552], Alex's Lemonade Stand Foundation [GR-000002471], and the National Institutes of Health [T32-AR007442, U01-TR001263, R01-CA237170, K12GM081259]. 163 | 164 | # FAQ 165 | 166 | **Can I normalize array data to match RNA-seq data?** 167 | 168 | *We generally do not advise this study design. We expect array data to have less precision at higher expression levels due to saturation, while counts-based RNA-seq data does not have that problem. 
We recommend reshaping the data expected to have more dynamic range (RNA-seq) to fit the narrower and less precise (array) distribution. See also [TDM FAQs](https://github.com/greenelab/TDM#faq).* 169 | -------------------------------------------------------------------------------- /brca_data_urls.txt: -------------------------------------------------------------------------------- 1 | https://zenodo.org/record/58862/files/BRCAarray.pcl 2 | https://zenodo.org/record/58862/files/BRCAClin.tsv 3 | https://zenodo.org/record/58862/files/BRCARNASeq.pcl 4 | https://zenodo.org/record/58862/files/BRCARNASeqClin.tsv 5 | -------------------------------------------------------------------------------- /check_installs.R: -------------------------------------------------------------------------------- 1 | require("ape") 2 | require("binr") 3 | require("caret") 4 | require("cluster") 5 | require("corrplot") 6 | require("cowplot") 7 | require("data.table") 8 | require("devtools") 9 | require("doParallel") 10 | require("e1071") 11 | require("fastICA") 12 | require("flexclust") 13 | require("fpc") 14 | require("gdata") 15 | require("glmnet") 16 | require("gridExtra") 17 | require("Hmisc") 18 | require("huge") 19 | require("kernlab") 20 | require("limma") 21 | require("parallel") 22 | require("plyr") 23 | require("preprocessCore") 24 | require("quantro") 25 | require("ranger") 26 | require("reshape2") 27 | require("scales") 28 | require("sdcMicro") 29 | require("TDM") 30 | require("tidyverse") 31 | -------------------------------------------------------------------------------- /check_sums.tsv: -------------------------------------------------------------------------------- 1 | 2f4f2fcd97eff5385c0b1205b719b8dc data/BRCAClin.tsv 2 | 7f00ea6ef1f309773b02e6118046550f data/BRCARNASeq.pcl 3 | d4486dde14da14b4f8887a7415e2866f data/BRCARNASeqClin.tsv 4 | 1a89ea769381e300e5a88ec61713ad9e data/BRCAarray.pcl 5 | 02e72c33071307ff6570621480d3c90b 
data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv 6 | ce911443e071c4251fb3f196780e76de data/GBMClin.tsv 7 | 949938abadd336eb9a2f698b3102e1bb data/GBMRNASeq.pcl 8 | 3bbdad11c322ebf3b03ada07263c6444 data/GBMarray.pcl 9 | e5df57691b44c47b8c916116b5ac7acf data/PanCan-General_Open_GDC-Manifest_2.txt 10 | a4591b2dcee39591f59e5e25a6ce75fa data/TCGA-CDR-SupplementalTableS1.xlsx 11 | 7fafc537807d5b3ddf0bb89665279a9d data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg 12 | 7c5f8a12d6ca986e5ebba93281360517 data/combined_clinical_data.BRCA.tsv 13 | 94621b5396bd5d69eb36b6c5503dec97 data/combined_clinical_data.GBM.tsv 14 | 1d8834a51282396e07e3ce9a5417d024 data/gbm_clinical_table_S7.xlsx 15 | 639ad8f8386e98dacc22e439188aa8fa data/mc3.v0.2.8.PUBLIC.maf.gz 16 | 7583a5fb4d23d50b79813b26469f6385 data/mutations.BRCA.tsv 17 | 15cae05325c1b0562be8029efba5534a data/mutations.GBM.tsv 18 | 5484229fa691a721dd7fd08ade2233e7 data/mutations.maf 19 | e56585bd0c2e59658b1d54fc8b0c9df2 data/mutations.tsv 20 | b62634d9eccbb548499ce384605fe47a data/GSE83130/LICENSE.TXT 21 | 9ed2fa92d31d51f17fc048b98158a5e1 data/GSE83130/README.md 22 | 76a0454f911aeb17276725abb760ce89 data/GSE83130/GSE83130/GSE83130.tsv 23 | dca310d9643a18d35e694425c56b9d2b data/GSE83130/GSE83130/metadata_GSE83130.tsv 24 | -------------------------------------------------------------------------------- /classifier_repeat_wrapper.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Jul 2016 2 | # This script is a wrapper for running the BRCA subtype pipeline repeatedly with 3 | # different random seeds. 4 | # It should be run from the command line. 
5 | # USAGE: Rscript classifier_repeat_wrapper.R --cancer_type [BRCA|GBM] --predictor [subtype|TP53|PIK3CA] --n_repeats (default: 10) --null_model --ncores 6 | 7 | option_list <- list( 8 | optparse::make_option("--cancer_type", 9 | default = NA_character_, 10 | help = "Cancer type"), 11 | optparse::make_option("--predictor", 12 | default = NA_character_, 13 | help = "Predictor used"), 14 | optparse::make_option("--n_repeats", 15 | default = 10, 16 | help = "Number of times experiment is repeated [default: %default]"), 17 | optparse::make_option("--null_model", 18 | action = "store_true", 19 | default = FALSE, 20 | help = "Permute dependent variable (within subtype if predictor is a gene)"), 21 | optparse::make_option("--ncores", 22 | default = NA_integer_, 23 | help = "Set the number of cores to use") 24 | ) 25 | 26 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 27 | source(here::here("util/option_functions.R")) 28 | check_options(opt) 29 | 30 | # set options 31 | cancer_type <- opt$cancer_type 32 | predictor <- opt$predictor 33 | n.repeats <- opt$n_repeats 34 | null_model <- opt$null_model 35 | ncores <- min(parallel::detectCores() - 1, 36 | opt$ncores, 37 | na.rm = TRUE) 38 | 39 | message(paste("\nPredicting", predictor, 40 | "in", cancer_type, 41 | ifelse(null_model, 42 | "(null model) ...", 43 | "..."))) 44 | message(paste("\nNumber of repeats set to", n.repeats)) 45 | message(paste("\nUsing", ncores, "out of", parallel::detectCores(), "cores")) 46 | 47 | initial.seed <- 12 48 | set.seed(initial.seed) 49 | 50 | seeds <- sample(1:10000, n.repeats) 51 | 52 | rep.count <- 1 53 | for(seed in seeds){ 54 | message(paste("\n\n#### REPEAT NUMBER", rep.count, "####\n\n")) 55 | system(paste("Rscript run_experiments.R", 56 | "--cancer_type", cancer_type, 57 | "--predictor", predictor, 58 | "--seed", seed, 59 | ifelse(null_model, 60 | "--null_model", 61 | ""), 62 | "--ncores", ncores)) 63 | rep.count <- rep.count + 1 64 | } 65 | 66 | 
system(paste("Rscript 3-combine_category_kappa.R", 67 | "--cancer_type", cancer_type, 68 | "--predictor", predictor, 69 | ifelse(null_model, 70 | "--null_model", 71 | ""))) 72 | -------------------------------------------------------------------------------- /combine_clinical_data.R: -------------------------------------------------------------------------------- 1 | # Script combines clinical data from all cancer types to one data frame 2 | # Clinical data includes: subtype and TP53/PIK3CA mutation status 3 | # Steven Foltz August 2021 4 | 5 | option_list <- list( 6 | optparse::make_option("--cancer_type", 7 | default = NA_character_, 8 | help = "Cancer type"), 9 | optparse::make_option("--clinical_input", 10 | default = NA_character_, 11 | help = "Clinical information input file path (.tsv)"), 12 | optparse::make_option("--mutation_input", 13 | default = NA_character_, 14 | help = "Mutation input file path (.tsv)"), 15 | optparse::make_option("--combined_output", 16 | default = NA_character_, 17 | help = "Combined subtype and mutation output file path (.tsv)"), 18 | optparse::make_option("--overwrite", 19 | action = "store_true", 20 | default = FALSE, 21 | help = "Overwrite existing output files [default: %default]") 22 | ) 23 | 24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 25 | source(here::here("util/option_functions.R")) 26 | check_options(opt) 27 | 28 | # load libraries 29 | suppressMessages(library(tidyverse)) 30 | 31 | # set options 32 | cancer_type <- opt$cancer_type 33 | clinical_input_filepath <- opt$clinical_input 34 | mutation_input_filepath <- opt$mutation_input 35 | combined_output_filepath <- opt$combined_output 36 | 37 | ################################################################################ 38 | # Read in clinical and mutation data 39 | ################################################################################ 40 | clinical_df <- read_tsv(clinical_input_filepath, # treat all columns equally 41 | 
col_types = cols(.default = col_character())) %>% 42 | mutate(Sample = substr(Sample, 1, 15)) # remove extra parts of TCGA ID 43 | 44 | mutation_df <- read_tsv(mutation_input_filepath, # treat all columns equally 45 | col_types = cols(.default = col_character())) %>% 46 | mutate(tcga_id = substr(tcga_id, 1, 15)) # remove extra parts of TCGA ID 47 | 48 | ################################################################################ 49 | # Combine clinical and mutation data 50 | ################################################################################ 51 | 52 | # combine data frames with left_join() to get the left side of venn diagram 53 | # start join with clinical_df because later scripts expect column name = Sample 54 | combined_df <- clinical_df %>% 55 | left_join(mutation_df, 56 | by = c("Sample" = "tcga_id")) %>% 57 | mutate(PIK3CA = case_when(PIK3CA == 0 ~ "No_PIK3CA_mutation", 58 | PIK3CA == 1 ~ "PIK3CA_mutation"), 59 | TP53 = case_when(TP53 == 0 ~ "No_TP53_mutation", 60 | TP53 == 1 ~ "TP53_mutation")) 61 | 62 | ################################################################################ 63 | # Save output file 64 | ################################################################################ 65 | 66 | write_tsv(combined_df, 67 | combined_output_filepath) 68 | -------------------------------------------------------------------------------- /data/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/data/.empty -------------------------------------------------------------------------------- /diagrams/RNA-seq_titration_ML_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/diagrams/RNA-seq_titration_ML_overview.png 
-------------------------------------------------------------------------------- /diagrams/RNA-seq_titration_diff_expression_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/diagrams/RNA-seq_titration_diff_expression_overview.png -------------------------------------------------------------------------------- /docker/R-3.6.3/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/tidyverse:3.6.3 2 | 3 | # Update apt-get and install other libraries 4 | RUN apt-get --allow-releaseinfo-change-suite update && apt-get install -y --no-install-recommends \ 5 | curl \ 6 | libbz2-dev \ 7 | libgdal-dev \ 8 | libgeos-dev \ 9 | libglpk40 \ 10 | liblzma-dev \ 11 | libmagick++-dev \ 12 | libproj-dev \ 13 | libudunits2-dev \ 14 | libxt-dev \ 15 | python3-pip \ 16 | python3-dev 17 | 18 | # Install pyrefinebio v0.3.4 19 | RUN pip3 install pyrefinebio==0.3.4 20 | 21 | # R packages 22 | RUN install2.r --error --deps TRUE \ 23 | ape \ 24 | binr \ 25 | caret \ 26 | cluster \ 27 | corrplot \ 28 | cowplot \ 29 | data.table \ 30 | devtools \ 31 | doParallel \ 32 | dplyr \ 33 | e1071 \ 34 | fastICA \ 35 | flexclust \ 36 | fpc \ 37 | gdata \ 38 | ggplot2 \ 39 | ggupset \ 40 | glmnet \ 41 | gridExtra \ 42 | here \ 43 | Hmisc \ 44 | huge \ 45 | jsonlite \ 46 | kernlab \ 47 | locfit \ 48 | optparse \ 49 | plyr \ 50 | ranger \ 51 | RColorBrewer \ 52 | reshape2 \ 53 | scales \ 54 | sdcMicro \ 55 | stringr \ 56 | styler \ 57 | viridis 58 | 59 | # R Bioconductor packages 60 | RUN Rscript -e "options(warn = 2); BiocManager::install(c( \ 61 | 'EnsDb.Hsapiens.v86' , \ 62 | 'ensembldb', \ 63 | 'limma', \ 64 | 'quantro'), \ 65 | update = FALSE, \ 66 | version = '3.10')" 67 | 68 | # Threading issue with preprocessCore::normalize.quantiles 69 | # https://support.bioconductor.org/p/122925/#124701 70 
| # https://github.com/bmbolstad/preprocessCore/issues/1#issuecomment-326756305 71 | RUN Rscript -e "options(warn = 2); BiocManager::install( \ 72 | 'preprocessCore', \ 73 | configure.args = '--disable-threading', \ 74 | update = FALSE)" 75 | 76 | # ref = b041807835d4076c5549356c86c44f087d713b1a is the pinned greenelab/TDM commit (this comment previously cited 341eb77105e7efd2654b4f112578648584936e06 as latest, retrieved 2021-05-28; TODO confirm which is intended) 77 | RUN Rscript -e "options(warn = 2); remotes::install_github( \ 78 | 'greenelab/TDM', ref = 'b041807835d4076c5549356c86c44f087d713b1a')" 79 | 80 | # ref = 08ed6b54e4efe5249107cb335cd8e169657cbc44 is wgmao/PLIER commit corresponding to v0.1.6 (retrieved 2021-11-09) 81 | RUN Rscript -e "options(warn = 2); remotes::install_github( \ 82 | 'wgmao/PLIER', ref = '08ed6b54e4efe5249107cb335cd8e169657cbc44')" 83 | -------------------------------------------------------------------------------- /docker/R-4.1.2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/tidyverse:4.1.2 2 | 3 | # Update apt-get and install other libraries 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | curl \ 6 | libfftw3-dev \ 7 | libbz2-dev \ 8 | libgdal-dev \ 9 | libgeos-dev \ 10 | libglpk40 \ 11 | liblzma-dev \ 12 | libmagick++-dev \ 13 | libproj-dev \ 14 | libudunits2-dev \ 15 | libxt-dev \ 16 | python3-pip \ 17 | python3-dev 18 | 19 | # Install pyrefinebio v0.3.4 20 | RUN pip3 install pyrefinebio==0.3.4 21 | 22 | # Install some Bioconductor packages dependencies 23 | RUN Rscript -e "options(warn = 2); BiocManager::install(c( \ 24 | 'DESeq2', \ 25 | 'EnsDb.Hsapiens.v86' , \ 26 | 'ensembldb', \ 27 | 'GenomicRanges', \ 28 | 'GenomeInfoDb', \ 29 | 'IRanges', \ 30 | 'limma', \ 31 | 'MAST', \ 32 | 'monocle', \ 33 | 'multtest', \ 34 | 'quantro', \ 35 | 'qvalue', \ 36 | 'rtracklayer', \ 37 | 'S4Vectors', \ 38 | 'SingleCellExperiment', \ 39 | 'SummarizedExperiment'), \ 40 | update = FALSE, \ 41 | version = 3.14)" 42 | 43 | # R packages 44 | RUN install2.r --error --deps TRUE 
\ 45 | ape \ 46 | binr \ 47 | caret \ 48 | cluster \ 49 | corrplot \ 50 | cowplot \ 51 | data.table \ 52 | devtools \ 53 | doParallel \ 54 | dplyr \ 55 | e1071 \ 56 | fastICA \ 57 | flexclust \ 58 | fpc \ 59 | gdata \ 60 | ggplot2 \ 61 | ggupset \ 62 | glmnet \ 63 | gridExtra \ 64 | here \ 65 | Hmisc \ 66 | huge \ 67 | jsonlite \ 68 | kernlab \ 69 | locfit \ 70 | optparse \ 71 | plyr \ 72 | ranger \ 73 | RColorBrewer \ 74 | reshape2 \ 75 | scales \ 76 | sdcMicro \ 77 | Seurat \ 78 | stringr \ 79 | styler \ 80 | viridis 81 | 82 | # Threading issue with preprocessCore::normalize.quantiles 83 | # https://support.bioconductor.org/p/122925/#124701 84 | # https://github.com/bmbolstad/preprocessCore/issues/1#issuecomment-326756305 85 | RUN Rscript -e "options(warn = 2); BiocManager::install( \ 86 | 'preprocessCore', \ 87 | configure.args = '--disable-threading', \ 88 | force = TRUE, \ 89 | update = FALSE, \ 90 | version = 3.14)" 91 | 92 | # ref = b041807835d4076c5549356c86c44f087d713b1a is the pinned greenelab/TDM commit (this comment previously cited 341eb77105e7efd2654b4f112578648584936e06 as latest, retrieved 2021-05-28; TODO confirm which is intended) 93 | RUN Rscript -e "options(warn = 2); remotes::install_github( \ 94 | 'greenelab/TDM', ref = 'b041807835d4076c5549356c86c44f087d713b1a')" 95 | 96 | # ref = 08ed6b54e4efe5249107cb335cd8e169657cbc44 is wgmao/PLIER commit corresponding to v0.1.6 (retrieved 2021-11-09) 97 | RUN Rscript -e "options(warn = 2); remotes::install_github( \ 98 | 'wgmao/PLIER', ref = '08ed6b54e4efe5249107cb335cd8e169657cbc44')" 99 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Using Docker with this project 2 | 3 | Two Docker images are available from `envest/rnaseq_titration_results`. 4 | They are highly similar but are based on different versions of R. 5 | 6 | ### R-4.1.2 version 7 | 8 | We recommend using this version for running any analysis. 
9 | This version is maintained and will be the one to get updated in the future. 10 | 11 | To pull this image, use the tag `R-4.1.2`: 12 | 13 | ``` 14 | docker pull envest/rnaseq_titration_results:R-4.1.2 15 | ``` 16 | 17 | ### R-3.6.3 version 18 | 19 | We also have an image based on R version 3.6.3. 20 | This image is more representative of the development environment used in earlier (pre-2022) iterations of this analysis and we retain it for posterity. 21 | 22 | :warning: We do _not_ recommend using this version for running analysis since recent code updates have changed some behaviors under older versions of R and corresponding package versions. 23 | 24 | To pull this image, use the tag `R-3.6.3`: 25 | 26 | ``` 27 | docker pull envest/rnaseq_titration_results:R-3.6.3 28 | ``` 29 | -------------------------------------------------------------------------------- /download_TCGA_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # change to the directory of this script 5 | cd "$(dirname "${BASH_SOURCE[0]}")" 6 | 7 | # set data directory 8 | data="data" 9 | mkdir -p $data 10 | 11 | # download BRCA array and seq data from URLs 12 | wget -nc -i brca_data_urls.txt '--directory-prefix='$data 13 | 14 | # Obtain TCGA data freeze manifest file 15 | # See here for more info: https://gdc.cancer.gov/about-data/publications/pancanatlas 16 | manifest_url="https://gdc.cancer.gov/files/public/file/PanCan-General_Open_GDC-Manifest_2.txt" 17 | manifest_basename=$(basename $manifest_url) 18 | if [ -f $data/$manifest_basename ]; then 19 | echo TCGA file $data/$manifest_basename already exists and was not overwritten. 
20 | else 21 | echo Downloading $manifest_basename 22 | curl -o $data/$manifest_basename --silent $manifest_url 23 | fi 24 | 25 | # download specific files from TCGA manifest 26 | mutations="mc3.v0.2.8.PUBLIC.maf.gz" 27 | copy_number="broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg" 28 | clinical="TCGA-CDR-SupplementalTableS1.xlsx" 29 | rnaseq="EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv" 30 | 31 | filename_array=($mutations \ 32 | $copy_number \ 33 | $clinical \ 34 | $rnaseq) 35 | 36 | for filename in ${filename_array[@]}; do 37 | if [ -f $data/$filename ]; then 38 | echo TCGA file $data/$filename already exists and was not overwritten. 39 | else 40 | echo Downloading $filename 41 | id=$(grep -w $filename $data/$manifest_basename | cut -f1) 42 | curl -o $data/$filename https://api.gdc.cancer.gov/data/$id 43 | fi 44 | done 45 | 46 | # get TCGA array expression using refine.bio client 47 | # GSE83130 GBM 48 | for accession in GSE83130; do 49 | if [ -d $data/$accession ]; then 50 | echo refine.bio download for $accession already exists and was not overwritten. 51 | else 52 | echo Downloading $accession 53 | refinebio create-token -s 54 | refinebio download-dataset \ 55 | --email-address steven.foltz@ccdatalab.org \ 56 | --path $data/$accession\.zip \ 57 | --experiments $accession \ 58 | --aggregation EXPERIMENT \ 59 | --transformation NONE \ 60 | --skip-quantile-normalization True 61 | unzip -d $data/$accession $data/$accession\.zip && rm -f $data/$accession\.zip 62 | fi 63 | done 64 | 65 | # download TCGA GBM clinical data including subtypes 66 | # Publication: Brennan, C. W. et al. The somatic genomic landscape of glioblastoma. 
Cell 155, 462–477 (2013) 67 | # Link to paper: https://doi.org/10.1016/j.cell.2013.09.034 68 | gbm_clinical_link="https://www.cell.com/cms/10.1016/j.cell.2013.09.034/attachment/9cefc2e8-caac-4225-bcdd-70f105ccf568/mmc7.xlsx" 69 | if [ -f $data/gbm_clinical_table_S7.xlsx ]; then 70 | echo GBM clinical spreadsheet $data/gbm_clinical_table_S7.xlsx already exists and was not overwritten. 71 | else 72 | wget -O $data/gbm_clinical_table_S7.xlsx $gbm_clinical_link 73 | fi 74 | 75 | # modify BRCA clinical file column PAM50 to be subtype 76 | sed -i 's/PAM50/subtype/' $data/BRCAClin.tsv 77 | 78 | # process GBM data via script 79 | echo Processing GBM data ... 80 | Rscript prepare_GBM_data.R \ 81 | --seq_input $data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv \ 82 | --array_input $data/GSE83130/GSE83130/GSE83130.tsv \ 83 | --metadata_input $data/GSE83130/aggregated_metadata.json \ 84 | --array_output $data/GBMarray.pcl \ 85 | --seq_output $data/GBMRNASeq.pcl \ 86 | --clinical_input $data/gbm_clinical_table_S7.xlsx \ 87 | --clinical_output $data/GBMClin.tsv 88 | 89 | # retrieve BRCA and GBM mutations in PIK3CA and TP53 from TCGA MC3 90 | # output is stored in data/mutations.* TSV and MAF files 91 | echo Retrieving mutation data from MC3 for BRCA and GBM ... 92 | python3 retrieve_MC3_mutations.py 93 | 94 | # combine clinical and mutation data into one data frame 95 | echo Combining clinical and mutation data for BRCA ... 96 | Rscript combine_clinical_data.R \ 97 | --cancer_type BRCA \ 98 | --clinical_input $data/BRCAClin.tsv \ 99 | --mutation_input $data/mutations.BRCA.tsv \ 100 | --combined_output $data/combined_clinical_data.BRCA.tsv 101 | echo Combining clinical and mutation data for GBM ... 
102 | Rscript combine_clinical_data.R \ 103 | --cancer_type GBM \ 104 | --clinical_input $data/GBMClin.tsv \ 105 | --mutation_input $data/mutations.GBM.tsv \ 106 | --combined_output $data/combined_clinical_data.GBM.tsv 107 | 108 | # check md5 sums of downloaded files 109 | echo Checking md5 sums of downloaded files ... 110 | md5sum --check --quiet check_sums.tsv 111 | echo All data files match expected md5 sums! 112 | 113 | # get BRCA array expression data from TCGA Legacy Archive 114 | # data/gdc_legacy_archive_brca_manifest.txt obtained from https://portal.gdc.cancer.gov/legacy-archive 115 | # with search parameters 116 | # Cases 117 | # Cancer Program = TCGA 118 | # Project = TCGA-BRCA 119 | # Files 120 | # Data Category = Raw microarray data 121 | # Data Type = Raw intensities 122 | # Experimental Strategy = Gene expression array 123 | ################################################################################ 124 | # UNCOMMENT TO DOWNLOAD TCGA LEGACY ARCHIVE BRCA EXPRESSION ARRAY DATA 125 | # Need to rebuild docker image with gdc-client uncommented 126 | #brca_array_dir=$data/BRCA_array 127 | #if [ -d $brca_array_dir ]; then 128 | # echo TCGA Legacy Archive data for BRCA already exists and was not overwritten. 
129 | #else 130 | # mkdir -p $brca_array_dir 131 | # gdc-client download --manifest gdc_legacy_archive_brca_manifest.txt --dir $brca_array_dir 132 | #fi 133 | ################################################################################ 134 | -------------------------------------------------------------------------------- /load_packages.R: -------------------------------------------------------------------------------- 1 | library(ape) 2 | library(binr) 3 | library(caret) 4 | library(cluster) 5 | library(corrplot) 6 | library(cowplot) 7 | library(data.table) 8 | library(devtools) 9 | library(doParallel) 10 | library(e1071) 11 | library(fastICA) 12 | library(flexclust) 13 | library(fpc) 14 | library(gdata) 15 | library(glmnet) 16 | library(gridExtra) 17 | library(Hmisc) 18 | library(huge) 19 | library(kernlab) 20 | library(limma) 21 | library(parallel) 22 | library(PLIER) 23 | library(plyr) 24 | library(preprocessCore) 25 | library(quantro) 26 | library(ranger) 27 | library(reshape2) 28 | library(scales) 29 | library(sdcMicro) 30 | library(TDM) 31 | library(tidyverse) 32 | -------------------------------------------------------------------------------- /models/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/models/.empty -------------------------------------------------------------------------------- /normalized_data/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/normalized_data/.empty -------------------------------------------------------------------------------- /normalized_data/reconstructed_data/.empty: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/normalized_data/reconstructed_data/.empty -------------------------------------------------------------------------------- /plots/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/.empty -------------------------------------------------------------------------------- /plots/data/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/data/.empty -------------------------------------------------------------------------------- /plots/main/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/main/.empty -------------------------------------------------------------------------------- /plots/scripts/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/scripts/.empty -------------------------------------------------------------------------------- /plots/scripts/0-plot_predictor_category_distributions.R: -------------------------------------------------------------------------------- 1 | # S. 
Foltz Feb 2022 2 | # This plots the predictor category distribution for each seed 3 | 4 | option_list <- list( 5 | optparse::make_option("--cancer_type", 6 | default = NA_character_, 7 | help = "Cancer type"), 8 | optparse::make_option("--predictor", 9 | default = NA_character_, 10 | help = "Predictor used"), 11 | optparse::make_option("--null_model", 12 | action = "store_true", 13 | default = FALSE, 14 | help = "Use null model input data"), 15 | optparse::make_option("--plot_all_seeds", 16 | action = "store_true", 17 | default = FALSE, 18 | help = "Plot all seeds instead of representative seed"), 19 | optparse::make_option("--output_directory", 20 | default = NA_character_, 21 | help = "Output directory for plot (absolute or relative path)") 22 | ) 23 | 24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 25 | source(here::here("util/option_functions.R")) 26 | check_options(opt) 27 | 28 | # load libraries 29 | suppressMessages(library(tidyverse)) 30 | source(here::here("util/color_blind_friendly_palette.R")) 31 | 32 | # set options 33 | cancer_type <- opt$cancer_type 34 | predictor <- opt$predictor 35 | null_model <- opt$null_model 36 | plot_all_seeds <- opt$plot_all_seeds 37 | file_identifier <- ifelse(null_model, 38 | str_c(cancer_type, predictor, "null", sep = "_"), 39 | str_c(cancer_type, predictor, sep = "_")) 40 | 41 | # define directories 42 | plot.dir <- here::here("plots") 43 | plot.data.dir <- here::here("plots/data") 44 | output_directory <- opt$output_directory 45 | 46 | # list potential input files 47 | input_files <- list.files(path = plot.data.dir, 48 | pattern = paste0(file_identifier, 49 | ".dist_split_stacked_bar."), 50 | full.names = TRUE) 51 | 52 | # define input data 53 | if (plot_all_seeds) { # read in all seed data to one data frame 54 | plot_df <- input_files %>% 55 | map(read_tsv, 56 | col_types = "ccccc") %>% 57 | reduce(rbind) 58 | } else { # default 59 | plot_df <- read_tsv(input_files[1], 60 | col_types = 
"ccccc") 61 | initial_seed <- plot_df %>% 62 | pull(initial_seed) %>% 63 | unique() 64 | } 65 | 66 | # define output file 67 | if (plot_all_seeds) { 68 | category.distribution.plot <- file.path(output_directory, 69 | paste0(file_identifier, 70 | ".dist_split_stacked_bar.", 71 | "all_seeds", 72 | ".pdf")) 73 | } else { # default 74 | category.distribution.plot <- file.path(output_directory, 75 | paste0(file_identifier, 76 | ".dist_split_stacked_bar.", 77 | initial_seed, 78 | ".pdf")) 79 | } 80 | 81 | # plot 82 | 83 | plot_obj <- plot_df %>% 84 | ggplot(aes(x = fct_rev(split), 85 | fill = category)) + 86 | geom_bar() + 87 | scale_fill_manual(values = cbPalette) + 88 | labs(x = "Split", 89 | y = "Count", 90 | title = str_replace(file_identifier, 91 | pattern = "_", 92 | replacement = " "), 93 | fill = "Predictor") + 94 | theme_bw(base_size = 10) 95 | 96 | if (plot_all_seeds) { 97 | plot_obj <- plot_obj + 98 | coord_flip() + 99 | facet_wrap(~ initial_seed, 100 | ncol = 5) 101 | ggsave(filename = category.distribution.plot, 102 | plot = plot_obj, 103 | height = 3, 104 | width = 7.25) 105 | } else { # default 106 | ggsave(filename = category.distribution.plot, 107 | plot = plot_obj, 108 | height = 3, 109 | width = 3.5) 110 | } 111 | -------------------------------------------------------------------------------- /plots/scripts/1A-plot_DEGs.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Feb 2016, S. Foltz Feb 2022 2 | # This script compares differential expression "silver standards" which are 3 | # differential expression analysis results from standard pipelines (i.e., 4 | # log transformed 100% array data and RSEM counts 100% RNA-seq [processed with 5 | # limma::voom]) with differential expression results from RNA-seq titrated data 6 | # (0-100%) normalized various ways. 
7 | # 8 | # Plot the proportion of genes that are differentially expressed between conditions 9 | # Plot similarity of DEGs to silver standards (subtype vs. others only) 10 | # 11 | # USAGE: Rscript 1A-plot_DEGs.R --cancer_type --subtype_vs_others --subtype_vs_subtype 12 | 13 | option_list <- list( 14 | optparse::make_option("--cancer_type", 15 | default = NA_character_, 16 | help = "Cancer type"), 17 | optparse::make_option("--subtype_vs_others", 18 | help = "Subtype used for comparison against all others."), 19 | optparse::make_option("--subtype_vs_subtype", 20 | help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"), 21 | optparse::make_option("--proportion_output_directory", 22 | help = "Output directory of DEG proportion plot. Include this option to plot DEG proportion plot."), 23 | optparse::make_option("--overlap_output_directory", 24 | help = "Output directory of DEG overlap plot. Include this option to plot silver standard overlap plot."), 25 | optparse::make_option("--overlap_measure", 26 | help = "Which overlap measures to include in silver standard overlap plot (comma-separated without space e.g. 
Jaccard,Rand,Spearman; must be one or more of Jaccard, Rand, Spearman)") 27 | ) 28 | 29 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 30 | source(here::here("util/option_functions.R")) 31 | check_options(opt) 32 | 33 | # at least one of --subtype_vs_others or --subtype_vs_subtype should be given 34 | if (any(c("subtype_vs_others", "subtype_vs_subtype") %in% names(opt))) { 35 | 36 | subtype_vs_others <- NA # first assume option is not provided 37 | subtype_vs_subtype <- NA # then update as available below 38 | 39 | if ("subtype_vs_others" %in% names(opt)) { 40 | subtype_vs_others <- opt$subtype_vs_others 41 | } 42 | 43 | if ("subtype_vs_subtype" %in% names(opt)) { 44 | subtype_vs_subtype <- opt$subtype_vs_subtype 45 | } 46 | 47 | } else { 48 | stop(" Errors: must include --subtype_vs_others and/or --subtype_vs_subtype in plots/scripts/1A-plot_DEGs.R.\n") 49 | } 50 | 51 | # at least one of --proportion_output_directory or --overlap_output_directory should be given 52 | if (any(c("proportion_output_directory", "overlap_output_directory") %in% names(opt))) { 53 | 54 | proportion_output_directory <- NA # first assume option is not provided 55 | overlap_output_directory <- NA # then update as available below 56 | plot_proportion <- plot_overlap <- FALSE # plot nothing unless its output directory is given; both flags are tested later and must always exist 57 | if ("proportion_output_directory" %in% names(opt)) { 58 | proportion_output_directory <- opt$proportion_output_directory 59 | plot_proportion <- TRUE 60 | 61 | } 62 | 63 | if ("overlap_output_directory" %in% names(opt)) { 64 | overlap_output_directory <- opt$overlap_output_directory 65 | plot_overlap <- TRUE 66 | 67 | # check that overlap measures requested are the ones present in data 68 | if ("overlap_measure" %in% names(opt)) { 69 | overlap_measures <- sort(stringr::str_split(opt$overlap_measure, 70 | pattern = ",", simplify = TRUE)) 71 | 72 | if (!all(overlap_measures %in% c("Jaccard", "Rand", "Spearman"))) { 73 | stop(" Errors: --overlap_measure must be one or more of Jaccard, Rand, Spearman in 
plots/scripts/1A-plot_DEGs.R.\n") 74 | } 75 | 76 | } else { 77 | stop(" Errors: must include --overlap_measure with --overlap_output_directory in plots/scripts/1A-plot_DEGs.R.\n") 78 | } 79 | } 80 | 81 | } else { 82 | stop(" Errors: must include --proportion_output_directory and/or --overlap_output_directory in plots/scripts/1A-plot_DEGs.R.\n") 83 | } 84 | 85 | # load libraries 86 | suppressMessages(library(tidyverse)) 87 | source(here::here("util/color_blind_friendly_palette.R")) 88 | source(here::here("util", "differential_expression_functions.R")) 89 | 90 | # set options 91 | cancer_type <- opt$cancer_type 92 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here 93 | 94 | # define directories 95 | plot.dir <- here::here("plots") 96 | plot.data.dir <- file.path(plot.dir, "data") 97 | 98 | #### functions ----------------------------------------------------------------- 99 | 100 | plot_DEG_proportions <- function(subtypes){ 101 | 102 | subtypes_path <- str_c(subtypes, collapse = "v") 103 | subtypes_nice <- str_c(subtypes, collapse = " vs. 
") 104 | 105 | input_filename <- file.path( 106 | plot.data.dir, 107 | paste0(file_identifier, "_titration_differential_exp_eBayes_fits_", 108 | subtypes_path, ".propDE.tsv")) 109 | 110 | output_filename <- file.path( 111 | proportion_output_directory, 112 | paste0(file_identifier, "_differential_expr_proportion_lt5_", 113 | subtypes_path, ".pdf")) 114 | 115 | propDEG_df <- read_tsv(input_filename, 116 | col_types = "dcd") %>% 117 | mutate(perc.seq = factor(perc.seq, 118 | levels = seq(0, 100, 10))) 119 | 120 | # plot proportion of genes that are diff expressed 121 | plot_obj <- PlotProportionDE(propDEG_df, 122 | subtypes = subtypes_nice, 123 | cancer_type = cancer_type) 124 | 125 | ggsave(output_filename, 126 | plot = plot_obj, 127 | width = 7.25, 128 | height = 4) 129 | 130 | } 131 | 132 | plot_silver_overlap <- function(subtypes){ 133 | 134 | subtypes_path <- str_c(subtypes, collapse = "v") 135 | subtypes_nice <- str_c(subtypes, collapse = " vs. ") 136 | measures_path <- str_c(overlap_measures, collapse = "_") 137 | 138 | input_filename <- file.path( 139 | plot.data.dir, 140 | paste0(file_identifier, "_titration_differential_exp_eBayes_fits_", 141 | subtypes_path, ".silver.tsv")) 142 | 143 | output_filename <- file.path( 144 | overlap_output_directory, 145 | paste0(file_identifier, "_silver_standard_similarity_lt5_", 146 | measures_path, "_", subtypes_path, ".pdf")) 147 | 148 | silver_df <- read_tsv(input_filename, 149 | col_types = "cdccd") %>% 150 | mutate(Perc.Seq = factor(Perc.Seq, 151 | levels = seq(0, 100, 10))) %>% 152 | filter(measure %in% overlap_measures) 153 | 154 | using_single_measure <- length(overlap_measures) == 1 155 | 156 | plot_obj <- PlotSilverStandardStats( 157 | silver_df, 158 | title = paste(cancer_type, subtypes_nice, " FDR < 5%"), 159 | single_measure = using_single_measure) 160 | 161 | ggsave( 162 | output_filename, 163 | plot = plot_obj, 164 | width = 7.25, 165 | height = c(3,4,5)[length(overlap_measures)] 166 | ) 167 | } 168 | 169 | 
#### plot Subtype v. Other results --------------------------------------------- 170 | 171 | if (!is.na(subtype_vs_others)) { 172 | 173 | subtypes <- c(subtype_vs_others, "Other") 174 | 175 | if (plot_proportion) { 176 | plot_DEG_proportions(subtypes) 177 | } 178 | 179 | if (plot_overlap) { 180 | plot_silver_overlap(subtypes) 181 | } 182 | } 183 | 184 | #### plot Subtype v. Subtype results ------------------------------------------- 185 | 186 | if (!is.na(subtype_vs_subtype)) { 187 | 188 | subtypes <- as.vector( 189 | stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE)) 190 | 191 | if (plot_proportion) { 192 | plot_DEG_proportions(subtypes) 193 | } 194 | 195 | if (plot_overlap) { 196 | plot_silver_overlap(subtypes) 197 | } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /plots/scripts/2A-plot_small_n_differential_expression.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Feb 2016, S. Foltz Feb 2022 2 | # With small n data, plot overlap measures (Jaccard, Rand, Spearman) of differential expression results against the number of samples per subtype 3 | # 4 | # USAGE: Rscript 2A-plot_small_n_differential_expression.R --cancer_type --subtype_vs_others --subtype_vs_subtype --output_directory --overlap_measure 5 | 6 | option_list <- list( 7 | optparse::make_option("--cancer_type", 8 | default = NA_character_, 9 | help = "Cancer type"), 10 | optparse::make_option("--subtype_vs_others", 11 | help = "Subtype used for comparison against all others."), 12 | optparse::make_option("--subtype_vs_subtype", 13 | help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. 
Type1,Type2)"), 14 | optparse::make_option("--output_directory", 15 | default = NA_character_, 16 | help = "Output directory of DEG overlap plot (absolute or relative path)."), 17 | optparse::make_option("--overlap_measure", 18 | default = NA_character_, 19 | help = "Which overlap measures to include in silver standard overlap plot (comma-separated without space e.g. Jaccard,Rand,Spearman; must be one or more of Jaccard, Rand, Spearman)") 20 | ) 21 | 22 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) 23 | source(here::here("util/option_functions.R")) 24 | check_options(opt) 25 | 26 | # at least one of --subtype_vs_others or --subtype_vs_subtype should be given 27 | if (any(c("subtype_vs_others", "subtype_vs_subtype") %in% names(opt))) { 28 | 29 | subtype_vs_others <- NA # first assume option is not provided 30 | subtype_vs_subtype <- NA # then update as available below 31 | 32 | if ("subtype_vs_others" %in% names(opt)) { 33 | subtype_vs_others <- opt$subtype_vs_others 34 | } 35 | 36 | if ("subtype_vs_subtype" %in% names(opt)) { 37 | subtype_vs_subtype <- opt$subtype_vs_subtype 38 | } 39 | 40 | } else { 41 | message(" Errors: must include --subtype_vs_others and/or --subtype_vs_subtype in plots/scripts/2A-plot_small_n_differential_expression.R.\n") 42 | stop() 43 | } 44 | 45 | # check that overlap measures requested are the ones present in data 46 | if ("overlap_measure" %in% names(opt)) { 47 | overlap_measures <- sort(stringr::str_split(opt$overlap_measure, 48 | pattern = ",", simplify = TRUE)) 49 | 50 | if (!all(overlap_measures %in% c("Jaccard", "Rand", "Spearman"))) { 51 | message(" Errors: --overlap_measure must be one or more of Jaccard, Rand, Spearman in plots/scripts/2A-plot_small_n_differential_expression.R.\n") 52 | stop() 53 | } 54 | } 55 | 56 | # load libraries 57 | suppressMessages(library(tidyverse)) 58 | source(here::here("util/color_blind_friendly_palette.R")) 59 | 60 | # set options 61 | cancer_type <- opt$cancer_type 
file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- file.path(plot.dir, "data")
output_directory <- opt$output_directory

#### functions -----------------------------------------------------------------

DataSummary <- function(x) {
  # This function is supplied to ggplot2::stat_summary in order to plot the
  # median value of a vector as a point and the "confidence interval on the
  # median" used in notched boxplots as a vertical line. See boxplot.stats for
  # more information.
  #
  # Input:
  #   x = numeric vector of values belonging to one plotted group
  # Output:
  #   named numeric vector c(y, ymin, ymax): the median and the lower/upper
  #   ends of boxplot.stats(x)$conf
  m <- median(x)
  conf <- boxplot.stats(x)$conf
  ymin <- min(conf)
  ymax <- max(conf)
  return(c(y = m, ymin = ymin, ymax = ymax))
}

plot_small_n <- function(subtypes){
  # This function creates a panel of line plots faceted by
  # % RNA-seq, the measure (Jaccard, Rand, Spearman), and normalization method
  #
  # Input:
  #   subtypes = character vector of the subtype labels being compared; used
  #              to build the input/output file names and the plot title
  # Reads from the calling script's environment: overlap_measures,
  #   plot.data.dir, file_identifier, output_directory, cancer_type, cbPalette
  # Output:
  #   a PDF of the plot is saved to output_directory

  subtypes_path <- str_c(subtypes, collapse = "v")
  subtypes_nice <- str_c(subtypes, collapse = " vs. ")
  measures_path <- str_c(overlap_measures, collapse = "_")

  input_filename <- file.path(plot.data.dir,
                              paste0(file_identifier,
                                     "_small_n_",
                                     subtypes_path,
                                     "_results.tsv"))

  output_filename <- file.path(
    output_directory,
    paste0(file_identifier, "_small_n_",
           measures_path, "_", subtypes_path, ".pdf"))

  using_single_measure <- length(overlap_measures) == 1

  # keep the 30/50/70% RNA-seq titration levels and the requested measures only
  stats_df <- read_tsv(input_filename,
                       col_types = "ccdcddc") %>%
    filter(seq_prop %in% str_c(c(30, 50, 70), "% RNA-seq"),
           !is.na(value),
           measure %in% overlap_measures) %>%
    mutate(no.samples = factor(no.samples))

  plot_obj <- ggplot(stats_df, aes(x = no.samples,
                                   y = value,
                                   color = platform)) +
    stat_summary(fun = median,
                 geom = "line",
                 aes(group = platform),
                 position = position_dodge(0.7),
                 show.legend = FALSE) +
    stat_summary(fun = median, # this makes the point size consistent with other plots
                 geom = "point",
                 aes(group = platform),
                 position = position_dodge(0.7),
                 show.legend = FALSE) +
    stat_summary(fun.data = DataSummary, # this adds the error bars without median points
                 geom = "linerange",
                 aes(group = platform),
                 position = position_dodge(0.7),
                 show.legend = FALSE) +
    expand_limits(y = c(0,1)) +
    scale_y_continuous(breaks = seq(0, 1, 0.25)) +
    theme_bw() +
    labs(x = "Number of Samples from Each Subtype",
         y = ifelse(using_single_measure,
                    unique(stats_df$measure),
                    "Measure of Similarity"),
         title = str_c("Small n Experiment: ", paste(cancer_type, subtypes_nice, "FDR < 10%"))) +
    scale_colour_manual(values = cbPalette[c(2, 3)])

  # with one measure the facet rows are normalization methods only
  if (using_single_measure) {
    plot_obj <- plot_obj +
      facet_grid(normalization ~ seq_prop)
  } else {
    plot_obj <- plot_obj +
      facet_grid(measure + normalization ~ seq_prop)
  }

  # plot height is 3, 4, or 5 for one, two, or three measures
  ggsave(filename = output_filename,
         plot = plot_obj,
         width = 7.25,
         height = c(3,4,5)[length(overlap_measures)])
}

#### plot Subtype v. Other results ---------------------------------------------

if (!is.na(subtype_vs_others)) {

  subtypes <- c(subtype_vs_others, "Other")
  plot_small_n(subtypes)
}

#### plot Subtype v. Subtype results -------------------------------------------

if (!is.na(subtype_vs_subtype)) {

  subtypes <- as.vector(
    stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE))
  plot_small_n(subtypes)
}

--------------------------------------------------------------------------------
/plots/scripts/3-plot_category_kappa.R:
--------------------------------------------------------------------------------
# S. Foltz Feb 2022
# This plots kappa values from category prediction

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Use delta kappa input data"),
  optparse::make_option("--output_directory",
                        default = NA_character_,
                        help = "Output directory for plot (absolute or relative path)"),
  optparse::make_option("--include_seurat",
                        action = "store_true",
                        default = FALSE,
                        help = "Include Seurat results in plot (default: FALSE)")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util/color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- str_c(cancer_type, predictor, sep = "_")
include_seurat <- opt$include_seurat

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- here::here("plots/data")
output_directory <- opt$output_directory

# define input file
input_filename <- ifelse(null_model,
                         file.path(plot.data.dir,
                                   paste0(file_identifier,
                                          "_train_3_models_delta_kappa.tsv")),
                         file.path(plot.data.dir,
                                   paste0(file_identifier,
                                          "_train_3_models_kappa.tsv")))

# define output files
output_filename <- file.path(output_directory,
                             ifelse(null_model,
                                    paste0(file_identifier,
                                           "_train_3_models_delta_kappa.pdf"),
                                    paste0(file_identifier,
                                           "_train_3_models_kappa.pdf")))

# read in data (parse the file once; the summary below is derived from it)
kappa_df <- read_tsv(input_filename,
                     col_types = "dddddccc") %>%
  mutate(Perc.Seq = factor(Perc.Seq,
                           levels = seq(0, 100, 10)))

# median kappa per setting with the notched-boxplot style interval
# median +/- 1.58 * IQR / sqrt(n)
median_df <- kappa_df %>%
  group_by(Perc.Seq, Platform, Classifier, Normalization) %>%
  summarize(n_obs = n(),
            med = median(Kappa),
            IQR = quantile(Kappa, 0.75) - quantile(Kappa, 0.25),
            median_ci_upper = med + 1.58*IQR/sqrt(n_obs),
            median_ci_lower = med - 1.58*IQR/sqrt(n_obs),
            .groups = "drop")

# default behavior: exclude (!include) seurat results
if (!include_seurat) {
  median_df <- median_df %>%
    filter(Normalization != "SEURAT") %>%
    droplevels()
  kappa_df <- kappa_df %>%
    filter(Normalization != "SEURAT") %>%
    droplevels()
}

# plot

plot_obj <- ggplot(median_df,
                   aes(x = Perc.Seq,
                       y = med, # median
                       color = Platform,
                       fill = Platform)) +
  facet_grid(rows = vars(Classifier),
             cols = vars(Normalization)) +
  geom_errorbar(aes(x = Perc.Seq,
                    ymin = median_ci_lower,
                    ymax = median_ci_upper),
                size = 0.25,
                width = 0.5,
                position = position_dodge(0.7)) +
  geom_line(aes(group = Platform),
            size = 0.5,
            position = position_dodge(0.7)) +
  geom_point(shape = 16,
             size = 0.5,
             show.legend = FALSE,
             position = position_dodge(0.7)) +
  geom_point(data = kappa_df, # individual kappa values behind the medians
             aes(x = Perc.Seq,
                 y = Kappa,
                 color = Platform,
                 fill = Platform),
             alpha = 0.5,
             size = 0.25,
             shape = 16,
             position = position_dodge(0.7),
             show.legend = FALSE) +
  expand_limits(y = 1) +
  scale_x_discrete(labels = c("0", "", "", "", "",
                              "50", "", "", "", "",
                              "100")) +
  labs(x = "% RNA-seq Samples in Training Data",
       color = "Test Data Platform",
       fill = "Test Data Platform",
       y = ifelse(null_model,
                  "Delta Kappa",
                  "Kappa"),
       title = str_c(cancer_type, predictor, sep = " ")) +
  theme_bw() +
  scale_fill_manual(values = cbPalette[2:3]) +
  scale_colour_manual(values = cbPalette[2:3]) +
  theme(legend.position = "bottom",
        panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_line(size = 0.25),
        strip.text.y = element_text(size = 7))

ggsave(output_filename,
       plot = plot_obj,
       height = 4,
       width = 7.25)

--------------------------------------------------------------------------------
/plots/scripts/6-plot_recon_error.R:
--------------------------------------------------------------------------------
# S. Foltz Feb 2022
# This plots reconstruction error from PCA reconstruction

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--output_directory",
                        default = NA_character_,
                        help = "Output directory for plot (absolute or relative path)")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util/color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- here::here("plots/data")
output_directory <- opt$output_directory

# define input file
input_filename <- file.path(plot.data.dir,
                            paste0(file_identifier,
                                   "_reconstruction_error.tsv"))

# define output files
output_filename <- file.path(output_directory,
                             paste0(file_identifier,
                                    "_reconstruction_error.pdf"))

# read in data

plot_df <- readr::read_tsv(input_filename,
                           col_types = "cdcccd") %>%
  mutate(Perc.seq = factor(Perc.seq,
                           levels = seq(0, 100, 10))) %>%
  filter(Mean_Value != Inf) # drop infinite errors (rows with NA are dropped too)

# for each normalization method, plot error stats
plot_obj <- ggplot(plot_df,
                   aes(x = Perc.seq,
                       y = Mean_Value,
                       color = Platform,
                       fill = Platform)) +
  facet_wrap(~ Normalization,
             ncol = 4,
             scales = "free_y") +
  geom_violin(position = position_dodge(0.7),
              alpha = 0.25,
              show.legend = FALSE) +
  stat_summary(fun = median,
               geom = "line",
               aes(group = Platform),
               position = position_dodge(0.7)) +
  stat_summary(fun = median,
               geom = "point",
               aes(group = Platform),
               position = position_dodge(0.7),
               size = 1,
               shape = 16) +
  expand_limits(y = 0) +
  scale_x_discrete(labels = c("0", "", "", "", "",
                              "50", "", "", "", "",
                              "100")) +
  labs(x = "% RNA-seq Samples in Training Data",
       color = "Test Data Platform",
       fill = "Test Data Platform",
       y = "MASE (per gene)",
       title = str_c("PCA reconstruction error of",
                     cancer_type, predictor, sep = " ")) +
  theme_bw() +
  scale_colour_manual(values = cbPalette[2:3]) +
  theme(legend.position = "bottom")

ggsave(output_filename,
       plot = plot_obj,
       height = 4,
       width = 7.25)

--------------------------------------------------------------------------------
/plots/scripts/6-plot_recon_kappa.R:
--------------------------------------------------------------------------------
# S. Foltz Feb 2022
# This plots reconstruction kappa values from PCA reconstruction

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--output_directory",
                        default = NA_character_,
                        help = "Output directory for plot (absolute or relative path)")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util/color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- here::here("plots/data")
output_directory <- opt$output_directory

# define input file
input_filename <- file.path(plot.data.dir,
                            paste0(file_identifier,
                                   "_kappa_reconstructed_data.tsv"))

# define output files
output_filename <- file.path(output_directory,
                             paste0(file_identifier,
                                    "_kappa_reconstructed.pdf"))

# read in data (parse the file once; the summary below is derived from it)

kappa_df <- readr::read_tsv(input_filename,
                            col_types = "dcccdcc") %>%
  mutate(Perc.seq = factor(Perc.seq,
                           levels = seq(0, 100, 10)))

# NOTE(review): plots/scripts/recon_kappa_difference.R filters this same file
# with Reconstruction == "PCA" and Measure == "kappa" before taking medians;
# no such filter is applied here. Confirm the file only holds those rows.
median_df <- kappa_df %>%
  group_by(Perc.seq, Platform, Classifier, Normalization) %>%
  summarize(n_obs = n(),
            med = median(Kappa),
            IQR = quantile(Kappa, 0.75) - quantile(Kappa, 0.25),
            median_ci_upper = med + 1.58*IQR/sqrt(n_obs),
            median_ci_lower = med - 1.58*IQR/sqrt(n_obs),
            .groups = "drop")

# for each normalization method, plot kappa stats
plot_obj <- ggplot(median_df,
                   aes(x = Perc.seq,
                       y = med, # median
                       color = Platform,
                       fill = Platform)) +
  facet_grid(rows = vars(Classifier),
             cols = vars(Normalization)) +
  geom_errorbar(aes(x = Perc.seq,
                    ymin = median_ci_lower,
                    ymax = median_ci_upper),
                size = 0.25,
                width = 0.5,
                position = position_dodge(0.7)) +
  geom_line(aes(group = Platform),
            size = 0.5,
            position = position_dodge(0.7)) +
  geom_point(shape = 16,
             size = 0.5,
             show.legend = FALSE,
             position = position_dodge(0.7)) +
  geom_point(data = kappa_df, # individual kappa values behind the medians
             aes(x = Perc.seq,
                 y = Kappa,
                 color = Platform,
                 fill = Platform),
             alpha = 0.5,
             size = 0.25,
             shape = 16,
             position = position_dodge(0.7),
             show.legend = FALSE) +
  expand_limits(y = 1) +
  scale_x_discrete(labels = c("0", "", "", "", "",
                              "50", "", "", "", "",
                              "100")) +
  labs(x = "% RNA-seq Samples in Training Data",
       color = "Test Data Platform",
       fill = "Test Data Platform",
       y = "Kappa",
       title = str_c("PCA reconstruction of",
                     cancer_type, predictor, sep = " ")) +
  theme_bw() +
  scale_colour_manual(values = cbPalette[2:3]) +
  theme(legend.position = "bottom",
        panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_line(size = 0.25),
        strip.text.y = element_text(size = 7))

ggsave(output_filename,
       plot = plot_obj,
       height = 4,
       width = 7.25)

--------------------------------------------------------------------------------
/plots/scripts/7-plot_plier_pathways.R:
--------------------------------------------------------------------------------
# S. Foltz Feb 2022
# This plots the rate of return for significant PLIER pathways
# for data coming from different normalization methods and titration levels.

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--output_directory",
                        default = NA_character_,
                        help = "Save plot to this directory")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util/color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
output_directory <- opt$output_directory
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- here::here("plots/data")

# define input file

plot_data_filename <- file.path(
  plot.data.dir,
  str_c(file_identifier, "_PLIER_jaccard.tsv")
)

# define output files

output_filename <- file.path(output_directory,
                             str_c(file_identifier,
                                   "_PLIER_jaccard.pdf"))

# sample size levels
sample_size_levels <- c("Single Platform\n(half sample size)",
                        "Combined Array and RNA-seq (full sample size)",
                        "Single Platform\n(full sample size)")

# Read in data
# sample_size places each row in a facet: array_only/seq_only rows are the
# half-sample-size single platform panel, 0% and 100% RNA-seq rows are the
# full-sample-size single platform panel, and everything else is combined.
# nmeth is relabeled in the same case order for the x axis.
jaccard_df <- read_tsv(plot_data_filename,
                       col_types = "dddddcdcddl") %>%
  mutate(sample_size = case_when(nmeth == "array_only" ~ sample_size_levels[1],
                                 nmeth == "seq_only" ~ sample_size_levels[1],
                                 pseq == 0 ~ sample_size_levels[3],
                                 pseq == 100 ~ sample_size_levels[3],
                                 TRUE ~ sample_size_levels[2]),
         sample_size = factor(sample_size,
                              levels = sample_size_levels,
                              ordered = TRUE),
         nmeth = case_when(nmeth == "array_only" ~ "LOG\nArray",
                           nmeth == "seq_only" ~ "LOG\nRNA-seq",
                           pseq == 0 ~ "LOG\nArray",
                           pseq == 100 ~ "LOG\nRNA-seq",
                           TRUE ~ str_to_upper(nmeth)))
# Plot results
set.seed(1) # using jitter

plot_obj <- jaccard_df %>%
  ggplot(aes(x = nmeth,
             y = jaccard)) +
  geom_violin(draw_quantiles = .5,
              scale = "width") +
  geom_jitter(shape = 16,
              alpha = 0.5,
              height = 0,
              width = 0.1) +
  expand_limits(y = 0) +
  facet_grid(. ~ sample_size,
             scales = "free_x",
             space='free') +
  ggtitle(cancer_type) +
  xlab("Normalization Method") +
  ylab("Proportion of Pathways Significant") +
  theme_bw()

ggsave(output_filename,
       plot = plot_obj,
       height = 4, width = 7.25)

--------------------------------------------------------------------------------
/plots/scripts/recon_kappa_difference.R:
--------------------------------------------------------------------------------
# S. Foltz Mar 2022
# This compares kappa values from category prediction with/out reconstruction

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--output_directory",
                        default = NA_character_,
                        help = "Output directory for plot (absolute or relative path)")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util/color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- "subtype"
file_identifier <- str_c(cancer_type, predictor, sep = "_")

# define directories
plot.dir <- here::here("plots")
plot.data.dir <- here::here("plots/data")
output_directory <- opt$output_directory

# define input file
without_recon_input_filename <- file.path(plot.data.dir,
                                          paste0(file_identifier,
                                                 "_train_3_models_kappa.tsv"))
with_recon_input_filename <- file.path(plot.data.dir,
                                       paste0(file_identifier,
                                              "_kappa_reconstructed_data.tsv"))

# define output files
output_filename <- file.path(output_directory,
                             paste0(file_identifier,
                                    "_kappa_reconstruction_difference.pdf"))

# read in data
# plots/scripts/3-plot_category_kappa.R reads this same file with the
# positional spec "dddddccc", which disagrees with the positional "ddccc"
# previously used here. Type the columns this script uses by name instead, so
# the result does not depend on how many columns the file has or their order.
without_df <- read_tsv(without_recon_input_filename,
                       col_types = cols(.default = col_guess(),
                                        Kappa = col_double(),
                                        Perc.Seq = col_double(),
                                        Platform = col_character(),
                                        Classifier = col_character(),
                                        Normalization = col_character())) %>%
  mutate(Perc.Seq = factor(Perc.Seq,
                           levels = seq(0, 100, 10)))

# the reconstructed-data file names this column Perc.seq; store it as Perc.Seq
# so both data frames can be joined on the same key
with_df <- read_tsv(with_recon_input_filename,
                    col_types = "dcccdcc") %>%
  mutate(Perc.Seq = factor(Perc.seq,
                           levels = seq(0, 100, 10)))

# get data summary (median kappa at each setting)

without_summary_df <- without_df %>%
  group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
  summarize(median_without = median(Kappa),
            .groups = "drop")

with_summary_df <- with_df %>%
  filter(Reconstruction == "PCA",
         Measure == "kappa") %>%
  group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
  summarize(median_with = median(Kappa),
            .groups = "drop")

# combined data frames and calculate difference in median kappas
# (settings without a reconstructed counterpart have NA difference and are dropped)

joint_df <- without_summary_df %>%
  left_join(with_summary_df,
            by = c("Perc.Seq", "Classifier", "Normalization", "Platform")) %>%
  mutate(kappa_difference = median_without - median_with) %>%
  filter(!is.na(kappa_difference))

# plot

plot_obj <- ggplot(joint_df,
                   aes(x = Perc.Seq,
                       y = kappa_difference,
                       color = Platform,
                       fill = Platform)) +
  facet_grid(rows = vars(Classifier),
             cols = vars(Normalization)) +
  stat_summary(fun = median,
               geom = "line",
               aes(group = Platform),
               position = position_dodge(0.7)) +
  stat_summary(fun = median,
               geom = "point",
               aes(group = Platform),
               position = position_dodge(0.7),
               size = 1,
               shape = 16) +
  scale_x_discrete(labels = c("0", "", "", "", "",
                              "50", "", "", "", "",
                              "100")) +
  labs(x = "% RNA-seq Samples in Training Data",
       color = "Test Data Platform",
       fill = "Test Data Platform",
       y = "Difference in Kappa\n(No Reconstruction - Reconstruction)",
       title = str_c(cancer_type, predictor, "(reconstruction difference)", sep = " ")) +
  theme_bw() +
  scale_colour_manual(values = cbPalette[2:3]) +
  theme(legend.position = "bottom")

ggsave(output_filename,
       plot = plot_obj,
       height = 4,
       width = 7.25)

--------------------------------------------------------------------------------
/plots/scripts/visualize_expression.R:
--------------------------------------------------------------------------------
# S. Foltz Oct 2021
# The purpose of this script is to visualize normalized gene expression
# and compare values from matched microarray and RNA-seq samples
# USAGE: Rscript visualize_expression.R --cancer_type --predictor --null_model --seed

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"),
  optparse::make_option("--seed",
                        default = 1234,
                        help = "Set a seed to ensure reproducible results when subsampling genes")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))
source(here::here("util", "color_blind_friendly_palette.R"))

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- ifelse(null_model,
                          str_c(cancer_type, predictor, "null", sep = "_"),
                          str_c(cancer_type, predictor, sep = "_"))

# set seed
set.seed(opt$seed)

# define directories
plot.dir <- here::here("plots")
norm.dir <- here::here("normalized_data")
viz.dir <- file.path(plot.dir, "visualize_expression")

# define input files
# (use the first file in norm.dir whose name matches this identifier)
normalized_test_data_filename <- list.files(norm.dir,
                                            pattern = str_c(file_identifier,
                                                            "_array_seq_test_data_normalized_list_"),
                                            full.names = TRUE)[1]

# list.files() returns an empty vector when nothing matches, so [1] is NA;
# stop with a clear message instead of failing inside read_rds()
if (is.na(normalized_test_data_filename)) {
  stop("No normalized test data file found in ", norm.dir,
       " for ", file_identifier, ".", call. = FALSE)
}

normalized_test_data <- read_rds(normalized_test_data_filename)

### functions ------------------------------------------------------------------

plot_matched_expression <- function(array_values, seq_values,
                                    method_title, plot_type,
                                    output_directory, filename_lead) {

  # This function creates a plot for expression values from matched array and RNA-seq samples
  # The function can produce a plot with points (alpha = 0.1) or a hex grid to show density
  # Inputs:
  #   array_values = a vector of array values
  #   seq_values = vector of seq values, matched to array values
  #   method_title = something informative that will define the plot title and output filename
  #   plot_type = either 'point' or 'hex' depending on the desired plot type
  #   output_directory = output directory of PDF
  #   filename_lead = start of the output filename
  # Outputs:
  #   a PDF of the plot is saved to output_directory
  #   (stops with an error if plot_type is neither 'point' nor 'hex')

  this_plot <- ggplot(mapping = aes(x = array_values,
                                    y = seq_values))

  if (plot_type == "point") {
    this_plot <- this_plot +
      geom_point(alpha = 0.1,
                 shape = 16)
  } else if (plot_type == "hex") {
    this_plot <- this_plot +
      geom_hex()
  } else {
    stop("Plot type must be 'point' or 'hex'.")
  }

  this_plot <- this_plot +
    geom_abline(lty = 2, # dashed red x-y line
                color = "red") +
    geom_smooth(method = "gam", # fit a curve to the data
                formula = y ~ s(x, bs = "cs")) + # loess no good for large n
    labs(x = "Microarray expression values",
         y = "RNA-seq expression values",
         title = method_title) +
    theme_minimal()

  # every method except "UN" is drawn with a fixed 1:1 aspect ratio
  if (method_title != "UN") {
    this_plot <- this_plot +
      coord_fixed()
  }

  # output file name is <filename_lead>.<method_title>.<plot_type>.pdf
  ggsave(plot = this_plot,
         filename = file.path(output_directory,
                              str_c(filename_lead,
                                    method_title,
                                    plot_type,
                                    "pdf",
                                    sep = ".")),
         height = 7.25,
         width = 7.25)
}

#### plot matched comparison of matched microarray and RNA-seq -----------------

# select up to 1000 random genes; seq_len() and min() keep this valid when the
# data have fewer than 1000 rows, and the draw is unchanged when they have more
n_genes <- nrow(normalized_test_data$array$log)
gene_rows_included <- sort(sample(seq_len(n_genes),
                                  size = min(1000, n_genes),
                                  replace = FALSE))

norm_methods <- names(normalized_test_data$seq) # get all normalization methods

for (nm in norm_methods) {

  if (nm %in% c("seurat")) next

  if (nm == "tdm") {
    # array has no TDM (it is already log)
    array_values <- as.vector(as.matrix(normalized_test_data$array[["log"]][gene_rows_included, -1]))
    for (pct_rna_seq in as.character(seq(0, 90, 10))) { # NULL at 100% RNA-seq
      # only seq varies across %RNA-seq
      seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][[pct_rna_seq]][gene_rows_included, -1]))
      method_title <- str_c(str_to_upper(nm), pct_rna_seq, sep = "_")

      plot_matched_expression(array_values, seq_values,
                              method_title, plot_type = "hex",
                              viz.dir, file_identifier)

    }
  } else if (nm %in% c("qn", "qn-z")) {
    array_values <- as.vector(as.matrix(normalized_test_data$array[[nm]][gene_rows_included, -1]))
    for (pct_rna_seq in as.character(seq(0, 100, 10))) {
      # only seq varies across %RNA-seq
      seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][[pct_rna_seq]][gene_rows_included, -1]))
      method_title <- str_c(str_to_upper(nm), pct_rna_seq, sep = "_")

      plot_matched_expression(array_values, seq_values,
                              method_title, plot_type = "hex",
                              viz.dir, file_identifier)

    }
  } else { # test data for normalization methods that do not vary with RNA-seq % in training data
    array_values <- as.vector(as.matrix(normalized_test_data$array[[nm]][gene_rows_included, -1]))
    seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][gene_rows_included, -1]))
    method_title <- str_to_upper(nm)

    plot_matched_expression(array_values, seq_values,
                            method_title, plot_type = "hex",
                            viz.dir, file_identifier)

  }
}

--------------------------------------------------------------------------------
/plots/supplementary/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/supplementary/.empty
--------------------------------------------------------------------------------
/plots/visualize_expression/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/visualize_expression/.empty
--------------------------------------------------------------------------------
/prepare_GBM_data.R:
--------------------------------------------------------------------------------
# Script prepares GBM expression data for use in pipelines
# For GBM array data: convert sample names and remove duplicate individuals
# For GBM seq data: filter all seq data for GBM samples only, convert genes IDs

# Steven Foltz July 2021

option_list <- list(
  optparse::make_option("--seq_input",
                        default = NA_character_,
                        help = "TCGA sequencing expression input file path"),
  optparse::make_option("--array_input",
                        default = NA_character_,
                        help = "refine.bio microarray expression input file path"),
  optparse::make_option("--metadata_input",
                        default = NA_character_,
                        help = "refine.bio aggregated metadata JSON file path"),
  optparse::make_option("--array_output",
                        default = NA_character_,
                        help = "Processed microarray data output file path"),
  optparse::make_option("--seq_output",
                        default = NA_character_,
                        help = "Processed sequencing data output file path"),
  optparse::make_option("--clinical_input",
                        default = NA_character_,
                        help = "Clinical information input file path (Excel file)"),
  optparse::make_option("--clinical_output",
                        default = NA_character_,
                        help = "Clinical information output file path (.tsv)"),
  optparse::make_option("--overwrite",
                        action = "store_true",
                        default = FALSE,
                        help = "Overwrite existing output files [default: %default]")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)

# load libraries
suppressMessages(library(tidyverse))

# set options
tcga_seq_expression_input_filepath <- opt$seq_input
gbm_array_expression_input_filepath <- opt$array_input
metadata_json_input_filepath <- opt$metadata_input
gbm_array_output_filepath <- opt$array_output
gbm_seq_output_filepath <- opt$seq_output
clinical_xlxs_input_filepath <- opt$clinical_input
clinical_tsv_output_filepath <- opt$clinical_output

################################################################################
# Array data
################################################################################

# read in refine.bio GBM array expression data
# (Gene is read as character, every other column as double)
gbm_array_expression <- read_tsv(
  gbm_array_expression_input_filepath,
  col_types = cols(.default = col_double(),
                   Gene = col_character())
)

# load up aggregated metadata json file
metadata_json <- jsonlite::fromJSON(metadata_json_input_filepath,
                                    simplifyVector = FALSE)

# accession IDs present in expression data (every column after the first)
available_array_accession_ids <- colnames(gbm_array_expression)[-1]

# All about TCGA barcodes
# TCGA barcodes are defined in this format: TCGA-XX-YYYY-ZZ*
# XX is the two character tissue source site (TSS) (defines a combination of hospital system and cancer type -- so multiple TSSs map to single cancer type)
# YYYY is the four digit participant ID specific to a TSS (so TCGA-XX-YYYY defines an individual patient -- patients from different TSSs may have same participant ID)
# ZZ is the sample type (starts with 0 for tumor samples; specifically 01 for primary solid tumors)
# After ZZ (*) is more specific information not relevant in this context
# Links to more info:
# https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes
# https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes

# starting with flattened metadata, parse out raw TCGA IDs and filter for tumors
all_array_tumor_samples <- tibble(accession = names(metadata_json$samples)) %>%
  filter(accession %in% available_array_accession_ids) %>% # check these are ==
  rowwise() %>% # the metadata list is indexed one accession at a time
  mutate(tcga_id_raw = str_remove(
    metadata_json$samples[[accession]]$refinebio_annotations[[1]]$characteristics_ch1[[9]],
    "sample: ")) %>%
  ungroup() %>%
  # tcga_id_raw is the entire TCGA barcode (includes information after ZZ which we need to drop)
  mutate(tcga_id = str_sub(tcga_id_raw, 1, 15), # so tcga_id includes TCGA-XX-YYYY-ZZ
         sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample refers to ZZ
  filter(sample == "01") # filter for primary solid tumors only

# keep one (first) accession per TCGA ID
array_accession_tcga_id_keep <- all_array_tumor_samples %>%
  group_by(tcga_id) %>%
  summarize(accession = sort(accession)[1]) %>%
  ungroup()
accession_colnames_keep <- available_array_accession_ids[
  available_array_accession_ids %in% array_accession_tcga_id_keep$accession
]

# select columns to keep and rename with TCGA IDs
# (columns are selected in the row order of array_accession_tcga_id_keep, so
# the TCGA IDs assigned below line up with their accessions)
gbm_array_expression_renamed <- select(
  gbm_array_expression,
  c("Gene", array_accession_tcga_id_keep$accession)
)
colnames(gbm_array_expression_renamed) <- c("sample",
                                            array_accession_tcga_id_keep$tcga_id)

################################################################################ 107 | # Sequencing data 108 | ################################################################################ 109 | 110 | # read in column names of entire TCGA seq expression file 111 | tcga_seq_expression_column_names <- read_tsv(tcga_seq_expression_input_filepath, 112 | col_types = cols( 113 | .default = col_double(), 114 | gene_id = col_character()), 115 | n_max = 0) %>% 116 | names() 117 | 118 | # identify sequencing TCGA IDs of samples present in array data 119 | # (a more inclusive approach would be selecting GBM samples based on TSS codes) 120 | gbm_seq_tumor_samples <- tibble(tcga_id_raw = tcga_seq_expression_column_names[-1]) %>% 121 | mutate(tcga_patient = str_sub(tcga_id_raw, 1, 12), # TCGA-XX-YYYY to match with clinical 122 | tcga_id = str_sub(tcga_id_raw, 1, 15), # as with array, tcga_id refers to TCGA-XX-YYYY-ZZ 123 | sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample is ZZ 124 | filter(sample == "01") %>% # require sample to be primary solid tumor 125 | filter(tcga_id %in% array_accession_tcga_id_keep$tcga_id) %>% # keep array GBMs 126 | group_by(tcga_patient) %>% 127 | summarize(tcga_id_raw = sort(tcga_id_raw)[1], # keep one raw ID per person 128 | tcga_id = str_sub(tcga_id_raw, 1, 15)) %>% 129 | ungroup() 130 | 131 | # now read in GBM subset of entire TCGA seq expression file 132 | # this is faster and uses less memory than reading in entire file and then subsetting 133 | # read these GBM columns only 134 | tcga_seq_gbm_tf <- tcga_seq_expression_column_names[-1] %in% gbm_seq_tumor_samples$tcga_id_raw 135 | # use these column types 136 | # first column is 'c' for gene_id, then '-' for non-GBM samples, then 'd' for GBM 137 | # setting column type to '-' skips over that column when reading file 138 | tcga_seq_gbm_col_types <- str_c(c("c", c("-", "d")[tcga_seq_gbm_tf + 1]), collapse = "") 139 | # read in my defined subset of columns with column types 140 | 
gbm_seq_expression <- read_tsv(tcga_seq_expression_input_filepath, 141 | col_types = tcga_seq_gbm_col_types) 142 | colnames(gbm_seq_expression) <- c("gene_id", 143 | gbm_seq_tumor_samples$tcga_id) 144 | 145 | # Detour to make gene ids consistent between array and seq files 146 | # will convert seq format (SYMBOL|ENTREZ) to array format (ENSG) 147 | 148 | # separate gene symbols from entrez ids (delimiter = "|") 149 | symbol_entrez_ids <- gbm_seq_expression %>% 150 | select(gene_id) %>% 151 | separate(gene_id, 152 | into = c("SYMBOL", "ENTREZID"), 153 | sep = "\\|", 154 | remove = FALSE) 155 | 156 | # map entrez ids to ensembl ids (GENEID) 157 | entrez_ensembl_ids <- ensembldb::select(EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86, 158 | keys= symbol_entrez_ids$ENTREZID, 159 | keytype = "ENTREZID", 160 | columns = "GENEID") %>% 161 | as_tibble() %>% 162 | mutate(ENTREZID = as.character(ENTREZID)) 163 | 164 | # collate gene name schemes 165 | # filter for those that mapped and exist in array 166 | # filter for ENSGs with a one-to-one mapping with entrez 167 | gene_id_mapping_in_array <- symbol_entrez_ids %>% 168 | left_join(entrez_ensembl_ids, 169 | by = "ENTREZID") %>% 170 | filter(!is.na(GENEID)) %>% 171 | add_count(GENEID) %>% 172 | filter(n == 1) %>% 173 | filter(GENEID %in% gbm_array_expression$Gene) %>% 174 | select(gene_id, GENEID) 175 | 176 | # starting with acceptable genes, left join with seq expression and select cols 177 | gbm_seq_expression_renamed <- gene_id_mapping_in_array %>% 178 | left_join(gbm_seq_expression, 179 | by = "gene_id") %>% 180 | select(-gene_id) %>% 181 | rename("sample" = "GENEID") 182 | 183 | ################################################################################ 184 | # subtype information 185 | ################################################################################ 186 | 187 | # read in Table S7 from flagship GBM landscape paper (Brennan et al., Cell 2013) 188 | # select and rename interesting columns 189 | gbm_subtypes <- 
readxl::read_xlsx(path = clinical_xlxs_input_filepath, 190 | sheet = "Clinical Data", 191 | skip = 1) %>% 192 | right_join(gbm_seq_tumor_samples, 193 | by = c("Case ID" = "tcga_patient")) %>% 194 | select("tcga_id", 195 | "MGMT Status", 196 | "G-CIMP\r\n methylation", 197 | "IDH1\r\n status", 198 | "Expression\r\nSubclass") %>% 199 | rename("Sample" = "tcga_id", 200 | "MGMT_methylation_status" = "MGMT Status", 201 | "G-CIMP_methylation" = "G-CIMP\r\n methylation", 202 | "IDH1_mutation_status" = "IDH1\r\n status", 203 | "subtype" = "Expression\r\nSubclass") %>% 204 | mutate(subtype = na_if(subtype, "NA")) %>% 205 | mutate(subtype = stringr::str_remove(subtype, "-")) %>% # G-CIMP to GCIMP 206 | mutate(Type = "tumor") 207 | 208 | missing_clinical <- gbm_subtypes %>% 209 | filter(is.na(subtype)) %>% 210 | pull(Sample) 211 | 212 | ################################################################################ 213 | # Write to file, excluding samples without clinical info 214 | ################################################################################ 215 | 216 | write_tsv(gbm_array_expression_renamed %>% 217 | select(-all_of(missing_clinical)), 218 | gbm_array_output_filepath) 219 | 220 | write_tsv(gbm_seq_expression_renamed %>% 221 | select(-all_of(missing_clinical)), 222 | gbm_seq_output_filepath) 223 | 224 | write_tsv(gbm_subtypes %>% 225 | filter(!is.na(subtype)), 226 | clinical_tsv_output_filepath) 227 | -------------------------------------------------------------------------------- /results/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/.empty -------------------------------------------------------------------------------- /results/array_rnaseq_ratio/ratio.2022-02-18_18_50_01_UTC.tsv: -------------------------------------------------------------------------------- 1 | Platform GEO AE Total 2 | Array 
1163755 207117 1370872 3 | RNA-seq 1078052 134243 1212295 4 | -------------------------------------------------------------------------------- /results/array_rnaseq_ratio/ratio.tracking.tsv: -------------------------------------------------------------------------------- 1 | File:ratio.2022-02-18_18_50_01_UTC.tsv Date:2022-02-18_18_50_01_UTC Array_to_RNA-seq_ratio:1.130807270507591 2 | -------------------------------------------------------------------------------- /results/differential_expression/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/differential_expression/.empty -------------------------------------------------------------------------------- /results/reconstructed_data/.empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/reconstructed_data/.empty -------------------------------------------------------------------------------- /retrieve_MC3_mutations.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import sys 4 | 5 | # define output filepaths 6 | output_tsv_filepath = "data/mutations.tsv" 7 | output_maf_filepath = "data/mutations.maf" 8 | 9 | # define directories 10 | data_dir = "data" 11 | 12 | # TCGA MC3 MAF from https://gdc.cancer.gov/about-data/publications/pancanatlas 13 | mc3_filename = os.path.join(data_dir, "mc3.v0.2.8.PUBLIC.maf.gz") 14 | 15 | # cancer types and genes of interest 16 | cancer_type_abbrevs = {"Breast invasive carcinoma": "BRCA", 17 | "Glioblastoma multiforme": "GBM"} 18 | cancer_types_of_interest = cancer_type_abbrevs.keys() 19 | genes_of_interest = ["PIK3CA", "TP53"] 20 | 21 | ############################################################ 22 | # Tissue 
source sites define the cancer type of the sample # 23 | ############################################################ 24 | 25 | # TSS codes from https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes 26 | tcga_tss_codes = open("tcga_tss_codes.csv", "r") 27 | tcga_tss_codes.readline() 28 | tcga_tss_codes_dict = {} 29 | 30 | # set up TSS dictionary with {tss: cancer_type} 31 | for line in tcga_tss_codes: 32 | k,v = line.strip().split(",") 33 | tcga_tss_codes_dict[k] = v 34 | 35 | tcga_tss_codes.close() 36 | 37 | ############################### 38 | # Retrieve mutations from MC3 # 39 | ############################### 40 | 41 | # simple mutation dictionary {cancer_type: {tcga_id: ["PIK3CA", "TP53"]}} 42 | # this will be used at end to create simple 0/1 mutation status data frame 43 | mutation_dict = {x: {} for x in cancer_types_of_interest} 44 | 45 | # open up MC3 and define header lines 46 | mc3 = gzip.open(mc3_filename, "rb") 47 | maf_header = mc3.readline().decode('UTF-8').strip().split() 48 | maf_ixs = {name: ix for ix, name in enumerate(maf_header)} 49 | tsv_header = "\t".join(["tcga_id", "cancer_type", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "Hugo_Symbol", "Chromosome", "Start_Position", "Variant_Classification"]) 50 | 51 | output_tsv = open(output_tsv_filepath, "w") 52 | output_maf = open(output_maf_filepath, "w") 53 | 54 | output_tsv.write(tsv_header + "\n") 55 | output_maf.write('\t'.join(maf_header) + "\n") 56 | 57 | # progress through each line of MC3 58 | # check if sample is primary solid tumor ("01") and 59 | # from out genes and cancer types of interest 60 | for line in mc3: 61 | record = line.decode('UTF-8').strip().split("\t") 62 | hugo_symbol = record[maf_ixs['Hugo_Symbol']] # gene name 63 | tcga_id_raw = record[maf_ixs['Tumor_Sample_Barcode']] # tumor barcode 64 | tcga_id_raw_normal = record[maf_ixs['Matched_Norm_Sample_Barcode']] # normal barcode 65 | is_tumor = tcga_id_raw.split("-")[3].startswith("01") 
66 | tss_code = tcga_id_raw.split("-")[1] 67 | cancer_type = tcga_tss_codes_dict[tss_code] 68 | 69 | if is_tumor and cancer_type in cancer_types_of_interest: 70 | tcga_id = tcga_id_raw[0:15] 71 | 72 | # add TCGA ID to mutation dict 73 | if tcga_id not in mutation_dict[cancer_type]: 74 | mutation_dict[cancer_type][tcga_id] = set() 75 | 76 | # if gene of interest, add to mutation dict list for that ID and outputs 77 | if hugo_symbol in genes_of_interest: 78 | chromosome = record[maf_ixs['Chromosome']] # chromosome of mutation 79 | start_position = record[maf_ixs['Start_Position']] # position of mutation 80 | variant_class = record[maf_ixs['Variant_Classification']] # e.g. Missense_Mutation, In_Frame_Del 81 | 82 | # add a gene to mutation set 83 | mutation_dict[cancer_type][tcga_id].add(hugo_symbol) 84 | 85 | output_tsv.write("\t".join([tcga_id, cancer_type_abbrevs[cancer_type], tcga_id_raw, tcga_id_raw_normal, hugo_symbol, chromosome, start_position, variant_class]) + "\n") 86 | output_maf.write("\t".join(record) + "\n") 87 | 88 | mc3.close() 89 | output_tsv.close() 90 | output_maf.close() 91 | 92 | # write cancer-type-specific simple output data frames (0/1 for gene mutation status) 93 | for cancer_type in cancer_types_of_interest: 94 | 95 | simple_output_filename = os.path.join(data_dir, 96 | "mutations." 
+ cancer_type_abbrevs[cancer_type] + ".tsv") 97 | simple_output = open(simple_output_filename, "w") 98 | simple_output_header = "\t".join(["tcga_id", "PIK3CA", "TP53"]) + "\n" 99 | 100 | simple_output.write(simple_output_header) 101 | 102 | # each TCGA ID has a set of mutated genes 103 | # for each gene of interest, return a binary mutation status if that gene appears in the list 104 | # for each TCGA ID, report the mutation status of each gene as a row in output data frame 105 | for tcga_id, mutation_list in mutation_dict[cancer_type].items(): 106 | mutation_status_list = [str(int(x in mutation_list)) for x in genes_of_interest] 107 | simple_output.write(tcga_id + "\t" + "\t".join(mutation_status_list) + "\n") 108 | 109 | simple_output.close() 110 | -------------------------------------------------------------------------------- /run_all_analyses_and_plots.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -euo pipefail 3 | 4 | # This script runs analysis code and plotting scripts for the publication, 5 | # including both cancer types (both) or just one cancer type (BRCA or GBM). 6 | # The script calls: 7 | # 1. run_machine_learning_experiments.sh 8 | # 2. run_differential_expression_experiments.sh (subtype only) 9 | # 3. plots/scripts/visualize_expression.R (subtype only) 10 | # 4. 
plotting scripts, as appropriate 11 | 12 | # cancer type (must be both, BRCA, or GBM) 13 | cancer_type=$1 14 | 15 | if [ $cancer_type != "both" ] && [ $cancer_type != "BRCA" ] && [ $cancer_type != "GBM" ]; then 16 | echo Cancer type must be both, BRCA or GBM in run_all_analyses_and_plots.sh [cancer_type] 17 | exit 18 | fi 19 | 20 | ################################################################################ 21 | # BRCA 22 | ################################################################################ 23 | 24 | if [ $cancer_type == "both" ] || [ $cancer_type == "BRCA" ]; then 25 | 26 | # BRCA subtype --------------------------------------------------------------- 27 | 28 | # run machine learning and DEG analysis scripts 29 | bash run_machine_learning_experiments.sh BRCA subtype 7 30 | bash run_differential_expression_experiments.sh BRCA Basal Her2,LumA Her2,LumA 7 31 | 32 | # plot array vs. RNA-seq expression levels after normalization 33 | Rscript plots/scripts/visualize_expression.R --cancer_type BRCA --predictor subtype 34 | 35 | # plot difference in subtype prediction kappa between non-reconstructed and reconstructed data 36 | Rscript plots/scripts/recon_kappa_difference.R --cancer_type BRCA --output_directory plots/supplementary 37 | 38 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 39 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 40 | --cancer_type BRCA \ 41 | --predictor subtype \ 42 | --output_directory plots/supplementary 43 | 44 | # stacked bar plots showing distribution of subtypes in train/test sets (all seeds) 45 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 46 | --cancer_type BRCA \ 47 | --predictor subtype \ 48 | --plot_all_seeds \ 49 | --output_directory plots/supplementary 50 | 51 | # violin + line plots showing kappa values from predictions on test data 52 | Rscript plots/scripts/3-plot_category_kappa.R \ 53 | --cancer_type BRCA \ 54 | 
--predictor subtype \ 55 | --output_directory plots/main 56 | 57 | # violin + line plots showing kappa values from predictions on reconstructed test data 58 | Rscript plots/scripts/6-plot_recon_kappa.R \ 59 | --cancer_type BRCA \ 60 | --predictor subtype \ 61 | --output_directory plots/supplementary 62 | 63 | # violin + line plots showing gene-level MASE values from reconstructed test data 64 | Rscript plots/scripts/6-plot_recon_error.R \ 65 | --cancer_type BRCA \ 66 | --predictor subtype \ 67 | --output_directory plots/supplementary 68 | 69 | # violin plots showing proportion of pathways significant in PLIER analyses 70 | Rscript plots/scripts/7-plot_plier_pathways.R \ 71 | --cancer_type BRCA \ 72 | --predictor subtype \ 73 | --output_directory plots/main 74 | 75 | # bar plot showing proportion of genes differentially expressed (Basal vs. Others) 76 | # line plot showing overlap with silver standard DEGs (Basal vs. Others) 77 | Rscript plots/scripts/1A-plot_DEGs.R \ 78 | --cancer_type BRCA \ 79 | --subtype_vs_others Basal \ 80 | --proportion_output_directory plots/supplementary \ 81 | --overlap_output_directory plots/supplementary \ 82 | --overlap_measure Jaccard,Spearman 83 | 84 | # bar plot showing proportion of genes differentially expressed (Her2 vs. LumA) 85 | # line plot showing overlap with silver standard DEGs (Her2 vs. LumA) 86 | Rscript plots/scripts/1A-plot_DEGs.R \ 87 | --cancer_type BRCA \ 88 | --subtype_vs_subtype Her2,LumA \ 89 | --proportion_output_directory plots/supplementary \ 90 | --overlap_output_directory plots/main \ 91 | --overlap_measure Jaccard,Spearman 92 | 93 | # line plot showing overlap with silver standard DEGs (Her2 vs. 
LumA) across small n values 94 | Rscript plots/scripts/2A-plot_small_n_differential_expression.R \ 95 | --cancer_type BRCA \ 96 | --subtype_vs_subtype Her2,LumA \ 97 | --output_directory plots/main \ 98 | --overlap_measure Jaccard,Spearman 99 | 100 | # ---------------------------------------------------------------------------- 101 | 102 | # BRCA TP53 ------------------------------------------------------------------ 103 | 104 | # run machine learning analysis scripts 105 | bash run_machine_learning_experiments.sh BRCA TP53 7 106 | 107 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 108 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 109 | --cancer_type BRCA \ 110 | --predictor TP53 \ 111 | --output_directory plots/supplementary 112 | 113 | # stacked bar plots showing distribution of subtypes in train/test sets (all seeds) 114 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 115 | --cancer_type BRCA \ 116 | --predictor TP53 \ 117 | --plot_all_seeds \ 118 | --output_directory plots/supplementary 119 | 120 | # violin + line plots showing kappa values from predictions on test data 121 | Rscript plots/scripts/3-plot_category_kappa.R \ 122 | --cancer_type BRCA \ 123 | --predictor TP53 \ 124 | --null_model \ 125 | --output_directory plots/supplementary 126 | 127 | # ---------------------------------------------------------------------------- 128 | 129 | # BRCA PIK3CA ---------------------------------------------------------------- 130 | 131 | # run machine learning analysis scripts 132 | bash run_machine_learning_experiments.sh BRCA PIK3CA 7 133 | 134 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 135 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 136 | --cancer_type BRCA \ 137 | --predictor PIK3CA \ 138 | --output_directory plots/supplementary 139 | 140 | # stacked bar plots showing distribution of 
subtypes in train/test sets (all seeds) 141 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 142 | --cancer_type BRCA \ 143 | --predictor PIK3CA \ 144 | --plot_all_seeds \ 145 | --output_directory plots/supplementary 146 | 147 | # violin + line plots showing kappa values from predictions on test data 148 | Rscript plots/scripts/3-plot_category_kappa.R \ 149 | --cancer_type BRCA \ 150 | --predictor PIK3CA \ 151 | --null_model \ 152 | --output_directory plots/supplementary 153 | 154 | # ---------------------------------------------------------------------------- 155 | 156 | fi 157 | 158 | ################################################################################ 159 | # GBM 160 | ################################################################################ 161 | 162 | if [ $cancer_type == "both" ] || [ $cancer_type == "GBM" ]; then 163 | 164 | # GBM subtype ---------------------------------------------------------------- 165 | 166 | # run machine learning and DEG analysis scripts 167 | bash run_machine_learning_experiments.sh GBM subtype 7 168 | bash run_differential_expression_experiments.sh GBM Proneural Classical,Mesenchymal Classical,Mesenchymal 7 169 | 170 | # plot array vs. 
RNA-seq expression levels after normalization 171 | Rscript plots/scripts/visualize_expression.R --cancer_type GBM --predictor subtype 172 | 173 | # plot difference in subtype prediction kappa between non-reconstructed and reconstructed data 174 | Rscript plots/scripts/recon_kappa_difference.R --cancer_type GBM --output_directory plots/supplementary 175 | 176 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 177 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 178 | --cancer_type GBM \ 179 | --predictor subtype \ 180 | --output_directory plots/supplementary 181 | 182 | # stacked bar plots showing distribution of subtypes in train/test sets (all seeds) 183 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 184 | --cancer_type GBM \ 185 | --predictor subtype \ 186 | --plot_all_seeds \ 187 | --output_directory plots/supplementary 188 | 189 | # violin + line plots showing kappa values from predictions on test data 190 | Rscript plots/scripts/3-plot_category_kappa.R \ 191 | --cancer_type GBM \ 192 | --predictor subtype \ 193 | --output_directory plots/supplementary 194 | 195 | # violin + line plots showing kappa values from predictions on reconstructed test data 196 | Rscript plots/scripts/6-plot_recon_kappa.R \ 197 | --cancer_type GBM \ 198 | --predictor subtype \ 199 | --output_directory plots/supplementary 200 | 201 | # violin + line plots showing gene-level MASE values from reconstructed test data 202 | Rscript plots/scripts/6-plot_recon_error.R \ 203 | --cancer_type GBM \ 204 | --predictor subtype \ 205 | --output_directory plots/supplementary 206 | 207 | # violin plots showing proportion of pathways significant in PLIER analyses 208 | Rscript plots/scripts/7-plot_plier_pathways.R \ 209 | --cancer_type GBM \ 210 | --predictor subtype \ 211 | --output_directory plots/main 212 | 213 | # bar plot showing proportion of genes differentially expressed (Proneural vs. 
Others, Classical vs. Mesenchymal) 214 | # line plot showing overlap with silver standard DEGs (Proneural vs. Others, Classical vs. Mesenchymal) 215 | Rscript plots/scripts/1A-plot_DEGs.R \ 216 | --cancer_type GBM \ 217 | --subtype_vs_others Proneural \ 218 | --subtype_vs_subtype Classical,Mesenchymal \ 219 | --proportion_output_directory plots/supplementary \ 220 | --overlap_output_directory plots/supplementary \ 221 | --overlap_measure Jaccard,Spearman 222 | 223 | # line plot showing overlap with silver standard DEGs (Classical vs. Mesenchymal) across small n values 224 | Rscript plots/scripts/2A-plot_small_n_differential_expression.R \ 225 | --cancer_type GBM \ 226 | --subtype_vs_subtype Classical,Mesenchymal \ 227 | --output_directory plots/supplementary \ 228 | --overlap_measure Jaccard,Spearman 229 | 230 | # ---------------------------------------------------------------------------- 231 | 232 | # GBM TP53 ------------------------------------------------------------------- 233 | 234 | # run machine learning analysis scripts 235 | bash run_machine_learning_experiments.sh GBM TP53 7 236 | 237 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 238 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 239 | --cancer_type GBM \ 240 | --predictor TP53 \ 241 | --output_directory plots/supplementary 242 | 243 | # stacked bar plots showing distribution of subtypes in train/test sets (all seeds) 244 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 245 | --cancer_type GBM \ 246 | --predictor TP53 \ 247 | --plot_all_seeds \ 248 | --output_directory plots/supplementary 249 | 250 | # violin + line plots showing kappa values from predictions on test data 251 | Rscript plots/scripts/3-plot_category_kappa.R \ 252 | --cancer_type GBM \ 253 | --predictor TP53 \ 254 | --null_model \ 255 | --output_directory plots/main 256 | 257 | # 
---------------------------------------------------------------------------- 258 | 259 | # GBM PIK3CA ----------------------------------------------------------------- 260 | 261 | # run machine learning analysis scripts 262 | bash run_machine_learning_experiments.sh GBM PIK3CA 7 263 | 264 | # stacked bar plot showing distribution of subtypes in train/test sets (one representative example) 265 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 266 | --cancer_type GBM \ 267 | --predictor PIK3CA \ 268 | --output_directory plots/supplementary 269 | 270 | # stacked bar plots showing distribution of subtypes in train/test sets (all seeds) 271 | Rscript plots/scripts/0-plot_predictor_category_distributions.R \ 272 | --cancer_type GBM \ 273 | --predictor PIK3CA \ 274 | --plot_all_seeds \ 275 | --output_directory plots/supplementary 276 | 277 | # violin + line plots showing kappa values from predictions on test data 278 | Rscript plots/scripts/3-plot_category_kappa.R \ 279 | --cancer_type GBM \ 280 | --predictor PIK3CA \ 281 | --null_model \ 282 | --output_directory plots/supplementary 283 | 284 | # ---------------------------------------------------------------------------- 285 | 286 | fi 287 | 288 | ################################################################################ 289 | # PLIER pathway analysis of BRCA and/or GBM 290 | ################################################################################ 291 | 292 | Rscript -e "rmarkdown::render('8-PLIER_pathways_analysis.Rmd', clean = TRUE)" 293 | -------------------------------------------------------------------------------- /run_differential_expression_experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -euo pipefail 3 | 4 | # Usage: bash run_differential_expression_experiments.sh CANCER_TYPE SUBTYPE_VS_OTHERS SUBTYPE_VS_SUBTYPE SUBTYPE_VS_SUBTYPE_SMALL 5 | # where CANCER_TYPE is one of BRCA or GBM 6 | # SUBTYPE_VS_OTHER is 
the subtype you want to compare to all others (e.g. Basal) 7 | # SUBTYPE_VS_SUBTYPE is the two subtypes you want to compare head to head (e.g. LumA,Her2) (comma-separated) 8 | # SUBTYPE_VS_SUBTYPE_SMALL is the two subtypes you want to compare head to head when limiting the sample size (e.g. LumA,Her2) (comma-separated) 9 | 10 | cancer_type=$1 11 | subtype_vs_others=$2 12 | subtype_vs_subtype=$3 13 | subtype_vs_subtype_small=$4 14 | ncores=$5 15 | 16 | if [ $cancer_type != "BRCA" ] && [ $cancer_type != "GBM" ]; then 17 | echo Cancer type must be BRCA or GBM in run_differential_expression_experiments.sh [cancer_type] 18 | exit 19 | fi 20 | 21 | # Run differential expression scripts 22 | Rscript 1A-detect_differentially_expressed_genes.R --cancer_type $cancer_type --subtype_vs_others $subtype_vs_others --subtype_vs_subtype $subtype_vs_subtype --ncores $ncores 23 | Rscript 2A-small_n_differential_expression.R --cancer_type $cancer_type --subtype_vs_subtype $subtype_vs_subtype_small --ncores $ncores 24 | -------------------------------------------------------------------------------- /run_experiments.R: -------------------------------------------------------------------------------- 1 | # J. Taroni Jul 2016 2 | # The purpose of this script is to run the BRCA subtype classifier pipeline 3 | # for RNA-seq 'titration.' 4 | # It should be run from the command line. 
# USAGE: Rscript run_experiments.R --cancer_type [BRCA|GBM] --predictor [subtype|TP53|PIK3CA] --seed integer --null_model --ncores integer
# It also may be run through the classifier_repeat_wrapper.R

option_list <- list(
  optparse::make_option("--cancer_type",
                        default = NA_character_,
                        help = "Cancer type"),
  optparse::make_option("--predictor",
                        default = NA_character_,
                        help = "Predictor used"),
  optparse::make_option("--seed",
                        default = NA_integer_,
                        help = "Random seed"),
  optparse::make_option("--null_model",
                        action = "store_true",
                        default = FALSE,
                        help = "Permute dependent variable (within subtype if predictor is a gene)"),
  optparse::make_option("--ncores",
                        default = NA_integer_,
                        help = "Set the number of cores to use")
)

opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
source(here::here("util/option_functions.R"))
# check_options() is defined in util/option_functions.R
check_options(opt)

# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model

# use the requested number of cores, capped at (available cores - 1), but
# never fewer than one: detectCores() - 1 is 0 on a single-core machine and
# detectCores() may return NA when the count is unknown
core_limits <- c(parallel::detectCores() - 1, opt$ncores)
core_limits <- core_limits[!is.na(core_limits)]
ncores <- if (length(core_limits) > 0) max(1, min(core_limits)) else 1

# set seed
initial.seed <- opt$seed
set.seed(initial.seed)

# these seeds should be between 1000 and 9999 (be 4 digits) to match later file name parsing
seeds <- sample(1000:9999, 3)

message(paste("Initial seed:", initial.seed))
message(paste("Secondary seeds:", stringr::str_c(seeds, collapse = ", ")))

# flag forwarded to every pipeline step when running the null model
null_model_flag <- if (null_model) "--null_model" else ""

# Run one pipeline step as a shell command and stop if it fails.
# All arguments are pasted together (space separated) to form the command.
# system() returns the exit status of the command; continuing after a failed
# step would let later steps run on missing or stale files.
run_pipeline_step <- function(...) {
  command <- paste(...)
  status <- system(command)
  if (status != 0) {
    stop("Command failed with exit status ", status, ": ", command,
         call. = FALSE)
  }
  invisible(status)
}

message("Getting overlap and splitting into training and testing sets...")
run_pipeline_step("Rscript 0-expression_data_overlap_and_split.R",
                  "--cancer_type", cancer_type,
                  "--predictor", predictor,
                  "--seed1", seeds[1],
                  null_model_flag)

message("\nNormalizing data...")
run_pipeline_step("Rscript 1-normalize_titrated_data.R",
                  "--cancer_type", cancer_type,
                  "--predictor", predictor,
                  "--seed1", seeds[1],
                  "--seed2", seeds[2],
                  null_model_flag,
                  "--ncores", ncores)

message("\nTraining and testing models...")
run_pipeline_step("Rscript 2-train_test_category.R",
                  "--cancer_type", cancer_type,
                  "--predictor", predictor,
                  "--seed1", seeds[1],
                  "--seed3", seeds[3],
                  null_model_flag,
                  "--ncores", ncores)
-------------------------------------------------------------------------------- /run_machine_learning_experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -euo pipefail 3 | 4 | # cancer type (either BRCA or GBM) 5 | cancer_type=$1 6 | predictor=$2 7 | ncores=$3 8 | 9 | if [ $cancer_type != "BRCA" ] && [ $cancer_type != "GBM" ]; then 10 | echo Cancer type must be BRCA or GBM in run_machine_learning_experiments.sh [cancer_type] [predictor] 11 | exit 12 | fi 13 | 14 | if [ $predictor != "subtype" ] && [ $predictor != "TP53" ] && [ $predictor != "PIK3CA" ]; then 15 | echo Predictor must be subtype, TP53, or PIK3CA in run_machine_learning_experiments.sh [cancer_type] [predictor] 16 | exit 17 | fi 18 | 19 | # Run ten repeats of the supervised analysis 20 | # if the predictor is a gene, also generate null models 21 | if [ $predictor == "TP53" ] || [ $predictor == "PIK3CA" ]; then 22 | Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --ncores $ncores 23 | Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --null_model --ncores $ncores 24 | else 25 | Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --ncores $ncores 26 | fi 27 | 28 | # Run the unsupervised analyses using subtype models 29 | if [ $predictor == "subtype" ]; then 30 | Rscript 4-ica_pca_feature_reconstruction.R --cancer_type
$cancer_type --predictor $predictor --n_components 50 31 | Rscript 5-predict_category_reconstructed_data.R --cancer_type $cancer_type --predictor $predictor 32 | Rscript 6-save_recon_error_kappa_data.R --cancer_type $cancer_type --predictor $predictor 33 | Rscript 7-extract_plier_pathways.R --cancer_type $cancer_type --ncores $ncores 34 | Rscript 7-extract_plier_pathways.R --cancer_type $cancer_type --ncores $ncores --permute 35 | fi 36 | -------------------------------------------------------------------------------- /search_geo_arrayexpress.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # This script queries two databases (GEO, ArrayExpress) to find human samples 3 | # analyzed on array or RNA-seq platforms. It parses information from each data 4 | # set and adds up the total number of samples from each platform. One output 5 | # file is a time-stamped table showing the number of samples from each database 6 | # and platform. Metadata from the search is appended to a tracking file, which 7 | # includes the original output table filename, the time/date of the search, and 8 | # the ratio of array to RNA-seq data found. Output goes to the folder 9 | # results/array_rnaseq_ratio. 10 | # 11 | # Usage: python3 search_geo_arrayexpress.py 12 | # 13 | # S. 
# Foltz February 2022
###############################################################################

from datetime import datetime, timezone
import os
import requests
import sys
import xmltodict

# max number of results fetch can return
fetch_retmax = 10000  # as of Feb 2022

# find the directory of this script (top level project directory)
dir_path = os.path.dirname(os.path.realpath(__file__))

# define output directory
output_directory = os.path.join(dir_path, "results", "array_rnaseq_ratio")

# check that output directory exists; an explicit test (rather than assert)
# still runs under `python -O`, and the non-zero exit status lets calling
# scripts detect the failure
if not os.path.isdir(output_directory):
    print("Output directory " + output_directory +
          " does not exist in search_geo_arrayexpress.py.",
          file=sys.stderr)
    sys.exit(1)

# define output filenames (time stamp is taken in UTC)
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%d_%H_%M_%S_UTC")
# output filename refers to search results at this particular time
output_filename = os.path.join(output_directory,
                               "ratio." + current_time + ".tsv")
# tracking filename collects metadata about each search (filename, date, ratio)
output_tracking_filename = os.path.join(output_directory, "ratio.tracking.tsv")

###############################################################################
# GEO - Gene Expression Omnibus
###############################################################################

# set up search terms and dictionary to track n_samples
search_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
geo_array_search_term = "homo+sapiens[Organism]+AND+expression+profiling+by+array[DataSet+Type]"
geo_rnaseq_search_term = "homo+sapiens[Organism]+AND+expression+profiling+by+high+throughput+sequencing[DataSet+Type]"

geo_array_initial_url = search_base + \
    "&".join(["db=gds", "term=" + geo_array_search_term])
geo_rnaseq_initial_url = search_base + \
    "&".join(["db=gds", "term=" + geo_rnaseq_search_term])

# each value is [search url, running total of samples]
geo_dict = {"array": [geo_array_initial_url, 0],
            "rnaseq": [geo_rnaseq_initial_url, 0]}

# Use this base url to fetch the records of the search results
fetch_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
# for any data sets with an entry in this list, we will skip that data set
skip_these = ["(Submitter supplied) This SuperSeries is composed of the SubSeries listed below."]

# for each platform in the dictionary, search twice and then fetch samples
for platform in geo_dict:

    # first search to retrieve the total number of results
    initial_url = geo_dict[platform][0]
    initial_xml = requests.get(initial_url)
    initial_dict = xmltodict.parse(initial_xml.content)
    n_results = initial_dict['eSearchResult']['Count']  # kept as a string

    # search again using n_results, save query_key and WebEnv for fetch
    second_url = initial_url + "&RetMax=" + n_results + "&usehistory=y"
    second_xml = requests.get(second_url)
    second_dict = xmltodict.parse(second_xml.content)
    query_key = second_dict['eSearchResult']['QueryKey']
    webenv = second_dict['eSearchResult']['WebEnv']

    # fetch returns up to fetch_retmax results, so we need to define the start
    # position so we can increase the start position on subsequent fetches
    retstart = 0

    while retstart < int(n_results):
        fetch_parameters = "&".join(
            ["db=gds",
             "query_key=" + query_key,
             "WebEnv=" + webenv,
             "retmax=" + str(fetch_retmax),  # same value used to step retstart
             "retstart=" + str(retstart)])
        fetch_url = fetch_base + fetch_parameters
        fetch_text = requests.get(fetch_url).text
        for result in fetch_text.split("\n\n"):  # records are split on blank lines
            record = result.split("\n")  # one entry per line of the record
            # skip the whole data set when any of its lines is in skip_these;
            # checking line by line would still count its "Platform" line
            if any(entry in skip_these for entry in record):
                continue
            for entry in record:
                # look for the line starting with "Platform"
                if entry.startswith("Platform"):  # parse 2nd last element
                    n_samples = int(entry.split(" ")[-2])  # (n_samples)
                    geo_dict[platform][1] += n_samples  # increment count
        retstart += fetch_retmax  # increment the start position

###############################################################################
# ArrayExpress
###############################################################################

# set up search terms and dictionary to track n_samples
# do not include GEO results in AE (directsub=on)
ae_base_url = "https://www.ebi.ac.uk/arrayexpress/ArrayExpress-Experiments.txt?keywords="
ae_array_url = ae_base_url + "&organism=Homo+sapiens&exptype%5B%5D=%22rna+assay%22&exptype%5B%5D=%22array+assay%22&array=&directsub=on"
ae_rnaseq_url = ae_base_url + "&organism=Homo+sapiens&exptype%5B%5D=%22rna+assay%22&exptype%5B%5D=%22sequencing+assay%22&array=&directsub=on"

# each value is [search url, running total of assays]
ae_dict = {"array": [ae_array_url, 0],
           "rnaseq": [ae_rnaseq_url, 0]}

# for each platform in the dictionary, get results from url
for platform in ae_dict:
    url = ae_dict[platform][0]
    results = requests.get(url).text.split("\n")[1:-1]  # skip first, last
    for entry in results:
        n_assays = int(entry.split('\t')[4])  # fifth column is n_assays
        ae_dict[platform][1] += n_assays  # increment the count

###############################################################################
# Write results to output files
###############################################################################

total_array = geo_dict["array"][1] + ae_dict["array"][1]  # total number array
total_rnaseq = geo_dict["rnaseq"][1] + ae_dict["rnaseq"][1]  # total n RNA-seq

# check array and RNA-seq searches returned non-zero results; an explicit test
# (rather than assert) still runs under `python -O`, which also protects the
# division below from a zero RNA-seq total. Exit non-zero so callers notice.
if total_array == 0 or total_rnaseq == 0:
    print("Array or RNA-seq returned zero results in search_geo_arrayexpress.py.",
          file=sys.stderr)
    sys.exit(1)

ratio = total_array / total_rnaseq  # array:RNA-seq

# write the time-stamped table: one header row, then one row per platform
# with the GEO count, the ArrayExpress count and their total
with open(output_filename, "w") as output_table:
    output_table.write('\t'.join(["Platform", "GEO", "AE", "Total"]) + "\n")
    output_table.write('\t'.join([str(x) for x in ["Array",
                                                   geo_dict["array"][1],
                                                   ae_dict["array"][1],
                                                   total_array]]) + "\n")
    output_table.write('\t'.join([str(x) for x in ["RNA-seq",
                                                   geo_dict["rnaseq"][1],
                                                   ae_dict["rnaseq"][1],
                                                   total_rnaseq]]) + "\n")

# append one line of metadata about this search to the tracking file
with open(output_tracking_filename, "a") as output_tracking:  # create new or append
    output_tracking.write('\t'.join(["File:" + os.path.basename(output_filename),
                                     "Date:" + current_time,
                                     "Array_to_RNA-seq_ratio:" + str(ratio)]) + "\n")
# -------------------------------------------------------------------------------- /util/CrossNorm.R: --------------------------------------------------------------------------------
# The following code implements the CrossNorm algorithm with quantile normalization as described in
# Cheng, L., Lo, L.-Y., Tang, N. L. S., Wang, D. & Leung, K.-S. CrossNorm: a novel normalization strategy for microarray data in cancers. Sci. Rep. 6, 18898 (2016)
# https://www.nature.com/articles/srep18898

# We thank the authors of CrossNorm for making the code publicly available under a Creative Commons CC BY license.
6 | 7 | # The code is copied from the Supplementary Information here: 8 | # https://static-content.springer.com/esm/art%3A10.1038%2Fsrep18898/MediaObjects/41598_2016_BFsrep18898_MOESM1_ESM.pdf 9 | 10 | # We made slight modifications that do not alter the functionality of the code, including 11 | # - We commented out library calls 12 | # - We specified the library in preprocessCore::normalize.quantiles() function calls 13 | 14 | #==================================================================================== 15 | # Description: 16 | # Cross Normalization (CrossNorm) for gene expression data. 17 | # 18 | # Arguments: 19 | # exp - a (non-empty) numeric matrix of data values. Row represents gene while 20 | # column represents sample. 21 | # label - a (non-empty) binary vector of data values in which ’0’ represents 22 | # control sample and ’1’ represents disease sample. The length of label 23 | # should be equal to the column number of exp. 24 | # Value: 25 | # exp.crossnorm - A normalized numeric matrix. Row represents gene while column 26 | # represents sample. The gene order is the same as exp.
#
# Reference:
# CrossNorm: a novel normalization strategy for microarray data in cancers
# Lixin Cheng, Leung-Yau Lo, Kwong-Sak Leung, Nelson LS Tang and Dong Wang
#
# Example:
# source("CrossNorm.R")
# exp.pcn = PairedCrossNorm(exp, label)
# exp.gcn = GeneralCrossNorm(exp, label)
#====================================================================================

#library(affy)
#library(preprocessCore)

# Robustness changes in the functions below (results are unchanged for inputs
# with at least two samples per group):
# - column subsets use drop = FALSE so a one-sample group stays a matrix
# - empty or unequal groups stop with an explicit message

# -------------------Paired CrossNorm --------------------

# Quantile-normalize paired data: the normal (label 0) columns are stacked on
# top of the disease (label 1) columns, the stacked matrix is
# quantile-normalized, and the two halves are placed side by side again.
#
# Args:
#   exp: numeric matrix (or coercible to one), genes in rows, samples in columns
#   label: vector with one entry per column of exp; 0 = normal, 1 = disease
#
# Returns:
#   matrix with the normalized normal columns followed by the normalized
#   disease columns (so columns are not necessarily in the input order)
PairedCrossNorm <- function(exp, label) {
  exp <- as.matrix(exp)
  geneLen <- nrow(exp)
  exp.normal <- exp[, label == 0, drop = FALSE]
  exp.disease <- exp[, label == 1, drop = FALSE]
  # stacking the two groups requires the same number of columns in each
  if (ncol(exp.normal) != ncol(exp.disease)) {
    stop("PairedCrossNorm requires the same number of normal (0) and ",
         "disease (1) samples.", call. = FALSE)
  }
  exp.cross <- rbind(exp.normal, exp.disease)
  exp.quantile.cross <- preprocessCore::normalize.quantiles(exp.cross)
  # top half of the rows is the normal group, bottom half the disease group
  exp.crossnorm.normal <-
    exp.quantile.cross[seq_len(geneLen), , drop = FALSE]
  exp.crossnorm.disease <-
    exp.quantile.cross[geneLen + seq_len(geneLen), , drop = FALSE]
  cbind(exp.crossnorm.normal, exp.crossnorm.disease)
}

# ---------------- General CrossNorm --------------------

# Quantile-normalize unpaired data: build the cross matrix of every
# disease/normal sample pair, quantile-normalize it, then collapse it back to
# one column per sample.
#
# Args:
#   exp: numeric matrix (or coercible to one), genes in rows, samples in columns
#   label: vector with one entry per column of exp; 0 = normal, 1 = disease
#
# Returns:
#   the matrix returned by CrossMatrix2Matrix() for the normalized cross matrix
GeneralCrossNorm <- function(exp, label) {
  exp <- as.matrix(exp)
  exp.cross <- Matrix2CrossMatrix(exp, label)
  exp.quantile.cross <- preprocessCore::normalize.quantiles(exp.cross)
  CrossMatrix2Matrix(exp.quantile.cross, label)
}

# CrossMatrix
# Build the cross matrix: one column per (disease sample, normal sample) pair,
# holding the disease sample's values stacked on top of the normal sample's.
#
# Args:
#   M: numeric matrix (or coercible to one), genes in rows, samples in columns
#   label: vector with one entry per column of M; 0 = normal, 1 = disease
#
# Returns:
#   matrix with 2 * nrow(M) rows and (n disease * n normal) columns
Matrix2CrossMatrix <- function(M, label) {
  M <- as.matrix(M)
  rowLen <- nrow(M)
  sampleSize1 <- sum(label == 1) # disease sample size
  sampleSize0 <- sum(label == 0) # normal sample size
  if (sampleSize1 < 1 || sampleSize0 < 1) {
    stop("label must contain at least one disease (1) and one normal (0) ",
         "sample.", call. = FALSE)
  }
  # row t lists the cross-matrix columns that belong to disease sample t
  indexMatrix <- matrix(seq_len(sampleSize1 * sampleSize0),
                        nrow = sampleSize1, ncol = sampleSize0)
  M1 <- M[, label == 1, drop = FALSE]
  M0 <- M[, label == 0, drop = FALSE]
  M3 <- matrix(0, rowLen * 2, sampleSize1 * sampleSize0)
  for (t in seq_len(sampleSize1)) {
    # disease sample t repeated once per normal sample, above all normal samples
    M3[, indexMatrix[t, ]] <- rbind(
      matrix(rep(M1[, t], sampleSize0), ncol = sampleSize0),
      M0
    )
  }
  M3
}

# Collapse a cross matrix back to one column per sample by averaging, for each
# sample, all cross-matrix columns that sample takes part in.
#
# Args:
#   CrossM: matrix whose top half of rows holds disease values and bottom half
#     normal values, with one column per (disease, normal) pair, laid out as
#     by Matrix2CrossMatrix()
#   label: vector of sample labels; 0 = normal, 1 = disease
#
# Returns:
#   matrix with nrow(CrossM) / 2 rows; the normal samples' columns come first,
#   followed by the disease samples' columns
CrossMatrix2Matrix <- function(CrossM, label) {
  rowLen <- nrow(CrossM) / 2
  sampleSize1 <- sum(label == 1) # disease sample size
  sampleSize0 <- sum(label == 0) # normal sample size
  if (sampleSize1 < 1 || sampleSize0 < 1) {
    stop("label must contain at least one disease (1) and one normal (0) ",
         "sample.", call. = FALSE)
  }
  # row t: columns of disease sample t; column t: columns of normal sample t
  indexMatrix <- matrix(seq_len(sampleSize1 * sampleSize0),
                        nrow = sampleSize1, ncol = sampleSize0)
  diseaseRows <- seq_len(rowLen)
  normalRows <- rowLen + seq_len(rowLen)
  M1 <- matrix(0, rowLen, sampleSize1)
  M0 <- matrix(0, rowLen, sampleSize0)
  # drop = FALSE keeps a single selected column as a matrix so apply() works
  # when a group has exactly one sample
  for (t in seq_len(sampleSize1)) {
    M1[, t] <- apply(CrossM[diseaseRows, indexMatrix[t, ], drop = FALSE],
                     1, mean)
  }
  for (t in seq_len(sampleSize0)) {
    M0[, t] <- apply(CrossM[normalRows, indexMatrix[, t], drop = FALSE],
                     1, mean)
  }
  cbind(M0, M1)
}
# -------------------------------------------------------------------------------- /util/color_blind_friendly_palette.R: --------------------------------------------------------------------------------
# color-blind friendly palette
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442",
               "#0072B2", "#D55E00", "#CC79A7")
# -------------------------------------------------------------------------------- /util/option_functions.R: --------------------------------------------------------------------------------
check_options <- function(opt) {
  # this function checks standardized command line options given to scripts
  #
  # opt: named list of parsed command line options
  #
  # checks, in order (the first matching rule is applied to each option):
  # - any option whose value is NA is reported as missing
  # - cancer_type must be BRCA or GBM
  # - predictor must be subtype, TP53, or PIK3CA
  # - subtype_vs_subtype must hold exactly two comma-separated subtypes
  # - options ending with "_input" are checked to see if the input file exists
  # - options ending with "_output" are checked to see if the output directory
  #   exists and, if the output file already exists, whether it will be
  #   overwritten or not (it is only overwritten when opt$overwrite is TRUE)
  # - options ending with "_directory" are checked to see if the directory exists
  # - ncores must be a positive integer
  #
  # all messages and errors are reported
  # if there are any errors, the script stops; otherwise NULL is returned
  # invisibly

  my_errors <- list()
  my_messages <- list()

  for (option in names(opt)) {

    value <- opt[[option]]

    if (anyNA(value)) { # all required options should default to NA_character_ or NA_integer_
      my_errors[[option]] <- stringr::str_c("\nOption given for --", option,
                                            " is missing and must be specified.")
    } else if (option == "cancer_type") {
      if (!(value %in% c("BRCA", "GBM"))) { # cancer type must be BRCA or GBM
        my_errors[[option]] <- stringr::str_c("\nCancer type given for --", option,
                                              " (", value, ") ",
                                              " must be BRCA or GBM.")
      }
    } else if (option == "predictor") {
      if (!(value %in% c("subtype", "TP53", "PIK3CA"))) { # predictor must be subtype or TP53 or PIK3CA
        my_errors[[option]] <- stringr::str_c("\nPredictor given for --", option,
                                              " (", value, ") ",
                                              " must be subtype, TP53, or PIK3CA.")
      }
    } else if (option == "subtype_vs_subtype") {
      two_subtypes <- as.vector(stringr::str_split(value, pattern = ",", simplify = TRUE))
      if (length(two_subtypes) != 2) {
        my_errors[[option]] <- stringr::str_c("\nSubtypes given for --", option,
                                              " (", value, ") ",
                                              " must have (only) two comma-separated subtypes.")
      }

    } else if (stringr::str_ends(option, "_input")) { # option related to inputs
      if (!file.exists(value)) {
        my_errors[[option]] <- stringr::str_c("\nInput file given for --", option,
                                              " (", value, ") ",
                                              "does not exist.")
      }
    } else if (stringr::str_ends(option, "_output")) { # option related to outputs
      if (file.exists(value)) { # if output file already exists
        # isTRUE() treats a script without an --overwrite option (NULL) as
        # "do not overwrite"; [[ ]] avoids partial matching of option names
        if (isTRUE(opt[["overwrite"]])) { # overwrite is TRUE if given
          my_messages[[option]] <- stringr::str_c("\nOutput file given for --", option,
                                                  " (", value, ") ",
                                                  "already exists and will be overwritten (--overwrite is set).")
        } else { # overwrite defaults to FALSE unless given
          my_errors[[option]] <- stringr::str_c("\nOutput file given for --", option,
                                                " (", value, ") ",
                                                "already exists and will not be overwritten (use --overwrite).")
        }
      } else if (!dir.exists(dirname(value))) { # if output directory does not exist
        my_errors[[option]] <- stringr::str_c("\nOutput directory given for --", option,
                                              " (", dirname(value), ") ",
                                              "does not exist.")
      }
    } else if (stringr::str_ends(option, "_directory")) { # option related to output directory
      if (!dir.exists(value)) {
        my_errors[[option]] <- stringr::str_c("\nOutput directory given for --", option,
                                              " (", value, ") ",
                                              "does not exist.")
      }
    } else if (option == "ncores") {
      # scalar || short-circuits, so the comparison only runs on integers
      if (!is.integer(value) || value < 1) {
        my_errors[[option]] <- stringr::str_c("\nNumber of cores given for --", option,
                                              " must be a positive integer.")
      }
    }
  }

  if (length(my_messages) > 0) {
    message(" Messages:", my_messages, "\n")
  }
  if (length(my_errors) > 0) {
    message(" Errors:", my_errors, "\n")
    # the details were printed above; give the error itself a message too
    stop("Invalid command line options (see errors above).", call. = FALSE)
  }
  invisible(NULL)
}
# --------------------------------------------------------------------------------