├── .gitignore
├── .here
├── 0-expression_data_overlap_and_split.R
├── 1-normalize_titrated_data.R
├── 1A-detect_differentially_expressed_genes.R
├── 2-train_test_category.R
├── 2A-small_n_differential_expression.R
├── 3-combine_category_kappa.R
├── 4-ica_pca_feature_reconstruction.R
├── 5-predict_category_reconstructed_data.R
├── 6-save_recon_error_kappa_data.R
├── 7-extract_plier_pathways.R
├── 8-PLIER_pathways_analysis.Rmd
├── 8-PLIER_pathways_analysis.nb.html
├── LICENSE
├── README.md
├── brca_data_urls.txt
├── check_installs.R
├── check_sums.tsv
├── classifier_repeat_wrapper.R
├── combine_clinical_data.R
├── data
    └── .empty
├── diagrams
    ├── RNA-seq_titration_ML_overview.png
    └── RNA-seq_titration_diff_expression_overview.png
├── docker
    ├── R-3.6.3
    │   └── Dockerfile
    ├── R-4.1.2
    │   └── Dockerfile
    └── README.md
├── download_TCGA_data.sh
├── gdc_legacy_archive_brca_manifest.txt
├── load_packages.R
├── models
    └── .empty
├── normalized_data
    ├── .empty
    └── reconstructed_data
    │   └── .empty
├── plots
    ├── .empty
    ├── data
    │   └── .empty
    ├── main
    │   └── .empty
    ├── scripts
    │   ├── .empty
    │   ├── 0-plot_predictor_category_distributions.R
    │   ├── 1A-plot_DEGs.R
    │   ├── 2A-plot_small_n_differential_expression.R
    │   ├── 3-plot_category_kappa.R
    │   ├── 6-plot_recon_error.R
    │   ├── 6-plot_recon_kappa.R
    │   ├── 7-plot_plier_pathways.R
    │   ├── recon_kappa_difference.R
    │   └── visualize_expression.R
    ├── supplementary
    │   └── .empty
    └── visualize_expression
    │   └── .empty
├── prepare_GBM_data.R
├── results
    ├── .empty
    ├── array_rnaseq_ratio
    │   ├── ratio.2022-02-18_18_50_01_UTC.tsv
    │   └── ratio.tracking.tsv
    ├── differential_expression
    │   └── .empty
    └── reconstructed_data
    │   └── .empty
├── retrieve_MC3_mutations.py
├── run_all_analyses_and_plots.sh
├── run_differential_expression_experiments.sh
├── run_experiments.R
├── run_machine_learning_experiments.sh
├── search_geo_arrayexpress.py
├── tcga_tss_codes.csv
└── util
    ├── CrossNorm.R
    ├── ICA_PCA_reconstruction_functions.R
    ├── color_blind_friendly_palette.R
    ├── differential_expression_functions.R
    ├── normalization_functions.R
    ├── option_functions.R
    └── train_test_functions.R


/.gitignore:
--------------------------------------------------------------------------------
 1 | .*bash_history
 2 | .config
 3 | .local
 4 | .refinebio.yaml
 5 | .Rhistory
 6 | .rstudio
 7 | .wget-hsts
 8 | data
 9 | normalized_data
10 | models
11 | results
12 | plots/main/*.pdf
13 | plots/supplementary/*.pdf
14 | plots/visualize_expression/*.pdf
15 | .Rproj.user
16 | RNAseq_titration_results.Rproj
17 | ._RNAseq_titration_results.Rproj
18 | .DS_Store
19 | 


--------------------------------------------------------------------------------
/.here:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/.here


--------------------------------------------------------------------------------
/0-expression_data_overlap_and_split.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Jun 2016
  2 | # The purpose of this script is to read in TCGA array and sequencing data,
  3 | # to preprocess leaving only overlapping genes and samples with complete
  4 | # category information, and to split the data into training and testing sets
  5 | # It should be run from the command line through the run_experiments.R script
  6 | 
  7 | option_list <- list(
  8 |   optparse::make_option("--cancer_type",
  9 |                         default = NA_character_,
 10 |                         help = "Cancer type"),
 11 |   optparse::make_option("--predictor",
 12 |                         default = NA_character_,
 13 |                         help = "Predictor used"),
 14 |   optparse::make_option("--seed1",
 15 |                         default = NA_integer_,
 16 |                         help = "Random seed"),
 17 |   optparse::make_option("--null_model",
 18 |                         action = "store_true",
 19 |                         default = FALSE,
 20 |                         help = "Permute dependent variable (within subtype if predictor is a gene)")
 21 | )
 22 | 
 23 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 24 | source(here::here("util/option_functions.R"))
 25 | check_options(opt)
 26 | 
 27 | # load libraries
 28 | suppressMessages(source(here::here("load_packages.R")))
 29 | 
 30 | # set options
 31 | cancer_type <- opt$cancer_type
 32 | predictor <- opt$predictor
 33 | null_model <- opt$null_model
 34 | file_identifier <- ifelse(null_model,
 35 |                           str_c(cancer_type, predictor, "null", sep = "_"),
 36 |                           str_c(cancer_type, predictor, sep = "_"))
 37 | 
 38 | # set seed
 39 | initial.seed <- as.integer(opt$seed1)
 40 | set.seed(initial.seed)
 41 | # set seed for splitting into train/test here, before null_model scramble
 42 | split.seed <- sample(1:10000, 1)
 43 | 
 44 | # define directories
 45 | data.dir <- here::here("data")
 46 | plot.dir <- here::here("plots")
 47 | plot.data.dir <- file.path(plot.dir, "data")
 48 | res.dir <- here::here("results")
 49 | 
 50 | # name input files
 51 | seq.exprs.filename <- paste0(cancer_type, "RNASeq.pcl")
 52 | array.exprs.filename <- paste0(cancer_type, "array.pcl")
 53 | clin.filename <- paste0("combined_clinical_data.", cancer_type, ".tsv")
 54 | 
 55 | # name output files
 56 | category.distribtion.plot.data <- paste0(file_identifier,
 57 |                                          "_dist_split_stacked_bar_",
 58 |                                          initial.seed, ".tsv")
 59 | train.test.labels <- paste0(file_identifier,
 60 |                             "_matchedSamples_training_testing_split_labels_",
 61 |                             initial.seed, ".tsv")
 62 | 
 63 | #### read in expression and clinical data --------------------------------------
 64 | 
 65 | # read in expression data as data.frame
 66 | seq.data <- fread(file.path(data.dir, seq.exprs.filename),
 67 |                   data.table = FALSE)
 68 | array.data <- fread(file.path(data.dir, array.exprs.filename),
 69 |                     data.table = FALSE)
 70 | clinical <- fread(file.path(data.dir, clin.filename),
 71 |                   data.table = FALSE)
 72 | 
 73 | # filter clinical data to keep tumor samples with complete data
 74 | # if the predictor is subtype, category is just a copy of the subtype column
 75 | # if the predictor is a gene, we select subtype and the gene
 76 | # this ensures downstream mutation predictions will have subtype available as covariate
 77 | clinical <- clinical %>%
 78 |   mutate(category = !!sym(predictor)) %>%
 79 |   select(Sample, Type, "subtype", "category") %>%
 80 |   filter(Type == "tumor") %>%
 81 |   tidyr::drop_na()
 82 | 
 83 | # change first column name to "gene"
 84 | colnames(array.data)[1] <- colnames(seq.data)[1] <- "gene"
 85 | 
 86 | # remove tumor-adjacent samples from the array data set
 87 | array.tumor.smpls <- clinical$Sample
 88 | array.tumor.smpls <- substr(array.tumor.smpls, 1, 15)
 89 | 
 90 | array.category <- clinical$category
 91 | 
 92 | # filter array data only to include tumor samples
 93 | array.data <- array.data[, c(1, which(colnames(array.data) %in%
 94 |                                         array.tumor.smpls))]
 95 | 
 96 | # what are the overlapping sample names -- "matched" samples?
 97 | # includes "gene" column
 98 | sample.overlap <- intersect(colnames(array.data), colnames(seq.data))
 99 | 
100 | # what are the overlapping genes between the two platforms?
101 | gene.overlap <- intersect(array.data$gene, seq.data$gene)
102 | 
103 | # filter the expression data for matched samples and overlapping genes
104 | array.matched <- array.data[which(array.data$gene %in% gene.overlap),
105 |                             sample.overlap]
106 | seq.matched <- seq.data[which(seq.data$gene %in% gene.overlap),
107 |                         sample.overlap]
108 | 
109 | # reorder genes on both platforms
110 | array.matched <- array.matched[order(array.matched$gene), ]
111 | seq.matched <- seq.matched[order(seq.matched$gene), ]
112 | 
113 | # reorder samples on both platforms
114 | array.matched <- array.matched[, c(1, (order(colnames(array.matched)[-1]) + 1))]
115 | seq.matched <- seq.matched[, c(1, (order(colnames(seq.matched)[-1]) + 1))]
116 | 
117 | # check reordering sample names worked as expected
118 | if (any(colnames(array.matched) != colnames(seq.matched))) {
119 |   stop("Column name reordering did not work as expected in 0-expression_data_overlap_and_split.R")
120 | }
121 | 
122 | # keep category labels for samples with expression data
123 | array.category <- as.factor(array.category[which(array.tumor.smpls %in%
124 |                                                    colnames(array.matched))])
125 | 
126 | array.tumor.smpls <- array.tumor.smpls[which(array.tumor.smpls %in%
127 |                                                colnames(array.matched))]
128 | 
129 | # remove "unmatched" / "raw" expression data
130 | rm(array.data, seq.data)
131 | 
132 | # write matched only samples to pcl files
133 | array.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", array.exprs.filename)
134 | array.output.nm <- file.path(data.dir, array.output.nm)
135 | write.table(array.matched, file = array.output.nm,
136 |             row.names = FALSE, quote = FALSE, sep = "\t")
137 | 
138 | seq.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", seq.exprs.filename)
139 | seq.output.nm <- file.path(data.dir, seq.output.nm)
140 | write.table(seq.matched, file = seq.output.nm,
141 |             row.names = FALSE, quote = FALSE, sep = "\t")
142 | 
143 | #### split data into balanced training and testing sets ------------------------
144 | 
145 | # order array category to match the expression data order
146 | array.category <- array.category[order(array.tumor.smpls)]
147 | 
148 | message(paste("\nRandom seed for splitting into testing and training:",
149 |               split.seed), appendLF = TRUE)
150 | 
151 | set.seed(split.seed)
152 | train.index <- unlist(createDataPartition(array.category, times = 1, p = (2/3)))
153 | 
154 | #### write training/test labels to file ----------------------------------------
155 | 
156 | lbl <- rep("test", length(array.tumor.smpls))
157 | lbl[train.index] <- "train"
158 | lbl.df <- tibble(sample = colnames(array.matched)[2:ncol(array.matched)],
159 |                  split = lbl,
160 |                  category = as.character(array.category))
161 | 
162 | # add back subtype
163 | lbl.df <- lbl.df %>% 
164 |   left_join(clinical %>%
165 |               select(Sample, subtype),
166 |             by = c("sample" = "Sample"))
167 | 
168 | #### permute category labels for null model ------------------------------------
169 | # this comes after createDataPartition() to ensure same samples go to train/test
170 | # grouping by split ensures labels remain balanced within train and test
171 | # if null_model is specified and predicting subtype, permute subtype labels
172 | # if null_model is specified and predicting mutation status,
173 | #   permute mutation labels WITHIN subtype
174 | 
175 | if (null_model) {
176 |   if (predictor == "subtype") { # here, subtype = category
177 |     lbl.df <- lbl.df %>%
178 |       group_by(split) %>%
179 |       mutate(category = case_when(split == "train" ~ sample(category),
180 |                                   split == "test" ~ category)) %>%
181 |       ungroup()
182 |   } else { # if predictor not subtype, then must be mutation
183 |     lbl.df <- lbl.df %>% # subtype = subtype, category = TP53 or PIK3CA 0/1
184 |       group_by(split, subtype) %>% # sample within subtype
185 |       mutate(category = case_when(split == "train" ~ sample(category),
186 |                                   split == "test" ~ category)) %>%
187 |       ungroup()
188 |   }
189 | }
190 | 
191 | write.table(lbl.df,
192 |             file = file.path(res.dir, train.test.labels),
193 |             quote = FALSE, sep = "\t", row.names = FALSE)
194 | 
195 | #### save plot data frame ------------------------------------------------------
196 | 
197 | plot.df <- lbl.df %>%
198 |   mutate(split = case_when(split == "train" ~ "Train (2/3)",
199 |                            split == "test" ~ "Test (1/3)")) %>%
200 |   bind_rows(lbl.df %>% mutate(split = "Whole")) %>%
201 |   mutate(initial_seed = initial.seed)
202 | 
203 | write.table(plot.df,
204 |             file = file.path(plot.data.dir,
205 |                              category.distribtion.plot.data),
206 |             quote = FALSE, sep = "\t", row.names = FALSE)
207 | 


--------------------------------------------------------------------------------
/1-normalize_titrated_data.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Jun 2016
  2 | # The purpose of this script is to read in TCGA array and sequencing data,
  3 | # already pre-processed to only include matched tumor samples,
  4 | # (output of 0-expression_data_overlap_and_split.R) and to normalize
  5 | # the data.
  6 | # It should be run from the command line through the run_experiments.R script
  7 | 
  8 | option_list <- list(
  9 |   optparse::make_option("--cancer_type",
 10 |                         default = NA_character_,
 11 |                         help = "Cancer type"),
 12 |   optparse::make_option("--predictor",
 13 |                         default = NA_character_,
 14 |                         help = "Predictor used"),
 15 |   optparse::make_option("--seed1",
 16 |                         default = NA_integer_,
 17 |                         help = "Random seed"),
 18 |   optparse::make_option("--seed2",
 19 |                         default = NA_integer_,
 20 |                         help = "Random seed"),
 21 |   optparse::make_option("--null_model",
 22 |                         action = "store_true",
 23 |                         default = FALSE,
 24 |                         help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"),
 25 |   optparse::make_option("--ncores",
 26 |                         default = NA_integer_,
 27 |                         help = "Set the number of cores to use")
 28 | )
 29 | 
 30 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 31 | source(here::here("util/option_functions.R"))
 32 | check_options(opt)
 33 | 
 34 | # load libraries
 35 | suppressMessages(source(here::here("load_packages.R")))
 36 | source(here::here("util", "normalization_functions.R"))
 37 | 
 38 | # set options
 39 | cancer_type <- opt$cancer_type
 40 | predictor <- opt$predictor
 41 | null_model <- opt$null_model
 42 | file_identifier <- ifelse(null_model,
 43 |                           str_c(cancer_type, predictor, "null", sep = "_"),
 44 |                           str_c(cancer_type, predictor, sep = "_"))
 45 | ncores <- max(1, # at least one worker, even when detectCores() - 1 is 0
 46 |               min(parallel::detectCores() - 1, opt$ncores,
 47 |                   na.rm = TRUE))
 48 | 
 49 | # set seed
 50 | filename.seed <- as.integer(opt$seed1)
 51 | initial.seed <- as.integer(opt$seed2)
 52 | set.seed(initial.seed)
 53 | 
 54 | # define directories
 55 | data.dir <- here::here("data")
 56 | norm.data.dir <- here::here("normalized_data")
 57 | res.dir <- here::here("results")
 58 | 
 59 | # name input files
 60 | seq.file <- paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl")
 61 | array.file <- paste0(cancer_type, "array_matchedOnly_ordered.pcl")
 62 | train.test.file <- paste0(file_identifier,
 63 |                           "_matchedSamples_training_testing_split_labels_",
 64 |                           filename.seed, ".tsv")
 65 | 
 66 | # name output files
 67 | norm.test.object <- paste0(file_identifier,
 68 |                            "_array_seq_test_data_normalized_list_",
 69 |                            filename.seed, ".RDS")
 70 | norm.train.object <- paste0(file_identifier,
 71 |                             "_array_seq_train_titrate_normalized_list_",
 72 |                             filename.seed, ".RDS")
 73 | 
 74 | #### read in data --------------------------------------------------------------
 75 | 
 76 | seq.data <- fread(file.path(data.dir, seq.file), data.table = FALSE)
 77 | array.data <- fread(file.path(data.dir, array.file), data.table = FALSE)
 78 | sample.train.test <- fread(file.path(res.dir, train.test.file), data.table = FALSE)
 79 | 
 80 | #### split samples, titrate ----------------------------------------------------
 81 | 
 82 | train.sample.names <- as.character(sample.train.test$sample[
 83 |   which(sample.train.test$split == "train")])
 84 | test.sample.names <- as.character(sample.train.test$sample[
 85 |   which(sample.train.test$split == "test")])
 86 | 
 87 | # get samples for 'titration'
 88 | titration.seed <- sample(1:10000, 1)
 89 | message(paste("Random seed for titration:",
 90 |               titration.seed), appendLF = TRUE)
 91 | 
 92 | set.seed(titration.seed)
 93 | titrate.sample.list <- lapply(seq(0, 1, by = 0.1),
 94 |                               function(x) GetTitratedSampleNames(train.sample.names,
 95 |                                                                  x))
 96 | names(titrate.sample.list) <- as.character(seq(0, 100, by = 10))
 97 | 
 98 | # these samples will be the RNA-seq samples in any given 'titration' experiment
 99 | # remove rows that are all the same value (e.g., all ones) in any titration
100 | # level or in the test data -- z-scoring will not work on such rows
101 | seq.dt.list <- lapply(titrate.sample.list,
102 |                       function(x) seq.data[, c(1, which(colnames(seq.data) %in% x))])
103 | seq.dt.list[["test"]] <-
104 |   seq.data[, c(1, which(colnames(seq.data) %in% test.sample.names))]
105 | all.same.list <- lapply(seq.dt.list[2:12],
106 |                         function(x){
107 |                           vals <- x[, 2:ncol(x)]
108 |                           indx <- which(apply(vals, 1, check_all_same))
109 |                           return(indx)
110 |                         } )
111 | all.same.indx <- unique(unlist(all.same.list))
112 | # if no rows have all same value (in previous lapply), all.same.indx is integer(0)
113 | # subsetting data frames by -integer(0) results in no rows
114 | # so check that integer vector has length > 0 before subsetting
115 | if (length(all.same.indx) > 0) {
116 |   array.data <- array.data[-all.same.indx, ]
117 |   seq.data <- seq.data[-all.same.indx, ]
118 | }
119 | 
120 | #### get datatables to mix -----------------------------------------------------
121 | 
122 | # get a list that contains an
123 | # array data.table and seq data.table for each level of 'titration'
124 | array.train <-
125 |   data.table(array.data[,
126 |                         c(1, which(colnames(array.data) %in% train.sample.names))])
127 | 
128 | seq.train <-
129 |   data.table(seq.data[,
130 |                       c(1, which(colnames(seq.data) %in% train.sample.names))])
131 | 
132 | titrate.mix.dt.list <- lapply(titrate.sample.list,
133 |                               function(x) GetDataTablesForMixing(array.train,
134 |                                                                  seq.train, x))
135 | 
136 | #### normalize train data ------------------------------------------------------
137 | 
138 | # initialize the list to hold normalized data
139 | norm.titrate.list <- list()
140 | 
141 | # single platform array normalization
142 | norm.titrate.list[["0"]] <-
143 |   SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[1]]$array,
144 |                                      platform = "array",
145 |                                      add.untransformed = TRUE,
146 |                                      add.qn.z = TRUE)
147 | 
148 | # parallel backend
149 | cl <- parallel::makeCluster(ncores)
150 | doParallel::registerDoParallel(cl)
151 | 
152 | # 'mixed' both platform normalization
153 | norm.titrate.list[2:10] <-
154 |   foreach(n = 2:10, .packages = "tidyverse") %dopar% {
155 |     NormalizationWrapper(titrate.mix.dt.list[[n]]$array,
156 |                          titrate.mix.dt.list[[n]]$seq,
157 |                          add.untransformed = TRUE,
158 |                          add.qn.z = TRUE,
159 |                          add.cn = TRUE,
160 |                          add.seurat.training = TRUE)
161 |   }
162 | 
163 | # stop parallel backend
164 | parallel::stopCluster(cl)
165 | # sort out names
166 | names(norm.titrate.list)[2:10] <- names(titrate.mix.dt.list)[2:10]
167 | 
168 | # single platform seq normalization
169 | norm.titrate.list[["100"]] <-
170 |   SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[11]]$seq,
171 |                                      platform = "seq",
172 |                                      add.untransformed = TRUE,
173 |                                      add.qn.z = TRUE)
174 | 
175 | #### normalize test data -------------------------------------------------------
176 | array.test <-
177 |   data.table(array.data[,
178 |                         c(1, which(colnames(array.data) %in% test.sample.names))])
179 | seq.test <-
180 |   data.table(seq.data[, c(1, which(colnames(seq.data) %in% test.sample.names))])
181 | 
182 | # array normalization
183 | array.test.norm.list <-
184 |   SinglePlatformNormalizationWrapper(array.test,
185 |                                      platform = "array",
186 |                                      add.untransformed = TRUE,
187 |                                      add.qn.z = TRUE,
188 |                                      add.cn.test = TRUE,
189 |                                      add.seurat.test = TRUE,
190 |                                      training.list = norm.titrate.list)
191 | 
192 | # seq normalization
193 | # initialize list to hold normalized seq data
194 | seq.test.norm.list <- list()
195 | 
196 | # LOG normalization
197 | seq.test.norm.list[["log"]] <- LOGSeqOnly(seq.test)
198 | # NPN
199 | seq.test.norm.list[["npn"]] <- NPNSingleDT(seq.test)
200 | 
201 | # start parallel backend
202 | cl <- parallel::makeCluster(ncores)
203 | doParallel::registerDoParallel(cl)
204 | 
205 | # QN -- requires reference data
206 | # initialize list to hold QN data
207 | seq.qn.list <- list()
208 | 
209 | # for 0% seq - use 0% LOG array data
210 | seq.qn.list[["0"]] <- QNSingleWithRef(ref.dt = norm.titrate.list$`0`$log,
211 |                                       targ.dt = seq.test)
212 | 
213 | # for 10-90% seq - use the "raw array" training data at each level of sequencing
214 | # data (this is LOG data, but only the array samples)
215 | seq.qn.list[2:10] <-
216 |   foreach(i = 2:10) %dopar% {
217 |     QNSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array,
218 |                     targ.dt = seq.test)
219 |   }
220 | names(seq.qn.list)[2:10] <- names(norm.titrate.list)[2:10]
221 | 
222 | # stop parallel back end
223 | parallel::stopCluster(cl)
224 | 
225 | # QN 100% seq by itself (preprocessCore::normalize.quantiles)
226 | seq.qn.list[["100"]] <- QNSingleDT(seq.test)
227 | 
228 | # add QN seq data to list of normalized test data
229 | seq.test.norm.list[["qn"]] <- seq.qn.list
230 | rm(seq.qn.list)
231 | 
232 | # start parallel backend
233 | cl <- parallel::makeCluster(ncores)
234 | doParallel::registerDoParallel(cl)
235 | 
236 | # QN-Z -- requires reference data
237 | # initialize list to hold QN-Z data
238 | seq.qnz.list <- list()
239 | 
240 | # for 0% seq - use 0% LOG array data
241 | seq.qnz.list[["0"]] <- QNZSingleWithRef(ref.dt = norm.titrate.list$`0`$log,
242 |                                         targ.dt = seq.test)
243 | 
244 | # for 10-90% seq - use the "raw array" training data at each level of sequencing
245 | # data (this is LOG data, but only the array samples)
246 | seq.qnz.list[2:10] <-
247 |   foreach(i = 2:10) %dopar% {
248 |     QNZSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array,
249 |                     targ.dt = seq.test)
250 |   }
251 | names(seq.qnz.list)[2:10] <- names(norm.titrate.list)[2:10]
252 | 
253 | # stop parallel back end
254 | parallel::stopCluster(cl)
255 | 
256 | # QNZ 100% seq by itself (preprocessCore::normalize.quantiles)
257 | seq.qnz.list[["100"]] <- QNZSingleDT(seq.test)
258 | 
259 | # add QNZ seq data to list of normalized test data
260 | seq.test.norm.list[["qn-z"]] <- seq.qnz.list
261 | rm(seq.qnz.list)
262 | 
263 | # start parallel back end
264 | cl <- parallel::makeCluster(ncores)
265 | doParallel::registerDoParallel(cl)
266 | 
267 | # TDM normalization -- requires references
268 | # initialize list to hold TDM data
269 | seq.tdm.list <- list()
270 | 
271 | # for 0% seq - use 0% LOG array data
272 | seq.tdm.list[["0"]] <- TDMSingleWithRef(ref.dt = norm.titrate.list$`0`$log,
273 |                                         targ.dt = seq.test)
274 | # for 10-90% seq - use the "raw array" training data at each level of sequencing
275 | # data (this is LOG data, but only the array samples)
276 | seq.tdm.list[2:10] <-
277 |   foreach(i = 2:10) %dopar% {
278 |     TDMSingleWithRef(ref.dt = norm.titrate.list[[i]]$raw.array,
279 |                      targ.dt = seq.test)
280 |   }
281 | names(seq.tdm.list)[2:10] <- names(norm.titrate.list)[2:10]
282 | 
283 | # stop parallel backend
284 | parallel::stopCluster(cl)
285 | 
286 | # 100% is not applicable for TDM
287 | seq.tdm.list["100"] <- list(NULL)
288 | 
289 | # add TDM seq data to list of normalized test data
290 | seq.test.norm.list[["tdm"]] <- seq.tdm.list
291 | rm(seq.tdm.list)
292 | 
293 | # z-score seq test data
294 | seq.test.norm.list[["z"]] <- ZScoreSingleDT(seq.test)
295 | 
296 | # untransformed seq test data
297 | seq.test.norm.list[["un"]] <- seq.test
298 | 
299 | # CrossNorm RNA-seq test
300 | # Rescale each column, quantile normalize, then rescale each row
301 | seq.test.norm.list[["qn (cn)"]] <- rescale_datatable(seq.test,
302 |                                                      by_column = TRUE) %>%
303 |   QNSingleDT(zero.to.one = TRUE)
304 | 
305 | # Seurat RNA-seq test
306 | # for 10-90% seq - use the integrated training data at each %RNA-seq
307 | 
308 | # parallel backend
309 | cl <- parallel::makeCluster(ncores)
310 | doParallel::registerDoParallel(cl)
311 | 
312 | seq.seurat.list <- foreach(i = 2:10, .packages = "tidyverse") %dopar% { # 2:10 corresponds to 10%-90%
313 |   
314 |   if (!is.null(norm.titrate.list[[i]][["seurat_model"]])) {
315 |     
316 |     tryCatch(SeuratProjectPCATestData(seq.test,
317 |                                       norm.titrate.list[[i]][["seurat_model"]],
318 |                                       vbose = TRUE),
319 |              error = function(e) NULL)
320 |     
321 |   } else {
322 |     NULL
323 |   }
324 |   
325 | }
326 | 
327 | names(seq.seurat.list) <- names(norm.titrate.list)[2:10] # 2:10 corresponds to 10%-90%
328 | 
329 | # stop parallel backend
330 | parallel::stopCluster(cl)
331 | 
332 | # add Seurat RNA-seq test data to list of normalized test data
333 | seq.test.norm.list[["seurat"]] <- seq.seurat.list
334 | rm(seq.seurat.list)
335 | 
336 | # combine array and seq test data into a list
337 | test.norm.list <- list(array = array.test.norm.list,
338 |                        seq = seq.test.norm.list)
339 | 
340 | # save test data
341 | saveRDS(test.norm.list, file = file.path(norm.data.dir, norm.test.object))
342 | 
343 | # save train data after removing Seurat models (just keep Seurat-normed data)
344 | for (n in names(norm.titrate.list)) {
345 |   if ("seurat_model" %in% names(norm.titrate.list[[n]])) {
346 |     norm.titrate.list[[n]][["seurat_model"]] <- NULL
347 |   }
348 | }
349 | 
350 | saveRDS(norm.titrate.list, file = file.path(norm.data.dir, norm.train.object))
351 | 


--------------------------------------------------------------------------------
/1A-detect_differentially_expressed_genes.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Jan 2017
  2 | # The purpose of this analysis is to identify differentially expressed genes
  3 | # between one subtype, specified by the user, and all
  4 | # other subtypes using the limma package for varying amounts of RNA-seq data
  5 | # (0-100%, 10% added at a time; termed 'RNA-seq titration') and normalization
  6 | # methods. It takes RNA-seq and microarray data from matched samples as input,
  7 | # and performs RNA-seq titration and differential expression analysis.
  8 | #
  9 | # USAGE: Rscript 1A-detect_differentially_expressed_genes.R --cancer_type --subtype_vs_others --subtype_vs_subtype --seed --ncores
 10 | 
 11 | option_list <- list(
 12 |   optparse::make_option("--cancer_type",
 13 |                         default = NA_character_,
 14 |                         help = "Cancer type"),
 15 |   optparse::make_option("--subtype_vs_others",
 16 |                         default = NA_character_,
 17 |                         help = "Subtype used for comparison against all others"),
 18 |   optparse::make_option("--subtype_vs_subtype",
 19 |                         default = NA_character_,
 20 |                         help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"),
 21 |   optparse::make_option("--seed",
 22 |                         default = 98,
 23 |                         help = "Random seed [default: %default]"),
 24 |   optparse::make_option("--ncores",
 25 |                         default = NA_integer_,
 26 |                         help = "Set the number of cores to use")
 27 | )
 28 | 
 29 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 30 | source(here::here("util/option_functions.R"))
 31 | check_options(opt)
 32 | 
 33 | # load libraries
 34 | suppressMessages(source(here::here("load_packages.R")))
 35 | source(here::here("util", "normalization_functions.R"))
 36 | source(here::here("util", "differential_expression_functions.R"))
 37 | 
 38 | # set options
 39 | cancer_type <- opt$cancer_type
 40 | subtype_vs_others <- opt$subtype_vs_others
 41 | subtype_vs_subtype <- opt$subtype_vs_subtype
 42 | two_subtypes <- as.vector(stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE))
 43 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here
 44 | ncores <- max(1, # at least one worker, even when detectCores() - 1 is 0
 45 |               min(parallel::detectCores() - 1, opt$ncores,
 46 |                   na.rm = TRUE))
 47 | 
 48 | # set seed
 49 | initial.seed <- opt$seed
 50 | set.seed(initial.seed)
 51 | message(paste("\nInitial seed set to:", initial.seed))
 52 | 
 53 | # define directories
 54 | data.dir <- here::here("data")
 55 | res.dir <- here::here("results")
 56 | norm.dir <- here::here("normalized_data")
 57 | deg.dir <- file.path(res.dir, "differential_expression")
 58 | plot.data.dir <- here::here("plots/data")
 59 | 
 60 | # define input files
 61 | seq.file <- file.path(data.dir,
 62 |                       paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl"))
 63 | array.file <- file.path(data.dir,
 64 |                         paste0(cancer_type, "array_matchedOnly_ordered.pcl"))
 65 | smpl.file <- file.path(res.dir,
 66 |                        list.files(res.dir, # this finds the first example of a subtypes file from cancer_type
 67 |                                   pattern = paste0(file_identifier, # and does not rely on knowing a seed
 68 |                                                    "_matchedSamples_training_testing_split_labels_"))[1])
 69 | 
 70 | # define output files
 71 | subtype_vs_others_lead <- paste0(file_identifier,
 72 |                                  "_titration_differential_exp_eBayes_fits_",
 73 |                                  subtype_vs_others, "vOther")
 74 | two_subtypes_lead <- paste0(file_identifier,
 75 |                             "_titration_differential_exp_eBayes_fits_",
 76 |                             stringr::str_c(two_subtypes, collapse = "v"))
 77 | 
 78 | subtype_vs_others.rds <- file.path(deg.dir,
 79 |                                    paste0(subtype_vs_others_lead, ".RDS"))
 80 | two_subtypes.rds <- file.path(deg.dir,
 81 |                               paste0(two_subtypes_lead, ".RDS"))
 82 | 
 83 | subtype_vs_others.propDE_file <- file.path(plot.data.dir,
 84 |                                            paste0(subtype_vs_others_lead,
 85 |                                                   ".propDE.tsv"))
 86 | two_subtypes.propDE_file <- file.path(plot.data.dir,
 87 |                                       paste0(two_subtypes_lead,
 88 |                                              ".propDE.tsv"))
 89 | 
 90 | subtype_vs_others.silver_file <- file.path(plot.data.dir,
 91 |                                            paste0(subtype_vs_others_lead,
 92 |                                                   ".silver.tsv"))
 93 | two_subtypes.silver_file <- file.path(plot.data.dir,
 94 |                                       paste0(two_subtypes_lead,
 95 |                                              ".silver.tsv"))
 96 | 
 97 | norm.rds <- file.path(norm.dir,
 98 |                       paste0(file_identifier,
 99 |                              "_titration_no_ZTO_transform_with_UN.RDS"))
100 | 
101 | #### read in data --------------------------------------------------------------
102 | 
103 | seq.data <- data.table::fread(seq.file, data.table = F)
104 | array.data <- data.table::fread(array.file, data.table = F)
105 | sample.df <- read.delim(smpl.file)
106 | 
107 | # check that subtypes are in sample.df
108 | for(subtype in c(subtype_vs_others, two_subtypes)) {
109 |   if (!(subtype %in% sample.df$category)) {
110 |     stop(paste("Subtype", subtype, "not found in sample file",
111 |                smpl.file, "in 1A-detect_differentially_expressed_genes.R."))
112 |   }
113 | }
114 | 
115 | sample.names <- sample.df$sample
116 | 
117 | #### RNA-seq 'titration' -------------------------------------------------------
118 | 
119 | titration.seed <- sample(1:10000, 1)
120 | message(paste("Random seed for titration:",
121 |               titration.seed), appendLF=TRUE)
122 | 
123 | set.seed(titration.seed)
124 | # these samples will be the RNA-seq samples in any given 'titration' experiment
125 | titrate.sample.list <-
126 |   lapply(seq(0, 1, by = 0.1),
127 |          function(x) GetTitratedSampleNames(sample.names, x))
128 | 
129 | # remove rows that are equal to all ones in sequencing data -- these are
130 | # essentially missing values and cause issues with z-transformation
131 | seq.dt.list <-
132 |   lapply(titrate.sample.list,
133 |          function(x) seq.data[, c(1, which(colnames(seq.data) %in% x))])
134 | all.same.list <- lapply(seq.dt.list[2:11],
135 |                         function(x){
136 |                           vals <- x[, 2:ncol(x)]
137 |                           indx <- which(apply(vals, 1, check_all_same))
138 |                           return(indx)
139 |                         } )
140 | all.same.indx <- unique(unlist(all.same.list))
141 | # if no rows are all same (in previous lapply), all.same.indx is integer(0)
142 | # subsetting data frames by -integer(0) results in no rows
143 | # so check that integer vector has length > 0 before subsetting
144 | if (length(all.same.indx) > 0) {
145 |   array.data <- array.data[-all.same.indx, ]
146 |   seq.data <- seq.data[-all.same.indx, ]
147 | }
148 | 
149 | # get a list that contains an array data.table and seq data.table for each
150 | # each level of 'titration'
151 | titrate.mix.dt.list <-
152 |   lapply(titrate.sample.list,
153 |          function(x) GetDataTablesForMixing(data.table(array.data),
154 |                                             data.table(seq.data),
155 |                                             x))
156 | names(titrate.mix.dt.list) <- as.character(seq(0, 100, by=10))
157 | 
158 | #### normalize data ------------------------------------------------------------
159 | 
160 | # initialize in the list to hold normalized data
161 | norm.titrate.list <- list()
162 | 
163 | # single platform array normalization
164 | norm.titrate.list[["0"]] <-
165 |   SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[1]]$array,
166 |                                      platform = "array",
167 |                                      zto = FALSE,
168 |                                      add.qn.z = TRUE)
169 | 
170 | # parallel backend
171 | cl <- parallel::makeCluster(ncores)
172 | doParallel::registerDoParallel(cl)
173 | 
174 | # 'mixed' both platform normalization
175 | norm.titrate.list[2:10] <-
176 |   foreach(n = 2:10) %dopar% {
177 |     NormalizationWrapper(titrate.mix.dt.list[[n]]$array,
178 |                          titrate.mix.dt.list[[n]]$seq,
179 |                          zto = FALSE,
180 |                          add.untransformed = TRUE,
181 |                          add.qn.z = TRUE)
182 |   }
183 | names(norm.titrate.list)[2:10] <- names(titrate.mix.dt.list)[2:10]
184 | 
185 | # stop parallel backend
186 | parallel::stopCluster(cl)
187 | 
188 | # single platform seq normalization
189 | norm.titrate.list[["100"]] <-
190 |   SinglePlatformNormalizationWrapper(titrate.mix.dt.list[[11]]$seq,
191 |                                      platform = "seq",
192 |                                      zto = FALSE,
193 |                                      add.untransformed = TRUE,
194 |                                      add.qn.z = TRUE)
195 | 
196 | # save normalized data
197 | saveRDS(norm.titrate.list, file = norm.rds)
198 | 
199 | #### Subtype v. Others  --------------------------------------------------------
200 | # design matrices
201 | design.mat.list <- GetDesignMatrixList(norm.titrate.list, sample.df,
202 |                                        subtype = subtype_vs_others)
203 | # differential expression
204 | fit.results.list <- GetFiteBayesList(norm.list = norm.titrate.list,
205 |                                      design.list = design.mat.list)
206 | # save fit results to RDS
207 | saveRDS(fit.results.list, file = subtype_vs_others.rds)
208 | 
209 | # write top.table.list to results directory
210 | adjust.method <- "BH"
211 | subtype_vs_others.top.table.list <- 
212 |   lapply(fit.results.list,  # for each level of % seq
213 |          function(x)
214 |            lapply(x, # for each normalization method
215 |                   function(y) GetAllGenesTopTable(y, adjust = adjust.method)))
216 | 
217 | # write proportion DE to plot data directory
218 | subtype_vs_others.proportion_de <- GetDataProportionDE(
219 |   subtype_vs_others.top.table.list,
220 |   adjust.method = "BH", cutoff = 0.05)
221 | 
222 | write.table(x = subtype_vs_others.proportion_de, 
223 |             file = subtype_vs_others.propDE_file,
224 |             quote = FALSE, sep = "\t", row.names = FALSE)
225 | 
226 | # write stats for comparison to silver standard to plot data directory
227 | subtype_vs_others.silver <- GetDataSilverStandardStats(
228 |   subtype_vs_others.top.table.list,
229 |   cutoff = 0.05)
230 | 
231 | write.table(subtype_vs_others.silver,
232 |             file = subtype_vs_others.silver_file,
233 |             quote = FALSE, sep = "\t", row.names = FALSE)
234 | 
235 | #### Subtype v. Subtype --------------------------------------------------------
236 | # remove all samples that are not in these subtypes
237 | samples.to.keep <-
238 |   sample.df$sample[which(sample.df$category %in% two_subtypes)]
239 | 
240 | pruned.norm.list <-
241 |   lapply(norm.titrate.list,
242 |          function(x) lapply(x,
243 |                             function(y) y[,
244 |                                           c(1, which(colnames(y) %in%
245 |                                                        samples.to.keep)),
246 |                                           with = FALSE]))
247 | 
248 | # get design matrices
249 | last_subtype.design.list <- GetDesignMatrixList(pruned.norm.list,
250 |                                                 sample.df,
251 |                                                 subtype = last(two_subtypes))
252 | # differential expression
253 | last_subtype.fit.results.list <- GetFiteBayesList(norm.list = pruned.norm.list,
254 |                                                   design.list = last_subtype.design.list)
255 | 
256 | # save fit results to file
257 | saveRDS(last_subtype.fit.results.list,
258 |         file = two_subtypes.rds)
259 | 
260 | # get top.table.list
261 | adjust.method <- "BH"
262 | two_subtypes.top.table.list <- 
263 |   lapply(last_subtype.fit.results.list,  # for each level of % seq
264 |          function(x)
265 |            lapply(x, # for each normalization method
266 |                   function(y) GetAllGenesTopTable(y, adjust = adjust.method)))
267 | 
268 | # write proportion DE to plot data directory
269 | two_subtypes.proportion_de <- GetDataProportionDE(two_subtypes.top.table.list,
270 |                                      adjust.method = "BH", cutoff = 0.05)
271 | 
272 | write.table(x = two_subtypes.proportion_de, 
273 |             file = two_subtypes.propDE_file,
274 |             quote = FALSE, sep = "\t", row.names = FALSE)
275 | 
276 | # write stats for comparison to silver standard to plot data directory
277 | two_subtypes.silver <- GetDataSilverStandardStats(
278 |   two_subtypes.top.table.list,
279 |   cutoff = 0.05)
280 | 
281 | write.table(two_subtypes.silver,
282 |             file = two_subtypes.silver_file,
283 |             quote = FALSE, sep = "\t", row.names = FALSE)
284 | 


--------------------------------------------------------------------------------
/2-train_test_category.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Jul 2016
  2 | # The purpose of this script is to train LASSO, linear SVM, and
  3 | # predictive models on normalized and mixed array and RNA-seq data
  4 | # (output of 1-normalize_titrated_data.R) and then to perform predictions on
  5 | # normalized test data.
  6 | # It should be run from the command line through the run_experiments.R script
  7 | 
  8 | # command line options; each one is read back from `opt` further down
  9 | option_list <- list(
 10 |   # --cancer_type and --predictor together form the file identifier
 11 |   optparse::make_option("--cancer_type", help = "Cancer type",
 12 |                         default = NA_character_),
 13 |   optparse::make_option("--predictor", help = "Predictor used",
 14 |                         default = NA_character_),
 15 |   # --seed1 becomes part of the file names, --seed3 is passed to set.seed()
 16 |   optparse::make_option("--seed1", help = "Random seed",
 17 |                         default = NA_integer_),
 18 |   optparse::make_option("--seed3", help = "Random seed",
 19 |                         default = NA_integer_),
 20 |   # --null_model is a flag: FALSE unless it is given on the command line
 21 |   optparse::make_option("--null_model", action = "store_true", default = FALSE,
 22 |                         help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"),
 23 |   # --ncores is an upper bound on the number of parallel workers
 24 |   optparse::make_option("--ncores", help = "Set the number of cores to use",
 25 |                         default = NA_integer_)
 26 | )
 27 | 
 28 | # parse the command line and pass the result to check_options()
 29 | option.parser <- optparse::OptionParser(option_list = option_list)
 30 | opt <- optparse::parse_args(option.parser)
 31 | source(here::here("util/option_functions.R"))
 32 | check_options(opt)
 33 | 
 34 | # load libraries
 35 | suppressMessages(source(here::here("load_packages.R")))
 36 | source(here::here("util", "train_test_functions.R"))
 37 | 
 38 | # set options
 39 | cancer_type <- opt$cancer_type
 40 | predictor <- opt$predictor
 41 | null_model <- opt$null_model
 42 | # null models get "_null" appended to the identifier used in file names
 43 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 44 | if (null_model) file_identifier <- str_c(file_identifier, "null", sep = "_")
 45 | # never fewer than one worker: detectCores() can return 1 or NA
 46 | core.limits <- stats::na.omit(c(parallel::detectCores() - 1, opt$ncores))
 47 | ncores <- if (length(core.limits) > 0) max(1, min(core.limits)) else 1
 48 | 
 49 | # --seed1 only labels files (it is part of every file name below);
 50 | # --seed3 seeds the random number generator
 51 | filename.seed <- opt$seed1
 52 | initial.seed <- opt$seed3
 53 | set.seed(initial.seed)
 54 | 
 55 | # project directories, resolved with here::here()
 56 | res.dir <- here::here("results")
 57 | mdl.dir <- here::here("models")
 58 | norm.data.dir <- here::here("normalized_data")
 59 | 
 60 | # every file name below is <file_identifier><stem><seed><extension>; build
 61 | # the shared <seed><extension> endings once
 62 | seed.rds <- paste0(filename.seed, ".RDS")
 63 | seed.tsv <- paste0(filename.seed, ".tsv")
 64 | 
 65 | # input files (bare file names; a directory is prepended when they are read)
 66 | norm.test.object <-
 67 |   paste0(file_identifier, "_array_seq_test_data_normalized_list_", seed.rds)
 68 | norm.train.object <-
 69 |   paste0(file_identifier, "_array_seq_train_titrate_normalized_list_", seed.rds)
 70 | train.test.labels <-
 71 |   paste0(file_identifier, "_matchedSamples_training_testing_split_labels_", seed.tsv)
 72 | 
 73 | # output file for the trained models (bare file name, saved under mdl.dir)
 74 | trained.models.object <- paste0(file_identifier, "_train_3_models_", seed.rds)
 75 | 
 76 | # output files for the kappa tables (full paths inside the results directory)
 77 | train.kappa.file <-
 78 |   paste0(file_identifier, "_train_3_models_training_set_total_kappa_", seed.tsv)
 79 | train.kappa.file <- file.path(res.dir, train.kappa.file)
 80 | array.kappa.file <-
 81 |   paste0(file_identifier, "_train_3_models_array_kappa_", seed.tsv)
 82 | array.kappa.file <- file.path(res.dir, array.kappa.file)
 83 | seq.kappa.file <-
 84 |   paste0(file_identifier, "_train_3_models_seq_kappa_", seed.tsv)
 85 | seq.kappa.file <- file.path(res.dir, seq.kappa.file)
 86 | 
 87 | #### load data -----------------------------------------------------------------
 88 | 
 89 | # labels of the training/testing split, with category stored as a factor
 90 | sample.train.test <- fread(file.path(res.dir, train.test.labels), data.table = FALSE)
 91 | sample.train.test$category <- as.factor(sample.train.test$category)
 92 | 
 93 | # normalized training (titrated) and test data
 94 | norm.titrate.list <- readRDS(file.path(norm.data.dir, norm.train.object))
 95 | norm.test.list <- readRDS(file.path(norm.data.dir, norm.test.object))
 96 | 
 97 | # category labels for each perc of seq data, based on each level's `z` element
 98 | category.norm.list <- lapply(norm.titrate.list, function(titration.level) {
 99 |   GetOrderedCategoryLabels(titration.level$z, sample.train.test)
100 | })
101 | # reorganize the normalized list by normalization method; drop the original
102 | restr.train.list <- RestructureNormList(norm.titrate.list)
103 | rm(norm.titrate.list)
104 | 
105 | #### training ------------------------------------------------------------------
106 | 
107 | # draw, report and set the seed used when the folds are created
108 | folds.seed <- sample(1:10000, 1)
109 | message(paste("Random seed for createFolds:", folds.seed), appendLF = TRUE)
110 | set.seed(folds.seed)
111 | folds.list <- lapply(category.norm.list, function(x) createFolds(x, k = 5))
112 | 
113 | # parallel backend
114 | cl <- parallel::makeCluster(ncores)
115 | registerDoParallel(cl)
116 | 
117 | resample.seed <- sample(1:10000, 1)
118 | message(paste("Random seed for resampling:", resample.seed), appendLF = TRUE)
119 | 
120 | # seq_along() rather than 1:length(): an empty list is iterated zero times
121 | train.model.list <-
122 |   foreach(n = seq_along(restr.train.list)) %do% {  # foreach norm method
123 |     foreach(m = seq_along(category.norm.list)) %dopar% {  # foreach % seq level
124 |       TrainThreeModels(dt = restr.train.list[[n]][[m]],
125 |                        category = category.norm.list[[m]],
126 |                        seed = resample.seed,
127 |                        folds.list = folds.list[[m]])
128 |     }
129 |   }
130 | 
131 | stopCluster(cl)  # stop parallel backend
132 | 
133 | # carry the names over from the training data: first the norm methods ...
134 | names(train.model.list) <- names(restr.train.list)
135 | 
136 | # ... then the names of the elements nested under each norm method
137 | # NOTE(review): with SIMPLIFY = TRUE mapply() may return a matrix when all
138 | # results have equal length -- confirm RestructureTrainedList() handles that
139 | CopyNames <- function(x, y) stats::setNames(x, names(y))
140 | train.model.list <- mapply(CopyNames, x = train.model.list,
141 |                            y = restr.train.list, SIMPLIFY = TRUE)
142 | 
143 | # restructure trained model list so from top to bottom: norm method -> model
144 | # type -> % seq level (0 - 100)
145 | train.model.list <- RestructureTrainedList(train.model.list)
146 | 
147 | # write the trained models to the models directory
148 | saveRDS(train.model.list, file = file.path(mdl.dir, trained.models.object))
149 | 
150 | #### training kappa ---------------------------------------------------------
151 | # the 0 and 100 entries of the tdm list are NULL, so drop them
152 | restr.train.list$tdm$`0` <- NULL
153 | restr.train.list$tdm$`100` <- NULL
154 | 
155 | # kappa statistics for predictions on the training data, written as TSV
156 | train.kappa.df <- PredictWrapper(
157 |   train.model.list = train.model.list, pred.list = restr.train.list,
158 |   sample.df = sample.train.test, only.kap = TRUE
159 | )
160 | 
161 | write.table(train.kappa.df, file = train.kappa.file,
162 |             sep = "\t", quote = FALSE, row.names = FALSE)
163 | 
164 | #### predictions - test data ---------------------------------------------------
165 | 
166 | # get predictions on array test data as a data frame and write them to file
167 | array.kappa.df <- PredictWrapper(train.model.list = train.model.list,
168 |                                  pred.list = norm.test.list$array,
169 |                                  sample.df = sample.train.test,
170 |                                  only.kap = TRUE)
171 | write.table(array.kappa.df, file = array.kappa.file, sep = "\t",
172 |             row.names = FALSE, quote = FALSE)
173 | 
174 | # for the 0 perc seq level of the titration, the model tested on log transformed
175 | # array data (100% array data) should be tested on the TDM transformed seq data
176 | for (i in seq_along(train.model.list[["tdm"]])) {  # seq_along(): safe if empty
177 |   train.model.list[["tdm"]][[i]]$`0` <- train.model.list[["log"]][[i]]$`0`
178 |   # move the appended `0` to the front; assumes 9 levels before it -- TODO confirm
179 |   train.model.list[["tdm"]][[i]] <- train.model.list[["tdm"]][[i]][c(10, 1:9)]
180 | }
181 | 
182 | # get rid of 100 tdm list, it's NULL
183 | norm.test.list$seq$tdm$`100` <- NULL
184 | 
185 | # get predictions on RNA-seq test data as a data frame and write them to file
186 | seq.kappa.df <- PredictWrapper(train.model.list = train.model.list,
187 |                                pred.list = norm.test.list$seq,
188 |                                sample.df = sample.train.test,
189 |                                only.kap = TRUE)
190 | 
191 | write.table(seq.kappa.df, file = seq.kappa.file, sep = "\t",
192 |             row.names = FALSE, quote = FALSE)
193 | 


--------------------------------------------------------------------------------
/2A-small_n_differential_expression.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Feb 2016
  2 | # The purpose of this analysis is to examine how normalization methods
  3 | # (quantile normalization or z-transformation) perform wrt differential
  4 | # expression when there are a small number of samples on each platform
  5 | #
  6 | # USAGE: Rscript 2A-small_n_differential_expression.R --cancer_type --subtype_vs_subtype --ncores
  7 | 
  8 | # command line options understood by this script
  9 | option_list <- list(
 10 |   optparse::make_option("--cancer_type", help = "Cancer type",
 11 |                         default = NA_character_),
 12 |   optparse::make_option("--subtype_vs_subtype", default = NA_character_,
 13 |                         help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"),
 14 |   # the only option with a non-missing default
 15 |   optparse::make_option("--seed", help = "Random seed", default = 3255),
 16 |   # upper bound on the number of parallel workers (see ncores below)
 17 |   optparse::make_option("--ncores", help = "Set the number of cores to use",
 18 |                         default = NA_integer_)
 19 | )
 20 | 
 21 | # parse the command line and pass the result to check_options()
 22 | option.parser <- optparse::OptionParser(option_list = option_list)
 23 | opt <- optparse::parse_args(option.parser)
 24 | source(here::here("util/option_functions.R"))
 25 | check_options(opt)
 26 | 
 27 | # load libraries
 28 | suppressMessages(source(here::here("load_packages.R")))
 29 | source(here::here("util", "normalization_functions.R"))
 30 | source(here::here("util", "differential_expression_functions.R"))
 31 | source(here::here("util", "color_blind_friendly_palette.R"))
 32 | 
 33 | # set options
 34 | cancer_type <- opt$cancer_type
 35 | subtype_vs_subtype <- opt$subtype_vs_subtype
 36 | two_subtypes <- as.vector(stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE))
 37 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here
 38 | # never fewer than one worker: detectCores() can return 1 or NA
 39 | core.limits <- stats::na.omit(c(parallel::detectCores() - 1, opt$ncores))
 40 | ncores <- if (length(core.limits) > 0) max(1, min(core.limits)) else 1
 41 | 
 42 | # set seed
 43 | initial.seed <- opt$seed
 44 | set.seed(initial.seed)
 45 | 
 46 | # one additional seed per titration level (10-90% RNA-seq), for
 47 | # reproducibility within the foreach dopar loop
 48 | random_seeds <- sample(1:10000, size = 9)
 49 | message(paste("\nInitial seed set to:", initial.seed))
 50 | 
 51 | # define directories
 52 | data.dir <- here::here("data")
 53 | res.dir <- here::here("results")
 54 | deg.dir <- file.path(res.dir, "differential_expression")
 55 | plot.dir <- here::here("plots")
 56 | plot.data.dir <- file.path(plot.dir, "data")
 57 | 
 58 | # define input files; the labels file is the first match for cancer_type (any seed)
 59 | seq.file <- file.path(data.dir, paste0(cancer_type, "RNASeq_matchedOnly_ordered.pcl"))
 60 | array.file <- file.path(data.dir, paste0(cancer_type, "array_matchedOnly_ordered.pcl"))
 61 | smpl.candidates <- list.files(res.dir, pattern = paste0(file_identifier,
 62 |                               "_matchedSamples_training_testing_split_labels_"))
 63 | if (length(smpl.candidates) == 0) {  # otherwise the path would end in "NA"
 64 |   stop("No ", file_identifier, " split labels file found in ", res.dir)
 65 | }
 66 | smpl.file <- file.path(res.dir, smpl.candidates[1])
 67 | 
 68 | #### read in data --------------------------------------------------------------
 69 | 
 70 | # FALSE is spelled out because F is an ordinary variable that can be reassigned
 71 | seq.data <- data.table::fread(seq.file, data.table = FALSE)
 72 | array.data <- data.table::fread(array.file, data.table = FALSE)
 73 | sample.df <- read.delim(smpl.file)
 74 | 
 75 | # check that subtypes are in sample.df
 76 | for (subtype in two_subtypes) {
 77 |   if (!(subtype %in% sample.df$category)) {
 78 |     stop(paste("Subtype", subtype, "not found in sample file",
 79 |                smpl.file, "in 2A-small_n_differential_expression.R."))
 80 |   }
 81 | }
 82 | sample.names <- sample.df$sample
 83 | 
 84 | #### main ----------------------------------------------------------------------
 85 | 
 86 | # leave only subtypes of interest to choose from & make data.table
 87 | # remove all samples that are not subtypes of interest
 88 | samples.to.keep <-
 89 |   sample.df$sample[which(sample.df$category %in% two_subtypes)]
 90 | 
 91 | # column 1 is always kept next to the retained sample columns
 92 | array.dt <- data.table(array.data[, c(1, which(colnames(array.data) %in% samples.to.keep))])
 93 | seq.dt <- data.table(seq.data[, c(1, which(colnames(seq.data) %in% samples.to.keep))])
 94 | sample.df <- sample.df[which(sample.df$sample %in% samples.to.keep), ]
 95 | 
 96 | smaller_subtype_size <- min(table(as.character(sample.df$category)))
 97 | 
 98 | # different sizes of n to test, limited to the size of the smaller subtype
 99 | no.samples <- c(3, 4, 5, 6, 8, 10, 15, 25, 50)
100 | no.samples <- no.samples[which(no.samples <= smaller_subtype_size)]
101 | if (length(no.samples) == 0) {  # max() below would otherwise return -Inf
102 |   stop(paste("Smaller subtype has only", smaller_subtype_size,
103 |              "samples; at least 3 are needed in 2A-small_n_differential_expression.R."))
104 | }
105 | message(paste("Smaller subtype has", smaller_subtype_size, "samples,",
106 |               "so using up to", max(no.samples), "samples in 2A-small_n_differential_expression.R"))
107 | 
108 | # initialize list to hold Jaccard, Rand, Spearman data from the 10 trials
109 | stats.df.list <- list()
110 | 
111 | # Do this at 10-90% RNA-seq titration levels
112 | # parallel backend
113 | cl <- parallel::makeCluster(ncores)
114 | doParallel::registerDoParallel(cl)
115 | 
116 | # at each titration level (10-90% RNA-seq); seq_prop and its seed are iterated
117 | # in lockstep so no seed is looked up with the floating point index seq_prop*10
118 | stats.df.list[1:9] <- foreach(seq_prop = seq(0.1, .9, 0.1), prop_seed = random_seeds,
119 |                               .packages = c("tidyverse")) %dopar% {
120 |   set.seed(prop_seed)
121 | 
122 |   # we're going to repeat the small n experiment 10 times
123 |   stats.df.iter_list <- list() # this is returned to stats.df.list each iteration
124 |   for (trial.iter in 1:10) {
125 | 
126 |     # for each n (3...50), get the sample names that will be included in the
127 |     # experiment and on each platform
128 |     sample.list <-
129 |       lapply(no.samples,  # for each n (3...50)
130 |              function(x) GetSamplesforMixingSmallN(x, sample.df,
131 |                                                    subtype = data.table::last(two_subtypes),
132 |                                                    seq_proportion = seq_prop))
133 | 
134 |     # initialize list to hold differential expression results (eBayes output)
135 |     master.deg.list <- list()
136 | 
137 |     for (smpl.no.iter in seq_along(sample.list)) {  # for each n (3...50)
138 |       # normalize data
139 |       n_array <- length(sample.list[[smpl.no.iter]]$array)
140 |       n_seq <- length(sample.list[[smpl.no.iter]]$seq)
141 |       # require at least three array and seq samples (scalar test, hence &&)
142 |       if (n_array >= 3 && n_seq >= 3) {
143 |         norm.list <- SmallNNormWrapper(array.dt = array.dt,
144 |                                        seq.dt = seq.dt,
145 |                                        mix.list = sample.list[[smpl.no.iter]],
146 |                                        zto = FALSE)
147 |         # perform differential expression analysis
148 |         master.deg.list[[as.character(no.samples[smpl.no.iter])]] <-
149 |           SmallNDEGWrapper(norm.list = norm.list, sample.df = sample.df,
150 |                            subtype = data.table::last(two_subtypes))
151 |       }
152 |     }
153 | 
154 |     top.table.list <-
155 |       lapply(master.deg.list,  # for each n (3...50)
156 |              function(x)  # for each normalization method
157 |                lapply(x, function(y) GetAllGenesTopTable(y)))  # extract DEGs
158 | 
159 |     # how do the (100-X)%/X% array/seq differentially expressed genes compare to
160 |     # the platform-specific standards?
161 |     if (length(top.table.list) > 0) {
162 |       stats.df.iter_list[[trial.iter]] <- GetSmallNSilverStandardStats(top.table.list,
163 |                                                                        cutoff = 0.1)
164 |     }
165 |   }
166 |   stats.df.iter_list # return stats.df.iter_list to stats.df.list
167 | }
168 | 
169 | # stop parallel backend
170 | parallel::stopCluster(cl)
171 | 
172 | # label each list element with its % RNA-seq level (10, 20, ..., 90)
173 | names(stats.df.list)[1:9] <- as.character(seq(10, 90, 10))
174 | 
175 | # melt the nested list of data.frames into one long data.frame and name its
176 | # columns; the last two are called iteration and seq_prop
177 | id.columns <- c("platform", "normalization", "no.samples")
178 | stats.df <- reshape2::melt(stats.df.list, id.vars = id.columns)
179 | names(stats.df) <- c(id.columns, "measure", "value", "iteration", "seq_prop")
180 | 
181 | # seq_prop becomes a factor whose levels run from 0% to 100% RNA-seq
182 | seq_prop.levels <- str_c(seq(0, 100, 10), "% RNA-seq")
183 | stats.df <- stats.df %>%
184 |   mutate(seq_prop = factor(str_c(seq_prop, "% RNA-seq"), levels = seq_prop.levels))
185 | 
186 | # output file name: <file_identifier>_small_n_<subtype>v<subtype>_results.tsv
187 | subtypes_combination <- stringr::str_c(two_subtypes, collapse = "v")
188 | results.file <- paste0(file_identifier, "_small_n_", subtypes_combination,
189 |                        "_results.tsv")
190 | write.table(stats.df,
191 |             file = file.path(plot.data.dir, results.file),
192 |             sep = "\t", quote = FALSE, row.names = FALSE)
193 | 


--------------------------------------------------------------------------------
/3-combine_category_kappa.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Jul 2016
  2 | # The purpose of this script is to combine and save Kappa statistics from category
  3 | # predictions on hold-out data. It should be run from the command line
  4 | # through the classifier_repeat_wrapper.R script or alternatively
  5 | # USAGE: Rscript 3-combine_category_kappa.R
  6 | 
  7 | option_list <- list(
  8 |   optparse::make_option("--cancer_type",
  9 |                         default = NA_character_,
 10 |                         help = "Cancer type"),
 11 |   optparse::make_option("--predictor",
 12 |                         default = NA_character_,
 13 |                         help = "Predictor used"),
 14 |   optparse::make_option("--null_model",
 15 |                         action = "store_true",
 16 |                         default = FALSE,
 17 |                         help = "Use null model as baseline for plotting delta kappa")
 18 | )
 19 | 
 20 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 21 | source(here::here("util/option_functions.R"))
 22 | check_options(opt)
 23 | 
 24 | # load libraries
 25 | suppressMessages(source(here::here("load_packages.R")))
 26 | source(here::here("util", "color_blind_friendly_palette.R"))
 27 | 
 28 | # set options
 29 | cancer_type <- opt$cancer_type
 30 | predictor <- opt$predictor
 31 | null_model <- opt$null_model
 32 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 33 | 
 34 | # define directories
 35 | plot.dir <- here::here("plots")
 36 | plot.data.dir <- file.path(plot.dir, "data")
 37 | res.dir <- here::here("results")
 38 | 
 39 | # list array and seq files from results directory
 40 | lf <- list.files(res.dir, full.names = TRUE)
 41 | array.files <- lf[grepl(paste0(file_identifier,
 42 |                                "_train_3_models_array_kappa_"), lf)]
 43 | seq.files <- lf[grepl(paste0(file_identifier,
 44 |                              "_train_3_models_seq_kappa_"), lf)]
 45 | if (null_model) {
 46 |   null_array.files <- lf[grepl(paste0(file_identifier,
 47 |                                       "_null_train_3_models_array_kappa_"), lf)]
 48 |   null_seq.files <- lf[grepl(paste0(file_identifier,
 49 |                                     "_null_train_3_models_seq_kappa_"), lf)]
 50 |   
 51 |   # check that we have ordered pairs of regular and null files for array and seq
 52 |   array_seeds <- stringr::str_sub(array.files, -8, -5)
 53 |   null_array_seeds <- stringr::str_sub(null_array.files, -8, -5)
 54 |   seq_seeds <- stringr::str_sub(seq.files, -8, -5)
 55 |   null_seq_seeds <- stringr::str_sub(null_seq.files, -8, -5)
 56 |   if (!(all(array_seeds == null_array_seeds) &
 57 |         all(seq_seeds == null_seq_seeds))) {
 58 |     stop("Array or seq seeds do not match in delta kappa plotting script.")
 59 |   }
 60 |   
 61 | }
 62 | 
 63 | # define output files; names contain "delta_kappa" when kappa is reported
 64 | # relative to the null models (plain if/else: null_model is a single logical)
 65 | if (null_model) {
 66 |   test.df.filename <-
 67 |     file.path(plot.data.dir,
 68 |               paste0(file_identifier, "_train_3_models_delta_kappa.tsv"))
 69 |   summary.df.filename <-
 70 |     file.path(plot.data.dir,
 71 |               paste0(file_identifier, "_train_3_models_delta_kappa_summary_table.tsv"))
 72 | } else {
 73 |   test.df.filename <-
 74 |     file.path(plot.data.dir, paste0(file_identifier, "_train_3_models_kappa.tsv"))
 75 |   summary.df.filename <-
 76 |     file.path(plot.data.dir,
 77 |               paste0(file_identifier, "_train_3_models_kappa_summary_table.tsv"))
 78 | }
 79 | 
 80 | #### read in data --------------------------------------------------------------
 81 | 
 82 | # read in the tables that contain the kappa statistics for predictions on test
 83 | # data; each replicate is expected to contribute one array and one seq file
 84 | if (length(array.files) == 0 || length(array.files) != length(seq.files)) {
 85 |   stop("Expected equal, non-zero numbers of array and seq kappa files.")
 86 | }
 87 | # one data.frame per file, kept in file order
 88 | array.list <- lapply(array.files, data.table::fread, data.table = FALSE)
 89 | seq.list <- lapply(seq.files, data.table::fread, data.table = FALSE)
 90 | 
 91 | if (null_model) {
 92 |   # read in the null model tables, one data.frame per file for each platform
 93 |   null_array.list <- lapply(null_array.files, data.table::fread,
 94 |                             data.table = FALSE)
 95 |   null_seq.list <- lapply(null_seq.files, data.table::fread,
 96 |                           data.table = FALSE)
 97 |   
 98 |   # delta kappa = regular kappa - null kappa within each combination of
 99 |   # perc.seq, classifier, and norm.method; the .true columns that are kept
100 |   # come from the regular (first) table of the join
101 |   get_delta_kappa <- function(true.df, null.df) {
102 |     true.df %>%
103 |       left_join(null.df,
104 |                 by = c("perc.seq", "classifier", "norm.method"),
105 |                 suffix = c(".true", ".null")) %>%
106 |       mutate(delta_kappa = kappa.true - kappa.null) %>%
107 |       select(delta_kappa, auc.true, sensitivity.true, specificity.true,
108 |              perc.seq, classifier, norm.method)
109 |   }
110 |   
111 |   # regular and null tables are paired by position; [[ ]] raises an error if a
112 |   # null table is missing instead of silently recycling
113 |   delta_kappa_array.list <- lapply(seq_along(array.files), function(pair_index) {
114 |     get_delta_kappa(array.list[[pair_index]], null_array.list[[pair_index]])
115 |   })
116 |   delta_kappa_seq.list <- lapply(seq_along(array.files), function(pair_index) {
117 |     get_delta_kappa(seq.list[[pair_index]], null_seq.list[[pair_index]])
118 |   })
119 |   
120 | }
121 | 
122 | # combine the per-replicate tables from each platform into one table
123 | # (if/else, not ifelse(), whose result takes the shape of its scalar test)
124 | if (!null_model) {
125 |   array.df <- data.table::rbindlist(array.list) %>%
126 |     select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
127 |   seq.df <- data.table::rbindlist(seq.list) %>%
128 |     select(kappa, auc, sensitivity, specificity, perc.seq, classifier, norm.method)
129 | } else {
130 |   array.df <- data.table::rbindlist(delta_kappa_array.list)
131 |   seq.df <- data.table::rbindlist(delta_kappa_seq.list)
132 | }
133 | 
134 | #### save test set results -----------------------------------------------------
135 | 
136 | # stack array and seq results, then append a platform label for every row
137 | platform.labels <- rep(c("Microarray", "RNA-seq"),
138 |                        times = c(nrow(array.df), nrow(seq.df)))
139 | test.df <- cbind(rbind(array.df, seq.df), platform.labels)
140 | 
141 | # columns are renamed by position (kappa or delta kappa comes first)
142 | colnames(test.df) <- c("Kappa", "AUC", "Sensitivity", "Specificity",
143 |                        "Perc.Seq", "Classifier", "Normalization", "Platform")
144 | 
145 | # order %seq to display 0-100
146 | test.df$Perc.Seq <- factor(test.df$Perc.Seq, levels = seq(0, 100, by = 10))
147 | 
148 | # capitalize norm methods
149 | test.df$Normalization <- as.factor(toupper(test.df$Normalization))
150 | 
151 | # recode model types to their display names and store as a factor
152 | cls.recode.str <-
153 |   "'glmnet' = 'LASSO'; 'rf' = 'Random Forest'; 'svm' = 'Linear SVM'"
154 | test.df$Classifier <- as.factor(car::recode(test.df$Classifier,
155 |                                             recodes = cls.recode.str))
156 | 
157 | # delta or not delta in file name
158 | readr::write_tsv(test.df, test.df.filename)
159 | 
160 | # summarise each measure (median, mean, SD; NAs ignored) for every combination
161 | # of classifier, normalization method, platform, and %RNA-seq + write to file
162 | summary.df <- test.df %>%
163 |   dplyr::group_by(Classifier, Normalization, Platform, Perc.Seq) %>%
164 |   dplyr::summarise(
165 |     dplyr::across(
166 |       c(Kappa, AUC, Sensitivity, Specificity),
167 |       list(Median = ~ median(.x, na.rm = TRUE),
168 |            Mean = ~ mean(.x, na.rm = TRUE),
169 |            SD = ~ sd(.x, na.rm = TRUE)),
170 |       # names follow <statistic>_<measure>, e.g., Median_Kappa, SD_AUC
171 |       .names = "{.fn}_{.col}"
172 |     ),
173 |     .groups = "drop"
174 |   )
175 | 
176 | # delta or not delta in file name
177 | readr::write_tsv(summary.df,
178 |                  summary.df.filename)
179 | 


--------------------------------------------------------------------------------
/4-ica_pca_feature_reconstruction.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Aug 2016
  2 | # The purpose of this script is to perform unsupervised learning on TCGA train-
  3 | # ing data (output of 1-normalize_titrated_data.R), PCA or ICA,
  4 | # and to transform test data into the training data reduced dimensional space,
  5 | # and back out ('reconstruction') and to then calculate the 'reconstruction
  6 | # error' (MASE).
  7 | #
  8 | # It should be run from the command line.
  9 | # USAGE: Rscript 4-ica_pca_feature_reconstruction.R --cancer_type --predictor --n_components --seed --null_model
 10 | # n_components refers to the number of components (PC/IC) that should be used
 11 | # for reconstruction.
 12 | 
 13 | option_list <- list(
 14 |   optparse::make_option("--cancer_type",
 15 |                         default = NA_character_,
 16 |                         help = "Cancer type"),
 17 |   optparse::make_option("--predictor",
 18 |                         default = NA_character_,
 19 |                         help = "Predictor used"),
 20 |   optparse::make_option("--n_components",
 21 |                         default = 50,
 22 |                         help = "Number of components [default: %default]"),
 23 |   optparse::make_option("--seed",
 24 |                         default = 346,
 25 |                         help = "Random seed [default: %default]"),
 26 |   optparse::make_option("--null_model",
 27 |                         action = "store_true",
 28 |                         default = FALSE,
 29 |                         help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)")
 30 | )
 31 | 
 32 | opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
 33 | source(here::here("util", "option_functions.R"))
 34 | check_options(opt)
 35 | 
 36 | # load libraries and project functions
 37 | suppressMessages(source(here::here("load_packages.R")))
 38 | source(here::here("util", "train_test_functions.R"))
 39 | source(here::here("util", "ICA_PCA_reconstruction_functions.R"))
 40 | 
 41 | # set options
 42 | cancer_type <- opt$cancer_type
 43 | predictor <- opt$predictor
 44 | null_model <- opt$null_model
 45 | # null model files carry "null" after the predictor in their identifier
 46 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 47 | if (null_model) file_identifier <- str_c(file_identifier, "null", sep = "_")
 48 | n.comp <- as.integer(opt$n_components)
 49 | 
 50 | # set seed
 51 | initial.seed <- as.integer(opt$seed)
 52 | set.seed(initial.seed)
 53 | message(paste("\nInitial seed set to:", initial.seed))
 54 | 
 55 | # define directories
 56 | res.dir <- here::here("results")
 57 | norm.dir <- here::here("normalized_data")
 58 | mdl.dir <- here::here("models")
 59 | rcn.dir <- file.path(norm.dir, "reconstructed_data")
 60 | rcn.res.dir <- file.path(res.dir, "reconstructed_data")
 61 | 
 62 | # define input files: normalized training and test data for this identifier
 63 | lf <- list.files(norm.dir, full.names = TRUE)
 64 | train.files <- lf[grepl(paste0(file_identifier,
 65 |                                "_array_seq_train_titrate_normalized_list_"), lf)]
 66 | test.files <- lf[grepl(paste0(file_identifier,
 67 |                               "_array_seq_test_data_normalized_list_"), lf)]
 68 | 
 69 | # parse filename seeds: the 8th- to 5th-from-last characters of each training
 70 | # file path, i.e., the four characters before a 4-character extension;
 71 | # filename.seeds is parallel to train.files
 72 | filename.seeds <- stringr::str_sub(train.files, -8, -5)
 73 | 
 74 | # define output files (method, platform, and seed are appended in the main loop)
 75 | df.file.lead <- paste0(file_identifier,
 76 |                        "_reconstruction_error_", n.comp, "_components_")
 77 | mdl.file.lead <- paste0(file_identifier,
 78 |                         "_array_seq_train_", n.comp, "_components_object_")
 79 | rcn.file.lead <- paste0(file_identifier,
 80 |                         "_reconstructed_data_", n.comp, "_components_")
 81 | 
 82 | #### main ----------------------------------------------------------------------
 83 | platforms <- c("array", "seq")
 84 | #recon.methods <- c("ICA", "PCA")
 85 | recon.methods <- c("PCA") # July 2021 update to no longer run ICA
 86 | 
 87 | # pair training and test files on the parsed seed; a regex search of whole
 88 | # paths is ambiguous ("_123" also matches "..._1234.RDS" or a directory name)
 89 | test.seeds <- substr(test.files, nchar(test.files) - 7, nchar(test.files) - 4)
 90 | for (rep.count in seq_along(filename.seeds)) {
 91 |   seed <- filename.seeds[rep.count]
 92 |   message(paste("\n\n#### RECONSTRUCTION ROUND",
 93 |                 rep.count, "of", length(filename.seeds), "####\n\n"))
 94 | 
 95 |   #### read in data ####
 96 |   message("Reading in data...")
 97 |   train.rds <- train.files[rep.count] # filename.seeds is parallel to train.files
 98 |   test.rds <- test.files[test.seeds == seed]
 99 |   if (length(test.rds) != 1) {
100 |     stop("Expected exactly one test data file for seed: ", seed)
101 |   }
102 |   train.data <- RestructureNormList(readRDS(train.rds))
103 |   test.data <- readRDS(test.rds)
104 | 
105 |   # remove Seurat data from this analysis because it is already in reduced space
106 |   train.data$seurat <- NULL
107 |   # get rid of TDM and QN (CN) null values
108 |   train.data <- purrr::modify_depth(train.data,
109 |                                     1, # work on the first level lists
110 |                                     purrr::discard, is.null) # discard if null
111 | 
112 |   # for each method to be used for reconstruction
113 |   for (rcn in recon.methods) {
114 |     message(paste("  ", rcn, "on training set"))
115 |     # perform reconstruction method on the training data
116 |     train.comp.list <- TrainSetCompAnalysis(train.list = train.data,
117 |                                             num.comp = n.comp,
118 |                                             comp.method = rcn)
119 |     # write the component objects to file in the models directory
120 |     comp.rds.name <- paste0(mdl.file.lead, rcn, "_", seed, ".RDS")
121 |     saveRDS(train.comp.list, file = file.path(mdl.dir, comp.rds.name))
122 | 
123 |     # reconstruction on the holdout data
124 |     for (plt in platforms) { # for the two platforms -- microarray and RNA-seq
125 |       message(paste("\t Performing", plt, "reconstruction"))
126 |       if (plt == "seq" && is.null(train.comp.list$tdm$`0`)) {
127 |         # At the 0% RNA-seq level, TDM RNA-seq test data is transformed using the
128 |         # log-transformed 100% array data on the reference. So, use
129 |         # log-transformed 100% array data as the training set for evaluating the
130 |         # TDM method at 0% RNA-seq level.
131 |         train.comp.list$tdm$`0` <- train.comp.list$log$`0`
132 |         n.tdm <- length(train.comp.list$tdm) # the 0% element was appended last
133 |         train.comp.list$tdm <- train.comp.list$tdm[c(n.tdm, seq_len(n.tdm - 1))]
134 |       }
135 | 
136 |       # perform the reconstruction experiment, which will return reconstructed
137 |       # holdout out data in data.table format suitable for category prediction
138 |       # and calculate the reconstruction error (MASE) to be returned as a
139 |       # data.frame
140 |       results <- ReconstructionWrapper(train.list = train.comp.list,
141 |                                        test.list = test.data[[plt]],
142 |                                        num.comps = n.comp)
143 | 
144 |       # save recon objects
145 |       message("\t   Saving reconstructed holdout data")
146 |       recon.rds <- paste0(rcn.file.lead, rcn, "_", plt, "_", seed,".RDS")
147 |       saveRDS(results$recon, file = file.path(rcn.dir, recon.rds))
148 | 
149 |       # write error data.frame to file, with the platform as the last column
150 |       error.df <- results$mase.df
151 |       error.df <- cbind(error.df, rep(plt, nrow(error.df)))
152 |       colnames(error.df)[ncol(error.df)] <- "platform"
153 |       error.df.name <- paste0(df.file.lead, rcn, "_", plt, "_", seed, ".tsv")
154 |       message("\t   Saving MASE data.frame")
155 |       write.table(error.df, file = file.path(rcn.res.dir, error.df.name),
156 |                   quote = FALSE, row.names = FALSE, sep = "\t")
157 | 
158 |       rm(results, error.df)
159 |       gc()
160 |     }
161 |   }
162 | 
163 |   rm(train.data, test.data)
164 |   gc()
165 | }
166 | 


--------------------------------------------------------------------------------
/5-predict_category_reconstructed_data.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Oct 2016
  2 | # The purpose of this script is to perform category prediction
  3 | # (from 2-train_test_category.R) on test/holdout data that has been
  4 | # reconstructed using the components from PCA on training data (the
  5 | # output of 4-ica_pca_feature_reconstruction.R). It outputs a list of
  6 | # confusionMatrix objects and a data.frame of Kappa statistics from these
  7 | # predictions.
  8 | # It should be run from the command line.
  9 | # USAGE: Rscript 5-predict_category_reconstructed_data.R --cancer_type --predictor --null_model
 10 | 
 11 | option_list <- list(
 12 |   optparse::make_option(
 13 |     "--cancer_type", default = NA_character_, help = "Cancer type"
 14 |   ),
 15 |   optparse::make_option(
 16 |     "--predictor", default = NA_character_, help = "Predictor used"
 17 |   ),
 18 |   optparse::make_option(
 19 |     "--null_model", action = "store_true", default = FALSE,
 20 |     help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"
 21 |   )
 22 | )
 23 | 
 24 | option_parser <- optparse::OptionParser(option_list = option_list)
 25 | opt <- optparse::parse_args(option_parser)
 26 | source(here::here("util", "option_functions.R"))
 27 | check_options(opt)
 28 | 
 29 | # load libraries and project functions
 30 | suppressMessages(source(here::here("load_packages.R")))
 31 | source(here::here("util", "train_test_functions.R"))
 32 | 
 33 | # set options
 34 | cancer_type <- opt$cancer_type
 35 | predictor <- opt$predictor
 36 | null_model <- opt$null_model
 37 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 38 | if (null_model) file_identifier <- str_c(file_identifier, "null", sep = "_")
 39 | 
 40 | # define directories
 41 | res.dir <- here::here("results")
 42 | norm.dir <- here::here("normalized_data")
 43 | mdl.dir <- here::here("models")
 44 | rcn.res.dir <- file.path(res.dir, "reconstructed_data")
 45 | rcn.dir <- file.path(norm.dir, "reconstructed_data")
 46 | 
 47 | # define input files: supervised models and reconstructed holdout data
 48 | supervised.model.files <- list.files(
 49 |   mdl.dir, full.names = TRUE,
 50 |   pattern = paste0(file_identifier, "_train_3_models")
 51 | )
 52 | recon.files <- list.files(
 53 |   rcn.dir, full.names = TRUE,
 54 |   pattern = paste0(file_identifier, "_reconstructed_data_")
 55 | )
 56 | 
 57 | # get filename.seeds (identifiers for each replicate) from the reconstructed
 58 | # data files: the 8th- to 5th-from-last characters of each path, de-duplicated
 59 | # because several files share a seed
 60 | filename.seeds <-
 61 |   unique(stringr::str_sub(recon.files, -8, -5))
 62 | 
 63 | # define output files (the seed and extension are appended in the main loop)
 64 | cm.file.lead <- paste0(file_identifier,
 65 |                        "_prediction_reconstructed_data_confusionMatrices_")
 66 | kap.file.lead <- paste0(file_identifier,
 67 |                         "_prediction_reconstructed_data_kappa_")
 68 | 
 69 | #### main ----------------------------------------------------------------------
 70 | 
 71 | platforms <- c("array", "seq")
 72 | recon.methods <- c("PCA")
 73 | 
 74 | # seed parsed from each model file path, the same way filename.seeds is parsed;
 75 | # assumes model file names end in <seed> plus a 4-character extension -- confirm
 76 | model.seeds <- stringr::str_sub(supervised.model.files, -8, -5)
 77 | 
 78 | for (rep.count in seq_along(filename.seeds)) {
 79 |   seed <- filename.seeds[rep.count]
 80 | 
 81 |   # error-handling -- want to make sure there is exactly one supervised model
 82 |   # file that corresponds to the current reconstructed data file (seed)
 83 |   train.rds <- supervised.model.files[model.seeds == seed]
 84 |   if (length(train.rds) != 1) {
 85 |     stop(paste0("Expected exactly one supervised model file for filename.seed: ",
 86 |                 seed, " (found ", length(train.rds), ")"))
 87 |   }
 88 | 
 89 |   message(paste("\n\n#### CATEGORY PREDICTION",
 90 |                 rep.count, "of", length(filename.seeds), "####\n\n"))
 91 | 
 92 |   # read in supervised models (LASSO, linear SVM, random forest)
 93 |   train.list <- readRDS(train.rds)
 94 |   # remove Seurat data from this analysis because it is already in reduced space
 95 |   train.list$seurat <- NULL
 96 | 
 97 |   # need to read in corresponding sample.df
 98 |   sample.df.file <-
 99 |     file.path(res.dir,
100 |               paste0(file_identifier,
101 |                      "_matchedSamples_training_testing_split_labels_",
102 |                      seed, ".tsv"))
103 |   sample.df <- data.table::fread(sample.df.file, data.table = FALSE)
104 |   sample.df$category <- as.factor(sample.df$category)
105 | 
106 |   # initialize list to hold confusionMatrices & kappa statistics
107 |   kappa.list <- list()
108 |   cm.list <- list()
109 | 
110 |   for (plt in platforms) {
111 |     plt.list <- list()
112 |     plt.kap.list <- list()
113 |     for (rcn in recon.methods) {
114 |       # read in reconstructed data from current platform and reconstruction
115 |       # method; the identifier is matched literally, not as a regex
116 |       file.identifier <- paste(rcn, plt, seed, sep = "_")
117 |       recon.rds <- recon.files[grepl(file.identifier, recon.files, fixed = TRUE)]
118 |       recon.list <- readRDS(recon.rds)
119 | 
120 |       # return list of confusion matrix objects AND kappa statistics
121 |       cm_kappa.list <- PredictWrapper(train.model.list = train.list,
122 |                                       pred.list = recon.list,
123 |                                       sample.df = sample.df,
124 |                                       only.kap = FALSE,
125 |                                       run.parallel = FALSE)
126 | 
127 |       # get confusionMatrix objects
128 |       plt.list[[rcn]] <- cm_kappa.list$confusion_matrix_objects
129 |       # get kappa statistics
130 |       plt.kap.list[[rcn]] <- cm_kappa.list$kappa_statistics
131 | 
132 |       # remove reconstructed data and cm_kappa_list
133 |       rm(recon.list, cm_kappa.list)
134 |       gc()
135 | 
136 |     }
137 | 
138 |     cm.list[[plt]] <- plt.list
139 |     kappa.list[[plt]] <- plt.kap.list
140 | 
141 |   }
142 | 
143 |   # save confusion matrices
144 |   cm.file.name <- file.path(rcn.res.dir, paste0(cm.file.lead, seed, ".RDS"))
145 |   saveRDS(cm.list, file = cm.file.name)
146 | 
147 |   # get kappa stats into data.frame from nested list and save as data.frame
148 |   kappa.df <- reshape2::melt(kappa.list)
149 |   colnames(kappa.df) <- c("Perc.seq", "Classifier", "Normalization",
150 |                           "Measure", "Kappa", "Reconstruction", "Platform")
151 |   kap.file.name <- file.path(rcn.res.dir, paste0(kap.file.lead, seed, ".tsv"))
152 |   write.table(kappa.df, file = kap.file.name, row.names = FALSE, quote = FALSE,
153 |               sep = "\t")
154 | 
155 |   rm(train.list, kappa.list, cm.list)
156 |   gc()
157 | 
158 | }
159 | 


--------------------------------------------------------------------------------
/6-save_recon_error_kappa_data.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Oct 2016
  2 | # This script gathers the reconstruction errors (MASE) from
  3 | # 4-ica_pca_feature_reconstruction.R and the Kappa statistics associated with
  4 | # predictions on reconstructed data from 5-predict_category_reconstructed_data.R
  5 | # and saves them as tables for downstream plotting.
  6 | # USAGE: Rscript 6-save_recon_error_kappa_data.R --cancer_type --predictor --null_model
  7 | 
  8 | option_list <- list(
  9 |   optparse::make_option("--cancer_type",
 10 |                         default = NA_character_,
 11 |                         help = "Cancer type"),
 12 |   optparse::make_option("--predictor",
 13 |                         default = NA_character_,
 14 |                         help = "Predictor used"),
 15 |   optparse::make_option("--null_model",
 16 |                         action = "store_true",
 17 |                         default = FALSE,
 18 |                         help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)")
 19 | )
 20 | 
 21 | opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
 22 | source(here::here("util", "option_functions.R"))
 23 | check_options(opt)
 24 | 
 25 | # load libraries
 26 | suppressMessages(library(tidyverse))
 27 | source(here::here("util", "color_blind_friendly_palette.R"))
 28 | 
 29 | # set options
 30 | cancer_type <- opt$cancer_type
 31 | predictor <- opt$predictor
 32 | null_model <- opt$null_model
 33 | # null model files carry "null" after the predictor in their identifier
 34 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 35 | if (null_model) file_identifier <- str_c(file_identifier, "null", sep = "_")
 36 | 
 37 | # define directories
 38 | plot.dir <- here::here("plots")
 39 | plot.data.dir <- file.path(plot.dir, "data")
 40 | rcn.res.dir <- here::here("results", "reconstructed_data")
 41 | 
 42 | # define input files
 43 | # kappa files end in kappa_<seed>.tsv; the escaped dot and $ anchor keep other
 44 | # outputs out, and _+ allows seeds that may be parsed with a leading underscore
 45 | kappa.df.files <- list.files(rcn.res.dir,
 46 |                              pattern = paste0(file_identifier,
 47 |                                               "_prediction_reconstructed_data_kappa_+[0-9]+\\.tsv$"),
 48 |                              full.names = TRUE)
 49 | error.files <- list.files(rcn.res.dir,
 50 |                           pattern = paste0(file_identifier,
 51 |                                            "_reconstruction_error"),
 52 |                           full.names = TRUE)
 53 | 
 54 | # define output files
 55 | kap.plot.file.lead <- file.path(plot.dir, paste0(file_identifier, "_kappa_reconstructed_data_"))
 56 | err.plot.file.lead <- file.path(plot.dir, paste0(file_identifier, "_reconstruction_error_"))
 57 | kap.plot.data.file <- file.path(plot.data.dir, paste0(file_identifier, "_kappa_reconstructed_data.tsv"))
 58 | err.plot.data.file <- file.path(plot.data.dir, paste0(file_identifier, "_reconstruction_error.tsv"))
 59 | 
 60 | #### save kappa stats ----------------------------------------------------------
 61 | 
 62 | # read in the kappa data.frame from each replicate (data.table = FALSE returns
 63 | # plain data.frames) and bind them together
 64 | kappa.df.list <- lapply(kappa.df.files, data.table::fread, data.table = FALSE)
 65 | kappa.master.df <- as.data.frame(data.table::rbindlist(kappa.df.list))
 66 | rm(kappa.df.list)
 67 | 
 68 | # recode strings (car::recode syntax) for classifier and platform display names
 69 | cls.recode.str <-
 70 |   "'glmnet' = 'LASSO'; 'rf' = 'Random Forest'; 'svm' = 'Linear SVM'"
 71 | plt.recode.str <-
 72 |   "'array' = 'Microarray'; 'seq' = 'RNA-seq'"
 73 | 
 74 | # order Perc.seq levels 0-100
 75 | kappa.master.df$Perc.seq <- factor(kappa.master.df$Perc.seq,
 76 |                                    levels = seq(0, 100, by = 10))
 77 | 
 78 | # rename classifiers and store as a factor
 79 | kappa.master.df$Classifier <-
 80 |   as.factor(car::recode(kappa.master.df$Classifier,
 81 |                         recodes = cls.recode.str))
 82 | 
 83 | # upper case norm methods; norm and reconstruction methods as factors
 84 | kappa.master.df$Normalization <-
 85 |   as.factor(stringr::str_to_upper(kappa.master.df$Normalization))
 86 | kappa.master.df$Reconstruction <- as.factor(kappa.master.df$Reconstruction)
 87 | 
 88 | # rename platforms and store as a factor
 89 | kappa.master.df$Platform <-
 90 |   as.factor(car::recode(kappa.master.df$Platform,
 91 |                         recodes = plt.recode.str))
 92 | 
 93 | # write the replicate-level table to the plot data directory
 94 | write.table(kappa.master.df,
 95 |             file = kap.plot.data.file,
 96 |             quote = FALSE, sep = "\t", row.names = FALSE)
 97 | 
 98 | # summarise Kappa for each classifier, normalization, platform, and %RNA-seq
 99 | # and write the summary table to the reconstructed data results directory
100 | kappa.summary.file <-
101 |   file.path(rcn.res.dir,
102 |             paste0(file_identifier,
103 |                    "_kappa_reconstructed_data_summary_table.tsv"))
104 | 
105 | kappa.summary.df <-
106 |   kappa.master.df %>%
107 |   dplyr::group_by(Classifier, Normalization, Platform, Perc.seq) %>%
108 |   dplyr::summarise(Median = median(Kappa),
109 |                    Mean = mean(Kappa),
110 |                    SD = sd(Kappa),
111 |                    .groups = "drop")
112 | readr::write_tsv(kappa.summary.df, kappa.summary.file)
113 | 
114 | # the bound kappa data.frame is not used below
115 | rm(kappa.master.df)
116 | 
117 | #### save error measures -------------------------------------------------------
118 | 
119 | # read in error measure data.frames from each replicate and bind
120 | error.df.list <- lapply(error.files, data.table::fread, data.table = FALSE)
121 | error.master.df <- as.data.frame(data.table::rbindlist(error.df.list))
122 | rm(error.df.list)
123 | 
124 | # order perc.seq levels 0-100
125 | error.master.df$perc.seq <- factor(error.master.df$perc.seq,
126 |                                    levels = seq(0, 100, by = 10))
127 | 
128 | # upper case norm methods, stored as a factor
129 | error.master.df$norm.method <-
130 |   as.factor(stringr::str_to_upper(error.master.df$norm.method))
131 | 
132 | # rename platforms -- same recode string as for the kappa data.frame
133 | error.master.df$platform <-
134 |   as.factor(car::recode(error.master.df$platform,
135 |                         recodes = plt.recode.str))
136 | 
137 | # reconstruction method as factor
138 | error.master.df$comp.method <- as.factor(error.master.df$comp.method)
139 | 
140 | # take the average of each gene's error across replicates
141 | error.mean.df <- error.master.df %>%
142 |   dplyr::group_by(gene, perc.seq, norm.method, comp.method, platform) %>%
143 |   dplyr::summarise(mean_mase = mean(MASE),
144 |                    .groups = "drop")
145 | rm(error.master.df)
146 | 
147 | # rename columns by position: the five grouping columns, then the mean MASE
148 | names(error.mean.df) <- c("Gene", "Perc.seq", "Normalization",
149 |                           "Method", "Platform", "Mean_Value")
150 | 
151 | # write the per-gene mean reconstruction error to the plot data directory
152 | # as a tab-separated table
153 | write.table(error.mean.df,
154 |             file = err.plot.data.file,
155 |             quote = FALSE, sep = "\t", row.names = FALSE)
156 | 


--------------------------------------------------------------------------------
/8-PLIER_pathways_analysis.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "PLIER pathways analysis"
  3 | output: html_notebook
  4 | author: "Steven Foltz"
  5 | date: "December 2022"
  6 | ---
  7 | 
  8 | ```{r}
  9 | meaningful_difference <- 0.2 # minimum full - half sample size difference in the proportion of runs detecting a pathway
 10 | n_repeats <- 10 # number of PLIER runs; denominator for the proportion of runs
 11 | ```
 12 | 
 13 | ### What additional pathways are identified by PLIER after doubling the sample size?
 14 | 
 15 | Here we look for oncogenic pathways (defined by [GSEA MSigDB C6](https://www.gsea-msigdb.org/gsea/msigdb/human/genesets.jsp?collection=C6)) that are found more frequently in the data with the full sample size than in data with half the sample size.
 16 | By focusing on pathways that are reliably detected more often in the full sample size data, we can identify stable patterns that emerge when more data is available.
 17 | We ran PLIER `r n_repeats` times using each combination of data normalization method and sample size (half or full).
 18 | Out of those `r n_repeats` runs, we found the proportion of times each oncogenic pathway was significantly associated with a latent variable.
 19 | We set an arbitrary "meaningful difference" threshold of `r meaningful_difference` to detect when that proportion was meaningfully greater using full sample size compared to half sample size, and we require both array and RNA-seq data to satisfy the condition.
 20 | We also arbitrarily limit results to those pathways found in over half of runs using the full size data.
 21 | 
 22 | ### Load packages and pathways
 23 | 
 24 | ```{r}
 25 | library(tidyverse)
 26 | data("oncogenicPathways", package = "PLIER")
 27 | ```
 28 | 
 29 | 
 30 | ### Set input file names
 31 | ```{r}
 32 | plots_data_dir <- here::here("plots", "data") # directory with the pathway tables
 33 | brca_filename <- file.path(plots_data_dir, "BRCA_subtype_PLIER_pathways.tsv") # BRCA input
 34 | gbm_filename <- file.path(plots_data_dir, "GBM_subtype_PLIER_pathways.tsv") # GBM input
 35 | ```
 36 | 
 37 | ### Read in data for each cancer type
 38 | ```{r}
 39 | # stack the pathway tables of the cancer types whose files exist
 40 | pathways_df <- NULL
 41 | if (file.exists(brca_filename)) {
 42 |   brca_pathways_df <- read_tsv(brca_filename) %>%
 43 |     mutate(cancer_type = "BRCA")
 44 |   pathways_df <- bind_rows(pathways_df,
 45 |                            brca_pathways_df)
 46 | } else {
 47 |   message(str_c("BRCA file ", brca_filename, " does not exist."))
 48 | }
 49 | 
 50 | if (file.exists(gbm_filename)) {
 51 |   gbm_pathways_df <- read_tsv(gbm_filename) %>%
 52 |     mutate(cancer_type = "GBM")
 53 |   pathways_df <- bind_rows(pathways_df,
 54 |                            gbm_pathways_df)
 55 | } else {
 56 |   message(str_c("GBM file ", gbm_filename, " does not exist."))
 57 | }
 58 | 
 59 | # nothing to analyze without at least one file (&& because both tests are scalar)
 60 | if (!file.exists(brca_filename) && !file.exists(gbm_filename)) {
 61 |   stop(str_c("Neither BRCA file ", brca_filename, " nor GBM file ", gbm_filename,
 62 |              " exists."))
 63 | }
 64 | 
 65 | ```
 66 | 
 67 | ### Filter data to identify oncogenic pathways detected more frequently in full data
 68 | ```{r}
 69 | 
 70 | pathways_df %>%
 71 |   # keep significant LV associations (FDR < 0.05) with oncogenic pathways
 72 |   filter(FDR < 0.05, pathway %in% colnames(oncogenicPathways)) %>%
 73 |   # for each normalization method, %RNA-seq, pathway, and cancer type, count
 74 |   # the repeats in which the pathway was significantly associated with at
 75 |   # least one latent variable and express that as a proportion of all repeats
 76 |   group_by(nmeth, pseq, pathway, cancer_type) %>%
 77 |   summarize(n_seeds = n_distinct(seed_index),
 78 |             prop_seeds = n_seeds / n_repeats,
 79 |             .groups = "drop") %>%
 80 |   mutate(
 81 |     # clean up normalization method strings: drop parentheses, then turn
 82 |     # spaces and hyphens into underscores
 83 |     nmeth = str_remove_all(nmeth, "[\\(\\)]"),
 84 |     nmeth = str_replace_all(nmeth, c(" " = "_", "-" = "_")),
 85 |     # label combining normalization method and %RNA-seq
 86 |     nmeth_pseq = str_c(nmeth, pseq, sep = "_")
 87 |   ) %>%
 88 |   select(cancer_type, pathway, prop_seeds, nmeth_pseq) %>%
 89 |   # one row per pathway and cancer type, one column per nmeth_pseq label
 90 |   pivot_wider(id_cols = c("cancer_type", "pathway"),
 91 |               names_from = nmeth_pseq,
 92 |               values_from = prop_seeds) %>%
 93 |   # replace NAs with 0s in the numeric (proportion) columns
 94 |   mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
 95 |   # reduce to "meaningful" results: the full vs. half sample size difference
 96 |   # must be meaningful for array and for seq, and the pathway must be found in
 97 |   # over half of the full data runs (array only, RNA-seq only, NPN 50%/50%)
 98 |   filter(log_0 - array_only_50 >= meaningful_difference,
 99 |          log_100 - seq_only_50 >= meaningful_difference,
100 |          log_0 > 0.5, log_100 > 0.5, npn_50 > 0.5) %>%
101 |   select(cancer_type, pathway, array_only_50, seq_only_50, log_0, log_100, npn_50) %>%
102 |   arrange(cancer_type) %>%
103 |   knitr::kable()
104 | ```
105 | 
106 | | variable_name | meaning |
107 | | --- | --- |
108 | | `array_only_50` | LOG transformed array data (half sample size) |
109 | | `seq_only_50` | LOG transformed RNA-seq data (half sample size) |
110 | | `log_0` | LOG transformed array data (full sample size) |
111 | | `log_100` | LOG transformed RNA-seq data (full sample size) |
112 | | `npn_50` | NPN transformed data, 50% array and 50% RNA-seq (full sample size) |
113 | 
114 | ### Citation:
115 | Mao W, Zaslavsky E, Hartmann BM, Sealfon SC, Chikina M. Pathway-level information extractor (PLIER) for gene expression data. Nat Methods. 2019;16: 607–610.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2016 Trustees of the University of Pennsylvania. 
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without 
 5 | modification, are permitted provided that the following conditions 
 6 | are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright 
 9 | notice, this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright 
12 | notice, this list of conditions and the following disclaimer in the 
13 | documentation and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its 
16 | contributors may be used to endorse or promote products derived 
17 | from this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
22 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
23 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
24 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
25 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
26 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
28 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
29 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
30 | POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Cross-platform normalization enables machine learning model training on microarray and RNA-seq data simultaneously
  2 | 
  3 | **Published article:** Foltz, S. M., Greene, C. S. & Taroni, J. N. Cross-platform normalization enables machine learning model training on microarray and RNA-seq data simultaneously. *Commun Biol* 6, 222 (2023). https://doi.org/10.1038/s42003-023-04588-6
  4 | 
  5 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  6 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  7 | **Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
  8 | 
  9 | - [Summary](#summary)
 10 | - [Requirements](#requirements)
 11 |   - [Obtaining and running the Docker container](#obtaining-and-running-the-docker-container)
 12 | - [Download data from The Cancer Genome Atlas (TCGA)](#download-data-from-the-cancer-genome-atlas-tcga)
 13 | - [Recreate manuscript results](#recreate-manuscript-results)
 14 | - [Methods](#methods)
 15 |   - [Machine Learning Pipeline](#machine-learning-pipeline)
 16 | - [Running individual experiments](#running-individual-experiments)
 17 |     - [Machine learning](#machine-learning)
 18 |     - [Other scripts](#other-scripts)
 19 | - [Manuscript versions](#manuscript-versions)
 20 | - [Funding](#funding)
 21 | 
 22 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 23 | 
 24 | ## Summary
 25 | 
 26 | We performed a series of supervised and unsupervised machine learning 
 27 | evaluations, as well as differential expression and pathway analyses, to assess which 
 28 | normalization methods are best suited for combining data from microarray and 
 29 | RNA-seq platforms. 
 30 | 
 31 | We evaluated seven normalization approaches for all methods: 
 32 | 
 33 | 1. log-transformation (LOG) 
 34 | 2. [non-paranormal transformation](https://arxiv.org/abs/0903.0649) (NPN)
 35 | 3. [quantile normalization](http://bmbolstad.com/misc/normalize/bolstad_norm_paper.pdf) (QN)
 36 | 4. [quantile normalization via CrossNorm](https://www.nature.com/articles/srep18898)
 37 | 5. quantile normalization followed by z-scoring (QN-Z)
 38 | 6. [Training Distribution Matching](https://peerj.com/articles/1621/) (TDM)
 39 | 7. z-scoring (Z)
 40 | 
 41 | We also explored the use of [Seurat](https://satijalab.org/seurat/) to normalize array and RNA-seq data.
 42 | Due to low sample numbers at the edges of our titration protocol, many experimental conditions could not be integrated.
 43 | 
 44 | ## Requirements
 45 | 
 46 | We recommend using the docker image `envest/rnaseq_titration_results:R-4.1.2` to handle package and dependency installation.
 47 | See `docker/R-4.1.2/Dockerfile` for more information.
 48 | 
 49 | Our analysis ([v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3)) was run using 7 cores on an AWS instance with 16 cores, 128 GB memory, and an allocated 1 TB of space.
 50 | 
 51 | ### Obtaining and running the Docker container
 52 | 
 53 | Pull the docker image using:
 54 | 
 55 | ```
 56 | docker pull envest/rnaseq_titration_results:R-4.1.2
 57 | ```
 58 | 
 59 | Then run the command to start up a container, replacing `[PASSWORD]` with your own password:
 60 | 
 61 | ```
 62 | docker run --mount type=bind,target=/home/rstudio,source=$PWD -e PASSWORD=[PASSWORD] -p 8787:8787 envest/rnaseq_titration_results:R-4.1.2
 63 | ```
 64 | 
 65 | Navigate to <http://localhost:8787/> and login to the RStudio server with the username `rstudio` and the password you set above.
 66 | 
 67 | 
 68 | ## Download data from The Cancer Genome Atlas (TCGA)
 69 | 
 70 | TCGA data from 520 breast cancer (BRCA) patients used for these analyses
 71 | is [available at zenodo](https://zenodo.org/record/58862).
 72 | 
 73 | Data from 150 glioblastoma (GBM) patients is available from the [Genomic Data Commons PanCan Atlas](https://gdc.cancer.gov/about-data/publications/pancanatlas).
 74 | 
 75 | To download data, run the data download script in the top directory:
 76 | 
 77 | ```
 78 | bash download_TCGA_data.sh
 79 | ```
 80 | 
 81 | ## Recreate manuscript results
 82 | 
 83 | After data has been downloaded, running
 84 | 
 85 | ```
 86 | bash run_all_analyses_and_plots.sh [cancer type]
 87 | ```
 88 | where 
 89 | 
 90 | - `[cancer type]` is `both`, `BRCA` or `GBM`
 91 | 
 92 | with [v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3) of this repository will reproduce the results presented in our manuscript.
 93 | We recommend running all analyses within the project Docker container.
 94 | 
 95 | ## Methods
 96 | 
 97 | ### Machine Learning Pipeline
 98 | 
 99 | Here's a schematic overview of our machine learning experiments:
100 | 
101 | ![](diagrams/RNA-seq_titration_ML_overview.png)
102 | 
103 | **Overview of supervised and unsupervised machine learning experiments.** 
104 | 
105 | 1. Matched samples run on both microarray and RNA-seq were split into a training (2/3) and holdout set (1/3).
106 | 2. RNA-seq samples were "titrated" into the training set, 10% at a time (0-100%), replacing their matched array samples, resulting in eleven training sets for each normalization method. 
107 | 3. Machine learning applications:
108 | 
109 |   - _Supervised learning_: 
110 | We trained three classifiers — LASSO, linear SVM, and Random Forest — on each training set and tested them on the microarray and RNA-seq holdout sets.
111 | The models were trained to predict tumor subtype (both cancer types have 5 subtypes) and the binary mutation status of _TP53_ and _PIK3CA_.
112 | 
113 |   - _Unsupervised learning_: 
114 | We projected holdout sets onto and back out of the training set space using Principal Components Analysis to obtain reconstructed holdout sets.
115 | We then used the trained subtype classifiers to predict on the reconstructed holdout sets.
116 | [PLIER](https://github.com/wgmao/PLIER) (Pathway-Level Information ExtractoR) identified coordinated sets of genes in each cancer type.
117 | 
118 | ## Running individual experiments
119 | 
120 | #### Machine learning
121 | 
122 | To run the machine learning pipeline, run in the top directory:
123 | 
124 | ```
125 | bash run_machine_learning_experiments.sh [cancer type] [prediction task] [n cores]
126 | ```
127 | 
128 | where 
129 | 
130 | - `[cancer type]` is `BRCA` or `GBM`
131 | - `[prediction task]` is `subtype`, `TP53`, or `PIK3CA`
132 | - `[n cores]` is the number of cores you want to run in parallel
133 | 
134 | #### Other scripts
135 | 
136 | To search for the number of publicly available microarray and RNA-seq samples from [GEO](https://www.ncbi.nlm.nih.gov/geo/) and [ArrayExpress](https://www.ebi.ac.uk/arrayexpress/), run
137 | 
138 | ```
139 | python3 search_geo_arrayexpress.py
140 | ```
141 | and check the output in `results/array_rnaseq_ratio`.
142 | 
143 | To compare PLIER pathways that are more frequently identified using the full sample size data compared to half sample size data, run
144 | ```
145 | Rscript -e "rmarkdown::render('8-PLIER_pathways_analysis.Rmd', clean = TRUE)"
146 | ```
147 | and examine the results in `8-PLIER_pathways_analysis.nb.html`.
148 | 
149 | ## Manuscript versions
150 | 
151 | | Version | Relevant links |
152 | | :------ | :------------- |
153 | | [v2.3](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.3) | [Published article](https://doi.org/10.1038/s42003-023-04588-6), [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v5), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v5) |
154 | | [v2.2](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.2) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v3), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v3) |
155 | | [v2.1](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.1) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v2), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v2) |
156 | | [v2.0](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v2.0) | [Figshare+ data](https://doi.org/10.25452/figshare.plus.19629864.v1), [Data for plots](https://doi.org/10.6084/m9.figshare.19686453.v1) |
157 | | [v1.1](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v1.1) |  [Figshare full results](https://doi.org/10.6084/m9.figshare.5035997.v2) |
158 | | [v1.0](https://github.com/greenelab/RNAseq_titration_results/releases/tag/v1.0) | [Pre-print](https://doi.org/10.1101/118349) |
159 | 
160 | ## Funding
161 | 
162 | This work was supported by the Gordon and Betty Moore Foundation [GBMF 4552], Alex's Lemonade Stand Foundation [GR-000002471], and the National Institutes of Health [T32-AR007442, U01-TR001263, R01-CA237170, K12GM081259].
163 | 
164 | ## FAQ
165 | 
166 | **Can I normalize array data to match RNA-seq data?**
167 | 
168 | *We generally do not advise this study design. We expect array data to have less precision at higher expression levels due to saturation, while counts-based RNA-seq data does not have that problem. We recommend reshaping the data expected to have more dynamic range (RNA-seq) to fit the narrower and less precise (array) distribution. See also [TDM FAQs](https://github.com/greenelab/TDM#faq).*
169 | 


--------------------------------------------------------------------------------
/brca_data_urls.txt:
--------------------------------------------------------------------------------
1 | https://zenodo.org/record/58862/files/BRCAarray.pcl
2 | https://zenodo.org/record/58862/files/BRCAClin.tsv
3 | https://zenodo.org/record/58862/files/BRCARNASeq.pcl
4 | https://zenodo.org/record/58862/files/BRCARNASeqClin.tsv
5 | 


--------------------------------------------------------------------------------
/check_installs.R:
--------------------------------------------------------------------------------
 1 | # Check that every R package listed below is installed and can be loaded.
 2 | # require() returns FALSE (with a warning) instead of erroring when a package
 3 | # cannot be loaded, so a plain sequence of require() calls finishes
 4 | # "successfully" even if packages are missing. Collect the results instead and
 5 | # fail at the end, so that all problem packages are reported in a single run.
 6 | packages_to_check <- c(
 7 |   "ape", "binr", "caret", "cluster", "corrplot", "cowplot", "data.table",
 8 |   "devtools", "doParallel", "e1071", "fastICA", "flexclust", "fpc", "gdata",
 9 |   "glmnet", "gridExtra", "Hmisc", "huge", "kernlab", "limma", "parallel",
10 |   "plyr", "preprocessCore", "quantro", "ranger", "reshape2", "scales",
11 |   "sdcMicro", "TDM", "tidyverse"
12 | )
13 | 
14 | # attach the packages in the listed order, recording which ones loaded
15 | package_loaded <- vapply(packages_to_check, require, logical(1),
16 |                          character.only = TRUE)
17 | 
18 | if (!all(package_loaded)) {
19 |   stop("The following packages could not be loaded: ",
20 |        paste(packages_to_check[!package_loaded], collapse = ", "),
21 |        call. = FALSE)
22 | }
23 | 


--------------------------------------------------------------------------------
/check_sums.tsv:
--------------------------------------------------------------------------------
 1 | 2f4f2fcd97eff5385c0b1205b719b8dc  data/BRCAClin.tsv
 2 | 7f00ea6ef1f309773b02e6118046550f  data/BRCARNASeq.pcl
 3 | d4486dde14da14b4f8887a7415e2866f  data/BRCARNASeqClin.tsv
 4 | 1a89ea769381e300e5a88ec61713ad9e  data/BRCAarray.pcl
 5 | 02e72c33071307ff6570621480d3c90b  data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv
 6 | ce911443e071c4251fb3f196780e76de  data/GBMClin.tsv
 7 | 949938abadd336eb9a2f698b3102e1bb  data/GBMRNASeq.pcl
 8 | 3bbdad11c322ebf3b03ada07263c6444  data/GBMarray.pcl
 9 | e5df57691b44c47b8c916116b5ac7acf  data/PanCan-General_Open_GDC-Manifest_2.txt
10 | a4591b2dcee39591f59e5e25a6ce75fa  data/TCGA-CDR-SupplementalTableS1.xlsx
11 | 7fafc537807d5b3ddf0bb89665279a9d  data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg
12 | 7c5f8a12d6ca986e5ebba93281360517  data/combined_clinical_data.BRCA.tsv
13 | 94621b5396bd5d69eb36b6c5503dec97  data/combined_clinical_data.GBM.tsv
14 | 1d8834a51282396e07e3ce9a5417d024  data/gbm_clinical_table_S7.xlsx
15 | 639ad8f8386e98dacc22e439188aa8fa  data/mc3.v0.2.8.PUBLIC.maf.gz
16 | 7583a5fb4d23d50b79813b26469f6385  data/mutations.BRCA.tsv
17 | 15cae05325c1b0562be8029efba5534a  data/mutations.GBM.tsv
18 | 5484229fa691a721dd7fd08ade2233e7  data/mutations.maf
19 | e56585bd0c2e59658b1d54fc8b0c9df2  data/mutations.tsv
20 | b62634d9eccbb548499ce384605fe47a  data/GSE83130/LICENSE.TXT
21 | 9ed2fa92d31d51f17fc048b98158a5e1  data/GSE83130/README.md
22 | 76a0454f911aeb17276725abb760ce89  data/GSE83130/GSE83130/GSE83130.tsv
23 | dca310d9643a18d35e694425c56b9d2b  data/GSE83130/GSE83130/metadata_GSE83130.tsv
24 | 


--------------------------------------------------------------------------------
/classifier_repeat_wrapper.R:
--------------------------------------------------------------------------------
 1 | # J. Taroni Jul 2016
 2 | # This script is a wrapper for running the prediction pipeline
 3 | # (run_experiments.R) repeatedly with different random seeds and then
 4 | # combining the results of all repeats (3-combine_category_kappa.R).
 5 | # A failed repeat or combine step stops the wrapper with an error instead of
 6 | # being silently ignored.
 7 | # It should be run from the command line.
 8 | # USAGE: Rscript classifier_repeat_wrapper.R --cancer_type [BRCA|GBM] --predictor [subtype|TP53|PIK3CA] --n_repeats (default: 10) --null_model --ncores
 9 | 
10 | option_list <- list(
11 |   optparse::make_option("--cancer_type",
12 |                         default = NA_character_,
13 |                         help = "Cancer type"),
14 |   optparse::make_option("--predictor",
15 |                         default = NA_character_,
16 |                         help = "Predictor used"),
17 |   optparse::make_option("--n_repeats",
18 |                         default = 10,
19 |                         help = "Number of times experiment is repeated [default: %default]"),
20 |   optparse::make_option("--null_model",
21 |                         action = "store_true",
22 |                         default = FALSE,
23 |                         help = "Permute dependent variable (within subtype if predictor is a gene)"),
24 |   optparse::make_option("--ncores",
25 |                         default = NA_integer_,
26 |                         help = "Set the number of cores to use")
27 | )
28 | 
29 | opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
30 | source(here::here("util/option_functions.R"))
31 | check_options(opt)
32 | 
33 | # set options
34 | cancer_type <- opt$cancer_type
35 | predictor <- opt$predictor
36 | n.repeats <- opt$n_repeats
37 | null_model <- opt$null_model
38 | 
39 | # Use the requested number of cores, capped at one fewer than the detected
40 | # number. NA values (no --ncores given, or core detection returned NA) are
41 | # dropped; if nothing usable remains, or the result is below 1 (e.g. on a
42 | # single-core machine), fall back to 1 core.
43 | detected_cores <- parallel::detectCores()
44 | core_limits <- c(detected_cores - 1, opt$ncores)
45 | core_limits <- core_limits[!is.na(core_limits)]
46 | ncores <- if (length(core_limits) > 0) max(1, min(core_limits)) else 1
47 | 
48 | # flag passed on to the downstream scripts when running the null model
49 | null_model_flag <- if (null_model) "--null_model" else ""
50 | 
51 | message(paste("\nPredicting", predictor,
52 |               "in", cancer_type,
53 |               if (null_model) "(null model) ..." else "..."))
54 | message(paste("\nNumber of repeats set to", n.repeats))
55 | message(paste("\nUsing", ncores, "out of", detected_cores, "cores"))
56 | 
57 | # fixed initial seed so the same per-repeat seeds are drawn on every run
58 | initial.seed <- 12
59 | set.seed(initial.seed)
60 | 
61 | seeds <- sample(1:10000, n.repeats)
62 | 
63 | for (rep.count in seq_along(seeds)) {
64 |   seed <- seeds[rep.count]
65 |   message(paste("\n\n#### REPEAT NUMBER", rep.count, "####\n\n"))
66 |   # system() returns the exit status of the command; non-zero means failure
67 |   exit_status <- system(paste("Rscript run_experiments.R",
68 |                               "--cancer_type", cancer_type,
69 |                               "--predictor", predictor,
70 |                               "--seed", seed,
71 |                               null_model_flag,
72 |                               "--ncores", ncores))
73 |   if (exit_status != 0) {
74 |     stop("run_experiments.R failed with exit status ", exit_status,
75 |          " (repeat ", rep.count, ", seed ", seed, ")",
76 |          call. = FALSE)
77 |   }
78 | }
79 | 
80 | # combine the results from all repeats
81 | exit_status <- system(paste("Rscript 3-combine_category_kappa.R",
82 |                             "--cancer_type", cancer_type,
83 |                             "--predictor", predictor,
84 |                             null_model_flag))
85 | if (exit_status != 0) {
86 |   stop("3-combine_category_kappa.R failed with exit status ", exit_status,
87 |        call. = FALSE)
88 | }
89 | 


--------------------------------------------------------------------------------
/combine_clinical_data.R:
--------------------------------------------------------------------------------
 1 | # Script combines the clinical input file and the mutation input file into one data frame
 2 | # Combined output includes: the clinical columns (e.g. subtype) and TP53/PIK3CA mutation status labels
 3 | # Steven Foltz August 2021
 4 | 
 5 | option_list <- list(
 6 |   optparse::make_option("--cancer_type",
 7 |                         default = NA_character_,
 8 |                         help = "Cancer type"),
 9 |   optparse::make_option("--clinical_input",
10 |                         default = NA_character_,
11 |                         help = "Clinical information input file path (.tsv)"),
12 |   optparse::make_option("--mutation_input",
13 |                         default = NA_character_,
14 |                         help = "Mutation input file path (.tsv)"),
15 |   optparse::make_option("--combined_output",
16 |                         default = NA_character_,
17 |                         help = "Combined subtype and mutation output file path (.tsv)"),
18 |   optparse::make_option("--overwrite", # NOTE(review): opt$overwrite is never read in this script; presumably handled inside check_options() -- confirm
19 |                         action = "store_true",
20 |                         default = FALSE,
21 |                         help = "Overwrite existing output files [default: %default]")
22 | )
23 | 
24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
25 | source(here::here("util/option_functions.R")) # presumably defines check_options(), which is called next
26 | check_options(opt)
27 | 
28 | # load libraries (startup messages suppressed)
29 | suppressMessages(library(tidyverse))
30 | 
31 | # copy command line options into local variables
32 | cancer_type <- opt$cancer_type # NOTE(review): not referenced again in this script
33 | clinical_input_filepath <- opt$clinical_input
34 | mutation_input_filepath <- opt$mutation_input
35 | combined_output_filepath <- opt$combined_output
36 | 
37 | ################################################################################
38 | # Read in clinical and mutation data
39 | ################################################################################
40 | clinical_df <- read_tsv(clinical_input_filepath, # read every column as character
41 |                         col_types = cols(.default = col_character())) %>%
42 |   mutate(Sample = substr(Sample, 1, 15)) # keep only the first 15 characters of the TCGA ID
43 | 
44 | mutation_df <- read_tsv(mutation_input_filepath, # read every column as character
45 |                         col_types = cols(.default = col_character())) %>%
46 |   mutate(tcga_id = substr(tcga_id, 1, 15)) # keep only the first 15 characters of the TCGA ID
47 | 
48 | ################################################################################
49 | # Combine clinical and mutation data
50 | ################################################################################
51 | 
52 | # combine data frames with left_join() so every row of clinical_df is kept
53 | # start join with clinical_df because later scripts expect column name = Sample
54 | combined_df <- clinical_df %>%
55 |   left_join(mutation_df,
56 |             by = c("Sample" = "tcga_id")) %>%
57 |   mutate(PIK3CA = case_when(PIK3CA == 0 ~ "No_PIK3CA_mutation", # columns are character; == compares them with "0"/"1"
58 |                             PIK3CA == 1 ~ "PIK3CA_mutation"), # no default branch: anything else (incl. NA for unmatched samples) stays NA
59 |          TP53 = case_when(TP53 == 0 ~ "No_TP53_mutation",
60 |                           TP53 == 1 ~ "TP53_mutation")) # same recoding as PIK3CA
61 | 
62 | ################################################################################
63 | # Save output file
64 | ################################################################################
65 | 
66 | write_tsv(combined_df,
67 |           combined_output_filepath)
68 | 


--------------------------------------------------------------------------------
/data/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/data/.empty


--------------------------------------------------------------------------------
/diagrams/RNA-seq_titration_ML_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/diagrams/RNA-seq_titration_ML_overview.png


--------------------------------------------------------------------------------
/diagrams/RNA-seq_titration_diff_expression_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/diagrams/RNA-seq_titration_diff_expression_overview.png


--------------------------------------------------------------------------------
/docker/R-3.6.3/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM rocker/tidyverse:3.6.3
 2 | 
 3 | # Update apt-get and install other libraries
 4 | RUN apt-get --allow-releaseinfo-change-suite update && apt-get install -y --no-install-recommends \
 5 |     curl \
 6 |     libbz2-dev \
 7 |     libgdal-dev \
 8 |     libgeos-dev \
 9 |     libglpk40 \
10 |     liblzma-dev \
11 |     libmagick++-dev \
12 |     libproj-dev \
13 |     libudunits2-dev \
14 |     libxt-dev \
15 |     python3-pip \
16 |     python3-dev
17 | 
18 | # Install pyrefinebio v0.3.4
19 | RUN pip3 install pyrefinebio==0.3.4
20 | 
21 | # R packages
22 | RUN install2.r --error --deps TRUE \
23 |     ape \
24 |     binr \
25 |     caret \
26 |     cluster \
27 |     corrplot \
28 |     cowplot \
29 |     data.table \
30 |     devtools \
31 |     doParallel \
32 |     dplyr \
33 |     e1071 \
34 |     fastICA \
35 |     flexclust \
36 |     fpc \
37 |     gdata \
38 |     ggplot2 \
39 |     ggupset \
40 |     glmnet \
41 |     gridExtra \
42 |     here \
43 |     Hmisc \
44 |     huge \
45 |     jsonlite \
46 |     kernlab \
47 |     locfit \
48 |     optparse \
49 |     plyr \
50 |     ranger \
51 |     RColorBrewer \
52 |     reshape2 \
53 |     scales \
54 |     sdcMicro \
55 |     stringr \
56 |     styler \
57 |     viridis
58 | 
59 | # R Bioconductor packages
60 | RUN Rscript -e "options(warn = 2); BiocManager::install(c( \
61 |     'EnsDb.Hsapiens.v86' , \
62 |     'ensembldb', \
63 |     'limma', \
64 |     'quantro'), \
65 |     update = FALSE, \
66 |     version = '3.10')"
67 | 
68 | # Threading issue with preprocessCore::normalize.quantiles
69 | # https://support.bioconductor.org/p/122925/#124701
70 | # https://github.com/bmbolstad/preprocessCore/issues/1#issuecomment-326756305
71 | RUN Rscript -e "options(warn = 2); BiocManager::install( \
72 |     'preprocessCore', \
73 |     configure.args = '--disable-threading', \
74 |     update = FALSE)"
75 | 
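76 | # Pin greenelab/TDM to ref b041807835d4076c5549356c86c44f087d713b1a. NOTE(review): this comment previously cited 341eb77105e7efd2654b4f112578648584936e06 as the latest greenelab/TDM commit (retrieved 2021-05-28), which differs from the pinned ref below; confirm which commit is intended.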
77 | RUN Rscript -e "options(warn = 2); remotes::install_github( \
78 |     'greenelab/TDM', ref = 'b041807835d4076c5549356c86c44f087d713b1a')"
79 | 
80 | # ref = 08ed6b54e4efe5249107cb335cd8e169657cbc44 is wgmao/PLIER commit corresponding to v0.1.6 (retrieved 2021-11-09)
81 | RUN Rscript -e "options(warn = 2); remotes::install_github( \
82 |     'wgmao/PLIER', ref = '08ed6b54e4efe5249107cb335cd8e169657cbc44')"
83 | 


--------------------------------------------------------------------------------
/docker/R-4.1.2/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM rocker/tidyverse:4.1.2
 2 | 
 3 | # Update apt-get and install other libraries
 4 | RUN apt-get update && apt-get install -y --no-install-recommends \
 5 |     curl \
 6 |     libfftw3-dev \
 7 |     libbz2-dev \
 8 |     libgdal-dev \
 9 |     libgeos-dev \
10 |     libglpk40 \
11 |     liblzma-dev \
12 |     libmagick++-dev \
13 |     libproj-dev \
14 |     libudunits2-dev \
15 |     libxt-dev \
16 |     python3-pip \
17 |     python3-dev
18 | 
19 | # Install pyrefinebio v0.3.4
20 | RUN pip3 install pyrefinebio==0.3.4
21 | 
22 | # Install some Bioconductor packages dependencies
23 | RUN Rscript -e "options(warn = 2); BiocManager::install(c( \
24 |     'DESeq2', \
25 |     'EnsDb.Hsapiens.v86' , \
26 |     'ensembldb', \
27 |     'GenomicRanges', \
28 |     'GenomeInfoDb', \
29 |     'IRanges', \
30 |     'limma', \
31 |     'MAST', \
32 |     'monocle', \
33 |     'multtest', \
34 |     'quantro', \
35 |     'qvalue', \
36 |     'rtracklayer', \
37 |     'S4Vectors', \
38 |     'SingleCellExperiment', \
39 |     'SummarizedExperiment'), \
40 |     update = FALSE, \
41 |     version = 3.14)"
42 | 
43 | # R packages
44 | RUN install2.r --error --deps TRUE \
45 |     ape \
46 |     binr \
47 |     caret \
48 |     cluster \
49 |     corrplot \
50 |     cowplot \
51 |     data.table \
52 |     devtools \
53 |     doParallel \
54 |     dplyr \
55 |     e1071 \
56 |     fastICA \
57 |     flexclust \
58 |     fpc \
59 |     gdata \
60 |     ggplot2 \
61 |     ggupset \
62 |     glmnet \
63 |     gridExtra \
64 |     here \
65 |     Hmisc \
66 |     huge \
67 |     jsonlite \
68 |     kernlab \
69 |     locfit \
70 |     optparse \
71 |     plyr \
72 |     ranger \
73 |     RColorBrewer \
74 |     reshape2 \
75 |     scales \
76 |     sdcMicro \
77 |     Seurat \
78 |     stringr \
79 |     styler \
80 |     viridis
81 | 
82 | # Threading issue with preprocessCore::normalize.quantiles
83 | # https://support.bioconductor.org/p/122925/#124701
84 | # https://github.com/bmbolstad/preprocessCore/issues/1#issuecomment-326756305
85 | RUN Rscript -e "options(warn = 2); BiocManager::install( \
86 |     'preprocessCore', \
87 |     configure.args = '--disable-threading', \
88 |     force = TRUE, \
89 |     update = FALSE, \
90 |     version = 3.14)"
91 | 
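92 | # Pin greenelab/TDM to ref b041807835d4076c5549356c86c44f087d713b1a. NOTE(review): this comment previously cited 341eb77105e7efd2654b4f112578648584936e06 as the latest greenelab/TDM commit (retrieved 2021-05-28), which differs from the pinned ref below; confirm which commit is intended.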
93 | RUN Rscript -e "options(warn = 2); remotes::install_github( \
94 |     'greenelab/TDM', ref = 'b041807835d4076c5549356c86c44f087d713b1a')"
95 | 
96 | # ref = 08ed6b54e4efe5249107cb335cd8e169657cbc44 is wgmao/PLIER commit corresponding to v0.1.6 (retrieved 2021-11-09)
97 | RUN Rscript -e "options(warn = 2); remotes::install_github( \
98 |     'wgmao/PLIER', ref = '08ed6b54e4efe5249107cb335cd8e169657cbc44')"
99 | 


--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
 1 | # Using Docker with this project
 2 | 
 3 | Two Docker images are available from `envest/rnaseq_titration_results`.
 4 | They are highly similar but are based on different versions of R.
 5 | 
 6 | ### R-4.1.2 version
 7 | 
 8 | We recommend using this version for running any analysis.
 9 | This version is maintained and will be the one to get updated in the future.
10 | 
11 | To pull this image, use the tag `R-4.1.2`:
12 | 
13 | ```
14 | docker pull envest/rnaseq_titration_results:R-4.1.2
15 | ```
16 | 
17 | ### R-3.6.3 version
18 | 
19 | We also have an image based on R version 3.6.3.
20 | This image is more representative of the development environment used in earlier (pre-2022) iterations of this analysis and we retain it for posterity.
21 | 
22 | :warning: We do _not_ recommend using this version for running analysis since recent code updates have changed some behaviors under older versions of R and corresponding package versions. 
23 | 
24 | To pull this image, use the tag `R-3.6.3`:
25 | 
26 | ```
27 | docker pull envest/rnaseq_titration_results:R-3.6.3
28 | ```
29 | 


--------------------------------------------------------------------------------
/download_TCGA_data.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -euo pipefail
  3 | 
  4 | # change to the directory of this script
  5 | cd "$(dirname "${BASH_SOURCE[0]}")"
  6 | 
  7 | # set data directory
  8 | data="data"
  9 | mkdir -p "$data"
 10 | 
 11 | # download BRCA array and seq data from URLs (-nc: do not re-download files that already exist)
 12 | wget -nc -i brca_data_urls.txt --directory-prefix="$data"
 13 | 
 14 | # Obtain TCGA data freeze manifest file (curl --fail: exit non-zero on an HTTP error instead of saving the error page as the file)
 15 | # See here for more info: https://gdc.cancer.gov/about-data/publications/pancanatlas
 16 | manifest_url="https://gdc.cancer.gov/files/public/file/PanCan-General_Open_GDC-Manifest_2.txt"
 17 | manifest_basename=$(basename "$manifest_url")
 18 | if [ -f "$data/$manifest_basename" ]; then
 19 |   echo "TCGA file $data/$manifest_basename already exists and was not overwritten."
 20 | else
 21 |   echo "Downloading $manifest_basename"
 22 |   curl --fail -o "$data/$manifest_basename" --silent "$manifest_url"
 23 | fi
 24 | 
 25 | # download specific files from TCGA manifest
 26 | mutations="mc3.v0.2.8.PUBLIC.maf.gz"
 27 | copy_number="broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg"
 28 | clinical="TCGA-CDR-SupplementalTableS1.xlsx"
 29 | rnaseq="EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"
 30 | 
 31 | filename_array=("$mutations" \
 32 |                 "$copy_number" \
 33 |                 "$clinical" \
 34 |                 "$rnaseq")
 35 | 
 36 | for filename in "${filename_array[@]}"; do
 37 |   if [ -f "$data/$filename" ]; then
 38 |     echo "TCGA file $data/$filename already exists and was not overwritten."
 39 |   else
 40 |     echo "Downloading $filename"
 41 |     id=$(grep -wF "$filename" "$data/$manifest_basename" | cut -f1) # -F: match the file name literally (its dots are not regex wildcards); first manifest column is the GDC file id
 42 |     curl --fail -o "$data/$filename" "https://api.gdc.cancer.gov/data/$id" # --fail: do not keep an HTTP error response as if it were the data file
 43 |   fi
 44 | done
 45 | 
 46 | # get TCGA array expression using refine.bio client
 47 | # GSE83130 GBM
 48 | for accession in GSE83130; do
 49 |   if [ -d "$data/$accession" ]; then
 50 |     echo "refine.bio download for $accession already exists and was not overwritten."
 51 |   else
 52 |     echo "Downloading $accession"
 53 |     refinebio create-token -s
 54 |     refinebio download-dataset \
 55 |       --email-address steven.foltz@ccdatalab.org \
 56 |       --path "$data/$accession.zip" \
 57 |       --experiments "$accession" \
 58 |       --aggregation EXPERIMENT \
 59 |       --transformation NONE \
 60 |       --skip-quantile-normalization True
 61 |     unzip -d "$data/$accession" "$data/$accession.zip" && rm -f "$data/$accession.zip"
 62 |   fi
 63 | done
 64 | 
 65 | # download TCGA GBM clinical data including subtypes
 66 | # Publication: Brennan, C. W. et al. The somatic genomic landscape of glioblastoma. Cell 155, 462–477 (2013)
 67 | # Link to paper: https://doi.org/10.1016/j.cell.2013.09.034
 68 | gbm_clinical_link="https://www.cell.com/cms/10.1016/j.cell.2013.09.034/attachment/9cefc2e8-caac-4225-bcdd-70f105ccf568/mmc7.xlsx"
 69 | if [ -f $data/gbm_clinical_table_S7.xlsx ]; then
 70 |   echo GBM clinical spreadsheet $data/gbm_clinical_table_S7.xlsx already exists and was not overwritten.
 71 | else
 72 |   wget -O $data/gbm_clinical_table_S7.xlsx $gbm_clinical_link
 73 | fi
 74 | 
 75 | # modify BRCA clinical file column PAM50 to be subtype
 76 | sed -i 's/PAM50/subtype/' $data/BRCAClin.tsv
 77 | 
 78 | # process GBM data via script
 79 | echo Processing GBM data ...
 80 | Rscript prepare_GBM_data.R \
 81 |   --seq_input $data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv \
 82 |   --array_input $data/GSE83130/GSE83130/GSE83130.tsv \
 83 |   --metadata_input $data/GSE83130/aggregated_metadata.json \
 84 |   --array_output $data/GBMarray.pcl \
 85 |   --seq_output $data/GBMRNASeq.pcl \
 86 |   --clinical_input $data/gbm_clinical_table_S7.xlsx \
 87 |   --clinical_output $data/GBMClin.tsv
 88 | 
 89 | # retrieve BRCA and GBM mutations in PIK3CA and TP53 from TCGA MC3
 90 | # output is stored in data/mutations.* TSV and MAF files
 91 | echo Retrieving mutation data from MC3 for BRCA and GBM ...
 92 | python3 retrieve_MC3_mutations.py
 93 | 
 94 | # combine clinical and mutation data into one data frame
 95 | echo Combining clinical and mutation data for BRCA ...
 96 | Rscript combine_clinical_data.R \
 97 |   --cancer_type BRCA \
 98 |   --clinical_input $data/BRCAClin.tsv \
 99 |   --mutation_input $data/mutations.BRCA.tsv \
100 |   --combined_output $data/combined_clinical_data.BRCA.tsv
101 | echo Combining clinical and mutation data for GBM ...
102 | Rscript combine_clinical_data.R \
103 |   --cancer_type GBM \
104 |   --clinical_input $data/GBMClin.tsv \
105 |   --mutation_input $data/mutations.GBM.tsv \
106 |   --combined_output $data/combined_clinical_data.GBM.tsv
107 | 
108 | # check md5 sums of downloaded files
109 | echo Checking md5 sums of downloaded files ...
110 | md5sum --check --quiet check_sums.tsv
111 | echo All data files match expected md5 sums!
112 | 
113 | # get BRCA array expression data from TCGA Legacy Archive
114 | # data/gdc_legacy_archive_brca_manifest.txt obtained from https://portal.gdc.cancer.gov/legacy-archive
115 | # with search parameters
116 | #   Cases
117 | #     Cancer Program = TCGA
118 | #     Project = TCGA-BRCA
119 | #   Files
120 | #     Data Category = Raw microarray data
121 | #     Data Type = Raw intensities
122 | #     Experimental Strategy = Gene expression array
123 | ################################################################################
124 | # UNCOMMENT TO DOWNLOAD TCGA LEGACY ARCHIVE BRCA EXPRESSION ARRAY DATA
125 | # Need to rebuild docker image with gdc-client uncommented
126 | #brca_array_dir=$data/BRCA_array
127 | #if [ -d $brca_array_dir ]; then
128 | #  echo TCGA Legacy Archive data for BRCA already exists and was not overwritten.
129 | #else
130 | #  mkdir -p $brca_array_dir
131 | #  gdc-client download --manifest gdc_legacy_archive_brca_manifest.txt --dir $brca_array_dir
132 | #fi
133 | ################################################################################
134 | 


--------------------------------------------------------------------------------
/load_packages.R:
--------------------------------------------------------------------------------
 1 | library(ape)
 2 | library(binr)
 3 | library(caret)
 4 | library(cluster)
 5 | library(corrplot)
 6 | library(cowplot)
 7 | library(data.table)
 8 | library(devtools)
 9 | library(doParallel)
10 | library(e1071)
11 | library(fastICA)
12 | library(flexclust)
13 | library(fpc)
14 | library(gdata)
15 | library(glmnet)
16 | library(gridExtra)
17 | library(Hmisc)
18 | library(huge)
19 | library(kernlab)
20 | library(limma)
21 | library(parallel)
22 | library(PLIER)
23 | library(plyr)
24 | library(preprocessCore)
25 | library(quantro)
26 | library(ranger)
27 | library(reshape2)
28 | library(scales)
29 | library(sdcMicro)
30 | library(TDM)
31 | library(tidyverse)
32 | 


--------------------------------------------------------------------------------
/models/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/models/.empty


--------------------------------------------------------------------------------
/normalized_data/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/normalized_data/.empty


--------------------------------------------------------------------------------
/normalized_data/reconstructed_data/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/normalized_data/reconstructed_data/.empty


--------------------------------------------------------------------------------
/plots/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/.empty


--------------------------------------------------------------------------------
/plots/data/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/data/.empty


--------------------------------------------------------------------------------
/plots/main/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/main/.empty


--------------------------------------------------------------------------------
/plots/scripts/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/scripts/.empty


--------------------------------------------------------------------------------
/plots/scripts/0-plot_predictor_category_distributions.R:
--------------------------------------------------------------------------------
  1 | # S. Foltz Feb 2022
  2 | # This plots the predictor category distribution for each seed
  3 | 
  4 | option_list <- list(
  5 |   optparse::make_option("--cancer_type",
  6 |                         default = NA_character_,
  7 |                         help = "Cancer type"),
  8 |   optparse::make_option("--predictor",
  9 |                         default = NA_character_,
 10 |                         help = "Predictor used"),
 11 |   optparse::make_option("--null_model",
 12 |                         action = "store_true",
 13 |                         default = FALSE,
 14 |                         help = "Use null model input data"),
 15 |   optparse::make_option("--plot_all_seeds",
 16 |                         action = "store_true",
 17 |                         default = FALSE,
 18 |                         help = "Plot all seeds instead of representative seed"),
 19 |   optparse::make_option("--output_directory",
 20 |                         default = NA_character_,
 21 |                         help = "Output directory for plot (absolute or relative path)")
 22 | )
 23 | 
 24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 25 | source(here::here("util/option_functions.R"))
 26 | check_options(opt)
 27 | 
 28 | # load libraries
 29 | suppressMessages(library(tidyverse))
 30 | source(here::here("util/color_blind_friendly_palette.R"))
 31 | 
 32 | # set options
 33 | cancer_type <- opt$cancer_type
 34 | predictor <- opt$predictor
 35 | null_model <- opt$null_model
 36 | plot_all_seeds <- opt$plot_all_seeds
 37 | file_identifier <- ifelse(null_model,
 38 |                           str_c(cancer_type, predictor, "null", sep = "_"),
 39 |                           str_c(cancer_type, predictor, sep = "_"))
 40 | 
 41 | # define directories
 42 | plot.dir <- here::here("plots")
 43 | plot.data.dir <- here::here("plots/data")
 44 | output_directory <- opt$output_directory
 45 | 
 46 | # list potential input files
 47 | input_files <- list.files(path = plot.data.dir,
 48 |                           pattern = paste0(file_identifier,
 49 |                                            ".dist_split_stacked_bar."),
 50 |                           full.names = TRUE)
 51 | 
 52 | # define input data
 53 | if (plot_all_seeds) { # read in all seed data to one data frame
 54 |   plot_df <- input_files %>%
 55 |     map(read_tsv,
 56 |         col_types = "ccccc") %>%
 57 |     reduce(rbind)
 58 | } else { # default
 59 |   plot_df <- read_tsv(input_files[1],
 60 |                       col_types = "ccccc")
 61 |   initial_seed <- plot_df %>%
 62 |     pull(initial_seed) %>%
 63 |     unique()
 64 | }
 65 | 
 66 | # define output file
 67 | if (plot_all_seeds) {
 68 |   category.distribution.plot <- file.path(output_directory,
 69 |                                           paste0(file_identifier,
 70 |                                                  ".dist_split_stacked_bar.",
 71 |                                                  "all_seeds",
 72 |                                                  ".pdf"))
 73 | } else { # default
 74 |   category.distribution.plot <- file.path(output_directory,
 75 |                                           paste0(file_identifier,
 76 |                                                  ".dist_split_stacked_bar.",
 77 |                                                  initial_seed,
 78 |                                                  ".pdf"))
 79 | }
 80 | 
 81 | # plot stacked bar of category counts per split
 82 | # (title: file identifier with every underscore shown as a space)
 83 | plot_obj <- plot_df %>%
 84 |   ggplot(aes(x = fct_rev(split),
 85 |              fill = category)) +
 86 |   geom_bar() +
 87 |   scale_fill_manual(values = cbPalette) +
 88 |   labs(x = "Split",
 89 |        y = "Count",
 90 |        title = str_replace_all(file_identifier,
 91 |                                pattern = "_",
 92 |                                replacement = " "),
 93 |        fill = "Predictor") +
 94 |   theme_bw(base_size = 10)
 95 | 
 96 | if (plot_all_seeds) {
 97 |   plot_obj <- plot_obj +
 98 |     coord_flip() +
 99 |     facet_wrap(~ initial_seed,
100 |                ncol = 5)
101 |   ggsave(filename = category.distribution.plot,
102 |          plot = plot_obj,
103 |          height = 3,
104 |          width = 7.25)
105 | } else { # default
106 |   ggsave(filename = category.distribution.plot,
107 |          plot = plot_obj,
108 |          height = 3,
109 |          width = 3.5)
110 | }
111 | 


--------------------------------------------------------------------------------
/plots/scripts/1A-plot_DEGs.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Feb 2016, S. Foltz Feb 2022
  2 | # This script compares differential expression "silver standards" which are
  3 | # differential expression analysis results from standard pipelines (i.e.,
  4 | # log transformed 100% array data and RSEM counts 100% RNA-seq [processed with
  5 | # limma::voom]) with differential expression results from RNA-seq titrated data
  6 | # (0-100%) normalized various ways.
  7 | #
  8 | # Plot the proportion of genes that are differentially expressed between conditions
  9 | # Plot similarity of DEGs to silver standards (subtype vs. others only)
 10 | #
 11 | # USAGE: Rscript 1A-plot_DEGs.R --cancer_type --subtype_vs_others --subtype_vs_subtype --proportion_output_directory --overlap_output_directory --overlap_measure
 12 | 
 13 | option_list <- list(
 14 |   optparse::make_option("--cancer_type",
 15 |                         default = NA_character_,
 16 |                         help = "Cancer type"),
 17 |   optparse::make_option("--subtype_vs_others",
 18 |                         help = "Subtype used for comparison against all others."),
 19 |   optparse::make_option("--subtype_vs_subtype",
 20 |                         help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"),
 21 |   optparse::make_option("--proportion_output_directory",
 22 |                         help = "Output directory of DEG proportion plot. Include this option to plot DEG proportion plot."),
 23 |   optparse::make_option("--overlap_output_directory",
 24 |                         help = "Output directory of DEG overlap plot. Include this option to plot silver standard overlap plot."),
 25 |   optparse::make_option("--overlap_measure",
 26 |                         help = "Which overlap measures to include in silver standard overlap plot (comma-separated without space e.g. Jaccard,Rand,Spearman; must be one or more of Jaccard, Rand, Spearman)")
 27 | )
 28 | 
 29 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 30 | source(here::here("util/option_functions.R"))
 31 | check_options(opt)
 32 | 
 33 | # at least one of --subtype_vs_others or --subtype_vs_subtype should be given
 34 | if (any(c("subtype_vs_others", "subtype_vs_subtype") %in% names(opt))) {
 35 |   
 36 |   subtype_vs_others <- NA # first assume option is not provided
 37 |   subtype_vs_subtype <- NA # then update as available below
 38 |   
 39 |   if ("subtype_vs_others" %in% names(opt)) {
 40 |     subtype_vs_others <- opt$subtype_vs_others
 41 |   }
 42 |   
 43 |   if ("subtype_vs_subtype" %in% names(opt)) {
 44 |     subtype_vs_subtype <- opt$subtype_vs_subtype
 45 |   }
 46 |   
 47 | } else {
 48 |   stop("  Errors: must include --subtype_vs_others and/or --subtype_vs_subtype in plots/scripts/1A-plot_DEGs.R.\n")
 49 | }
 50 | 
 51 | # at least one of --proportion_output_directory or --overlap_output_directory should be given
 52 | if (any(c("proportion_output_directory", "overlap_output_directory") %in% names(opt))) {
 53 | 
 54 |   # first assume neither option is provided, then update as available below;
 55 |   # the plot_* flags must always exist because both are tested later on
 56 |   proportion_output_directory <- NA
 57 |   overlap_output_directory <- NA
 58 |   plot_proportion <- FALSE
 59 |   plot_overlap <- FALSE
 60 | 
 61 |   if ("proportion_output_directory" %in% names(opt)) {
 62 |     proportion_output_directory <- opt$proportion_output_directory
 63 |     plot_proportion <- TRUE
 64 |   }
 65 | 
 66 |   if ("overlap_output_directory" %in% names(opt)) {
 67 |     overlap_output_directory <- opt$overlap_output_directory
 68 |     plot_overlap <- TRUE
 69 |     # check that overlap measures requested are the ones present in data
 70 |     if ("overlap_measure" %in% names(opt)) {
 71 |       overlap_measures <- sort(stringr::str_split(opt$overlap_measure,
 72 |                                             pattern = ",", simplify = TRUE))
 73 |       if (!all(overlap_measures %in% c("Jaccard", "Rand", "Spearman"))) {
 74 |         stop("  Errors: --overlap_measure must be one or more of Jaccard, Rand, Spearman in plots/scripts/1A-plot_DEGs.R.\n")
 75 |       }
 76 |     } else {
 77 |       stop("  Errors: must include --overlap_measure with --overlap_output_directory in plots/scripts/1A-plot_DEGs.R.\n")
 78 |     }
 79 |   }
 80 | 
 81 | } else {
 82 |   stop("  Errors: must include --proportion_output_directory and/or --overlap_output_directory in plots/scripts/1A-plot_DEGs.R.\n")
 83 | }
 84 | 
 85 | # load libraries
 86 | suppressMessages(library(tidyverse))
 87 | source(here::here("util/color_blind_friendly_palette.R"))
 88 | source(here::here("util", "differential_expression_functions.R"))
 89 | 
 90 | # set options
 91 | cancer_type <- opt$cancer_type
 92 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here
 93 | 
 94 | # define directories
 95 | plot.dir <- here::here("plots")
 96 | plot.data.dir <- file.path(plot.dir, "data")
 97 | 
 98 | #### functions -----------------------------------------------------------------
 99 | 
100 | plot_DEG_proportions <- function(subtypes){
101 |   
102 |   subtypes_path <- str_c(subtypes, collapse = "v")
103 |   subtypes_nice <- str_c(subtypes, collapse = " vs. ")
104 |   
105 |   input_filename <- file.path(
106 |     plot.data.dir,
107 |     paste0(file_identifier, "_titration_differential_exp_eBayes_fits_",
108 |            subtypes_path, ".propDE.tsv"))
109 |   
110 |   output_filename <- file.path(
111 |     proportion_output_directory,
112 |     paste0(file_identifier, "_differential_expr_proportion_lt5_",
113 |            subtypes_path, ".pdf"))
114 |   
115 |   propDEG_df <- read_tsv(input_filename,
116 |                          col_types = "dcd") %>%
117 |     mutate(perc.seq = factor(perc.seq,
118 |                              levels = seq(0, 100, 10)))
119 |   
120 |   # plot proportion of genes that are diff expressed
121 |   plot_obj <- PlotProportionDE(propDEG_df,
122 |                                subtypes = subtypes_nice,
123 |                                cancer_type = cancer_type)
124 |   
125 |   ggsave(output_filename,
126 |          plot = plot_obj,
127 |          width = 7.25,
128 |          height = 4)
129 |   
130 | }
131 | 
132 | plot_silver_overlap <- function(subtypes){
133 |   # Read the precomputed ".silver.tsv" table for `subtypes` (character vector),
134 |   # keep the requested overlap measures, and save the silver standard
135 |   # similarity plot as a PDF. Relies on script-level variables: file_identifier,
136 |   # plot.data.dir, overlap_output_directory, overlap_measures, cancer_type.
137 |   subtypes_path <- str_c(subtypes, collapse = "v")
138 |   subtypes_nice <- str_c(subtypes, collapse = " vs. ")
139 |   measures_path <- str_c(overlap_measures, collapse = "_")
140 | 
141 |   input_filename <- file.path(
142 |     plot.data.dir,
143 |     paste0(file_identifier, "_titration_differential_exp_eBayes_fits_",
144 |            subtypes_path, ".silver.tsv"))
145 | 
146 |   output_filename <- file.path(
147 |     overlap_output_directory,
148 |     paste0(file_identifier, "_silver_standard_similarity_lt5_",
149 |            measures_path, "_", subtypes_path, ".pdf"))
150 | 
151 |   silver_df <- read_tsv(input_filename,
152 |                         col_types = "cdccd") %>%
153 |     mutate(Perc.Seq = factor(Perc.Seq,
154 |                              levels = seq(0, 100, 10))) %>%
155 |     filter(measure %in% overlap_measures)
156 |   using_single_measure <- length(overlap_measures) == 1
157 |   plot_obj <- PlotSilverStandardStats(
158 |     silver_df,
159 |     title = paste(cancer_type, subtypes_nice, "FDR < 5%"), # paste() adds the space
160 |     single_measure = using_single_measure)
161 |   ggsave(
162 |     output_filename,
163 |     plot = plot_obj,
164 |     width = 7.25,
165 |     height = c(3,4,5)[length(overlap_measures)] # taller plot for more measures
166 |   )
167 | }
168 | 
169 | #### plot Subtype v. Other results ---------------------------------------------
170 | 
171 | if (!is.na(subtype_vs_others)) {
172 |   
173 |   subtypes <- c(subtype_vs_others, "Other")
174 | 
175 |   if (plot_proportion) {
176 |     plot_DEG_proportions(subtypes)  
177 |   }
178 |   
179 |   if (plot_overlap) {
180 |     plot_silver_overlap(subtypes)  
181 |   }
182 | }
183 | 
184 | #### plot Subtype v. Subtype results -------------------------------------------
185 | 
186 | if (!is.na(subtype_vs_subtype)) {
187 |   
188 |   subtypes <- as.vector(
189 |     stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE))
190 | 
191 |   if (plot_proportion) {
192 |     plot_DEG_proportions(subtypes)  
193 |   }
194 |   
195 |   if (plot_overlap) {
196 |     plot_silver_overlap(subtypes)  
197 |   }
198 | 
199 | }
200 | 


--------------------------------------------------------------------------------
/plots/scripts/2A-plot_small_n_differential_expression.R:
--------------------------------------------------------------------------------
  1 | # J. Taroni Feb 2016, S. Foltz Feb 2022
  2 | # With small n data, plot the selected similarity measures (Jaccard, Rand, Spearman) against the number of samples from each subtype
  3 | #
  4 | # USAGE: Rscript 2A-plot_small_n_differential_expression.R --cancer_type --subtype_vs_others --subtype_vs_subtype --output_directory --overlap_measure
  5 | 
  6 | option_list <- list(
  7 |   optparse::make_option("--cancer_type",
  8 |                         default = NA_character_,
  9 |                         help = "Cancer type"),
 10 |   optparse::make_option("--subtype_vs_others",
 11 |                         help = "Subtype used for comparison against all others."),
 12 |   optparse::make_option("--subtype_vs_subtype",
 13 |                         help = "Subtypes used in head-to-head comparison (comma-separated without space e.g. Type1,Type2)"),
 14 |   optparse::make_option("--output_directory",
 15 |                         default = NA_character_,
 16 |                         help = "Output directory of DEG overlap plot (absolute or relative path)."),
 17 |   optparse::make_option("--overlap_measure",
 18 |                         default = NA_character_,
 19 |                         help = "Which overlap measures to include in silver standard overlap plot (comma-separated without space e.g. Jaccard,Rand,Spearman; must be one or more of Jaccard, Rand, Spearman)")
 20 | )
 21 | 
 22 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 23 | source(here::here("util/option_functions.R"))
 24 | check_options(opt)
 25 | 
 26 | # at least one of --subtype_vs_others or --subtype_vs_subtype should be given
 27 | if (any(c("subtype_vs_others", "subtype_vs_subtype") %in% names(opt))) {
 28 |   
 29 |   subtype_vs_others <- NA # first assume option is not provided
 30 |   subtype_vs_subtype <- NA # then update as available below
 31 |   
 32 |   if ("subtype_vs_others" %in% names(opt)) {
 33 |     subtype_vs_others <- opt$subtype_vs_others
 34 |   }
 35 |   
 36 |   if ("subtype_vs_subtype" %in% names(opt)) {
 37 |     subtype_vs_subtype <- opt$subtype_vs_subtype
 38 |   }
 39 |   
 40 | } else {
 41 |   message("  Errors: must include --subtype_vs_others and/or --subtype_vs_subtype in plots/scripts/2A-plot_small_n_differential_expression.R.\n")
 42 |   stop()
 43 | }
 44 | 
 45 | # check that overlap measures requested are the ones present in data
 46 | if ("overlap_measure" %in% names(opt)) {
 47 |   overlap_measures <- sort(stringr::str_split(opt$overlap_measure,
 48 |                                               pattern = ",", simplify = TRUE))
 49 |   
 50 |   if (!all(overlap_measures %in% c("Jaccard", "Rand", "Spearman"))) {
 51 |     message("  Errors: --overlap_measure must be one or more of Jaccard, Rand, Spearman in plots/scripts/2A-plot_small_n_differential_expression.R.\n")
 52 |     stop()  
 53 |   }
 54 | }
 55 | 
 56 | # load libraries
 57 | suppressMessages(library(tidyverse))
 58 | source(here::here("util/color_blind_friendly_palette.R"))
 59 | 
 60 | # set options
 61 | cancer_type <- opt$cancer_type
 62 | file_identifier <- str_c(cancer_type, "subtype", sep = "_") # we are only working with subtype models here
 63 | 
 64 | # define directories
 65 | plot.dir <- here::here("plots")
 66 | plot.data.dir <- file.path(plot.dir, "data")
 67 | output_directory <- opt$output_directory
 68 | 
 69 | #### functions -----------------------------------------------------------------
 70 | 
 71 | DataSummary <- function(x) {
 72 |   # This function is supplied to ggplot2::stat_summary in order to plot the
 73 |   # median value of a vector as a point and the "confidence interval on the
 74 |   # median" used in notched boxplots as a vertical line. See boxplot.stats for
 75 |   # more information.
 76 |   m <- median(x)
 77 |   conf <- boxplot.stats(x)$conf
 78 |   ymin <- min(conf)
 79 |   ymax <- max(conf)
 80 |   return(c(y = m, ymin = ymin, ymax = ymax))
 81 | }
 82 | 
 83 | plot_small_n <- function(subtypes){
 84 |   # This function creates a panel of line plots faceted by 
 85 |   # % RNA-seq, the measure (Jaccard, Rand, Spearman), and normalization method
 86 |   
 87 |   subtypes_path <- str_c(subtypes, collapse = "v")
 88 |   subtypes_nice <- str_c(subtypes, collapse = " vs. ")
 89 |   measures_path <- str_c(overlap_measures, collapse = "_")
 90 |   
 91 |   input_filename <- file.path(plot.data.dir,
 92 |                               paste0(file_identifier,
 93 |                                      "_small_n_",
 94 |                                      subtypes_path,
 95 |                                      "_results.tsv"))
 96 |   
 97 |   output_filename <- file.path(
 98 |     output_directory,
 99 |     paste0(file_identifier, "_small_n_",
100 |            measures_path, "_", subtypes_path, ".pdf"))
101 |   
102 |   using_single_measure <- length(overlap_measures) == 1
103 |   
104 |   stats_df <- read_tsv(input_filename,
105 |                        col_types = "ccdcddc") %>%
106 |     filter(seq_prop %in% str_c(c(30, 50, 70), "% RNA-seq"),
107 |            !is.na(value),
108 |            measure %in% overlap_measures) %>%
109 |     mutate(no.samples = factor(no.samples))
110 |   
111 |   plot_obj <- ggplot(stats_df, aes(x = no.samples,
112 |                                    y = value,
113 |                                    color = platform)) +
114 |     stat_summary(fun = median,
115 |                  geom = "line",
116 |                  aes(group = platform),
117 |                  position = position_dodge(0.7),
118 |                  show.legend = FALSE) +
119 |     stat_summary(fun = median, # this makes the point size consistent with other plots
120 |                  geom = "point",
121 |                  aes(group = platform),
122 |                  position = position_dodge(0.7),
123 |                  show.legend = FALSE) +
124 |     stat_summary(fun.data = DataSummary, # this adds the error bars without median points
125 |                  geom = "linerange",
126 |                  aes(group = platform),
127 |                  position = position_dodge(0.7),
128 |                  show.legend = FALSE) +
129 |     expand_limits(y = c(0,1)) +
130 |     scale_y_continuous(breaks = seq(0, 1, 0.25)) +
131 |     theme_bw() +
132 |     labs(x = "Number of Samples from Each Subtype",
133 |          y = ifelse(using_single_measure,
134 |                     unique(stats_df$measure),
135 |                     "Measure of Similarity"),
136 |          title = str_c("Small n Experiment: ", paste(cancer_type, subtypes_nice, "FDR < 10%"))) +
137 |     scale_colour_manual(values = cbPalette[c(2, 3)])
138 |   
139 |   if (using_single_measure) {
140 |     plot_obj <- plot_obj +
141 |       facet_grid(normalization ~ seq_prop)
142 |   } else {
143 |     plot_obj <- plot_obj +
144 |       facet_grid(measure + normalization ~ seq_prop)
145 |   }
146 |   
147 |   ggsave(filename = output_filename,
148 |          plot = plot_obj,
149 |          width = 7.25,
150 |          height = c(3,4,5)[length(overlap_measures)])
151 | }
152 | 
153 | #### plot Subtype v. Other results ---------------------------------------------
154 | 
155 | if (!is.na(subtype_vs_others)) {
156 |   
157 |   subtypes <- c(subtype_vs_others, "Other")
158 |   plot_small_n(subtypes)
159 | }
160 | 
161 | #### plot Subtype v. Subtype results -------------------------------------------
162 | 
163 | if (!is.na(subtype_vs_subtype)) {
164 |   
165 |   subtypes <- as.vector(
166 |     stringr::str_split(subtype_vs_subtype, pattern = ",", simplify = TRUE))
167 |   plot_small_n(subtypes)
168 | }
169 | 


--------------------------------------------------------------------------------
/plots/scripts/3-plot_category_kappa.R:
--------------------------------------------------------------------------------
  1 | # S. Foltz Feb 2022
  2 | # This plots kappa values from category prediction
  3 | 
  4 | option_list <- list(
  5 |   optparse::make_option("--cancer_type",
  6 |                         default = NA_character_,
  7 |                         help = "Cancer type"),
  8 |   optparse::make_option("--predictor",
  9 |                         default = NA_character_,
 10 |                         help = "Predictor used"),
 11 |   optparse::make_option("--null_model",
 12 |                         action = "store_true",
 13 |                         default = FALSE,
 14 |                         help = "Use delta kappa input data"),
 15 |   optparse::make_option("--output_directory",
 16 |                         default = NA_character_,
 17 |                         help = "Output directory for plot (absolute or relative path)"),
 18 |   optparse::make_option("--include_seurat",
 19 |                         action = "store_true",
 20 |                         default = FALSE,
 21 |                         help = "Include Seurat results in plot (default: FALSE)")
 22 | )
 23 | 
 24 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 25 | source(here::here("util/option_functions.R"))
 26 | check_options(opt)
 27 | 
 28 | # load libraries
 29 | suppressMessages(library(tidyverse))
 30 | source(here::here("util/color_blind_friendly_palette.R"))
 31 | 
 32 | # set options
 33 | cancer_type <- opt$cancer_type
 34 | predictor <- opt$predictor
 35 | null_model <- opt$null_model
 36 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 37 | include_seurat <- opt$include_seurat
 38 | 
 39 | # define directories
 40 | plot.dir <- here::here("plots")
 41 | plot.data.dir <- here::here("plots/data")
 42 | output_directory <- opt$output_directory
 43 | 
 44 | # define input file
 45 | input_filename <- ifelse(null_model,
 46 |                          file.path(plot.data.dir,
 47 |                                    paste0(file_identifier,
 48 |                                           "_train_3_models_delta_kappa.tsv")),
 49 |                          file.path(plot.data.dir,
 50 |                                    paste0(file_identifier,
 51 |                                           "_train_3_models_kappa.tsv")))
 52 | 
 53 | # define output files
 54 | output_filename <- file.path(output_directory,
 55 |                              ifelse(null_model,
 56 |                                     paste0(file_identifier,
 57 |                                            "_train_3_models_delta_kappa.pdf"),
 58 |                                     paste0(file_identifier,
 59 |                                            "_train_3_models_kappa.pdf")))
 60 | 
 61 | # read in data
 62 | median_df <- read_tsv(input_filename,
 63 |                       col_types = "dddddccc") %>%
 64 |   mutate(Perc.Seq = factor(Perc.Seq,
 65 |                            levels = seq(0, 100, 10))) %>%
 66 |   group_by(Perc.Seq, Platform, Classifier, Normalization) %>%
 67 |   summarize(n_obs = n(),
 68 |             med = median(Kappa),
 69 |             IQR = quantile(Kappa, 0.75) - quantile(Kappa, 0.25),
 70 |             median_ci_upper = med + 1.58*IQR/sqrt(n_obs),
 71 |             median_ci_lower = med - 1.58*IQR/sqrt(n_obs),
 72 |             .groups = "drop")
 73 | 
 74 | kappa_df <- read_tsv(input_filename,
 75 |                      col_types = "dddddccc") %>%
 76 |   mutate(Perc.Seq = factor(Perc.Seq,
 77 |                            levels = seq(0, 100, 10)))
 78 | 
 79 | # default behavior: exclude (!include) seurat results
 80 | if (!include_seurat) {
 81 |   median_df <- median_df %>%
 82 |     filter(Normalization != "SEURAT") %>%
 83 |     droplevels()
 84 |   kappa_df <- kappa_df %>%
 85 |     filter(Normalization != "SEURAT") %>%
 86 |     droplevels()
 87 | }
 88 | 
 89 | # plot
 90 | 
 91 | plot_obj <- ggplot(median_df,
 92 |                    aes(x = Perc.Seq,
 93 |                        y = med, # median
 94 |                        color = Platform,
 95 |                        fill = Platform)) +
 96 |   facet_grid(rows = vars(Classifier),
 97 |              cols = vars(Normalization)) +
 98 |   geom_errorbar(aes(x = Perc.Seq,
 99 |                     ymin = median_ci_lower,
100 |                     ymax = median_ci_upper),
101 |                 size = 0.25,
102 |                 width = 0.5,
103 |                 position = position_dodge(0.7)) +
104 |   geom_line(aes(group = Platform),
105 |             size = 0.5,
106 |             position = position_dodge(0.7)) + 
107 |   geom_point(shape = 16,
108 |              size = 0.5,
109 |              show.legend = FALSE,
110 |              position = position_dodge(0.7)) +
111 |   geom_point(data = kappa_df,
112 |              aes(x = Perc.Seq,
113 |                  y = Kappa,
114 |                  color = Platform,
115 |                  fill = Platform),
116 |              alpha = 0.5,
117 |              size = 0.25,
118 |              shape = 16,
119 |              position = position_dodge(0.7),
120 |              show.legend = FALSE) +
121 |   expand_limits(y = 1) +
122 |   scale_x_discrete(labels = c("0", "", "", "", "",
123 |                               "50", "", "", "", "",
124 |                               "100")) + 
125 |   labs(x = "% RNA-seq Samples in Training Data",
126 |        color = "Test Data Platform",
127 |        fill = "Test Data Platform",
128 |        y = ifelse(null_model,
129 |                   "Delta Kappa",
130 |                   "Kappa"),
131 |        title = str_c(cancer_type, predictor, sep = " ")) +
132 |   theme_bw() +
133 |   scale_fill_manual(values = cbPalette[2:3]) +
134 |   scale_colour_manual(values = cbPalette[2:3]) +
135 |   theme(legend.position = "bottom",
136 |         panel.grid.major = element_line(size = 0.25),
137 |         panel.grid.minor = element_line(size = 0.25),
138 |         strip.text.y = element_text(size = 7))
139 | 
140 | ggsave(output_filename,
141 |        plot = plot_obj,
142 |        height = 4,
143 |        width = 7.25)
144 | 


--------------------------------------------------------------------------------
/plots/scripts/6-plot_recon_error.R:
--------------------------------------------------------------------------------
 1 | # S. Foltz Feb 2022
 2 | # This plots per-gene reconstruction error (MASE) from PCA reconstruction
 3 | 
 4 | option_list <- list(
 5 |   optparse::make_option("--cancer_type",
 6 |                         default = NA_character_,
 7 |                         help = "Cancer type"),
 8 |   optparse::make_option("--predictor",
 9 |                         default = NA_character_,
10 |                         help = "Predictor used"),
11 |   optparse::make_option("--output_directory",
12 |                         default = NA_character_,
13 |                         help = "Output directory for plot (absolute or relative path)")
14 | )
15 | 
16 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
17 | source(here::here("util/option_functions.R"))
18 | check_options(opt)
19 | 
20 | # load libraries and the color palette helper
21 | suppressMessages(library(tidyverse))
22 | source(here::here("util/color_blind_friendly_palette.R"))
23 | 
24 | # set options (file_identifier prefixes the input and output file names)
25 | cancer_type <- opt$cancer_type
26 | predictor <- opt$predictor
27 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
28 | 
29 | # define directories (input is read from plots/data, output goes to --output_directory)
30 | plot.dir <- here::here("plots")
31 | plot.data.dir <- here::here("plots/data")
32 | output_directory <- opt$output_directory
33 | 
34 | # define input file
35 | input_filename <- file.path(plot.data.dir,
36 |                             paste0(file_identifier,
37 |                                    "_reconstruction_error.tsv"))
38 | 
39 | # define output file (PDF)
40 | output_filename <- file.path(output_directory,
41 |                              paste0(file_identifier,
42 |                                     "_reconstruction_error.pdf"))
43 | 
44 | # read in data; Perc.seq becomes a factor with levels 0, 10, ..., 100 and
45 | # only rows whose Mean_Value is not Inf are kept
46 | plot_df <- readr::read_tsv(input_filename,
47 |                            col_types = "cdcccd") %>%
48 |   mutate(Perc.seq = factor(Perc.seq,
49 |                            levels = seq(0, 100, 10))) %>%
50 |   filter(Mean_Value != Inf)
51 | 
52 | # for each normalization method, plot error distribution (violin) and median
53 | plot_obj <- ggplot(plot_df,
54 |                    aes(x = Perc.seq,
55 |                        y = Mean_Value,
56 |                        color = Platform,
57 |                        fill = Platform)) +
58 |   facet_wrap(~ Normalization,
59 |              ncol = 4,
60 |              scales = "free_y") +
61 |   geom_violin(position = position_dodge(0.7),
62 |               alpha = 0.25,
63 |               show.legend = FALSE) +
64 |   stat_summary(fun = median,
65 |                geom = "line",
66 |                aes(group = Platform),
67 |                position = position_dodge(0.7)) +
68 |   stat_summary(fun = median,
69 |                geom = "point",
70 |                aes(group = Platform),
71 |                position = position_dodge(0.7),
72 |                size = 1,
73 |                shape = 16) +
74 |   expand_limits(y = 0) +
75 |   scale_x_discrete(labels = c("0", "", "", "", "",
76 |                               "50", "", "", "", "",
77 |                               "100")) + 
78 |   labs(x = "% RNA-seq Samples in Training Data",
79 |        color = "Test Data Platform",
80 |        fill = "Test Data Platform",
81 |        y = "MASE (per gene)",
82 |        title = str_c("PCA reconstruction error of",
83 |                      cancer_type, predictor, sep = " ")) +
84 |   theme_bw() +
85 |   scale_fill_manual(values = cbPalette[2:3]) + # violin fill matches outline
86 |   scale_colour_manual(values = cbPalette[2:3]) +
87 |   theme(legend.position = "bottom")
88 | 
89 | ggsave(output_filename,
90 |        plot = plot_obj,
91 |        height = 4, width = 7.25)
92 | 


--------------------------------------------------------------------------------
/plots/scripts/6-plot_recon_kappa.R:
--------------------------------------------------------------------------------
  1 | # S. Foltz Feb 2022
  2 | # This plots reconstruction kappa values from PCA reconstruction
  3 | 
  4 | option_list <- list(
  5 |   optparse::make_option("--cancer_type",
  6 |                         default = NA_character_,
  7 |                         help = "Cancer type"),
  8 |   optparse::make_option("--predictor",
  9 |                         default = NA_character_,
 10 |                         help = "Predictor used"),
 11 |   optparse::make_option("--output_directory",
 12 |                         default = NA_character_,
 13 |                         help = "Output directory for plot (absolute or relative path)")
 14 | )
 15 | 
 16 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 17 | source(here::here("util/option_functions.R"))
 18 | check_options(opt)
 19 | 
 20 | # load libraries and the color palette helper
 21 | suppressMessages(library(tidyverse))
 22 | source(here::here("util/color_blind_friendly_palette.R"))
 23 | 
 24 | # set options (file_identifier prefixes the input and output file names)
 25 | cancer_type <- opt$cancer_type
 26 | predictor <- opt$predictor
 27 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 28 | 
 29 | # define directories (input is read from plots/data, output goes to --output_directory)
 30 | plot.dir <- here::here("plots")
 31 | plot.data.dir <- here::here("plots/data")
 32 | output_directory <- opt$output_directory
 33 | 
 34 | # define input file
 35 | input_filename <- file.path(plot.data.dir,
 36 |                             paste0(file_identifier,
 37 |                                    "_kappa_reconstructed_data.tsv"))
 38 | 
 39 | # define output file (PDF)
 40 | output_filename <- file.path(output_directory,
 41 |                              paste0(file_identifier,
 42 |                                     "_kappa_reconstructed.pdf"))
 43 | 
 44 | # read in data once; kappa_df keeps every individual kappa value
 45 | 
 46 | kappa_df <- read_tsv(input_filename,
 47 |                      col_types = "dcccdcc") %>%
 48 |   mutate(Perc.seq = factor(Perc.seq,
 49 |                            levels = seq(0, 100, 10)))
 50 | 
 51 | # summarize values per setting: median kappa and an interval around it
 52 | # (median +/- 1.58 * IQR / sqrt(n), the notch formula of boxplot.stats());
 53 | # derived from kappa_df so summary and raw points share one parsed table
 54 | median_df <- kappa_df %>%
 55 |   group_by(Perc.seq, Platform, Classifier, Normalization) %>%
 56 |   summarize(n_obs = n(),
 57 |             med = median(Kappa),
 58 |             IQR = quantile(Kappa, 0.75) - quantile(Kappa, 0.25),
 59 |             median_ci_upper = med + 1.58*IQR/sqrt(n_obs),
 60 |             median_ci_lower = med - 1.58*IQR/sqrt(n_obs),
 61 |             .groups = "drop")
 62 | 
 63 | # plot median kappa with its interval (median_df) plus individual kappa values (kappa_df); rows = classifier, columns = normalization
 64 | plot_obj <- ggplot(median_df,
 65 |                    aes(x = Perc.seq,
 66 |                        y = med, # median
 67 |                        color = Platform,
 68 |                        fill = Platform)) +
 69 |   facet_grid(rows = vars(Classifier),
 70 |              cols = vars(Normalization)) +
 71 |   geom_errorbar(aes(x = Perc.seq,
 72 |                     ymin = median_ci_lower,
 73 |                     ymax = median_ci_upper),
 74 |                 size = 0.25,
 75 |                 width = 0.5,
 76 |                 position = position_dodge(0.7)) +
 77 |   geom_line(aes(group = Platform),
 78 |             size = 0.5,
 79 |             position = position_dodge(0.7)) + 
 80 |   geom_point(shape = 16,
 81 |              size = 0.5,
 82 |              show.legend = FALSE,
 83 |              position = position_dodge(0.7)) +
 84 |   geom_point(data = kappa_df, # individual (unsummarized) kappa values
 85 |              aes(x = Perc.seq,
 86 |                  y = Kappa,
 87 |                  color = Platform,
 88 |                  fill = Platform),
 89 |              alpha = 0.5,
 90 |              size = 0.25,
 91 |              shape = 16,
 92 |              position = position_dodge(0.7),
 93 |              show.legend = FALSE) +
 94 |   expand_limits(y = 1) +
 95 |   scale_x_discrete(labels = c("0", "", "", "", "", # label only 0, 50 and 100
 96 |                               "50", "", "", "", "",
 97 |                               "100")) + 
 98 |   labs(x = "% RNA-seq Samples in Training Data",
 99 |        color = "Test Data Platform",
100 |        fill = "Test Data Platform",
101 |        y = "Kappa",
102 |        title = str_c("PCA reconstruction of",
103 |                      cancer_type, predictor, sep = " ")) +
104 |   theme_bw() +
105 |   scale_colour_manual(values = cbPalette[2:3]) +
106 |   theme(legend.position = "bottom",
107 |         panel.grid.major = element_line(size = 0.25),
108 |         panel.grid.minor = element_line(size = 0.25),
109 |         strip.text.y = element_text(size = 7))
110 | 
111 | ggsave(output_filename,
112 |        plot = plot_obj,
113 |        height = 4,
114 |        width = 7.25)
115 | 


--------------------------------------------------------------------------------
/plots/scripts/7-plot_plier_pathways.R:
--------------------------------------------------------------------------------
 1 | # S. Foltz Feb 2022
 2 | # This plots the rate of return for significant PLIER pathways
 3 | # for data coming from different normalization methods and titration levels.
 4 | 
 5 | option_list <- list(
 6 |   optparse::make_option("--cancer_type",
 7 |                         default = NA_character_,
 8 |                         help = "Cancer type"),
 9 |   optparse::make_option("--predictor",
10 |                         default = NA_character_,
11 |                         help = "Predictor used"),
12 |   optparse::make_option("--output_directory",
13 |                         default = NA_character_,
14 |                         help = "Save plot to this directory")
15 | )
16 | 
17 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
18 | source(here::here("util/option_functions.R"))
19 | check_options(opt)
20 | 
21 | # load libraries and the color palette helper
22 | suppressMessages(library(tidyverse))
23 | source(here::here("util/color_blind_friendly_palette.R"))
24 | 
25 | # set options (file_identifier prefixes the input and output file names)
26 | cancer_type <- opt$cancer_type
27 | predictor <- opt$predictor
28 | output_directory <- opt$output_directory
29 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
30 | 
31 | # define directories (input is read from plots/data)
32 | plot.dir <- here::here("plots")
33 | plot.data.dir <- here::here("plots/data")
34 | 
35 | # define input file
36 | 
37 | plot_data_filename <- file.path(
38 |   plot.data.dir,
39 |   str_c(file_identifier, "_PLIER_jaccard.tsv")
40 | )
41 | 
42 | # define output file (PDF)
43 | 
44 | output_filename <- file.path(output_directory,
45 |                              str_c(file_identifier,
46 |                                    "_PLIER_jaccard.pdf"))
47 | 
48 | # sample size levels (the sample_size factor below is ordered as listed here)
49 | sample_size_levels <- c("Single Platform\n(half sample size)",
50 |                         "Combined Array and RNA-seq (full sample size)",
51 |                         "Single Platform\n(full sample size)")
52 | 
53 | # Read in data, then derive the sample size group and the display label
54 | jaccard_df <- read_tsv(plot_data_filename,
55 |                        col_types = "dddddcdcddl") %>%
56 |   mutate(sample_size = case_when(
57 |            nmeth %in% c("array_only", "seq_only") ~ sample_size_levels[1],
58 |            pseq %in% c(0, 100) ~ sample_size_levels[3],
59 |            TRUE ~ sample_size_levels[2]),
60 |          sample_size = factor(sample_size, levels = sample_size_levels,
61 |                               ordered = TRUE)) %>%
62 |   # relabel nmeth only after sample_size is set, because the rules above
63 |   # test the original nmeth values
64 |   mutate(nmeth = case_when(nmeth == "array_only" ~ "LOG\nArray",
65 |                            nmeth == "seq_only" ~ "LOG\nRNA-seq",
66 |                            pseq == 0 ~ "LOG\nArray",
67 |                            pseq == 100 ~ "LOG\nRNA-seq",
68 |                            TRUE ~ str_to_upper(nmeth)))
69 | # Plot results: one violin (median line drawn) plus jittered points per label
70 | set.seed(1) # geom_jitter() is random; a fixed seed keeps point positions reproducible
71 | 
72 | plot_obj <- jaccard_df %>%
73 |   ggplot(aes(x = nmeth,
74 |              y = jaccard)) +
75 |   geom_violin(draw_quantiles = .5,
76 |               scale = "width") +
77 |   geom_jitter(shape = 16,
78 |               alpha = 0.5,
79 |               height = 0, # jitter horizontally only, y values stay exact
80 |               width = 0.1) +
81 |   expand_limits(y = 0) +
82 |   facet_grid(. ~ sample_size, # one column of panels per sample size group
83 |              scales = "free_x",
84 |              space='free') +
85 |   ggtitle(cancer_type) +
86 |   xlab("Normalization Method") +
87 |   ylab("Proportion of Pathways Significant") +
88 |   theme_bw()
89 | 
90 | ggsave(output_filename,
91 |        plot = plot_obj,
92 |        height = 4, width = 7.25)
93 | 


--------------------------------------------------------------------------------
/plots/scripts/recon_kappa_difference.R:
--------------------------------------------------------------------------------
  1 | # S. Foltz Mar 2022
  2 | # This compares kappa values from category prediction with/out reconstruction
  3 | 
  4 | option_list <- list(
  5 |   optparse::make_option("--cancer_type",
  6 |                         default = NA_character_,
  7 |                         help = "Cancer type"),
  8 |   optparse::make_option("--output_directory",
  9 |                         default = NA_character_,
 10 |                         help = "Output directory for plot (absolute or relative path)")
 11 | )
 12 | 
 13 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 14 | source(here::here("util/option_functions.R"))
 15 | check_options(opt)
 16 | 
 17 | # load libraries and the color palette helper
 18 | suppressMessages(library(tidyverse))
 19 | source(here::here("util/color_blind_friendly_palette.R"))
 20 | 
 21 | # set options (predictor is fixed to "subtype"; there is no --predictor option here)
 22 | cancer_type <- opt$cancer_type
 23 | predictor <- "subtype"
 24 | file_identifier <- str_c(cancer_type, predictor, sep = "_")
 25 | 
 26 | # define directories (input is read from plots/data, output goes to --output_directory)
 27 | plot.dir <- here::here("plots")
 28 | plot.data.dir <- here::here("plots/data")
 29 | output_directory <- opt$output_directory
 30 | 
 31 | # define input files: kappa values without and with reconstruction
 32 | without_recon_input_filename <- file.path(plot.data.dir,
 33 |                                           paste0(file_identifier,
 34 |                                                  "_train_3_models_kappa.tsv"))
 35 | with_recon_input_filename <- file.path(plot.data.dir,
 36 |                                        paste0(file_identifier,
 37 |                                               "_kappa_reconstructed_data.tsv"))
 38 | 
 39 | # define output file (PDF)
 40 | output_filename <- file.path(output_directory,
 41 |                              paste0(file_identifier,
 42 |                                            "_kappa_reconstruction_difference.pdf"))
 43 | 
 44 | # read in data; each col_types string must have one letter per file column
 45 | without_df <- read_tsv(without_recon_input_filename,
 46 |                        col_types = "dddddccc") %>% # same spec 3-plot_category_kappa.R uses for this file
 47 |   mutate(Perc.Seq = factor(Perc.Seq,
 48 |                            levels = seq(0, 100, 10)))
 49 | # this file names the column Perc.seq; store it as Perc.Seq to match without_df
 50 | with_df <- read_tsv(with_recon_input_filename,
 51 |                     col_types = "dcccdcc") %>%
 52 |   mutate(Perc.Seq = factor(Perc.seq,
 53 |                            levels = seq(0, 100, 10)))
 54 | 
 55 | # get data summary (median kappa at each setting)
 56 | # a setting is one combination of Perc.Seq, Classifier, Normalization, Platform
 57 | without_summary_df <- without_df %>%
 58 |   group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
 59 |   summarize(median_without = median(Kappa),
 60 |             .groups = "drop")
 61 | 
 62 | with_summary_df <- with_df %>%
 63 |   filter(Reconstruction == "PCA", # keep only PCA reconstruction kappa rows
 64 |          Measure == "kappa") %>%
 65 |   group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
 66 |   summarize(median_with = median(Kappa),
 67 |             .groups = "drop")
 68 | 
 69 | # combine data frames and calculate difference in median kappas
 70 | # (settings without a reconstruction median give NA and are dropped)
 71 | joint_df <- without_summary_df %>%
 72 |   left_join(with_summary_df,
 73 |             by = c("Perc.Seq", "Classifier", "Normalization", "Platform")) %>%
 74 |   mutate(kappa_difference = median_without - median_with) %>%
 75 |   filter(!is.na(kappa_difference))
 76 | 
 77 | # plot difference in median kappa; rows = classifier, columns = normalization
 78 | 
 79 | plot_obj <- ggplot(joint_df,
 80 |                    aes(x = Perc.Seq,
 81 |                        y = kappa_difference,
 82 |                        color = Platform,
 83 |                        fill = Platform)) +
 84 |   facet_grid(rows = vars(Classifier),
 85 |              cols = vars(Normalization)) +
 86 |   stat_summary(fun = median,
 87 |                geom = "line",
 88 |                aes(group = Platform),
 89 |                position = position_dodge(0.7)) +
 90 |   stat_summary(fun = median,
 91 |                geom = "point",
 92 |                aes(group = Platform),
 93 |                position = position_dodge(0.7),
 94 |                size = 1,
 95 |                shape = 16) +
 96 |   scale_x_discrete(labels = c("0", "", "", "", "", # label only 0, 50 and 100
 97 |                               "50", "", "", "", "",
 98 |                               "100")) + 
 99 |   labs(x = "% RNA-seq Samples in Training Data",
100 |        color = "Test Data Platform",
101 |        fill = "Test Data Platform",
102 |        y = "Difference in Kappa\n(No Reconstruction - Reconstruction)",
103 |        title = str_c(cancer_type, predictor, "(reconstruction difference)", sep = " ")) +
104 |   theme_bw() +
105 |   scale_colour_manual(values = cbPalette[2:3]) +
106 |   theme(legend.position = "bottom")
107 | 
108 | ggsave(output_filename,
109 |        plot = plot_obj,
110 |        height = 4,
111 |        width = 7.25)
112 | 


--------------------------------------------------------------------------------
/plots/scripts/visualize_expression.R:
--------------------------------------------------------------------------------
  1 | # S. Foltz Oct 2021
  2 | # The purpose of this script is to visualize normalized gene expression
  3 | # and compare values from matched microarray and RNA-seq samples
  4 | # USAGE: Rscript visualize_expression.R --cancer_type --predictor --null_model --seed
  5 | 
  6 | option_list <- list(
  7 |   optparse::make_option("--cancer_type",
  8 |                         default = NA_character_,
  9 |                         help = "Cancer type"),
 10 |   optparse::make_option("--predictor",
 11 |                         default = NA_character_,
 12 |                         help = "Predictor used"),
 13 |   optparse::make_option("--null_model",
 14 |                         action = "store_true",
 15 |                         default = FALSE,
 16 |                         help = "Refer to models with permuted dependent variable (within subtype if predictor is a gene)"),
 17 |   optparse::make_option("--seed",
 18 |                         default = 1234,
 19 |                         help = "Set a seed to ensure reproducible results when subsampling genes")
 20 | )
 21 | 
 22 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
 23 | source(here::here("util/option_functions.R"))
 24 | check_options(opt)
 25 | 
 26 | # load libraries and the color palette helper
 27 | suppressMessages(library(tidyverse))
 28 | source(here::here("util", "color_blind_friendly_palette.R"))
 29 | 
 30 | # set options (file_identifier gets a "_null" suffix when --null_model is set)
 31 | cancer_type <- opt$cancer_type
 32 | predictor <- opt$predictor
 33 | null_model <- opt$null_model
 34 | file_identifier <- ifelse(null_model,
 35 |                           str_c(cancer_type, predictor, "null", sep = "_"),
 36 |                           str_c(cancer_type, predictor, sep = "_"))
 37 | 
 38 | # set seed (the gene subsample drawn with sample() below depends on it)
 39 | set.seed(opt$seed)
 40 | 
 41 | # define directories (PDFs are written to plots/visualize_expression)
 42 | plot.dir <- here::here("plots")
 43 | norm.dir <- here::here("normalized_data")
 44 | viz.dir <- file.path(plot.dir, "visualize_expression")
 45 | 
 46 | # define input files (the first matching file is used; stop early when none matches)
 47 | normalized_test_data_filename <- list.files(norm.dir,
 48 |                                             pattern = str_c(file_identifier,
 49 |                                                             "_array_seq_test_data_normalized_list_"),
 50 |                                             full.names = TRUE)[1]
 51 | if (is.na(normalized_test_data_filename)) stop("No normalized test data file found in ", norm.dir, " for ", file_identifier)
 52 | normalized_test_data <- read_rds(normalized_test_data_filename)
 53 | 
 54 | ### functions ------------------------------------------------------------------
 55 | 
 56 | plot_matched_expression <- function(array_values, seq_values,
 57 |                                     method_title, plot_type,
 58 |                                     output_directory, filename_lead) {
 59 |   # Draw expression values of matched array and RNA-seq samples against
 60 |   # each other and write the figure to a PDF.
 61 |   #
 62 |   # Arguments:
 63 |   #   array_values: vector of array values (x-axis)
 64 |   #   seq_values: vector of seq values matched to array_values (y-axis)
 65 |   #   method_title: plot title; also part of the output filename
 66 |   #   plot_type: 'point' (points with alpha = 0.1) or 'hex' (hex grid)
 67 |   #   output_directory: directory the PDF is written to
 68 |   #   filename_lead: start of the output filename
 69 |   # Value:
 70 |   #   whatever ggsave() returns; called for its side effect, which is a PDF
 71 |   #   named <filename_lead>.<method_title>.<plot_type>.pdf
 72 | 
 73 |   # choose the layer that shows the data; anything else is an error
 74 |   if (plot_type == "point") {
 75 |     data_layer <- geom_point(alpha = 0.1,
 76 |                              shape = 16)
 77 |   } else if (plot_type == "hex") {
 78 |     data_layer <- geom_hex()
 79 |   } else {
 80 |     stop("Plot type must be 'point' or 'hex'.")
 81 |   }
 82 | 
 83 |   # data layer first, then the reference line and fitted curve on top of it
 84 |   matched_plot <- ggplot(mapping = aes(x = array_values,
 85 |                                        y = seq_values)) +
 86 |     data_layer +
 87 |     geom_abline(lty = 2, # dashed red x-y line
 88 |                 color = "red") +
 89 |     geom_smooth(method = "gam", # fit a curve to the data
 90 |                 formula = y ~ s(x, bs = "cs")) + # loess no good for large n
 91 |     labs(x = "Microarray expression values",
 92 |          y = "RNA-seq expression values",
 93 |          title = method_title) +
 94 |     theme_minimal()
 95 | 
 96 |   # every method except "UN" is drawn with a fixed aspect ratio
 97 |   if (method_title != "UN") {
 98 |     matched_plot <- matched_plot + coord_fixed()
 99 |   }
100 | 
101 |   pdf_name <- str_c(filename_lead,
102 |                     method_title,
103 |                     plot_type,
104 |                     "pdf",
105 |                     sep = ".")
106 |   ggsave(plot = matched_plot,
107 |          filename = file.path(output_directory, pdf_name),
108 |          height = 7.25,
109 |          width = 7.25)
110 | }
111 | 
112 | #### plot matched comparison of matched microarray and RNA-seq -----------------
113 | # sample gene rows once so that every method is plotted on the same genes
114 | gene_rows_included <- sort(sample(seq_len(nrow(normalized_test_data$array$log)),
115 |                                   size = min(1000, nrow(normalized_test_data$array$log)), # up to 1000 random genes
116 |                                   replace = FALSE))
117 | 
118 | norm_methods <- names(normalized_test_data$seq) # get all normalization methods
119 | 
120 | for (nm in norm_methods) { # one hex plot (or one per %RNA-seq level) for each method
121 |   
122 |   if (nm %in% c("seurat")) next # seurat is not plotted
123 |   
124 |   if (nm == "tdm") {
125 |     # array has no TDM (it is already log), so TDM seq values are compared with LOG array values
126 |     array_values <- as.vector(as.matrix(normalized_test_data$array[["log"]][gene_rows_included, -1]))
127 |     for (pct_rna_seq in as.character(seq(0, 90, 10))) { # NULL at 100% RNA-seq
128 |       # only seq varies across %RNA-seq
129 |       seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][[pct_rna_seq]][gene_rows_included, -1]))
130 |       method_title <- str_c(str_to_upper(nm), pct_rna_seq, sep = "_")
131 |       
132 |       plot_matched_expression(array_values, seq_values,
133 |                               method_title, plot_type = "hex",
134 |                               viz.dir, file_identifier)
135 |       
136 |     } 
137 |   } else if (nm %in% c("qn", "qn-z")) {
138 |     array_values <- as.vector(as.matrix(normalized_test_data$array[[nm]][gene_rows_included, -1]))
139 |     for (pct_rna_seq in as.character(seq(0, 100, 10))) {
140 |       # only seq varies across %RNA-seq
141 |       seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][[pct_rna_seq]][gene_rows_included, -1]))
142 |       method_title <- str_c(str_to_upper(nm), pct_rna_seq, sep = "_")
143 |       
144 |       plot_matched_expression(array_values, seq_values,
145 |                               method_title, plot_type = "hex",
146 |                               viz.dir, file_identifier)
147 |       
148 |     }
149 |   } else { # test data for normalization methods that do not vary with RNA-seq % in training data
150 |     array_values <- as.vector(as.matrix(normalized_test_data$array[[nm]][gene_rows_included, -1]))
151 |     seq_values <- as.vector(as.matrix(normalized_test_data$seq[[nm]][gene_rows_included, -1]))
152 |     method_title <- str_to_upper(nm)
153 |     
154 |     plot_matched_expression(array_values, seq_values,
155 |                             method_title, plot_type = "hex",
156 |                             viz.dir, file_identifier)
157 |     
158 |   }
159 | }
160 | 


--------------------------------------------------------------------------------
/plots/supplementary/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/supplementary/.empty


--------------------------------------------------------------------------------
/plots/visualize_expression/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/plots/visualize_expression/.empty


--------------------------------------------------------------------------------
/prepare_GBM_data.R:
--------------------------------------------------------------------------------
  1 | # Script prepares GBM expression data for use in pipelines
  2 | # For GBM array data: convert sample names and remove duplicate individuals
  3 | # For GBM seq data: filter all seq data for GBM samples only, convert gene IDs
  4 | 
  5 | # Steven Foltz July 2021
  6 | 
  7 | option_list <- list(
  8 |   optparse::make_option("--seq_input",
  9 |                         default = NA_character_,
 10 |                         help = "TCGA sequencing expression input file path"),
 11 |   optparse::make_option("--array_input",
 12 |                         default = NA_character_,
 13 |                         help = "refine.bio microarray expression input file path"),
 14 |   optparse::make_option("--metadata_input",
 15 |                         default = NA_character_,
 16 |                         help = "refine.bio aggregated metadata JSON file path"),
 17 |   optparse::make_option("--array_output",
 18 |                         default = NA_character_,
 19 |                         help = "Processed microarray data output file path"),
 20 |   optparse::make_option("--seq_output",
 21 |                         default = NA_character_,
 22 |                         help = "Processed sequencing data output file path"),
 23 |   optparse::make_option("--clinical_input",
 24 |                         default = NA_character_,
 25 |                         help = "Clinical information input file path (Excel file)"),
 26 |   optparse::make_option("--clinical_output",
 27 |                         default = NA_character_,
 28 |                         help = "Clinical information output file path (.tsv)"),
 29 |   optparse::make_option("--overwrite",
 30 |                         action = "store_true",
 31 |                         default = FALSE,
 32 |                         help = "Overwrite existing output files [default: %default]")
 33 | )
 34 | 
 35 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list)) # parse command line arguments into a named list
 36 | source(here::here("util/option_functions.R"))
 37 | check_options(opt) # NOTE(review): presumably defined in the file sourced above and validates the parsed options -- confirm
 38 | 
 39 | # load libraries (startup messages are silenced to keep the command line output clean)
 40 | suppressMessages(library(tidyverse))
 41 | 
 42 | # set options: give each parsed command line option a descriptive variable name
 43 | tcga_seq_expression_input_filepath <- opt$seq_input
 44 | gbm_array_expression_input_filepath <- opt$array_input
 45 | metadata_json_input_filepath <- opt$metadata_input
 46 | gbm_array_output_filepath <- opt$array_output
 47 | gbm_seq_output_filepath <- opt$seq_output
 48 | clinical_xlxs_input_filepath <- opt$clinical_input # Excel file, read with readxl::read_xlsx() further down
 49 | clinical_tsv_output_filepath <- opt$clinical_output
 50 | 
 51 | ################################################################################
 52 | # Array data
 53 | ################################################################################
 54 | 
 55 | # read in refine.bio GBM array expression data (Gene is read as character, every other column as double)
 56 | gbm_array_expression <- read_tsv(gbm_array_expression_input_filepath,
 57 |                                  col_types = cols(
 58 |                                    .default = col_double(),
 59 |                                    Gene = col_character()
 60 |                                  ))
 61 | 
 62 | # load up aggregated metadata json file (simplifyVector = FALSE keeps it as nested lists)
 63 | metadata_json <- jsonlite::fromJSON(metadata_json_input_filepath,
 64 |                                     simplifyVector = FALSE)
 65 | 
 66 | # accession IDs present in expression data
 67 | available_array_accession_ids <- colnames(gbm_array_expression)[-1]
 68 | 
 69 | # All about TCGA barcodes
 70 | # TCGA barcodes are defined in this format: TCGA-XX-YYYY-ZZ*
 71 | # XX is the two character tissue source site (TSS) (defines a combination of hospital system and cancer type -- so multiple TSSs map to single cancer type)
 72 | # YYYY is the four digit participant ID specific to a TSS (so TCGA-XX-YYYY defines an individual patient -- patients from different TSSs may have same participant ID)
 73 | # ZZ is the sample type (starts with 0 for tumor samples; specifically 01 for primary solid tumors)
 74 | # After ZZ (*) is more specific information not relevant in this context
 75 | # Links to more info:
 76 | # https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
 77 | # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes
 78 | # https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
 79 | 
 80 | # starting with flattened metadata, parse out raw TCGA IDs and filter for tumors; NOTE(review): assumes the 9th characteristics_ch1 entry is "sample: <barcode>" for every sample -- confirm
 81 | all_array_tumor_samples <- tibble(accession = names(metadata_json$samples)) %>%
 82 |   filter(accession %in% available_array_accession_ids) %>% # keep only accessions that have an expression column
 83 |   rowwise() %>% # evaluate the list lookup below one accession at a time
 84 |   mutate(tcga_id_raw = metadata_json$samples[[accession]]$refinebio_annotations[[1]]$characteristics_ch1[[9]] %>%
 85 |            str_remove("sample: ")) %>%
 86 |   # tcga_id_raw is the entire TCGA barcode (includes information after ZZ which we need to drop)
 87 |   mutate(tcga_id = str_sub(tcga_id_raw, 1, 15), # so tcga_id includes TCGA-XX-YYYY-ZZ
 88 |          sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample refers to ZZ
 89 |   filter(sample == "01") %>% # filter for primary solid tumors only
 90 |   ungroup()
 91 | 
 92 | # keep one accession per TCGA ID (the first in sort order, so the choice is reproducible)
 93 | array_accession_tcga_id_keep <- all_array_tumor_samples %>%
 94 |   group_by(tcga_id) %>%
 95 |   summarize(accession = sort(accession)[1]) %>%
 96 |   ungroup()
 97 | # the accession/tcga_id pairs in this table drive both the column selection and the rename below
 98 | 
 99 | # select columns in table order so the positional rename lines up; the gene column is renamed from Gene to sample
100 | gbm_array_expression_renamed <- gbm_array_expression %>%
101 |   select(all_of(c("Gene",
102 |                   array_accession_tcga_id_keep$accession)))
103 | colnames(gbm_array_expression_renamed) <- c("sample",
104 |                                             array_accession_tcga_id_keep$tcga_id)
105 | 
106 | ################################################################################
107 | # Sequencing data
108 | ################################################################################
109 | 
110 | # read in column names of entire TCGA seq expression file (n_max = 0 reads the header but no data rows)
111 | tcga_seq_expression_column_names <- read_tsv(tcga_seq_expression_input_filepath,
112 |                                              col_types = cols(
113 |                                                .default = col_double(),
114 |                                                gene_id = col_character()),
115 |                                              n_max = 0) %>%
116 |   names()
117 | 
118 | # identify sequencing TCGA IDs of samples present in array data
119 | # (a more inclusive approach would be selecting GBM samples based on TSS codes)
120 | gbm_seq_tumor_samples <- tibble(tcga_id_raw = tcga_seq_expression_column_names[-1]) %>% # [-1] drops the first (gene id) column name
121 |   mutate(tcga_patient = str_sub(tcga_id_raw, 1, 12), # TCGA-XX-YYYY to match with clinical
122 |          tcga_id = str_sub(tcga_id_raw, 1, 15), # as with array, tcga_id refers to TCGA-XX-YYYY-ZZ
123 |          sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample is ZZ
124 |   filter(sample == "01") %>% # require sample to be primary solid tumor
125 |   filter(tcga_id %in% array_accession_tcga_id_keep$tcga_id) %>% # keep array GBMs
126 |   group_by(tcga_patient) %>% # one output row per patient
127 |   summarize(tcga_id_raw = sort(tcga_id_raw)[1], # keep one raw ID per person
128 |             tcga_id = str_sub(tcga_id_raw, 1, 15)) %>% # recomputed from the single raw ID kept on the line above
129 |   ungroup()
130 | 
131 | # now read in GBM subset of entire TCGA seq expression file
132 | # this is faster and uses less memory than reading in entire file and then subsetting
133 | # TRUE/FALSE per sample column: TRUE for the GBM columns to read
134 | tcga_seq_gbm_tf <- tcga_seq_expression_column_names[-1] %in% gbm_seq_tumor_samples$tcga_id_raw
135 | # column types: 'c' for gene_id, then '-' (skip column when reading) for non-GBM and 'd' for GBM
136 | tcga_seq_gbm_col_types <- str_c(c("c", c("-", "d")[tcga_seq_gbm_tf + 1]), collapse = "")
137 | # read in my defined subset of columns with column types
138 | gbm_seq_expression <- read_tsv(tcga_seq_expression_input_filepath,
139 |                                col_types = tcga_seq_gbm_col_types)
140 | # retained columns keep their order in the file, which is not guaranteed to match the
141 | # patient-sorted rows of gbm_seq_tumor_samples, so shorten each column's own raw
142 | # barcode to TCGA-XX-YYYY-ZZ instead of assigning the tcga_id vector by position
143 | colnames(gbm_seq_expression) <- c("gene_id", str_sub(colnames(gbm_seq_expression)[-1], 1, 15))
144 | 
145 | # Detour to make gene ids consistent between array and seq files
146 | # will convert seq format (SYMBOL|ENTREZ) to array format (ENSG)
147 | 
148 | # separate gene symbols from entrez ids (delimiter = "|")
149 | symbol_entrez_ids <- gbm_seq_expression %>%
150 |   select(gene_id) %>%
151 |   separate(gene_id,
152 |            into = c("SYMBOL", "ENTREZID"),
153 |            sep = "\\|",
154 |            remove = FALSE) # keep the original gene_id column; it is the join key back to the expression data
155 | 
156 | # map entrez ids to ensembl ids (GENEID)
157 | entrez_ensembl_ids <- ensembldb::select(EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86,
158 |                                         keys= symbol_entrez_ids$ENTREZID,
159 |                                         keytype = "ENTREZID",
160 |                                         columns = "GENEID") %>%
161 |   as_tibble() %>%
162 |   mutate(ENTREZID = as.character(ENTREZID)) # character, to match the ENTREZID column produced by separate() above
163 | 
164 | # collate gene name schemes
165 | # filter for those that mapped and exist in array
166 | # filter for ENSGs that occur exactly once after the join (i.e. map to a single entrez id)
167 | gene_id_mapping_in_array <- symbol_entrez_ids %>%
168 |   left_join(entrez_ensembl_ids,
169 |             by = "ENTREZID") %>%
170 |   filter(!is.na(GENEID)) %>% # drop entrez ids without an ensembl match
171 |   add_count(GENEID) %>% # adds column n = number of rows sharing each GENEID
172 |   filter(n == 1) %>% # NOTE(review): does not drop entrez ids that map to several ENSGs -- confirm this is intended
173 |   filter(GENEID %in% gbm_array_expression$Gene) %>% # keep genes that are also present in the array data
174 |   select(gene_id, GENEID)
175 | 
176 | # starting with acceptable genes, left join with seq expression and select cols
177 | gbm_seq_expression_renamed <- gene_id_mapping_in_array %>%
178 |   left_join(gbm_seq_expression,
179 |             by = "gene_id") %>%
180 |   select(-gene_id) %>%
181 |   rename("sample" = "GENEID") # ensembl ids become the first column, named sample as in the array output
182 | 
183 | ################################################################################
184 | # subtype information
185 | ################################################################################
186 | 
187 | # read in Table S7 from flagship GBM landscape paper (Brennan et al., Cell 2013)
188 | # join to the selected seq samples, then select and rename interesting columns
189 | gbm_subtypes <- readxl::read_xlsx(path = clinical_xlxs_input_filepath,
190 |                                   sheet = "Clinical Data",
191 |                                   skip = 1) %>% # skip the first row of the sheet before reading column names
192 |   right_join(gbm_seq_tumor_samples,
193 |              by = c("Case ID" = "tcga_patient")) %>% # right join keeps every selected seq sample, matched or not
194 |   select("tcga_id",
195 |          "MGMT Status",
196 |          "G-CIMP\r\n methylation", # these column headers contain embedded line breaks
197 |          "IDH1\r\n status",
198 |          "Expression\r\nSubclass") %>%
199 |   rename("Sample" = "tcga_id",
200 |          "MGMT_methylation_status" = "MGMT Status",
201 |          "G-CIMP_methylation" = "G-CIMP\r\n methylation",
202 |          "IDH1_mutation_status" = "IDH1\r\n status",
203 |          "subtype" = "Expression\r\nSubclass") %>%
204 |   mutate(subtype = na_if(subtype, "NA")) %>% # literal "NA" strings become real missing values
205 |   mutate(subtype = stringr::str_remove(subtype, "-")) %>% # G-CIMP to GCIMP
206 |   mutate(Type = "tumor")
207 | 
208 | missing_clinical <- gbm_subtypes %>% # TCGA IDs (TCGA-XX-YYYY-ZZ) of samples without a subtype
209 |   filter(is.na(subtype)) %>%
210 |   pull(Sample)
211 | 
212 | ################################################################################
213 | # Write to file, excluding samples without clinical info
214 | ################################################################################
215 | 
216 | # array expression, minus the columns of samples lacking a subtype
217 | gbm_array_expression_renamed %>%
218 |   select(-all_of(missing_clinical)) %>%
219 |   write_tsv(gbm_array_output_filepath)
220 | # sequencing expression, minus the same sample columns
221 | gbm_seq_expression_renamed %>%
222 |   select(-all_of(missing_clinical)) %>%
223 |   write_tsv(gbm_seq_output_filepath)
224 | # clinical table, restricted to rows with a known subtype
225 | gbm_subtypes %>% filter(!is.na(subtype)) %>%
226 |   write_tsv(clinical_tsv_output_filepath)
227 | 


--------------------------------------------------------------------------------
/results/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/.empty


--------------------------------------------------------------------------------
/results/array_rnaseq_ratio/ratio.2022-02-18_18_50_01_UTC.tsv:
--------------------------------------------------------------------------------
1 | Platform	GEO	AE	Total
2 | Array	1163755	207117	1370872
3 | RNA-seq	1078052	134243	1212295
4 | 


--------------------------------------------------------------------------------
/results/array_rnaseq_ratio/ratio.tracking.tsv:
--------------------------------------------------------------------------------
1 | File:ratio.2022-02-18_18_50_01_UTC.tsv	Date:2022-02-18_18_50_01_UTC	Array_to_RNA-seq_ratio:1.130807270507591
2 | 


--------------------------------------------------------------------------------
/results/differential_expression/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/differential_expression/.empty


--------------------------------------------------------------------------------
/results/reconstructed_data/.empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/greenelab/RNAseq_titration_results/2d9b2f5ce2f04fc76279b2fc314e51abc9a4f3b4/results/reconstructed_data/.empty


--------------------------------------------------------------------------------
/retrieve_MC3_mutations.py:
--------------------------------------------------------------------------------
  1 | import gzip
  2 | import os
  3 | import sys
  4 | 
  5 | # define output filepaths
  6 | output_tsv_filepath = "data/mutations.tsv"
  7 | output_maf_filepath = "data/mutations.maf"
  8 | 
  9 | # define directories
 10 | data_dir = "data"
 11 | 
 12 | # TCGA MC3 MAF from https://gdc.cancer.gov/about-data/publications/pancanatlas
 13 | mc3_filename = os.path.join(data_dir, "mc3.v0.2.8.PUBLIC.maf.gz")
 14 | 
 15 | # cancer types and genes of interest
 16 | cancer_type_abbrevs = {"Breast invasive carcinoma": "BRCA",
 17 |                        "Glioblastoma multiforme": "GBM"}
 18 | cancer_types_of_interest = cancer_type_abbrevs.keys()
 19 | genes_of_interest = ["PIK3CA", "TP53"]
 20 | 
 21 | ############################################################
 22 | # Tissue source sites define the cancer type of the sample #
 23 | ############################################################
 24 | 
 25 | # TSS codes from https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes
 26 | tcga_tss_codes = open("tcga_tss_codes.csv", "r")
 27 | tcga_tss_codes.readline()
 28 | tcga_tss_codes_dict = {}
 29 | 
 30 | # set up TSS dictionary with {tss: cancer_type}
 31 | for line in tcga_tss_codes:
 32 |   k,v = line.strip().split(",")
 33 |   tcga_tss_codes_dict[k] = v
 34 | 
 35 | tcga_tss_codes.close()
 36 | 
 37 | ###############################
 38 | # Retrieve mutations from MC3 #
 39 | ###############################
 40 | 
 41 | # simple mutation dictionary {cancer_type: {tcga_id: ["PIK3CA", "TP53"]}}
 42 | # this will be used at end to create simple 0/1 mutation status data frame
 43 | mutation_dict = {x: {} for x in cancer_types_of_interest}
 44 | 
 45 | # open up MC3 and define header lines
 46 | mc3 = gzip.open(mc3_filename, "rb")
 47 | maf_header = mc3.readline().decode('UTF-8').strip().split()
 48 | maf_ixs = {name: ix for ix, name in enumerate(maf_header)}
 49 | tsv_header = "\t".join(["tcga_id", "cancer_type", "Tumor_Sample_Barcode", "Matched_Norm_Sample_Barcode", "Hugo_Symbol", "Chromosome", "Start_Position", "Variant_Classification"])
 50 | 
 51 | output_tsv = open(output_tsv_filepath, "w")
 52 | output_maf = open(output_maf_filepath, "w")
 53 | 
 54 | output_tsv.write(tsv_header + "\n")
 55 | output_maf.write('\t'.join(maf_header) + "\n")
 56 | 
 57 | # progress through each line of MC3
 58 | # check if sample is primary solid tumor ("01") and
 59 | # from out genes and cancer types of interest
 60 | for line in mc3:
 61 |   record = line.decode('UTF-8').strip().split("\t")
 62 |   hugo_symbol = record[maf_ixs['Hugo_Symbol']] # gene name
 63 |   tcga_id_raw = record[maf_ixs['Tumor_Sample_Barcode']] # tumor barcode
 64 |   tcga_id_raw_normal = record[maf_ixs['Matched_Norm_Sample_Barcode']] # normal barcode
 65 |   is_tumor = tcga_id_raw.split("-")[3].startswith("01")
 66 |   tss_code = tcga_id_raw.split("-")[1]
 67 |   cancer_type = tcga_tss_codes_dict[tss_code]
 68 | 
 69 |   if is_tumor and cancer_type in cancer_types_of_interest:
 70 |     tcga_id = tcga_id_raw[0:15]
 71 |     
 72 |     # add TCGA ID to mutation dict
 73 |     if tcga_id not in mutation_dict[cancer_type]:
 74 |       mutation_dict[cancer_type][tcga_id] = set()
 75 | 
 76 |     # if gene of interest, add to mutation dict list for that ID and outputs
 77 |     if hugo_symbol in genes_of_interest:
 78 |       chromosome = record[maf_ixs['Chromosome']] # chromosome of mutation
 79 |       start_position = record[maf_ixs['Start_Position']] # position of mutation
 80 |       variant_class = record[maf_ixs['Variant_Classification']] # e.g. Missense_Mutation, In_Frame_Del
 81 |     
 82 |       # add a gene to mutation set
 83 |       mutation_dict[cancer_type][tcga_id].add(hugo_symbol)
 84 | 
 85 |       output_tsv.write("\t".join([tcga_id, cancer_type_abbrevs[cancer_type], tcga_id_raw, tcga_id_raw_normal, hugo_symbol, chromosome, start_position, variant_class]) + "\n")
 86 |       output_maf.write("\t".join(record) + "\n")
 87 | 
 88 | mc3.close()
 89 | output_tsv.close()
 90 | output_maf.close()
 91 | 
 92 | # write cancer-type-specific simple output data frames (0/1 for gene mutation status)
 93 | for cancer_type in cancer_types_of_interest:
 94 | 
 95 |   simple_output_filename = os.path.join(data_dir,
 96 |                                         "mutations." + cancer_type_abbrevs[cancer_type] + ".tsv")
 97 |   simple_output = open(simple_output_filename, "w")
 98 |   simple_output_header = "\t".join(["tcga_id", "PIK3CA", "TP53"]) + "\n"
 99 | 
100 |   simple_output.write(simple_output_header)
101 | 
102 |   # each TCGA ID has a set of mutated genes
103 |   # for each gene of interest, return a binary mutation status if that gene appears in the list
104 |   # for each TCGA ID, report the mutation status of each gene as a row in output data frame
105 |   for tcga_id, mutation_list in mutation_dict[cancer_type].items():
106 |     mutation_status_list = [str(int(x in mutation_list)) for x in genes_of_interest]
107 |     simple_output.write(tcga_id + "\t" + "\t".join(mutation_status_list) + "\n")
108 | 
109 |   simple_output.close()
110 | 


--------------------------------------------------------------------------------
/run_all_analyses_and_plots.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -euo pipefail
  3 | # (bash is required: 'set -o pipefail' and '==' inside [ ] are not POSIX sh features)
  4 | # This script runs analysis code and plotting scripts for the publication,
  5 | # including both cancer types (both) or just one cancer type (BRCA or GBM).
  6 | # The script calls:
  7 | #  1. run_machine_learning_experiments.sh
  8 | #  2. run_differential_expression_experiments.sh (subtype only)
  9 | #  3. plots/scripts/visualize_expression.R (subtype only)
 10 | #  4. plotting scripts, as appropriate
 11 | # Usage: bash run_all_analyses_and_plots.sh [both|BRCA|GBM]
 12 | # cancer type (must be both, BRCA, or GBM); defaults to empty so a missing argument reaches the check below
 13 | cancer_type=${1:-}
 14 | 
 15 | if [ "$cancer_type" != "both" ] && [ "$cancer_type" != "BRCA" ] && [ "$cancer_type" != "GBM" ]; then
 16 |   echo "Cancer type must be both, BRCA or GBM in run_all_analyses_and_plots.sh [cancer_type]" >&2
 17 |   exit 1
 18 | fi
 19 | 
 20 | ################################################################################
 21 | # BRCA
 22 | ################################################################################
 23 | 
 24 | if [ "$cancer_type" = "both" ] || [ "$cancer_type" = "BRCA" ]; then
 25 | 
 26 |   # BRCA subtype ---------------------------------------------------------------
 27 | 
 28 |   # run machine learning and DEG analysis scripts
 29 |   bash run_machine_learning_experiments.sh BRCA subtype 7
 30 |   bash run_differential_expression_experiments.sh BRCA Basal Her2,LumA Her2,LumA 7
 31 | 
 32 |   # plot array vs. RNA-seq expression levels after normalization
 33 |   Rscript plots/scripts/visualize_expression.R --cancer_type BRCA --predictor subtype
 34 | 
 35 |   # plot difference in subtype prediction kappa between non-reconstructed and reconstructed data
 36 |   Rscript plots/scripts/recon_kappa_difference.R --cancer_type BRCA --output_directory plots/supplementary
 37 | 
 38 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
 39 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 40 |     --cancer_type BRCA \
 41 |     --predictor subtype \
 42 |     --output_directory plots/supplementary
 43 | 
 44 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
 45 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 46 |     --cancer_type BRCA \
 47 |     --predictor subtype \
 48 |     --plot_all_seeds \
 49 |     --output_directory plots/supplementary
 50 | 
 51 |   # violin + line plots showing kappa values from predictions on test data
 52 |   Rscript plots/scripts/3-plot_category_kappa.R \
 53 |     --cancer_type BRCA \
 54 |     --predictor subtype \
 55 |     --output_directory plots/main
 56 | 
 57 |   # violin + line plots showing kappa values from predictions on reconstructed test data
 58 |   Rscript plots/scripts/6-plot_recon_kappa.R \
 59 |     --cancer_type BRCA \
 60 |     --predictor subtype \
 61 |     --output_directory plots/supplementary
 62 | 
 63 |   # violin + line plots showing gene-level MASE values from reconstructed test data
 64 |   Rscript plots/scripts/6-plot_recon_error.R \
 65 |     --cancer_type BRCA \
 66 |     --predictor subtype \
 67 |     --output_directory plots/supplementary
 68 | 
 69 |   # violin plots showing proportion of pathways significant in PLIER analyses
 70 |   Rscript plots/scripts/7-plot_plier_pathways.R \
 71 |     --cancer_type BRCA \
 72 |     --predictor subtype \
 73 |     --output_directory plots/main
 74 | 
 75 |   # bar plot showing proportion of genes differentially expressed (Basal vs. Others)
 76 |   # line plot showing overlap with silver standard DEGs (Basal vs. Others)
 77 |   Rscript plots/scripts/1A-plot_DEGs.R \
 78 |     --cancer_type BRCA \
 79 |     --subtype_vs_others Basal \
 80 |     --proportion_output_directory plots/supplementary \
 81 |     --overlap_output_directory plots/supplementary \
 82 |     --overlap_measure Jaccard,Spearman
 83 |   
 84 |   # bar plot showing proportion of genes differentially expressed (Her2 vs. LumA)
 85 |   # line plot showing overlap with silver standard DEGs (Her2 vs. LumA)
 86 |   Rscript plots/scripts/1A-plot_DEGs.R \
 87 |     --cancer_type BRCA \
 88 |     --subtype_vs_subtype Her2,LumA \
 89 |     --proportion_output_directory plots/supplementary \
 90 |     --overlap_output_directory plots/main \
 91 |     --overlap_measure Jaccard,Spearman
 92 | 
 93 |   # line plot showing overlap with silver standard DEGs (Her2 vs. LumA) across small n values
 94 |   Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
 95 |     --cancer_type BRCA \
 96 |     --subtype_vs_subtype Her2,LumA \
 97 |     --output_directory plots/main \
 98 |     --overlap_measure Jaccard,Spearman
 99 | 
100 |   # ----------------------------------------------------------------------------
101 | 
102 |   # BRCA TP53 ------------------------------------------------------------------
103 | 
104 |   # run machine learning analysis scripts
105 |   bash run_machine_learning_experiments.sh BRCA TP53 7
106 | 
107 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
108 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
109 |     --cancer_type BRCA \
110 |     --predictor TP53 \
111 |     --output_directory plots/supplementary
112 | 
113 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
114 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
115 |     --cancer_type BRCA \
116 |     --predictor TP53 \
117 |     --plot_all_seeds \
118 |     --output_directory plots/supplementary
119 | 
120 |   # violin + line plots showing kappa values from predictions on test data
121 |   Rscript plots/scripts/3-plot_category_kappa.R \
122 |     --cancer_type BRCA \
123 |     --predictor TP53 \
124 |     --null_model \
125 |     --output_directory plots/supplementary
126 | 
127 |   # ----------------------------------------------------------------------------
128 | 
129 |   # BRCA PIK3CA ----------------------------------------------------------------
130 | 
131 |   # run machine learning analysis scripts
132 |   bash run_machine_learning_experiments.sh BRCA PIK3CA 7
133 | 
134 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
135 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
136 |     --cancer_type BRCA \
137 |     --predictor PIK3CA \
138 |     --output_directory plots/supplementary
139 | 
140 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
141 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
142 |     --cancer_type BRCA \
143 |     --predictor PIK3CA \
144 |     --plot_all_seeds \
145 |     --output_directory plots/supplementary
146 | 
147 |   # violin + line plots showing kappa values from predictions on test data
148 |   Rscript plots/scripts/3-plot_category_kappa.R \
149 |     --cancer_type BRCA \
150 |     --predictor PIK3CA \
151 |     --null_model \
152 |     --output_directory plots/supplementary
153 | 
154 |   # ----------------------------------------------------------------------------
155 | 
156 | fi
157 | 
158 | ################################################################################
159 | # GBM
160 | ################################################################################
161 | 
162 | if [ "$cancer_type" = "both" ] || [ "$cancer_type" = "GBM" ]; then
163 | 
164 |   # GBM subtype ----------------------------------------------------------------
165 | 
166 |   # run machine learning and DEG analysis scripts
167 |   bash run_machine_learning_experiments.sh GBM subtype 7
168 |   bash run_differential_expression_experiments.sh GBM Proneural Classical,Mesenchymal Classical,Mesenchymal 7
169 | 
170 |   # plot array vs. RNA-seq expression levels after normalization
171 |   Rscript plots/scripts/visualize_expression.R --cancer_type GBM --predictor subtype
172 | 
173 |   # plot difference in subtype prediction kappa between non-reconstructed and reconstructed data
174 |   Rscript plots/scripts/recon_kappa_difference.R --cancer_type GBM --output_directory plots/supplementary
175 | 
176 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
177 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
178 |     --cancer_type GBM \
179 |     --predictor subtype \
180 |     --output_directory plots/supplementary
181 | 
182 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
183 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
184 |     --cancer_type GBM \
185 |     --predictor subtype \
186 |     --plot_all_seeds \
187 |     --output_directory plots/supplementary
188 | 
189 |   # violin + line plots showing kappa values from predictions on test data
190 |   Rscript plots/scripts/3-plot_category_kappa.R \
191 |     --cancer_type GBM \
192 |     --predictor subtype \
193 |     --output_directory plots/supplementary
194 | 
195 |   # violin + line plots showing kappa values from predictions on reconstructed test data
196 |   Rscript plots/scripts/6-plot_recon_kappa.R \
197 |     --cancer_type GBM \
198 |     --predictor subtype \
199 |     --output_directory plots/supplementary
200 | 
201 |   # violin + line plots showing gene-level MASE values from reconstructed test data
202 |   Rscript plots/scripts/6-plot_recon_error.R \
203 |     --cancer_type GBM \
204 |     --predictor subtype \
205 |     --output_directory plots/supplementary
206 | 
207 |   # violin plots showing proportion of pathways significant in PLIER analyses
208 |   Rscript plots/scripts/7-plot_plier_pathways.R \
209 |     --cancer_type GBM \
210 |     --predictor subtype \
211 |     --output_directory plots/main
212 | 
213 |   # bar plot showing proportion of genes differentially expressed (Proneural vs. Others, Classical vs. Mesenchymal)
214 |   # line plot showing overlap with silver standard DEGs (Proneural vs. Others, Classical vs. Mesenchymal)
215 |   Rscript plots/scripts/1A-plot_DEGs.R \
216 |     --cancer_type GBM \
217 |     --subtype_vs_others Proneural \
218 |     --subtype_vs_subtype Classical,Mesenchymal \
219 |     --proportion_output_directory plots/supplementary \
220 |     --overlap_output_directory plots/supplementary \
221 |     --overlap_measure Jaccard,Spearman
222 | 
223 |   # line plot showing overlap with silver standard DEGs (Classical vs. Mesenchymal) across small n values
224 |   Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
225 |     --cancer_type GBM \
226 |     --subtype_vs_subtype Classical,Mesenchymal \
227 |     --output_directory plots/supplementary \
228 |     --overlap_measure Jaccard,Spearman
229 | 
230 |   # ----------------------------------------------------------------------------
231 | 
232 |   # GBM TP53 -------------------------------------------------------------------
233 | 
234 |   # run machine learning analysis scripts
235 |   bash run_machine_learning_experiments.sh GBM TP53 7
236 | 
237 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
238 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
239 |     --cancer_type GBM \
240 |     --predictor TP53 \
241 |     --output_directory plots/supplementary
242 | 
243 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
244 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
245 |     --cancer_type GBM \
246 |     --predictor TP53 \
247 |     --plot_all_seeds \
248 |     --output_directory plots/supplementary
249 | 
250 |   # violin + line plots showing kappa values from predictions on test data
251 |   Rscript plots/scripts/3-plot_category_kappa.R \
252 |     --cancer_type GBM \
253 |     --predictor TP53 \
254 |     --null_model \
255 |     --output_directory plots/main
256 | 
257 |   # ----------------------------------------------------------------------------
258 | 
259 |   # GBM PIK3CA -----------------------------------------------------------------
260 | 
261 |   # run machine learning analysis scripts
262 |   bash run_machine_learning_experiments.sh GBM PIK3CA 7
263 | 
264 |   # stacked bar plot showing distribution of subtypes in train/test sets (one representative example)
265 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
266 |     --cancer_type GBM \
267 |     --predictor PIK3CA \
268 |     --output_directory plots/supplementary
269 | 
270 |   # stacked bar plots showing distribution of subtypes in train/test sets (all seeds)
271 |   Rscript plots/scripts/0-plot_predictor_category_distributions.R \
272 |     --cancer_type GBM \
273 |     --predictor PIK3CA \
274 |     --plot_all_seeds \
275 |     --output_directory plots/supplementary
276 | 
277 |   # violin + line plots showing kappa values from predictions on test data
278 |   Rscript plots/scripts/3-plot_category_kappa.R \
279 |     --cancer_type GBM \
280 |     --predictor PIK3CA \
281 |     --null_model \
282 |     --output_directory plots/supplementary
283 | 
284 |   # ----------------------------------------------------------------------------
285 | 
286 | fi
287 | 
288 | ################################################################################
289 | # PLIER pathway analysis of BRCA and/or GBM (rendered for every cancer_type choice)
290 | ################################################################################
291 | # render the R Markdown notebook; clean = TRUE removes intermediate files created during rendering
292 | Rscript -e "rmarkdown::render('8-PLIER_pathways_analysis.Rmd', clean = TRUE)"
293 | 


--------------------------------------------------------------------------------
/run_differential_expression_experiments.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -euo pipefail
 3 | 
 4 | # Usage: bash run_differential_expression_experiments.sh CANCER_TYPE SUBTYPE_VS_OTHERS SUBTYPE_VS_SUBTYPE SUBTYPE_VS_SUBTYPE_SMALL NCORES
 5 | # where CANCER_TYPE is one of BRCA or GBM
 6 | # SUBTYPE_VS_OTHERS is the subtype you want to compare to all others (e.g. Basal)
 7 | # SUBTYPE_VS_SUBTYPE is the two subtypes you want to compare head to head (e.g. LumA,Her2) (comma-separated)
 8 | # SUBTYPE_VS_SUBTYPE_SMALL is the two subtypes you want to compare head to head when limiting the sample size (e.g. LumA,Her2) (comma-separated)
 9 | # NCORES is passed to both R scripts as --ncores
10 | # (bash rather than POSIX sh is required for 'set -o pipefail')
11 | 
12 | # all five arguments are required; report usage instead of failing on an unset positional parameter
13 | if [ "$#" -lt 5 ]; then
14 |   echo "Usage: bash run_differential_expression_experiments.sh CANCER_TYPE SUBTYPE_VS_OTHERS SUBTYPE_VS_SUBTYPE SUBTYPE_VS_SUBTYPE_SMALL NCORES" >&2
15 |   exit 1
16 | fi
17 | 
18 | cancer_type=$1
19 | subtype_vs_others=$2
20 | subtype_vs_subtype=$3
21 | subtype_vs_subtype_small=$4
22 | ncores=$5
23 | 
24 | if [ "$cancer_type" != "BRCA" ] && [ "$cancer_type" != "GBM" ]; then
25 |   echo "Cancer type must be BRCA or GBM in run_differential_expression_experiments.sh [cancer_type]" >&2
26 |   exit 1
27 | fi
28 | 
29 | # Run differential expression scripts
30 | Rscript 1A-detect_differentially_expressed_genes.R --cancer_type "$cancer_type" --subtype_vs_others "$subtype_vs_others" --subtype_vs_subtype "$subtype_vs_subtype" --ncores "$ncores"
31 | Rscript 2A-small_n_differential_expression.R --cancer_type "$cancer_type" --subtype_vs_subtype "$subtype_vs_subtype_small" --ncores "$ncores"
32 | 


--------------------------------------------------------------------------------
/run_experiments.R:
--------------------------------------------------------------------------------
 1 | # J. Taroni Jul 2016
 2 | # The purpose of this script is to run the BRCA subtype classifier pipeline
 3 | # for RNA-seq 'titration.'
 4 | # It should be run from the command line.
 5 | # USAGE: Rscript run_experiments.R --cancer_type [BRCA|GBM] --predictor [subtype|TP53|PIK3CA] --seed integer --null_model --ncores
 6 | # It also may be run through the classifier_repeat_wrapper.R
 7 | 
 8 | option_list <- list(
 9 |   optparse::make_option("--cancer_type",
10 |                         default = NA_character_,
11 |                         help = "Cancer type"),
12 |   optparse::make_option("--predictor",
13 |                         default = NA_character_,
14 |                         help = "Predictor used"),
15 |   optparse::make_option("--seed",
16 |                         default = NA_integer_,
17 |                         help = "Random seed"),
18 |   optparse::make_option("--null_model",
19 |                         action = "store_true",
20 |                         default = FALSE,
21 |                         help = "Permute dependent variable (within subtype if predictor is a gene)"),
22 |   optparse::make_option("--ncores",
23 |                         default = NA_integer_,
24 |                         help = "Set the number of cores to use")
25 | )
26 | 
27 | opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
28 | source(here::here("util/option_functions.R"))
29 | check_options(opt)
30 | 
31 | # set options
32 | cancer_type <- opt$cancer_type
33 | predictor <- opt$predictor
34 | null_model <- opt$null_model
35 | ncores <- min(parallel::detectCores() - 1,
36 |               opt$ncores,
37 |               na.rm = TRUE)
38 | 
39 | # set seed
40 | initial.seed <- opt$seed
41 | set.seed(initial.seed)
42 | 
43 | # these seeds should be between 1000 and 9999 (be 4 digits) to match later file name parsing
44 | seeds <- sample(1000:9999, 3)
45 | 
46 | message(paste("Initial seed:", initial.seed))
47 | message(paste("Secondary seeds:", stringr::str_c(seeds, collapse = ", ")))
48 | 
49 | message("Getting overlap and splitting into training and testing sets...")
50 | system(paste("Rscript 0-expression_data_overlap_and_split.R",
51 |              "--cancer_type", cancer_type,
52 |              "--predictor", predictor,
53 |              "--seed1", seeds[1],
54 |              ifelse(null_model,
55 |                     "--null_model",
56 |                     "")))
57 | 
58 | message("\nNormalizing data...")
59 | system(paste("Rscript 1-normalize_titrated_data.R",
60 |              "--cancer_type", cancer_type,
61 |              "--predictor", predictor,
62 |              "--seed1", seeds[1],
63 |              "--seed2", seeds[2],
64 |              ifelse(null_model,
65 |                     "--null_model",
66 |                     ""),
67 |              "--ncores", ncores))
68 | 
69 | message("\nTraining and testing models...")
70 | system(paste("Rscript 2-train_test_category.R",
71 |              "--cancer_type", cancer_type,
72 |              "--predictor", predictor,
73 |              "--seed1", seeds[1],
74 |              "--seed3", seeds[3],
75 |              ifelse(null_model,
76 |                     "--null_model",
77 |                     ""),
78 |              "--ncores", ncores))
79 | 


--------------------------------------------------------------------------------
/run_machine_learning_experiments.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | set -euo pipefail
 3 | 
 4 | # cancer type (either BRCA or GBM)
 5 | cancer_type=$1
 6 | predictor=$2
 7 | ncores=$3
 8 | 
 9 | if [ $cancer_type != "BRCA" ] && [ $cancer_type != "GBM" ]; then
10 |   echo Cancer type must be BRCA or GBM in run_machine_learning_experiments.sh [cancer_type] [predictor]
11 |   exit
12 | fi
13 | 
14 | if [ $predictor != "subtype" ] && [ $predictor != "TP53" ] && [ $predictor != "PIK3CA" ]; then
15 |   echo Predictor must be subtype, TP53, or PIK3CA in run_machine_learning_experiments.sh [cancer_type] [predictor]
16 |   exit
17 | fi
18 | 
19 | # Run ten repeats of the supervised analysis
20 | # if the predictor is a gene, also generate null models
21 | if [ $predictor == "TP53" ] || [ $predictor == "PIK3CA" ]; then
22 |   Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --ncores $ncores
23 |   Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --null_model --ncores $ncores
24 | else
25 |   Rscript classifier_repeat_wrapper.R --cancer_type $cancer_type --predictor $predictor --n_repeats 10 --ncores $ncores
26 | fi
27 | 
28 | # Run the unsupervised analyses using subtype models
29 | if [ $predictor == "subtype" ]; then
30 |   Rscript 4-ica_pca_feature_reconstruction.R --cancer_type $cancer_type --predictor $predictor --n_components 50
31 |   Rscript 5-predict_category_reconstructed_data.R --cancer_type $cancer_type --predictor $predictor
32 |   Rscript 6-save_recon_error_kappa_data.R --cancer_type $cancer_type --predictor $predictor
33 |   Rscript 7-extract_plier_pathways.R --cancer_type $cancer_type --ncores $ncores
34 |   Rscript 7-extract_plier_pathways.R --cancer_type $cancer_type --ncores $ncores --permute
35 | fi
36 | 


--------------------------------------------------------------------------------
/search_geo_arrayexpress.py:
--------------------------------------------------------------------------------
  1 | ###############################################################################
  2 | # This script queries two databases (GEO, ArrayExpress) to find human samples
  3 | # analyzed on array or RNA-seq platforms. It parses information from each data
  4 | # set and adds up the total number of samples from each platform. One output
  5 | # file is a time-stamped table showing the number of samples from each database
  6 | # and platform. Metadata from the search is appended to a tracking file, which
  7 | # includes the original output table filename, the time/date of the search, and
  8 | # the ratio of array to RNA-seq data found. Output goes to the folder
  9 | # results/array_rnaseq_ratio.
 10 | #
 11 | # Usage: python3 search_geo_arrayexpress.py
 12 | #
 13 | # S. Foltz February 2022
 14 | ###############################################################################
 15 | 
 16 | from datetime import datetime
 17 | import os
 18 | import requests
 19 | import sys
 20 | import xmltodict
 21 | 
 22 | # max number of results fetch can return
 23 | fetch_retmax = 10000 # as of Feb 2022
 24 | 
 25 | # find the directory of this script (top level project directory)
 26 | dir_path = os.path.dirname(os.path.realpath(__file__))
 27 | 
 28 | # define output directory
 29 | output_directory = os.path.join(dir_path, "results", "array_rnaseq_ratio")
 30 | 
 31 | # check that output directory exists
 32 | try:
 33 |     assert (os.path.isdir(output_directory)), \
 34 |         "Output directory " + output_directory + \
 35 |         " does not exist in search_geo_arrayexpress.py."
 36 | except Exception as e:
 37 |     print(e, file = sys.stderr)
 38 |     exit()
 39 | 
 40 | # define output filenames
 41 | current_time = datetime.utcnow().strftime("%Y-%m-%d_%H_%M_%S_UTC")
 42 | # output filename refers to search results at this particular time
 43 | output_filename = os.path.join(output_directory,
 44 |                                "ratio." + current_time + ".tsv")
 45 | # tracking filename collects metadata about each search (filename, date, ratio)
 46 | output_tracking_filename = os.path.join(output_directory, "ratio.tracking.tsv")
 47 | 
 48 | ###############################################################################
 49 | # GEO - Gene Expression Omnibus
 50 | ###############################################################################
 51 | 
 52 | # set up search terms and dictionary to track n_samples
 53 | search_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
 54 | geo_array_search_term = "homo+sapiens[Organism]+AND+expression+profiling+by+array[DataSet+Type]"
 55 | geo_rnaseq_search_term = "homo+sapiens[Organism]+AND+expression+profiling+by+high+throughput+sequencing[DataSet+Type]"
 56 | 
 57 | geo_array_initial_url = search_base + \
 58 | "&".join(["db=gds", "term=" + geo_array_search_term])
 59 | geo_rnaseq_initial_url = search_base + \
 60 | "&".join(["db=gds", "term=" + geo_rnaseq_search_term])
 61 | 
 62 | geo_dict = {"array": [geo_array_initial_url, 0],
 63 |             "rnaseq": [geo_rnaseq_initial_url, 0]}
 64 | 
 65 | # Use this base url to fetch the records of the search results
 66 | fetch_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
 67 | # for any data sets with an entry in this list, we will skip that data set
 68 | skip_these = ["(Submitter supplied) This SuperSeries is composed of the SubSeries listed below."]
 69 | 
 70 | # for each platform in the dictionary, search twice and then fetch samples
 71 | for platform in geo_dict:
 72 | 
 73 |     # first search to retrieve the total number of results
 74 |     initial_url = geo_dict[platform][0]
 75 |     initial_xml = requests.get(initial_url)
 76 |     initial_dict = xmltodict.parse(initial_xml.content)
 77 |     n_results = initial_dict['eSearchResult']['Count']
 78 | 
 79 |     # search again using n_results, save query_key and WebEnv for fetch
 80 |     second_url = initial_url + "&RetMax=" + n_results + "&usehistory=y"
 81 |     second_xml = requests.get(second_url)
 82 |     second_dict = xmltodict.parse(second_xml.content)
 83 |     query_key = second_dict['eSearchResult']['QueryKey']
 84 |     webenv = second_dict['eSearchResult']['WebEnv']
 85 | 
 86 |     # fetch returns up to fetch_retmax results, so we need to define the start
 87 |     # position so we can increase the start position on subsequent fetches
 88 |     retstart = 0
 89 | 
 90 |     while retstart < int(n_results):
 91 |         fetch_parameters = "&".join(
 92 |           ["db=gds",
 93 |            "query_key=" + query_key,
 94 |            "WebEnv=" + webenv,
 95 |            "retmax=10000",
 96 |            "retstart=" + str(retstart)])
 97 |         fetch_url = fetch_base + fetch_parameters
 98 |         fetch_text = requests.get(fetch_url).text
 99 |         for result in fetch_text.split("\n\n"):  # split by \n
100 |             record = result.split("\n")  # split by \n
101 |             for entry in record:
102 |                 if entry in skip_these:  # data set should be skipped
103 |                     continue
104 |                 else:  # otherwise, look for the line starting with "Platform"
105 |                     if entry.startswith("Platform"):  # parse 2nd last element
106 |                         n_samples = int(entry.split(" ")[-2])  # (n_samples)
107 |                         geo_dict[platform][1] += n_samples  # increment count
108 |         retstart += fetch_retmax  # increment the start position
109 | 
###############################################################################
# ArrayExpress
###############################################################################

# Build one query url per platform; directsub=on keeps GEO-imported
# experiments out of the ArrayExpress results.
ae_base_url = "https://www.ebi.ac.uk/arrayexpress/ArrayExpress-Experiments.txt?keywords="
ae_array_url = ae_base_url + "&organism=Homo+sapiens&exptype%5B%5D=%22rna+assay%22&exptype%5B%5D=%22array+assay%22&array=&directsub=on"
ae_rnaseq_url = ae_base_url + "&organism=Homo+sapiens&exptype%5B%5D=%22rna+assay%22&exptype%5B%5D=%22sequencing+assay%22&array=&directsub=on"

# platform -> [query url, running total of assays]
ae_dict = {"array": [ae_array_url, 0],
           "rnaseq": [ae_rnaseq_url, 0]}

# Download each platform's tab-separated table and total its assay column.
for platform, platform_info in ae_dict.items():
    table_lines = requests.get(platform_info[0]).text.split("\n")
    # the first and last lines of the response are not counted;
    # the fifth tab-separated column holds the number of assays
    platform_info[1] += sum(int(line.split('\t')[4])
                            for line in table_lines[1:-1])
130 | 
###############################################################################
# Write results to output files
###############################################################################

total_array = geo_dict["array"][1] + ae_dict["array"][1]  # total number array
total_rnaseq = geo_dict["rnaseq"][1] + ae_dict["rnaseq"][1]  # total n RNA-seq

# check array and RNA-seq searches returned non-zero results
# (an explicit check is used instead of assert, which is stripped under
# `python -O`; the non-zero exit status lets calling scripts detect the failure)
if total_array == 0 or total_rnaseq == 0:
    print("Array or RNA-seq returned zero results in search_geo_arrayexpress.py.",
          file = sys.stderr)
    sys.exit(1)


ratio = total_array/total_rnaseq  # array:RNA-seq

# one row per platform: label, GEO count, ArrayExpress count, combined total
table_rows = [["Platform", "GEO", "AE", "Total"],
              ["Array",
               geo_dict["array"][1],
               ae_dict["array"][1],
               total_array],
              ["RNA-seq",
               geo_dict["rnaseq"][1],
               ae_dict["rnaseq"][1],
               total_rnaseq]]

# `with` closes the files even if a write fails
with open(output_filename, "w") as output_table:
    for row in table_rows:
        output_table.write('\t'.join([str(x) for x in row]) + "\n")

with open(output_tracking_filename, "a") as output_tracking:  # create new or append
    output_tracking.write('\t'.join(["File:" + os.path.basename(output_filename),
                                     "Date:" + current_time,
                                     "Array_to_RNA-seq_ratio:" + str(ratio)]) + "\n")
166 | 


--------------------------------------------------------------------------------
/util/CrossNorm.R:
--------------------------------------------------------------------------------
 1 | # The following code implements the CrossNorm algorithm with quantile normalization as described in
 2 | # Cheng, L., Lo, L.-Y., Tang, N. L. S., Wang, D. & Leung, K.-S. CrossNorm: a novel normalization strategy for microarray data in cancers. Sci. Rep. 6, 18898 (2016)
 3 | # https://www.nature.com/articles/srep18898
 4 | 
 5 | # We thank the authors of CrossNorm for making the code publicly available under a Creative Commons CC BY license.
 6 | 
 7 | # The code is copied from the Supplementary Information here:
 8 | # https://static-content.springer.com/esm/art%3A10.1038%2Fsrep18898/MediaObjects/41598_2016_BFsrep18898_MOESM1_ESM.pdf
 9 | 
10 | # We made slight modifications that do not alter the functionality of the code, including
11 | #   - We commented out library calls
12 | #   - We specified the library in preprocessCore::normalize.quantiles() function calls
13 | 
14 | #====================================================================================
15 | # Description:
16 | # Cross Normalization (CrossNorm) for gene expression data.
17 | #
18 | # Arguments:
19 | # exp - a (non-empty) numeric matrix of data values. Row represents gene while
20 | # column represents sample.
21 | # label - a (non-empty) binary vector of data values in which '0' represents
22 | # control sample and '1' represents disease sample. The length of label
23 | # should be equal to the column number of exp.
24 | # Value:
25 | # exp.crossnorm - A normalized numeric matrix. Row represents gene while column
26 | # represents sample. The gene order is the same as exp.
27 | #
28 | # Reference:
29 | # CrossNorm: a novel normalization strategy for microarray data in cancers
30 | # Lixin Cheng, Leung-Yau Lo, Kwong-Sak Leung, Nelson LS Tang and Dong Wang
31 | #
32 | # Example:
33 | # source("CrossNorm.R")
34 | # exp.pcn = PairedCrossNorm(exp, label)
35 | # exp.gcn = GeneralCrossNorm(exp, label)
36 | #====================================================================================
37 | 
38 | #library(affy)
39 | #library(preprocessCore)
40 | 
41 | # -------------------Paired CrossNorm --------------------
42 | 
# Paired CrossNorm.
#
# Stacks the control (label == 0) columns on top of the disease (label == 1)
# columns, so the i-th control column is paired with the i-th disease column,
# quantile normalizes the stacked matrix, and splits it back apart.
#
# Arguments:
#   exp   - numeric matrix (or object coercible with as.matrix); rows are genes
#           and columns are samples.
#   label - vector with one entry per column of exp; 0 marks control samples
#           and 1 marks disease samples. Both groups must be the same size.
# Value:
#   Matrix with the same number of rows as exp. Columns are all control samples
#   followed by all disease samples.
PairedCrossNorm <- function(exp, label) {
  exp <- as.matrix(exp)
  geneLen <- nrow(exp)
  # drop = FALSE keeps a one-sample group as a one-column matrix; without it
  # rbind() would treat the resulting vectors as rows and scramble the layout
  exp.normal <- exp[, label == 0, drop = FALSE]
  exp.disease <- exp[, label == 1, drop = FALSE]
  if (ncol(exp.normal) != ncol(exp.disease)) {
    stop("PairedCrossNorm requires the same number of control (0) and ",
         "disease (1) samples.", call. = FALSE)
  }
  exp.cross <- rbind(exp.normal, exp.disease)
  exp.quantile.cross <- preprocessCore::normalize.quantiles(exp.cross)
  # the top half holds the control rows and the bottom half the disease rows
  normalRows <- seq_len(geneLen)
  exp.crossnorm.normal <- exp.quantile.cross[normalRows, , drop = FALSE]
  exp.crossnorm.disease <- exp.quantile.cross[geneLen + normalRows, ,
                                              drop = FALSE]
  cbind(exp.crossnorm.normal, exp.crossnorm.disease)
}
55 | 
56 | # ---------------- General CrossNorm --------------------
57 | 
# General CrossNorm: expand exp into its cross matrix, quantile normalize the
# cross matrix, then collapse it back to one column per sample.
#   exp   - matrix-like object of expression values (coerced with as.matrix)
#   label - per-column labels, 0 for control samples and 1 for disease samples
# Returns the matrix produced by CrossMatrix2Matrix().
GeneralCrossNorm <- function(exp, label) {
  crossed <- Matrix2CrossMatrix(as.matrix(exp), label)
  crossedNormalized <- preprocessCore::normalize.quantiles(crossed)
  collapsed <- CrossMatrix2Matrix(crossedNormalized, label)
  return(collapsed)
}
65 | 
66 | # CrossMatrix
# Build the "cross" matrix: one column for every (disease, control) pair of
# samples, with the disease sample's values stacked on top of the control
# sample's values.
#
# Arguments:
#   M     - numeric matrix (or object coercible with as.matrix); rows are
#           genes and columns are samples.
#   label - vector with one entry per column of M; 0 marks control samples
#           and 1 marks disease samples.
# Value:
#   Matrix with 2 * nrow(M) rows and (n disease * n control) columns. The
#   column for disease sample t and control sample s is t + (s - 1) * n disease.
Matrix2CrossMatrix <- function(M, label) {
  M <- as.matrix(M)
  rowLen <- nrow(M)
  sampleSize1 <- sum(label == 1) # disease sample size
  sampleSize0 <- sum(label == 0) # normal sample size
  if (sampleSize1 < 1 || sampleSize0 < 1) {
    stop("Matrix2CrossMatrix requires at least one control (0) and one ",
         "disease (1) sample.", call. = FALSE)
  }
  # row t lists the cross-matrix columns that involve disease sample t
  indexMatrix <- matrix(seq_len(sampleSize1 * sampleSize0), ncol = sampleSize0)
  # drop = FALSE keeps one-sample groups as one-column matrices so that the
  # column indexing and rbind() below still work
  M1 <- M[, label == 1, drop = FALSE]
  M0 <- M[, label == 0, drop = FALSE]
  M3 <- matrix(0, rowLen * 2, sampleSize1 * sampleSize0)
  for (t in seq_len(sampleSize1)) {
    # disease sample t repeated once per control sample, above all controls
    diseaseBlock <- matrix(rep(M1[, t], sampleSize0), ncol = sampleSize0)
    M3[, indexMatrix[t, ]] <- rbind(diseaseBlock, M0)
  }
  M3
}
81 | 
# Collapse a cross matrix (see Matrix2CrossMatrix) back to one column per
# sample by averaging, for each sample, every cross-matrix column it appears in.
#
# Arguments:
#   CrossM - matrix whose top half of rows holds disease values and bottom half
#            holds control values, with one column per (disease, control) pair.
#   label  - vector of sample labels; 0 marks control samples and 1 marks
#            disease samples. Only the counts of 0s and 1s are used here.
# Value:
#   Matrix with nrow(CrossM) / 2 rows. Columns are all control samples followed
#   by all disease samples.
CrossMatrix2Matrix <- function(CrossM, label) {
  rowLen <- nrow(CrossM) / 2
  sampleSize1 <- sum(label == 1) # disease sample size
  sampleSize0 <- sum(label == 0) # normal sample size
  # row t: columns involving disease sample t; column t: columns involving
  # control sample t
  indexMatrix <- matrix(seq_len(sampleSize1 * sampleSize0), ncol = sampleSize0)
  M1 <- matrix(0, rowLen, sampleSize1)
  M0 <- matrix(0, rowLen, sampleSize0)
  diseaseRows <- seq_len(rowLen)
  normalRows <- rowLen + diseaseRows
  # drop = FALSE keeps the subset a matrix even when it has a single row or
  # column; apply() fails on a plain vector
  for (t in seq_len(sampleSize1)) {
    M1[, t] <- apply(CrossM[diseaseRows, indexMatrix[t, ], drop = FALSE],
                     1, mean)
  }
  for (t in seq_len(sampleSize0)) {
    M0[, t] <- apply(CrossM[normalRows, indexMatrix[, t], drop = FALSE],
                     1, mean)
  }
  cbind(M0, M1)
}
98 | 


--------------------------------------------------------------------------------
/util/color_blind_friendly_palette.R:
--------------------------------------------------------------------------------
# color-blind friendly palette: eight hex color codes, black first
cbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)
4 | 


--------------------------------------------------------------------------------
/util/option_functions.R:
--------------------------------------------------------------------------------
 1 | check_options <- function(opt) {
 2 |   # this function checks standardized command line options given to scripts
 3 |   # options ending with "_input" are checked to see if the input file exists
 4 |   # options ending with "_output" are checked to see if the output directory exists
 5 |   #   and if the output file already exists, will it be overwritten or not
 6 |   # all messages and errors are reported
 7 |   # if there are any errors, the script stops
 8 | 
 9 |   my_errors <- list()
10 |   my_messages <- list()
11 | 
12 |   for(option in names(opt)){
13 | 
14 |     if (is.na(opt[[option]])) { # all required options should default to NA_character_ or NA_integer_
15 |       my_errors[[option]] <- stringr::str_c("\nOption given for --", option,
16 |                                             " is missing and must be specified.")
17 |     } else if (option == "cancer_type") {
18 |       if (!(opt[[option]] %in% c("BRCA", "GBM"))) { # cancer type must be BRCA or GBM
19 |         my_errors[[option]] <- stringr::str_c("\nCancer type given for --", option,
20 |                                              " (", opt[[option]], ") ",
21 |                                              " must be BRCA or GBM.")
22 |       }
23 |     } else if (option == "predictor") {
24 |       if (!(opt[[option]] %in% c("subtype", "TP53", "PIK3CA"))) { # predictor must be subtype or TP53 or PIK3CA
25 |         my_errors[[option]] <- stringr::str_c("\nPredictor given for --", option,
26 |                                               " (", opt[[option]], ") ",
27 |                                               " must be subtype, TP53, or PIK3CA.")
28 |       }
29 |     }  else if (option == "subtype_vs_subtype") {
30 |       two_subtypes <- as.vector(stringr::str_split(opt[[option]], pattern = ",", simplify = TRUE))
31 |       if (length(two_subtypes) != 2) {
32 |         my_errors[[option]] <- stringr::str_c("\nSubtypes given for --", option,
33 |                                              " (", opt[[option]], ") ",
34 |                                              " must have (only) two comma-separated subtypes.")
35 |       }
36 |       
37 |     } else if (stringr::str_ends(option, "_input")) { # option related to inputs
38 |       if (!file.exists(opt[[option]])) {
39 |         my_errors[[option]] <- stringr::str_c("\nInput file given for --", option,
40 |                                               " (", opt[[option]], ") ",
41 |                                               "does not exist.")
42 |       }
43 |     } else if (stringr::str_ends(option, "_output")) { # option related to outputs
44 |       if (file.exists(opt[[option]])) { # if output file already exists
45 |         if (opt$overwrite) { # overwrite is TRUE if given
46 |           my_messages[[option]] <- stringr::str_c("\nOutput file given for --", option,
47 |                                                   " (", opt[[option]], ") ",
48 |                                                   "already exists and will be overwritten (--overwrite is set).")
49 |         } else { # overwrite defaults to FALSE unless given
50 |           my_errors[[option]] <- stringr::str_c("\nOutput file given for --", option,
51 |                                                 " (", opt[[option]], ") ",
52 |                                                 "already exists and will not be overwritten (use --overwrite).")
53 |         }
54 |       } else if (!dir.exists(dirname(opt[[option]]))) { # if output directory does not exist
55 |         my_errors[[option]] <- stringr::str_c("\nOutput directory given for --", option,
56 |                                               " (", dirname(opt[[option]]), ") ",
57 |                                               "does not exist.")
58 |       }
59 |     } else if (stringr::str_ends(option, "_directory")) { # option related to output directory
60 |       if (!dir.exists(opt[[option]])) {
61 |         my_errors[[option]] <- stringr::str_c("\nOutput directory given for --", option,
62 |                                               " (", opt[[option]], ") ",
63 |                                               "does not exist.")
64 |       }
65 |     } else if (option == "ncores") {
66 |       if (!is.integer(opt[[option]]) | opt[[option]] < 1) {
67 |         my_errors[[option]] <- stringr::str_c("\nNumber of cores given for --", option,
68 |                                               " must be a positive integer.")
69 |       }
70 |     }
71 |   }
72 | 
73 |   if (length(my_messages) > 0) {
74 |     message("  Messages:", my_messages, "\n")
75 |   }
76 |   if (length(my_errors) > 0) {
77 |     message("  Errors:", my_errors, "\n")
78 |     stop()
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------