├── .gitignore
├── R
│   ├── analysis
│   │   ├── bulk_concordance
│   │   │   ├── inner-run-concordance-terciles.R
│   │   │   ├── inner-run-concordance.R
│   │   │   ├── outer-run-concordance-terciles.R
│   │   │   ├── outer-run-concordance.R
│   │   │   ├── summarise_concordance-terciles.R
│   │   │   ├── summarise_concordance.R
│   │   │   └── write_grid.R
│   │   ├── confounds
│   │   │   ├── inner-calculate-confounds.R
│   │   │   ├── outer-calculate-confounds.R
│   │   │   └── summarise-confounds.R
│   │   ├── control_only
│   │   │   ├── inner-control-only-spatial.R
│   │   │   ├── inner-control-only.R
│   │   │   ├── inner-expr-summary-control-only.R
│   │   │   ├── outer-control-only-spatial.R
│   │   │   ├── outer-control-only.R
│   │   │   ├── outer-expr-summary-control-only.R
│   │   │   ├── summarise-control-only-DE-vs-variance.R
│   │   │   ├── summarise-control-only-n-DE-genes.R
│   │   │   └── summarise-control-only-spatial-n-DE-genes.R
│   │   ├── delta_variance
│   │   │   ├── inner-write-delta-variance.R
│   │   │   ├── outer-write-delta-variance.R
│   │   │   └── summarise-delta-variance.R
│   │   ├── downsample_cells
│   │   │   ├── inner-downsample-cells-outcomes.R
│   │   │   ├── inner-downsample-cells.R
│   │   │   ├── outer-downsample-cells-outcomes.R
│   │   │   ├── outer-downsample-cells.R
│   │   │   └── summarise-downsample-cells-outcomes.R
│   │   ├── expr_summary
│   │   │   ├── inner-expr-summary.R
│   │   │   ├── outer-expr-summary.R
│   │   │   └── summarise-expr-summary.R
│   │   ├── extract_FPs
│   │   │   ├── inner-extract-FPs.R
│   │   │   ├── outer-extract-FPs.R
│   │   │   └── summarise-extract-FPs.R
│   │   ├── mean_variance
│   │   │   └── analyze-mean-delta-variance-all-datasets.R
│   │   ├── run_DE
│   │   │   ├── inner-run-DE.R
│   │   │   └── outer-run-DE.R
│   │   ├── run_GSEA
│   │   │   ├── inner-GSEA-concordance.R
│   │   │   ├── inner-run-GSEA.R
│   │   │   ├── outer-GSEA-concordance.R
│   │   │   ├── outer-run-GSEA.R
│   │   │   └── summarise-GSEA-concordance.R
│   │   ├── run_bulk_DE
│   │   │   ├── inner-run-DE.R
│   │   │   └── outer-run-DE.R
│   │   ├── run_spike_in_DE
│   │   │   ├── inner-run-DE.R
│   │   │   ├── outer-run-DE.R
│   │   │   └── summarise-spike-ins.R
│   │   ├── simulations
│   │   │   ├── inner-expr-summary-simulations.R
│   │   │   ├── inner-null-run-DE.R
│   │   │   ├── inner_write_simulation_objects_null.R
│   │   │   ├── outer-expr-summary-simulations.R
│   │   │   ├── outer-null-run-DE.R
│   │   │   ├── outer_write_simulation_objects_null.R
│   │   │   ├── summarise-null-DE-genes-per-bin.R
│   │   │   └── summarise-null-n-DE-genes.R
│   │   └── time_RAM
│   │       ├── summarise-time-RAM-downsample_cells.R
│   │       └── summarise-time-RAM.R
│   └── functions
│       ├── calculate_overlap.R
│       ├── datasets.R
│       ├── get_bulk_comparisons.R
│       ├── get_comparisons.R
│       ├── recode_colnames.R
│       ├── run_DE.R
│       └── spatial_datasets.R
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | # OS X
2 | .DS_Store
3 | 
4 | # Eclipse
5 | .settings
6 | .project
7 | .classpath
8 | /build/
9 | /bin/
10 | /target/
11 | 
12 | # RStudio
13 | .R*
14 | 
15 | # Python
16 | *.pyc
17 | .idea
18 | 
--------------------------------------------------------------------------------
/R/analysis/bulk_concordance/inner-run-concordance-terciles.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | options(stringsAsFactors = F)
3 | library(argparse)
4 | 
5 | # parse arguments
6 | parser = ArgumentParser(prog = 'inner-run-concordance-terciles.R')
7 | parser$add_argument('--input_sc', type = 'character', required = T)
8 | parser$add_argument('--input_bulk', type = 'character', required = T)
9 | parser$add_argument('--summary_file', type = 'character', required = T)
10 | parser$add_argument('--output_dir', type = 'character', required = T)
11 | parser$add_argument('--n_bins', type = 'integer', required = T)
12 | args = parser$parse_args()
13 | print(args)
14 | 
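# example invocation (all paths illustrative; --n_bins 3 matches the value
# set in outer-run-concordance-terciles.R):
#   Rscript R/analysis/bulk_concordance/inner-run-concordance-terciles.R \
#     --input_sc <run_DE output .rds> \
#     --input_bulk <run_bulk_DE output .rds> \
#     --summary_file <expr_summary .txt.gz> \
#     --output_dir <output directory> \
#     --n_bins 3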
15 | library(tidyverse)
16 | library(magrittr)
17 | library(Seurat)
18 | library(Matrix)
19 | library(RRHO)
20 | library(AUC)
21 | source("R/functions/calculate_overlap.R")
22 | source("R/analysis/bulk_concordance/write_grid.R")
23 | 
24 | # set up output filepath
25 | if (!dir.exists(args$output_dir))
26 |   dir.create(args$output_dir, recursive = T)
27 | 
28 | # load in files
29 | sc = readRDS(args$input_sc)
30 | bulk = readRDS(args$input_bulk)
31 | 
32 | # read expression summary
33 | expr_summary = read.csv(args$summary_file)
34 | 
35 | # define output file
36 | sc_name = gsub(".rds", "", basename(args$input_sc))
37 | bulk_name = gsub(".rds", "", basename(args$input_bulk))
38 | output_filename = paste0(sc_name, "|", bulk_name, '-n_bins=', args$n_bins,
39 |                          ".rds")
40 | output_file = file.path(args$output_dir, output_filename)
41 | 
42 | # get all combinations of single-cell/bulk
43 | sc_idxs = names(sc)
44 | bulk_idxs = names(bulk)
45 | if (is.null(sc_idxs)) {
46 |   sc_idxs = "1"
47 |   names(sc) = '1'
48 | }
49 | if (is.null(bulk_idxs)) {
50 |   bulk_idxs = "1"
51 |   names(bulk) = '1'
52 | }
53 | comparisons = expand.grid(sc_comparison = sc_idxs, bulk_comparison = bulk_idxs,
54 |                           stringsAsFactors = F)
55 | 
56 | # get rid of irrelevant comparisons from Cano-Gamez 2020
57 | if (grepl("CanoGamez2020", sc_name)) {
58 |   keep = map2_lgl(comparisons$sc_comparison,
59 |                   comparisons$bulk_comparison,
60 |                   ~ grepl(.x, .y))
61 |   comparisons %<>% extract(keep, )
62 | }
63 | 
64 | # analyze each comparison separately
65 | results = data.frame()
66 | for (comparison_idx in seq_len(nrow(comparisons))) {
67 |   message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons),
68 |           " ...")
69 | 
70 |   # prepare data
71 |   sc_comparison = comparisons$sc_comparison[comparison_idx]
72 |   bulk_comparison = comparisons$bulk_comparison[comparison_idx]
73 |   sc_sub = sc[[sc_comparison]]
74 |   bulk_sub = bulk[[bulk_comparison]]
75 |   comparison_label = paste0(sc_comparison, "|", bulk_comparison)
76 | 
77 |   # for Angelidis, filter to relevant cell types to prevent bugs
78 |   if (grepl("Angelidis", sc_name)) {
79 |     sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes",
80 |                                         "Alveolar_macrophage"))
81 |   }
82 |   # same for Reyfman
83 |   if (grepl("Reyfman", sc_name)) {
84 |     sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages"))
85 |   }
86 | 
87 |   # run concordance within each expression bin separately
88 |   out = data.frame()
89 |   cell_types = unique(sc_sub$cell_type)
90 |   for (cell_type in cell_types) {
91 |     # bin genes by expression level
92 |     tested_genes = sc_sub %>%
93 |       filter(cell_type == !!cell_type) %>%
94 |       filter(gene %in% bulk_sub$gene) %>%
95 |       pull(gene)
96 |     bins = expr_summary %>%
97 |       filter(gene %in% tested_genes) %>%
98 |       filter(comparison == sc_comparison, cell_type == !!cell_type) %>%
99 |       arrange(mean) %>%
100 |       mutate(bin = cut(row_number() / n(),
101 |                        breaks = seq(0, args$n_bins) / args$n_bins),
102 |              bin = as.integer(bin)) %>%
103 |       split(.$bin)
104 | 
105 |     # run over each bin
106 |     for (bin in seq_len(args$n_bins)) {
107 |       bin_genes = bins[[bin]]$gene
108 |       sc_tmp = sc_sub %>%
109 |         filter(cell_type == !!cell_type, gene %in% bin_genes)
110 | 
111 |       tmp = template %>%
112 |         mutate(value = seq(nrow(template)) %>%
113 |                  map( ~ {
114 |                    print(template[., ])
115 |                    method = template$method[.]
116 |                    k = template$k[.]
117 |                    cor_method = template$cor_method[.]
118 | value = calculate_overlap( 119 | bulk_de = bulk_sub, 120 | sc_de = sc_tmp, 121 | method = method, 122 | k = k, 123 | cor_method = cor_method 124 | ) 125 | }) %>% 126 | unlist() 127 | ) %>% 128 | mutate(cell_type = cell_type, 129 | bin = bin, 130 | sc_label = sc_comparison, 131 | bulk_label = bulk_comparison) 132 | out %<>% bind_rows(tmp) 133 | } 134 | } 135 | 136 | # append to the main results container 137 | results %<>% bind_rows(out) 138 | } 139 | 140 | # save results 141 | saveRDS(results, output_file) 142 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/inner-run-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-run-concordance.R') 7 | parser$add_argument('--input_sc', type = 'character', required = T) 8 | parser$add_argument('--input_bulk', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | args = parser$parse_args() 11 | print(args) 12 | 13 | library(tidyverse) 14 | library(magrittr) 15 | library(Seurat) 16 | library(Matrix) 17 | library(RRHO) 18 | library(AUC) 19 | source("R/functions/calculate_overlap.R") 20 | source("R/analysis/bulk_concordance/write_grid.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | 26 | # load in files 27 | sc = readRDS(args$input_sc) 28 | bulk = readRDS(args$input_bulk) 29 | 30 | # define output file 31 | sc_name = gsub(".rds", "", basename(args$input_sc)) 32 | bulk_name = gsub(".rds", "", basename(args$input_bulk)) 33 | output_filename = paste0(sc_name, "|", bulk_name, ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # get all combinations of single-cell/bulk 37 | sc_idxs = names(sc) 38 | bulk_idxs = names(bulk) 39 | if (is.null(sc_idxs)) { 40 | sc_idxs = "1" 41 | names(sc) = '1' 42 | } 43 | if (is.null(bulk_idxs)) { 44 | bulk_idxs = "1" 45 | names(bulk) = '1' 46 | } 47 | comparisons = expand.grid(sc_comparison = sc_idxs, bulk_comparison = bulk_idxs, 48 | stringsAsFactors = F) 49 | 50 | # get rid of irrelevant comparisons from Cano-Gamez 2020 51 | if (grepl("CanoGamez2020", sc_name)) { 52 | keep = map2_lgl(comparisons$sc_comparison, 53 | comparisons$bulk_comparison, 54 | ~ grepl(.x, .y)) 55 | comparisons %<>% extract(keep, ) 56 | } 57 | 58 | results = data.frame() 59 | for (comparison_idx in seq_len(nrow(comparisons))) { 60 | message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons), 61 | " ...") 62 | 63 | # prepare data 64 | sc_comparison = comparisons$sc_comparison[comparison_idx] 65 | bulk_comparison = comparisons$bulk_comparison[comparison_idx] 66 | sc_sub = sc[[sc_comparison]] 67 | bulk_sub = bulk[[bulk_comparison]] 68 | comparison_label = paste0(sc_comparison, "|", bulk_comparison) 69 | 70 | # for Angelidis, filter to relevant cell types to prevent bugs 71 | if (grepl("Angelidis", sc_name)) { 72 | sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes", 73 | "Alveolar_macrophage")) 74 | } 75 | # same for Reyfman 76 | if (grepl("Reyfman", sc_name)) { 77 | sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages")) 78 | } 79 | 80 | # calculate concordance metrics for this comparison 81 | out = sc_sub %>% 82 | split(.$cell_type) %>% 83 | map( ~ { 84 | print(.$cell_type[1]) 85 | sc_tmp = . 
86 | tmp = template %>% 87 | mutate(value = seq(nrow(template)) %>% 88 | map( ~ { 89 | print(template[., ]) 90 | method = template$method[.] 91 | k = template$k[.] 92 | cor_method = template$cor_method[.] 93 | value = calculate_overlap( 94 | bulk_de = bulk_sub, 95 | sc_de = sc_tmp, 96 | method = method, 97 | k = k, 98 | cor_method = cor_method 99 | ) 100 | }) %>% 101 | unlist() 102 | ) 103 | }) %>% 104 | bind_rows(.id = 'cell_type') %>% 105 | mutate( 106 | sc_label = sc_comparison, 107 | bulk_label = bulk_comparison 108 | ) 109 | # bind to main results container 110 | results %<>% bind_rows(out) 111 | } 112 | 113 | # save results 114 | saveRDS(results, output_file) 115 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/outer-run-concordance-terciles.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset, 2 | # within each tercile of gene expression. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-run-concordance-terciles.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list bulk input files 19 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 20 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 21 | mutate(label = gsub("_.*|-.*", "", bulk_file)) %>% 22 | # manual fix for the Hagai datasets 23 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 24 | label)) %>% 25 | # restore the entire filepath 26 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 27 | 28 | # get single-cell comparison files 29 | sc_files = list.files(file.path(base_dir, "analysis/run_DE")) 30 | sc_inputs = data.frame(sc_file = sc_files) %>% 31 | mutate(label = gsub("_.*|-.*", "", sc_file)) %>% 32 | # manual fix for the Hagai datasets 33 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", sc_file), 34 | label)) 35 | 36 | # rep analysis grid over input files 37 | grid = bulk_inputs %>% 38 | left_join(sc_inputs) %>% 39 | drop_na() 40 | 41 | # add bins 42 | grid = tidyr::crossing(grid, n_bins = 3) 43 | 44 | # add expr_summary file as a parameter 45 | grid %<>% 46 | mutate(summary_file = file.path(base_dir, "analysis/expr_summary", 47 | paste0(label, '.txt.gz'))) 48 | 49 | # write the raw array 50 | grid_file = "sh/analysis/run_DE/grids/run_concordance_terciles.raw.txt" 51 | grid_dir = dirname(grid_file) 52 | if (!dir.exists(grid_dir)) 53 | dir.create(grid_dir, recursive = T) 54 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 55 | 56 | # define output directory where results are stored 57 | output_dir = file.path(base_dir, "analysis/run_concordance_terciles") 58 | 59 | # check which parameters are already complete 60 | overwrite = F 61 | grid0 = grid 62 | if (!overwrite) { 63 | grid0 = grid %>% 64 | mutate(output_filename = paste0(basename(sc_file) %>% 65 | gsub("\\.rds$", "", .), 66 | "|", 67 | basename(bulk_file) %>% 68 | gsub("\\.rds$", "", .), 69 | '-n_bins=', n_bins, 70 | '.rds'), 71 | output_file = file.path(output_dir, output_filename), 72 | exists = file.exists(output_file)) %>% 73 | filter(!exists) %>% 74 | 
dplyr::select(-output_file, -output_filename, -exists) 75 | } 76 | 77 | # limit to the 'gold standard' datasets 78 | grid0 %<>% filter(grepl("Hagai|CanoGamez|Reyfman|Angelidis", label)) 79 | 80 | # write the grid that still needs to be run 81 | write.table(grid0, "sh/analysis/run_DE/grids/run_concordance_terciles.txt", 82 | quote = F, row.names = F, sep = "\t") 83 | 84 | # finally, run the job on whatever system we're on 85 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 86 | script = file.path(sh_dir, "run_concordance_terciles.sh") 87 | submit_job(grid0, script, args$allocation, system) 88 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/outer-run-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-run-DE.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list bulk input files 17 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 18 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 19 | mutate(label = gsub("_.*|-.*", "", bulk_file)) %>% 20 | # manual fix for the Hagai datasets 21 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 22 | label)) %>% 23 | # manual fix for the CanoGamez proteomics 24 | mutate(label = fct_recode(label, 25 | "CanoGamez2020" = "CanoGamez2020:proteomics")) %>% 26 | # restore the entire filepath 27 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 28 | 29 | # get single-cell comparison files 30 | sc_files = list.files(file.path(base_dir, "analysis/run_DE")) 31 | sc_inputs = data.frame(sc_file = sc_files) %>% 32 | mutate(label = gsub("_.*|-.*", "", sc_file)) %>% 33 | # manual fix for the Hagai datasets 34 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", sc_file), label)) 35 | 36 | # rep analysis grid over input files 37 | grid = bulk_inputs %>% 38 | left_join(sc_inputs) %>% 39 | drop_na() 40 | 41 | # write the raw array 42 | grid_file = "sh/analysis/run_DE/grids/run_concordance.raw.txt" 43 | grid_dir = dirname(grid_file) 44 | if (!dir.exists(grid_dir)) 45 | dir.create(grid_dir, recursive = T) 46 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 47 | 48 | # define output directory where results are stored 49 | output_dir = file.path(base_dir, "analysis/run_concordance") 50 | 51 | # check which parameters are already complete 52 | overwrite = F 53 | grid0 = grid 54 | if (!overwrite) { 55 | grid0 = grid %>% 56 | mutate(output_filename = paste0(basename(sc_file) %>% 57 | gsub("\\.rds$", "", .), 58 | "|", 59 | basename(bulk_file) %>% 60 | gsub("\\.rds$", "", .), 61 | '.rds'), 62 | output_file = file.path(output_dir, output_filename), 63 | exists = file.exists(output_file)) %>% 64 | filter(!exists) %>% 65 | dplyr::select(-output_file, -output_filename, -exists) 66 | } 67 | 68 | # limit to the 'gold standard' datasets 69 | grid0 %<>% filter(grepl("Hagai|CanoGamez|Reyfman|Angelidis", label)) 70 | 71 | # write the grid that still needs to be run 72 | write.table(grid0, "sh/analysis/run_DE/grids/run_concordance.txt", 73 | quote = F, row.names = F, sep = "\t") 74 | 75 | # finally, run the 
job on whatever system we're on 76 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 77 | script = file.path(sh_dir, "run_concordance.sh") 78 | submit_job(grid0, script, args$allocation, system) 79 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/summarise_concordance-terciles.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input 8 | input_dir = file.path(base_dir, "analysis/run_concordance_terciles") 9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | # combine into a single file 16 | dat = dats %>% 17 | bind_rows(.id = 'comparison') %>% 18 | separate(comparison, c("sc", "bulk"), "\\|") %>% 19 | separate(sc, c("sc_dataset", "sc_test", "shuffle_replicates"), "-") %>% 20 | separate(bulk, c("bulk_dataset", "bulk_test", "n_bins"), "-") %>% 21 | mutate_at(vars(sc_test, bulk_test, shuffle_replicates, n_bins), function(x) 22 | gsub(".*=|.rds", "", x)) 23 | 24 | # save results 25 | output_file = "data/analysis/bulk_concordance/concordance_terciles.rds" 26 | output_dir = dirname(output_file) 27 | if (!dir.exists(output_dir)) 28 | dir.create(output_dir, recursive = T) 29 | saveRDS(dat, output_file) 30 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/summarise_concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis/run_concordance") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # combine into a single file 15 | dat = dats %>% 16 | bind_rows(.id = 'comparison') %>% 17 | separate(comparison, c("sc", "bulk"), "\\|") %>% 18 | separate(sc, c("sc_dataset", "sc_test", "shuffle_replicates"), "-") %>% 19 | separate(bulk, c("bulk_dataset", "bulk_test"), "-") %>% 20 | mutate_at(vars(sc_test, shuffle_replicates, bulk_test), 21 | ~ gsub("^.*=|\\.rds", "", .)) 22 | 23 | # save results 24 | output_file = "data/analysis/bulk_concordance/concordance_summary.rds" 25 | output_dir = dirname(output_file) 26 | if (!dir.exists(output_dir)) 27 | dir.create(output_dir, recursive = T) 28 | saveRDS(dat, output_file) 29 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/write_grid.R: -------------------------------------------------------------------------------- 1 | # write concordance array for scRNA-seq/bulk comparisons 2 | # FCC array 3 | fcc_opts = list( 4 | method = 'fcc', 5 | cor_method = 'spearman' 6 | ) 7 | fcc_array = do.call(expand.grid, c(fcc_opts, stringsAsFactors = F)) 8 | # AUCC array 9 | aucc_opts = list( 10 | method = 'aucc', 11 | k = c(100, 200, 500, 1000) 12 | ) 13 | aucc_array = do.call(expand.grid, c(aucc_opts, stringsAsFactors = F)) 14 | # create results template using all parameters of interest 15 | template = bind_rows( 16 | fcc_array, 17 | aucc_array 18 | ) 
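# for reference, `template` then contains five parameter combinations,
# with NA where a parameter does not apply to a method:
#   method  cor_method    k
#   fcc     spearman     NA
#   aucc    <NA>        100
#   aucc    <NA>        200
#   aucc    <NA>        500
#   aucc    <NA>       1000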
19 | -------------------------------------------------------------------------------- /R/analysis/confounds/inner-calculate-confounds.R: -------------------------------------------------------------------------------- 1 | # Calculate possible confounding factors to DE identified in the Augur paper: 2 | # - read depth (mean and total counts per cell type), 3 | # - 'gene depth' (number of genes detected per cell type), and 4 | # - 'cell depth' (number of cells sequenced per type) 5 | setwd("~/git/DE-analysis") 6 | options(stringsAsFactors = F) 7 | library(argparse) 8 | 9 | # parse arguments 10 | parser = ArgumentParser(prog = 'inner-calculate-confounds.R') 11 | parser$add_argument('--input_file', type = 'character', required = T) 12 | parser$add_argument('--output_dir', type = 'character', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | source("R/functions/get_comparisons.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | dataset = args$input_file %>% 26 | basename() %>% 27 | gsub("\\.rds$", "", .) 28 | output_filename = paste0(dataset, ".txt") 29 | output_file = file.path(args$output_dir, output_filename) 30 | 31 | # read input file and extract matrix/metadata 32 | sc = readRDS(args$input_file) 33 | expr = GetAssayData(sc, slot = 'counts') 34 | meta = sc@meta.data 35 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 36 | 37 | # get all combinations of conditions 38 | results = list() 39 | comparisons = get_comparisons(dataset, expr, meta) 40 | for (comparison_idx in seq_along(comparisons)) { 41 | comparison = comparisons[[comparison_idx]] 42 | comparison_name = names(comparisons)[comparison_idx] 43 | if (is.null(comparison_name)) 44 | comparison_name = 1 45 | 46 | message("[", comparison_idx, "/", length(comparisons), "] ", 47 | "analyzing comparison ", comparison_name, " ...") 48 | message("##############################") 49 | 50 | # get subset expression and metadata 51 | expr0 = comparison$expr 52 | meta0 = comparison$meta %>% 53 | set_rownames(colnames(expr0)) 54 | 55 | # reconstruct the Seurat object 56 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 57 | meta.data = meta0) 58 | 59 | # analyze each cell type in turn 60 | cell_types = unique(meta0$cell_type) 61 | for (cell_type_idx in seq_along(cell_types)) { 62 | cell_type = cell_types[cell_type_idx] 63 | message(" [", cell_type_idx, "/", length(cell_types), 64 | "] analyzing cell type: ", cell_type, " ...") 65 | 66 | # number of cells 67 | keep = which(meta0$cell_type == cell_type) 68 | n_cells = length(keep) 69 | 70 | # read depth per cell 71 | expr1 = expr0[, keep, drop = F] 72 | reads = colSums(expr1) 73 | read_depth_mean = mean(reads) 74 | read_depth_sum = sum(reads) 75 | 76 | # genes detected per cell 77 | n_genes = colSums(expr1 > 0) 78 | n_genes_mean = mean(n_genes) 79 | 80 | # append to results 81 | results %<>% rbind( 82 | data.frame(dataset = dataset, 83 | comparison = comparison_name, 84 | cell_type = cell_type, 85 | outcome = c("# of cells", 86 | "read depth (mean)", 87 | "read depth (sum)", 88 | "# of genes (mean)"), 89 | value = c(n_cells, 90 | read_depth_mean, 91 | read_depth_sum, 92 | n_genes_mean))) 93 | } 94 | } 95 | 96 | # write 97 | write.csv(results, output_file, row.names = F) 98 | system(paste("gzip --force", output_file)) 99 | 
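# the gzipped CSV written above is long-format, one row per cell type and
# outcome; e.g. (values illustrative):
#   dataset,comparison,cell_type,outcome,value
#   <dataset>,1,<cell type>,# of cells,2215
#   <dataset>,1,<cell type>,read depth (mean),1892.4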
-------------------------------------------------------------------------------- /R/analysis/confounds/outer-calculate-confounds.R: -------------------------------------------------------------------------------- 1 | # Calculate possible confounding factors to DE identified in the Augur paper: 2 | # - read depth (mean and total counts per cell type), 3 | # - 'gene depth' (number of genes detected per cell type), and 4 | # - 'cell depth' (number of cells sequenced per type) 5 | setwd("~/git/DE-analysis") 6 | options(stringsAsFactors = F) 7 | library(argparse) 8 | 9 | # parse arguments 10 | parser = ArgumentParser(prog = 'outer-calculate-confounds.R') 11 | parser$add_argument('--allocation', type = 'character') 12 | args = parser$parse_args() 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | source("R/functions/datasets.R") 17 | source("R/functions/submit_job.R") 18 | source("R/functions/detect_system.R") 19 | 20 | # list input files 21 | input_dir = file.path(base_dir, "rnaseq", "seurat") 22 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 23 | # grid is simply the list of input files 24 | grid = data.frame(input_file = input_files) 25 | 26 | # define output directory where results are stored 27 | output_dir = file.path(base_dir, "analysis/confounds") 28 | 29 | # check which parameters are already complete 30 | overwrite = F 31 | grid0 = grid 32 | if (!overwrite) { 33 | grid0 = grid %>% 34 | mutate(output_filename = paste0(basename(input_file) %>% 35 | gsub("\\.rds$", "", .), '.txt.gz'), 36 | output_file = file.path(output_dir, output_filename), 37 | exists = file.exists(output_file)) %>% 38 | filter(!exists) %>% 39 | dplyr::select(-output_file, -output_filename, -exists) 40 | } 41 | 42 | # write the grid that still needs to be run 43 | grid_file = "sh/analysis/confounds/grids/calculate_confounds.txt" 44 | grid_dir = dirname(grid_file) 45 | if (!dir.exists(grid_dir)) 46 | dir.create(grid_dir, recursive = T) 47 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 48 | 49 | # finally, run the job on whatever system we're on 50 | sh_dir = "~/git/DE-analysis/sh/analysis/confounds" 51 | script = file.path(sh_dir, "calculate_confounds.sh") 52 | submit_job(grid0, script, args$allocation, system) 53 | -------------------------------------------------------------------------------- /R/analysis/confounds/summarise-confounds.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # list files 8 | input_dir = file.path(base_dir, "analysis/confounds") 9 | input_files = list.files(input_dir, full.names = T, pattern = '*\\.txt\\.gz$') 10 | 11 | # read these all 12 | dats = map(input_files, read.csv) 13 | dat = do.call(rbind, dats) 14 | 15 | # save results 16 | output_file = "data/analysis/confounds/confounds.rds" 17 | output_dir = dirname(output_file) 18 | if (!dir.exists(output_dir)) 19 | dir.create(output_dir, recursive = T) 20 | saveRDS(dat, output_file) 21 | -------------------------------------------------------------------------------- /R/analysis/control_only/inner-control-only-spatial.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-control-only.R') 7 | 
parser$add_argument('--input_file', type = 'character', required = T)
8 | parser$add_argument('--de_test', type = 'character', required = T)
9 | parser$add_argument('--sample_idx', type = 'integer', required = T)
10 | parser$add_argument('--shuffle_replicates', type = 'character', required = T)
11 | parser$add_argument('--label', type = 'character', required = T)
12 | parser$add_argument('--output_dir', type = 'character', required = T)
13 | args = parser$parse_args()
14 | print(args)
15 | 
16 | library(tidyverse)
17 | library(magrittr)
18 | library(Seurat)
19 | library(Matrix)
20 | library(peakRAM)
21 | library(future)
22 | source("R/functions/get_comparisons.R")
23 | source("R/functions/run_DE.R")
24 | 
25 | # set up output filepath
26 | if (!dir.exists(args$output_dir))
27 |   dir.create(args$output_dir, recursive = T)
28 | dataset = args$input_file %>%
29 |   basename() %>%
30 |   gsub("\\.rds$", "", .)
31 | output_filename = paste0(dataset,
32 |                          '-de_test=', args$de_test,
33 |                          '-sample_idx=', args$sample_idx,
34 |                          '-shuffle_replicates=', args$shuffle_replicates,
35 |                          '-label=', args$label,
36 |                          ".rds")
37 | output_file = file.path(args$output_dir, output_filename)
38 | 
39 | # read input file and extract matrix/metadata
40 | sc = readRDS(args$input_file)
41 | expr = GetAssayData(sc, slot = 'counts')
42 | meta = sc@meta.data
43 | 
44 | # get all combinations of conditions
45 | comparisons = get_comparisons(dataset, expr, meta)
46 | if (is.null(names(comparisons))) {
47 |   names(comparisons) = '1'
48 | }
49 | 
50 | # iterate through comparisons
51 | results = list()
52 | for (comparison_name in names(comparisons)) {
53 |   comparison = comparisons[[comparison_name]]
54 | 
55 |   # get subset expression and metadata
56 |   expr0 = comparison$expr
57 |   meta0 = comparison$meta
58 | 
59 |   # filter to label of interest
60 |   meta0 %<>%
61 |     mutate(idx = row_number()) %>%
62 |     filter(grepl(args$label, label))
63 |   expr0 = expr0[, meta0$idx]
64 | 
65 |   # skip if there aren't enough replicates
66 |   if (n_distinct(meta0$replicate) < 6) {
67 |     message("skipping comparison: ", comparison_name, " (not enough replicates) ...")
68 |     next
69 |   }
70 | 
71 |   # set barcode as column
72 |   meta0 %<>%
73 |     as.data.frame() %>%
74 |     mutate(new_barcode = colnames(expr0))
75 | 
76 |   # re-assign the groups
77 |   reps = unique(meta0$replicate)
78 |   n_reps = length(reps)
79 |   ctrl = sample(reps, n_reps / 2)
80 |   meta0 %<>%
81 |     mutate(label = ifelse(replicate %in% ctrl, 'ctrl', 'stim')) %>%
82 |     set_rownames(.$new_barcode)
83 | 
84 |   # check for replicate shuffling
85 |   if (args$shuffle_replicates == "YES") {
86 |     meta0 %<>% group_by(cell_type, label) %>%
87 |       mutate(replicate = sample(replicate)) %>%
88 |       set_rownames(.$new_barcode)
89 |   }
90 | 
91 |   # reconstruct the Seurat object
92 |   sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0,
93 |                            meta.data = meta0)
94 | 
95 |   # run DE analysis
96 |   DE = run_DE(sc0, de_test = args$de_test)
97 | 
98 |   # add to results
99 |   results[[length(results) + 1]] = mutate(DE, comparison = comparison_name)
100 | }
101 | 
102 | # stop if empty
103 | if (length(results) == 0)
104 |   stop("couldn't get any results")
105 | 
106 | # save results
107 | saveRDS(results, output_file)
108 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/inner-control-only.R:
--------------------------------------------------------------------------------
1 | # Run single-cell or pseudobulk DE analyses on random splits of control
2 | # samples only.
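# example invocation (values illustrative; the Kang2018 dataset and its
# control label 'ctrl' are taken from the filters used elsewhere in this
# pipeline, and de_test can be any test listed in outer-control-only.R):
#   Rscript R/analysis/control_only/inner-control-only.R \
#     --input_file <base_dir>/rnaseq/seurat/Kang2018.rds \
#     --de_test pseudobulk_DESeq2,test?LRT \
#     --sample_idx 1 \
#     --shuffle_replicates NO \
#     --label ctrl \
#     --comparison 1 \
#     --output_dir <base_dir>/analysis/control_only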
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-control-only.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | parser$add_argument('--sample_idx', type = 'integer', required = T) 12 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 13 | parser$add_argument('--label', type = 'character', required = T) 14 | parser$add_argument('--comparison', type = 'character', required = T) 15 | parser$add_argument('--output_dir', type = 'character', required = T) 16 | args = parser$parse_args() 17 | print(args) 18 | 19 | library(tidyverse) 20 | library(magrittr) 21 | library(Seurat) 22 | library(Matrix) 23 | library(peakRAM) 24 | library(future) 25 | source("R/functions/get_comparisons.R") 26 | source("R/functions/run_DE.R") 27 | 28 | # set up output filepath 29 | if (!dir.exists(args$output_dir)) 30 | dir.create(args$output_dir, recursive = T) 31 | dataset = args$input_file %>% 32 | basename() %>% 33 | gsub("\\.rds$", "", .) 34 | output_filename = paste0(dataset, 35 | '-de_test=', args$de_test, 36 | '-sample_idx=', args$sample_idx, 37 | '-shuffle_replicates=', args$shuffle_replicates, 38 | '-label=', args$label, 39 | '-comparison=', args$comparison, 40 | ".rds") 41 | output_file = file.path(args$output_dir, output_filename) 42 | 43 | # read input file and extract matrix/metadata 44 | sc = readRDS(args$input_file) 45 | expr = GetAssayData(sc, slot = 'counts') 46 | meta = sc@meta.data 47 | 48 | # get all combinations of conditions 49 | comparisons = get_comparisons(dataset, expr, meta) 50 | if (is.null(names(comparisons))) { 51 | names(comparisons) = '1' 52 | } 53 | 54 | # grab comparison of interest 55 | comparison_name = args$comparison 56 | comparison = comparisons[[comparison_name]] 57 | 58 | # get subset expression and metadata 59 | expr0 = comparison$expr 60 | meta0 = comparison$meta 61 | 62 | # grab the correct label 63 | meta0 %<>% 64 | as.data.frame() %>% 65 | set_rownames(colnames(expr0)) %>% 66 | rownames_to_column(var = 'new_barcode') %>% 67 | filter(label == args$label) %>% 68 | set_rownames(.$new_barcode) 69 | expr0 %<>% extract(, rownames(meta0)) 70 | 71 | # re-assign the groups 72 | reps = unique(meta0$replicate) 73 | n_reps = length(reps) 74 | ctrl = sample(reps, n_reps/2) 75 | meta0 %<>% 76 | mutate(label = ifelse(replicate %in% ctrl, 'ctrl', 'stim')) %>% 77 | set_rownames(.$new_barcode) 78 | 79 | # check for replicate shuffling 80 | if (args$shuffle_replicates == "YES") { 81 | meta0 %<>% group_by(cell_type, label) %>% 82 | mutate(replicate = sample(replicate)) %>% 83 | set_rownames(.$new_barcode) 84 | } 85 | 86 | # reconstruct the Seurat object 87 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 88 | meta.data = meta0) 89 | 90 | # run DE analysis 91 | DE = run_DE(sc0, de_test = args$de_test) 92 | 93 | # stop if empty 94 | if (nrow(DE) == 0) 95 | stop("couldn't get any results") 96 | 97 | # save results 98 | saveRDS(DE, output_file) 99 | -------------------------------------------------------------------------------- /R/analysis/control_only/inner-expr-summary-control-only.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | 
parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--label', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | args = parser$parse_args() 11 | print(args) 12 | 13 | library(tidyverse) 14 | library(magrittr) 15 | library(Seurat) 16 | library(Matrix) 17 | source("R/functions/get_comparisons.R") 18 | 19 | # set up output filepath 20 | if (!dir.exists(args$output_dir)) 21 | dir.create(args$output_dir, recursive = T) 22 | dataset = args$input_file %>% 23 | basename() %>% 24 | gsub("\\.rds$", "", .) 25 | output_filename = paste0(dataset, "-label=", args$label, ".csv") 26 | output_file = file.path(args$output_dir, output_filename) 27 | 28 | # read input file and extract matrix/metadata 29 | sc = readRDS(args$input_file) 30 | expr = GetAssayData(sc, slot = 'counts') 31 | meta = sc@meta.data 32 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 33 | 34 | # filter to the condition of interest 35 | keep = which(meta$label == args$label) 36 | expr0 = expr[, keep, drop = F] 37 | meta0 = meta[keep, , drop = F] 38 | 39 | # analyze each cell type in turn 40 | results = data.frame() 41 | cell_types = unique(meta0$cell_type) 42 | for (cell_type_idx in seq_along(cell_types)) { 43 | cell_type = cell_types[cell_type_idx] 44 | message(" [", cell_type_idx, "/", length(cell_types), 45 | "] analyzing cell type: ", cell_type, " ...") 46 | 47 | # get cell-type-specific expression matrix 48 | keep = which(meta0$cell_type == cell_type) 49 | expr1 = expr0[, keep, drop = F] 50 | meta1 = meta0[keep, , drop = F] 51 | rownames(meta1) = colnames(expr1) 52 | 53 | # calculate statistics 54 | genes = rownames(expr1) 55 | means = Matrix::rowMeans(expr1) 56 | sds = sparseMatrixStats::rowSds(expr1) 57 | covs = sds / means 58 | pct_zeros = Matrix::rowSums(expr1 == 0) / ncol(expr1) 59 | 60 | # calculate logFC as defined in Seurat 61 | logFC = tryCatch({ 62 | sc0 = CreateSeuratObject(expr1, meta = meta1) %>% 63 | NormalizeData() 64 | Idents(sc0) = sc0$label 65 | mat = GetAssayData(sc0, slot = 'data') 66 | levels = levels(meta1$label) 67 | if (is.null(levels)) { 68 | levels = unique(meta1$label) 69 | } 70 | cells1 = WhichCells(sc0, idents = levels[1]) 71 | cells2 = WhichCells(sc0, idents = levels[2]) 72 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 73 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 74 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 75 | }, error = function(e) { return(NA_real_) }) 76 | 77 | # calculate pseudobulk variance 78 | pseudobulk_variance = tryCatch({ 79 | meta2 = meta1 %>% 80 | mutate(label = as.character(label), 81 | replicate = as.character(replicate)) 82 | mm = model.matrix(~ 0 + replicate, data = meta2) 83 | mat_mm = expr1 %*% mm 84 | # drop empty columns 85 | keep_samples = colSums(mat_mm) > 0 86 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 87 | # normalize 88 | mat_mm %<>% edgeR::cpm() 89 | # grab the variance for each gene 90 | vars = sparseMatrixStats::rowSds(mat_mm) 91 | vars %<>% setNames(rownames(mat_mm)) 92 | vars 93 | }, error = function(e) { return(NA_real_) }) 94 | 95 | # calculate shuffled pseudobulk variance 96 | shuffled_variance = tryCatch({ 97 | meta2 = meta1 %>% 98 | mutate(label = as.character(label), 99 | replicate = as.character(replicate)) %>% 100 | group_by(cell_type, label) %>% 101 | mutate(replicate = sample(replicate)) 102 | mm = model.matrix(~ 0 + replicate, data = meta2) 103 | mat_mm = expr1 %*% mm 104 | # drop empty 
columns 105 | keep_samples = colSums(mat_mm) > 0 106 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 107 | # normalize 108 | mat_mm %<>% edgeR::cpm() 109 | # grab the variance for each gene 110 | vars = sparseMatrixStats::rowSds(mat_mm) 111 | vars %<>% setNames(rownames(mat_mm)) 112 | vars 113 | }, error = function(e) { return(NA_real_) }) 114 | 115 | # calculate the ratio of real to shuffled variance 116 | ratio = pseudobulk_variance / shuffled_variance 117 | 118 | # convert to data frame 119 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 120 | pct_zero = pct_zeros, logFC = logFC, 121 | pseudobulk_variance = pseudobulk_variance, 122 | shuffled_variance = shuffled_variance, 123 | pseudobulk_ratio = ratio) %>% 124 | mutate(dataset = dataset, 125 | label = args$label, 126 | cell_type = cell_type) 127 | 128 | # append to results 129 | results %<>% bind_rows(df) 130 | } 131 | 132 | # rearrange columns 133 | results %<>% dplyr::select(dataset, label, cell_type, everything()) 134 | 135 | # write 136 | write.csv(results, output_file, row.names = F) 137 | system(paste("gzip --force", output_file)) 138 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-control-only-spatial.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on random splits of control 2 | # samples only in a spatial dataset. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-control-only.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "spatial", "seurat") 19 | input_files = list.files(input_dir, full.names = TRUE, pattern = '*rds') 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | # pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", 31 | "pseudobulk_edgeR,test?LRT", 32 | ## mixed model methods, implemented in Seurat 33 | "mixed_lm" 34 | ), 35 | sample_idx = 1, 36 | shuffle_replicates = c("NO", "YES") 37 | ) 38 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) %>% 39 | mutate(type = ifelse(grepl("pseudo|mixed", de_test), 'rep', 'single')) %>% 40 | # only do shuffle replicates in pseudobulk or mixed model 41 | filter(type != 'single' | shuffle_replicates != 'YES') %>% 42 | dplyr::select(-type) 43 | 44 | # rep analysis grid over input files 45 | grid %<>% 46 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 47 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 48 | left_join(inputs, by = 'input_file') %>% 49 | # reorder columns 50 | dplyr::select(input_file, de_test, sample_idx, shuffle_replicates) %>% 51 | # filter to mouse only 52 | filter(grepl("_mouse", input_file)) %>% 53 | # manually code control labels 54 | mutate(label = ifelse( 55 | gsub("\\.rds", "", basename(input_file)) == 'Maniatis2019_mouse', 56 | 'WT', NA)) 57 | 58 | # write the raw array 59 | grid_file = 
"sh/analysis/control_only/grids/control_only_spatial.raw.txt" 60 | grid_dir = dirname(grid_file) 61 | if (!dir.exists(grid_dir)) 62 | dir.create(grid_dir, recursive = T) 63 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 64 | 65 | # define output directory where results are stored 66 | output_dir = file.path(base_dir, "analysis/control_only/spatial") 67 | 68 | # check which parameters are already complete 69 | overwrite = F 70 | grid0 = grid 71 | if (!overwrite) { 72 | grid0 = grid %>% 73 | mutate(output_file = file.path(output_dir, paste0(basename(input_file) %>% 74 | gsub("\\.rds$", "", .), 75 | '-de_test=', de_test, 76 | '-sample_idx=', sample_idx, 77 | '-shuffle_replicates=', shuffle_replicates, 78 | '-label=', label, 79 | '.rds')), 80 | exists = file.exists(output_file)) %>% 81 | filter(!exists) %>% 82 | dplyr::select(-output_file, -exists) 83 | } 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/control_only/grids/control_only_spatial.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 91 | script = file.path(sh_dir, "control_only_spatial.sh") 92 | submit_job(grid0, script, args$allocation, system) 93 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-control-only.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on random splits of control 2 | # samples only. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-control-only.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list input files 19 | input_dir = file.path(base_dir, "rnaseq", "seurat") 20 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 21 | inputs = data.frame(input_file = input_files) 22 | 23 | # establish grid of analyses 24 | opts = list( 25 | de_test = c( 26 | ## single-cell methods, implemented in Seurat 27 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 28 | # pseudobulk methods 29 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 30 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 31 | "pseudobulk_edgeR,test?QLF", 32 | "pseudobulk_edgeR,test?LRT", 33 | ## mixed model methods, implemented in Seurat 34 | "mixed_lm" 35 | ), 36 | sample_idx = 1, 37 | shuffle_replicates = c("NO", "YES") 38 | ) 39 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) %>% 40 | mutate(type = ifelse(grepl("pseudo|mixed", de_test), 'rep', 'single')) %>% 41 | # only do shuffle replicates in pseudobulk or mixed model 42 | filter(type != 'single' | shuffle_replicates != 'YES') %>% 43 | dplyr::select(-type) 44 | 45 | # rep analysis grid over input files 46 | grid %<>% 47 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 48 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 49 | left_join(inputs, by = 'input_file') %>% 50 | # reorder columns 51 | dplyr::select(input_file, de_test, sample_idx, shuffle_replicates) 52 | 53 | # load in number of replicates for each dataset 54 | reps = 
readRDS("data/analysis/confounds/replicates.rds") 55 | 56 | # grab the conditions where we have enough replicates 57 | keep = reps %>% 58 | group_by(dataset, label, comparison) %>% 59 | summarise(n_reps = n()) %>% 60 | ungroup %>% 61 | filter(n_reps >= 6) 62 | 63 | # add this into the grid 64 | grid %<>% 65 | mutate(dataset = gsub(".rds$", "", basename(input_file))) %>% 66 | left_join(keep) %>% 67 | # drop the ones we aren't keeping 68 | drop_na() %>% 69 | dplyr::select(-dataset, -n_reps) 70 | 71 | # write the raw array 72 | grid_file = "sh/analysis/control_only/grids/control_only.raw.txt" 73 | grid_dir = dirname(grid_file) 74 | if (!dir.exists(grid_dir)) 75 | dir.create(grid_dir, recursive = T) 76 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 77 | 78 | # define output directory where results are stored 79 | output_dir = file.path(base_dir, "analysis/control_only") 80 | 81 | # check which parameters are already complete 82 | overwrite = F 83 | grid0 = grid 84 | if (!overwrite) { 85 | grid0 = grid %>% 86 | mutate(output_file = file.path(output_dir, paste0(basename(input_file) %>% 87 | gsub("\\.rds$", "", .), 88 | '-de_test=', de_test, 89 | '-sample_idx=', sample_idx, 90 | '-shuffle_replicates=', shuffle_replicates, 91 | '-label=', label, 92 | '-comparison=', comparison, 93 | '.rds')), 94 | exists = file.exists(output_file)) %>% 95 | filter(!exists) %>% 96 | dplyr::select(-output_file, -exists) 97 | } 98 | 99 | # write the grid that still needs to be run 100 | write.table(grid0, "sh/analysis/control_only/grids/control_only.txt", 101 | quote = F, row.names = F, sep = "\t") 102 | 103 | # finally, run the job on whatever system we're on 104 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 105 | script = file.path(sh_dir, "control_only.sh") 106 | submit_job(grid0, script, args$allocation, system) 107 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-expr-summary-control-only.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary-control-only.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | inputs = data.frame(input_file = input_files) 20 | grid = data.frame(input_file = input_files, dataset = datasets) 21 | 22 | # load in number of replicates for each dataset 23 | reps = readRDS("data/analysis/confounds/replicates.rds") 24 | 25 | # grab the conditions where we have enough replicates 26 | keep = reps %>% 27 | group_by(dataset, label, comparison) %>% 28 | summarise(n_reps = n()) %>% 29 | ungroup %>% 30 | filter(n_reps >= 6) %>% 31 | # ignore duplicate comparisons 32 | distinct(dataset, label, n_reps) 33 | 34 | # add this into the grid 35 | grid %<>% 36 | left_join(keep, by = 'dataset') %>% 37 | # drop the ones we aren't keeping 38 | drop_na() %>% 39 | dplyr::select(-n_reps) 40 | 41 | # define output directory where results are stored 42 | output_dir = file.path(base_dir, "analysis/expr_summary/control_only") 43 | 44 | # check which parameters 
are already complete 45 | overwrite = F 46 | grid0 = grid 47 | if (!overwrite) { 48 | grid0 = grid %>% 49 | mutate(output_filename = paste0(dataset, '-label=', label, '.csv.gz'), 50 | output_file = file.path(output_dir, output_filename), 51 | exists = file.exists(output_file)) %>% 52 | filter(!exists) %>% 53 | dplyr::select(-output_file, -output_filename, -exists, 54 | -dataset) 55 | } 56 | 57 | # write the grid that still needs to be run 58 | grid_file = "sh/analysis/control_only/grids/expr_summary.txt" 59 | grid_dir = dirname(grid_file) 60 | if (!dir.exists(grid_dir)) 61 | dir.create(grid_dir, recursive = T) 62 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 63 | 64 | # finally, run the job on whatever system we're on 65 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 66 | script = file.path(sh_dir, "expr_summary_control_only.sh") 67 | submit_job(grid0, script, args$allocation, system) 68 | -------------------------------------------------------------------------------- /R/analysis/control_only/summarise-control-only-DE-vs-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | library(data.table) 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "expr_summary", "control_only") 10 | input_files = list.files(input_dir, pattern = '*\\.csv\\.gz', full.names = TRUE) 11 | 12 | # we don't need to summarize all data, so let's filter here 13 | meta = data.frame(filename = basename(input_files)) %>% 14 | mutate(idx = row_number()) %>% 15 | separate(filename, into = c('dataset', 'label'), sep = '-') %>% 16 | mutate_all(~ gsub("^.*=|\\.csv\\.gz", "", .)) %>% 17 | type_convert() 18 | # filter to control groups in simple experiments only 19 | keep = c('Goldfarbmuren2020' = 'never', ## lung from never smokers 20 | 'Grubman2019' = 'Control', ## ALZ control brains 21 | 'Hrvatin2018' = '0h', ## light-deprived mice 22 | 'Huang2020' = 'control', ## colonic mucosa in healthy children 23 | 'Kang2018' = 'ctrl', ## unstimulated PBMCs 24 | 'Mathys2019' = 'Control', ## ALZ control brains 25 | 'Nagy2020' = 'Control', ## MDD control brains 26 | 'Reyfman2020' = 'Control', ## healthy lungs 27 | 'Rossi2019' = 'control', ## mice on a control diet 28 | 'Sathyamurthy2018' = 'control', ## healthy mouse spinal cord 29 | 'Smillie2019' = 'Healthy', ## healthy colon 30 | 'Tran2019' = 'Ctrl', ## uninjured RGCs 31 | 'Wilk2020' = 'Healthy', ## control PBMCs 32 | 'Wu2017' = 'control' ## control mice 33 | ) %>% 34 | data.frame(dataset = names(.), label = .) 
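# `keep` pairs each dataset with its control label, e.g.:
#   dataset      label
#   Kang2018     ctrl
#   Smillie2019  Healthy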
35 | 
36 | # filter metadata/files accordingly
37 | meta0 = inner_join(meta, keep, by = c('dataset', 'label'))
38 | input_files %<>% extract(meta0$idx)
39 | 
40 | # read all data
41 | dats = map(input_files, fread)
42 | # combine into a single data frame
43 | dat = bind_rows(dats)
44 | 
45 | # last, we also need to load the DE results
46 | DE = readRDS(file.path(base_dir, "analysis", "summary_data",
47 |                        "control_only.rds"))
48 | n_DE = readRDS("data/analysis/control_only/n_DE_genes.rds")
49 | 
50 | ## outcome 1: write mean delta-variance for each cell type in each dataset
51 | delta_vars = dat %>%
52 |   drop_na(pseudobulk_variance, shuffled_variance) %>%
53 |   mutate(delta_variance = shuffled_variance - pseudobulk_variance) %>%
54 |   group_by(dataset, label, cell_type) %>%
55 |   summarise(mean_delta = mean(delta_variance)) %>%
56 |   ungroup()
57 | saveRDS(delta_vars, "data/analysis/control_only/delta_variance.rds")
58 | 
59 | ## outcome 2: number of DE genes in each bin
60 | bins = 10
61 | # join per-gene delta-variance onto the DE results
62 | # (join keys assumed: dataset, label, cell_type, gene)
63 | xy = dat %>%
64 |   drop_na(pseudobulk_variance, shuffled_variance) %>%
65 |   mutate(delta_variance = shuffled_variance - pseudobulk_variance) %>%
66 |   dplyr::select(dataset, label, cell_type, gene, delta_variance) %>%
67 |   inner_join(DE, by = c('dataset', 'label', 'cell_type', 'gene'))
68 | xy0 = xy %>%
69 |   mutate(abs_delta_variance = abs(delta_variance))
70 | bin_results = xy0 %>%
71 |   # bin genes by absolute delta-variance
72 |   group_by(dataset, label, cell_type, de_test, shuffle_replicates) %>%
73 |   arrange(abs_delta_variance) %>%
74 |   mutate(bin = cut(row_number() / n(),
75 |                    breaks = seq(0, bins) / bins),
76 |          bin = as.integer(bin)) %>%
77 |   ungroup() %>%
78 |   # count DE genes in each bin
79 |   group_by(dataset, label, cell_type, de_test, shuffle_replicates, bin) %>%
80 |   summarise(genes = sum(p_val_adj < 0.05)) %>%
81 |   ungroup()
82 | saveRDS(bin_results, "data/analysis/control_only/genes_per_bin.rds")
83 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/summarise-control-only-n-DE-genes.R:
--------------------------------------------------------------------------------
1 | # Count the total number of DE genes in each control-only experiment.
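# since both pseudo-groups are drawn from control samples alone, any gene
# called significant here is a false discovery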
2 | setwd("~/git/DE-analysis")
3 | options(stringsAsFactors = F)
4 | library(tidyverse)
5 | library(magrittr)
6 | source("R/functions/recode_colnames.R")
7 | args = list(); source("R/functions/detect_system.R")
8 | 
9 | # list input files
10 | input_dir = file.path(base_dir, "analysis", "control_only")
11 | input_files = list.files(input_dir, pattern = 'rds', full.names = TRUE)
12 | 
13 | # we don't need to summarize all data, so let's filter here
14 | meta = data.frame(filename = basename(input_files)) %>%
15 |   mutate(idx = row_number()) %>%
16 |   separate(filename, into = c('dataset', 'de_test', 'sample_idx',
17 |                               'shuffle_replicates', 'label', 'comparison'),
18 |            sep = '-') %>%
19 |   mutate_all(~ gsub("^.*=|\\.rds", "", .)) %>%
20 |   type_convert() %>%
21 |   # remove superfluous columns
22 |   dplyr::select(-sample_idx)
23 | # filter to control groups in simple experiments only
24 | keep = c('Goldfarbmuren2020' = 'never', ## lung from never smokers
25 |          'Grubman2019' = 'Control', ## ALZ control brains
26 |          'Hrvatin2018' = '0h', ## light-deprived mice
27 |          'Huang2020' = 'control', ## colonic mucosa in healthy children
28 |          'Kang2018' = 'ctrl', ## unstimulated PBMCs
29 |          'Mathys2019' = 'Control', ## ALZ control brains
30 |          'Nagy2020' = 'Control', ## MDD control brains
31 |          'Reyfman2020' = 'Control', ## healthy lungs
32 |          'Rossi2019' = 'control', ## mice on a control diet
33 |          'Sathyamurthy2018' = 'control', ## healthy mouse spinal cord
34 |          'Smillie2019' = 'Healthy', ## healthy colon
35 |          'Tran2019' = 'Ctrl', ## uninjured RGCs
36 |          'Wilk2020' = 'Healthy', ## control PBMCs
37 |          'Wu2017' = 'control' ## control mice
38 | ) %>%
39 |   data.frame(dataset = names(.), label = .)
40 | 
41 | # filter metadata/files accordingly
42 | meta0 = inner_join(meta, keep, by = c('dataset', 'label'))
43 | input_files %<>% extract(meta0$idx)
44 | 
45 | # read all data
46 | dats = map(input_files, ~ readRDS(.x) %>%
47 |              # fix column names
48 |              recode_colnames() %>%
49 |              # fix p-values
50 |              group_by(cell_type) %>%
51 |              mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>%
52 |              ungroup()
53 | )
54 | 
55 | # combine into a single data frame
56 | dat = meta0 %>%
57 |   split(.$idx) %>%
58 |   map2(dats, ~ cbind(.x, .y)) %>%
59 |   bind_rows()
60 | # reorder columns
61 | dat0 = dat %>%
62 |   dplyr::select(dataset, comparison, label, cell_type,
63 |                 de_test, shuffle_replicates,
64 |                 gene, p_val, test_statistic, p_val_adj) %>%
65 |   # remove missing genes
66 |   drop_na(p_val)
67 | # remove duplicated data from multiple comparisons
68 | # these are irrelevant since DE takes place within controls
69 | dat0 %<>%
70 |   group_by(dataset) %>%
71 |   filter(comparison == first(comparison)) %>%
72 |   ungroup()
73 | 
74 | # save the full set of results
75 | saveRDS(dat0, file.path(base_dir, "analysis", "summary_data",
76 |                         "control_only.rds"))
77 | 
78 | # count the number of genes
79 | n_genes = dat0 %>%
80 |   group_by(dataset, comparison, label, cell_type, de_test,
81 |            shuffle_replicates) %>%
82 |   summarise(n = sum(p_val_adj < 0.05)) %>%
83 |   ungroup()
84 | 
85 | # write # of DE genes
86 | output_file = "data/analysis/control_only/n_DE_genes.rds"
87 | output_dir = dirname(output_file)
88 | if (!dir.exists(output_dir))
89 |   dir.create(output_dir, recursive = T)
90 | saveRDS(n_genes, output_file)
91 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/summarise-control-only-spatial-n-DE-genes.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | 
options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | source("R/functions/recode_colnames.R") 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "control_only", "spatial") 10 | input_files = list.files(input_dir, pattern = 'rds', full.names = TRUE) 11 | 12 | meta = data.frame(filename = basename(input_files)) %>% 13 | mutate(idx = row_number()) %>% 14 | separate(filename, into = c('dataset', 'de_test', 'sample_idx', 15 | 'shuffle_replicates', 'label'), sep = '-') %>% 16 | mutate_all(~ gsub("^.*=|\\.rds", "", .)) %>% 17 | type_convert() %>% 18 | # remove superfluous columns 19 | dplyr::select(-sample_idx) 20 | 21 | # read all data 22 | dats = map(input_files, ~ readRDS(.x) %>% 23 | bind_rows() %>% 24 | # fix column names 25 | recode_colnames() %>% 26 | # fix p-values 27 | group_by(comparison, cell_type) %>% 28 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 29 | ungroup()) 30 | 31 | # combine into a single data frame 32 | dat = meta %>% 33 | split(.$idx) %>% 34 | map2(dats, ~ cbind(.x, .y)) %>% 35 | bind_rows() 36 | # reorder columns 37 | dat0 = dat %>% 38 | dplyr::select(dataset, comparison, label, cell_type, 39 | de_test, shuffle_replicates, 40 | gene, p_val, test_statistic, p_val_adj) %>% 41 | # remove missing genes 42 | drop_na(p_val) 43 | 44 | # save the full set of results 45 | saveRDS(dat0, file.path(base_dir, "analysis", "summary_data", 46 | "control_only_spatial.rds")) 47 | 48 | # count the number of DE genes 49 | n_genes = dat0 %>% 50 | group_by(dataset, comparison, label, cell_type, de_test, 51 | shuffle_replicates) %>% 52 | summarise(n = sum(p_val_adj < 0.05)) %>% 53 | ungroup() 54 | 55 | # write # of DE genes 56 | output_file = "data/analysis/control_only/spatial/n_DE_genes.rds" 57 | output_dir = dirname(output_file) 58 | if (!dir.exists(output_dir)) 59 | dir.create(output_dir, recursive = TRUE) 60 | saveRDS(n_genes, output_file) 61 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/inner-write-delta-variance.R: -------------------------------------------------------------------------------- 1 | # Calculate the difference in variance between pseudobulks with biological and 2 | # shuffled replicates across all 46 datasets. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-write-delta-variance.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | args = parser$parse_args() 12 | print(args) 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | library(Seurat) 17 | library(Matrix) 18 | library(sparseMatrixStats) 19 | source("R/functions/datasets.R") 20 | source("R/functions/get_comparisons.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | dataset = args$input_file %>% 26 | basename() %>% 27 | gsub("\\.rds$", "", .)
28 | output_filename = paste0(dataset, ".rds") 29 | output_file = file.path(args$output_dir, output_filename) 30 | 31 | # read input file and extract matrix/metadata 32 | sc = readRDS(args$input_file) 33 | expr = GetAssayData(sc, slot = 'counts') 34 | meta = sc@meta.data 35 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 36 | 37 | # iterate through comparisons 38 | vars = list() 39 | comparisons = get_comparisons(dataset, expr, meta) 40 | for (comparison_idx in seq_along(comparisons)) { 41 | comparison = comparisons[[comparison_idx]] 42 | comparison_name = names(comparisons)[comparison_idx] 43 | if (is.null(comparison_name)) 44 | comparison_name = 1 45 | 46 | # get subset expression and metadata 47 | expr0 = comparison$expr 48 | meta0 = comparison$meta %>% 49 | set_rownames(colnames(expr0)) 50 | 51 | # analyze each cell type in turn 52 | cell_types = unique(meta0$cell_type) 53 | for (cell_type_idx in seq_along(cell_types)) { 54 | cell_type = cell_types[cell_type_idx] 55 | message(" [", cell_type_idx, "/", length(cell_types), 56 | "] analyzing cell type: ", cell_type, " ...") 57 | 58 | # get cell-type-specific expression matrix 59 | keep = which(meta0$cell_type == cell_type) 60 | expr1 = expr0[, keep, drop = F] 61 | meta1 = meta0[keep, , drop = F] 62 | rownames(meta1) = colnames(expr1) 63 | 64 | # catch cell types without replicates or conditions 65 | if (n_distinct(meta1$label) < 2 | 66 | n_distinct(meta1$replicate) < 3) { 67 | next 68 | } 69 | 70 | # shuffle replicates 71 | meta2 = meta1 %>% 72 | group_by(cell_type, label) %>% 73 | mutate(replicate = sample(replicate)) %>% 74 | ungroup() %>% 75 | set_rownames(colnames(expr1)) 76 | 77 | # summarise to pseudobulk matrices 78 | metadatas = list('biological replicates' = meta1, 79 | 'shuffled replicates' = meta2) 80 | grid = tidyr::crossing(replicate_type = names(metadatas)) 81 | 82 | tmp = data.frame() 83 | for (grid_idx in seq_len(nrow(grid))) { 84 | replicate_type = grid$replicate_type[grid_idx] 85 | model_matrix = metadatas[[replicate_type]] %>% 86 | ungroup() %>% 87 | mutate(label = as.character(label), 88 | replicate = as.character(replicate)) %>% 89 | model.matrix(~ 0 + replicate:label, data = .) 
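# (Aside on the next step: model_matrix is a cells x samples indicator
# matrix, so the single sparse product expr1 %*% model_matrix sums counts
# over the cells of each replicate:label pseudobulk sample. A toy sketch of
# the same idea, with made-up numbers:
#   mm = model.matrix(~ 0 + gl(2, 2))   # 4 cells assigned to 2 samples
#   matrix(1:12, nrow = 3) %*% mm       # 3 genes x 2 summed pseudobulks
# )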
90 | mat_mm = expr1 %*% model_matrix 91 | 92 | # drop empty columns 93 | keep_samples = colSums(mat_mm) > 0 94 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 95 | 96 | # filter genes with 0 variance 97 | sds = rowSds(mat_mm) 98 | mat_mm %<>% extract(sds > 0, ) 99 | 100 | # calculate CPM 101 | mat_mm %<>% edgeR::cpm() 102 | 103 | # calculate variances for each gene 104 | gene_vars = rowSds(mat_mm) 105 | 106 | # also calculate mean expression for each gene 107 | gene_means = rowMeans(mat_mm) 108 | 109 | # create output data frame 110 | df = data.frame(gene = rownames(mat_mm), 111 | mean = gene_means, 112 | variance = gene_vars, 113 | replicate_type = replicate_type 114 | ) %>% 115 | # tag dataset, comparison, cell type 116 | mutate(dataset = dataset, 117 | comparison = comparison_name, 118 | cell_type = cell_type) 119 | tmp %<>% bind_rows(df) 120 | } 121 | 122 | # summarise the shuffled-vs-biological variance comparison within this cell type 123 | summary = tmp %>% 124 | group_by(dataset, comparison, cell_type, gene) %>% 125 | filter(n() > 1) %>% 126 | mutate(cov = variance / mean, 127 | ratio = variance[replicate_type == 'shuffled replicates'] / 128 | variance[replicate_type == 'biological replicates'], 129 | delta = variance[replicate_type == 'shuffled replicates'] - 130 | variance[replicate_type == 'biological replicates'], 131 | delta_cov = cov[replicate_type == 'shuffled replicates'] - 132 | cov[replicate_type == 'biological replicates'], 133 | mean_var1 = mean(variance[replicate_type == 134 | 'shuffled replicates']), 135 | mean_var2 = mean(variance[replicate_type == 136 | 'biological replicates'])) %>% 137 | ungroup() %>% 138 | group_by(dataset, comparison, cell_type) %>% 139 | summarise(mean_ratio = mean(ratio, na.rm = TRUE), 140 | mean_delta = mean(delta, na.rm = TRUE), 141 | mean_delta_cov = mean(delta_cov, na.rm = TRUE), 142 | mean_var1 = mean(mean_var1, na.rm = TRUE), 143 | mean_var2 = mean(mean_var2, na.rm = TRUE)) %>% 144 | ungroup() %>% 145 | ## force everything to character 146 | map_dfc(as.character) 147 | 148 | # append 149 | vars %<>% bind_rows(summary) 150 | } 151 | } 152 | 153 | # save results 154 | saveRDS(vars, output_file) 155 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/outer-write-delta-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-write-delta-variance.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | # add Hagai plate data into this 20 | input_files %<>% c(file.path(input_dir, "Hagai2018_plate.rds")) 21 | # grid is simply the list of input files 22 | grid = data.frame(input_file = input_files) 23 | 24 | # define output directory where results are stored 25 | output_dir = file.path(base_dir, "analysis/delta_variance") 26 | 27 | # check which parameters are already complete 28 | overwrite = F 29 | grid0 = grid 30 | if (!overwrite) { 31 | grid0 = grid %>% 32 | mutate(output_filename = basename(input_file), 33 | output_file = file.path(output_dir, output_filename), 34 |
exists = file.exists(output_file)) %>% 35 | filter(!exists) %>% 36 | dplyr::select(-output_file, -output_filename, -exists) 37 | } 38 | 39 | # write the grid that still needs to be run 40 | grid_file = "sh/analysis/delta_variance/grids/delta_variance.txt" 41 | grid_dir = dirname(grid_file) 42 | if (!dir.exists(grid_dir)) 43 | dir.create(grid_dir, recursive = T) 44 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 45 | 46 | # finally, run the job on whatever system we're on 47 | sh_dir = "~/git/DE-analysis/sh/analysis/delta_variance" 48 | script = file.path(sh_dir, "delta_variance.sh") 49 | submit_job(grid0, script, args$allocation, system) 50 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/summarise-delta-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | source("R/functions/datasets.R") 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "delta_variance") 10 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 11 | 12 | # read all files 13 | dats = map(input_files, readRDS) 14 | dat = bind_rows(dats) %>% type_convert() 15 | 16 | # save results 17 | output_file = "data/analysis/delta_variance/delta_variance.rds" 18 | output_dir = dirname(output_file) 19 | if (!dir.exists(output_dir)) 20 | dir.create(output_dir, recursive = TRUE) 21 | saveRDS(dat, output_file) 22 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/inner-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | # Get concordance between scRNA-seq/pseudobulk DE and bulk DE 2 | # in downsampled data 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-downsample-cells-outcomes.R') 9 | parser$add_argument('--input_sc', type = 'character', required = T) 10 | parser$add_argument('--input_bulk', type = 'character', required = T) 11 | parser$add_argument('--output_dir', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(RRHO) 20 | library(AUC) 21 | source("R/functions/calculate_overlap.R") 22 | source("R/analysis/bulk_concordance/write_grid.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | 28 | # load in files 29 | sc = readRDS(args$input_sc) 30 | bulk = readRDS(args$input_bulk) 31 | 32 | # define output file 33 | sc_name = gsub(".rds", "", basename(args$input_sc)) 34 | bulk_name = gsub(".rds", "", basename(args$input_bulk)) 35 | output_filename = paste0(sc_name, "|", bulk_name, ".rds") 36 | output_file = file.path(args$output_dir, output_filename) 37 | 38 | # get all combinations of single-cell/bulk 39 | sc_idxs = names(sc) 40 | bulk_idxs = names(bulk) 41 | if (is.null(sc_idxs)) { 42 | sc_idxs = "1" 43 | names(sc) = '1' 44 | } 45 | if (is.null(bulk_idxs)) { 46 | bulk_idxs = "1" 47 | names(bulk) = '1' 48 | } 49 | comparisons = expand.grid(sc_idxs, bulk_idxs, stringsAsFactors = F) 50 | 51 | # get rid of irrelevant comparisons from Cano-Gamez 2020 52 | if (grepl("CanoGamez2020", 
sc_name)) { 53 | keep = map2_lgl(comparisons$Var1, comparisons$Var2, ~ grepl(.x, .y)) 54 | comparisons %<>% extract(keep, ) 55 | } 56 | 57 | results = c() 58 | for (comparison_idx in 1:nrow(comparisons)) { 59 | message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons), 60 | " ...") 61 | 62 | # prepare data 63 | sc_sub = sc[[comparisons[comparison_idx,]$Var1]] 64 | bulk_sub = bulk[[comparisons[comparison_idx,]$Var2]] %>% 65 | ## fix for Reyfman 66 | ungroup() 67 | sc_label = comparisons[comparison_idx,]$Var1 68 | bulk_label = comparisons[comparison_idx,]$Var2 69 | comparison_label = paste0(sc_label, "|", bulk_label) 70 | 71 | # for Angelidis, filter to relevant cell types to prevent bugs 72 | if (grepl("Angelidis", sc_name)) { 73 | sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes", 74 | "Alveolar_macrophage")) 75 | } 76 | # same for Reyfman 77 | if (grepl("Reyfman", sc_name)) { 78 | sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages")) 79 | } 80 | 81 | # calculate concordance metrics for this comparison 82 | out = sc_sub %>% 83 | split(.$cell_type) %>% 84 | map( ~ { 85 | print(.$cell_type[1]) 86 | sc_tmp = . 87 | tmp = template %>% 88 | mutate(value = seq(nrow(template)) %>% 89 | map( ~ { 90 | print(template[., ]) 91 | method = template[.,]$method 92 | k = template[.,]$k 93 | cor_method = template[.,]$cor_method 94 | value = calculate_overlap( 95 | bulk_de = bulk_sub, 96 | sc_de = sc_tmp, 97 | method = method, 98 | k = k, 99 | cor_method = cor_method) 100 | }) %>% 101 | unlist() 102 | ) 103 | }) %>% 104 | bind_rows(.id = 'cell_type') %>% 105 | mutate( 106 | sc_label = sc_label, 107 | bulk_label = bulk_label 108 | ) 109 | # bind to main results container 110 | results %<>% bind_rows(out) 111 | } 112 | 113 | # save results 114 | saveRDS(results, output_file) 115 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/inner-downsample-cells.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on downsampled datasets. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-downsample-cells.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | parser$add_argument('--n_cells', type = 'double', required = T) 12 | parser$add_argument('--sample_idx', type = 'integer', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | source("R/functions/run_DE.R") 21 | source("R/functions/get_comparisons.R") 22 | 23 | # set up output filepath 24 | if (!dir.exists(args$output_dir)) 25 | dir.create(args$output_dir, recursive = T) 26 | dataset = args$input_file %>% 27 | basename() %>% 28 | gsub("\\.rds$", "", .) 
29 | output_filename = paste0(dataset, 30 | "-de_test=", args$de_test, 31 | "-n_cells=", args$n_cells, 32 | "-sample_idx=", args$sample_idx, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | expr = GetAssayData(sc, slot = 'counts') 39 | meta = sc@meta.data 40 | 41 | # get all combinations of conditions 42 | results = list() 43 | comparisons = get_comparisons(dataset, expr, meta) 44 | 45 | for (comparison_idx in seq_along(comparisons)) { 46 | comparison = comparisons[[comparison_idx]] 47 | comparison_name = names(comparisons)[comparison_idx] 48 | if (is.null(comparison_name)) 49 | comparison_name = 1 50 | 51 | if (grepl("Hagai2018", dataset)) { 52 | # only do certain comparisons 53 | if (!comparison_name %in% c("lps4", "pic4")) 54 | next 55 | } 56 | 57 | message("[", comparison_idx, "/", length(comparisons), "] ", 58 | "analyzing comparison ", comparison_name, " ...") 59 | message("##############################") 60 | 61 | # get subset expression and metadata 62 | set.seed(args$sample_idx) 63 | meta0 = comparison$meta %>% 64 | # make sure rownames are correct 65 | set_rownames(colnames(comparison$expr)) %>% 66 | rownames_to_column(var = 'cell_barcode') %>% 67 | group_by(replicate) %>% 68 | mutate(cells = ceiling(args$n_cells * (n() / nrow(.)))) %>% 69 | sample_n(cells[1]) %>% 70 | ## maintaining the proportions, make sure n_cells is precise 71 | ungroup() %>% 72 | sample_n(args$n_cells) %>% 73 | set_rownames(.$cell_barcode) 74 | expr0 = comparison$expr %>% extract(, rownames(meta0)) 75 | 76 | # make some checks 77 | if (grepl("Reyfman2020", dataset)) { 78 | cell_types = c("AT2", "Alveolar macrophages") 79 | meta0 %<>% filter(cell_type %in% cell_types) %>% set_rownames(.$cell_barcode) 80 | expr0 %<>% extract(, rownames(meta0)) 81 | } else if (grepl("Angelidis2019", dataset)) { 82 | cell_types = c("Alveolar_macrophage", "Type_2_pneumocytes") 83 | meta0 %<>% filter(cell_type %in% cell_types) %>% set_rownames(.$cell_barcode) 84 | expr0 %<>% extract(, rownames(meta0)) 85 | } 86 | 87 | # reconstruct the Seurat object 88 | sc_downsampled = CreateSeuratObject(expr0, 89 | min.cells = 1, min.features = 0, 90 | meta.data = meta0) 91 | 92 | # run DE analysis 93 | DE = run_DE(sc_downsampled, de_test = args$de_test) 94 | 95 | # append to list 96 | results[[comparison_name]] = DE 97 | } 98 | 99 | # stop if empty 100 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 101 | stop("couldn't get any results") 102 | 103 | # save results 104 | saveRDS(results, output_file) 105 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/outer-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | # Calculate outcomes for single-cell or pseudobulk DE analyses on 2 | # downsampled datasets. 
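# (Aside on the downsampling in inner-downsample-cells.R above: cells are
# first drawn per replicate in proportion to replicate size, then a final
# ungrouped sample_n(args$n_cells) trims the rounded-up per-replicate counts
# so the total is exactly n_cells. Toy numbers: with n_cells = 100 and two
# replicates of 600 and 400 cells, ceiling() allocates 60 and 40 cells.)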
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-downsample-cells-outcomes.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list bulk input files 19 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 20 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 21 | mutate(label = gsub("-.*", "", bulk_file)) %>% 22 | # manual fix for the Hagai datasets 23 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 24 | label)) %>% 25 | # manually match a few sc datasets to their bulk data 26 | mutate(label = fct_recode(label, 27 | "Reyfman2020" = "Reyfman2020_alvmac", 28 | "Reyfman2020" = "Reyfman2020_AT2", 29 | "Angelidis2019" = "Angelidis2019_facsepi", 30 | "Angelidis2019" = "Angelidis2019_facsmac", 31 | "CanoGamez2020" = "CanoGamez2020:proteomics")) %>% 32 | # restore the entire filepath 33 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 34 | 35 | # list datasets 36 | inputs = data.frame(dataset = datasets) 37 | 38 | # establish grid of analyses 39 | opts = list( 40 | de_test = c( 41 | ## single-cell methods, implemented in Seurat 42 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 43 | ## pseudobulk methods 44 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 45 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 46 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 47 | ## mixed model methods, implemented in Seurat 48 | "mixed_lm", "mixed_nbinom", "mixed_poisson", 49 | ## slight adjustments to mixed model methods 50 | "mixed_lm,test?LRT", "mixed_nbinom,test?LRT", "mixed_poisson,test?LRT", 51 | "mixed_nbinom,offset?YES", "mixed_poisson,offset?YES", 52 | "mixed_nbinom,test?LRT,offset?YES", "mixed_poisson,test?LRT,offset?YES", 53 | ## pseudobulk methods with aggregation disabled 54 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 55 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 56 | "pseudobulk_limma,mode?voom,replicate?cells", 57 | "pseudobulk_limma,mode?trend,replicate?cells", 58 | "pseudobulk_edgeR,test?QLF,replicate?cells", 59 | "pseudobulk_edgeR,test?LRT,replicate?cells"), 60 | n_cells = c(25, 50, 100, 200, 500, 1000), 61 | sample_idx = 0 62 | ) 63 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 64 | 65 | # rep analysis grid over input files 66 | grid %<>% 67 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 68 | mutate(dataset = rep(inputs$dataset, nrow(grid))) %>% 69 | left_join(inputs, by = 'dataset') %>% 70 | # reorder columns 71 | dplyr::select(dataset, de_test, n_cells, sample_idx) %>% 72 | mutate(label = dataset) %>% 73 | # add in bulk file 74 | left_join(bulk_inputs) %>% 75 | # filter Angelidis when n_cells == 25 (won't run) 76 | filter(!grepl("Angelidis", dataset) | n_cells != 25) 77 | 78 | # now, reorganize the grid to map query/target file pairs 79 | query_dir = file.path(base_dir, "analysis/downsample_cells/DE") 80 | grid %<>% 81 | mutate(input_sc = file.path(query_dir, 82 | paste0(dataset, 83 | '-de_test=', de_test, 84 | '-n_cells=', n_cells, 85 | '-sample_idx=', sample_idx, 86 | '.rds'))) %>% 87 | dplyr::rename(input_bulk = bulk_file) %>% 88 | dplyr::select(input_sc, input_bulk) 89 | 90 | # write the raw array 91 | 
grid_file = "sh/analysis/downsample_cells/grids/downsample_cells.raw.txt" 92 | grid_dir = dirname(grid_file) 93 | if (!dir.exists(grid_dir)) 94 | dir.create(grid_dir, recursive = T) 95 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 96 | 97 | # define output directory where results are stored 98 | output_dir = file.path(base_dir, "analysis/downsample_cells/concordance") 99 | 100 | # check which parameters are already complete 101 | overwrite = F 102 | grid0 = grid 103 | if (!overwrite) { 104 | grid0 = grid %>% 105 | mutate(idx = row_number(), 106 | output_filename = paste0(basename(input_sc) %>% 107 | gsub("\\.rds$", "", .), 108 | "|", 109 | basename(input_bulk) %>% 110 | gsub("\\.rds$", "", .), 111 | '.rds'), 112 | output_file = file.path(output_dir, output_filename), 113 | exists = file.exists(output_file)) %>% 114 | filter(!exists) %>% 115 | dplyr::select(-idx, -output_file, -exists) 116 | } 117 | 118 | # just do hagai and Canogamez for now 119 | grid0 %<>% filter(grepl("Hagai|Cano|Reyfman|Angelidis", input_sc)) 120 | 121 | # write the grid that still needs to be run 122 | write.table(grid0, 123 | "sh/analysis/downsample_cells/grids/downsample_cells_outcomes.txt", 124 | quote = F, row.names = F, sep = "\t") 125 | 126 | # finally, run the job on whatever system we're on 127 | sh_dir = "~/git/DE-analysis/sh/analysis/downsample_cells" 128 | script = file.path(sh_dir, "downsample_cells_outcomes.sh") 129 | submit_job(grid0, script, args$allocation, system) 130 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/outer-downsample-cells.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on downsampled datasets. 
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-downsample-cells.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | ## pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 31 | ## mixed model methods, implemented in Seurat 32 | "mixed_lm", "mixed_nbinom", "mixed_poisson", 33 | ## slight adjustments to mixed model methods 34 | "mixed_lm,test?LRT", "mixed_nbinom,test?LRT", "mixed_poisson,test?LRT", 35 | "mixed_nbinom,offset?YES", "mixed_poisson,offset?YES", 36 | "mixed_nbinom,test?LRT,offset?YES", "mixed_poisson,test?LRT,offset?YES", 37 | ## pseudobulk methods with aggregation disabled 38 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 39 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 40 | "pseudobulk_limma,mode?voom,replicate?cells", 41 | "pseudobulk_limma,mode?trend,replicate?cells", 42 | "pseudobulk_edgeR,test?QLF,replicate?cells", 43 | "pseudobulk_edgeR,test?LRT,replicate?cells" 44 | ), 45 | n_cells = c(25, 50, 100, 200, 500, 1000), 46 | sample_idx = 0 47 | ) 48 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 49 | 50 | # rep analysis grid over input files 51 | grid %<>% 52 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 53 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 54 | left_join(inputs, by = 'input_file') %>% 55 | # reorder columns 56 | dplyr::select(input_file, de_test, n_cells, sample_idx) %>% 57 | # filter Angelidis when n_cells == 25 (won't run) 58 | filter(!grepl("Angelidis", input_file) | n_cells != 25) 59 | 60 | # write the raw array 61 | grid_file = "sh/analysis/downsample_cells/grids/downsample_cells.raw.txt" 62 | grid_dir = dirname(grid_file) 63 | if (!dir.exists(grid_dir)) 64 | dir.create(grid_dir, recursive = T) 65 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 66 | 67 | # define output directory where results are stored 68 | output_dir = file.path(base_dir, "analysis/downsample_cells/DE") 69 | 70 | # check which parameters are already complete 71 | overwrite = F 72 | grid0 = grid 73 | if (!overwrite) { 74 | grid0 = grid %>% 75 | mutate(idx = row_number(), 76 | output_file = file.path(output_dir, 77 | paste0(basename(input_file) %>% 78 | gsub("\\.rds$", "", .), 79 | '-de_test=', de_test, 80 | '-n_cells=', n_cells, 81 | '-sample_idx=', sample_idx, 82 | '.rds')), 83 | exists = file.exists(output_file)) %>% 84 | filter(!exists) %>% 85 | dplyr::select(-idx, -output_file, -exists) 86 | } 87 | 88 | # just do bulk datasets for now 89 | grid0 %<>% filter(grepl("Hagai|Cano|Reyfman|Angelidis", input_file)) 90 | 91 | # write the grid that still needs to be run 92 | write.table(grid0, "sh/analysis/downsample_cells/grids/downsample_cells.txt", 93 | quote = F, 
row.names = F, sep = "\t") 94 | 95 | # finally, run the job on whatever system we're on 96 | sh_dir = "~/git/DE-analysis/sh/analysis/downsample_cells" 97 | script = file.path(sh_dir, "downsample_cells.sh") 98 | submit_job(grid0, script, args$allocation, system) 99 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/summarise-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input directory 8 | input_dir = file.path(base_dir, "analysis/downsample_cells/concordance") 9 | input_files = list.files(input_dir, full.names = T, pattern = '*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | # combine into a single file 16 | dat = dats %>% 17 | bind_rows(.id = 'comparison') %>% 18 | separate(comparison, c("sc", "bulk"), "\\|") %>% 19 | separate(sc, c("dataset", "de_test", "n_cells", "sample_idx"), sep = "-") %>% 20 | mutate(de_test = gsub("!", "\\|", de_test)) %>% 21 | separate(bulk, c("bulk_dataset", "bulk_test"), "-") %>% 22 | mutate_at(vars(dataset, de_test, n_cells, sample_idx, bulk_test), 23 | ~ gsub(".*=|.rds", "", .)) %>% 24 | # remove old bulk test framework 25 | filter(!bulk_test %in% c("bulk_limma", "bulk_DESeq2", "bulk_edgeR") | 26 | # ... but keep published Reyfman analysis 27 | (bulk_test == 'bulk_DESeq2' & grepl("Reyfman", dataset))) 28 | 29 | # save results 30 | output_file = "data/analysis/downsample_cells/concordance_summary.rds" 31 | output_dir = dirname(output_file) 32 | if (!dir.exists(output_dir)) 33 | dir.create(output_dir, recursive = T) 34 | saveRDS(dat, output_file) 35 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/inner-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--output_dir', type = 'character', required = T) 9 | args = parser$parse_args() 10 | print(args) 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | library(Seurat) 15 | library(Matrix) 16 | source("R/functions/get_comparisons.R") 17 | 18 | # set up output filepath 19 | if (!dir.exists(args$output_dir)) 20 | dir.create(args$output_dir, recursive = T) 21 | dataset = args$input_file %>% 22 | basename() %>% 23 | gsub("\\.rds$", "", .) 
24 | output_filename = paste0(dataset, ".txt") 25 | output_file = file.path(args$output_dir, output_filename) 26 | 27 | # read input file and extract matrix/metadata 28 | sc = readRDS(args$input_file) 29 | expr = GetAssayData(sc, slot = 'counts') 30 | meta = sc@meta.data 31 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 32 | 33 | # get all combinations of conditions 34 | results = list() 35 | comparisons = get_comparisons(dataset, expr, meta) 36 | for (comparison_idx in seq_along(comparisons)) { 37 | comparison = comparisons[[comparison_idx]] 38 | comparison_name = names(comparisons)[comparison_idx] 39 | if (is.null(comparison_name)) 40 | comparison_name = 1 41 | 42 | message("[", comparison_idx, "/", length(comparisons), "] ", 43 | "analyzing comparison ", comparison_name, " ...") 44 | message("##############################") 45 | 46 | # get subset expression and metadata 47 | expr0 = comparison$expr 48 | meta0 = comparison$meta 49 | 50 | # analyze each cell type in turn 51 | cell_types = unique(meta0$cell_type) 52 | for (cell_type_idx in seq_along(cell_types)) { 53 | cell_type = cell_types[cell_type_idx] 54 | message(" [", cell_type_idx, "/", length(cell_types), 55 | "] analyzing cell type: ", cell_type, " ...") 56 | 57 | # get cell-type-specific expression matrix 58 | keep = which(meta0$cell_type == cell_type) 59 | expr1 = expr0[, keep, drop = F] 60 | meta1 = meta0[keep, , drop = F] 61 | rownames(meta1) = colnames(expr1) 62 | 63 | # calculate statistics 64 | genes = rownames(expr1) 65 | means = Matrix::rowMeans(expr1) 66 | sds = sparseMatrixStats::rowSds(expr1) 67 | covs = sds / means 68 | pct_zeros = Matrix::rowSums(expr1 == 0) / ncol(expr1) 69 | 70 | # calculate logFC as defined in Seurat 71 | logFC = tryCatch({ 72 | sc0 = CreateSeuratObject(expr1, meta = meta1) %>% 73 | NormalizeData() 74 | Idents(sc0) = sc0$label 75 | mat = GetAssayData(sc0, slot = 'data') 76 | levels = levels(meta1$label) 77 | if (is.null(levels)) { 78 | levels = unique(meta1$label) 79 | } 80 | cells1 = WhichCells(sc0, idents = levels[1]) 81 | cells2 = WhichCells(sc0, idents = levels[2]) 82 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 83 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 84 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 85 | }, error = function(e) { return(NA_real_) }) 86 | 87 | # calculate pseudobulk variance 88 | pseudobulk_variance = tryCatch({ 89 | meta2 = meta1 %>% 90 | mutate(label = as.character(label), 91 | replicate = as.character(replicate)) 92 | mm = model.matrix(~ 0 + replicate:label, data = meta2) 93 | mat_mm = expr1 %*% mm 94 | # drop empty columns 95 | keep_samples = colSums(mat_mm) > 0 96 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 97 | # normalize 98 | mat_mm %<>% edgeR::cpm() 99 | # grab the variance for each gene 100 | vars = sparseMatrixStats::rowSds(mat_mm) 101 | vars %<>% setNames(rownames(mat_mm)) 102 | vars 103 | }, error = function(e) { return(NA_real_) }) 104 | 105 | # calculate shuffled pseudobulk variance 106 | shuffled_variance = tryCatch({ 107 | meta2 = meta1 %>% 108 | mutate(label = as.character(label), 109 | replicate = as.character(replicate)) %>% 110 | group_by(cell_type, label) %>% 111 | mutate(replicate = sample(replicate)) 112 | mm = model.matrix(~ 0 + replicate:label, data = meta2) 113 | mat_mm = expr1 %*% mm 114 | # drop empty columns 115 | keep_samples = colSums(mat_mm) > 0 116 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 117 | # normalize 118 | mat_mm %<>% edgeR::cpm() 119 | # grab the variance 
for each gene 120 | vars = sparseMatrixStats::rowSds(mat_mm) 121 | vars %<>% setNames(rownames(mat_mm)) 122 | vars 123 | }, error = function(e) { return(NA_real_) }) 124 | 125 | # calculate the ratio of real to shuffled variance 126 | ratio = pseudobulk_variance / shuffled_variance 127 | 128 | # convert to data frame 129 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 130 | pct_zero = pct_zeros, logFC = logFC, 131 | pseudobulk_variance = pseudobulk_variance, 132 | shuffled_variance = shuffled_variance, 133 | pseudobulk_ratio = ratio) %>% 134 | mutate(dataset = dataset, 135 | comparison = comparison_name, 136 | cell_type = cell_type) 137 | 138 | # append to results 139 | results %<>% bind_rows(df) 140 | } 141 | } 142 | 143 | # rearrange columns 144 | results %<>% dplyr::select(dataset, comparison, cell_type, everything()) 145 | 146 | # write 147 | write.csv(results, output_file, row.names = F) 148 | system(paste("gzip --force", output_file)) 149 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/outer-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | # add Hagai plate data into this 20 | input_files %<>% c(file.path(input_dir, "Hagai2018_plate.rds")) 21 | # grid is simply the list of input files 22 | grid = data.frame(input_file = input_files) 23 | 24 | # define output directory where results are stored 25 | output_dir = file.path(base_dir, "analysis/expr_summary") 26 | 27 | # check which parameters are already complete 28 | overwrite = F 29 | grid0 = grid 30 | if (!overwrite) { 31 | grid0 = grid %>% 32 | mutate(output_filename = paste0(basename(input_file) %>% 33 | gsub("\\.rds$", "", .), '.txt.gz'), 34 | output_file = file.path(output_dir, output_filename), 35 | exists = file.exists(output_file)) %>% 36 | filter(!exists) %>% 37 | dplyr::select(-output_file, -output_filename, -exists) 38 | } 39 | 40 | # write the grid that still needs to be run 41 | grid_file = "sh/analysis/expr_summary/grids/expr_summary.txt" 42 | grid_dir = dirname(grid_file) 43 | if (!dir.exists(grid_dir)) 44 | dir.create(grid_dir, recursive = T) 45 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 46 | 47 | # finally, run the job on whatever system we're on 48 | sh_dir = "~/git/DE-analysis/sh/analysis/expr_summary" 49 | script = file.path(sh_dir, "expr_summary.sh") 50 | submit_job(grid0, script, args$allocation, system) 51 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/summarise-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | library(data.table) 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | summary_dir = file.path(base_dir, "analysis", 
"expr_summary") 10 | summary_files = list.files(summary_dir, full.names = TRUE, pattern = "*gz") 11 | 12 | # read them all 13 | dats = map(summary_files, fread) 14 | 15 | # combine 16 | dat = do.call(rbind, dats) 17 | 18 | # pick one comparison per dataset 19 | confounds = readRDS("data/analysis/confounds/confounds.rds") %>% 20 | # fix a couple datasets 21 | mutate(comparison = ifelse(grepl("Schafflick|Der", dataset), 22 | gsub("^.*_", "", dataset), 23 | ifelse(grepl("Hagai", dataset), 24 | paste0(gsub("^.*_", "", dataset), "|", 25 | comparison), 26 | comparison)), 27 | dataset = ifelse(grepl("Schafflick|Der|Hagai", dataset), 28 | gsub("_.*$", "", dataset), dataset)) 29 | # pick the comparison with the most cells 30 | most_cells = confounds %>% 31 | filter(outcome == '# of cells') %>% 32 | group_by(dataset, comparison) %>% 33 | summarise(total_cells = sum(value), 34 | n_cell_types = n_distinct(cell_type)) %>% 35 | ungroup() %>% 36 | group_by(dataset) %>% 37 | arrange(desc(total_cells), desc(n_cell_types)) %>% 38 | dplyr::slice(1) %>% 39 | ungroup() %>% 40 | dplyr::select(dataset, comparison) 41 | n_distinct(most_cells$dataset) 42 | 43 | # filter to these comparisons 44 | dat0 = dat %>% 45 | mutate(comparison = ifelse(grepl("Schafflick|Der", dataset), 46 | gsub("^.*_", "", dataset), 47 | ifelse(grepl("Hagai", dataset), 48 | paste0(gsub("^.*_", "", dataset), "|", 49 | comparison), 50 | comparison)), 51 | dataset = ifelse(grepl("Schafflick|Der|Hagai", dataset), 52 | gsub("_.*$", "", dataset), dataset)) %>% 53 | inner_join(most_cells, by = c('dataset', 'comparison')) 54 | 55 | # write 56 | saveRDS(dat0, "data/analysis/expr_summary/expr_summary.rds") 57 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/inner-extract-FPs.R: -------------------------------------------------------------------------------- 1 | # Extract summary statistics for the top-ranking false-positives and 2 | # false-negatives from each DE method. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = FALSE) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-extract-FPs.R') 9 | parser$add_argument('--label', type = 'character', required = TRUE) 10 | parser$add_argument('--sc_file', type = 'character', required = TRUE) 11 | parser$add_argument('--bulk_file', type = 'character', required = TRUE) 12 | parser$add_argument('--summary_file', type = 'character', required = TRUE) 13 | parser$add_argument('--output_file', type = 'character', required = TRUE) 14 | args = parser$parse_args() 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | 19 | # read single-cell and bulk DE 20 | sc_de = readRDS(args$sc_file) 21 | bulk_de = readRDS(args$bulk_file) 22 | 23 | # read expr summary 24 | expr_summary = read.csv(args$summary_file) 25 | 26 | # set up output containers 27 | FP = FN = data.frame() 28 | 29 | # iterate through single-cell comparisons 30 | label = args$label 31 | for (comparison_idx in seq_along(sc_de)) { 32 | sc_sub = sc_de[[comparison_idx]] 33 | sc_comparison = names(sc_de)[comparison_idx] 34 | if (is.null(sc_comparison)) 35 | sc_comparison = 1 36 | 37 | # filter comparisons 38 | if (grepl("Hagai2018", label) & !sc_comparison %in% c("lps4", "pic4")) { 39 | message(".. 
skipping comparison ", sc_comparison, "...") 40 | next 41 | } 42 | 43 | # iterate through cell types in the single-cell data 44 | cell_types = unique(sc_sub$cell_type) 45 | ## for Reyfman/Angelidis, only do select cell types 46 | if (grepl("Reyfman2020", label)) { 47 | cell_types = ifelse(grepl("AT2", label), "AT2", "Alveolar macrophages") 48 | } else if (grepl("Angelidis2019", label)) { 49 | cell_types = ifelse(grepl("alvmac", label), "Alveolar_macrophage", 50 | "Type_2_pneumocytes") 51 | } 52 | for (cell_type in cell_types) { 53 | message(".. analyzing cell type ", cell_type, " in comparison ", 54 | sc_comparison, "...") 55 | sc = filter(sc_sub, cell_type == !!cell_type) 56 | 57 | # get the relevant bulk comparisons 58 | if (grepl("Hagai2018", label)) { 59 | bulk_comparison = toupper(sc_comparison) 60 | bulk = bulk_de[[bulk_comparison]] 61 | } else if (label == "CanoGamez2020") { 62 | bulk_comparison = paste0('Resting|', sc_comparison, '|', cell_type, '|5d') 63 | bulk = bulk_de[[bulk_comparison]] 64 | } else if (grepl("Reyfman2020|Angelidis2019", label)) { 65 | bulk_comparison = '1' 66 | bulk = bulk_de[[1]] %>% ungroup() 67 | } else { 68 | stop("not sure what to do with label: ", label) 69 | } 70 | 71 | # fix column names 72 | fix_colnames = function(df) { 73 | colnames(df) %<>% 74 | fct_recode('p_val' = 'p.value', ## t/wilcox 75 | 'p_val' = 'pvalue', ## DESeq2 76 | ## (each input table carries only a subset of these columns) 77 | 'p_val' = 'P.Value', ## limma 78 | 'p_val' = 'PValue' , ## edgeR 79 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 80 | 'p_val_adj' = 'adj.P.Val', ## limma 81 | 'p_val_adj' = 'FDR', ## edgeR 82 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 83 | 'avg_logFC' = 'logFC', ## limma/edgeR 84 | 'test_statistic' = 'stat', ## DESeq2 85 | 'test_statistic' = 'F', ## edgeR 86 | 'test_statistic' = 't', ## limma 87 | 'test_statistic' = 'LR', ## edgeR LRT 88 | 'test_statistic' = 'statistic' ## t 89 | ) %>% 90 | as.character() 91 | return(df) 92 | } 93 | sc %<>% fix_colnames() 94 | bulk %<>% fix_colnames() 95 | 96 | # call FPs 97 | ns_sc = sc %>% 98 | # replace Bonferroni with BH correction 99 | mutate(padj = p.adjust(p_val, 'BH')) %>% 100 | filter(padj > 0.1) %>% 101 | pull(gene) 102 | ns_bulk = filter(bulk, p_val_adj > 0.1) %>% pull(gene) 103 | 104 | # single-cell FPs 105 | fps = sc %>% 106 | arrange(p_val) %>% 107 | filter(gene %in% ns_bulk) %>% 108 | head(200) %>% 109 | mutate(rank = row_number(), 110 | sc_comparison = sc_comparison, 111 | bulk_comparison = bulk_comparison) %>% 112 | dplyr::select(sc_comparison, bulk_comparison, cell_type, 113 | rank, gene, everything()) 114 | if ("runtime" %in% colnames(fps)) { 115 | fps %<>% dplyr::select(-runtime, -mem_usage) 116 | } 117 | 118 | # single-cell FNs 119 | fn_genes = bulk %>% 120 | arrange(p_val) %>% 121 | filter(gene %in% ns_sc) %>% 122 | filter(!duplicated(gene)) %>% 123 | head(200) %>% 124 | pull(gene) 125 | fns = sc %>% 126 | filter(gene %in% fn_genes) %>% 127 | # order by bulk p-values 128 | right_join(data.frame(gene = fn_genes), by = 'gene') %>% 129 | mutate(rank = row_number(), 130 | sc_comparison = sc_comparison, 131 | bulk_comparison = bulk_comparison) %>% 132 | dplyr::select(sc_comparison, bulk_comparison, cell_type, 133 | rank, gene, everything()) 134 | if ("runtime" %in% colnames(fns)) { 135 | fns %<>% dplyr::select(-runtime, -mem_usage) 136 | } 137 | 138 | # merge in expression summary to both 139 | summary0 = filter(expr_summary, 140 | cell_type == !!cell_type, 141 | comparison == sc_comparison) %>% 142 | dplyr::rename(sc_comparison =
comparison) %>% 143 | dplyr::select(-dataset) 144 | fps %<>% left_join(summary0, by = c('cell_type', 'sc_comparison', 'gene')) 145 | fns %<>% left_join(summary0, by = c('cell_type', 'sc_comparison', 'gene')) 146 | 147 | # append to results 148 | FP %<>% bind_rows(fps) 149 | FN %<>% bind_rows(fns) 150 | } 151 | } 152 | 153 | # construct output 154 | output = list(FPs = FP, FNs = FN) 155 | 156 | # create output directory, if it doesn't exist 157 | output_dir = dirname(args$output_file) 158 | if (!dir.exists(output_dir)) 159 | dir.create(output_dir, recursive = TRUE) 160 | # save results 161 | saveRDS(output, args$output_file) 162 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/outer-extract-FPs.R: -------------------------------------------------------------------------------- 1 | # Extract summary statistics for the top-ranking false-positives and 2 | # false-negatives from each DE method. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = FALSE) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-extract-FPs.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # set up grid 19 | opts = list( 20 | sc_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 21 | 'Angelidis2019', 22 | 'CanoGamez2020', 23 | 'Reyfman2020'), 24 | bulk_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 25 | 'Angelidis2019_facsepi', 26 | 'Angelidis2019_facsmac', 27 | 'CanoGamez2020', 28 | 'Reyfman2020_alvmac', 29 | 'Reyfman2020_AT2'), 30 | sc_test = c( 31 | ## single-cell methods, implemented in Seurat 32 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 33 | ## pseudobulk methods 34 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 35 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 36 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 37 | ## mixed model, implemented in Seurat 38 | "mixed_lm", 39 | ## pseudobulk methods with aggregation disabled 40 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 41 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 42 | "pseudobulk_limma,mode?voom,replicate?cells", 43 | "pseudobulk_limma,mode?trend,replicate?cells", 44 | "pseudobulk_edgeR,test?QLF,replicate?cells", 45 | "pseudobulk_edgeR,test?LRT,replicate?cells"), 46 | bulk_test = c("bulk_DESeq2,test?LRT", 47 | "bulk_DESeq2,test?Wald", 48 | "bulk_limma,mode?voom", 49 | "bulk_limma,mode?trend", 50 | "bulk_edgeR,test?LRT", 51 | "bulk_edgeR,test?QLF"), 52 | shuffle_replicates = c("NO", "YES") 53 | ) 54 | grid = do.call(tidyr::crossing, opts) %>% 55 | # matching datasets 56 | extract(map2_lgl(.$sc_dataset, .$bulk_dataset, ~ grepl(.x, .y)), ) 57 | 58 | # write the raw array 59 | grid_file = "sh/analysis/extract_FPs/grids/extract_FPs.raw.txt" 60 | grid_dir = dirname(grid_file) 61 | if (!dir.exists(grid_dir)) 62 | dir.create(grid_dir, recursive = T) 63 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 64 | 65 | # define output directory where results are stored 66 | output_dir = file.path(base_dir, "analysis", "extract_FPs") 67 | 68 | # now, check for which parameters are already complete 69 | overwrite = F 70 | grid0 = grid 71 | if (overwrite == F) { 72 | grid0 = grid %>% 73 | # for Reyfman2020, recode bulk 
test 74 | mutate(bulk_test = ifelse(grepl("Reyfman2020", sc_dataset), 75 | 'bulk_DESeq2', bulk_test)) %>% 76 | distinct() %>% 77 | # set up single-cell DE, bulk DE, and expr summary filepaths 78 | mutate(sc_dir = file.path(base_dir, 'analysis', 'run_DE'), 79 | sc_filename = paste0(sc_dataset, '-de_test=', sc_test, 80 | '-shuffle_replicates=', shuffle_replicates, 81 | '.rds'), 82 | sc_file = file.path(sc_dir, sc_filename), 83 | bulk_dir = file.path(base_dir, 'analysis', 'run_bulk_DE'), 84 | bulk_filename = paste0(bulk_dataset, '-de_test=', bulk_test, '.rds'), 85 | bulk_file = file.path(bulk_dir, bulk_filename), 86 | summary_dir = file.path(base_dir, 'analysis', 87 | 'expr_summary'), 88 | summary_file = file.path(summary_dir, 89 | paste0(sc_dataset, '.txt.gz'))) %>% 90 | # set up output filepath 91 | mutate(output_filename = paste0(bulk_dataset, 92 | '-sc_test=', sc_test, 93 | '-shuffle_replicates=', shuffle_replicates, 94 | '-bulk_test=', bulk_test, 95 | '.rds'), 96 | output_file = file.path(output_dir, output_filename), 97 | exists = file.exists(output_file), 98 | idx = row_number()) %>% 99 | # drop files that exist 100 | filter(!exists) %>% 101 | # keep only parameters and I/O 102 | dplyr::select(bulk_dataset, sc_test, bulk_test, 103 | sc_file, bulk_file, summary_file, output_file) 104 | } 105 | 106 | # write the grid that still needs to be run 107 | write.table(grid0, "sh/analysis/extract_FPs/grids/extract_FPs.txt", 108 | quote = F, row.names = F, sep = "\t") 109 | 110 | # finally, run the job on whatever system we're on 111 | script = "~/git/DE-analysis/sh/analysis/extract_FPs/extract_FPs.sh" 112 | submit_job(grid0, script, args$allocation, system) 113 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/summarise-extract-FPs.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis/extract_FPs") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # extract FPs/FNs separately 15 | FPs = map(dats, 'FPs') 16 | FNs = map(dats, 'FNs') 17 | 18 | # combine into a single object 19 | FP = FPs %>% 20 | map(~ map_dfc(., as.character)) %>% 21 | bind_rows(.id = 'filename') %>% 22 | type_convert() %>% 23 | # extract missing info from filename 24 | separate(filename, into = c('dataset', 'sc_test', 'shuffle_replicate', 25 | 'bulk_test'), sep = '-') %>% 26 | mutate_at(vars(sc_test, shuffle_replicate, bulk_test), function(x) 27 | gsub(".*=|.rds", "", x)) 28 | FN = FNs %>% 29 | map(~ map_dfc(., as.character)) %>% 30 | bind_rows(.id = 'filename') %>% 31 | type_convert() %>% 32 | # extract missing info from filename 33 | separate(filename, into = c('dataset', 'sc_test', 'shuffle_replicate', 34 | 'bulk_test'), sep = '-') %>% 35 | mutate_at(vars(sc_test, shuffle_replicate, bulk_test), function(x) 36 | gsub(".*=|.rds", "", x)) 37 | 38 | # create output 39 | dat = list(FPs = FP, FNs = FN) 40 | 41 | # save results 42 | output_file = "data/analysis/extract_FPs/extract_FPs.rds" 43 | output_dir = dirname(output_file) 44 | if (!dir.exists(output_dir)) 45 | dir.create(output_dir, recursive = T) 46 | saveRDS(dat, output_file) 47 | 
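# (Aside on the filename convention parsed above: parameters are packed into
# filenames as "key=value" tokens joined by '-', with ',' and '?' inside
# test names -- e.g. "pseudobulk_edgeR,test?QLF" -- presumably so that values
# never contain '-' or '='; separate(..., sep = '-') followed by
# gsub(".*=|.rds", "", x) then recovers the bare values. A hypothetical
# example filename:
#   "Hagai2018_mouse-sc_test=wilcox-shuffle_replicates=NO-bulk_test=bulk_edgeR,test?QLF.rds"
# parses to dataset = "Hagai2018_mouse", sc_test = "wilcox",
# shuffle_replicate = "NO", bulk_test = "bulk_edgeR,test?QLF".)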
-------------------------------------------------------------------------------- /R/analysis/mean_variance/analyze-mean-delta-variance-all-datasets.R: -------------------------------------------------------------------------------- 1 | # Analyze the relationships between mean expression, expression variance, and 2 | # delta-variance in all 46 datasets. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(tidyverse) 6 | library(magrittr) 7 | library(broom) 8 | library(ppcor) 9 | 10 | # read expr_summary data 11 | dat = readRDS("data/analysis/expr_summary/expr_summary.rds") 12 | 13 | # for each dataset, calculate correlations between: 14 | ## mean and delta-variance 15 | cors1 = dat %>% 16 | # first, correlate within cell types 17 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 18 | drop_na(mean, delta) %>% 19 | group_by(dataset, comparison, cell_type) %>% 20 | do(tidy(cor.test(.$mean, .$delta, method = 'p', use = 'p'))) %>% 21 | ungroup() %>% 22 | filter(is.finite(estimate)) %>% 23 | # next, average over cell types 24 | group_by(dataset, comparison) %>% 25 | summarise(mean_cor = mean(estimate, na.rm = TRUE)) %>% 26 | ungroup() 27 | 28 | ## variance and delta-variance 29 | cors2 = dat %>% 30 | # first, correlate within cell types 31 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 32 | drop_na(pseudobulk_variance, delta) %>% 33 | group_by(dataset, comparison, cell_type) %>% 34 | do(tidy(cor.test(.$pseudobulk_variance, .$delta, method = 'p', use = 'p'))) %>% 35 | ungroup() %>% 36 | filter(is.finite(estimate)) %>% 37 | # next, average over cell types 38 | group_by(dataset, comparison) %>% 39 | summarise(mean_cor = mean(estimate, na.rm = TRUE)) %>% 40 | ungroup() 41 | 42 | # now do partial correlations between: 43 | ## delta-variance and variance, controlling for mean 44 | pcors1 = dat %>% 45 | # first, correlate within cell types 46 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 47 | drop_na(pseudobulk_variance, mean, delta) %>% 48 | group_by(dataset, comparison, cell_type) %>% 49 | mutate(partial_cor = pcor.test(pseudobulk_variance, delta, mean)$estimate) %>% 50 | ungroup() %>% 51 | filter(is.finite(partial_cor)) %>% 52 | # next, average over cell types 53 | group_by(dataset, comparison) %>% 54 | summarise(mean_cor = mean(partial_cor, na.rm = TRUE)) %>% 55 | ungroup() 56 | 57 | ## delta-variance and mean, controlling for variance 58 | pcors2 = dat %>% 59 | # first, correlate within cell types 60 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 61 | drop_na(pseudobulk_variance, mean, delta) %>% 62 | group_by(dataset, comparison, cell_type) %>% 63 | mutate(partial_cor = pcor.test(mean, delta, pseudobulk_variance)$estimate) %>% 64 | ungroup() %>% 65 | filter(is.finite(partial_cor)) %>% 66 | # next, average over cell types 67 | group_by(dataset, comparison) %>% 68 | summarise(mean_cor = mean(partial_cor, na.rm = TRUE)) %>% 69 | ungroup() 70 | 71 | # save all four correlations 72 | cors = bind_rows(mutate(cors1, xval = 'mean vs. delta-variance'), 73 | mutate(cors2, xval = 'variance vs. delta-variance'), 74 | mutate(pcors1, xval = 'variance vs. delta-variance (partial)'), 75 | mutate(pcors2, xval = 'mean vs. delta-variance (partial)')) %>% 76 | mutate(xval = fct_relevel(xval, 77 | 'variance vs. delta-variance (partial)', 78 | 'mean vs. delta-variance (partial)', 79 | 'variance vs. delta-variance', 80 | 'mean vs. 
delta-variance')) 81 | saveRDS(cors, "data/analysis/mean_variance/correlations.rds") 82 | -------------------------------------------------------------------------------- /R/analysis/run_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | parser$add_argument('--de_test', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(peakRAM) 20 | library(future) 21 | source("R/functions/get_comparisons.R") 22 | source("R/functions/run_DE.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | dataset = args$input_file %>% 28 | basename() %>% 29 | gsub("\\.rds$", "", .) 30 | output_filename = paste0(dataset, 31 | "-de_test=", args$de_test, 32 | "-shuffle_replicates=", args$shuffle_replicates, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | expr = GetAssayData(sc, slot = 'counts') 39 | meta = sc@meta.data 40 | 41 | # get all combinations of conditions 42 | results = list() 43 | comparisons = get_comparisons(dataset, expr, meta) 44 | for (comparison_idx in seq_along(comparisons)) { 45 | comparison = comparisons[[comparison_idx]] 46 | comparison_name = names(comparisons)[comparison_idx] 47 | if (is.null(comparison_name)) 48 | comparison_name = 1 49 | 50 | message("[", comparison_idx, "/", length(comparisons), "] ", 51 | "analyzing comparison ", comparison_name, " ...") 52 | message("##############################") 53 | 54 | # get subset expression and metadata 55 | expr0 = comparison$expr 56 | meta0 = comparison$meta 57 | 58 | # check for replicate shuffling 59 | if (args$shuffle_replicates == "YES") { 60 | meta0 %<>% 61 | group_by(cell_type, label) %>% 62 | mutate(replicate = sample(replicate)) 63 | } 64 | 65 | # fix rownames 66 | meta0 %<>% set_rownames(colnames(expr0)) 67 | 68 | # reconstruct the Seurat object 69 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 70 | meta.data = meta0) 71 | 72 | # run DE analysis 73 | DE = run_DE(sc0, de_test = args$de_test) 74 | 75 | # append to list 76 | results[[comparison_name]] = DE 77 | } 78 | 79 | # stop if empty 80 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 81 | stop("couldn't get any results") 82 | 83 | # save results 84 | saveRDS(results, output_file) 85 | -------------------------------------------------------------------------------- /R/analysis/run_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 
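# (Aside on the shuffle_replicates option used below: when set to "YES",
# inner-run-DE.R permutes replicate labels within each cell_type/label
# group before testing, i.e.
#   meta0 %>% group_by(cell_type, label) %>%
#     mutate(replicate = sample(replicate))
# which destroys the replicate structure while leaving condition labels
# intact -- apparently a negative control for replicate-level effects.)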
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-DE.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 20 | 21 | # establish grid of analyses 22 | opts = list( 23 | de_test = c( 24 | ## single-cell methods, implemented in Seurat 25 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 26 | ## pseudobulk methods 27 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 28 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 29 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 30 | ## mixed model, implemented in Seurat 31 | "mixed_lm", 32 | ## pseudobulk methods run without aggregation 33 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 34 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 35 | "pseudobulk_limma,mode?voom,replicate?cells", 36 | "pseudobulk_limma,mode?trend,replicate?cells", 37 | "pseudobulk_edgeR,test?QLF,replicate?cells", 38 | "pseudobulk_edgeR,test?LRT,replicate?cells", 39 | ), 40 | shuffle_replicates = c("NO", "YES") 41 | ) 42 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 43 | 44 | # rep analysis grid over input files 45 | grid %<>% 46 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 47 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 48 | left_join(inputs, by = 'input_file') %>% 49 | # reorder columns 50 | dplyr::select(input_file, everything()) 51 | 52 | # write the raw array 53 | grid_file = "sh/analysis/run_DE/grids/run_DE.raw.txt" 54 | grid_dir = dirname(grid_file) 55 | if (!dir.exists(grid_dir)) 56 | dir.create(grid_dir, recursive = T) 57 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 58 | 59 | # define output directory where results are stored 60 | output_dir = file.path(base_dir, "analysis/run_DE") 61 | 62 | # check which parameters are already complete 63 | overwrite = F 64 | grid0 = grid 65 | if (!overwrite) { 66 | grid0 = grid %>% 67 | mutate(output_filename = paste0(basename(input_file) %>% 68 | gsub("\\.rds$", "", .), 69 | '-de_test=', de_test, 70 | '-shuffle_replicates=', shuffle_replicates, 71 | '.rds'), 72 | output_file = file.path(output_dir, output_filename), 73 | exists = file.exists(output_file)) %>% 74 | filter(!exists) %>% 75 | dplyr::select(-output_file, -output_filename, -exists) 76 | } 77 | 78 | # subset grid, if needed 79 | if (nrow(grid0) >= 10000) { 80 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 81 | } 82 | 83 | # write the grid that still needs to be run 84 | write.table(grid0, "sh/analysis/run_DE/grids/run_DE.txt", 85 | quote = F, row.names = F, sep = "\t") 86 | 87 | # finally, run the job on whatever system we're on 88 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 89 | script = file.path(sh_dir, "run_DE.sh") 90 | submit_job(grid0, script, args$allocation, system) 91 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/inner-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | # Calculate the concordance GSEA results from matching single-cell and 
bulk DE. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-GSEA-concordance.R') 8 | parser$add_argument('--label', type = 'character', required = TRUE) 9 | parser$add_argument('--input_sc', type = 'character', required = TRUE) 10 | parser$add_argument('--input_bulk', type = 'character', required = TRUE) 11 | parser$add_argument('--output_file', type = 'character', required = TRUE) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(AUC) 18 | source("R/functions/calculate_overlap.R") 19 | source("R/analysis/bulk_concordance/write_grid.R") 20 | 21 | # set up output filepath 22 | output_dir = dirname(args$output_file) 23 | if (!dir.exists(output_dir)) 24 | dir.create(output_dir, recursive = T) 25 | 26 | # load in files 27 | sc = readRDS(args$input_sc) 28 | bulk = readRDS(args$input_bulk) 29 | 30 | # iterate through single-cell comparisons 31 | res = data.frame() 32 | label = args$label 33 | for (sc_comparison in unique(sc$comparison)) { 34 | sc_sub = filter(sc, comparison == sc_comparison) 35 | 36 | # iterate through cell types in the single-cell data 37 | cell_types = unique(sc_sub$cell_type) 38 | for (cell_type in cell_types) { 39 | message(".. analyzing cell type ", cell_type, " in comparison ", 40 | sc_comparison, "...") 41 | input1 = filter(sc_sub, cell_type == !!cell_type) 42 | 43 | # now, get the matching bulk data 44 | if (grepl("Hagai2018", label)) { 45 | bulk_comparison = toupper(sc_comparison) 46 | input2 = filter(bulk, comparison == bulk_comparison) 47 | } else if (label == "CanoGamez2020") { 48 | bulk_comparison = paste0('Resting|', sc_comparison, '|', cell_type, '|5d') 49 | input2 = filter(bulk, comparison == bulk_comparison) 50 | } else if (grepl("Reyfman2020|Angelidis2019", label)) { 51 | bulk_comparison = 1 52 | input2 = bulk 53 | } else { 54 | stop("not sure what to do with label: ", label) 55 | } 56 | 57 | # fix columns 58 | input1 %<>% dplyr::rename(p_val = pval, p_val_adj = padj, gene = pathway, 59 | test_statistic = nMoreExtreme) %>% 60 | mutate(avg_logFC = 1) ## need to set the sign 61 | input2 %<>% dplyr::rename(p_val = pval, p_val_adj = padj, gene = pathway, 62 | test_statistic = nMoreExtreme) %>% 63 | mutate(avg_logFC = 1) 64 | 65 | # run the GSEA results through our generic concordance function 66 | concordance = template %>% 67 | mutate(value = pmap_dbl(., function(...) { 68 | template = tibble(...) 69 | print(template) 70 | value = calculate_overlap( 71 | bulk_de = input2, 72 | sc_de = input1, 73 | method = template$method, 74 | k = template$k, 75 | cor_method = template$cor_method 76 | ) 77 | return(value) 78 | })) %>% 79 | # flag comparisons and cell type 80 | mutate(sc_comparison = sc_comparison, 81 | bulk_comparison = bulk_comparison, 82 | cell_type = cell_type) 83 | 84 | # append to results 85 | res %<>% rbind(concordance) 86 | } 87 | } 88 | 89 | # save results 90 | saveRDS(res, args$output_file) 91 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/inner-run-GSEA.R: -------------------------------------------------------------------------------- 1 | # Run gene set enrichment analysis (GSEA) on a set of DE results. 
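## A minimal sketch (not part of the pipeline; all names below are toy data)
## of the fgsea call used in this script: genes are ranked by the absolute
## value of the DE test statistic and scored against gene sets by permutation.
if (FALSE) {
  library(fgsea)
  set.seed(1)
  toy_ranks = setNames(sort(abs(rnorm(100)), decreasing = TRUE),
                       paste0("gene", seq_len(100)))
  toy_sets = list(pathway_A = paste0("gene", 1:25),
                  pathway_B = paste0("gene", 40:80))
  fgsea(pathways = toy_sets, stats = toy_ranks, nperm = 1000,
        minSize = 5, maxSize = 50)
}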
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = FALSE) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-GSEA.R') 8 | parser$add_argument('--input_file', type = 'character', required = TRUE) 9 | parser$add_argument('--output_file', type = 'character', required = TRUE) 10 | parser$add_argument('--n_permutations', type = 'integer', default = 1e6) 11 | parser$add_argument('--min_size', type = 'integer', default = 10) 12 | parser$add_argument('--max_size', type = 'integer', default = 1000) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(fgsea) 19 | library(flavin) 20 | 21 | # create output directory, if it does not exist 22 | output_dir = dirname(args$output_file) 23 | if (!dir.exists(output_dir)) 24 | dir.create(output_dir, recursive = T) 25 | 26 | # read input file 27 | input = readRDS(args$input_file) 28 | 29 | # read GO 30 | species = ifelse(grepl("Angelidis|_mouse|_rat", args$input_file), 31 | 'mouse', 'human') 32 | goa_file = paste0("data/GO/", 33 | fct_recode(species, 'mgi' = 'mouse', 'goa_human' = 'human'), 34 | ".gaf.gz") 35 | goa = read_gaf(goa_file) 36 | ann = as_annotation_list(goa, 'DB_Object_Symbol', 'GO_ID') 37 | 38 | # create results container 39 | res = data.frame() 40 | 41 | # iterate through comparisons 42 | for (comparison_idx in seq_along(input)) { 43 | comparison = input[[comparison_idx]] 44 | comparison_name = names(input)[comparison_idx] 45 | if (is.null(comparison_name)) 46 | comparison_name = 1 47 | 48 | if ("cell_type" %in% colnames(comparison)) { 49 | # iterate through cell types 50 | cell_types = unique(comparison$cell_type) 51 | ## keep only a subset of cell types to improve runtime 52 | keep = c("Naive", 53 | "Memory", 54 | "bone marrow derived mononuclear phagocytes", 55 | "Alveolar_macrophage", 56 | "Type_2_pneumocytes", 57 | "AT2", 58 | "Alveolar macrophages") 59 | cell_types %<>% intersect(keep) 60 | for (cell_type in cell_types) { 61 | message(".. analyzing cell type ", cell_type, " in comparison ", 62 | comparison_name, "...") 63 | DE = filter(comparison, cell_type == !!cell_type) 64 | 65 | # fix column names 66 | colnames(DE) %<>% 67 | fct_recode('p_val' = 'p.value', ## DESeq2 68 | 'p_val' = 'pvalue', ## DESeq2 69 | 'p_val' = 'p.value', ## t/wilcox 70 | 'p_val' = 'P.Value', ## limma 71 | 'p_val' = 'PValue' , ## edgeR 72 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 73 | 'p_val_adj' = 'adj.P.Val', ## limma 74 | 'p_val_adj' = 'FDR', ## edgeER 75 | 'avg_logFC' = 'log2FoldChange', ## DESEeq2 76 | 'avg_logFC' = 'logFC', ## limma/edgeR 77 | 'test_statistic' = 'stat', ## DESeq2 78 | 'test_statistic' = 'F', ## edgeR 79 | 'test_statistic' = 't', ## limma 80 | 'test_statistic' = 'LR', ## edgeR LRT 81 | 'test_statistic' = 'statistic' ## t 82 | ) %>% 83 | as.character() 84 | 85 | # extract ranks 86 | ranks = DE %>% 87 | drop_na(test_statistic) %$% 88 | setNames(abs(test_statistic), gene) %>% 89 | sort(decreasing = TRUE) 90 | ## replace infinite values 91 | ranks[is.infinite(ranks)] = max(ranks[!is.infinite(ranks)]) 92 | 93 | # run GSEA 94 | gsea = fgsea(pathways = ann, 95 | stats = ranks, 96 | nproc = 1, 97 | nperm = args$n_permutations, 98 | minSize = args$min_size, 99 | maxSize = args$max_size) %>% 100 | dplyr::select(-leadingEdge) %>% 101 | # flag cell type and comparison 102 | mutate(cell_type = cell_type, 103 | comparison = comparison_name) 104 | 105 | # append to results 106 | res %<>% bind_rows(gsea) 107 | } 108 | } else { 109 | message(".. 
analyzing comparison ", comparison_name, "...") 110 | DE = comparison 111 | 112 | # fix column names 113 | colnames(DE) %<>% 114 | fct_recode('p_val' = 'p.value', ## DESeq2 115 | 'p_val' = 'pvalue', ## DESeq2 116 | 'p_val' = 'p.value', ## t/wilcox 117 | 'p_val' = 'P.Value', ## limma 118 | 'p_val' = 'PValue' , ## edgeR 119 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 120 | 'p_val_adj' = 'adj.P.Val', ## limma 121 | 'p_val_adj' = 'FDR', ## edgeR 122 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 123 | 'avg_logFC' = 'logFC', ## limma/edgeR 124 | 'test_statistic' = 'stat', ## DESeq2 125 | 'test_statistic' = 'F', ## edgeR 126 | 'test_statistic' = 't', ## limma 127 | 'test_statistic' = 'LR', ## edgeR LRT 128 | 'test_statistic' = 'statistic' ## t 129 | ) %>% 130 | as.character() 131 | 132 | # extract ranks 133 | ranks = DE %>% 134 | drop_na(test_statistic) %$% 135 | setNames(abs(test_statistic), gene) %>% 136 | sort(decreasing = TRUE) 137 | ranks[is.infinite(ranks)] = max(ranks[!is.infinite(ranks)]) ## replace infinite values, as in the cell type branch above 138 | # run GSEA 139 | gsea = fgsea(pathways = ann, 140 | stats = ranks, 141 | nproc = 1, 142 | nperm = args$n_permutations, 143 | minSize = args$min_size, 144 | maxSize = args$max_size) %>% 145 | dplyr::select(-leadingEdge) %>% 146 | # flag cell type and comparison 147 | mutate(comparison = comparison_name) 148 | 149 | # append to results 150 | res %<>% bind_rows(gsea) 151 | } 152 | } 153 | 154 | # stop if empty 155 | if (nrow(res) == 0) 156 | stop("couldn't get any results") 157 | 158 | # save results 159 | saveRDS(res, args$output_file) 160 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/outer-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | # Calculate the concordance of GSEA results from matching single-cell and bulk DE. 
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = FALSE) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-GSEA-concordance.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # set up grid 18 | opts = list( 19 | sc_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 20 | 'Angelidis2019', 21 | 'CanoGamez2020', 22 | 'Reyfman2020'), 23 | bulk_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 24 | 'Angelidis2019_facsepi', 25 | 'Angelidis2019_facsmac', 26 | 'CanoGamez2020', 27 | 'Reyfman2020_alvmac', 28 | 'Reyfman2020_AT2'), 29 | sc_test = c("wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 30 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 31 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 32 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 33 | "mixed_lm"), 34 | bulk_test = c("bulk_DESeq2,test?LRT", 35 | "bulk_DESeq2,test?Wald", 36 | "bulk_limma,mode?voom", 37 | "bulk_limma,mode?trend", 38 | "bulk_edgeR,test?LRT", 39 | "bulk_edgeR,test?QLF") 40 | ) 41 | grid = do.call(tidyr::crossing, opts) %>% 42 | # matching datasets 43 | extract(map2_lgl(.$sc_dataset, .$bulk_dataset, ~ grepl(.x, .y)), ) 44 | 45 | # write the raw array 46 | grid_file = "sh/analysis/run_GSEA/grids/GSEA_concordance.raw.txt" 47 | grid_dir = dirname(grid_file) 48 | if (!dir.exists(grid_dir)) 49 | dir.create(grid_dir, recursive = T) 50 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 51 | 52 | # define output directory where results are stored 53 | output_dir = file.path(base_dir, "analysis", "run_GSEA", "concordance") 54 | 55 | # now, check for which parameters are already complete 56 | overwrite = F 57 | grid0 = grid 58 | if (overwrite == F) { 59 | grid0 = grid %>% 60 | # for Reyfman2020, recode bulk test 61 | mutate(bulk_test = ifelse(grepl("Reyfman2020_", bulk_dataset), 62 | 'bulk_DESeq2', bulk_test)) %>% 63 | distinct() %>% 64 | # set up single-cell DE, bulk DE, and expr summary filepaths 65 | mutate(sc_dir = file.path(base_dir, 'analysis', 'run_GSEA', 'single_cell'), 66 | sc_filename = paste0(sc_dataset, '-de_test=', sc_test, '.rds'), 67 | sc_file = file.path(sc_dir, sc_filename), 68 | bulk_dir = file.path(base_dir, 'analysis', 'run_GSEA', 'bulk'), 69 | bulk_filename = paste0(bulk_dataset, '-de_test=', bulk_test, '.rds'), 70 | bulk_file = file.path(bulk_dir, bulk_filename)) %>% 71 | # set up output filepath 72 | mutate(output_filename = paste0(bulk_dataset, 73 | '-sc_test=', sc_test, 74 | '-bulk_test=', bulk_test, 75 | '.rds'), 76 | output_file = file.path(output_dir, output_filename), 77 | exists = file.exists(output_file), 78 | idx = row_number()) %>% 79 | # drop files that exist 80 | filter(!exists) %>% 81 | # keep only parameters and I/O 82 | dplyr::select(bulk_dataset, sc_file, bulk_file, output_file) 83 | } 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/run_GSEA/grids/GSEA_concordance.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | script = "~/git/DE-analysis/sh/analysis/run_GSEA/GSEA_concordance.sh" 91 | submit_job(grid0, script, args$allocation, system) 92 | 
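## Worked example of the dataset-matching filter above (toy call, not run):
## a grid row is kept only when the single-cell dataset name is a substring
## of the bulk dataset name.
if (FALSE) {
  purrr::map2_lgl(c("Angelidis2019", "CanoGamez2020"),
                  c("Angelidis2019_facsepi", "Reyfman2020_AT2"),
                  ~ grepl(.x, .y))
  ## returns TRUE FALSE: the Angelidis2019 pair is kept, while the mismatched
  ## CanoGamez2020/Reyfman2020_AT2 pair is dropped
}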
-------------------------------------------------------------------------------- /R/analysis/run_GSEA/outer-run-GSEA.R: -------------------------------------------------------------------------------- 1 | # Run gene set enrichment analysis (GSEA) on single-cell DE results. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-GSEA.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # manually set up the input single-cell datasets 17 | sc_datasets = c(paste0('Hagai2018_', c('rat', 'rabbit', 'mouse', 'pig')), 18 | 'CanoGamez2020', 19 | 'Angelidis2019', 20 | 'Reyfman2020') 21 | 22 | # establish analysis grid 23 | opts = list( 24 | dataset = sc_datasets, 25 | de_test = c( 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 28 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 29 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 30 | "mixed_lm") 31 | ) 32 | sc_grid = do.call(tidyr::crossing, opts) %>% 33 | mutate(input_dir = file.path(base_dir, "analysis", "run_DE"), 34 | output_dir = file.path(base_dir, "analysis", "run_GSEA", 35 | "single_cell")) 36 | 37 | # now, do the same for bulk grid 38 | bulk_datasets = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 39 | 'Angelidis2019_facsepi', 40 | 'Angelidis2019_facsmac', 41 | 'CanoGamez2020', 42 | 'Reyfman2020_alvmac', 43 | 'Reyfman2020_AT2') 44 | opts = list( 45 | dataset = bulk_datasets, 46 | de_test = c("bulk_DESeq2,test?LRT", 47 | "bulk_DESeq2,test?Wald", 48 | "bulk_limma,mode?voom", 49 | "bulk_limma,mode?trend", 50 | "bulk_edgeR,test?LRT", 51 | "bulk_edgeR,test?QLF") 52 | ) 53 | bulk_grid = do.call(tidyr::crossing, opts) %>% 54 | mutate(input_dir = file.path(base_dir, "analysis", "run_bulk_DE"), 55 | output_dir = file.path(base_dir, "analysis", "run_GSEA", "bulk")) 56 | 57 | # combine grids 58 | grid = bind_rows(sc_grid, bulk_grid) 59 | 60 | # write the raw array 61 | grid_file = "sh/analysis/run_GSEA/grids/run_GSEA.raw.txt" 62 | grid_dir = dirname(grid_file) 63 | if (!dir.exists(grid_dir)) 64 | dir.create(grid_dir, recursive = T) 65 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 66 | 67 | # check which parameters are already complete 68 | overwrite = F 69 | grid0 = grid 70 | if (!overwrite) { 71 | grid0 = grid %>% 72 | # for Reyfman2020, recode bulk test 73 | mutate(de_test = ifelse(grepl("Reyfman2020_", dataset), 74 | 'bulk_DESeq2', de_test)) %>% 75 | distinct() %>% 76 | mutate(input_file = file.path(input_dir, paste0(dataset, 77 | '-de_test=', de_test, 78 | '.rds')), 79 | output_file = file.path(output_dir, paste0(dataset, 80 | '-de_test=', de_test, 81 | '.rds')), 82 | exists = file.exists(output_file)) %>% 83 | filter(!exists) %>% 84 | dplyr::select(dataset, de_test, input_file, output_file) 85 | } 86 | 87 | # subset grid, if needed 88 | if (nrow(grid0) >= 10000) { 89 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 90 | } 91 | 92 | # write the grid that still needs to be run 93 | write.table(grid0, "sh/analysis/run_GSEA/grids/run_GSEA.txt", 94 | quote = F, row.names = F, sep = "\t") 95 | 96 | # finally, run the job on whatever system we're on 97 | sh_dir = 
"~/git/DE-analysis/sh/analysis/run_GSEA" 98 | script = file.path(sh_dir, "run_GSEA.sh") 99 | submit_job(grid0, script, args$allocation, system) 100 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/summarise-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis", "run_GSEA", "concordance") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # combine into a single file 15 | dat = dats %>% 16 | map(~ map_dfc(., as.character)) %>% 17 | bind_rows(.id = 'comparison') %>% 18 | type_convert() %>% 19 | separate(comparison, into = c('bulk_dataset', 'sc_test', 'bulk_test'), 20 | sep = '-') %>% 21 | mutate_at(vars(sc_test, bulk_test), function(x) gsub(".*=|.rds", "", x)) 22 | 23 | # save results 24 | output_file = "data/analysis/run_GSEA/GSEA_concordance.rds" 25 | output_dir = dirname(output_file) 26 | if (!dir.exists(output_dir)) 27 | dir.create(output_dir, recursive = T) 28 | saveRDS(dat, output_file) 29 | -------------------------------------------------------------------------------- /R/analysis/run_bulk_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run bulk DE analyses on all cell types in a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | args = parser$parse_args() 12 | print(args) 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | library(Seurat) 17 | library(Matrix) 18 | source("R/functions/get_bulk_comparisons.R") 19 | source("R/functions/run_DE.R") 20 | 21 | # set up output filepath 22 | if (!dir.exists(args$output_dir)) 23 | dir.create(args$output_dir, recursive = T) 24 | dataset = args$input_file %>% 25 | basename() %>% 26 | gsub("\\.rds$", "", .) 27 | dataset_label = args$input_file %>% 28 | basename() %>% 29 | gsub("_.*|.rds", "", .) 
30 | output_filename = paste0(dataset, "-de_test=", args$de_test, ".rds") 31 | output_file = file.path(args$output_dir, output_filename) 32 | 33 | # read input file and extract matrix/metadata 34 | sc = readRDS(args$input_file) 35 | expr = sc$assay 36 | meta = sc$meta 37 | 38 | # get all combinations of conditions 39 | results = list() 40 | comparisons = get_bulk_comparisons(dataset_label, expr, meta) 41 | for (comparison_idx in seq_along(comparisons)) { 42 | comparison = comparisons[[comparison_idx]] 43 | comparison_name = names(comparisons)[comparison_idx] 44 | if (is.null(comparison_name)) 45 | comparison_name = 1 46 | 47 | message("[", comparison_idx, "/", length(comparisons), "] ", 48 | "analyzing comparison ", comparison_name, " ...") 49 | message("##############################") 50 | 51 | # get subset expression and metadata 52 | expr0 = comparison$expr 53 | meta0 = comparison$meta %>% 54 | set_rownames(colnames(expr0)) 55 | 56 | # run DE analysis 57 | if (grepl("proteomics|microarray", args$input_file)) { 58 | DE = bulk_DE(expr0, targets = meta0, de_test = args$de_test, used_voom = F) 59 | } else { 60 | DE = bulk_DE(expr0, targets = meta0, de_test = args$de_test) 61 | } 62 | # append to list 63 | results[[comparison_name]] = DE 64 | } 65 | 66 | # stop if empty 67 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 68 | stop("couldn't get any results") 69 | 70 | # save results 71 | saveRDS(results, output_file) 72 | -------------------------------------------------------------------------------- /R/analysis/run_bulk_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run bulk DE analysis on a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-DE.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_files = file.path(base_dir, paste0(bulk_datasets, '.rds')) 19 | inputs = data.frame(input_file = input_files) %>% 20 | mutate(type = dirname(bulk_datasets)) 21 | 22 | # define tests to use 23 | de_tests = c("bulk_DESeq2,test?LRT", 24 | "bulk_DESeq2,test?Wald", 25 | "bulk_limma,mode?voom", 26 | "bulk_limma,mode?trend", 27 | "bulk_edgeR,test?LRT", 28 | "bulk_edgeR,test?QLF") 29 | 30 | # rep analysis grid over input files 31 | grid = inputs %>% 32 | dplyr::slice(rep(1:n(), each = length(de_tests))) %>% 33 | mutate(de_test = rep(de_tests, nrow(inputs))) %>% 34 | # only do limma for proteomics and microarray 35 | filter(type != 'proteomics' | !grepl("DESeq2|edgeR", de_test)) 36 | 37 | # write the raw array 38 | grid_file = "sh/analysis/run_DE/grids/run_bulk_DE.raw.txt" 39 | grid_dir = dirname(grid_file) 40 | if (!dir.exists(grid_dir)) 41 | dir.create(grid_dir, recursive = T) 42 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 43 | 44 | # define output directory where results are stored 45 | output_dir = file.path(base_dir, "analysis/run_bulk_DE") 46 | 47 | # check which parameters are already complete 48 | overwrite = F 49 | grid0 = grid 50 | if (!overwrite) { 51 | grid0 = grid %>% 52 | mutate(output_filename = paste0(basename(input_file) %>% 53 | gsub("\\.rds$", "", .), 54 | '-de_test=', de_test, 55 | '.rds'), 56 | output_file = 
file.path(output_dir, output_filename), 57 | exists = file.exists(output_file)) %>% 58 | filter(!exists) %>% 59 | dplyr::select(-output_file, -output_filename, -exists) 60 | } 61 | 62 | # write the grid that still needs to be run 63 | write.table(grid0, "sh/analysis/run_DE/grids/run_bulk_DE.txt", 64 | quote = F, row.names = F, sep = "\t") 65 | 66 | # finally, run the job on whatever system we're on 67 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 68 | script = file.path(sh_dir, "run_bulk_DE.sh") 69 | submit_job(grid0, script, args$allocation, system) 70 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on the Hagai et al. dataset with 2 | # ERCC spike-ins. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-run-DE.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 11 | parser$add_argument('--output_dir', type = 'character', required = T) 12 | parser$add_argument('--de_test', type = 'character', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | library(peakRAM) 21 | library(future) 22 | source("R/functions/get_comparisons.R") 23 | source("R/functions/run_DE.R") 24 | 25 | # set up output filepath 26 | if (!dir.exists(args$output_dir)) 27 | dir.create(args$output_dir, recursive = T) 28 | dataset = args$input_file %>% 29 | basename() %>% 30 | gsub("\\.rds$", "", .) 
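## e.g. input_file "Hagai2018_plate.rds" gives dataset = "Hagai2018_plate";
## the ERCC spike-in transcripts (rownames prefixed "ERCC-") pass through the
## DE tests below and are extracted downstream by summarise-spike-ins.R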
31 | output_filename = paste0(dataset, 32 | "-de_test=", args$de_test, 33 | "-shuffle_replicates=", args$shuffle_replicates, 34 | ".rds") 35 | output_file = file.path(args$output_dir, output_filename) 36 | 37 | # read input file and extract matrix/metadata 38 | sc = readRDS(args$input_file) 39 | expr = GetAssayData(sc, slot = 'counts') 40 | meta = sc@meta.data 41 | 42 | # get all combinations of conditions 43 | results = list() 44 | comparisons = get_comparisons(dataset, expr, meta) 45 | for (comparison_idx in seq_along(comparisons)) { 46 | comparison = comparisons[[comparison_idx]] 47 | comparison_name = names(comparisons)[comparison_idx] 48 | if (is.null(comparison_name)) 49 | comparison_name = 1 50 | 51 | message("[", comparison_idx, "/", length(comparisons), "] ", 52 | "analyzing comparison ", comparison_name, " ...") 53 | message("##############################") 54 | 55 | # get subset expression and metadata 56 | expr0 = comparison$expr 57 | meta0 = comparison$meta 58 | 59 | # check for replicate shuffling 60 | if (args$shuffle_replicates == "YES") { 61 | meta0 %<>% 62 | group_by(cell_type, label) %>% 63 | mutate(replicate = sample(replicate)) 64 | } 65 | 66 | # fix rownames 67 | meta0 %<>% set_rownames(colnames(expr0)) 68 | 69 | # reconstruct the Seurat object 70 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 71 | meta.data = meta0) 72 | 73 | # run DE analysis 74 | DE = run_DE(sc0, de_test = args$de_test) 75 | 76 | # append to list 77 | results[[comparison_name]] = DE 78 | } 79 | 80 | # stop if empty 81 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 82 | stop("couldn't get any results") 83 | 84 | # save results 85 | saveRDS(results, output_file) 86 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on the Hagai et al. dataset with 2 | # ERCC spike-ins. 
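## Rationale sketch: ERCC spike-ins are added at nominally fixed
## concentrations, so spike-ins called DE between conditions are candidate
## false positives. The grid below mirrors outer-run-DE.R: 20 DE tests x 2
## shuffle settings = 40 jobs on the single plate-based input file.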
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-run-DE.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, "Hagai2018_plate.rds") 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | ## pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 31 | ## mixed model, implemented in Seurat 32 | "mixed_lm", 33 | ## pseudobulk methods run without aggregation 34 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 35 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 36 | "pseudobulk_limma,mode?voom,replicate?cells", 37 | "pseudobulk_limma,mode?trend,replicate?cells", 38 | "pseudobulk_edgeR,test?QLF,replicate?cells", 39 | "pseudobulk_edgeR,test?LRT,replicate?cells" 40 | ), 41 | shuffle_replicates = c("NO", "YES") 42 | ) 43 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 44 | 45 | # rep analysis grid over input files 46 | grid %<>% 47 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 48 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 49 | left_join(inputs, by = 'input_file') %>% 50 | # reorder columns 51 | dplyr::select(input_file, everything()) 52 | 53 | # write the raw array 54 | grid_file = "sh/analysis/run_spike_in_DE/grids/run_DE.raw.txt" 55 | grid_dir = dirname(grid_file) 56 | if (!dir.exists(grid_dir)) 57 | dir.create(grid_dir, recursive = T) 58 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 59 | 60 | # define output directory where results are stored 61 | output_dir = file.path(base_dir, "analysis/run_spike_in_DE") 62 | 63 | # check which parameters are already complete 64 | overwrite = F 65 | grid0 = grid 66 | if (!overwrite) { 67 | grid0 = grid %>% 68 | mutate(output_filename = paste0(basename(input_file) %>% 69 | gsub("\\.rds$", "", .), 70 | '-de_test=', de_test, 71 | '-shuffle_replicates=', shuffle_replicates, 72 | '.rds'), 73 | output_file = file.path(output_dir, output_filename), 74 | exists = file.exists(output_file)) %>% 75 | filter(!exists) %>% 76 | dplyr::select(-output_file, -output_filename, -exists) 77 | } 78 | 79 | # subset grid, if needed 80 | if (nrow(grid0) >= 10000) { 81 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 82 | } 83 | 84 | # write the grid that still needs to be run 85 | write.table(grid0, "sh/analysis/run_spike_in_DE/grids/run_DE.txt", 86 | quote = F, row.names = F, sep = "\t") 87 | 88 | # finally, run the job on whatever system we're on 89 | sh_dir = "~/git/DE-analysis/sh/analysis/run_spike_in_DE" 90 | script = file.path(sh_dir, "run_DE.sh") 91 | submit_job(grid0, script, args$allocation, system) 92 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/summarise-spike-ins.R: -------------------------------------------------------------------------------- 1 | 
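# Summarise the spike-in DE results: pull the ERCC transcripts (which should
# show no true differential expression between conditions) out of each run
# and attach gene-level expression summary statistics.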
setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input directory 8 | input_dir = file.path(base_dir, "analysis/run_spike_in_DE") 9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | ## function to fix column names 16 | clean_cols = function(df) { 17 | # fix column names 18 | colnames(df) %<>% 19 | fct_recode('p_val' = 'p.value', ## DESeq2 20 | 'p_val' = 'pvalue', ## DESeq2 21 | 'p_val' = 'p.value', ## t/wilcox 22 | 'p_val' = 'P.Value', ## limma 23 | 'p_val' = 'PValue' , ## edgeR 24 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 25 | 'p_val_adj' = 'adj.P.Val', ## limma 26 | 'p_val_adj' = 'FDR', ## edgeER 27 | 'avg_logFC' = 'log2FoldChange', ## DESEeq2 28 | 'avg_logFC' = 'logFC', ## limma/edgeR 29 | 'test_statistic' = 'stat', ## DESeq2 30 | 'test_statistic' = 'F', ## edgeR 31 | 'test_statistic' = 't', ## limma 32 | 'test_statistic' = 'LR', ## edgeR LRT 33 | 'test_statistic' = 'statistic' ## t 34 | ) %>% 35 | as.character() 36 | return(df) 37 | } 38 | 39 | # summarise, keeping only ERCCs 40 | sum = dats %>% 41 | map(extract2, 1) %>% 42 | map(clean_cols) %>% 43 | bind_rows(.id = 'dataset') %>% 44 | filter(grepl("^ERCC-", gene)) %>% 45 | dplyr::select(dataset, cell_type, gene, p_val, p_val_adj, test, 46 | test_statistic, avg_logFC) 47 | 48 | # combine this with gene level summary statistics 49 | expr_summary = read.csv(file.path(base_dir, "analysis/expr_summary", 50 | "Hagai2018_plate.txt.gz")) %>% 51 | filter(gene %in% sum$gene) 52 | sum %<>% left_join(expr_summary, by = 'gene') %>% 53 | dplyr::select(-dataset.y, -cell_type.x) %>% 54 | dplyr::rename(cell_type = cell_type.y) %>% 55 | separate(dataset.x, into = c('dataset', 'de_test', 'shuffle_replicates'), 56 | sep = '-') %>% 57 | mutate_at(vars(de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) 58 | 59 | # save results 60 | output_file = "data/analysis/run_spike_in_DE/spike_in_summary.rds" 61 | output_dir = dirname(output_file) 62 | if (!dir.exists(output_dir)) 63 | dir.create(output_dir, recursive = T) 64 | saveRDS(sum, output_file) 65 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner-expr-summary-simulations.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--output_dir', type = 'character', required = T) 9 | args = parser$parse_args() 10 | print(args) 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | library(Seurat) 15 | library(Matrix) 16 | source("R/functions/get_comparisons.R") 17 | 18 | # set up output filepath 19 | if (!dir.exists(args$output_dir)) 20 | dir.create(args$output_dir, recursive = T) 21 | output_filename = basename(args$input_file) 22 | output_file = file.path(args$output_dir, output_filename) 23 | 24 | # read input file and extract matrix/metadata 25 | sc = readRDS(args$input_file) 26 | expr = GetAssayData(sc, slot = 'counts') 27 | meta = sc@meta.data 28 | 29 | # calculate statistics 30 | genes = rownames(expr) 31 | means = Matrix::rowMeans(expr) 32 | sds = 
sparseMatrixStats::rowSds(expr) 33 | covs = sds / means 34 | pct_zeros = Matrix::rowSums(expr == 0) / ncol(expr) 35 | 36 | # calculate logFC as defined in Seurat 37 | logFC = tryCatch({ 38 | sc0 = CreateSeuratObject(expr, meta = meta) %>% 39 | NormalizeData() 40 | Idents(sc0) = sc0$label 41 | mat = GetAssayData(sc0, slot = 'data') 42 | levels = levels(meta$label) 43 | if (is.null(levels)) { 44 | levels = unique(meta$label) 45 | } 46 | cells1 = WhichCells(sc0, idents = levels[1]) 47 | cells2 = WhichCells(sc0, idents = levels[2]) 48 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 49 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 50 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 51 | }, error = function(e) { return(NA_real_) }) 52 | 53 | # calculate pseudobulk variance 54 | pseudobulk_variance = tryCatch({ 55 | meta2 = meta %>% 56 | mutate(label = as.character(label), 57 | replicate = as.character(replicate)) 58 | mm = model.matrix(~ 0 + replicate, data = meta2) 59 | mat_mm = expr %*% mm 60 | # drop empty columns 61 | keep_samples = colSums(mat_mm) > 0 62 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 63 | # normalize 64 | mat_mm %<>% edgeR::cpm() 65 | # grab the variance for each gene 66 | vars = sparseMatrixStats::rowSds(mat_mm) 67 | vars %<>% setNames(rownames(mat_mm)) 68 | vars 69 | }, error = function(e) { return(NA_real_) }) 70 | 71 | # calculate shuffled pseudobulk variance 72 | shuffled_variance = tryCatch({ 73 | meta2 = meta %>% 74 | mutate(label = as.character(label), 75 | replicate = as.character(replicate)) %>% 76 | group_by(cell_type, label) %>% 77 | mutate(replicate = sample(replicate)) 78 | mm = model.matrix(~ 0 + replicate, data = meta2) 79 | mat_mm = expr %*% mm 80 | # drop empty columns 81 | keep_samples = colSums(mat_mm) > 0 82 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 83 | # normalize 84 | mat_mm %<>% edgeR::cpm() 85 | # grab the variance for each gene 86 | vars = sparseMatrixStats::rowSds(mat_mm) 87 | vars %<>% setNames(rownames(mat_mm)) 88 | vars 89 | }, error = function(e) { return(NA_real_) }) 90 | 91 | # calculate the ratio of real to shuffled variance 92 | ratio = pseudobulk_variance / shuffled_variance 93 | 94 | # convert to data frame 95 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 96 | pct_zero = pct_zeros, logFC = logFC, 97 | pseudobulk_variance = pseudobulk_variance, 98 | shuffled_variance = shuffled_variance, 99 | pseudobulk_ratio = ratio) %>% 100 | # drop genes with zero expression 101 | filter(mean > 0) 102 | 103 | # write 104 | saveRDS(df, output_file) 105 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner-null-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 
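## The shuffle below permutes replicate labels within each condition as a
## negative control; a toy illustration (assumed columns, not pipeline data):
if (FALSE) {
  library(dplyr)
  toy = tibble(label = rep(c("stim", "unst"), each = 3),
               replicate = paste("Replicate", 1:6))
  set.seed(1)
  toy %>% group_by(label) %>% mutate(replicate = sample(replicate))
  ## condition labels and group sizes are unchanged; which cells carry which
  ## replicate label is randomized within each condition
}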
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | parser$add_argument('--de_test', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(peakRAM) 20 | library(future) 21 | source("R/functions/get_comparisons.R") 22 | source("R/functions/run_DE.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | dataset = args$input_file %>% 28 | basename() %>% 29 | gsub("\\.rds$", "", .) 30 | output_filename = paste0(dataset, 31 | "-de_test=", args$de_test, 32 | "-shuffle_replicates=", args$shuffle_replicates, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | 39 | # check for replicate shuffling 40 | if (args$shuffle_replicates == "YES") { 41 | sc@meta.data %<>% 42 | group_by(label) %>% 43 | mutate(replicate = sample(replicate)) %>% 44 | ungroup() %>% 45 | set_rownames(colnames(sc)) 46 | } 47 | 48 | # run DE analysis 49 | DE = run_DE(sc, de_test = args$de_test) 50 | 51 | # stop if empty 52 | if (nrow(DE) == 0) 53 | stop("couldn't get any results") 54 | 55 | # save results 56 | saveRDS(DE, output_file) 57 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner_write_simulation_objects_null.R: -------------------------------------------------------------------------------- 1 | # Generate the complete set of simulated scRNA-seq datasets for the experiment 2 | # of null differential expression 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner_write_simulation_objects_null.R') 9 | parser$add_argument('--n_cells', type = 'integer', required = T) 10 | parser$add_argument('--de_prob', type = 'double', required = T) 11 | parser$add_argument('--de_loc', type = 'double', required = T) 12 | parser$add_argument('--n_reps', type = 'integer', required = T) 13 | parser$add_argument('--sample_idx', type = 'integer', required = T) 14 | parser$add_argument('--output_dir', type = 'character', required = T) 15 | args = parser$parse_args() 16 | print(args) 17 | 18 | library(Seurat) 19 | library(splatterBatch) 20 | library(scater) 21 | library(tidyverse) 22 | library(magrittr) 23 | library(Matrix) 24 | 25 | source("R/functions/detect_system.R") 26 | 27 | # check the output directory 28 | if (!dir.exists(args$output_dir)) { 29 | dir.create(args$output_dir, recursive = T) 30 | } 31 | 32 | # define output file 33 | output_filename = paste0("GSE96583", 34 | "-n_cells=", args$n_cells, 35 | "-de_prob=", args$de_prob, 36 | "-de_loc=", args$de_loc, 37 | "-n_reps=", args$n_reps, 38 | "-sample_idx=", args$sample_idx, 39 | ".rds") 40 | output_file = file.path(args$output_dir, output_filename) 41 | 42 | # get parameters defined by Kang et al. 
IFN dataset 43 | params = readRDS(file.path(base_dir, "analysis/simulations/parameters", 44 | "parameters_GSE96583.rds")) 45 | 46 | # calculate group probabilities 47 | group_probs = 1 / args$n_reps 48 | # assign groups 49 | unst = sample(seq(args$n_reps), args$n_reps/2) 50 | 51 | # generate simulated cells 52 | sim = splatterBatch::splatSimulateGroups( 53 | params = params, 54 | seed = args$sample_idx, 55 | batchCells = args$n_cells, 56 | de.prob = args$de_prob, 57 | de.facLoc = args$de_loc, 58 | group.prob = rep(group_probs, args$n_reps), verbose = F 59 | ) %>% logNormCounts() %>% as.Seurat() 60 | 61 | # adjust metadata for default input to Seurat 62 | sim@meta.data %<>% 63 | dplyr::mutate(cell_type = paste0('cell_', 1)) %>% 64 | dplyr::rename(label = Group) %>% 65 | mutate(replicate = gsub("Group", "Replicate ", label)) %>% 66 | mutate(label = as.numeric(gsub("Group", "", label))) %>% 67 | mutate(label = ifelse(label %in% unst, 'unst', 'stim')) %>% 68 | set_rownames(colnames(GetAssayData(sim))) 69 | 70 | # save 71 | saveRDS(sim, output_file) 72 | -------------------------------------------------------------------------------- /R/analysis/simulations/outer-expr-summary-simulations.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary-simulations.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # set up grid 17 | # limit this experiment to n_reps=3 18 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "objects") 19 | grid = tidyr::crossing( 20 | n_cells = c(100, 200, 500, 1000, 2000), 21 | de_prob = 0.5, 22 | de_loc = seq(0, 1, 0.1), 23 | n_reps = c(3, 4, 5, 10, 20) * 2, 24 | sample_idx = seq_len(10) 25 | ) %>% 26 | # vary only one of n_cells/n_reps 27 | filter(n_cells == 500 | n_reps == 6) %>% 28 | mutate(input_filename = paste0("GSE96583", 29 | "-n_cells=", n_cells, 30 | "-de_prob=", de_prob, 31 | "-de_loc=", de_loc, 32 | "-n_reps=", n_reps, 33 | "-sample_idx=", sample_idx, 34 | '.rds'), 35 | input_file = file.path(input_dir, input_filename)) 36 | 37 | # define output directory where results are stored 38 | output_dir = file.path(base_dir, "analysis/simulations/null/expr_summary") 39 | 40 | # check which parameters are already complete 41 | overwrite = F 42 | grid0 = grid 43 | if (!overwrite) { 44 | grid0 = grid %>% 45 | mutate(output_filename = basename(input_file), 46 | output_file = file.path(output_dir, output_filename), 47 | exists = file.exists(output_file)) %>% 48 | filter(!exists) %>% 49 | dplyr::select(input_file) 50 | } 51 | 52 | # write the grid that still needs to be run 53 | grid_file = "sh/analysis/simulations/grids/expr_summary.txt" 54 | grid_dir = dirname(grid_file) 55 | if (!dir.exists(grid_dir)) 56 | dir.create(grid_dir, recursive = T) 57 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 58 | 59 | # finally, run the job on whatever system we're on 60 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 61 | script = file.path(sh_dir, "expr_summary_simulations.sh") 62 | submit_job(grid0, script, args$allocation, system) 63 | -------------------------------------------------------------------------------- 
/R/analysis/simulations/outer-null-run-DE.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-null-run-DE.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "objects") 18 | 19 | grid = tidyr::crossing( 20 | de_test = c( 21 | ## single-cell methods, implemented in Seurat 22 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 23 | # pseudobulk methods 24 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 25 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 26 | "pseudobulk_edgeR,test?QLF", 27 | "pseudobulk_edgeR,test?LRT", 28 | # mixed model, implemented in Seurat 29 | "mixed_lm" 30 | ), 31 | n_cells = c(100, 200, 500, 1000, 2000), 32 | de_prob = 0.5, 33 | de_loc = seq(0, 1, 0.1), 34 | n_reps = 2 * c(3, 4, 5, 10, 20), 35 | sample_idx = seq_len(10), 36 | shuffle_replicates = c("NO", "YES") 37 | ) %>% 38 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 39 | filter((n_cells == 500 & n_reps %in% c(2 * c(3, 4, 5, 10, 20))) | 40 | (n_reps == 6 & n_cells %in% c(100, 200, 500, 1000, 2000))) 41 | 42 | # write the raw array 43 | grid_file = "sh/analysis/simulations/grids/null_run_DE.raw.txt" 44 | grid_dir = dirname(grid_file) 45 | if (!dir.exists(grid_dir)) 46 | dir.create(grid_dir, recursive = T) 47 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 48 | 49 | # define output directory where results are stored 50 | output_dir = file.path(base_dir, "analysis/simulations/null/DE") 51 | 52 | # check which parameters are already complete 53 | overwrite = F 54 | grid0 = grid 55 | if (!overwrite) { 56 | grid0 = grid %>% 57 | mutate(output_filename = paste0("GSE96583", 58 | "-n_cells=", n_cells, 59 | "-de_prob=", de_prob, 60 | "-de_loc=", de_loc, 61 | "-n_reps=", n_reps, 62 | "-sample_idx=", sample_idx, 63 | '-de_test=', de_test, 64 | '-shuffle_replicates=', shuffle_replicates, 65 | '.rds'), 66 | input_filename = paste0("GSE96583", 67 | "-n_cells=", n_cells, 68 | "-de_prob=", de_prob, 69 | "-de_loc=", de_loc, 70 | "-n_reps=", n_reps, 71 | "-sample_idx=", sample_idx, 72 | '.rds'), 73 | output_file = file.path(output_dir, output_filename), 74 | input_file = file.path(input_dir, input_filename), 75 | exists = file.exists(output_file)) %>% 76 | filter(!exists) %>% 77 | dplyr::select(-output_file, -output_filename, -exists) %>% 78 | # clean up grid 79 | dplyr::select(input_file, de_test, shuffle_replicates) 80 | } 81 | 82 | # run 5000 at a time 83 | grid0 %<>% dplyr::slice(1:5000) 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/simulations/grids/null_run_DE.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 91 | script = file.path(sh_dir, "null_run_DE.sh") 92 | submit_job(grid0, script, args$allocation, system) 93 | -------------------------------------------------------------------------------- /R/analysis/simulations/outer_write_simulation_objects_null.R: 
-------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer_write_simulation_objects_null.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # define the output directory 17 | output_dir = file.path(base_dir, "analysis/simulations/null/objects") 18 | 19 | grid = tidyr::crossing( 20 | n_cells = c(100, 200, 500, 1000, 2000), 21 | de_prob = 0.5, 22 | de_loc = seq(0, 1, 0.1), 23 | n_reps = c(6, 8, 10, 20, 40), 24 | sample_idx = seq_len(10) 25 | ) 26 | 27 | # write the raw array 28 | grid_file = "sh/analysis/simulations/grids/write_null_objects.raw.txt" 29 | grid_dir = dirname(grid_file) 30 | if (!dir.exists(grid_dir)) 31 | dir.create(grid_dir, recursive = T) 32 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 33 | 34 | # define output directory where results are stored 35 | output_dir = file.path(base_dir, "analysis/simulations/null/objects") 36 | 37 | # check which parameters are already complete 38 | overwrite = F 39 | grid0 = grid 40 | if (!overwrite) { 41 | grid0 = grid %>% 42 | mutate(output_filename = paste0("GSE96583", 43 | "-n_cells=", n_cells, 44 | "-de_prob=", de_prob, 45 | "-de_loc=", de_loc, 46 | "-n_reps=", n_reps, 47 | "-sample_idx=", sample_idx, 48 | '.rds'), 49 | output_file = file.path(output_dir, output_filename), 50 | exists = file.exists(output_file)) %>% 51 | filter(!exists) %>% 52 | dplyr::select(-output_file, -output_filename, -exists) 53 | } 54 | 55 | # write the grid that still needs to be run 56 | write.table(grid0, "sh/analysis/simulations/grids/write_null_objects.txt", 57 | quote = F, row.names = F, sep = "\t") 58 | 59 | # finally, run the job on whatever system we're on 60 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 61 | script = file.path(sh_dir, "write_null_objects.sh") 62 | submit_job(grid0, script, args$allocation, system) 63 | -------------------------------------------------------------------------------- /R/analysis/simulations/summarise-null-DE-genes-per-bin.R: -------------------------------------------------------------------------------- 1 | # Tally the number of DE genes within bins of genes grouped by delta-variance. 
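## Equal-frequency binning sketch, as used below: genes are ordered by
## |delta-variance| and cut() on the rank fraction yields equal-sized bins.
if (FALSE) {
  set.seed(1)
  x = runif(100)
  bin = as.integer(cut(rank(x) / length(x), breaks = seq(0, 10) / 10))
  table(bin)  ## ten bins of ten genes each
}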
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(tidyverse) 5 | library(magrittr) 6 | source("R/functions/recode_colnames.R") 7 | args = list(); source("R/functions/detect_system.R") 8 | 9 | # set up grid 10 | # limit this experiment to n_reps=3, n_cells=500 11 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "DE") 12 | input_files = tidyr::crossing( 13 | de_test = c( 14 | ## single-cell methods, implemented in Seurat 15 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 16 | # pseudobulk methods 17 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 18 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 19 | "pseudobulk_edgeR,test?QLF", 20 | "pseudobulk_edgeR,test?LRT", 21 | # mixed model, implemented in Seurat 22 | "mixed_lm" 23 | ), 24 | n_cells = 500, 25 | de_prob = 0.5, 26 | de_loc = seq(0, 1, 0.1), 27 | n_reps = 6, 28 | sample_idx = seq_len(10), 29 | shuffle_replicates = c("NO", "YES") 30 | ) %>% 31 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 32 | mutate(input_filename = paste0("GSE96583", 33 | "-n_cells=", n_cells, 34 | "-de_prob=", de_prob, 35 | "-de_loc=", de_loc, 36 | "-n_reps=", n_reps, 37 | "-sample_idx=", sample_idx, 38 | '-de_test=', de_test, 39 | '-shuffle_replicates=', shuffle_replicates, 40 | '.rds'), 41 | input_file = file.path(input_dir, input_filename)) %>% 42 | pull(input_file) 43 | 44 | # also set up the expr_summary files 45 | summary_dir = file.path(base_dir, "analysis", "simulations", "null", 46 | "expr_summary") 47 | summary_files = tidyr::crossing( 48 | n_cells = 500, 49 | de_prob = 0.5, 50 | de_loc = seq(0, 1, 0.1), 51 | n_reps = 6, 52 | sample_idx = seq_len(10) 53 | ) %>% mutate(summary_filename = paste0("GSE96583", 54 | "-n_cells=", n_cells, 55 | "-de_prob=", de_prob, 56 | "-de_loc=", de_loc, 57 | "-n_reps=", n_reps, 58 | "-sample_idx=", sample_idx, 59 | '.rds'), 60 | summary_file = file.path(summary_dir, summary_filename)) %>% 61 | pull(summary_file) 62 | 63 | # read all data 64 | dats = map(input_files, ~ readRDS(.x) %>% 65 | # fix column names 66 | recode_colnames() %>% 67 | # fix p-values 68 | group_by(cell_type) %>% 69 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 70 | ungroup() 71 | ) %>% setNames(basename(input_files)) 72 | 73 | # read all summary files 74 | summary_dats = map(summary_files, readRDS) %>% 75 | setNames(basename(summary_files)) 76 | 77 | # combine all the data and join the two sources of data together 78 | DE = bind_rows(dats, .id = 'filename') %>% 79 | separate(filename, into = c('dataset', 'n_cells', 'de_prob', 'de_loc', 80 | 'n_reps', 'sample_idx', 'de_test', 81 | 'shuffle_replicates'), sep = '-') %>% 82 | mutate_all(~ gsub("^.*=|\\.rds$", "", .)) %>% 83 | type_convert() %>% 84 | # remove some columns 85 | dplyr::select(-test, -runtime, -mem_usage, -baseMean, -lfcSE, -logCPM, 86 | -AveExpr, -B, -used_voom) 87 | summary = bind_rows(summary_dats, .id = 'filename') %>% 88 | separate(filename, into = c('dataset', 'n_cells', 'de_prob', 'de_loc', 89 | 'n_reps', 'sample_idx'), sep = '-') %>% 90 | mutate_all(~ gsub("^.*=|\\.rds$", "", .)) %>% 91 | type_convert() 92 | dat = left_join(DE, summary, by = c('dataset', 'n_cells', 'de_prob', 'de_loc', 93 | 'n_reps', 'sample_idx', 'gene')) 94 | 95 | # save the complete dataset 96 | saveRDS(dat, file.path(base_dir, "analysis/simulations/null/expr_summary.rds")) 97 | 98 | # now, calculate number of DE genes per bin 99 | bins = 10 100 | dat0 = dat %>% 101 | mutate(delta_variance = 
shuffled_variance - pseudobulk_variance, 102 | abs_delta_variance = abs(delta_variance)) 103 | bin_results = dat0 %>% 104 | # bin genes by absolute delta-variance 105 | group_by(dataset, n_cells, de_prob, de_loc, n_reps, sample_idx, de_test, 106 | shuffle_replicates) %>% 107 | arrange(abs_delta_variance) %>% 108 | mutate(bin = cut(row_number() / n(), 109 | breaks = seq(0, bins) / bins), 110 | bin = as.integer(bin)) %>% 111 | ungroup() %>% 112 | # count DE genes in each bin 113 | group_by(dataset, n_cells, de_prob, de_loc, n_reps, sample_idx, de_test, 114 | shuffle_replicates, bin) %>% 115 | summarise(genes = sum(p_val_adj < 0.05, na.rm = T)) %>% 116 | ungroup() 117 | 118 | # save results 119 | output_file = "data/analysis/simulations/null-genes-per-bin.rds" 120 | output_dir = dirname(output_file) 121 | if (!dir.exists(output_dir)) 122 | dir.create(output_dir, recursive = T) 123 | saveRDS(bin_results, output_file) 124 | -------------------------------------------------------------------------------- /R/analysis/simulations/summarise-null-n-DE-genes.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | 6 | # set up input directory 7 | source("R/functions/detect_system.R") 8 | 9 | # first, summarise the effect of n_reps, at n_cells == 500 10 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "DE") 11 | input_files = tidyr::crossing( 12 | de_test = c( 13 | ## single-cell methods, implemented in Seurat 14 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 15 | # pseudobulk methods 16 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 17 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 18 | "pseudobulk_edgeR,test?QLF", 19 | "pseudobulk_edgeR,test?LRT", 20 | # mixed model, implemented in Seurat 21 | "mixed_lm" 22 | ), 23 | n_cells = c(100, 200, 500, 1000, 2000), 24 | de_prob = 0.5, 25 | de_loc = seq(0, 1, 0.1), 26 | n_reps = 2 * c(3, 4, 5, 10, 20), 27 | sample_idx = seq_len(10), 28 | shuffle_replicates = c("NO", "YES") 29 | ) %>% 30 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 31 | filter((n_cells == 500 & n_reps %in% c(2 * c(3, 4, 5, 10, 20))) | 32 | (n_reps == 6 & n_cells %in% c(100, 200, 500, 1000, 2000))) %>% 33 | mutate(input_filename = paste0("GSE96583", 34 | "-n_cells=", n_cells, 35 | "-de_prob=", de_prob, 36 | "-de_loc=", de_loc, 37 | "-n_reps=", n_reps, 38 | "-sample_idx=", sample_idx, 39 | '-de_test=', de_test, 40 | '-shuffle_replicates=', shuffle_replicates, 41 | '.rds'), 42 | input_file = file.path(input_dir, input_filename)) %>% 43 | pull(input_file) 44 | 45 | # read all input files 46 | dats = map(input_files, readRDS) %>% 47 | setNames(basename(input_files)) 48 | 49 | # calculate # of DE genes 50 | n_DE = dats %>% 51 | map(~ { 52 | DE = bind_rows(., .id = 'comparison') 53 | # fix column names 54 | colnames(DE) %<>% 55 | fct_recode('p_val' = 'p.value', ## DESeq2 56 | 'p_val' = 'pvalue', ## DESeq2 57 | 'p_val' = 'p.value', ## t/wilcox 58 | 'p_val' = 'P.Value', ## limma 59 | 'p_val' = 'PValue' , ## edgeR 60 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 61 | 'p_val_adj' = 'adj.P.Val', ## limma 62 | 'p_val_adj' = 'FDR', ## edgeR 63 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 64 | 'avg_logFC' = 'logFC', ## limma/edgeR 65 | 'test_statistic' = 'stat', ## DESeq2 66 | 'test_statistic' = 'F', ## edgeR 67 | 'test_statistic' = 't', ## limma 68 | 'test_statistic' = 'LR', ## edgeR LRT 69 | 
'test_statistic' = 'statistic' ## t 70 | ) %>% 71 | as.character() 72 | # re-calculate adjusted p-values using BH correction (Seurat does Bonferroni) 73 | DE %<>% 74 | group_by(comparison, cell_type) %>% 75 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 76 | ungroup() 77 | # combine results 78 | DE %>% 79 | group_by(comparison, test, cell_type) %>% 80 | summarise(n_1 = sum(p_val_adj < 0.01, na.rm = T), 81 | n_5 = sum(p_val_adj < 0.05, na.rm = T), 82 | n_10 = sum(p_val_adj < 0.1, na.rm = T)) %>% 83 | ungroup() %>% 84 | gather('fdr', 'n_genes', n_1:n_10) %>% 85 | mutate(fdr = paste0(gsub("^.*_", "", fdr), "%")) 86 | }) %>% 87 | bind_rows(.id = 'filename') %>% 88 | separate(filename, into = c( 89 | 'dataset', 90 | 'n_cells', 91 | 'de_prob', 92 | 'de_loc', 93 | 'n_reps', 94 | 'sample_idx', 95 | 'de_test', 96 | 'shuffle_replicates'), 97 | sep = '-') %>% 98 | mutate_at(vars(n_cells, de_prob, de_loc, n_reps, sample_idx, 99 | de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) 100 | 101 | # save results 102 | output_file = "data/analysis/simulations/null-n-DE-genes.rds" 103 | output_dir = dirname(output_file) 104 | if (!dir.exists(output_dir)) 105 | dir.create(output_dir, recursive = T) 106 | saveRDS(n_DE, output_file) 107 | -------------------------------------------------------------------------------- /R/analysis/time_RAM/summarise-time-RAM-downsample_cells.R: -------------------------------------------------------------------------------- 1 | # Summarize the time and RAM usage of the default DE analyses in datasets 2 | # downsampled to a fixed number of cells (used to test mixed models). 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(tidyverse) 6 | library(magrittr) 7 | args = list(); source('R/functions/detect_system.R') 8 | 9 | # set up input directory 10 | input_dir = file.path(base_dir, "analysis/downsample_cells/DE") 11 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') %>% 12 | extract(grepl("Angelidis|Hagai|CanoGamez|Reyfman", .)) 13 | 14 | # extract walltime and RAM from all input files 15 | dats = map(input_files, ~ { 16 | print(.) 17 | dat = readRDS(.) 
%>%
18 |     # remove empty data frames
19 |     extract(map_int(., nrow) > 0)
20 |   map(dat, ~ distinct(., runtime, mem_usage)) %>%
21 |     bind_rows(.id = 'comparison')
22 | }) %>%
23 |   setNames(basename(input_files))
24 | 
25 | # combine all results
26 | res = dats %>%
27 |   bind_rows(.id = 'filename') %>%
28 |   separate(filename, into = c('dataset', 'de_test', 'n_cells', 'sample_idx'),
29 |            sep = '-') %>%
30 |   mutate_at(vars(de_test, n_cells, sample_idx),
31 |             ~ gsub("^.*=|\\.rds$", "", .)) %>%
32 |   type_convert()
33 | 
34 | # print summary
35 | res %>%
36 |   group_by(de_test) %>%
37 |   summarise(mean_ram = mean(mem_usage / 1e3),
38 |             mean_time = mean(runtime / 60)) %>%
39 |   arrange(desc(mean_time))
40 | 
41 | # save results
42 | output_file = "data/analysis/downsample_cells/time_RAM.rds"
43 | output_dir = dirname(output_file)
44 | if (!dir.exists(output_dir))
45 |   dir.create(output_dir, recursive = T)
46 | saveRDS(res, output_file)
47 | 
--------------------------------------------------------------------------------
/R/analysis/time_RAM/summarise-time-RAM.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | options(stringsAsFactors = F)
3 | library(tidyverse)
4 | library(magrittr)
5 | args = list(); source("R/functions/detect_system.R")
6 | 
7 | # set up input
8 | input_dir = file.path(base_dir, "analysis/run_DE")
9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$')
10 | 
11 | # extract walltime and RAM from all input files
12 | dats = map(input_files, ~ {
13 |   dat = readRDS(.)
14 |   map(dat, ~ distinct(., runtime, mem_usage)) %>%
15 |     bind_rows(.id = 'comparison')
16 | }) %>%
17 |   setNames(basename(input_files))
18 | 
19 | # combine all results
20 | res = dats %>%
21 |   bind_rows(.id = 'filename') %>%
22 |   separate(filename, into = c('dataset', 'de_test', 'shuffle_replicates'),
23 |            sep = '-') %>%
24 |   mutate_at(vars(de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) %>%
25 |   type_convert()
26 | 
27 | # print summary
28 | res %>%
29 |   group_by(de_test) %>%
30 |   summarise(mean_ram = mean(mem_usage / 1e3),
31 |             mean_time = mean(runtime / 60)) %>%
32 |   arrange(desc(mean_time))
33 | 
34 | # save results
35 | output_file = "data/analysis/run_DE/time_RAM.rds"
36 | output_dir = dirname(output_file)
37 | if (!dir.exists(output_dir))
38 |   dir.create(output_dir, recursive = T)
39 | saveRDS(res, output_file)
40 | 
--------------------------------------------------------------------------------
/R/functions/calculate_overlap.R:
--------------------------------------------------------------------------------
1 | ## function to score the overlap between bulk and single-cell DE
2 | calculate_overlap = function(bulk_de, sc_de,
3 |                              method = c('fcc', 'aucc'),
4 |                              k = NULL,
5 |                              cor_method = c('pearson', 'spearman')) {
6 |   method = match.arg(method)
7 |   if (method == 'fcc' & is.null(cor_method)) {
8 |     stop("if using method='fcc', you must set cor_method (pearson/spearman)")
9 |   }
10 |   if (method == 'aucc' & is.null(k)) {
11 |     stop("If using method='aucc', you must set k")
12 |   }
13 |   cor_method = match.arg(cor_method)
14 | 
15 |   # double check column names
16 |   colnames(bulk_de) %<>%
17 |     fct_recode('p_val' = 'p.value', ## DESeq2
18 |                'p_val' = 'pvalue', ## DESeq2
19 |                'p_val' = 'p.value', ## t/wilcox
20 |                'p_val' = 'P.Value', ## limma
21 |                'p_val' = 'PValue', ## edgeR
22 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
23 |                'p_val_adj' = 'adj.P.Val', ## limma
24 |                'p_val_adj' = 'FDR', ## edgeR
25 |                'avg_logFC' = 'log2FoldChange', ## 
DESeq2
26 |                'avg_logFC' = 'logFC', ## limma/edgeR
27 |                'test_statistic' = 'stat', ## DESeq2
28 |                'test_statistic' = 'F', ## edgeR
29 |                'test_statistic' = 't', ## limma
30 |                'test_statistic' = 'LR', ## edgeR LRT
31 |                'test_statistic' = 'statistic' ## t
32 |     ) %>%
33 |     as.character()
34 |   colnames(sc_de) %<>%
35 |     fct_recode('p_val' = 'p.value', ## DESeq2
36 |                'p_val' = 'pvalue', ## DESeq2
37 |                'p_val' = 'p.value', ## t/wilcox
38 |                'p_val' = 'P.Value', ## limma
39 |                'p_val' = 'PValue', ## edgeR
40 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
41 |                'p_val_adj' = 'adj.P.Val', ## limma
42 |                'p_val_adj' = 'FDR', ## edgeR
43 |                'avg_logFC' = 'log2FoldChange', ## DESeq2
44 |                'avg_logFC' = 'logFC', ## limma/edgeR
45 |                'test_statistic' = 'stat', ## DESeq2
46 |                'test_statistic' = 'F', ## edgeR
47 |                'test_statistic' = 'LR', ## edgeR LRT
48 |                'test_statistic' = 't', ## limma
49 |                'test_statistic' = 'statistic' ## t
50 |     ) %>%
51 |     as.character()
52 | 
53 |   # remove NAs
54 |   sc_de %<>% filter(!is.na(p_val), !is.na(p_val_adj), !is.na(test_statistic))
55 |   bulk_de %<>% filter(!is.na(p_val), !is.na(p_val_adj), !is.na(test_statistic))
56 | 
57 |   # replace p=0 with minimum p-value
58 |   sc_de_min = min(sc_de$p_val_adj[sc_de$p_val_adj > 0])
59 |   bulk_de_min = min(bulk_de$p_val_adj[bulk_de$p_val_adj > 0])
60 |   sc_de %<>%
61 |     mutate(p_val_adj = ifelse(p_val_adj <= sc_de_min, sc_de_min, p_val_adj))
62 |   bulk_de %<>%
63 |     mutate(p_val_adj = ifelse(p_val_adj <= bulk_de_min, bulk_de_min, p_val_adj))
64 |   ## repeat for raw p-values
65 |   sc_de_min = min(sc_de$p_val[sc_de$p_val > 0])
66 |   bulk_de_min = min(bulk_de$p_val[bulk_de$p_val > 0])
67 |   sc_de %<>%
68 |     mutate(p_val = ifelse(p_val <= sc_de_min, sc_de_min, p_val))
69 |   bulk_de %<>%
70 |     mutate(p_val = ifelse(p_val <= bulk_de_min, bulk_de_min, p_val))
71 | 
72 |   # filter to genes detected in both single-cell and bulk data
73 |   genes = intersect(bulk_de$gene, sc_de$gene)
74 |   sc_de %<>% filter(gene %in% genes) %>% arrange(gene)
75 |   bulk_de %<>% filter(gene %in% genes) %>% arrange(gene)
76 | 
77 |   if (method == 'fcc') {
78 |     # fold-change correlation: correlate the signed test statistics
79 |     # (the sign of the log-fold change times the absolute value of the
80 |     # test statistic); the two tables were already restricted to their
81 |     # shared genes and sorted by gene above, so the rows are aligned
82 |     return(cor(
83 |       bulk_de %>%
84 |         mutate(stat = sign(avg_logFC) * abs(test_statistic)) %>%
85 |         arrange(gene) %>%
86 |         pull(stat),
87 |       sc_de %>%
88 |         mutate(stat = sign(avg_logFC) * abs(test_statistic)) %>%
89 |         arrange(gene) %>%
90 |         pull(stat),
91 |       method = cor_method
92 |     ))
93 |   } else if (method == 'aucc') {
94 |     # area under the concordance curve
95 |     k = as.integer(k)
96 |     ## rank in descending order first by p_val
97 |     ## break ties by the abs() of the test_statistic
98 |     vec1 = bulk_de %>%
99 |       arrange(p_val, desc(abs(test_statistic))) %>%
100 |       pull(gene) %>%
101 |       head(k)
102 |     vec2 = sc_de %>%
103 |       arrange(p_val, desc(abs(test_statistic))) %>%
104 |       pull(gene) %>%
105 |       head(k)
106 | 
107 |     concordance_curve = map_dbl(seq_len(k), ~ {
108 |       v1 = vec1[seq_len(.)]
109 |       v2 = vec2[seq_len(.)]
110 |       length(intersect(v1, v2))
111 |     })
112 |     denom = k * (k + 1) / 2
113 |     aucc = sum(concordance_curve) / denom
114 |     return(aucc)
115 |   }
116 | }
117 | 
--------------------------------------------------------------------------------
/R/functions/datasets.R:
--------------------------------------------------------------------------------
1 | datasets = c(
2 |   "Angelidis2019",
3 |   "Arneson2018",
4 |   "Avey2018",
5 |   "Aztekin2019",
6 |   "Bhattacherjee2019",
7 |   "Brenner2020",
8 | 
"CanoGamez2020", 9 | "Cheng2019", 10 | "Co2020", 11 | "Crowell2019", 12 | "Davie2018", 13 | "Denisenko2020", 14 | "Der2019_kidney", 15 | "Der2019_skin", 16 | "Goldfarbmuren2020", 17 | "Grubman2019", 18 | "Gunner2019", 19 | "Haber2017_droplet", 20 | "Hagai2018_mouse", 21 | "Hagai2018_rat", 22 | "Hagai2018_pig", 23 | "Hagai2018_rabbit", 24 | "Hashimoto2019", 25 | "Hrvatin2018", 26 | "Hu2017", 27 | "Huang2020", 28 | "Jaitin2018_HFD", 29 | "Jakel2019", 30 | "Kotliarov2020", 31 | "Kang2018", 32 | "Kim2019", 33 | "Madissoon2020", 34 | "Mathys2019", 35 | "Nagy2020", 36 | "OrdovasMontanes2018", 37 | "Rault2020", 38 | "Reyes2020", 39 | "Reyfman2020", 40 | "Rossi2019", 41 | "Sathyamurthy2018", 42 | "Schirmer2019", 43 | "Schafflick2020_CSF", 44 | "Schafflick2020_PBMCs", 45 | "Skinnider2020", 46 | "Tran2019", 47 | "Wagner2018", 48 | "Wang2020", 49 | "Wilk2020", 50 | "Wirka2019", 51 | "Wu2017", 52 | "Ximerakis2019" 53 | ) 54 | 55 | bulk_datasets = c( 56 | "bulk_rnaseq/Angelidis2019_facsepi", 57 | "bulk_rnaseq/Angelidis2019_facsmac", 58 | "bulk_rnaseq/CanoGamez2020", 59 | "bulk_rnaseq/Hagai2018_mouse", 60 | "bulk_rnaseq/Hagai2018_rat", 61 | "bulk_rnaseq/Hagai2018_pig", 62 | "bulk_rnaseq/Hagai2018_rabbit", 63 | "proteomics/CanoGamez2020:proteomics" 64 | ) 65 | -------------------------------------------------------------------------------- /R/functions/get_bulk_comparisons.R: -------------------------------------------------------------------------------- 1 | get_bulk_comparisons = function(dataset, expr, meta) { 2 | # set up container 3 | comparisons = list() 4 | # handle each dataset appropriately 5 | if (dataset == 'Angelidis2019') { 6 | meta %<>% mutate(label = factor(label, levels = c("3m", "24m"))) 7 | results[[1]] = list(expr = expr, meta = meta) 8 | } else if (dataset == 'Hagai2018') { 9 | ## Hagai2018: two different binary comparisons 10 | for (comparison in c('LPS4', 'PIC4')) { 11 | message(' processing comparison: ', comparison, ' ...') 12 | meta0 = meta %>% 13 | rownames_to_column(var = 'sample') %>% 14 | filter(label %in% c('UNST', comparison)) %>% 15 | mutate(label = factor(label, levels = c("UNST", comparison))) 16 | expr0 = expr[, meta0$sample] 17 | results[[comparison]] = list(expr = expr0, meta = meta0) 18 | } 19 | } else if (dataset == 'CanoGamez2020') { 20 | ## CanoGamez2020: Two different cell types, 7 different cytokine conditions 21 | cytokines = c("IFNB", "Th17", "Resting", "Th2", "Th0", "iTreg", "Th1") 22 | cell_types = c('Naive', 'Memory') 23 | stimulation_times = c("16h", "5d") 24 | grid = tidyr::crossing(cytokine1 = cytokines, 25 | cytokine2 = cytokines, 26 | cell_type = cell_types, 27 | stimulation_time = stimulation_times) %>% 28 | filter(cytokine1 != cytokine2) %>% 29 | filter(cytokine1 == 'Resting') 30 | for (grid_idx in seq_len(nrow(grid))) { 31 | cytokine1 = grid$cytokine1[grid_idx] 32 | cytokine2 = grid$cytokine2[grid_idx] 33 | cell_type = grid$cell_type[grid_idx] 34 | stimulation_time = grid$stimulation_time[grid_idx] 35 | key = paste0(cytokine1, '|', cytokine2, "|", cell_type, "|", 36 | stimulation_time) 37 | message(' processing comparison: ', key, ' ...') 38 | 39 | meta0 = meta %>% 40 | mutate(idx = row_number()) %>% 41 | filter(stimulation_time == !!stimulation_time) %>% 42 | filter(grepl(!!cell_type, cell_type)) %>% 43 | filter(cytokine_condition %in% c(cytokine1, cytokine2)) %>% 44 | mutate(cytokine_condition = factor(cytokine_condition, 45 | levels = c(cytokine1, cytokine2)), 46 | label = cytokine_condition) 47 | expr0 = expr %>% extract(, meta0$idx) 48 | 
results[[key]] = list(expr = expr0, meta = meta0) 49 | } 50 | } else if (dataset == 'CanoGamez2020:proteomics') { 51 | ## CanoGamez2020: Two different cell types, 7 different cytokine conditions 52 | cytokines = c("IFNB", "Th17", "Resting", "Th2", "Th0", "iTreg", "Th1") 53 | cell_types = c('Naive', 'Memory') 54 | grid = tidyr::crossing(cytokine1 = cytokines, 55 | cytokine2 = cytokines, 56 | cell_type = cell_types) %>% 57 | filter(cytokine1 != cytokine2) %>% 58 | filter(cytokine1 == 'Resting') 59 | for (grid_idx in seq_len(nrow(grid))) { 60 | cell_type = grid$cell_type[grid_idx] 61 | cytokine1 = grid$cytokine1[grid_idx] 62 | cytokine2 = grid$cytokine2[grid_idx] 63 | key = paste0(cytokine1, '|', cytokine2, "|", cell_type) 64 | message(' processing comparison: ', key, ' ...') 65 | 66 | meta0 = meta %>% 67 | mutate(idx = row_number()) %>% 68 | filter(cell_type == !!cell_type) %>% 69 | filter(cytokine_condition %in% c(cytokine1, cytokine2)) %>% 70 | mutate(cytokine_condition = factor(cytokine_condition, 71 | levels = c(cytokine1, cytokine2)), 72 | label = cytokine_condition) 73 | expr0 = expr %>% extract(, meta0$idx) 74 | results[[key]] = list(expr = expr0, meta = meta0) 75 | } 76 | } else { 77 | stop("invalid dataset: ", dataset, " ...") 78 | } 79 | 80 | # drop all unused factor levels 81 | for (comparison_idx in seq_along(results)) { 82 | results[[comparison_idx]]$meta %<>% droplevels() 83 | } 84 | 85 | return(results) 86 | } 87 | -------------------------------------------------------------------------------- /R/functions/get_comparisons.R: -------------------------------------------------------------------------------- 1 | ## Get subset expression matrices containing all comparisons for a given 2 | ## dataset. 3 | get_comparisons = function(dataset, expr, meta) { 4 | # set up container 5 | results = list() 6 | # handle each dataset appropriately 7 | if (dataset %in% c('Arneson2018', 8 | 'Avey2018', 9 | 'Brenner2020', 10 | 'Cheng2019', 11 | 'Co2020', 12 | 'Crowell2019', 13 | 'Der2019_kidney', 14 | 'Der2019_skin', 15 | 'Grubman2019', 16 | 'Hashimoto2019', 17 | 'Hu2017', 18 | 'Jakel2019', 19 | 'Mathys2019', 20 | 'Nagy2020', 21 | 'OrdovasMontanes2018', 22 | 'Rault2020', 23 | 'Rossi2019', 24 | 'Sathyamurthy2018', 25 | 'Schafflick2020_CSF', 26 | 'Schafflick2020_PBMCs', 27 | 'Skinnider2020', 28 | 'Wang2020' 29 | )) { 30 | results[[1]] = list(expr = expr, meta = meta) 31 | } else if (dataset %in% c('Goldfarbmuren2020', 32 | 'Schirmer2019', 33 | 'Ximerakis2019')) { 34 | ## two cell type levels 35 | results[['cell_type']] = list(expr = expr, meta = meta) 36 | meta0 = meta %>% 37 | dplyr::select(-cell_type) %>% 38 | dplyr::rename(cell_type = global_cell_type) 39 | results[['global_cell_type']] = list(expr = expr, meta = meta0) 40 | } else if (dataset == 'Bhattacherjee2019') { 41 | ## Bhattacherjee2019: two possible levels of cell types, and 42 | ## three different timepoints 43 | timepoints = c('Maintenance', '48h', '15d') 44 | cell_types = c('cell_type', 'global_cell_type') 45 | grid = tidyr::crossing(timepoint = timepoints, cell_type = cell_types) 46 | for (grid_idx in seq_len(nrow(grid))) { 47 | timepoint = grid$timepoint[grid_idx] 48 | cell_type = grid$cell_type[grid_idx] 49 | key = paste0(timepoint, '|', cell_type) 50 | message(' processing comparison: ', key, ' ...') 51 | 52 | meta0 = meta %>% 53 | mutate(idx = row_number()) %>% 54 | filter(grepl(timepoint, label)) 55 | expr0 = expr %>% extract(, meta0$idx) 56 | if (cell_type == "global_cell_type") { 57 | meta0 %<>% 58 | 
dplyr::select(-cell_type) %>% 59 | dplyr::rename(cell_type = global_cell_type) 60 | } 61 | 62 | results[[key]] = list(expr = expr0, meta = meta0) 63 | } 64 | } else if (dataset == 'Huang2020') { 65 | ## Huang2020: two possible levels of cell types, and 66 | ## three different comparisons 67 | conditions = c('CD', 'colitis', 'UC') 68 | cell_types = c('cell_type', 'global_cell_type') 69 | grid = tidyr::crossing(condition = conditions, cell_type = cell_types) 70 | for (grid_idx in seq_len(nrow(grid))) { 71 | condition = grid$condition[grid_idx] 72 | cell_type = grid$cell_type[grid_idx] 73 | key = paste0(condition, '|', cell_type) 74 | message(' processing comparison: ', key, ' ...') 75 | 76 | meta0 = meta %>% 77 | mutate(idx = row_number()) %>% 78 | filter(label %in% c(condition, 'control')) 79 | expr0 = expr %>% extract(, meta0$idx) 80 | if (cell_type == "global_cell_type") { 81 | meta0 %<>% 82 | dplyr::select(-cell_type) %>% 83 | dplyr::rename(cell_type = global_cell_type) 84 | } 85 | 86 | results[[key]] = list(expr = expr0, meta = meta0) 87 | } 88 | } else if (dataset == 'Reyes2020') { 89 | ## Reyes2020: two possible levels of cell types, and 90 | ## three different comparisons 91 | cohorts = c('ICU-SEP vs. ICU-NoSEP', 92 | 'Sepsis vs. control', 93 | 'Sepsis vs. Leuk-UTI') 94 | cell_types = c('cell_type', 'global_cell_type') 95 | grid = tidyr::crossing(cohort = cohorts, cell_type = cell_types) 96 | for (grid_idx in seq_len(nrow(grid))) { 97 | cohort = grid$cohort[grid_idx] 98 | cell_type = grid$cell_type[grid_idx] 99 | key = paste0(cohort, '|', cell_type) 100 | message(' processing comparison: ', key, ' ...') 101 | 102 | if (cohort == 'ICU-SEP vs. ICU-NoSEP') { 103 | meta0 = meta %>% 104 | mutate(idx = row_number()) %>% 105 | filter(label %in% c("ICU-SEP", "ICU-NoSEP")) 106 | } else if (cohort == 'Sepsis vs. control') { 107 | meta0 = meta %>% 108 | mutate(idx = row_number()) %>% 109 | filter(label %in% c("Int-URO", "URO", "Bac-SEP", "ICU-SEP", 110 | "Control")) %>% 111 | mutate(label = ifelse(label == 'Control', label, 'Sepsis')) 112 | } else if (cohort == 'Sepsis vs. Leuk-UTI') { 113 | meta0 = meta %>% 114 | mutate(idx = row_number()) %>% 115 | filter(label %in% c("Int-URO", "URO", "Bac-SEP", "ICU-SEP", 116 | "Leuk-UTI")) %>% 117 | mutate(label = ifelse(label == 'Leuk-UTI', label, 'Sepsis')) 118 | } 119 | expr0 = expr %>% extract(, meta0$idx) 120 | if (cell_type == "global_cell_type") { 121 | meta0 %<>% 122 | dplyr::select(-cell_type) %>% 123 | dplyr::rename(cell_type = global_cell_type) 124 | } 125 | results[[key]] = list(expr = expr0, meta = meta0) 126 | } 127 | } else if (dataset == 'Wu2017') { 128 | ## Wu2017: two different binary comparisons 129 | for (comparison in c('stress', 'seizure')) { 130 | message(' processing comparison: ', comparison, ' ...') 131 | meta0 = meta %>% 132 | mutate(idx = row_number()) %>% 133 | filter(label %in% c('control', comparison)) %>% 134 | droplevels() 135 | expr0 = expr[, meta0$idx] 136 | results[[comparison]] = list(expr = expr0, meta = meta0) 137 | } 138 | } else if (dataset == 'Wagner2018') { 139 | ## Wagner2018: tyrosinase vs. chordin 140 | meta0 = meta %>% 141 | mutate(idx = row_number()) %>% 142 | filter(label != 'WT') 143 | expr0 = expr[, meta0$idx] 144 | results[[1]] = list(expr = expr0, meta = meta0) 145 | } else if (dataset == 'Gunner2019') { 146 | ## Gunner2019: control vs. 
lesion, in entire dataset or by genotype 147 | comparisons = c('entire_dataset', 'homozygous', 'heterozygous') 148 | for (comparison in comparisons) { 149 | message(' processing comparison: ', comparison, ' ...') 150 | if (comparison == 'entire_dataset') { 151 | meta0 = meta 152 | expr0 = expr 153 | } else { 154 | meta0 = meta %>% 155 | mutate(idx = row_number()) %>% 156 | filter(genotype == Hmisc::capitalize(comparison)) 157 | expr0 = expr[, meta0$idx] 158 | } 159 | results[[comparison]] = list(expr = expr0, meta = meta0) 160 | } 161 | } else if (dataset == 'Haber2017_droplet') { 162 | ## Haber2017: three binary comparisons vs. control 163 | comparisons = c('Hpoly.Day3', 'Hpoly.Day10', 'Salmonella') 164 | for (comparison in comparisons) { 165 | message(' processing comparison: ', comparison, ' ...') 166 | meta0 = meta %>% 167 | mutate(idx = row_number()) %>% 168 | filter(label %in% c('Control', comparison)) 169 | expr0 = expr[, meta0$idx] 170 | results[[comparison]] = list(expr = expr0, meta = meta0) 171 | } 172 | } else if (dataset == 'Aztekin2019') { 173 | ## Aztekin2019: amputation response 174 | grps = c("ST40_1", "ST40_0") 175 | meta0 = meta %>% 176 | mutate(idx = row_number()) %>% 177 | filter(label %in% grps) 178 | expr0 = expr %>% 179 | extract(, meta0$idx) 180 | results[[1]] = list(expr = expr0, meta = meta0) 181 | } else if (dataset == 'Kim2019') { 182 | ## Kim2019: aggression 183 | meta0 = meta %>% 184 | mutate(idx = row_number()) %>% 185 | filter(label %in% c('Control', 'Aggression')) 186 | expr0 = expr[, meta0$idx] 187 | results[[1]] = list(expr = expr0, meta = meta0) 188 | } else if (dataset == 'Wirka2019') { 189 | ## Wirka2019: 8w/0w in WT 190 | # subset metadata 191 | meta0 = meta %>% 192 | mutate(idx = row_number()) 193 | ## filter by genotype 194 | meta0 %<>% filter(phenotype == 'wt') 195 | ## filter by timepoint 196 | meta0 %<>% filter(label != '16wk') 197 | # subset expression 198 | expr0 = expr[, meta0$idx] 199 | results[[1]] = list(expr = expr0, meta = meta0) 200 | } else if (dataset == 'Jaitin2018_HFD') { 201 | ## Jaitin2019 (dataset 1): HFD vs. 
NC, 6w 202 | meta0 = meta %>% 203 | mutate(idx = row_number()) %>% 204 | filter(timepoint == 6) 205 | expr0 = expr[, meta0$idx] 206 | results[[1]] = list(expr = expr0, meta = meta0) 207 | } else if (dataset == 'CanoGamez2020') { 208 | ## CanoGamez2020: compare all cytokines to unstimulated 209 | comparisons = unique(meta$label) %>% 210 | setdiff('UNS') 211 | for (comparison in comparisons) { 212 | message(' processing comparison: ', comparison, ' ...') 213 | meta0 = meta %>% 214 | mutate(idx = row_number()) %>% 215 | filter(label %in% c('UNS', comparison)) %>% 216 | mutate(label = factor(label, levels = c('UNS', comparison))) 217 | expr0 = expr[, meta0$idx] 218 | results[[comparison]] = list(expr = expr0, meta = meta0) 219 | } 220 | } else if (dataset == 'Davie2018') { 221 | ## Davie2018: all combinations of age/sex/genotype 222 | ages = unique(meta$label) 223 | 224 | ### age 225 | comparisons = tidyr::crossing(age1 = ages, age2 = ages) %>% 226 | filter(age1 < age2) 227 | for (grid_idx in seq_len(nrow(comparisons))) { 228 | age1 = comparisons$age1[grid_idx] 229 | age2 = comparisons$age2[grid_idx] 230 | comparison = paste0(age1, '|', age2) 231 | message(' processing comparison: ', comparison, ' ...') 232 | meta0 = meta %>% 233 | mutate(idx = row_number()) %>% 234 | filter(label %in% c(age1, age2)) 235 | expr0 = expr[, meta0$idx] 236 | results[[comparison]] = list(expr = expr0, meta = meta0) 237 | } 238 | 239 | ### sex 240 | message(' processing comparison: sex ...') 241 | meta0 = meta %>% 242 | dplyr::select(-label) %>% 243 | dplyr::rename(label = gender) 244 | results[['sex']] = list(expr = expr, meta = meta0) 245 | 246 | ### genotype 247 | message(' processing comparison: genotype ...') 248 | meta0 = meta %>% 249 | dplyr::select(-label) %>% 250 | dplyr::rename(label = genotype) 251 | results[['genotype']] = list(expr = expr, meta = meta0) 252 | } else if (dataset == 'Wagner2018') { 253 | ## Wagner2018: tyrosinase vs. chordin 254 | meta0 = meta %>% 255 | mutate(idx = row_number()) %>% 256 | filter(label != 'WT') 257 | expr0 = expr[, meta0$idx] 258 | results[[1]] = list(expr = expr0, meta = meta0) 259 | } else if (dataset == 'Hrvatin2018') { 260 | ## Hrvatin2018: focus on 0h vs. 
4h 261 | meta0 = meta %>% 262 | mutate(idx = row_number()) %>% 263 | filter(label != '1h') 264 | expr0 = expr[, meta0$idx] 265 | results[[1]] = list(expr = expr0, meta = meta0) 266 | } else if (dataset == 'Madissoon2020') { 267 | ## Madissoon2020: compare all timepoints to 0 h 268 | comparisons = unique(meta$label) %>% 269 | setdiff('0h') 270 | for (comparison in comparisons) { 271 | message(' processing comparison: ', comparison, ' ...') 272 | meta0 = meta %>% 273 | mutate(idx = row_number()) %>% 274 | filter(label %in% c('0h', comparison)) %>% 275 | mutate(label = factor(label, levels = c('0h', comparison))) 276 | expr0 = expr[, meta0$idx] 277 | results[[comparison]] = list(expr = expr0, meta = meta0) 278 | } 279 | } else if (dataset == 'Tran2019') { 280 | ## Tran2019: compare all timepoints to control 281 | comparisons = unique(meta$label) %>% setdiff('Ctrl') 282 | for (comparison in comparisons) { 283 | message(' processing comparison: ', comparison, ' ...') 284 | meta0 = meta %>% 285 | mutate(idx = row_number()) %>% 286 | filter(label %in% c('Ctrl', comparison)) 287 | expr0 = expr[, meta0$idx] 288 | results[[comparison]] = list(expr = expr0, meta = meta0) 289 | } 290 | } else if (dataset == 'Cuomo2020') { 291 | ## Cuomo2020: compare all timepoints to day 0 292 | comparisons = unique(meta$label) %>% setdiff('day0') 293 | for (comparison in comparisons) { 294 | message(' processing comparison: ', comparison, ' ...') 295 | meta0 = meta %>% 296 | mutate(idx = row_number()) %>% 297 | filter(label %in% c('day0', comparison)) %>% 298 | mutate(label = factor(label, levels = c('day0', comparison))) 299 | expr0 = expr[, meta0$idx] 300 | results[[comparison]] = list(expr = expr0, meta = meta0) 301 | } 302 | } else if (dataset %in% c( 303 | "Hagai2018_mouse", 304 | "Hagai2018_rat", 305 | "Hagai2018_pig", 306 | "Hagai2018_rabbit" 307 | )) { 308 | ## Hagai2018: compare all timepoints to unstimulated 309 | comparisons = unique(meta$label) %>% setdiff('unst') 310 | for (comparison in comparisons) { 311 | message(' processing comparison: ', comparison, ' ...') 312 | meta0 = meta %>% 313 | mutate(idx = row_number()) %>% 314 | filter(label %in% c('unst', comparison)) %>% 315 | mutate(label = factor(label, levels = c('unst', comparison))) 316 | expr0 = expr[, meta0$idx] 317 | results[[comparison]] = list(expr = expr0, meta = meta0) 318 | } 319 | } else if (dataset == 'Wilk2020') { 320 | ## Wilk2020: Compare each vent status to control, all COVID to control 321 | # and vent to no vent covid 322 | ## Comparison 1: all COVID to healthy control 323 | ## Comparison 2: ventilated COVID to healthy control 324 | ## Comparison 3: non-ventilated COVID to healthy control 325 | ## Comparison 4: non-ventilated COVID to ventilated covid 326 | comparisons = c("Healthy_COVID", "Healthy_vCOVID", "Healthy_nvCOVID", 327 | "nvCOVID_vCOVID") 328 | for (comparison in comparisons) { 329 | message(' processing comparison: ', comparison, ' ...') 330 | if (comparison == "Healthy_COVID") { 331 | meta0 = meta %>% 332 | mutate(idx = row_number()) %>% 333 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 334 | expr0 = expr[, meta0$idx] 335 | results[[comparison]] = list(expr = expr0, meta = meta0) 336 | } 337 | if (comparison == "Healthy_vCOVID") { 338 | meta0 = meta %>% 339 | mutate(idx = row_number()) %>% 340 | filter(Ventilated %in% c("Healthy", "Vent")) %>% 341 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 342 | expr0 = expr[, meta0$idx] 343 | results[[comparison]] = list(expr = expr0, meta = 
meta0) 344 | } 345 | if (comparison == "Healthy_nvCOVID") { 346 | meta0 = meta %>% 347 | mutate(idx = row_number()) %>% 348 | filter(Ventilated %in% c("Healthy", "NonVent")) %>% 349 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 350 | expr0 = expr[, meta0$idx] 351 | results[[comparison]] = list(expr = expr0, meta = meta0) 352 | } 353 | if (comparison == "nvCOVID_vCOVID") { 354 | meta0 = meta %>% 355 | mutate(idx = row_number()) %>% 356 | filter(label == 'COVID') %>% 357 | mutate(label = Ventilated) %>% 358 | mutate(label = factor(label, levels = c('NonVent', 'Vent'))) 359 | expr0 = expr[, meta0$idx] 360 | results[[comparison]] = list(expr = expr0, meta = meta0) 361 | } 362 | } 363 | } else if (dataset %in% c('Kotliarov2020')) { 364 | ## explicitly specify factor levels 365 | meta %<>% mutate(label = factor(label, levels = c('low', 'high'))) 366 | results[[1]] = list(expr = expr, meta = meta) 367 | } else if (dataset == 'Kang2018') { 368 | ## explicitly specify factor levels 369 | meta %<>% mutate(label = factor(label, levels = c('ctrl', 'stim'))) 370 | results[[1]] = list(expr = expr, meta = meta) 371 | } else if (dataset == 'Angelidis2019') { 372 | ## explicitly specify factor levels 373 | meta %<>% mutate(label = factor(label, levels = c('3m', '24m'))) 374 | results[[1]] = list(expr = expr, meta = meta) 375 | } else if (dataset == 'Reyfman2020') { 376 | ## explicitly specify factor levels 377 | meta %<>% mutate(label = factor(label, levels = c('Control', 378 | 'Pulmonary fibrosis'))) 379 | results[[1]] = list(expr = expr, meta = meta) 380 | } else if (dataset == 'Denisenko2020') { 381 | #' warm vs. cold (fresh): for comparison to bulk 382 | #' methanol vs. fresh, cryopreserved vs. fresh in warm/cold 383 | comparisons = c("warm_vs_cold", 384 | "methanol_warm", 385 | "methanol_cold", 386 | "cryopreserved_warm", 387 | "cryopreserved_cold") 388 | for (comparison in comparisons) { 389 | meta0 = meta %>% 390 | mutate(idx = row_number()) %>% 391 | # drop single-nucleus, v2/v3 comparison 392 | filter(!grepl("^SN|^SC", label)) 393 | if (grepl("methanol", comparison)) { 394 | temperature = gsub("^.*_", "", comparison) 395 | meta0 %<>% 396 | filter(is.na(label2) | label2 == 'MeOH') %>% 397 | replace_na(list(label2 = 'fresh')) %>% 398 | filter(label == temperature) %>% 399 | mutate(label = factor(label2, levels = c('fresh', 'MeOH'))) 400 | } else if (grepl("cryo", comparison)) { 401 | temperature = gsub("^.*_", "", comparison) 402 | meta0 %<>% 403 | filter(is.na(label2) | label2 == 'Cryo') %>% 404 | replace_na(list(label2 = 'fresh')) %>% 405 | filter(label == temperature) %>% 406 | mutate(label = factor(label2, levels = c('fresh', 'Cryo'))) 407 | } else { 408 | # warm v. 
cold
409 |         meta0 %<>%
410 |           filter(is.na(label2)) %>%
411 |           # set factor levels
412 |           mutate(label = factor(label, levels = c('cold', 'warm')))
413 |       }
414 |       expr0 = expr[, meta0$idx]
415 |       results[[comparison]] = list(expr = expr0, meta = meta0)
416 |     }
417 |     ##
418 |   } else if (dataset == "Hagai2018_plate") {
419 |     ## explicitly specify factor levels
420 |     meta %<>% mutate(label = factor(label, levels = c(2, 6)))
421 |     results[[1]] = list(expr = expr, meta = meta)
422 |   } else if (dataset == "Maniatis2019_mouse") {
423 |     meta0 = meta %>%
424 |       mutate(idx = row_number()) %>%
425 |       # rename replicate, label, and region for compatibility
426 |       dplyr::rename(replicate = isolate, label = breed, cell_type = region)
427 |     results[[1]] = list(expr = expr, meta = meta0)
428 |   } else {
429 |     stop("invalid dataset: ", dataset, " ...")
430 |   }
431 | 
432 |   # drop all unused factor levels
433 |   for (comparison_idx in seq_along(results)) {
434 |     results[[comparison_idx]]$meta %<>% droplevels()
435 |   }
436 | 
437 |   return(results)
438 | }
439 | 
--------------------------------------------------------------------------------
/R/functions/recode_colnames.R:
--------------------------------------------------------------------------------
1 | recode_colnames = function(DE) {
2 |   colnames(DE) %<>%
3 |     fct_recode('p_val' = 'p.value', ## DESeq2
4 |                'p_val' = 'pvalue', ## DESeq2
5 |                'p_val' = 'p.value', ## t/wilcox
6 |                'p_val' = 'P.Value', ## limma
7 |                'p_val' = 'PValue', ## edgeR
8 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
9 |                'p_val_adj' = 'adj.P.Val', ## limma
10 |                'p_val_adj' = 'FDR', ## edgeR
11 |                'avg_logFC' = 'log2FoldChange', ## DESeq2
12 |                'avg_logFC' = 'logFC', ## limma/edgeR
13 |                'test_statistic' = 'stat', ## DESeq2
14 |                'test_statistic' = 'F', ## edgeR
15 |                'test_statistic' = 't', ## limma
16 |                'test_statistic' = 'LR', ## edgeR LRT
17 |                'test_statistic' = 'statistic' ## t
18 |     ) %>%
19 |     as.character()
20 |   return(DE)
21 | }
22 | 
--------------------------------------------------------------------------------
/R/functions/spatial_datasets.R:
--------------------------------------------------------------------------------
1 | datasets = c(
2 |   "Maniatis2019_mouse"
3 | )
4 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DE-analysis
2 | 
3 | This repository contains R source code used to conduct the analysis in our manuscript, "Confronting false discoveries in single-cell differential expression."
4 | 
5 | A brief overview of the main computational analyses that were conducted, and the location of the corresponding source code, is given below.
6 | 
7 | - First, differentially expressed genes were identified in single-cell and matching bulk datasets, respectively, using code in the directories `R/analysis/run_DE` and `R/analysis/run_bulk_DE`. Code in `R/analysis/run_spike_in_DE` was used to analyze a lone single-cell dataset in which the ERCC mixture of synthetic mRNAs was spiked in alongside each individual cell.
8 | - A list of all single-cell and bulk datasets analyzed in this study is provided in `R/functions/datasets.R`. Datasets containing multiple comparisons of two experimental groups were split into each possible comparison using the functions in `R/functions/get_comparisons.R` and `R/functions/get_bulk_comparisons.R` (see the usage sketch at the end of this document).
9 | - Code used to run differential expression analyses is provided in `R/functions/run_DE.R`.
10 | - The concordance between the single-cell and bulk DE results was then quantified using code in the `R/analysis/bulk_concordance` directory.
11 | - Code used to calculate the area under the concordance curve (AUCC) and the fold-change correlation is provided in `R/functions/calculate_overlap.R` (a toy illustration of the AUCC calculation is given at the end of this document).
12 | - Gene set enrichment analysis of differential expression results was performed for both the single-cell and bulk datasets using code in the `R/analysis/run_GSEA` directory.
13 | - False-positive and false-negative DE calls in the single-cell data were identified, using the bulk data as a reference, with code in the `R/analysis/extract_FPs` directory.
14 | - A number of summary statistics were obtained for each dataset (e.g., number of replicates, number of cell types) or each gene within each dataset (e.g., mean expression, delta-variance), using code in the directories `R/analysis/confounds`, `R/analysis/delta_variance`, and `R/analysis/expr_summary`.
15 | - The relationships between mean expression, the variance of gene expression, and the delta-variance in 'pseudo-replicates' were interrogated using code in `R/analysis/mean_variance`.
16 | - The effect of between-replicate variance was interrogated with simulation studies using the code in `R/analysis/simulations`. The code in this directory was used to generate synthetic gene expression data, perform DE analysis, and analyze the properties of DE genes.
17 | - DE analysis was performed between random groups of control samples using code in the `R/analysis/control_only` directory. This code was also used to analyze a spatial transcriptomics dataset.
18 | - The performance of generalized linear mixed models was assessed in downsampled datasets using code in the `R/analysis/downsample_cells` directory.
19 | - Finally, the computational resources (wall time, peak RAM usage) used by each method were extracted using code in the `R/analysis/time_RAM` directory.
20 | 
--------------------------------------------------------------------------------
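## Illustrative usage sketches

A minimal sketch of how the comparison helpers might be invoked, assuming a genes-by-cells expression matrix and a per-cell metadata data frame. The `.rds` paths below are hypothetical, but the calling convention and the return value (a named list of `list(expr, meta)` pairs, one per two-group comparison) follow `R/functions/get_comparisons.R`:

```r
# illustrative sketch only; the input paths are hypothetical
setwd("~/git/DE-analysis")
library(tidyverse)
library(magrittr)
source("R/functions/get_comparisons.R")

# expr: a genes-by-cells expression matrix; meta: one row per cell,
# with (at least) label and cell_type columns
expr = readRDS("data/expression/Hagai2018_mouse.rds")  # hypothetical path
meta = readRDS("data/metadata/Hagai2018_mouse.rds")    # hypothetical path

# split the dataset into its constituent two-group comparisons
# (for Hagai2018_mouse, each stimulation timepoint vs. 'unst')
comparisons = get_comparisons("Hagai2018_mouse", expr, meta)
names(comparisons)

# each element carries the subset expression matrix and metadata
str(comparisons[[1]], max.level = 1)
```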
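And a toy illustration of the AUCC metric computed in `R/functions/calculate_overlap.R`. The gene rankings are invented, but the calculation mirrors the `aucc` branch of `calculate_overlap()`: take the top-k genes from each ranking, trace the cumulative size of their intersection at each depth (the concordance curve), and normalize by the maximum possible area, k * (k + 1) / 2:

```r
library(purrr)

# toy AUCC, mirroring the 'aucc' branch of calculate_overlap()
aucc_toy = function(ranked1, ranked2, k = 3) {
  # top-k genes from each ranking
  v1 = head(ranked1, k)
  v2 = head(ranked2, k)
  # concordance curve: size of the top-list intersection at each depth 1..k
  curve = map_dbl(seq_len(k), ~
                    length(intersect(v1[seq_len(.x)], v2[seq_len(.x)])))
  # normalize by the maximum possible area
  sum(curve) / (k * (k + 1) / 2)
}

bulk_rank = c("geneA", "geneB", "geneC", "geneD")  # invented rankings
sc_rank   = c("geneB", "geneA", "geneD", "geneC")
aucc_toy(bulk_rank, sc_rank, k = 3)  # (0 + 2 + 2) / 6, i.e. ~0.67
```

Identical rankings give an AUCC of 1, and disjoint top-k lists give 0.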