├── .gitignore
├── R
│   ├── analysis
│   │   ├── bulk_concordance
│   │   │   ├── inner-run-concordance-terciles.R
│   │   │   ├── inner-run-concordance.R
│   │   │   ├── outer-run-concordance-terciles.R
│   │   │   ├── outer-run-concordance.R
│   │   │   ├── summarise_concordance-terciles.R
│   │   │   ├── summarise_concordance.R
│   │   │   └── write_grid.R
│   │   ├── confounds
│   │   │   ├── inner-calculate-confounds.R
│   │   │   ├── outer-calculate-confounds.R
│   │   │   └── summarise-confounds.R
│   │   ├── control_only
│   │   │   ├── inner-control-only-spatial.R
│   │   │   ├── inner-control-only.R
│   │   │   ├── inner-expr-summary-control-only.R
│   │   │   ├── outer-control-only-spatial.R
│   │   │   ├── outer-control-only.R
│   │   │   ├── outer-expr-summary-control-only.R
│   │   │   ├── summarise-control-only-DE-vs-variance.R
│   │   │   ├── summarise-control-only-n-DE-genes.R
│   │   │   └── summarise-control-only-spatial-n-DE-genes.R
│   │   ├── delta_variance
│   │   │   ├── inner-write-delta-variance.R
│   │   │   ├── outer-write-delta-variance.R
│   │   │   └── summarise-delta-variance.R
│   │   ├── downsample_cells
│   │   │   ├── inner-downsample-cells-outcomes.R
│   │   │   ├── inner-downsample-cells.R
│   │   │   ├── outer-downsample-cells-outcomes.R
│   │   │   ├── outer-downsample-cells.R
│   │   │   └── summarise-downsample-cells-outcomes.R
│   │   ├── expr_summary
│   │   │   ├── inner-expr-summary.R
│   │   │   ├── outer-expr-summary.R
│   │   │   └── summarise-expr-summary.R
│   │   ├── extract_FPs
│   │   │   ├── inner-extract-FPs.R
│   │   │   ├── outer-extract-FPs.R
│   │   │   └── summarise-extract-FPs.R
│   │   ├── mean_variance
│   │   │   └── analyze-mean-delta-variance-all-datasets.R
│   │   ├── run_DE
│   │   │   ├── inner-run-DE.R
│   │   │   └── outer-run-DE.R
│   │   ├── run_GSEA
│   │   │   ├── inner-GSEA-concordance.R
│   │   │   ├── inner-run-GSEA.R
│   │   │   ├── outer-GSEA-concordance.R
│   │   │   ├── outer-run-GSEA.R
│   │   │   └── summarise-GSEA-concordance.R
│   │   ├── run_bulk_DE
│   │   │   ├── inner-run-DE.R
│   │   │   └── outer-run-DE.R
│   │   ├── run_spike_in_DE
│   │   │   ├── inner-run-DE.R
│   │   │   ├── outer-run-DE.R
│   │   │   └── summarise-spike-ins.R
│   │   ├── simulations
│   │   │   ├── inner-expr-summary-simulations.R
│   │   │   ├── inner-null-run-DE.R
│   │   │   ├── inner_write_simulation_objects_null.R
│   │   │   ├── outer-expr-summary-simulations.R
│   │   │   ├── outer-null-run-DE.R
│   │   │   ├── outer_write_simulation_objects_null.R
│   │   │   ├── summarise-null-DE-genes-per-bin.R
│   │   │   └── summarise-null-n-DE-genes.R
│   │   └── time_RAM
│   │       ├── summarise-time-RAM-downsample_cells.R
│   │       └── summarise-time-RAM.R
│   └── functions
│       ├── calculate_overlap.R
│       ├── datasets.R
│       ├── get_bulk_comparisons.R
│       ├── get_comparisons.R
│       ├── recode_colnames.R
│       ├── run_DE.R
│       └── spatial_datasets.R
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | # OS X
2 | .DS_Store
3 | 
4 | # Eclipse
5 | .settings
6 | .project
7 | .classpath
8 | /build/
9 | /bin/
10 | /target/
11 | 
12 | # RStudio
13 | .R*
14 | 
15 | # Python
16 | *.pyc
17 | .idea
18 | 
--------------------------------------------------------------------------------
/R/analysis/bulk_concordance/inner-run-concordance-terciles.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | options(stringsAsFactors = F)
3 | library(argparse)
4 | 
5 | # parse arguments
6 | parser = ArgumentParser(prog = 'inner-run-concordance-terciles.R')
7 | parser$add_argument('--input_sc', type = 'character', required = T)
8 | parser$add_argument('--input_bulk', type = 'character', required = T)
9 | parser$add_argument('--summary_file', type = 'character', required = T)
10 | parser$add_argument('--output_dir', type = 'character', required = T)
11 | parser$add_argument('--n_bins', type = 'integer', required = T)
12 | args = parser$parse_args()
13 | print(args)
14 | 
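# example invocation (all paths illustrative; --n_bins 3 matches the value
# set in outer-run-concordance-terciles.R):
#   Rscript R/analysis/bulk_concordance/inner-run-concordance-terciles.R \
#     --input_sc <run_DE output .rds> \
#     --input_bulk <run_bulk_DE output .rds> \
#     --summary_file <expr_summary .txt.gz> \
#     --output_dir <output directory> \
#     --n_bins 3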
15 | library(tidyverse)
16 | library(magrittr)
17 | library(Seurat)
18 | library(Matrix)
19 | library(RRHO)
20 | library(AUC)
21 | source("R/functions/calculate_overlap.R")
22 | source("R/analysis/bulk_concordance/write_grid.R")
23 | 
24 | # set up output filepath
25 | if (!dir.exists(args$output_dir))
26 |   dir.create(args$output_dir, recursive = T)
27 | 
28 | # load in files
29 | sc = readRDS(args$input_sc)
30 | bulk = readRDS(args$input_bulk)
31 | 
32 | # read expression summary
33 | expr_summary = read.csv(args$summary_file)
34 | 
35 | # define output file
36 | sc_name = gsub(".rds", "", basename(args$input_sc))
37 | bulk_name = gsub(".rds", "", basename(args$input_bulk))
38 | output_filename = paste0(sc_name, "|", bulk_name, '-n_bins=', args$n_bins,
39 |                          ".rds")
40 | output_file = file.path(args$output_dir, output_filename)
41 | 
42 | # get all combinations of single-cell/bulk
43 | sc_idxs = names(sc)
44 | bulk_idxs = names(bulk)
45 | if (is.null(sc_idxs)) {
46 |   sc_idxs = "1"
47 |   names(sc) = '1'
48 | }
49 | if (is.null(bulk_idxs)) {
50 |   bulk_idxs = "1"
51 |   names(bulk) = '1'
52 | }
53 | comparisons = expand.grid(sc_comparison = sc_idxs, bulk_comparison = bulk_idxs,
54 |                           stringsAsFactors = F)
55 | 
56 | # get rid of irrelevant comparisons from Cano-Gamez 2020
57 | if (grepl("CanoGamez2020", sc_name)) {
58 |   keep = map2_lgl(comparisons$sc_comparison,
59 |                   comparisons$bulk_comparison,
60 |                   ~ grepl(.x, .y))
61 |   comparisons %<>% extract(keep, )
62 | }
63 | 
64 | # analyze each comparison separately
65 | results = data.frame()
66 | for (comparison_idx in seq_len(nrow(comparisons))) {
67 |   message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons),
68 |           " ...")
69 | 
70 |   # prepare data
71 |   sc_comparison = comparisons$sc_comparison[comparison_idx]
72 |   bulk_comparison = comparisons$bulk_comparison[comparison_idx]
73 |   sc_sub = sc[[sc_comparison]]
74 |   bulk_sub = bulk[[bulk_comparison]]
75 |   comparison_label = paste0(sc_comparison, "|", bulk_comparison)
76 | 
77 |   # for Angelidis, filter to relevant cell types to prevent bugs
78 |   if (grepl("Angelidis", sc_name)) {
79 |     sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes",
80 |                                         "Alveolar_macrophage"))
81 |   }
82 |   # same for Reyfman
83 |   if (grepl("Reyfman", sc_name)) {
84 |     sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages"))
85 |   }
86 | 
87 |   # run concordance within each expression bin separately
88 |   out = data.frame()
89 |   cell_types = unique(sc_sub$cell_type)
90 |   for (cell_type in cell_types) {
91 |     # bin genes by expression level
92 |     tested_genes = sc_sub %>%
93 |       filter(cell_type == !!cell_type) %>%
94 |       filter(gene %in% bulk_sub$gene) %>%
95 |       pull(gene)
96 |     bins = expr_summary %>%
97 |       filter(gene %in% tested_genes) %>%
98 |       filter(comparison == sc_comparison, cell_type == !!cell_type) %>%
99 |       arrange(mean) %>%
100 |       mutate(bin = cut(row_number() / n(),
101 |                        breaks = seq(0, args$n_bins) / args$n_bins),
102 |              bin = as.integer(bin)) %>%
103 |       split(.$bin)
104 | 
105 |     # run over each bin
106 |     for (bin in seq_len(args$n_bins)) {
107 |       bin_genes = bins[[bin]]$gene
108 |       sc_tmp = sc_sub %>%
109 |         filter(cell_type == !!cell_type, gene %in% bin_genes)
110 | 
111 |       tmp = template %>%
112 |         mutate(value = seq(nrow(template)) %>%
113 |                  map( ~ {
114 |                    print(template[., ])
115 |                    method = template$method[.]
116 |                    k = template$k[.]
117 |                    cor_method = template$cor_method[.]
118 | value = calculate_overlap( 119 | bulk_de = bulk_sub, 120 | sc_de = sc_tmp, 121 | method = method, 122 | k = k, 123 | cor_method = cor_method 124 | ) 125 | }) %>% 126 | unlist() 127 | ) %>% 128 | mutate(cell_type = cell_type, 129 | bin = bin, 130 | sc_label = sc_comparison, 131 | bulk_label = bulk_comparison) 132 | out %<>% bind_rows(tmp) 133 | } 134 | } 135 | 136 | # append to the main results container 137 | results %<>% bind_rows(out) 138 | } 139 | 140 | # save results 141 | saveRDS(results, output_file) 142 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/inner-run-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-run-concordance.R') 7 | parser$add_argument('--input_sc', type = 'character', required = T) 8 | parser$add_argument('--input_bulk', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | args = parser$parse_args() 11 | print(args) 12 | 13 | library(tidyverse) 14 | library(magrittr) 15 | library(Seurat) 16 | library(Matrix) 17 | library(RRHO) 18 | library(AUC) 19 | source("R/functions/calculate_overlap.R") 20 | source("R/analysis/bulk_concordance/write_grid.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | 26 | # load in files 27 | sc = readRDS(args$input_sc) 28 | bulk = readRDS(args$input_bulk) 29 | 30 | # define output file 31 | sc_name = gsub(".rds", "", basename(args$input_sc)) 32 | bulk_name = gsub(".rds", "", basename(args$input_bulk)) 33 | output_filename = paste0(sc_name, "|", bulk_name, ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # get all combinations of single-cell/bulk 37 | sc_idxs = names(sc) 38 | bulk_idxs = names(bulk) 39 | if (is.null(sc_idxs)) { 40 | sc_idxs = "1" 41 | names(sc) = '1' 42 | } 43 | if (is.null(bulk_idxs)) { 44 | bulk_idxs = "1" 45 | names(bulk) = '1' 46 | } 47 | comparisons = expand.grid(sc_comparison = sc_idxs, bulk_comparison = bulk_idxs, 48 | stringsAsFactors = F) 49 | 50 | # get rid of irrelevant comparisons from Cano-Gamez 2020 51 | if (grepl("CanoGamez2020", sc_name)) { 52 | keep = map2_lgl(comparisons$sc_comparison, 53 | comparisons$bulk_comparison, 54 | ~ grepl(.x, .y)) 55 | comparisons %<>% extract(keep, ) 56 | } 57 | 58 | results = data.frame() 59 | for (comparison_idx in seq_len(nrow(comparisons))) { 60 | message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons), 61 | " ...") 62 | 63 | # prepare data 64 | sc_comparison = comparisons$sc_comparison[comparison_idx] 65 | bulk_comparison = comparisons$bulk_comparison[comparison_idx] 66 | sc_sub = sc[[sc_comparison]] 67 | bulk_sub = bulk[[bulk_comparison]] 68 | comparison_label = paste0(sc_comparison, "|", bulk_comparison) 69 | 70 | # for Angelidis, filter to relevant cell types to prevent bugs 71 | if (grepl("Angelidis", sc_name)) { 72 | sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes", 73 | "Alveolar_macrophage")) 74 | } 75 | # same for Reyfman 76 | if (grepl("Reyfman", sc_name)) { 77 | sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages")) 78 | } 79 | 80 | # calculate concordance metrics for this comparison 81 | out = sc_sub %>% 82 | split(.$cell_type) %>% 83 | map( ~ { 84 | print(.$cell_type[1]) 85 | sc_tmp = . 
86 | tmp = template %>% 87 | mutate(value = seq(nrow(template)) %>% 88 | map( ~ { 89 | print(template[., ]) 90 | method = template$method[.] 91 | k = template$k[.] 92 | cor_method = template$cor_method[.] 93 | value = calculate_overlap( 94 | bulk_de = bulk_sub, 95 | sc_de = sc_tmp, 96 | method = method, 97 | k = k, 98 | cor_method = cor_method 99 | ) 100 | }) %>% 101 | unlist() 102 | ) 103 | }) %>% 104 | bind_rows(.id = 'cell_type') %>% 105 | mutate( 106 | sc_label = sc_comparison, 107 | bulk_label = bulk_comparison 108 | ) 109 | # bind to main results container 110 | results %<>% bind_rows(out) 111 | } 112 | 113 | # save results 114 | saveRDS(results, output_file) 115 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/outer-run-concordance-terciles.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset, 2 | # within each tercile of gene expression. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-run-concordance-terciles.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list bulk input files 19 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 20 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 21 | mutate(label = gsub("_.*|-.*", "", bulk_file)) %>% 22 | # manual fix for the Hagai datasets 23 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 24 | label)) %>% 25 | # restore the entire filepath 26 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 27 | 28 | # get single-cell comparison files 29 | sc_files = list.files(file.path(base_dir, "analysis/run_DE")) 30 | sc_inputs = data.frame(sc_file = sc_files) %>% 31 | mutate(label = gsub("_.*|-.*", "", sc_file)) %>% 32 | # manual fix for the Hagai datasets 33 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", sc_file), 34 | label)) 35 | 36 | # rep analysis grid over input files 37 | grid = bulk_inputs %>% 38 | left_join(sc_inputs) %>% 39 | drop_na() 40 | 41 | # add bins 42 | grid = tidyr::crossing(grid, n_bins = 3) 43 | 44 | # add expr_summary file as a parameter 45 | grid %<>% 46 | mutate(summary_file = file.path(base_dir, "analysis/expr_summary", 47 | paste0(label, '.txt.gz'))) 48 | 49 | # write the raw array 50 | grid_file = "sh/analysis/run_DE/grids/run_concordance_terciles.raw.txt" 51 | grid_dir = dirname(grid_file) 52 | if (!dir.exists(grid_dir)) 53 | dir.create(grid_dir, recursive = T) 54 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 55 | 56 | # define output directory where results are stored 57 | output_dir = file.path(base_dir, "analysis/run_concordance_terciles") 58 | 59 | # check which parameters are already complete 60 | overwrite = F 61 | grid0 = grid 62 | if (!overwrite) { 63 | grid0 = grid %>% 64 | mutate(output_filename = paste0(basename(sc_file) %>% 65 | gsub("\\.rds$", "", .), 66 | "|", 67 | basename(bulk_file) %>% 68 | gsub("\\.rds$", "", .), 69 | '-n_bins=', n_bins, 70 | '.rds'), 71 | output_file = file.path(output_dir, output_filename), 72 | exists = file.exists(output_file)) %>% 73 | filter(!exists) %>% 74 | 
dplyr::select(-output_file, -output_filename, -exists) 75 | } 76 | 77 | # limit to the 'gold standard' datasets 78 | grid0 %<>% filter(grepl("Hagai|CanoGamez|Reyfman|Angelidis", label)) 79 | 80 | # write the grid that still needs to be run 81 | write.table(grid0, "sh/analysis/run_DE/grids/run_concordance_terciles.txt", 82 | quote = F, row.names = F, sep = "\t") 83 | 84 | # finally, run the job on whatever system we're on 85 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 86 | script = file.path(sh_dir, "run_concordance_terciles.sh") 87 | submit_job(grid0, script, args$allocation, system) 88 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/outer-run-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-run-DE.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list bulk input files 17 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 18 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 19 | mutate(label = gsub("_.*|-.*", "", bulk_file)) %>% 20 | # manual fix for the Hagai datasets 21 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 22 | label)) %>% 23 | # manual fix for the CanoGamez proteomics 24 | mutate(label = fct_recode(label, 25 | "CanoGamez2020" = "CanoGamez2020:proteomics")) %>% 26 | # restore the entire filepath 27 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 28 | 29 | # get single-cell comparison files 30 | sc_files = list.files(file.path(base_dir, "analysis/run_DE")) 31 | sc_inputs = data.frame(sc_file = sc_files) %>% 32 | mutate(label = gsub("_.*|-.*", "", sc_file)) %>% 33 | # manual fix for the Hagai datasets 34 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", sc_file), label)) 35 | 36 | # rep analysis grid over input files 37 | grid = bulk_inputs %>% 38 | left_join(sc_inputs) %>% 39 | drop_na() 40 | 41 | # write the raw array 42 | grid_file = "sh/analysis/run_DE/grids/run_concordance.raw.txt" 43 | grid_dir = dirname(grid_file) 44 | if (!dir.exists(grid_dir)) 45 | dir.create(grid_dir, recursive = T) 46 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 47 | 48 | # define output directory where results are stored 49 | output_dir = file.path(base_dir, "analysis/run_concordance") 50 | 51 | # check which parameters are already complete 52 | overwrite = F 53 | grid0 = grid 54 | if (!overwrite) { 55 | grid0 = grid %>% 56 | mutate(output_filename = paste0(basename(sc_file) %>% 57 | gsub("\\.rds$", "", .), 58 | "|", 59 | basename(bulk_file) %>% 60 | gsub("\\.rds$", "", .), 61 | '.rds'), 62 | output_file = file.path(output_dir, output_filename), 63 | exists = file.exists(output_file)) %>% 64 | filter(!exists) %>% 65 | dplyr::select(-output_file, -output_filename, -exists) 66 | } 67 | 68 | # limit to the 'gold standard' datasets 69 | grid0 %<>% filter(grepl("Hagai|CanoGamez|Reyfman|Angelidis", label)) 70 | 71 | # write the grid that still needs to be run 72 | write.table(grid0, "sh/analysis/run_DE/grids/run_concordance.txt", 73 | quote = F, row.names = F, sep = "\t") 74 | 75 | # finally, run the 
job on whatever system we're on 76 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 77 | script = file.path(sh_dir, "run_concordance.sh") 78 | submit_job(grid0, script, args$allocation, system) 79 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/summarise_concordance-terciles.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input 8 | input_dir = file.path(base_dir, "analysis/run_concordance_terciles") 9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | # combine into a single file 16 | dat = dats %>% 17 | bind_rows(.id = 'comparison') %>% 18 | separate(comparison, c("sc", "bulk"), "\\|") %>% 19 | separate(sc, c("sc_dataset", "sc_test", "shuffle_replicates"), "-") %>% 20 | separate(bulk, c("bulk_dataset", "bulk_test", "n_bins"), "-") %>% 21 | mutate_at(vars(sc_test, bulk_test, shuffle_replicates, n_bins), function(x) 22 | gsub(".*=|.rds", "", x)) 23 | 24 | # save results 25 | output_file = "data/analysis/bulk_concordance/concordance_terciles.rds" 26 | output_dir = dirname(output_file) 27 | if (!dir.exists(output_dir)) 28 | dir.create(output_dir, recursive = T) 29 | saveRDS(dat, output_file) 30 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/summarise_concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis/run_concordance") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # combine into a single file 15 | dat = dats %>% 16 | bind_rows(.id = 'comparison') %>% 17 | separate(comparison, c("sc", "bulk"), "\\|") %>% 18 | separate(sc, c("sc_dataset", "sc_test", "shuffle_replicates"), "-") %>% 19 | separate(bulk, c("bulk_dataset", "bulk_test"), "-") %>% 20 | mutate_at(vars(sc_test, shuffle_replicates, bulk_test), 21 | ~ gsub("^.*=|\\.rds", "", .)) 22 | 23 | # save results 24 | output_file = "data/analysis/bulk_concordance/concordance_summary.rds" 25 | output_dir = dirname(output_file) 26 | if (!dir.exists(output_dir)) 27 | dir.create(output_dir, recursive = T) 28 | saveRDS(dat, output_file) 29 | -------------------------------------------------------------------------------- /R/analysis/bulk_concordance/write_grid.R: -------------------------------------------------------------------------------- 1 | # write concordance array for scRNA-seq/bulk comparisons 2 | # FCC array 3 | fcc_opts = list( 4 | method = 'fcc', 5 | cor_method = 'spearman' 6 | ) 7 | fcc_array = do.call(expand.grid, c(fcc_opts, stringsAsFactors = F)) 8 | # AUCC array 9 | aucc_opts = list( 10 | method = 'aucc', 11 | k = c(100, 200, 500, 1000) 12 | ) 13 | aucc_array = do.call(expand.grid, c(aucc_opts, stringsAsFactors = F)) 14 | # create results template using all parameters of interest 15 | template = bind_rows( 16 | fcc_array, 17 | aucc_array 18 | ) 
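# for reference, `template` then contains five parameter combinations,
# with NA where a parameter does not apply to a method:
#   method  cor_method    k
#   fcc     spearman     NA
#   aucc    <NA>        100
#   aucc    <NA>        200
#   aucc    <NA>        500
#   aucc    <NA>       1000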
19 | -------------------------------------------------------------------------------- /R/analysis/confounds/inner-calculate-confounds.R: -------------------------------------------------------------------------------- 1 | # Calculate possible confounding factors to DE identified in the Augur paper: 2 | # - read depth (mean and total counts per cell type), 3 | # - 'gene depth' (number of genes detected per cell type), and 4 | # - 'cell depth' (number of cells sequenced per type) 5 | setwd("~/git/DE-analysis") 6 | options(stringsAsFactors = F) 7 | library(argparse) 8 | 9 | # parse arguments 10 | parser = ArgumentParser(prog = 'inner-calculate-confounds.R') 11 | parser$add_argument('--input_file', type = 'character', required = T) 12 | parser$add_argument('--output_dir', type = 'character', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | source("R/functions/get_comparisons.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | dataset = args$input_file %>% 26 | basename() %>% 27 | gsub("\\.rds$", "", .) 28 | output_filename = paste0(dataset, ".txt") 29 | output_file = file.path(args$output_dir, output_filename) 30 | 31 | # read input file and extract matrix/metadata 32 | sc = readRDS(args$input_file) 33 | expr = GetAssayData(sc, slot = 'counts') 34 | meta = sc@meta.data 35 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 36 | 37 | # get all combinations of conditions 38 | results = list() 39 | comparisons = get_comparisons(dataset, expr, meta) 40 | for (comparison_idx in seq_along(comparisons)) { 41 | comparison = comparisons[[comparison_idx]] 42 | comparison_name = names(comparisons)[comparison_idx] 43 | if (is.null(comparison_name)) 44 | comparison_name = 1 45 | 46 | message("[", comparison_idx, "/", length(comparisons), "] ", 47 | "analyzing comparison ", comparison_name, " ...") 48 | message("##############################") 49 | 50 | # get subset expression and metadata 51 | expr0 = comparison$expr 52 | meta0 = comparison$meta %>% 53 | set_rownames(colnames(expr0)) 54 | 55 | # reconstruct the Seurat object 56 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 57 | meta.data = meta0) 58 | 59 | # analyze each cell type in turn 60 | cell_types = unique(meta0$cell_type) 61 | for (cell_type_idx in seq_along(cell_types)) { 62 | cell_type = cell_types[cell_type_idx] 63 | message(" [", cell_type_idx, "/", length(cell_types), 64 | "] analyzing cell type: ", cell_type, " ...") 65 | 66 | # number of cells 67 | keep = which(meta0$cell_type == cell_type) 68 | n_cells = length(keep) 69 | 70 | # read depth per cell 71 | expr1 = expr0[, keep, drop = F] 72 | reads = colSums(expr1) 73 | read_depth_mean = mean(reads) 74 | read_depth_sum = sum(reads) 75 | 76 | # genes detected per cell 77 | n_genes = colSums(expr1 > 0) 78 | n_genes_mean = mean(n_genes) 79 | 80 | # append to results 81 | results %<>% rbind( 82 | data.frame(dataset = dataset, 83 | comparison = comparison_name, 84 | cell_type = cell_type, 85 | outcome = c("# of cells", 86 | "read depth (mean)", 87 | "read depth (sum)", 88 | "# of genes (mean)"), 89 | value = c(n_cells, 90 | read_depth_mean, 91 | read_depth_sum, 92 | n_genes_mean))) 93 | } 94 | } 95 | 96 | # write 97 | write.csv(results, output_file, row.names = F) 98 | system(paste("gzip --force", output_file)) 99 | 
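# the gzipped CSV written above is long-format, one row per cell type and
# outcome; e.g. (values illustrative):
#   dataset,comparison,cell_type,outcome,value
#   <dataset>,1,<cell type>,# of cells,2215
#   <dataset>,1,<cell type>,read depth (mean),1892.4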
-------------------------------------------------------------------------------- /R/analysis/confounds/outer-calculate-confounds.R: -------------------------------------------------------------------------------- 1 | # Calculate possible confounding factors to DE identified in the Augur paper: 2 | # - read depth (mean and total counts per cell type), 3 | # - 'gene depth' (number of genes detected per cell type), and 4 | # - 'cell depth' (number of cells sequenced per type) 5 | setwd("~/git/DE-analysis") 6 | options(stringsAsFactors = F) 7 | library(argparse) 8 | 9 | # parse arguments 10 | parser = ArgumentParser(prog = 'outer-calculate-confounds.R') 11 | parser$add_argument('--allocation', type = 'character') 12 | args = parser$parse_args() 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | source("R/functions/datasets.R") 17 | source("R/functions/submit_job.R") 18 | source("R/functions/detect_system.R") 19 | 20 | # list input files 21 | input_dir = file.path(base_dir, "rnaseq", "seurat") 22 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 23 | # grid is simply the list of input files 24 | grid = data.frame(input_file = input_files) 25 | 26 | # define output directory where results are stored 27 | output_dir = file.path(base_dir, "analysis/confounds") 28 | 29 | # check which parameters are already complete 30 | overwrite = F 31 | grid0 = grid 32 | if (!overwrite) { 33 | grid0 = grid %>% 34 | mutate(output_filename = paste0(basename(input_file) %>% 35 | gsub("\\.rds$", "", .), '.txt.gz'), 36 | output_file = file.path(output_dir, output_filename), 37 | exists = file.exists(output_file)) %>% 38 | filter(!exists) %>% 39 | dplyr::select(-output_file, -output_filename, -exists) 40 | } 41 | 42 | # write the grid that still needs to be run 43 | grid_file = "sh/analysis/confounds/grids/calculate_confounds.txt" 44 | grid_dir = dirname(grid_file) 45 | if (!dir.exists(grid_dir)) 46 | dir.create(grid_dir, recursive = T) 47 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 48 | 49 | # finally, run the job on whatever system we're on 50 | sh_dir = "~/git/DE-analysis/sh/analysis/confounds" 51 | script = file.path(sh_dir, "calculate_confounds.sh") 52 | submit_job(grid0, script, args$allocation, system) 53 | -------------------------------------------------------------------------------- /R/analysis/confounds/summarise-confounds.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # list files 8 | input_dir = file.path(base_dir, "analysis/confounds") 9 | input_files = list.files(input_dir, full.names = T, pattern = '*\\.txt\\.gz$') 10 | 11 | # read these all 12 | dats = map(input_files, read.csv) 13 | dat = do.call(rbind, dats) 14 | 15 | # save results 16 | output_file = "data/analysis/confounds/confounds.rds" 17 | output_dir = dirname(output_file) 18 | if (!dir.exists(output_dir)) 19 | dir.create(output_dir, recursive = T) 20 | saveRDS(dat, output_file) 21 | -------------------------------------------------------------------------------- /R/analysis/control_only/inner-control-only-spatial.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-control-only.R') 7 | 
parser$add_argument('--input_file', type = 'character', required = T)
8 | parser$add_argument('--de_test', type = 'character', required = T)
9 | parser$add_argument('--sample_idx', type = 'integer', required = T)
10 | parser$add_argument('--shuffle_replicates', type = 'character', required = T)
11 | parser$add_argument('--label', type = 'character', required = T)
12 | parser$add_argument('--output_dir', type = 'character', required = T)
13 | args = parser$parse_args()
14 | print(args)
15 | 
16 | library(tidyverse)
17 | library(magrittr)
18 | library(Seurat)
19 | library(Matrix)
20 | library(peakRAM)
21 | library(future)
22 | source("R/functions/get_comparisons.R")
23 | source("R/functions/run_DE.R")
24 | 
25 | # set up output filepath
26 | if (!dir.exists(args$output_dir))
27 |   dir.create(args$output_dir, recursive = T)
28 | dataset = args$input_file %>%
29 |   basename() %>%
30 |   gsub("\\.rds$", "", .)
31 | output_filename = paste0(dataset,
32 |                          '-de_test=', args$de_test,
33 |                          '-sample_idx=', args$sample_idx,
34 |                          '-shuffle_replicates=', args$shuffle_replicates,
35 |                          '-label=', args$label,
36 |                          ".rds")
37 | output_file = file.path(args$output_dir, output_filename)
38 | 
39 | # read input file and extract matrix/metadata
40 | sc = readRDS(args$input_file)
41 | expr = GetAssayData(sc, slot = 'counts')
42 | meta = sc@meta.data
43 | 
44 | # get all combinations of conditions
45 | comparisons = get_comparisons(dataset, expr, meta)
46 | if (is.null(names(comparisons))) {
47 |   names(comparisons) = '1'
48 | }
49 | 
50 | # iterate through comparisons
51 | results = list()
52 | for (comparison_name in names(comparisons)) {
53 |   comparison = comparisons[[comparison_name]]
54 | 
55 |   # get subset expression and metadata
56 |   expr0 = comparison$expr
57 |   meta0 = comparison$meta
58 | 
59 |   # filter to label of interest
60 |   meta0 %<>%
61 |     mutate(idx = row_number()) %>%
62 |     filter(grepl(args$label, label))
63 |   expr0 = expr0[, meta0$idx]
64 | 
65 |   # skip if there aren't enough replicates
66 |   if (n_distinct(meta0$replicate) < 6) {
67 |     message("skipping comparison: ", comparison_name, " (not enough replicates) ...")
68 |     next
69 |   }
70 | 
71 |   # set barcode as column
72 |   meta0 %<>%
73 |     as.data.frame() %>%
74 |     mutate(new_barcode = colnames(expr0))
75 | 
76 |   # re-assign the groups
77 |   reps = unique(meta0$replicate)
78 |   n_reps = length(reps)
79 |   ctrl = sample(reps, n_reps / 2)
80 |   meta0 %<>%
81 |     mutate(label = ifelse(replicate %in% ctrl, 'ctrl', 'stim')) %>%
82 |     set_rownames(.$new_barcode)
83 | 
84 |   # check for replicate shuffling
85 |   if (args$shuffle_replicates == "YES") {
86 |     meta0 %<>% group_by(cell_type, label) %>%
87 |       mutate(replicate = sample(replicate)) %>%
88 |       set_rownames(.$new_barcode)
89 |   }
90 | 
91 |   # reconstruct the Seurat object
92 |   sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0,
93 |                            meta.data = meta0)
94 | 
95 |   # run DE analysis
96 |   DE = run_DE(sc0, de_test = args$de_test)
97 | 
98 |   # add to results
99 |   results[[length(results) + 1]] = mutate(DE, comparison = comparison_name)
100 | }
101 | 
102 | # stop if empty
103 | if (length(results) == 0)
104 |   stop("couldn't get any results")
105 | 
106 | # save results
107 | saveRDS(results, output_file)
108 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/inner-control-only.R:
--------------------------------------------------------------------------------
1 | # Run single-cell or pseudobulk DE analyses on random splits of control
2 | # samples only.
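# example invocation (values illustrative; the Kang2018 dataset and its
# control label 'ctrl' are taken from the filters used elsewhere in this
# pipeline, and de_test can be any test listed in outer-control-only.R):
#   Rscript R/analysis/control_only/inner-control-only.R \
#     --input_file <base_dir>/rnaseq/seurat/Kang2018.rds \
#     --de_test pseudobulk_DESeq2,test?LRT \
#     --sample_idx 1 \
#     --shuffle_replicates NO \
#     --label ctrl \
#     --comparison 1 \
#     --output_dir <base_dir>/analysis/control_only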
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-control-only.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | parser$add_argument('--sample_idx', type = 'integer', required = T) 12 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 13 | parser$add_argument('--label', type = 'character', required = T) 14 | parser$add_argument('--comparison', type = 'character', required = T) 15 | parser$add_argument('--output_dir', type = 'character', required = T) 16 | args = parser$parse_args() 17 | print(args) 18 | 19 | library(tidyverse) 20 | library(magrittr) 21 | library(Seurat) 22 | library(Matrix) 23 | library(peakRAM) 24 | library(future) 25 | source("R/functions/get_comparisons.R") 26 | source("R/functions/run_DE.R") 27 | 28 | # set up output filepath 29 | if (!dir.exists(args$output_dir)) 30 | dir.create(args$output_dir, recursive = T) 31 | dataset = args$input_file %>% 32 | basename() %>% 33 | gsub("\\.rds$", "", .) 34 | output_filename = paste0(dataset, 35 | '-de_test=', args$de_test, 36 | '-sample_idx=', args$sample_idx, 37 | '-shuffle_replicates=', args$shuffle_replicates, 38 | '-label=', args$label, 39 | '-comparison=', args$comparison, 40 | ".rds") 41 | output_file = file.path(args$output_dir, output_filename) 42 | 43 | # read input file and extract matrix/metadata 44 | sc = readRDS(args$input_file) 45 | expr = GetAssayData(sc, slot = 'counts') 46 | meta = sc@meta.data 47 | 48 | # get all combinations of conditions 49 | comparisons = get_comparisons(dataset, expr, meta) 50 | if (is.null(names(comparisons))) { 51 | names(comparisons) = '1' 52 | } 53 | 54 | # grab comparison of interest 55 | comparison_name = args$comparison 56 | comparison = comparisons[[comparison_name]] 57 | 58 | # get subset expression and metadata 59 | expr0 = comparison$expr 60 | meta0 = comparison$meta 61 | 62 | # grab the correct label 63 | meta0 %<>% 64 | as.data.frame() %>% 65 | set_rownames(colnames(expr0)) %>% 66 | rownames_to_column(var = 'new_barcode') %>% 67 | filter(label == args$label) %>% 68 | set_rownames(.$new_barcode) 69 | expr0 %<>% extract(, rownames(meta0)) 70 | 71 | # re-assign the groups 72 | reps = unique(meta0$replicate) 73 | n_reps = length(reps) 74 | ctrl = sample(reps, n_reps/2) 75 | meta0 %<>% 76 | mutate(label = ifelse(replicate %in% ctrl, 'ctrl', 'stim')) %>% 77 | set_rownames(.$new_barcode) 78 | 79 | # check for replicate shuffling 80 | if (args$shuffle_replicates == "YES") { 81 | meta0 %<>% group_by(cell_type, label) %>% 82 | mutate(replicate = sample(replicate)) %>% 83 | set_rownames(.$new_barcode) 84 | } 85 | 86 | # reconstruct the Seurat object 87 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 88 | meta.data = meta0) 89 | 90 | # run DE analysis 91 | DE = run_DE(sc0, de_test = args$de_test) 92 | 93 | # stop if empty 94 | if (nrow(DE) == 0) 95 | stop("couldn't get any results") 96 | 97 | # save results 98 | saveRDS(DE, output_file) 99 | -------------------------------------------------------------------------------- /R/analysis/control_only/inner-expr-summary-control-only.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | 
parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--label', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | args = parser$parse_args() 11 | print(args) 12 | 13 | library(tidyverse) 14 | library(magrittr) 15 | library(Seurat) 16 | library(Matrix) 17 | source("R/functions/get_comparisons.R") 18 | 19 | # set up output filepath 20 | if (!dir.exists(args$output_dir)) 21 | dir.create(args$output_dir, recursive = T) 22 | dataset = args$input_file %>% 23 | basename() %>% 24 | gsub("\\.rds$", "", .) 25 | output_filename = paste0(dataset, "-label=", args$label, ".csv") 26 | output_file = file.path(args$output_dir, output_filename) 27 | 28 | # read input file and extract matrix/metadata 29 | sc = readRDS(args$input_file) 30 | expr = GetAssayData(sc, slot = 'counts') 31 | meta = sc@meta.data 32 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 33 | 34 | # filter to the condition of interest 35 | keep = which(meta$label == args$label) 36 | expr0 = expr[, keep, drop = F] 37 | meta0 = meta[keep, , drop = F] 38 | 39 | # analyze each cell type in turn 40 | results = data.frame() 41 | cell_types = unique(meta0$cell_type) 42 | for (cell_type_idx in seq_along(cell_types)) { 43 | cell_type = cell_types[cell_type_idx] 44 | message(" [", cell_type_idx, "/", length(cell_types), 45 | "] analyzing cell type: ", cell_type, " ...") 46 | 47 | # get cell-type-specific expression matrix 48 | keep = which(meta0$cell_type == cell_type) 49 | expr1 = expr0[, keep, drop = F] 50 | meta1 = meta0[keep, , drop = F] 51 | rownames(meta1) = colnames(expr1) 52 | 53 | # calculate statistics 54 | genes = rownames(expr1) 55 | means = Matrix::rowMeans(expr1) 56 | sds = sparseMatrixStats::rowSds(expr1) 57 | covs = sds / means 58 | pct_zeros = Matrix::rowSums(expr1 == 0) / ncol(expr1) 59 | 60 | # calculate logFC as defined in Seurat 61 | logFC = tryCatch({ 62 | sc0 = CreateSeuratObject(expr1, meta = meta1) %>% 63 | NormalizeData() 64 | Idents(sc0) = sc0$label 65 | mat = GetAssayData(sc0, slot = 'data') 66 | levels = levels(meta1$label) 67 | if (is.null(levels)) { 68 | levels = unique(meta1$label) 69 | } 70 | cells1 = WhichCells(sc0, idents = levels[1]) 71 | cells2 = WhichCells(sc0, idents = levels[2]) 72 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 73 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 74 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 75 | }, error = function(e) { return(NA_real_) }) 76 | 77 | # calculate pseudobulk variance 78 | pseudobulk_variance = tryCatch({ 79 | meta2 = meta1 %>% 80 | mutate(label = as.character(label), 81 | replicate = as.character(replicate)) 82 | mm = model.matrix(~ 0 + replicate, data = meta2) 83 | mat_mm = expr1 %*% mm 84 | # drop empty columns 85 | keep_samples = colSums(mat_mm) > 0 86 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 87 | # normalize 88 | mat_mm %<>% edgeR::cpm() 89 | # grab the variance for each gene 90 | vars = sparseMatrixStats::rowSds(mat_mm) 91 | vars %<>% setNames(rownames(mat_mm)) 92 | vars 93 | }, error = function(e) { return(NA_real_) }) 94 | 95 | # calculate shuffled pseudobulk variance 96 | shuffled_variance = tryCatch({ 97 | meta2 = meta1 %>% 98 | mutate(label = as.character(label), 99 | replicate = as.character(replicate)) %>% 100 | group_by(cell_type, label) %>% 101 | mutate(replicate = sample(replicate)) 102 | mm = model.matrix(~ 0 + replicate, data = meta2) 103 | mat_mm = expr1 %*% mm 104 | # drop empty 
columns 105 | keep_samples = colSums(mat_mm) > 0 106 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 107 | # normalize 108 | mat_mm %<>% edgeR::cpm() 109 | # grab the variance for each gene 110 | vars = sparseMatrixStats::rowSds(mat_mm) 111 | vars %<>% setNames(rownames(mat_mm)) 112 | vars 113 | }, error = function(e) { return(NA_real_) }) 114 | 115 | # calculate the ratio of real to shuffled variance 116 | ratio = pseudobulk_variance / shuffled_variance 117 | 118 | # convert to data frame 119 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 120 | pct_zero = pct_zeros, logFC = logFC, 121 | pseudobulk_variance = pseudobulk_variance, 122 | shuffled_variance = shuffled_variance, 123 | pseudobulk_ratio = ratio) %>% 124 | mutate(dataset = dataset, 125 | label = args$label, 126 | cell_type = cell_type) 127 | 128 | # append to results 129 | results %<>% bind_rows(df) 130 | } 131 | 132 | # rearrange columns 133 | results %<>% dplyr::select(dataset, label, cell_type, everything()) 134 | 135 | # write 136 | write.csv(results, output_file, row.names = F) 137 | system(paste("gzip --force", output_file)) 138 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-control-only-spatial.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on random splits of control 2 | # samples only in a spatial dataset. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-control-only.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "spatial", "seurat") 19 | input_files = list.files(input_dir, full.names = TRUE, pattern = '*rds') 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | # pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", 31 | "pseudobulk_edgeR,test?LRT", 32 | ## mixed model methods, implemented in Seurat 33 | "mixed_lm" 34 | ), 35 | sample_idx = 1, 36 | shuffle_replicates = c("NO", "YES") 37 | ) 38 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) %>% 39 | mutate(type = ifelse(grepl("pseudo|mixed", de_test), 'rep', 'single')) %>% 40 | # only do shuffle replicates in pseudobulk or mixed model 41 | filter(type != 'single' | shuffle_replicates != 'YES') %>% 42 | dplyr::select(-type) 43 | 44 | # rep analysis grid over input files 45 | grid %<>% 46 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 47 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 48 | left_join(inputs, by = 'input_file') %>% 49 | # reorder columns 50 | dplyr::select(input_file, de_test, sample_idx, shuffle_replicates) %>% 51 | # filter to mouse only 52 | filter(grepl("_mouse", input_file)) %>% 53 | # manually code control labels 54 | mutate(label = ifelse( 55 | gsub("\\.rds", "", basename(input_file)) == 'Maniatis2019_mouse', 56 | 'WT', NA)) 57 | 58 | # write the raw array 59 | grid_file = 
"sh/analysis/control_only/grids/control_only_spatial.raw.txt" 60 | grid_dir = dirname(grid_file) 61 | if (!dir.exists(grid_dir)) 62 | dir.create(grid_dir, recursive = T) 63 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 64 | 65 | # define output directory where results are stored 66 | output_dir = file.path(base_dir, "analysis/control_only/spatial") 67 | 68 | # check which parameters are already complete 69 | overwrite = F 70 | grid0 = grid 71 | if (!overwrite) { 72 | grid0 = grid %>% 73 | mutate(output_file = file.path(output_dir, paste0(basename(input_file) %>% 74 | gsub("\\.rds$", "", .), 75 | '-de_test=', de_test, 76 | '-sample_idx=', sample_idx, 77 | '-shuffle_replicates=', shuffle_replicates, 78 | '-label=', label, 79 | '.rds')), 80 | exists = file.exists(output_file)) %>% 81 | filter(!exists) %>% 82 | dplyr::select(-output_file, -exists) 83 | } 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/control_only/grids/control_only_spatial.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 91 | script = file.path(sh_dir, "control_only_spatial.sh") 92 | submit_job(grid0, script, args$allocation, system) 93 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-control-only.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on random splits of control 2 | # samples only. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-control-only.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list input files 19 | input_dir = file.path(base_dir, "rnaseq", "seurat") 20 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 21 | inputs = data.frame(input_file = input_files) 22 | 23 | # establish grid of analyses 24 | opts = list( 25 | de_test = c( 26 | ## single-cell methods, implemented in Seurat 27 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 28 | # pseudobulk methods 29 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 30 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 31 | "pseudobulk_edgeR,test?QLF", 32 | "pseudobulk_edgeR,test?LRT", 33 | ## mixed model methods, implemented in Seurat 34 | "mixed_lm" 35 | ), 36 | sample_idx = 1, 37 | shuffle_replicates = c("NO", "YES") 38 | ) 39 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) %>% 40 | mutate(type = ifelse(grepl("pseudo|mixed", de_test), 'rep', 'single')) %>% 41 | # only do shuffle replicates in pseudobulk or mixed model 42 | filter(type != 'single' | shuffle_replicates != 'YES') %>% 43 | dplyr::select(-type) 44 | 45 | # rep analysis grid over input files 46 | grid %<>% 47 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 48 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 49 | left_join(inputs, by = 'input_file') %>% 50 | # reorder columns 51 | dplyr::select(input_file, de_test, sample_idx, shuffle_replicates) 52 | 53 | # load in number of replicates for each dataset 54 | reps = 
readRDS("data/analysis/confounds/replicates.rds") 55 | 56 | # grab the conditions where we have enough replicates 57 | keep = reps %>% 58 | group_by(dataset, label, comparison) %>% 59 | summarise(n_reps = n()) %>% 60 | ungroup %>% 61 | filter(n_reps >= 6) 62 | 63 | # add this into the grid 64 | grid %<>% 65 | mutate(dataset = gsub(".rds$", "", basename(input_file))) %>% 66 | left_join(keep) %>% 67 | # drop the ones we aren't keeping 68 | drop_na() %>% 69 | dplyr::select(-dataset, -n_reps) 70 | 71 | # write the raw array 72 | grid_file = "sh/analysis/control_only/grids/control_only.raw.txt" 73 | grid_dir = dirname(grid_file) 74 | if (!dir.exists(grid_dir)) 75 | dir.create(grid_dir, recursive = T) 76 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 77 | 78 | # define output directory where results are stored 79 | output_dir = file.path(base_dir, "analysis/control_only") 80 | 81 | # check which parameters are already complete 82 | overwrite = F 83 | grid0 = grid 84 | if (!overwrite) { 85 | grid0 = grid %>% 86 | mutate(output_file = file.path(output_dir, paste0(basename(input_file) %>% 87 | gsub("\\.rds$", "", .), 88 | '-de_test=', de_test, 89 | '-sample_idx=', sample_idx, 90 | '-shuffle_replicates=', shuffle_replicates, 91 | '-label=', label, 92 | '-comparison=', comparison, 93 | '.rds')), 94 | exists = file.exists(output_file)) %>% 95 | filter(!exists) %>% 96 | dplyr::select(-output_file, -exists) 97 | } 98 | 99 | # write the grid that still needs to be run 100 | write.table(grid0, "sh/analysis/control_only/grids/control_only.txt", 101 | quote = F, row.names = F, sep = "\t") 102 | 103 | # finally, run the job on whatever system we're on 104 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 105 | script = file.path(sh_dir, "control_only.sh") 106 | submit_job(grid0, script, args$allocation, system) 107 | -------------------------------------------------------------------------------- /R/analysis/control_only/outer-expr-summary-control-only.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary-control-only.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | inputs = data.frame(input_file = input_files) 20 | grid = data.frame(input_file = input_files, dataset = datasets) 21 | 22 | # load in number of replicates for each dataset 23 | reps = readRDS("data/analysis/confounds/replicates.rds") 24 | 25 | # grab the conditions where we have enough replicates 26 | keep = reps %>% 27 | group_by(dataset, label, comparison) %>% 28 | summarise(n_reps = n()) %>% 29 | ungroup %>% 30 | filter(n_reps >= 6) %>% 31 | # ignore duplicate comparisons 32 | distinct(dataset, label, n_reps) 33 | 34 | # add this into the grid 35 | grid %<>% 36 | left_join(keep, by = 'dataset') %>% 37 | # drop the ones we aren't keeping 38 | drop_na() %>% 39 | dplyr::select(-n_reps) 40 | 41 | # define output directory where results are stored 42 | output_dir = file.path(base_dir, "analysis/expr_summary/control_only") 43 | 44 | # check which parameters 
are already complete 45 | overwrite = F 46 | grid0 = grid 47 | if (!overwrite) { 48 | grid0 = grid %>% 49 | mutate(output_filename = paste0(dataset, '-label=', label, '.csv.gz'), 50 | output_file = file.path(output_dir, output_filename), 51 | exists = file.exists(output_file)) %>% 52 | filter(!exists) %>% 53 | dplyr::select(-output_file, -output_filename, -exists, 54 | -dataset) 55 | } 56 | 57 | # write the grid that still needs to be run 58 | grid_file = "sh/analysis/control_only/grids/expr_summary.txt" 59 | grid_dir = dirname(grid_file) 60 | if (!dir.exists(grid_dir)) 61 | dir.create(grid_dir, recursive = T) 62 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 63 | 64 | # finally, run the job on whatever system we're on 65 | sh_dir = "~/git/DE-analysis/sh/analysis/control_only" 66 | script = file.path(sh_dir, "expr_summary_control_only.sh") 67 | submit_job(grid0, script, args$allocation, system) 68 | -------------------------------------------------------------------------------- /R/analysis/control_only/summarise-control-only-DE-vs-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | library(data.table) 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "expr_summary", "control_only") 10 | input_files = list.files(input_dir, pattern = '*\\.csv\\.gz', full.names = TRUE) 11 | 12 | # we don't need to summarize all data, so let's filter here 13 | meta = data.frame(filename = basename(input_files)) %>% 14 | mutate(idx = row_number()) %>% 15 | separate(filename, into = c('dataset', 'label'), sep = '-') %>% 16 | mutate_all(~ gsub("^.*=|\\.csv\\.gz", "", .)) %>% 17 | type_convert() 18 | # filter to control groups in simple experiments only 19 | keep = c('Goldfarbmuren2020' = 'never', ## lung from never smokers 20 | 'Grubman2019' = 'Control', ## ALZ control brains 21 | 'Hrvatin2018' = '0h', ## light-deprived mice 22 | 'Huang2020' = 'control', ## colonic mucosa in healthy children 23 | 'Kang2018' = 'ctrl', ## unstimulated PBMCs 24 | 'Mathys2019' = 'Control', ## ALZ control brains 25 | 'Nagy2020' = 'Control', ## MDD control brains 26 | 'Reyfman2020' = 'Control', ## healthy lungs 27 | 'Rossi2019' = 'control', ## mice on a control diet 28 | 'Sathyamurthy2018' = 'control', ## healthy mouse spinal cord 29 | 'Smillie2019' = 'Healthy', ## healthy colon 30 | 'Tran2019' = 'Ctrl', ## uninjured RGCs 31 | 'Wilk2020' = 'Healthy', ## control PBMCs 32 | 'Wu2017' = 'control' ## control mice 33 | ) %>% 34 | data.frame(dataset = names(.), label = .) 
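# `keep` pairs each dataset with its control label, e.g.:
#   dataset      label
#   Kang2018     ctrl
#   Smillie2019  Healthy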
35 | 
36 | # filter metadata/files accordingly
37 | meta0 = inner_join(meta, keep, by = c('dataset', 'label'))
38 | input_files %<>% extract(meta0$idx)
39 | 
40 | # read all data
41 | dats = map(input_files, fread)
42 | # combine into a single data frame
43 | dat = bind_rows(dats)
44 | 
45 | # last, we also need to load the DE results
46 | DE = readRDS(file.path(base_dir, "analysis", "summary_data",
47 |                        "control_only.rds"))
48 | n_DE = readRDS("data/analysis/control_only/n_DE_genes.rds")
49 | 
50 | ## outcome 1: write mean delta-variance for each cell type in each dataset
51 | delta_vars = dat %>%
52 |   drop_na(pseudobulk_variance, shuffled_variance) %>%
53 |   mutate(delta_variance = shuffled_variance - pseudobulk_variance) %>%
54 |   group_by(dataset, label, cell_type) %>%
55 |   summarise(mean_delta = mean(delta_variance)) %>%
56 |   ungroup()
57 | saveRDS(delta_vars, "data/analysis/control_only/delta_variance.rds")
58 | 
59 | ## outcome 2: number of DE genes in each bin
60 | bins = 10
61 | # join per-gene delta-variance onto the DE results
62 | # (join keys assumed: dataset, label, cell_type, gene)
63 | xy = dat %>%
64 |   drop_na(pseudobulk_variance, shuffled_variance) %>%
65 |   mutate(delta_variance = shuffled_variance - pseudobulk_variance) %>%
66 |   dplyr::select(dataset, label, cell_type, gene, delta_variance) %>%
67 |   inner_join(DE, by = c('dataset', 'label', 'cell_type', 'gene'))
68 | xy0 = xy %>%
69 |   mutate(abs_delta_variance = abs(delta_variance))
70 | bin_results = xy0 %>%
71 |   # bin genes by absolute delta-variance
72 |   group_by(dataset, label, cell_type, de_test, shuffle_replicates) %>%
73 |   arrange(abs_delta_variance) %>%
74 |   mutate(bin = cut(row_number() / n(),
75 |                    breaks = seq(0, bins) / bins),
76 |          bin = as.integer(bin)) %>%
77 |   ungroup() %>%
78 |   # count DE genes in each bin
79 |   group_by(dataset, label, cell_type, de_test, shuffle_replicates, bin) %>%
80 |   summarise(genes = sum(p_val_adj < 0.05)) %>%
81 |   ungroup()
82 | saveRDS(bin_results, "data/analysis/control_only/genes_per_bin.rds")
83 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/summarise-control-only-n-DE-genes.R:
--------------------------------------------------------------------------------
1 | # Count the total number of DE genes in each control-only experiment.
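# since both pseudo-groups are drawn from control samples alone, any gene
# called significant here is a false discovery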
2 | setwd("~/git/DE-analysis")
3 | options(stringsAsFactors = F)
4 | library(tidyverse)
5 | library(magrittr)
6 | source("R/functions/recode_colnames.R")
7 | args = list(); source("R/functions/detect_system.R")
8 | 
9 | # list input files
10 | input_dir = file.path(base_dir, "analysis", "control_only")
11 | input_files = list.files(input_dir, pattern = 'rds', full.names = TRUE)
12 | 
13 | # we don't need to summarize all data, so let's filter here
14 | meta = data.frame(filename = basename(input_files)) %>%
15 |   mutate(idx = row_number()) %>%
16 |   separate(filename, into = c('dataset', 'de_test', 'sample_idx',
17 |                               'shuffle_replicates', 'label', 'comparison'),
18 |            sep = '-') %>%
19 |   mutate_all(~ gsub("^.*=|\\.rds", "", .)) %>%
20 |   type_convert() %>%
21 |   # remove superfluous columns
22 |   dplyr::select(-sample_idx)
23 | # filter to control groups in simple experiments only
24 | keep = c('Goldfarbmuren2020' = 'never', ## lung from never smokers
25 |          'Grubman2019' = 'Control', ## ALZ control brains
26 |          'Hrvatin2018' = '0h', ## light-deprived mice
27 |          'Huang2020' = 'control', ## colonic mucosa in healthy children
28 |          'Kang2018' = 'ctrl', ## unstimulated PBMCs
29 |          'Mathys2019' = 'Control', ## ALZ control brains
30 |          'Nagy2020' = 'Control', ## MDD control brains
31 |          'Reyfman2020' = 'Control', ## healthy lungs
32 |          'Rossi2019' = 'control', ## mice on a control diet
33 |          'Sathyamurthy2018' = 'control', ## healthy mouse spinal cord
34 |          'Smillie2019' = 'Healthy', ## healthy colon
35 |          'Tran2019' = 'Ctrl', ## uninjured RGCs
36 |          'Wilk2020' = 'Healthy', ## control PBMCs
37 |          'Wu2017' = 'control' ## control mice
38 | ) %>%
39 |   data.frame(dataset = names(.), label = .)
40 | 
41 | # filter metadata/files accordingly
42 | meta0 = inner_join(meta, keep, by = c('dataset', 'label'))
43 | input_files %<>% extract(meta0$idx)
44 | 
45 | # read all data
46 | dats = map(input_files, ~ readRDS(.x) %>%
47 |              # fix column names
48 |              recode_colnames() %>%
49 |              # fix p-values
50 |              group_by(cell_type) %>%
51 |              mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>%
52 |              ungroup()
53 | )
54 | 
55 | # combine into a single data frame
56 | dat = meta0 %>%
57 |   split(.$idx) %>%
58 |   map2(dats, ~ cbind(.x, .y)) %>%
59 |   bind_rows()
60 | # reorder columns
61 | dat0 = dat %>%
62 |   dplyr::select(dataset, comparison, label, cell_type,
63 |                 de_test, shuffle_replicates,
64 |                 gene, p_val, test_statistic, p_val_adj) %>%
65 |   # remove missing genes
66 |   drop_na(p_val)
67 | # remove duplicated data from multiple comparisons
68 | # these are irrelevant since DE takes place within controls
69 | dat0 %<>%
70 |   group_by(dataset) %>%
71 |   filter(comparison == first(comparison)) %>%
72 |   ungroup()
73 | 
74 | # save the full set of results
75 | saveRDS(dat0, file.path(base_dir, "analysis", "summary_data",
76 |                         "control_only.rds"))
77 | 
78 | # count the number of genes
79 | n_genes = dat0 %>%
80 |   group_by(dataset, comparison, label, cell_type, de_test,
81 |            shuffle_replicates) %>%
82 |   summarise(n = sum(p_val_adj < 0.05)) %>%
83 |   ungroup()
84 | 
85 | # write # of DE genes
86 | output_file = "data/analysis/control_only/n_DE_genes.rds"
87 | output_dir = dirname(output_file)
88 | if (!dir.exists(output_dir))
89 |   dir.create(output_dir, recursive = T)
90 | saveRDS(n_genes, output_file)
91 | 
--------------------------------------------------------------------------------
/R/analysis/control_only/summarise-control-only-spatial-n-DE-genes.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | 
options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | source("R/functions/recode_colnames.R") 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "control_only", "spatial") 10 | input_files = list.files(input_dir, pattern = 'rds', full.names = TRUE) 11 | 12 | meta = data.frame(filename = basename(input_files)) %>% 13 | mutate(idx = row_number()) %>% 14 | separate(filename, into = c('dataset', 'de_test', 'sample_idx', 15 | 'shuffle_replicates', 'label'), sep = '-') %>% 16 | mutate_all(~ gsub("^.*=|\\.rds", "", .)) %>% 17 | type_convert() %>% 18 | # remove superfluous columns 19 | dplyr::select(-sample_idx) 20 | 21 | # read all data 22 | dats = map(input_files, ~ readRDS(.x) %>% 23 | bind_rows() %>% 24 | # fix column names 25 | recode_colnames() %>% 26 | # fix p-values 27 | group_by(comparison, cell_type) %>% 28 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 29 | ungroup()) 30 | 31 | # combine into a single data frame 32 | dat = meta %>% 33 | split(.$idx) %>% 34 | map2(dats, ~ cbind(.x, .y)) %>% 35 | bind_rows() 36 | # reorder columns 37 | dat0 = dat %>% 38 | dplyr::select(dataset, comparison, label, cell_type, 39 | de_test, shuffle_replicates, 40 | gene, p_val, test_statistic, p_val_adj) %>% 41 | # remove missing genes 42 | drop_na(p_val) 43 | 44 | # save the full set of results 45 | saveRDS(dat0, file.path(base_dir, "analysis", "summary_data", 46 | "control_only_spatial.rds")) 47 | 48 | # count the number of DE genes 49 | n_genes = dat0 %>% 50 | group_by(dataset, comparison, label, cell_type, de_test, 51 | shuffle_replicates) %>% 52 | summarise(n = sum(p_val_adj < 0.05)) %>% 53 | ungroup() 54 | 55 | # write # of DE genes 56 | output_file = "data/analysis/control_only/spatial/n_DE_genes.rds" 57 | output_dir = dirname(output_file) 58 | if (!dir.exists(output_dir)) 59 | dir.create(output_dir, recursive = TRUE) 60 | saveRDS(n_genes, output_file) 61 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/inner-write-delta-variance.R: -------------------------------------------------------------------------------- 1 | # Calculate the difference in variance between pseudobulks with biological and 2 | # shuffled replicates across all 46 datasets. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-write-delta-variance.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | args = parser$parse_args() 12 | print(args) 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | library(Seurat) 17 | library(Matrix) 18 | library(sparseMatrixStats) 19 | source("R/functions/datasets.R") 20 | source("R/functions/get_comparisons.R") 21 | 22 | # set up output filepath 23 | if (!dir.exists(args$output_dir)) 24 | dir.create(args$output_dir, recursive = T) 25 | dataset = args$input_file %>% 26 | basename() %>% 27 | gsub("\\.rds$", "", .)
28 | output_filename = paste0(dataset, ".rds") 29 | output_file = file.path(args$output_dir, output_filename) 30 | 31 | # read input file and extract matrix/metadata 32 | sc = readRDS(args$input_file) 33 | expr = GetAssayData(sc, slot = 'counts') 34 | meta = sc@meta.data 35 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 36 | 37 | # iterate through comparisons 38 | vars = list() 39 | comparisons = get_comparisons(dataset, expr, meta) 40 | for (comparison_idx in seq_along(comparisons)) { 41 | comparison = comparisons[[comparison_idx]] 42 | comparison_name = names(comparisons)[comparison_idx] 43 | if (is.null(comparison_name)) 44 | comparison_name = 1 45 | 46 | # get subset expression and metadata 47 | expr0 = comparison$expr 48 | meta0 = comparison$meta %>% 49 | set_rownames(colnames(expr0)) 50 | 51 | # analyze each cell type in turn 52 | cell_types = unique(meta0$cell_type) 53 | for (cell_type_idx in seq_along(cell_types)) { 54 | cell_type = cell_types[cell_type_idx] 55 | message(" [", cell_type_idx, "/", length(cell_types), 56 | "] analyzing cell type: ", cell_type, " ...") 57 | 58 | # get cell-type-specific expression matrix 59 | keep = which(meta0$cell_type == cell_type) 60 | expr1 = expr0[, keep, drop = F] 61 | meta1 = meta0[keep, , drop = F] 62 | rownames(meta1) = colnames(expr1) 63 | 64 | # catch cell types without replicates or conditions 65 | if (n_distinct(meta1$label) < 2 | 66 | n_distinct(meta1$replicate) < 3) { 67 | next 68 | } 69 | 70 | # shuffle replicates 71 | meta2 = meta1 %>% 72 | group_by(cell_type, label) %>% 73 | mutate(replicate = sample(replicate)) %>% 74 | ungroup() %>% 75 | set_rownames(colnames(expr1)) 76 | 77 | # summarise to pseudobulk matrices 78 | metadatas = list('biological replicates' = meta1, 79 | 'shuffled replicates' = meta2) 80 | grid = tidyr::crossing(replicate_type = names(metadatas)) 81 | 82 | tmp = data.frame() 83 | for (grid_idx in seq_len(nrow(grid))) { 84 | replicate_type = grid$replicate_type[grid_idx] 85 | model_matrix = metadatas[[replicate_type]] %>% 86 | ungroup() %>% 87 | mutate(label = as.character(label), 88 | replicate = as.character(replicate)) %>% 89 | model.matrix(~ 0 + replicate:label, data = .) 
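# (Aside on the next step: model_matrix is a cells x samples indicator
# matrix, so the single sparse product expr1 %*% model_matrix sums counts
# over the cells of each replicate:label pseudobulk sample. A toy sketch of
# the same idea, with made-up numbers:
#   mm = model.matrix(~ 0 + gl(2, 2))   # 4 cells assigned to 2 samples
#   matrix(1:12, nrow = 3) %*% mm       # 3 genes x 2 summed pseudobulks
# )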
90 | mat_mm = expr1 %*% model_matrix 91 | 92 | # drop empty columns 93 | keep_samples = colSums(mat_mm) > 0 94 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 95 | 96 | # filter genes with 0 variance 97 | sds = rowSds(mat_mm) 98 | mat_mm %<>% extract(sds > 0, ) 99 | 100 | # calculate CPM 101 | mat_mm %<>% edgeR::cpm() 102 | 103 | # calculate variances for each gene 104 | gene_vars = rowSds(mat_mm) 105 | 106 | # also calculate mean expression for each gene 107 | gene_means = rowMeans(mat_mm) 108 | 109 | # create output data frame 110 | df = data.frame(gene = rownames(mat_mm), 111 | mean = gene_means, 112 | variance = gene_vars, 113 | replicate_type = replicate_type 114 | ) %>% 115 | # tag dataset, comparison, cell type 116 | mutate(dataset = dataset, 117 | comparison = comparison_name, 118 | cell_type = cell_type) 119 | tmp %<>% bind_rows(df) 120 | } 121 | 122 | # summarise the shuffled-vs-biological variance comparison within this cell type 123 | summary = tmp %>% 124 | group_by(dataset, comparison, cell_type, gene) %>% 125 | filter(n() > 1) %>% 126 | mutate(cov = variance / mean, 127 | ratio = variance[replicate_type == 'shuffled replicates'] / 128 | variance[replicate_type == 'biological replicates'], 129 | delta = variance[replicate_type == 'shuffled replicates'] - 130 | variance[replicate_type == 'biological replicates'], 131 | delta_cov = cov[replicate_type == 'shuffled replicates'] - 132 | cov[replicate_type == 'biological replicates'], 133 | mean_var1 = mean(variance[replicate_type == 134 | 'shuffled replicates']), 135 | mean_var2 = mean(variance[replicate_type == 136 | 'biological replicates'])) %>% 137 | ungroup() %>% 138 | group_by(dataset, comparison, cell_type) %>% 139 | summarise(mean_ratio = mean(ratio, na.rm = TRUE), 140 | mean_delta = mean(delta, na.rm = TRUE), 141 | mean_delta_cov = mean(delta_cov, na.rm = TRUE), 142 | mean_var1 = mean(mean_var1, na.rm = TRUE), 143 | mean_var2 = mean(mean_var2, na.rm = TRUE)) %>% 144 | ungroup() %>% 145 | ## force everything to character 146 | map_dfc(as.character) 147 | 148 | # append 149 | vars %<>% bind_rows(summary) 150 | } 151 | } 152 | 153 | # save results 154 | saveRDS(vars, output_file) 155 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/outer-write-delta-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-write-delta-variance.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | # add Hagai plate data into this 20 | input_files %<>% c(file.path(input_dir, "Hagai2018_plate.rds")) 21 | # grid is simply the list of input files 22 | grid = data.frame(input_file = input_files) 23 | 24 | # define output directory where results are stored 25 | output_dir = file.path(base_dir, "analysis/delta_variance") 26 | 27 | # check which parameters are already complete 28 | overwrite = F 29 | grid0 = grid 30 | if (!overwrite) { 31 | grid0 = grid %>% 32 | mutate(output_filename = basename(input_file), 33 | output_file = file.path(output_dir, output_filename), 34 |
exists = file.exists(output_file)) %>% 35 | filter(!exists) %>% 36 | dplyr::select(-output_file, -output_filename, -exists) 37 | } 38 | 39 | # write the grid that still needs to be run 40 | grid_file = "sh/analysis/delta_variance/grids/delta_variance.txt" 41 | grid_dir = dirname(grid_file) 42 | if (!dir.exists(grid_dir)) 43 | dir.create(grid_dir, recursive = T) 44 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 45 | 46 | # finally, run the job on whatever system we're on 47 | sh_dir = "~/git/DE-analysis/sh/analysis/delta_variance" 48 | script = file.path(sh_dir, "delta_variance.sh") 49 | submit_job(grid0, script, args$allocation, system) 50 | -------------------------------------------------------------------------------- /R/analysis/delta_variance/summarise-delta-variance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | source("R/functions/datasets.R") 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | input_dir = file.path(base_dir, "analysis", "delta_variance") 10 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 11 | 12 | # read all files 13 | dats = map(input_files, readRDS) 14 | dat = bind_rows(dats) %>% type_convert() 15 | 16 | # save results 17 | output_file = "data/analysis/delta_variance/delta_variance.rds" 18 | output_dir = dirname(output_file) 19 | if (!dir.exists(output_dir)) 20 | dir.create(output_dir, recursive = TRUE) 21 | saveRDS(dat, output_file) 22 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/inner-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | # Get concordance between scRNA-seq/pseudobulk DE and bulk DE 2 | # in downsampled data 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-downsample-cells-outcomes.R') 9 | parser$add_argument('--input_sc', type = 'character', required = T) 10 | parser$add_argument('--input_bulk', type = 'character', required = T) 11 | parser$add_argument('--output_dir', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(RRHO) 20 | library(AUC) 21 | source("R/functions/calculate_overlap.R") 22 | source("R/analysis/bulk_concordance/write_grid.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | 28 | # load in files 29 | sc = readRDS(args$input_sc) 30 | bulk = readRDS(args$input_bulk) 31 | 32 | # define output file 33 | sc_name = gsub(".rds", "", basename(args$input_sc)) 34 | bulk_name = gsub(".rds", "", basename(args$input_bulk)) 35 | output_filename = paste0(sc_name, "|", bulk_name, ".rds") 36 | output_file = file.path(args$output_dir, output_filename) 37 | 38 | # get all combinations of single-cell/bulk 39 | sc_idxs = names(sc) 40 | bulk_idxs = names(bulk) 41 | if (is.null(sc_idxs)) { 42 | sc_idxs = "1" 43 | names(sc) = '1' 44 | } 45 | if (is.null(bulk_idxs)) { 46 | bulk_idxs = "1" 47 | names(bulk) = '1' 48 | } 49 | comparisons = expand.grid(sc_idxs, bulk_idxs, stringsAsFactors = F) 50 | 51 | # get rid of irrelevant comparisons from Cano-Gamez 2020 52 | if (grepl("CanoGamez2020", 
sc_name)) { 53 | keep = map2_lgl(comparisons$Var1, comparisons$Var2, ~ grepl(.x, .y)) 54 | comparisons %<>% extract(keep, ) 55 | } 56 | 57 | results = c() 58 | for (comparison_idx in 1:nrow(comparisons)) { 59 | message("analyzing comparison ", comparison_idx, " of ", nrow(comparisons), 60 | " ...") 61 | 62 | # prepare data 63 | sc_sub = sc[[comparisons[comparison_idx,]$Var1]] 64 | bulk_sub = bulk[[comparisons[comparison_idx,]$Var2]] %>% 65 | ## fix for Reyfman 66 | ungroup() 67 | sc_label = comparisons[comparison_idx,]$Var1 68 | bulk_label = comparisons[comparison_idx,]$Var2 69 | comparison_label = paste0(sc_label, "|", bulk_label) 70 | 71 | # for Angelidis, filter to relevant cell types to prevent bugs 72 | if (grepl("Angelidis", sc_name)) { 73 | sc_sub %<>% filter(cell_type %in% c("Type_2_pneumocytes", 74 | "Alveolar_macrophage")) 75 | } 76 | # same for Reyfman 77 | if (grepl("Reyfman", sc_name)) { 78 | sc_sub %<>% filter(cell_type %in% c("AT2", "Alveolar macrophages")) 79 | } 80 | 81 | # calculate concordance metrics for this comparison 82 | out = sc_sub %>% 83 | split(.$cell_type) %>% 84 | map( ~ { 85 | print(.$cell_type[1]) 86 | sc_tmp = . 87 | tmp = template %>% 88 | mutate(value = seq(nrow(template)) %>% 89 | map( ~ { 90 | print(template[., ]) 91 | method = template[.,]$method 92 | k = template[.,]$k 93 | cor_method = template[.,]$cor_method 94 | value = calculate_overlap( 95 | bulk_de = bulk_sub, 96 | sc_de = sc_tmp, 97 | method = method, 98 | k = k, 99 | cor_method = cor_method) 100 | }) %>% 101 | unlist() 102 | ) 103 | }) %>% 104 | bind_rows(.id = 'cell_type') %>% 105 | mutate( 106 | sc_label = sc_label, 107 | bulk_label = bulk_label 108 | ) 109 | # bind to main results container 110 | results %<>% bind_rows(out) 111 | } 112 | 113 | # save results 114 | saveRDS(results, output_file) 115 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/inner-downsample-cells.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on downsampled datasets. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-downsample-cells.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | parser$add_argument('--n_cells', type = 'double', required = T) 12 | parser$add_argument('--sample_idx', type = 'integer', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | source("R/functions/run_DE.R") 21 | source("R/functions/get_comparisons.R") 22 | 23 | # set up output filepath 24 | if (!dir.exists(args$output_dir)) 25 | dir.create(args$output_dir, recursive = T) 26 | dataset = args$input_file %>% 27 | basename() %>% 28 | gsub("\\.rds$", "", .) 
29 | output_filename = paste0(dataset, 30 | "-de_test=", args$de_test, 31 | "-n_cells=", args$n_cells, 32 | "-sample_idx=", args$sample_idx, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | expr = GetAssayData(sc, slot = 'counts') 39 | meta = sc@meta.data 40 | 41 | # get all combinations of conditions 42 | results = list() 43 | comparisons = get_comparisons(dataset, expr, meta) 44 | 45 | for (comparison_idx in seq_along(comparisons)) { 46 | comparison = comparisons[[comparison_idx]] 47 | comparison_name = names(comparisons)[comparison_idx] 48 | if (is.null(comparison_name)) 49 | comparison_name = 1 50 | 51 | if (grepl("Hagai2018", dataset)) { 52 | # only do certain comparisons 53 | if (!comparison_name %in% c("lps4", "pic4")) 54 | next 55 | } 56 | 57 | message("[", comparison_idx, "/", length(comparisons), "] ", 58 | "analyzing comparison ", comparison_name, " ...") 59 | message("##############################") 60 | 61 | # get subset expression and metadata 62 | set.seed(args$sample_idx) 63 | meta0 = comparison$meta %>% 64 | # make sure rownames are correct 65 | set_rownames(colnames(comparison$expr)) %>% 66 | rownames_to_column(var = 'cell_barcode') %>% 67 | group_by(replicate) %>% 68 | mutate(cells = ceiling(args$n_cells * (n() / nrow(.)))) %>% 69 | sample_n(cells[1]) %>% 70 | ## maintaining the proportions, make sure n_cells is precise 71 | ungroup() %>% 72 | sample_n(args$n_cells) %>% 73 | set_rownames(.$cell_barcode) 74 | expr0 = comparison$expr %>% extract(, rownames(meta0)) 75 | 76 | # make some checks 77 | if (grepl("Reyfman2020", dataset)) { 78 | cell_types = c("AT2", "Alveolar macrophages") 79 | meta0 %<>% filter(cell_type %in% cell_types) %>% set_rownames(.$cell_barcode) 80 | expr0 %<>% extract(, rownames(meta0)) 81 | } else if (grepl("Angelidis2019", dataset)) { 82 | cell_types = c("Alveolar_macrophage", "Type_2_pneumocytes") 83 | meta0 %<>% filter(cell_type %in% cell_types) %>% set_rownames(.$cell_barcode) 84 | expr0 %<>% extract(, rownames(meta0)) 85 | } 86 | 87 | # reconstruct the Seurat object 88 | sc_downsampled = CreateSeuratObject(expr0, 89 | min.cells = 1, min.features = 0, 90 | meta.data = meta0) 91 | 92 | # run DE analysis 93 | DE = run_DE(sc_downsampled, de_test = args$de_test) 94 | 95 | # append to list 96 | results[[comparison_name]] = DE 97 | } 98 | 99 | # stop if empty 100 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 101 | stop("couldn't get any results") 102 | 103 | # save results 104 | saveRDS(results, output_file) 105 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/outer-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | # Calculate outcomes for single-cell or pseudobulk DE analyses on 2 | # downsampled datasets. 
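# (Aside on the downsampling in inner-downsample-cells.R above: cells are
# first drawn per replicate in proportion to replicate size, then a final
# ungrouped sample_n(args$n_cells) trims the rounded-up per-replicate counts
# so the total is exactly n_cells. Toy numbers: with n_cells = 100 and two
# replicates of 600 and 400 cells, ceiling() allocates 60 and 40 cells.)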
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-downsample-cells-outcomes.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # list bulk input files 19 | bulk_files = list.files(file.path(base_dir, "analysis/run_bulk_DE")) 20 | bulk_inputs = data.frame(bulk_file = bulk_files) %>% 21 | mutate(label = gsub("-.*", "", bulk_file)) %>% 22 | # manual fix for the Hagai datasets 23 | mutate(label = ifelse(label == 'Hagai2018', gsub("-.*", "", bulk_file), 24 | label)) %>% 25 | # manually match a few sc datasets to their bulk data 26 | mutate(label = fct_recode(label, 27 | "Reyfman2020" = "Reyfman2020_alvmac", 28 | "Reyfman2020" = "Reyfman2020_AT2", 29 | "Angelidis2019" = "Angelidis2019_facsepi", 30 | "Angelidis2019" = "Angelidis2019_facsmac", 31 | "CanoGamez2020" = "CanoGamez2020:proteomics")) %>% 32 | # restore the entire filepath 33 | mutate(bulk_file = file.path(base_dir, 'analysis/run_bulk_DE', bulk_file)) 34 | 35 | # list datasets 36 | inputs = data.frame(dataset = datasets) 37 | 38 | # establish grid of analyses 39 | opts = list( 40 | de_test = c( 41 | ## single-cell methods, implemented in Seurat 42 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 43 | ## pseudobulk methods 44 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 45 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 46 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 47 | ## mixed model methods, implemented in Seurat 48 | "mixed_lm", "mixed_nbinom", "mixed_poisson", 49 | ## slight adjustments to mixed model methods 50 | "mixed_lm,test?LRT", "mixed_nbinom,test?LRT", "mixed_poisson,test?LRT", 51 | "mixed_nbinom,offset?YES", "mixed_poisson,offset?YES", 52 | "mixed_nbinom,test?LRT,offset?YES", "mixed_poisson,test?LRT,offset?YES", 53 | ## pseudobulk methods with aggregation disabled 54 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 55 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 56 | "pseudobulk_limma,mode?voom,replicate?cells", 57 | "pseudobulk_limma,mode?trend,replicate?cells", 58 | "pseudobulk_edgeR,test?QLF,replicate?cells", 59 | "pseudobulk_edgeR,test?LRT,replicate?cells"), 60 | n_cells = c(25, 50, 100, 200, 500, 1000), 61 | sample_idx = 0 62 | ) 63 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 64 | 65 | # rep analysis grid over input files 66 | grid %<>% 67 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 68 | mutate(dataset = rep(inputs$dataset, nrow(grid))) %>% 69 | left_join(inputs, by = 'dataset') %>% 70 | # reorder columns 71 | dplyr::select(dataset, de_test, n_cells, sample_idx) %>% 72 | mutate(label = dataset) %>% 73 | # add in bulk file 74 | left_join(bulk_inputs) %>% 75 | # filter Angelidis when n_cells == 25 (won't run) 76 | filter(!grepl("Angelidis", dataset) | n_cells != 25) 77 | 78 | # now, reorganize the grid to map query/target file pairs 79 | query_dir = file.path(base_dir, "analysis/downsample_cells/DE") 80 | grid %<>% 81 | mutate(input_sc = file.path(query_dir, 82 | paste0(dataset, 83 | '-de_test=', de_test, 84 | '-n_cells=', n_cells, 85 | '-sample_idx=', sample_idx, 86 | '.rds'))) %>% 87 | dplyr::rename(input_bulk = bulk_file) %>% 88 | dplyr::select(input_sc, input_bulk) 89 | 90 | # write the raw array 91 | 
grid_file = "sh/analysis/downsample_cells/grids/downsample_cells.raw.txt" 92 | grid_dir = dirname(grid_file) 93 | if (!dir.exists(grid_dir)) 94 | dir.create(grid_dir, recursive = T) 95 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 96 | 97 | # define output directory where results are stored 98 | output_dir = file.path(base_dir, "analysis/downsample_cells/concordance") 99 | 100 | # check which parameters are already complete 101 | overwrite = F 102 | grid0 = grid 103 | if (!overwrite) { 104 | grid0 = grid %>% 105 | mutate(idx = row_number(), 106 | output_filename = paste0(basename(input_sc) %>% 107 | gsub("\\.rds$", "", .), 108 | "|", 109 | basename(input_bulk) %>% 110 | gsub("\\.rds$", "", .), 111 | '.rds'), 112 | output_file = file.path(output_dir, output_filename), 113 | exists = file.exists(output_file)) %>% 114 | filter(!exists) %>% 115 | dplyr::select(-idx, -output_file, -exists) 116 | } 117 | 118 | # just do hagai and Canogamez for now 119 | grid0 %<>% filter(grepl("Hagai|Cano|Reyfman|Angelidis", input_sc)) 120 | 121 | # write the grid that still needs to be run 122 | write.table(grid0, 123 | "sh/analysis/downsample_cells/grids/downsample_cells_outcomes.txt", 124 | quote = F, row.names = F, sep = "\t") 125 | 126 | # finally, run the job on whatever system we're on 127 | sh_dir = "~/git/DE-analysis/sh/analysis/downsample_cells" 128 | script = file.path(sh_dir, "downsample_cells_outcomes.sh") 129 | submit_job(grid0, script, args$allocation, system) 130 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/outer-downsample-cells.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on downsampled datasets. 
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-downsample-cells.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | ## pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 31 | ## mixed model methods, implemented in Seurat 32 | "mixed_lm", "mixed_nbinom", "mixed_poisson", 33 | ## slight adjustments to mixed model methods 34 | "mixed_lm,test?LRT", "mixed_nbinom,test?LRT", "mixed_poisson,test?LRT", 35 | "mixed_nbinom,offset?YES", "mixed_poisson,offset?YES", 36 | "mixed_nbinom,test?LRT,offset?YES", "mixed_poisson,test?LRT,offset?YES", 37 | ## pseudobulk methods with aggregation disabled 38 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 39 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 40 | "pseudobulk_limma,mode?voom,replicate?cells", 41 | "pseudobulk_limma,mode?trend,replicate?cells", 42 | "pseudobulk_edgeR,test?QLF,replicate?cells", 43 | "pseudobulk_edgeR,test?LRT,replicate?cells" 44 | ), 45 | n_cells = c(25, 50, 100, 200, 500, 1000), 46 | sample_idx = 0 47 | ) 48 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 49 | 50 | # rep analysis grid over input files 51 | grid %<>% 52 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 53 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 54 | left_join(inputs, by = 'input_file') %>% 55 | # reorder columns 56 | dplyr::select(input_file, de_test, n_cells, sample_idx) %>% 57 | # filter Angelidis when n_cells == 25 (won't run) 58 | filter(!grepl("Angelidis", input_file) | n_cells != 25) 59 | 60 | # write the raw array 61 | grid_file = "sh/analysis/downsample_cells/grids/downsample_cells.raw.txt" 62 | grid_dir = dirname(grid_file) 63 | if (!dir.exists(grid_dir)) 64 | dir.create(grid_dir, recursive = T) 65 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 66 | 67 | # define output directory where results are stored 68 | output_dir = file.path(base_dir, "analysis/downsample_cells/DE") 69 | 70 | # check which parameters are already complete 71 | overwrite = F 72 | grid0 = grid 73 | if (!overwrite) { 74 | grid0 = grid %>% 75 | mutate(idx = row_number(), 76 | output_file = file.path(output_dir, 77 | paste0(basename(input_file) %>% 78 | gsub("\\.rds$", "", .), 79 | '-de_test=', de_test, 80 | '-n_cells=', n_cells, 81 | '-sample_idx=', sample_idx, 82 | '.rds')), 83 | exists = file.exists(output_file)) %>% 84 | filter(!exists) %>% 85 | dplyr::select(-idx, -output_file, -exists) 86 | } 87 | 88 | # just do bulk datasets for now 89 | grid0 %<>% filter(grepl("Hagai|Cano|Reyfman|Angelidis", input_file)) 90 | 91 | # write the grid that still needs to be run 92 | write.table(grid0, "sh/analysis/downsample_cells/grids/downsample_cells.txt", 93 | quote = F, 
row.names = F, sep = "\t") 94 | 95 | # finally, run the job on whatever system we're on 96 | sh_dir = "~/git/DE-analysis/sh/analysis/downsample_cells" 97 | script = file.path(sh_dir, "downsample_cells.sh") 98 | submit_job(grid0, script, args$allocation, system) 99 | -------------------------------------------------------------------------------- /R/analysis/downsample_cells/summarise-downsample-cells-outcomes.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input directory 8 | input_dir = file.path(base_dir, "analysis/downsample_cells/concordance") 9 | input_files = list.files(input_dir, full.names = T, pattern = '*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | # combine into a single file 16 | dat = dats %>% 17 | bind_rows(.id = 'comparison') %>% 18 | separate(comparison, c("sc", "bulk"), "\\|") %>% 19 | separate(sc, c("dataset", "de_test", "n_cells", "sample_idx"), sep = "-") %>% 20 | mutate(de_test = gsub("!", "\\|", de_test)) %>% 21 | separate(bulk, c("bulk_dataset", "bulk_test"), "-") %>% 22 | mutate_at(vars(dataset, de_test, n_cells, sample_idx, bulk_test), 23 | ~ gsub(".*=|.rds", "", .)) %>% 24 | # remove old bulk test framework 25 | filter(!bulk_test %in% c("bulk_limma", "bulk_DESeq2", "bulk_edgeR") | 26 | # ... but keep published Reyfman analysis 27 | (bulk_test == 'bulk_DESeq2' & grepl("Reyfman", dataset))) 28 | 29 | # save results 30 | output_file = "data/analysis/downsample_cells/concordance_summary.rds" 31 | output_dir = dirname(output_file) 32 | if (!dir.exists(output_dir)) 33 | dir.create(output_dir, recursive = T) 34 | saveRDS(dat, output_file) 35 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/inner-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--output_dir', type = 'character', required = T) 9 | args = parser$parse_args() 10 | print(args) 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | library(Seurat) 15 | library(Matrix) 16 | source("R/functions/get_comparisons.R") 17 | 18 | # set up output filepath 19 | if (!dir.exists(args$output_dir)) 20 | dir.create(args$output_dir, recursive = T) 21 | dataset = args$input_file %>% 22 | basename() %>% 23 | gsub("\\.rds$", "", .) 
24 | output_filename = paste0(dataset, ".txt") 25 | output_file = file.path(args$output_dir, output_filename) 26 | 27 | # read input file and extract matrix/metadata 28 | sc = readRDS(args$input_file) 29 | expr = GetAssayData(sc, slot = 'counts') 30 | meta = sc@meta.data 31 | dataset = gsub("\\.rds$", "", basename(args$input_file)) 32 | 33 | # get all combinations of conditions 34 | results = list() 35 | comparisons = get_comparisons(dataset, expr, meta) 36 | for (comparison_idx in seq_along(comparisons)) { 37 | comparison = comparisons[[comparison_idx]] 38 | comparison_name = names(comparisons)[comparison_idx] 39 | if (is.null(comparison_name)) 40 | comparison_name = 1 41 | 42 | message("[", comparison_idx, "/", length(comparisons), "] ", 43 | "analyzing comparison ", comparison_name, " ...") 44 | message("##############################") 45 | 46 | # get subset expression and metadata 47 | expr0 = comparison$expr 48 | meta0 = comparison$meta 49 | 50 | # analyze each cell type in turn 51 | cell_types = unique(meta0$cell_type) 52 | for (cell_type_idx in seq_along(cell_types)) { 53 | cell_type = cell_types[cell_type_idx] 54 | message(" [", cell_type_idx, "/", length(cell_types), 55 | "] analyzing cell type: ", cell_type, " ...") 56 | 57 | # get cell-type-specific expression matrix 58 | keep = which(meta0$cell_type == cell_type) 59 | expr1 = expr0[, keep, drop = F] 60 | meta1 = meta0[keep, , drop = F] 61 | rownames(meta1) = colnames(expr1) 62 | 63 | # calculate statistics 64 | genes = rownames(expr1) 65 | means = Matrix::rowMeans(expr1) 66 | sds = sparseMatrixStats::rowSds(expr1) 67 | covs = sds / means 68 | pct_zeros = Matrix::rowSums(expr1 == 0) / ncol(expr1) 69 | 70 | # calculate logFC as defined in Seurat 71 | logFC = tryCatch({ 72 | sc0 = CreateSeuratObject(expr1, meta = meta1) %>% 73 | NormalizeData() 74 | Idents(sc0) = sc0$label 75 | mat = GetAssayData(sc0, slot = 'data') 76 | levels = levels(meta1$label) 77 | if (is.null(levels)) { 78 | levels = unique(meta1$label) 79 | } 80 | cells1 = WhichCells(sc0, idents = levels[1]) 81 | cells2 = WhichCells(sc0, idents = levels[2]) 82 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 83 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 84 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 85 | }, error = function(e) { return(NA_real_) }) 86 | 87 | # calculate pseudobulk variance 88 | pseudobulk_variance = tryCatch({ 89 | meta2 = meta1 %>% 90 | mutate(label = as.character(label), 91 | replicate = as.character(replicate)) 92 | mm = model.matrix(~ 0 + replicate:label, data = meta2) 93 | mat_mm = expr1 %*% mm 94 | # drop empty columns 95 | keep_samples = colSums(mat_mm) > 0 96 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 97 | # normalize 98 | mat_mm %<>% edgeR::cpm() 99 | # grab the variance for each gene 100 | vars = sparseMatrixStats::rowSds(mat_mm) 101 | vars %<>% setNames(rownames(mat_mm)) 102 | vars 103 | }, error = function(e) { return(NA_real_) }) 104 | 105 | # calculate shuffled pseudobulk variance 106 | shuffled_variance = tryCatch({ 107 | meta2 = meta1 %>% 108 | mutate(label = as.character(label), 109 | replicate = as.character(replicate)) %>% 110 | group_by(cell_type, label) %>% 111 | mutate(replicate = sample(replicate)) 112 | mm = model.matrix(~ 0 + replicate:label, data = meta2) 113 | mat_mm = expr1 %*% mm 114 | # drop empty columns 115 | keep_samples = colSums(mat_mm) > 0 116 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 117 | # normalize 118 | mat_mm %<>% edgeR::cpm() 119 | # grab the variance 
for each gene 120 | vars = sparseMatrixStats::rowSds(mat_mm) 121 | vars %<>% setNames(rownames(mat_mm)) 122 | vars 123 | }, error = function(e) { return(NA_real_) }) 124 | 125 | # calculate the ratio of real to shuffled variance 126 | ratio = pseudobulk_variance / shuffled_variance 127 | 128 | # convert to data frame 129 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 130 | pct_zero = pct_zeros, logFC = logFC, 131 | pseudobulk_variance = pseudobulk_variance, 132 | shuffled_variance = shuffled_variance, 133 | pseudobulk_ratio = ratio) %>% 134 | mutate(dataset = dataset, 135 | comparison = comparison_name, 136 | cell_type = cell_type) 137 | 138 | # append to results 139 | results %<>% bind_rows(df) 140 | } 141 | } 142 | 143 | # rearrange columns 144 | results %<>% dplyr::select(dataset, comparison, cell_type, everything()) 145 | 146 | # write 147 | write.csv(results, output_file, row.names = F) 148 | system(paste("gzip --force", output_file)) 149 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/outer-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "rnaseq", "seurat") 18 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 19 | # add Hagai plate data into this 20 | input_files %<>% c(file.path(input_dir, "Hagai2018_plate.rds")) 21 | # grid is simply the list of input files 22 | grid = data.frame(input_file = input_files) 23 | 24 | # define output directory where results are stored 25 | output_dir = file.path(base_dir, "analysis/expr_summary") 26 | 27 | # check which parameters are already complete 28 | overwrite = F 29 | grid0 = grid 30 | if (!overwrite) { 31 | grid0 = grid %>% 32 | mutate(output_filename = paste0(basename(input_file) %>% 33 | gsub("\\.rds$", "", .), '.txt.gz'), 34 | output_file = file.path(output_dir, output_filename), 35 | exists = file.exists(output_file)) %>% 36 | filter(!exists) %>% 37 | dplyr::select(-output_file, -output_filename, -exists) 38 | } 39 | 40 | # write the grid that still needs to be run 41 | grid_file = "sh/analysis/expr_summary/grids/expr_summary.txt" 42 | grid_dir = dirname(grid_file) 43 | if (!dir.exists(grid_dir)) 44 | dir.create(grid_dir, recursive = T) 45 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 46 | 47 | # finally, run the job on whatever system we're on 48 | sh_dir = "~/git/DE-analysis/sh/analysis/expr_summary" 49 | script = file.path(sh_dir, "expr_summary.sh") 50 | submit_job(grid0, script, args$allocation, system) 51 | -------------------------------------------------------------------------------- /R/analysis/expr_summary/summarise-expr-summary.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | library(data.table) 6 | args = list(); source("R/functions/detect_system.R") 7 | 8 | # list input files 9 | summary_dir = file.path(base_dir, "analysis", 
"expr_summary") 10 | summary_files = list.files(summary_dir, full.names = TRUE, pattern = "*gz") 11 | 12 | # read them all 13 | dats = map(summary_files, fread) 14 | 15 | # combine 16 | dat = do.call(rbind, dats) 17 | 18 | # pick one comparison per dataset 19 | confounds = readRDS("data/analysis/confounds/confounds.rds") %>% 20 | # fix a couple datasets 21 | mutate(comparison = ifelse(grepl("Schafflick|Der", dataset), 22 | gsub("^.*_", "", dataset), 23 | ifelse(grepl("Hagai", dataset), 24 | paste0(gsub("^.*_", "", dataset), "|", 25 | comparison), 26 | comparison)), 27 | dataset = ifelse(grepl("Schafflick|Der|Hagai", dataset), 28 | gsub("_.*$", "", dataset), dataset)) 29 | # pick the comparison with the most cells 30 | most_cells = confounds %>% 31 | filter(outcome == '# of cells') %>% 32 | group_by(dataset, comparison) %>% 33 | summarise(total_cells = sum(value), 34 | n_cell_types = n_distinct(cell_type)) %>% 35 | ungroup() %>% 36 | group_by(dataset) %>% 37 | arrange(desc(total_cells), desc(n_cell_types)) %>% 38 | dplyr::slice(1) %>% 39 | ungroup() %>% 40 | dplyr::select(dataset, comparison) 41 | n_distinct(most_cells$dataset) 42 | 43 | # filter to these comparisons 44 | dat0 = dat %>% 45 | mutate(comparison = ifelse(grepl("Schafflick|Der", dataset), 46 | gsub("^.*_", "", dataset), 47 | ifelse(grepl("Hagai", dataset), 48 | paste0(gsub("^.*_", "", dataset), "|", 49 | comparison), 50 | comparison)), 51 | dataset = ifelse(grepl("Schafflick|Der|Hagai", dataset), 52 | gsub("_.*$", "", dataset), dataset)) %>% 53 | inner_join(most_cells, by = c('dataset', 'comparison')) 54 | 55 | # write 56 | saveRDS(dat0, "data/analysis/expr_summary/expr_summary.rds") 57 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/inner-extract-FPs.R: -------------------------------------------------------------------------------- 1 | # Extract summary statistics for the top-ranking false-positives and 2 | # false-negatives from each DE method. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = FALSE) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-extract-FPs.R') 9 | parser$add_argument('--label', type = 'character', required = TRUE) 10 | parser$add_argument('--sc_file', type = 'character', required = TRUE) 11 | parser$add_argument('--bulk_file', type = 'character', required = TRUE) 12 | parser$add_argument('--summary_file', type = 'character', required = TRUE) 13 | parser$add_argument('--output_file', type = 'character', required = TRUE) 14 | args = parser$parse_args() 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | 19 | # read single-cell and bulk DE 20 | sc_de = readRDS(args$sc_file) 21 | bulk_de = readRDS(args$bulk_file) 22 | 23 | # read expr summary 24 | expr_summary = read.csv(args$summary_file) 25 | 26 | # set up output containers 27 | FP = FN = data.frame() 28 | 29 | # iterate through single-cell comparisons 30 | label = args$label 31 | for (comparison_idx in seq_along(sc_de)) { 32 | sc_sub = sc_de[[comparison_idx]] 33 | sc_comparison = names(sc_de)[comparison_idx] 34 | if (is.null(sc_comparison)) 35 | sc_comparison = 1 36 | 37 | # filter comparisons 38 | if (grepl("Hagai2018", label) & !sc_comparison %in% c("lps4", "pic4")) { 39 | message(".. 
skipping comparison ", sc_comparison, "...") 40 | next 41 | } 42 | 43 | # iterate through cell types in the single-cell data 44 | cell_types = unique(sc_sub$cell_type) 45 | ## for Reyfman/Angelidis, only do select cell types 46 | if (grepl("Reyfman2020", label)) { 47 | cell_types = ifelse(grepl("AT2", label), "AT2", "Alveolar macrophages") 48 | } else if (grepl("Angelidis2019", label)) { 49 | cell_types = ifelse(grepl("alvmac", label), "Alveolar_macrophage", 50 | "Type_2_pneumocytes") 51 | } 52 | for (cell_type in cell_types) { 53 | message(".. analyzing cell type ", cell_type, " in comparison ", 54 | sc_comparison, "...") 55 | sc = filter(sc_sub, cell_type == !!cell_type) 56 | 57 | # get the relevant bulk comparisons 58 | if (grepl("Hagai2018", label)) { 59 | bulk_comparison = toupper(sc_comparison) 60 | bulk = bulk_de[[bulk_comparison]] 61 | } else if (label == "CanoGamez2020") { 62 | bulk_comparison = paste0('Resting|', sc_comparison, '|', cell_type, '|5d') 63 | bulk = bulk_de[[bulk_comparison]] 64 | } else if (grepl("Reyfman2020|Angelidis2019", label)) { 65 | bulk_comparison = '1' 66 | bulk = bulk_de[[1]] %>% ungroup() 67 | } else { 68 | stop("not sure what to do with label: ", label) 69 | } 70 | 71 | # fix column names 72 | fix_colnames = function(df) { 73 | colnames(df) %<>% 74 | fct_recode('p_val' = 'p.value', ## t/wilcox 75 | 'p_val' = 'pvalue', ## DESeq2 76 | ## (each input table carries only a subset of these columns) 77 | 'p_val' = 'P.Value', ## limma 78 | 'p_val' = 'PValue' , ## edgeR 79 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 80 | 'p_val_adj' = 'adj.P.Val', ## limma 81 | 'p_val_adj' = 'FDR', ## edgeR 82 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 83 | 'avg_logFC' = 'logFC', ## limma/edgeR 84 | 'test_statistic' = 'stat', ## DESeq2 85 | 'test_statistic' = 'F', ## edgeR 86 | 'test_statistic' = 't', ## limma 87 | 'test_statistic' = 'LR', ## edgeR LRT 88 | 'test_statistic' = 'statistic' ## t 89 | ) %>% 90 | as.character() 91 | return(df) 92 | } 93 | sc %<>% fix_colnames() 94 | bulk %<>% fix_colnames() 95 | 96 | # call FPs 97 | ns_sc = sc %>% 98 | # replace Bonferroni with BH correction 99 | mutate(padj = p.adjust(p_val, 'BH')) %>% 100 | filter(padj > 0.1) %>% 101 | pull(gene) 102 | ns_bulk = filter(bulk, p_val_adj > 0.1) %>% pull(gene) 103 | 104 | # single-cell FPs 105 | fps = sc %>% 106 | arrange(p_val) %>% 107 | filter(gene %in% ns_bulk) %>% 108 | head(200) %>% 109 | mutate(rank = row_number(), 110 | sc_comparison = sc_comparison, 111 | bulk_comparison = bulk_comparison) %>% 112 | dplyr::select(sc_comparison, bulk_comparison, cell_type, 113 | rank, gene, everything()) 114 | if ("runtime" %in% colnames(fps)) { 115 | fps %<>% dplyr::select(-runtime, -mem_usage) 116 | } 117 | 118 | # single-cell FNs 119 | fn_genes = bulk %>% 120 | arrange(p_val) %>% 121 | filter(gene %in% ns_sc) %>% 122 | filter(!duplicated(gene)) %>% 123 | head(200) %>% 124 | pull(gene) 125 | fns = sc %>% 126 | filter(gene %in% fn_genes) %>% 127 | # order by bulk p-values 128 | right_join(data.frame(gene = fn_genes), by = 'gene') %>% 129 | mutate(rank = row_number(), 130 | sc_comparison = sc_comparison, 131 | bulk_comparison = bulk_comparison) %>% 132 | dplyr::select(sc_comparison, bulk_comparison, cell_type, 133 | rank, gene, everything()) 134 | if ("runtime" %in% colnames(fns)) { 135 | fns %<>% dplyr::select(-runtime, -mem_usage) 136 | } 137 | 138 | # merge in expression summary to both 139 | summary0 = filter(expr_summary, 140 | cell_type == !!cell_type, 141 | comparison == sc_comparison) %>% 142 | dplyr::rename(sc_comparison =
comparison) %>% 143 | dplyr::select(-dataset) 144 | fps %<>% left_join(summary0, by = c('cell_type', 'sc_comparison', 'gene')) 145 | fns %<>% left_join(summary0, by = c('cell_type', 'sc_comparison', 'gene')) 146 | 147 | # append to results 148 | FP %<>% bind_rows(fps) 149 | FN %<>% bind_rows(fns) 150 | } 151 | } 152 | 153 | # construct output 154 | output = list(FPs = FP, FNs = FN) 155 | 156 | # create output directory, if it doesn't exist 157 | output_dir = dirname(args$output_file) 158 | if (!dir.exists(output_dir)) 159 | dir.create(output_dir, recursive = TRUE) 160 | # save results 161 | saveRDS(output, args$output_file) 162 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/outer-extract-FPs.R: -------------------------------------------------------------------------------- 1 | # Extract summary statistics for the top-ranking false-positives and 2 | # false-negatives from each DE method. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = FALSE) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-extract-FPs.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/datasets.R") 15 | source("R/functions/submit_job.R") 16 | source("R/functions/detect_system.R") 17 | 18 | # set up grid 19 | opts = list( 20 | sc_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 21 | 'Angelidis2019', 22 | 'CanoGamez2020', 23 | 'Reyfman2020'), 24 | bulk_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 25 | 'Angelidis2019_facsepi', 26 | 'Angelidis2019_facsmac', 27 | 'CanoGamez2020', 28 | 'Reyfman2020_alvmac', 29 | 'Reyfman2020_AT2'), 30 | sc_test = c( 31 | ## single-cell methods, implemented in Seurat 32 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 33 | ## pseudobulk methods 34 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 35 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 36 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 37 | ## mixed model, implemented in Seurat 38 | "mixed_lm", 39 | ## pseudobulk methods with aggregation disabled 40 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 41 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 42 | "pseudobulk_limma,mode?voom,replicate?cells", 43 | "pseudobulk_limma,mode?trend,replicate?cells", 44 | "pseudobulk_edgeR,test?QLF,replicate?cells", 45 | "pseudobulk_edgeR,test?LRT,replicate?cells"), 46 | bulk_test = c("bulk_DESeq2,test?LRT", 47 | "bulk_DESeq2,test?Wald", 48 | "bulk_limma,mode?voom", 49 | "bulk_limma,mode?trend", 50 | "bulk_edgeR,test?LRT", 51 | "bulk_edgeR,test?QLF"), 52 | shuffle_replicates = c("NO", "YES") 53 | ) 54 | grid = do.call(tidyr::crossing, opts) %>% 55 | # matching datasets 56 | extract(map2_lgl(.$sc_dataset, .$bulk_dataset, ~ grepl(.x, .y)), ) 57 | 58 | # write the raw array 59 | grid_file = "sh/analysis/extract_FPs/grids/extract_FPs.raw.txt" 60 | grid_dir = dirname(grid_file) 61 | if (!dir.exists(grid_dir)) 62 | dir.create(grid_dir, recursive = T) 63 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 64 | 65 | # define output directory where results are stored 66 | output_dir = file.path(base_dir, "analysis", "extract_FPs") 67 | 68 | # now, check for which parameters are already complete 69 | overwrite = F 70 | grid0 = grid 71 | if (overwrite == F) { 72 | grid0 = grid %>% 73 | # for Reyfman2020, recode bulk 
test 74 | mutate(bulk_test = ifelse(grepl("Reyfman2020", sc_dataset), 75 | 'bulk_DESeq2', bulk_test)) %>% 76 | distinct() %>% 77 | # set up single-cell DE, bulk DE, and expr summary filepaths 78 | mutate(sc_dir = file.path(base_dir, 'analysis', 'run_DE'), 79 | sc_filename = paste0(sc_dataset, '-de_test=', sc_test, 80 | '-shuffle_replicates=', shuffle_replicates, 81 | '.rds'), 82 | sc_file = file.path(sc_dir, sc_filename), 83 | bulk_dir = file.path(base_dir, 'analysis', 'run_bulk_DE'), 84 | bulk_filename = paste0(bulk_dataset, '-de_test=', bulk_test, '.rds'), 85 | bulk_file = file.path(bulk_dir, bulk_filename), 86 | summary_dir = file.path(base_dir, 'analysis', 87 | 'expr_summary'), 88 | summary_file = file.path(summary_dir, 89 | paste0(sc_dataset, '.txt.gz'))) %>% 90 | # set up output filepath 91 | mutate(output_filename = paste0(bulk_dataset, 92 | '-sc_test=', sc_test, 93 | '-shuffle_replicates=', shuffle_replicates, 94 | '-bulk_test=', bulk_test, 95 | '.rds'), 96 | output_file = file.path(output_dir, output_filename), 97 | exists = file.exists(output_file), 98 | idx = row_number()) %>% 99 | # drop files that exist 100 | filter(!exists) %>% 101 | # keep only parameters and I/O 102 | dplyr::select(bulk_dataset, sc_test, bulk_test, 103 | sc_file, bulk_file, summary_file, output_file) 104 | } 105 | 106 | # write the grid that still needs to be run 107 | write.table(grid0, "sh/analysis/extract_FPs/grids/extract_FPs.txt", 108 | quote = F, row.names = F, sep = "\t") 109 | 110 | # finally, run the job on whatever system we're on 111 | script = "~/git/DE-analysis/sh/analysis/extract_FPs/extract_FPs.sh" 112 | submit_job(grid0, script, args$allocation, system) 113 | -------------------------------------------------------------------------------- /R/analysis/extract_FPs/summarise-extract-FPs.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis/extract_FPs") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # extract FPs/FNs separately 15 | FPs = map(dats, 'FPs') 16 | FNs = map(dats, 'FNs') 17 | 18 | # combine into a single object 19 | FP = FPs %>% 20 | map(~ map_dfc(., as.character)) %>% 21 | bind_rows(.id = 'filename') %>% 22 | type_convert() %>% 23 | # extract missing info from filename 24 | separate(filename, into = c('dataset', 'sc_test', 'shuffle_replicate', 25 | 'bulk_test'), sep = '-') %>% 26 | mutate_at(vars(sc_test, shuffle_replicate, bulk_test), function(x) 27 | gsub(".*=|.rds", "", x)) 28 | FN = FNs %>% 29 | map(~ map_dfc(., as.character)) %>% 30 | bind_rows(.id = 'filename') %>% 31 | type_convert() %>% 32 | # extract missing info from filename 33 | separate(filename, into = c('dataset', 'sc_test', 'shuffle_replicate', 34 | 'bulk_test'), sep = '-') %>% 35 | mutate_at(vars(sc_test, shuffle_replicate, bulk_test), function(x) 36 | gsub(".*=|.rds", "", x)) 37 | 38 | # create output 39 | dat = list(FPs = FP, FNs = FN) 40 | 41 | # save results 42 | output_file = "data/analysis/extract_FPs/extract_FPs.rds" 43 | output_dir = dirname(output_file) 44 | if (!dir.exists(output_dir)) 45 | dir.create(output_dir, recursive = T) 46 | saveRDS(dat, output_file) 47 | 
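# (Aside on the filename convention parsed above: parameters are packed into
# filenames as "key=value" tokens joined by '-', with ',' and '?' inside
# test names -- e.g. "pseudobulk_edgeR,test?QLF" -- presumably so that values
# never contain '-' or '='; separate(..., sep = '-') followed by
# gsub(".*=|.rds", "", x) then recovers the bare values. A hypothetical
# example filename:
#   "Hagai2018_mouse-sc_test=wilcox-shuffle_replicates=NO-bulk_test=bulk_edgeR,test?QLF.rds"
# parses to dataset = "Hagai2018_mouse", sc_test = "wilcox",
# shuffle_replicate = "NO", bulk_test = "bulk_edgeR,test?QLF".)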
-------------------------------------------------------------------------------- /R/analysis/mean_variance/analyze-mean-delta-variance-all-datasets.R: -------------------------------------------------------------------------------- 1 | # Analyze the relationships between mean expression, expression variance, and 2 | # delta-variance in all 46 datasets. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(tidyverse) 6 | library(magrittr) 7 | library(broom) 8 | library(ppcor) 9 | 10 | # read expr_summary data 11 | dat = readRDS("data/analysis/expr_summary/expr_summary.rds") 12 | 13 | # for each dataset, calculate correlations between: 14 | ## mean and delta-variance 15 | cors1 = dat %>% 16 | # first, correlate within cell types 17 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 18 | drop_na(mean, delta) %>% 19 | group_by(dataset, comparison, cell_type) %>% 20 | do(tidy(cor.test(.$mean, .$delta, method = 'p', use = 'p'))) %>% 21 | ungroup() %>% 22 | filter(is.finite(estimate)) %>% 23 | # next, average over cell types 24 | group_by(dataset, comparison) %>% 25 | summarise(mean_cor = mean(estimate, na.rm = TRUE)) %>% 26 | ungroup() 27 | 28 | ## variance and delta-variance 29 | cors2 = dat %>% 30 | # first, correlate within cell types 31 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 32 | drop_na(pseudobulk_variance, delta) %>% 33 | group_by(dataset, comparison, cell_type) %>% 34 | do(tidy(cor.test(.$pseudobulk_variance, .$delta, method = 'p', use = 'p'))) %>% 35 | ungroup() %>% 36 | filter(is.finite(estimate)) %>% 37 | # next, average over cell types 38 | group_by(dataset, comparison) %>% 39 | summarise(mean_cor = mean(estimate, na.rm = TRUE)) %>% 40 | ungroup() 41 | 42 | # now do partial correlations between: 43 | ## delta-variance and variance, controlling for mean 44 | pcors1 = dat %>% 45 | # first, correlate within cell types 46 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 47 | drop_na(pseudobulk_variance, mean, delta) %>% 48 | group_by(dataset, comparison, cell_type) %>% 49 | mutate(partial_cor = pcor.test(pseudobulk_variance, delta, mean)$estimate) %>% 50 | ungroup() %>% 51 | filter(is.finite(partial_cor)) %>% 52 | # next, average over cell types 53 | group_by(dataset, comparison) %>% 54 | summarise(mean_cor = mean(partial_cor, na.rm = TRUE)) %>% 55 | ungroup() 56 | 57 | ## delta-variance and mean, controlling for variance 58 | pcors2 = dat %>% 59 | # first, correlate within cell types 60 | mutate(delta = shuffled_variance - pseudobulk_variance) %>% 61 | drop_na(pseudobulk_variance, mean, delta) %>% 62 | group_by(dataset, comparison, cell_type) %>% 63 | mutate(partial_cor = pcor.test(mean, delta, pseudobulk_variance)$estimate) %>% 64 | ungroup() %>% 65 | filter(is.finite(partial_cor)) %>% 66 | # next, average over cell types 67 | group_by(dataset, comparison) %>% 68 | summarise(mean_cor = mean(partial_cor, na.rm = TRUE)) %>% 69 | ungroup() 70 | 71 | # save all four correlations 72 | cors = bind_rows(mutate(cors1, xval = 'mean vs. delta-variance'), 73 | mutate(cors2, xval = 'variance vs. delta-variance'), 74 | mutate(pcors1, xval = 'variance vs. delta-variance (partial)'), 75 | mutate(pcors2, xval = 'mean vs. delta-variance (partial)')) %>% 76 | mutate(xval = fct_relevel(xval, 77 | 'variance vs. delta-variance (partial)', 78 | 'mean vs. delta-variance (partial)', 79 | 'variance vs. delta-variance', 80 | 'mean vs. 
delta-variance')) 81 | saveRDS(cors, "data/analysis/mean_variance/correlations.rds") 82 | -------------------------------------------------------------------------------- /R/analysis/run_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | parser$add_argument('--de_test', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(peakRAM) 20 | library(future) 21 | source("R/functions/get_comparisons.R") 22 | source("R/functions/run_DE.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | dataset = args$input_file %>% 28 | basename() %>% 29 | gsub("\\.rds$", "", .) 30 | output_filename = paste0(dataset, 31 | "-de_test=", args$de_test, 32 | "-shuffle_replicates=", args$shuffle_replicates, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | expr = GetAssayData(sc, slot = 'counts') 39 | meta = sc@meta.data 40 | 41 | # get all combinations of conditions 42 | results = list() 43 | comparisons = get_comparisons(dataset, expr, meta) 44 | for (comparison_idx in seq_along(comparisons)) { 45 | comparison = comparisons[[comparison_idx]] 46 | comparison_name = names(comparisons)[comparison_idx] 47 | if (is.null(comparison_name)) 48 | comparison_name = 1 49 | 50 | message("[", comparison_idx, "/", length(comparisons), "] ", 51 | "analyzing comparison ", comparison_name, " ...") 52 | message("##############################") 53 | 54 | # get subset expression and metadata 55 | expr0 = comparison$expr 56 | meta0 = comparison$meta 57 | 58 | # check for replicate shuffling 59 | if (args$shuffle_replicates == "YES") { 60 | meta0 %<>% 61 | group_by(cell_type, label) %>% 62 | mutate(replicate = sample(replicate)) 63 | } 64 | 65 | # fix rownames 66 | meta0 %<>% set_rownames(colnames(expr0)) 67 | 68 | # reconstruct the Seurat object 69 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 70 | meta.data = meta0) 71 | 72 | # run DE analysis 73 | DE = run_DE(sc0, de_test = args$de_test) 74 | 75 | # append to list 76 | results[[comparison_name]] = DE 77 | } 78 | 79 | # stop if empty 80 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 81 | stop("couldn't get any results") 82 | 83 | # save results 84 | saveRDS(results, output_file) 85 | -------------------------------------------------------------------------------- /R/analysis/run_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 
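# (Aside on the shuffle_replicates option used below: when set to "YES",
# inner-run-DE.R permutes replicate labels within each cell_type/label
# group before testing, i.e.
#   meta0 %>% group_by(cell_type, label) %>%
#     mutate(replicate = sample(replicate))
# which destroys the replicate structure while leaving condition labels
# intact -- apparently a negative control for replicate-level effects.)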
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-DE.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, paste0(datasets, '.rds')) 20 | 21 | # establish grid of analyses 22 | opts = list( 23 | de_test = c( 24 | ## single-cell methods, implemented in Seurat 25 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 26 | ## pseudobulk methods 27 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 28 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 29 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 30 | ## mixed model, implemented in Seurat 31 | "mixed_lm", 32 | ## pseudobulk methods run without aggregation 33 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 34 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 35 | "pseudobulk_limma,mode?voom,replicate?cells", 36 | "pseudobulk_limma,mode?trend,replicate?cells", 37 | "pseudobulk_edgeR,test?QLF,replicate?cells", 38 | "pseudobulk_edgeR,test?LRT,replicate?cells", 39 | ), 40 | shuffle_replicates = c("NO", "YES") 41 | ) 42 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 43 | 44 | # rep analysis grid over input files 45 | grid %<>% 46 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 47 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 48 | left_join(inputs, by = 'input_file') %>% 49 | # reorder columns 50 | dplyr::select(input_file, everything()) 51 | 52 | # write the raw array 53 | grid_file = "sh/analysis/run_DE/grids/run_DE.raw.txt" 54 | grid_dir = dirname(grid_file) 55 | if (!dir.exists(grid_dir)) 56 | dir.create(grid_dir, recursive = T) 57 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 58 | 59 | # define output directory where results are stored 60 | output_dir = file.path(base_dir, "analysis/run_DE") 61 | 62 | # check which parameters are already complete 63 | overwrite = F 64 | grid0 = grid 65 | if (!overwrite) { 66 | grid0 = grid %>% 67 | mutate(output_filename = paste0(basename(input_file) %>% 68 | gsub("\\.rds$", "", .), 69 | '-de_test=', de_test, 70 | '-shuffle_replicates=', shuffle_replicates, 71 | '.rds'), 72 | output_file = file.path(output_dir, output_filename), 73 | exists = file.exists(output_file)) %>% 74 | filter(!exists) %>% 75 | dplyr::select(-output_file, -output_filename, -exists) 76 | } 77 | 78 | # subset grid, if needed 79 | if (nrow(grid0) >= 10000) { 80 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 81 | } 82 | 83 | # write the grid that still needs to be run 84 | write.table(grid0, "sh/analysis/run_DE/grids/run_DE.txt", 85 | quote = F, row.names = F, sep = "\t") 86 | 87 | # finally, run the job on whatever system we're on 88 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 89 | script = file.path(sh_dir, "run_DE.sh") 90 | submit_job(grid0, script, args$allocation, system) 91 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/inner-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | # Calculate the concordance GSEA results from matching single-cell and 
bulk DE. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-GSEA-concordance.R') 8 | parser$add_argument('--label', type = 'character', required = TRUE) 9 | parser$add_argument('--input_sc', type = 'character', required = TRUE) 10 | parser$add_argument('--input_bulk', type = 'character', required = TRUE) 11 | parser$add_argument('--output_file', type = 'character', required = TRUE) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(AUC) 18 | source("R/functions/calculate_overlap.R") 19 | source("R/analysis/bulk_concordance/write_grid.R") 20 | 21 | # set up output filepath 22 | output_dir = dirname(args$output_file) 23 | if (!dir.exists(output_dir)) 24 | dir.create(output_dir, recursive = T) 25 | 26 | # load in files 27 | sc = readRDS(args$input_sc) 28 | bulk = readRDS(args$input_bulk) 29 | 30 | # iterate through single-cell comparisons 31 | res = data.frame() 32 | label = args$label 33 | for (sc_comparison in unique(sc$comparison)) { 34 | sc_sub = filter(sc, comparison == sc_comparison) 35 | 36 | # iterate through cell types in the single-cell data 37 | cell_types = unique(sc_sub$cell_type) 38 | for (cell_type in cell_types) { 39 | message(".. analyzing cell type ", cell_type, " in comparison ", 40 | sc_comparison, "...") 41 | input1 = filter(sc_sub, cell_type == !!cell_type) 42 | 43 | # now, get the matching bulk data 44 | if (grepl("Hagai2018", label)) { 45 | bulk_comparison = toupper(sc_comparison) 46 | input2 = filter(bulk, comparison == bulk_comparison) 47 | } else if (label == "CanoGamez2020") { 48 | bulk_comparison = paste0('Resting|', sc_comparison, '|', cell_type, '|5d') 49 | input2 = filter(bulk, comparison == bulk_comparison) 50 | } else if (grepl("Reyfman2020|Angelidis2019", label)) { 51 | bulk_comparison = 1 52 | input2 = bulk 53 | } else { 54 | stop("not sure what to do with label: ", label) 55 | } 56 | 57 | # fix columns 58 | input1 %<>% dplyr::rename(p_val = pval, p_val_adj = padj, gene = pathway, 59 | test_statistic = nMoreExtreme) %>% 60 | mutate(avg_logFC = 1) ## need to set the sign 61 | input2 %<>% dplyr::rename(p_val = pval, p_val_adj = padj, gene = pathway, 62 | test_statistic = nMoreExtreme) %>% 63 | mutate(avg_logFC = 1) 64 | 65 | # run the GSEA results through our generic concordance function 66 | concordance = template %>% 67 | mutate(value = pmap_dbl(., function(...) { 68 | template = tibble(...) 69 | print(template) 70 | value = calculate_overlap( 71 | bulk_de = input2, 72 | sc_de = input1, 73 | method = template$method, 74 | k = template$k, 75 | cor_method = template$cor_method 76 | ) 77 | return(value) 78 | })) %>% 79 | # flag comparisons and cell type 80 | mutate(sc_comparison = sc_comparison, 81 | bulk_comparison = bulk_comparison, 82 | cell_type = cell_type) 83 | 84 | # append to results 85 | res %<>% rbind(concordance) 86 | } 87 | } 88 | 89 | # save results 90 | saveRDS(res, args$output_file) 91 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/inner-run-GSEA.R: -------------------------------------------------------------------------------- 1 | # Run gene set enrichment analysis (GSEA) on a set of DE results. 
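## A minimal sketch (not part of the pipeline; all names below are toy data)
## of the fgsea call used in this script: genes are ranked by the absolute
## value of the DE test statistic and scored against gene sets by permutation.
if (FALSE) {
  library(fgsea)
  set.seed(1)
  toy_ranks = setNames(sort(abs(rnorm(100)), decreasing = TRUE),
                       paste0("gene", seq_len(100)))
  toy_sets = list(pathway_A = paste0("gene", 1:25),
                  pathway_B = paste0("gene", 40:80))
  fgsea(pathways = toy_sets, stats = toy_ranks, nperm = 1000,
        minSize = 5, maxSize = 50)
}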
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = FALSE) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-GSEA.R') 8 | parser$add_argument('--input_file', type = 'character', required = TRUE) 9 | parser$add_argument('--output_file', type = 'character', required = TRUE) 10 | parser$add_argument('--n_permutations', type = 'integer', default = 1e6) 11 | parser$add_argument('--min_size', type = 'integer', default = 10) 12 | parser$add_argument('--max_size', type = 'integer', default = 1000) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(fgsea) 19 | library(flavin) 20 | 21 | # create output directory, if it does not exist 22 | output_dir = dirname(args$output_file) 23 | if (!dir.exists(output_dir)) 24 | dir.create(output_dir, recursive = T) 25 | 26 | # read input file 27 | input = readRDS(args$input_file) 28 | 29 | # read GO 30 | species = ifelse(grepl("Angelidis|_mouse|_rat", args$input_file), 31 | 'mouse', 'human') 32 | goa_file = paste0("data/GO/", 33 | fct_recode(species, 'mgi' = 'mouse', 'goa_human' = 'human'), 34 | ".gaf.gz") 35 | goa = read_gaf(goa_file) 36 | ann = as_annotation_list(goa, 'DB_Object_Symbol', 'GO_ID') 37 | 38 | # create results container 39 | res = data.frame() 40 | 41 | # iterate through comparisons 42 | for (comparison_idx in seq_along(input)) { 43 | comparison = input[[comparison_idx]] 44 | comparison_name = names(input)[comparison_idx] 45 | if (is.null(comparison_name)) 46 | comparison_name = 1 47 | 48 | if ("cell_type" %in% colnames(comparison)) { 49 | # iterate through cell types 50 | cell_types = unique(comparison$cell_type) 51 | ## keep only a subset of cell types to improve runtime 52 | keep = c("Naive", 53 | "Memory", 54 | "bone marrow derived mononuclear phagocytes", 55 | "Alveolar_macrophage", 56 | "Type_2_pneumocytes", 57 | "AT2", 58 | "Alveolar macrophages") 59 | cell_types %<>% intersect(keep) 60 | for (cell_type in cell_types) { 61 | message(".. analyzing cell type ", cell_type, " in comparison ", 62 | comparison_name, "...") 63 | DE = filter(comparison, cell_type == !!cell_type) 64 | 65 | # fix column names 66 | colnames(DE) %<>% 67 | fct_recode('p_val' = 'p.value', ## DESeq2 68 | 'p_val' = 'pvalue', ## DESeq2 69 | 'p_val' = 'p.value', ## t/wilcox 70 | 'p_val' = 'P.Value', ## limma 71 | 'p_val' = 'PValue' , ## edgeR 72 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 73 | 'p_val_adj' = 'adj.P.Val', ## limma 74 | 'p_val_adj' = 'FDR', ## edgeER 75 | 'avg_logFC' = 'log2FoldChange', ## DESEeq2 76 | 'avg_logFC' = 'logFC', ## limma/edgeR 77 | 'test_statistic' = 'stat', ## DESeq2 78 | 'test_statistic' = 'F', ## edgeR 79 | 'test_statistic' = 't', ## limma 80 | 'test_statistic' = 'LR', ## edgeR LRT 81 | 'test_statistic' = 'statistic' ## t 82 | ) %>% 83 | as.character() 84 | 85 | # extract ranks 86 | ranks = DE %>% 87 | drop_na(test_statistic) %$% 88 | setNames(abs(test_statistic), gene) %>% 89 | sort(decreasing = TRUE) 90 | ## replace infinite values 91 | ranks[is.infinite(ranks)] = max(ranks[!is.infinite(ranks)]) 92 | 93 | # run GSEA 94 | gsea = fgsea(pathways = ann, 95 | stats = ranks, 96 | nproc = 1, 97 | nperm = args$n_permutations, 98 | minSize = args$min_size, 99 | maxSize = args$max_size) %>% 100 | dplyr::select(-leadingEdge) %>% 101 | # flag cell type and comparison 102 | mutate(cell_type = cell_type, 103 | comparison = comparison_name) 104 | 105 | # append to results 106 | res %<>% bind_rows(gsea) 107 | } 108 | } else { 109 | message(".. 
analyzing comparison ", comparison_name, "...") 110 | DE = comparison 111 | 112 | # fix column names 113 | colnames(DE) %<>% 114 | fct_recode('p_val' = 'p.value', ## DESeq2 115 | 'p_val' = 'pvalue', ## DESeq2 116 | 'p_val' = 'p.value', ## t/wilcox 117 | 'p_val' = 'P.Value', ## limma 118 | 'p_val' = 'PValue' , ## edgeR 119 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 120 | 'p_val_adj' = 'adj.P.Val', ## limma 121 | 'p_val_adj' = 'FDR', ## edgeR 122 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 123 | 'avg_logFC' = 'logFC', ## limma/edgeR 124 | 'test_statistic' = 'stat', ## DESeq2 125 | 'test_statistic' = 'F', ## edgeR 126 | 'test_statistic' = 't', ## limma 127 | 'test_statistic' = 'LR', ## edgeR LRT 128 | 'test_statistic' = 'statistic' ## t 129 | ) %>% 130 | as.character() 131 | 132 | # extract ranks 133 | ranks = DE %>% 134 | drop_na(test_statistic) %$% 135 | setNames(abs(test_statistic), gene) %>% 136 | sort(decreasing = TRUE) 137 | ranks[is.infinite(ranks)] = max(ranks[!is.infinite(ranks)]) ## replace infinite values, as in the cell type branch above 138 | # run GSEA 139 | gsea = fgsea(pathways = ann, 140 | stats = ranks, 141 | nproc = 1, 142 | nperm = args$n_permutations, 143 | minSize = args$min_size, 144 | maxSize = args$max_size) %>% 145 | dplyr::select(-leadingEdge) %>% 146 | # flag cell type and comparison 147 | mutate(comparison = comparison_name) 148 | 149 | # append to results 150 | res %<>% bind_rows(gsea) 151 | } 152 | } 153 | 154 | # stop if empty 155 | if (nrow(res) == 0) 156 | stop("couldn't get any results") 157 | 158 | # save results 159 | saveRDS(res, args$output_file) 160 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/outer-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | # Calculate the concordance of GSEA results from matching single-cell and bulk DE. 
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = FALSE) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-GSEA-concordance.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # set up grid 18 | opts = list( 19 | sc_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 20 | 'Angelidis2019', 21 | 'CanoGamez2020', 22 | 'Reyfman2020'), 23 | bulk_dataset = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 24 | 'Angelidis2019_facsepi', 25 | 'Angelidis2019_facsmac', 26 | 'CanoGamez2020', 27 | 'Reyfman2020_alvmac', 28 | 'Reyfman2020_AT2'), 29 | sc_test = c("wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 30 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 31 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 32 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 33 | "mixed_lm"), 34 | bulk_test = c("bulk_DESeq2,test?LRT", 35 | "bulk_DESeq2,test?Wald", 36 | "bulk_limma,mode?voom", 37 | "bulk_limma,mode?trend", 38 | "bulk_edgeR,test?LRT", 39 | "bulk_edgeR,test?QLF") 40 | ) 41 | grid = do.call(tidyr::crossing, opts) %>% 42 | # matching datasets 43 | extract(map2_lgl(.$sc_dataset, .$bulk_dataset, ~ grepl(.x, .y)), ) 44 | 45 | # write the raw array 46 | grid_file = "sh/analysis/run_GSEA/grids/GSEA_concordance.raw.txt" 47 | grid_dir = dirname(grid_file) 48 | if (!dir.exists(grid_dir)) 49 | dir.create(grid_dir, recursive = T) 50 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 51 | 52 | # define output directory where results are stored 53 | output_dir = file.path(base_dir, "analysis", "run_GSEA", "concordance") 54 | 55 | # now, check for which parameters are already complete 56 | overwrite = F 57 | grid0 = grid 58 | if (overwrite == F) { 59 | grid0 = grid %>% 60 | # for Reyfman2020, recode bulk test 61 | mutate(bulk_test = ifelse(grepl("Reyfman2020_", bulk_dataset), 62 | 'bulk_DESeq2', bulk_test)) %>% 63 | distinct() %>% 64 | # set up single-cell DE, bulk DE, and expr summary filepaths 65 | mutate(sc_dir = file.path(base_dir, 'analysis', 'run_GSEA', 'single_cell'), 66 | sc_filename = paste0(sc_dataset, '-de_test=', sc_test, '.rds'), 67 | sc_file = file.path(sc_dir, sc_filename), 68 | bulk_dir = file.path(base_dir, 'analysis', 'run_GSEA', 'bulk'), 69 | bulk_filename = paste0(bulk_dataset, '-de_test=', bulk_test, '.rds'), 70 | bulk_file = file.path(bulk_dir, bulk_filename)) %>% 71 | # set up output filepath 72 | mutate(output_filename = paste0(bulk_dataset, 73 | '-sc_test=', sc_test, 74 | '-bulk_test=', bulk_test, 75 | '.rds'), 76 | output_file = file.path(output_dir, output_filename), 77 | exists = file.exists(output_file), 78 | idx = row_number()) %>% 79 | # drop files that exist 80 | filter(!exists) %>% 81 | # keep only parameters and I/O 82 | dplyr::select(bulk_dataset, sc_file, bulk_file, output_file) 83 | } 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/run_GSEA/grids/GSEA_concordance.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | script = "~/git/DE-analysis/sh/analysis/run_GSEA/GSEA_concordance.sh" 91 | submit_job(grid0, script, args$allocation, system) 92 | 
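## Worked example of the dataset-matching filter above (toy call, not run):
## a grid row is kept only when the single-cell dataset name is a substring
## of the bulk dataset name.
if (FALSE) {
  purrr::map2_lgl(c("Angelidis2019", "CanoGamez2020"),
                  c("Angelidis2019_facsepi", "Reyfman2020_AT2"),
                  ~ grepl(.x, .y))
  ## returns TRUE FALSE: the Angelidis2019 pair is kept, while the mismatched
  ## CanoGamez2020/Reyfman2020_AT2 pair is dropped
}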
-------------------------------------------------------------------------------- /R/analysis/run_GSEA/outer-run-GSEA.R: -------------------------------------------------------------------------------- 1 | # Run gene set enrichment analysis (GSEA) on single-cell DE results. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-GSEA.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # manually set up the input single-cell datasets 17 | sc_datasets = c(paste0('Hagai2018_', c('rat', 'rabbit', 'mouse', 'pig')), 18 | 'CanoGamez2020', 19 | 'Angelidis2019', 20 | 'Reyfman2020') 21 | 22 | # establish analysis grid 23 | opts = list( 24 | dataset = sc_datasets, 25 | de_test = c( 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 28 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 29 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 30 | "mixed_lm") 31 | ) 32 | sc_grid = do.call(tidyr::crossing, opts) %>% 33 | mutate(input_dir = file.path(base_dir, "analysis", "run_DE"), 34 | output_dir = file.path(base_dir, "analysis", "run_GSEA", 35 | "single_cell")) 36 | 37 | # now, do the same for bulk grid 38 | bulk_datasets = c(paste0('Hagai2018_', c('mouse', 'pig', 'rat', 'rabbit')), 39 | 'Angelidis2019_facsepi', 40 | 'Angelidis2019_facsmac', 41 | 'CanoGamez2020', 42 | 'Reyfman2020_alvmac', 43 | 'Reyfman2020_AT2') 44 | opts = list( 45 | dataset = bulk_datasets, 46 | de_test = c("bulk_DESeq2,test?LRT", 47 | "bulk_DESeq2,test?Wald", 48 | "bulk_limma,mode?voom", 49 | "bulk_limma,mode?trend", 50 | "bulk_edgeR,test?LRT", 51 | "bulk_edgeR,test?QLF") 52 | ) 53 | bulk_grid = do.call(tidyr::crossing, opts) %>% 54 | mutate(input_dir = file.path(base_dir, "analysis", "run_bulk_DE"), 55 | output_dir = file.path(base_dir, "analysis", "run_GSEA", "bulk")) 56 | 57 | # combine grids 58 | grid = bind_rows(sc_grid, bulk_grid) 59 | 60 | # write the raw array 61 | grid_file = "sh/analysis/run_GSEA/grids/run_GSEA.raw.txt" 62 | grid_dir = dirname(grid_file) 63 | if (!dir.exists(grid_dir)) 64 | dir.create(grid_dir, recursive = T) 65 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 66 | 67 | # check which parameters are already complete 68 | overwrite = F 69 | grid0 = grid 70 | if (!overwrite) { 71 | grid0 = grid %>% 72 | # for Reyfman2020, recode bulk test 73 | mutate(de_test = ifelse(grepl("Reyfman2020_", dataset), 74 | 'bulk_DESeq2', de_test)) %>% 75 | distinct() %>% 76 | mutate(input_file = file.path(input_dir, paste0(dataset, 77 | '-de_test=', de_test, 78 | '.rds')), 79 | output_file = file.path(output_dir, paste0(dataset, 80 | '-de_test=', de_test, 81 | '.rds')), 82 | exists = file.exists(output_file)) %>% 83 | filter(!exists) %>% 84 | dplyr::select(dataset, de_test, input_file, output_file) 85 | } 86 | 87 | # subset grid, if needed 88 | if (nrow(grid0) >= 10000) { 89 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 90 | } 91 | 92 | # write the grid that still needs to be run 93 | write.table(grid0, "sh/analysis/run_GSEA/grids/run_GSEA.txt", 94 | quote = F, row.names = F, sep = "\t") 95 | 96 | # finally, run the job on whatever system we're on 97 | sh_dir = 
"~/git/DE-analysis/sh/analysis/run_GSEA" 98 | script = file.path(sh_dir, "run_GSEA.sh") 99 | submit_job(grid0, script, args$allocation, system) 100 | -------------------------------------------------------------------------------- /R/analysis/run_GSEA/summarise-GSEA-concordance.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | input_dir = file.path(base_dir, "analysis", "run_GSEA", "concordance") 8 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 9 | 10 | # read all input files 11 | dats = map(input_files, readRDS) %>% 12 | setNames(basename(input_files)) 13 | 14 | # combine into a single file 15 | dat = dats %>% 16 | map(~ map_dfc(., as.character)) %>% 17 | bind_rows(.id = 'comparison') %>% 18 | type_convert() %>% 19 | separate(comparison, into = c('bulk_dataset', 'sc_test', 'bulk_test'), 20 | sep = '-') %>% 21 | mutate_at(vars(sc_test, bulk_test), function(x) gsub(".*=|.rds", "", x)) 22 | 23 | # save results 24 | output_file = "data/analysis/run_GSEA/GSEA_concordance.rds" 25 | output_dir = dirname(output_file) 26 | if (!dir.exists(output_dir)) 27 | dir.create(output_dir, recursive = T) 28 | saveRDS(dat, output_file) 29 | -------------------------------------------------------------------------------- /R/analysis/run_bulk_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run bulk DE analyses on all cell types in a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--output_dir', type = 'character', required = T) 10 | parser$add_argument('--de_test', type = 'character', required = T) 11 | args = parser$parse_args() 12 | print(args) 13 | 14 | library(tidyverse) 15 | library(magrittr) 16 | library(Seurat) 17 | library(Matrix) 18 | source("R/functions/get_bulk_comparisons.R") 19 | source("R/functions/run_DE.R") 20 | 21 | # set up output filepath 22 | if (!dir.exists(args$output_dir)) 23 | dir.create(args$output_dir, recursive = T) 24 | dataset = args$input_file %>% 25 | basename() %>% 26 | gsub("\\.rds$", "", .) 27 | dataset_label = args$input_file %>% 28 | basename() %>% 29 | gsub("_.*|.rds", "", .) 
30 | output_filename = paste0(dataset, "-de_test=", args$de_test, ".rds") 31 | output_file = file.path(args$output_dir, output_filename) 32 | 33 | # read input file and extract matrix/metadata 34 | sc = readRDS(args$input_file) 35 | expr = sc$assay 36 | meta = sc$meta 37 | 38 | # get all combinations of conditions 39 | results = list() 40 | comparisons = get_bulk_comparisons(dataset_label, expr, meta) 41 | for (comparison_idx in seq_along(comparisons)) { 42 | comparison = comparisons[[comparison_idx]] 43 | comparison_name = names(comparisons)[comparison_idx] 44 | if (is.null(comparison_name)) 45 | comparison_name = 1 46 | 47 | message("[", comparison_idx, "/", length(comparisons), "] ", 48 | "analyzing comparison ", comparison_name, " ...") 49 | message("##############################") 50 | 51 | # get subset expression and metadata 52 | expr0 = comparison$expr 53 | meta0 = comparison$meta %>% 54 | set_rownames(colnames(expr0)) 55 | 56 | # run DE analysis 57 | if (grepl("proteomics|microarray", args$input_file)) { 58 | DE = bulk_DE(expr0, targets = meta0, de_test = args$de_test, used_voom = F) 59 | } else { 60 | DE = bulk_DE(expr0, targets = meta0, de_test = args$de_test) 61 | } 62 | # append to list 63 | results[[comparison_name]] = DE 64 | } 65 | 66 | # stop if empty 67 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 68 | stop("couldn't get any results") 69 | 70 | # save results 71 | saveRDS(results, output_file) 72 | -------------------------------------------------------------------------------- /R/analysis/run_bulk_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run bulk DE analysis on a dataset. 2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'outer-run-DE.R') 8 | parser$add_argument('--allocation', type = 'character') 9 | args = parser$parse_args() 10 | 11 | library(tidyverse) 12 | library(magrittr) 13 | source("R/functions/datasets.R") 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_files = file.path(base_dir, paste0(bulk_datasets, '.rds')) 19 | inputs = data.frame(input_file = input_files) %>% 20 | mutate(type = dirname(bulk_datasets)) 21 | 22 | # define tests to use 23 | de_tests = c("bulk_DESeq2,test?LRT", 24 | "bulk_DESeq2,test?Wald", 25 | "bulk_limma,mode?voom", 26 | "bulk_limma,mode?trend", 27 | "bulk_edgeR,test?LRT", 28 | "bulk_edgeR,test?QLF") 29 | 30 | # rep analysis grid over input files 31 | grid = inputs %>% 32 | dplyr::slice(rep(1:n(), each = length(de_tests))) %>% 33 | mutate(de_test = rep(de_tests, nrow(inputs))) %>% 34 | # only do limma for proteomics and microarray 35 | filter(type != 'proteomics' | !grepl("DESeq2|edgeR", de_test)) 36 | 37 | # write the raw array 38 | grid_file = "sh/analysis/run_DE/grids/run_bulk_DE.raw.txt" 39 | grid_dir = dirname(grid_file) 40 | if (!dir.exists(grid_dir)) 41 | dir.create(grid_dir, recursive = T) 42 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 43 | 44 | # define output directory where results are stored 45 | output_dir = file.path(base_dir, "analysis/run_bulk_DE") 46 | 47 | # check which parameters are already complete 48 | overwrite = F 49 | grid0 = grid 50 | if (!overwrite) { 51 | grid0 = grid %>% 52 | mutate(output_filename = paste0(basename(input_file) %>% 53 | gsub("\\.rds$", "", .), 54 | '-de_test=', de_test, 55 | '.rds'), 56 | output_file = 
file.path(output_dir, output_filename), 57 | exists = file.exists(output_file)) %>% 58 | filter(!exists) %>% 59 | dplyr::select(-output_file, -output_filename, -exists) 60 | } 61 | 62 | # write the grid that still needs to be run 63 | write.table(grid0, "sh/analysis/run_DE/grids/run_bulk_DE.txt", 64 | quote = F, row.names = F, sep = "\t") 65 | 66 | # finally, run the job on whatever system we're on 67 | sh_dir = "~/git/DE-analysis/sh/analysis/run_DE" 68 | script = file.path(sh_dir, "run_bulk_DE.sh") 69 | submit_job(grid0, script, args$allocation, system) 70 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/inner-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on the Hagai et al. dataset with 2 | # ERCC spike-ins. 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner-run-DE.R') 9 | parser$add_argument('--input_file', type = 'character', required = T) 10 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 11 | parser$add_argument('--output_dir', type = 'character', required = T) 12 | parser$add_argument('--de_test', type = 'character', required = T) 13 | args = parser$parse_args() 14 | print(args) 15 | 16 | library(tidyverse) 17 | library(magrittr) 18 | library(Seurat) 19 | library(Matrix) 20 | library(peakRAM) 21 | library(future) 22 | source("R/functions/get_comparisons.R") 23 | source("R/functions/run_DE.R") 24 | 25 | # set up output filepath 26 | if (!dir.exists(args$output_dir)) 27 | dir.create(args$output_dir, recursive = T) 28 | dataset = args$input_file %>% 29 | basename() %>% 30 | gsub("\\.rds$", "", .) 
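## e.g. input_file "Hagai2018_plate.rds" gives dataset = "Hagai2018_plate";
## the ERCC spike-in transcripts (rownames prefixed "ERCC-") pass through the
## DE tests below and are extracted downstream by summarise-spike-ins.R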
31 | output_filename = paste0(dataset, 32 | "-de_test=", args$de_test, 33 | "-shuffle_replicates=", args$shuffle_replicates, 34 | ".rds") 35 | output_file = file.path(args$output_dir, output_filename) 36 | 37 | # read input file and extract matrix/metadata 38 | sc = readRDS(args$input_file) 39 | expr = GetAssayData(sc, slot = 'counts') 40 | meta = sc@meta.data 41 | 42 | # get all combinations of conditions 43 | results = list() 44 | comparisons = get_comparisons(dataset, expr, meta) 45 | for (comparison_idx in seq_along(comparisons)) { 46 | comparison = comparisons[[comparison_idx]] 47 | comparison_name = names(comparisons)[comparison_idx] 48 | if (is.null(comparison_name)) 49 | comparison_name = 1 50 | 51 | message("[", comparison_idx, "/", length(comparisons), "] ", 52 | "analyzing comparison ", comparison_name, " ...") 53 | message("##############################") 54 | 55 | # get subset expression and metadata 56 | expr0 = comparison$expr 57 | meta0 = comparison$meta 58 | 59 | # check for replicate shuffling 60 | if (args$shuffle_replicates == "YES") { 61 | meta0 %<>% 62 | group_by(cell_type, label) %>% 63 | mutate(replicate = sample(replicate)) 64 | } 65 | 66 | # fix rownames 67 | meta0 %<>% set_rownames(colnames(expr0)) 68 | 69 | # reconstruct the Seurat object 70 | sc0 = CreateSeuratObject(expr0, min.cells = 1, min.features = 0, 71 | meta.data = meta0) 72 | 73 | # run DE analysis 74 | DE = run_DE(sc0, de_test = args$de_test) 75 | 76 | # append to list 77 | results[[comparison_name]] = DE 78 | } 79 | 80 | # stop if empty 81 | if (length(results) == 0 | all(map_int(results, nrow) == 0)) 82 | stop("couldn't get any results") 83 | 84 | # save results 85 | saveRDS(results, output_file) 86 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/outer-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on the Hagai et al. dataset with 2 | # ERCC spike-ins. 
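## Rationale sketch: ERCC spike-ins are added at nominally fixed
## concentrations, so spike-ins called DE between conditions are candidate
## false positives. The grid below mirrors outer-run-DE.R: 20 DE tests x 2
## shuffle settings = 40 jobs on the single plate-based input file.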
3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'outer-run-DE.R') 9 | parser$add_argument('--allocation', type = 'character') 10 | args = parser$parse_args() 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | source("R/functions/submit_job.R") 15 | source("R/functions/detect_system.R") 16 | 17 | # list input files 18 | input_dir = file.path(base_dir, "rnaseq", "seurat") 19 | input_files = file.path(input_dir, "Hagai2018_plate.rds") 20 | inputs = data.frame(input_file = input_files) 21 | 22 | # establish grid of analyses 23 | opts = list( 24 | de_test = c( 25 | ## single-cell methods, implemented in Seurat 26 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 27 | ## pseudobulk methods 28 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 29 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 30 | "pseudobulk_edgeR,test?QLF", "pseudobulk_edgeR,test?LRT", 31 | ## mixed model, implemented in Seurat 32 | "mixed_lm", 33 | ## pseudobulk methods run without aggregation 34 | "pseudobulk_DESeq2,test?LRT,replicate?cells", 35 | "pseudobulk_DESeq2,test?Wald,replicate?cells", 36 | "pseudobulk_limma,mode?voom,replicate?cells", 37 | "pseudobulk_limma,mode?trend,replicate?cells", 38 | "pseudobulk_edgeR,test?QLF,replicate?cells", 39 | "pseudobulk_edgeR,test?LRT,replicate?cells" 40 | ), 41 | shuffle_replicates = c("NO", "YES") 42 | ) 43 | grid = do.call(expand.grid, c(opts, stringsAsFactors = F)) 44 | 45 | # rep analysis grid over input files 46 | grid %<>% 47 | dplyr::slice(rep(1:n(), each = nrow(inputs))) %>% 48 | mutate(input_file = rep(inputs$input_file, nrow(grid))) %>% 49 | left_join(inputs, by = 'input_file') %>% 50 | # reorder columns 51 | dplyr::select(input_file, everything()) 52 | 53 | # write the raw array 54 | grid_file = "sh/analysis/run_spike_in_DE/grids/run_DE.raw.txt" 55 | grid_dir = dirname(grid_file) 56 | if (!dir.exists(grid_dir)) 57 | dir.create(grid_dir, recursive = T) 58 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 59 | 60 | # define output directory where results are stored 61 | output_dir = file.path(base_dir, "analysis/run_spike_in_DE") 62 | 63 | # check which parameters are already complete 64 | overwrite = F 65 | grid0 = grid 66 | if (!overwrite) { 67 | grid0 = grid %>% 68 | mutate(output_filename = paste0(basename(input_file) %>% 69 | gsub("\\.rds$", "", .), 70 | '-de_test=', de_test, 71 | '-shuffle_replicates=', shuffle_replicates, 72 | '.rds'), 73 | output_file = file.path(output_dir, output_filename), 74 | exists = file.exists(output_file)) %>% 75 | filter(!exists) %>% 76 | dplyr::select(-output_file, -output_filename, -exists) 77 | } 78 | 79 | # subset grid, if needed 80 | if (nrow(grid0) >= 10000) { 81 | grid0 %<>% dplyr::slice(1:9900) ## allow for some other running jobs or sh 82 | } 83 | 84 | # write the grid that still needs to be run 85 | write.table(grid0, "sh/analysis/run_spike_in_DE/grids/run_DE.txt", 86 | quote = F, row.names = F, sep = "\t") 87 | 88 | # finally, run the job on whatever system we're on 89 | sh_dir = "~/git/DE-analysis/sh/analysis/run_spike_in_DE" 90 | script = file.path(sh_dir, "run_DE.sh") 91 | submit_job(grid0, script, args$allocation, system) 92 | -------------------------------------------------------------------------------- /R/analysis/run_spike_in_DE/summarise-spike-ins.R: -------------------------------------------------------------------------------- 1 | 
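# Summarise the spike-in DE results: pull the ERCC transcripts (which should
# show no true differential expression between conditions) out of each run
# and attach gene-level expression summary statistics.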
setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | args = list(); source("R/functions/detect_system.R") 6 | 7 | # set up input directory 8 | input_dir = file.path(base_dir, "analysis/run_spike_in_DE") 9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') 10 | 11 | # read all input files 12 | dats = map(input_files, readRDS) %>% 13 | setNames(basename(input_files)) 14 | 15 | ## function to fix column names 16 | clean_cols = function(df) { 17 | # fix column names 18 | colnames(df) %<>% 19 | fct_recode('p_val' = 'p.value', ## DESeq2 20 | 'p_val' = 'pvalue', ## DESeq2 21 | 'p_val' = 'p.value', ## t/wilcox 22 | 'p_val' = 'P.Value', ## limma 23 | 'p_val' = 'PValue' , ## edgeR 24 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 25 | 'p_val_adj' = 'adj.P.Val', ## limma 26 | 'p_val_adj' = 'FDR', ## edgeER 27 | 'avg_logFC' = 'log2FoldChange', ## DESEeq2 28 | 'avg_logFC' = 'logFC', ## limma/edgeR 29 | 'test_statistic' = 'stat', ## DESeq2 30 | 'test_statistic' = 'F', ## edgeR 31 | 'test_statistic' = 't', ## limma 32 | 'test_statistic' = 'LR', ## edgeR LRT 33 | 'test_statistic' = 'statistic' ## t 34 | ) %>% 35 | as.character() 36 | return(df) 37 | } 38 | 39 | # summarise, keeping only ERCCs 40 | sum = dats %>% 41 | map(extract2, 1) %>% 42 | map(clean_cols) %>% 43 | bind_rows(.id = 'dataset') %>% 44 | filter(grepl("^ERCC-", gene)) %>% 45 | dplyr::select(dataset, cell_type, gene, p_val, p_val_adj, test, 46 | test_statistic, avg_logFC) 47 | 48 | # combine this with gene level summary statistics 49 | expr_summary = read.csv(file.path(base_dir, "analysis/expr_summary", 50 | "Hagai2018_plate.txt.gz")) %>% 51 | filter(gene %in% sum$gene) 52 | sum %<>% left_join(expr_summary, by = 'gene') %>% 53 | dplyr::select(-dataset.y, -cell_type.x) %>% 54 | dplyr::rename(cell_type = cell_type.y) %>% 55 | separate(dataset.x, into = c('dataset', 'de_test', 'shuffle_replicates'), 56 | sep = '-') %>% 57 | mutate_at(vars(de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) 58 | 59 | # save results 60 | output_file = "data/analysis/run_spike_in_DE/spike_in_summary.rds" 61 | output_dir = dirname(output_file) 62 | if (!dir.exists(output_dir)) 63 | dir.create(output_dir, recursive = T) 64 | saveRDS(sum, output_file) 65 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner-expr-summary-simulations.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'inner-expr-summary.R') 7 | parser$add_argument('--input_file', type = 'character', required = T) 8 | parser$add_argument('--output_dir', type = 'character', required = T) 9 | args = parser$parse_args() 10 | print(args) 11 | 12 | library(tidyverse) 13 | library(magrittr) 14 | library(Seurat) 15 | library(Matrix) 16 | source("R/functions/get_comparisons.R") 17 | 18 | # set up output filepath 19 | if (!dir.exists(args$output_dir)) 20 | dir.create(args$output_dir, recursive = T) 21 | output_filename = basename(args$input_file) 22 | output_file = file.path(args$output_dir, output_filename) 23 | 24 | # read input file and extract matrix/metadata 25 | sc = readRDS(args$input_file) 26 | expr = GetAssayData(sc, slot = 'counts') 27 | meta = sc@meta.data 28 | 29 | # calculate statistics 30 | genes = rownames(expr) 31 | means = Matrix::rowMeans(expr) 32 | sds = 
sparseMatrixStats::rowSds(expr) 33 | covs = sds / means 34 | pct_zeros = Matrix::rowSums(expr == 0) / ncol(expr) 35 | 36 | # calculate logFC as defined in Seurat 37 | logFC = tryCatch({ 38 | sc0 = CreateSeuratObject(expr, meta = meta) %>% 39 | NormalizeData() 40 | Idents(sc0) = sc0$label 41 | mat = GetAssayData(sc0, slot = 'data') 42 | levels = levels(meta$label) 43 | if (is.null(levels)) { 44 | levels = unique(meta$label) 45 | } 46 | cells1 = WhichCells(sc0, idents = levels[1]) 47 | cells2 = WhichCells(sc0, idents = levels[2]) 48 | data1 = log(rowMeans(mat[, cells1, drop = F] + 1)) 49 | data2 = log(rowMeans(mat[, cells2, drop = F] + 1)) 50 | out = data2 - data1 # backwards from Seurat (i.e., the proper way) 51 | }, error = function(e) { return(NA_real_) }) 52 | 53 | # calculate pseudobulk variance 54 | pseudobulk_variance = tryCatch({ 55 | meta2 = meta %>% 56 | mutate(label = as.character(label), 57 | replicate = as.character(replicate)) 58 | mm = model.matrix(~ 0 + replicate, data = meta2) 59 | mat_mm = expr %*% mm 60 | # drop empty columns 61 | keep_samples = colSums(mat_mm) > 0 62 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 63 | # normalize 64 | mat_mm %<>% edgeR::cpm() 65 | # grab the variance for each gene 66 | vars = sparseMatrixStats::rowSds(mat_mm) 67 | vars %<>% setNames(rownames(mat_mm)) 68 | vars 69 | }, error = function(e) { return(NA_real_) }) 70 | 71 | # calculate shuffled pseudobulk variance 72 | shuffled_variance = tryCatch({ 73 | meta2 = meta %>% 74 | mutate(label = as.character(label), 75 | replicate = as.character(replicate)) %>% 76 | group_by(cell_type, label) %>% 77 | mutate(replicate = sample(replicate)) 78 | mm = model.matrix(~ 0 + replicate, data = meta2) 79 | mat_mm = expr %*% mm 80 | # drop empty columns 81 | keep_samples = colSums(mat_mm) > 0 82 | mat_mm %<>% extract(, keep_samples) %>% as.matrix() 83 | # normalize 84 | mat_mm %<>% edgeR::cpm() 85 | # grab the variance for each gene 86 | vars = sparseMatrixStats::rowSds(mat_mm) 87 | vars %<>% setNames(rownames(mat_mm)) 88 | vars 89 | }, error = function(e) { return(NA_real_) }) 90 | 91 | # calculate the ratio of real to shuffled variance 92 | ratio = pseudobulk_variance / shuffled_variance 93 | 94 | # convert to data frame 95 | df = data.frame(gene = genes, mean = means, sd = sds, cov = covs, 96 | pct_zero = pct_zeros, logFC = logFC, 97 | pseudobulk_variance = pseudobulk_variance, 98 | shuffled_variance = shuffled_variance, 99 | pseudobulk_ratio = ratio) %>% 100 | # drop genes with zero expression 101 | filter(mean > 0) 102 | 103 | # write 104 | saveRDS(df, output_file) 105 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner-null-run-DE.R: -------------------------------------------------------------------------------- 1 | # Run single-cell or pseudobulk DE analyses on all cell types in a dataset. 
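## The shuffle below permutes replicate labels within each condition as a
## negative control; a toy illustration (assumed columns, not pipeline data):
if (FALSE) {
  library(dplyr)
  toy = tibble(label = rep(c("stim", "unst"), each = 3),
               replicate = paste("Replicate", 1:6))
  set.seed(1)
  toy %>% group_by(label) %>% mutate(replicate = sample(replicate))
  ## condition labels and group sizes are unchanged; which cells carry which
  ## replicate label is randomized within each condition
}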
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(argparse) 5 | 6 | # parse arguments 7 | parser = ArgumentParser(prog = 'inner-run-DE.R') 8 | parser$add_argument('--input_file', type = 'character', required = T) 9 | parser$add_argument('--shuffle_replicates', type = 'character', required = T) 10 | parser$add_argument('--output_dir', type = 'character', required = T) 11 | parser$add_argument('--de_test', type = 'character', required = T) 12 | args = parser$parse_args() 13 | print(args) 14 | 15 | library(tidyverse) 16 | library(magrittr) 17 | library(Seurat) 18 | library(Matrix) 19 | library(peakRAM) 20 | library(future) 21 | source("R/functions/get_comparisons.R") 22 | source("R/functions/run_DE.R") 23 | 24 | # set up output filepath 25 | if (!dir.exists(args$output_dir)) 26 | dir.create(args$output_dir, recursive = T) 27 | dataset = args$input_file %>% 28 | basename() %>% 29 | gsub("\\.rds$", "", .) 30 | output_filename = paste0(dataset, 31 | "-de_test=", args$de_test, 32 | "-shuffle_replicates=", args$shuffle_replicates, 33 | ".rds") 34 | output_file = file.path(args$output_dir, output_filename) 35 | 36 | # read input file and extract matrix/metadata 37 | sc = readRDS(args$input_file) 38 | 39 | # check for replicate shuffling 40 | if (args$shuffle_replicates == "YES") { 41 | sc@meta.data %<>% 42 | group_by(label) %>% 43 | mutate(replicate = sample(replicate)) %>% 44 | ungroup() %>% 45 | set_rownames(colnames(sc)) 46 | } 47 | 48 | # run DE analysis 49 | DE = run_DE(sc, de_test = args$de_test) 50 | 51 | # stop if empty 52 | if (nrow(DE) == 0) 53 | stop("couldn't get any results") 54 | 55 | # save results 56 | saveRDS(DE, output_file) 57 | -------------------------------------------------------------------------------- /R/analysis/simulations/inner_write_simulation_objects_null.R: -------------------------------------------------------------------------------- 1 | # Generate the complete set of simulated scRNA-seq datasets for the experiment 2 | # of null differential expression 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(argparse) 6 | 7 | # parse arguments 8 | parser = ArgumentParser(prog = 'inner_write_simulation_objects_null.R') 9 | parser$add_argument('--n_cells', type = 'integer', required = T) 10 | parser$add_argument('--de_prob', type = 'double', required = T) 11 | parser$add_argument('--de_loc', type = 'double', required = T) 12 | parser$add_argument('--n_reps', type = 'integer', required = T) 13 | parser$add_argument('--sample_idx', type = 'integer', required = T) 14 | parser$add_argument('--output_dir', type = 'character', required = T) 15 | args = parser$parse_args() 16 | print(args) 17 | 18 | library(Seurat) 19 | library(splatterBatch) 20 | library(scater) 21 | library(tidyverse) 22 | library(magrittr) 23 | library(Matrix) 24 | 25 | source("R/functions/detect_system.R") 26 | 27 | # check the output directory 28 | if (!dir.exists(args$output_dir)) { 29 | dir.create(args$output_dir, recursive = T) 30 | } 31 | 32 | # define output file 33 | output_filename = paste0("GSE96583", 34 | "-n_cells=", args$n_cells, 35 | "-de_prob=", args$de_prob, 36 | "-de_loc=", args$de_loc, 37 | "-n_reps=", args$n_reps, 38 | "-sample_idx=", args$sample_idx, 39 | ".rds") 40 | output_file = file.path(args$output_dir, output_filename) 41 | 42 | # get parameters defined by Kang et al. 
IFN dataset 43 | params = readRDS(file.path(base_dir, "analysis/simulations/parameters", 44 | "parameters_GSE96583.rds")) 45 | 46 | # calculate group probabilities 47 | group_probs = 1 / args$n_reps 48 | # assign groups 49 | unst = sample(seq(args$n_reps), args$n_reps/2) 50 | 51 | # generate simulated cells 52 | sim = splatterBatch::splatSimulateGroups( 53 | params = params, 54 | seed = args$sample_idx, 55 | batchCells = args$n_cells, 56 | de.prob = args$de_prob, 57 | de.facLoc = args$de_loc, 58 | group.prob = rep(group_probs, args$n_reps), verbose = F 59 | ) %>% logNormCounts() %>% as.Seurat() 60 | 61 | # adjust metadata for default input to Seurat 62 | sim@meta.data %<>% 63 | dplyr::mutate(cell_type = paste0('cell_', 1)) %>% 64 | dplyr::rename(label = Group) %>% 65 | mutate(replicate = gsub("Group", "Replicate ", label)) %>% 66 | mutate(label = as.numeric(gsub("Group", "", label))) %>% 67 | mutate(label = ifelse(label %in% unst, 'unst', 'stim')) %>% 68 | set_rownames(colnames(GetAssayData(sim))) 69 | 70 | # save 71 | saveRDS(sim, output_file) 72 | -------------------------------------------------------------------------------- /R/analysis/simulations/outer-expr-summary-simulations.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-expr-summary-simulations.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # set up grid 17 | # limit this experiment to n_reps=3 18 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "objects") 19 | grid = tidyr::crossing( 20 | n_cells = c(100, 200, 500, 1000, 2000), 21 | de_prob = 0.5, 22 | de_loc = seq(0, 1, 0.1), 23 | n_reps = c(3, 4, 5, 10, 20) * 2, 24 | sample_idx = seq_len(10) 25 | ) %>% 26 | # vary only one of n_cells/n_reps 27 | filter(n_cells == 500 | n_reps == 6) %>% 28 | mutate(input_filename = paste0("GSE96583", 29 | "-n_cells=", n_cells, 30 | "-de_prob=", de_prob, 31 | "-de_loc=", de_loc, 32 | "-n_reps=", n_reps, 33 | "-sample_idx=", sample_idx, 34 | '.rds'), 35 | input_file = file.path(input_dir, input_filename)) 36 | 37 | # define output directory where results are stored 38 | output_dir = file.path(base_dir, "analysis/simulations/null/expr_summary") 39 | 40 | # check which parameters are already complete 41 | overwrite = F 42 | grid0 = grid 43 | if (!overwrite) { 44 | grid0 = grid %>% 45 | mutate(output_filename = basename(input_file), 46 | output_file = file.path(output_dir, output_filename), 47 | exists = file.exists(output_file)) %>% 48 | filter(!exists) %>% 49 | dplyr::select(input_file) 50 | } 51 | 52 | # write the grid that still needs to be run 53 | grid_file = "sh/analysis/simulations/grids/expr_summary.txt" 54 | grid_dir = dirname(grid_file) 55 | if (!dir.exists(grid_dir)) 56 | dir.create(grid_dir, recursive = T) 57 | write.table(grid0, grid_file, quote = F, row.names = F, sep = "\t") 58 | 59 | # finally, run the job on whatever system we're on 60 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 61 | script = file.path(sh_dir, "expr_summary_simulations.sh") 62 | submit_job(grid0, script, args$allocation, system) 63 | -------------------------------------------------------------------------------- 
/R/analysis/simulations/outer-null-run-DE.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer-null-run-DE.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # list input files 17 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "objects") 18 | 19 | grid = tidyr::crossing( 20 | de_test = c( 21 | ## single-cell methods, implemented in Seurat 22 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 23 | # pseudobulk methods 24 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 25 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 26 | "pseudobulk_edgeR,test?QLF", 27 | "pseudobulk_edgeR,test?LRT", 28 | # mixed model, implemented in Seurat 29 | "mixed_lm" 30 | ), 31 | n_cells = c(100, 200, 500, 1000, 2000), 32 | de_prob = 0.5, 33 | de_loc = seq(0, 1, 0.1), 34 | n_reps = 2 * c(3, 4, 5, 10, 20), 35 | sample_idx = seq_len(10), 36 | shuffle_replicates = c("NO", "YES") 37 | ) %>% 38 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 39 | filter((n_cells == 500 & n_reps %in% c(2 * c(3, 4, 5, 10, 20))) | 40 | (n_reps == 6 & n_cells %in% c(100, 200, 500, 1000, 2000))) 41 | 42 | # write the raw array 43 | grid_file = "sh/analysis/simulations/grids/null_run_DE.raw.txt" 44 | grid_dir = dirname(grid_file) 45 | if (!dir.exists(grid_dir)) 46 | dir.create(grid_dir, recursive = T) 47 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 48 | 49 | # define output directory where results are stored 50 | output_dir = file.path(base_dir, "analysis/simulations/null/DE") 51 | 52 | # check which parameters are already complete 53 | overwrite = F 54 | grid0 = grid 55 | if (!overwrite) { 56 | grid0 = grid %>% 57 | mutate(output_filename = paste0("GSE96583", 58 | "-n_cells=", n_cells, 59 | "-de_prob=", de_prob, 60 | "-de_loc=", de_loc, 61 | "-n_reps=", n_reps, 62 | "-sample_idx=", sample_idx, 63 | '-de_test=', de_test, 64 | '-shuffle_replicates=', shuffle_replicates, 65 | '.rds'), 66 | input_filename = paste0("GSE96583", 67 | "-n_cells=", n_cells, 68 | "-de_prob=", de_prob, 69 | "-de_loc=", de_loc, 70 | "-n_reps=", n_reps, 71 | "-sample_idx=", sample_idx, 72 | '.rds'), 73 | output_file = file.path(output_dir, output_filename), 74 | input_file = file.path(input_dir, input_filename), 75 | exists = file.exists(output_file)) %>% 76 | filter(!exists) %>% 77 | dplyr::select(-output_file, -output_filename, -exists) %>% 78 | # clean up grid 79 | dplyr::select(input_file, de_test, shuffle_replicates) 80 | } 81 | 82 | # run 5000 at a time 83 | grid0 %<>% dplyr::slice(1:5000) 84 | 85 | # write the grid that still needs to be run 86 | write.table(grid0, "sh/analysis/simulations/grids/null_run_DE.txt", 87 | quote = F, row.names = F, sep = "\t") 88 | 89 | # finally, run the job on whatever system we're on 90 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 91 | script = file.path(sh_dir, "null_run_DE.sh") 92 | submit_job(grid0, script, args$allocation, system) 93 | -------------------------------------------------------------------------------- /R/analysis/simulations/outer_write_simulation_objects_null.R: 
-------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(argparse) 4 | 5 | # parse arguments 6 | parser = ArgumentParser(prog = 'outer_write_simulation_objects_null.R') 7 | parser$add_argument('--allocation', type = 'character') 8 | args = parser$parse_args() 9 | 10 | library(tidyverse) 11 | library(magrittr) 12 | source("R/functions/datasets.R") 13 | source("R/functions/submit_job.R") 14 | source("R/functions/detect_system.R") 15 | 16 | # define the output directory 17 | output_dir = file.path(base_dir, "analysis/simulations/null/objects") 18 | 19 | grid = tidyr::crossing( 20 | n_cells = c(100, 200, 500, 1000, 2000), 21 | de_prob = 0.5, 22 | de_loc = seq(0, 1, 0.1), 23 | n_reps = c(6, 8, 10, 20, 40), 24 | sample_idx = seq_len(10) 25 | ) 26 | 27 | # write the raw array 28 | grid_file = "sh/analysis/simulations/grids/write_null_objects.raw.txt" 29 | grid_dir = dirname(grid_file) 30 | if (!dir.exists(grid_dir)) 31 | dir.create(grid_dir, recursive = T) 32 | write.table(grid, grid_file, quote = F, row.names = F, sep = "\t") 33 | 34 | # define output directory where results are stored 35 | output_dir = file.path(base_dir, "analysis/simulations/null/objects") 36 | 37 | # check which parameters are already complete 38 | overwrite = F 39 | grid0 = grid 40 | if (!overwrite) { 41 | grid0 = grid %>% 42 | mutate(output_filename = paste0("GSE96583", 43 | "-n_cells=", n_cells, 44 | "-de_prob=", de_prob, 45 | "-de_loc=", de_loc, 46 | "-n_reps=", n_reps, 47 | "-sample_idx=", sample_idx, 48 | '.rds'), 49 | output_file = file.path(output_dir, output_filename), 50 | exists = file.exists(output_file)) %>% 51 | filter(!exists) %>% 52 | dplyr::select(-output_file, -output_filename, -exists) 53 | } 54 | 55 | # write the grid that still needs to be run 56 | write.table(grid0, "sh/analysis/simulations/grids/write_null_objects.txt", 57 | quote = F, row.names = F, sep = "\t") 58 | 59 | # finally, run the job on whatever system we're on 60 | sh_dir = "~/git/DE-analysis/sh/analysis/simulations" 61 | script = file.path(sh_dir, "write_null_objects.sh") 62 | submit_job(grid0, script, args$allocation, system) 63 | -------------------------------------------------------------------------------- /R/analysis/simulations/summarise-null-DE-genes-per-bin.R: -------------------------------------------------------------------------------- 1 | # Tally the number of DE genes within bins of genes grouped by delta-variance. 
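## Equal-frequency binning sketch, as used below: genes are ordered by
## |delta-variance| and cut() on the rank fraction yields equal-sized bins.
if (FALSE) {
  set.seed(1)
  x = runif(100)
  bin = as.integer(cut(rank(x) / length(x), breaks = seq(0, 10) / 10))
  table(bin)  ## ten bins of ten genes each
}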
2 | setwd("~/git/DE-analysis") 3 | options(stringsAsFactors = F) 4 | library(tidyverse) 5 | library(magrittr) 6 | source("R/functions/recode_colnames.R") 7 | args = list(); source("R/functions/detect_system.R") 8 | 9 | # set up grid 10 | # limit this experiment to n_reps=3, n_cells=500 11 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "DE") 12 | input_files = tidyr::crossing( 13 | de_test = c( 14 | ## single-cell methods, implemented in Seurat 15 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 16 | # pseudobulk methods 17 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 18 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 19 | "pseudobulk_edgeR,test?QLF", 20 | "pseudobulk_edgeR,test?LRT", 21 | # mixed model, implemented in Seurat 22 | "mixed_lm" 23 | ), 24 | n_cells = 500, 25 | de_prob = 0.5, 26 | de_loc = seq(0, 1, 0.1), 27 | n_reps = 6, 28 | sample_idx = seq_len(10), 29 | shuffle_replicates = c("NO", "YES") 30 | ) %>% 31 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 32 | mutate(input_filename = paste0("GSE96583", 33 | "-n_cells=", n_cells, 34 | "-de_prob=", de_prob, 35 | "-de_loc=", de_loc, 36 | "-n_reps=", n_reps, 37 | "-sample_idx=", sample_idx, 38 | '-de_test=', de_test, 39 | '-shuffle_replicates=', shuffle_replicates, 40 | '.rds'), 41 | input_file = file.path(input_dir, input_filename)) %>% 42 | pull(input_file) 43 | 44 | # also set up the expr_summary files 45 | summary_dir = file.path(base_dir, "analysis", "simulations", "null", 46 | "expr_summary") 47 | summary_files = tidyr::crossing( 48 | n_cells = 500, 49 | de_prob = 0.5, 50 | de_loc = seq(0, 1, 0.1), 51 | n_reps = 6, 52 | sample_idx = seq_len(10) 53 | ) %>% mutate(summary_filename = paste0("GSE96583", 54 | "-n_cells=", n_cells, 55 | "-de_prob=", de_prob, 56 | "-de_loc=", de_loc, 57 | "-n_reps=", n_reps, 58 | "-sample_idx=", sample_idx, 59 | '.rds'), 60 | summary_file = file.path(summary_dir, summary_filename)) %>% 61 | pull(summary_file) 62 | 63 | # read all data 64 | dats = map(input_files, ~ readRDS(.x) %>% 65 | # fix column names 66 | recode_colnames() %>% 67 | # fix p-values 68 | group_by(cell_type) %>% 69 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 70 | ungroup() 71 | ) %>% setNames(basename(input_files)) 72 | 73 | # read all summary files 74 | summary_dats = map(summary_files, readRDS) %>% 75 | setNames(basename(summary_files)) 76 | 77 | # combine all the data and join the two sources of data together 78 | DE = bind_rows(dats, .id = 'filename') %>% 79 | separate(filename, into = c('dataset', 'n_cells', 'de_prob', 'de_loc', 80 | 'n_reps', 'sample_idx', 'de_test', 81 | 'shuffle_replicates'), sep = '-') %>% 82 | mutate_all(~ gsub("^.*=|\\.rds$", "", .)) %>% 83 | type_convert() %>% 84 | # remove some columns 85 | dplyr::select(-test, -runtime, -mem_usage, -baseMean, -lfcSE, -logCPM, 86 | -AveExpr, -B, -used_voom) 87 | summary = bind_rows(summary_dats, .id = 'filename') %>% 88 | separate(filename, into = c('dataset', 'n_cells', 'de_prob', 'de_loc', 89 | 'n_reps', 'sample_idx'), sep = '-') %>% 90 | mutate_all(~ gsub("^.*=|\\.rds$", "", .)) %>% 91 | type_convert() 92 | dat = left_join(DE, summary, by = c('dataset', 'n_cells', 'de_prob', 'de_loc', 93 | 'n_reps', 'sample_idx', 'gene')) 94 | 95 | # save the complete dataset 96 | saveRDS(dat, file.path(base_dir, "analysis/simulations/null/expr_summary.rds")) 97 | 98 | # now, calculate number of DE genes per bin 99 | bins = 10 100 | dat0 = dat %>% 101 | mutate(delta_variance = 
shuffled_variance - pseudobulk_variance, 102 | abs_delta_variance = abs(delta_variance)) 103 | bin_results = dat0 %>% 104 | # bin genes by absolute delta-variance 105 | group_by(dataset, n_cells, de_prob, de_loc, n_reps, sample_idx, de_test, 106 | shuffle_replicates) %>% 107 | arrange(abs_delta_variance) %>% 108 | mutate(bin = cut(row_number() / n(), 109 | breaks = seq(0, bins) / bins), 110 | bin = as.integer(bin)) %>% 111 | ungroup() %>% 112 | # count DE genes in each bin 113 | group_by(dataset, n_cells, de_prob, de_loc, n_reps, sample_idx, de_test, 114 | shuffle_replicates, bin) %>% 115 | summarise(genes = sum(p_val_adj < 0.05, na.rm = T)) %>% 116 | ungroup() 117 | 118 | # save results 119 | output_file = "data/analysis/simulations/null-genes-per-bin.rds" 120 | output_dir = dirname(output_file) 121 | if (!dir.exists(output_dir)) 122 | dir.create(output_dir, recursive = T) 123 | saveRDS(bin_results, output_file) 124 | -------------------------------------------------------------------------------- /R/analysis/simulations/summarise-null-n-DE-genes.R: -------------------------------------------------------------------------------- 1 | setwd("~/git/DE-analysis") 2 | options(stringsAsFactors = F) 3 | library(tidyverse) 4 | library(magrittr) 5 | 6 | # set up input directory 7 | source("R/functions/detect_system.R") 8 | 9 | # first, summarise the effect of n_reps, at n_cells == 500 10 | input_dir = file.path(base_dir, "analysis", "simulations", "null", "DE") 11 | input_files = tidyr::crossing( 12 | de_test = c( 13 | ## single-cell methods, implemented in Seurat 14 | "wilcox", "bimod", "t", "negbinom", "poisson", "LR", "MAST", 15 | # pseudobulk methods 16 | "pseudobulk_DESeq2,test?LRT", "pseudobulk_DESeq2,test?Wald", 17 | "pseudobulk_limma,mode?voom", "pseudobulk_limma,mode?trend", 18 | "pseudobulk_edgeR,test?QLF", 19 | "pseudobulk_edgeR,test?LRT", 20 | # mixed model, implemented in Seurat 21 | "mixed_lm" 22 | ), 23 | n_cells = c(100, 200, 500, 1000, 2000), 24 | de_prob = 0.5, 25 | de_loc = seq(0, 1, 0.1), 26 | n_reps = 2 * c(3, 4, 5, 10, 20), 27 | sample_idx = seq_len(10), 28 | shuffle_replicates = c("NO", "YES") 29 | ) %>% 30 | filter(grepl("pseudo|mixed", de_test) | shuffle_replicates == 'NO') %>% 31 | filter((n_cells == 500 & n_reps %in% c(2 * c(3, 4, 5, 10, 20))) | 32 | (n_reps == 6 & n_cells %in% c(100, 200, 500, 1000, 2000))) %>% 33 | mutate(input_filename = paste0("GSE96583", 34 | "-n_cells=", n_cells, 35 | "-de_prob=", de_prob, 36 | "-de_loc=", de_loc, 37 | "-n_reps=", n_reps, 38 | "-sample_idx=", sample_idx, 39 | '-de_test=', de_test, 40 | '-shuffle_replicates=', shuffle_replicates, 41 | '.rds'), 42 | input_file = file.path(input_dir, input_filename)) %>% 43 | pull(input_file) 44 | 45 | # read all input files 46 | dats = map(input_files, readRDS) %>% 47 | setNames(basename(input_files)) 48 | 49 | # calculate # of DE genes 50 | n_DE = dats %>% 51 | map(~ { 52 | DE = bind_rows(., .id = 'comparison') 53 | # fix column names 54 | colnames(DE) %<>% 55 | fct_recode('p_val' = 'p.value', ## DESeq2 56 | 'p_val' = 'pvalue', ## DESeq2 57 | 'p_val' = 'p.value', ## t/wilcox 58 | 'p_val' = 'P.Value', ## limma 59 | 'p_val' = 'PValue' , ## edgeR 60 | 'p_val_adj' = 'padj', ## DESeq2/t/wilcox 61 | 'p_val_adj' = 'adj.P.Val', ## limma 62 | 'p_val_adj' = 'FDR', ## edgeR 63 | 'avg_logFC' = 'log2FoldChange', ## DESeq2 64 | 'avg_logFC' = 'logFC', ## limma/edgeR 65 | 'test_statistic' = 'stat', ## DESeq2 66 | 'test_statistic' = 'F', ## edgeR 67 | 'test_statistic' = 't', ## limma 68 | 'test_statistic' = 'LR', ## edgeR LRT 69 | 
'test_statistic' = 'statistic' ## t 70 | ) %>% 71 | as.character() 72 | # re-calculate adjusted p-values using BH correction (Seurat does Bonferroni) 73 | DE %<>% 74 | group_by(comparison, cell_type) %>% 75 | mutate(p_val_adj = p.adjust(p_val, method = 'BH')) %>% 76 | ungroup() 77 | # combine results 78 | DE %>% 79 | group_by(comparison, test, cell_type) %>% 80 | summarise(n_1 = sum(p_val_adj < 0.01, na.rm = T), 81 | n_5 = sum(p_val_adj < 0.05, na.rm = T), 82 | n_10 = sum(p_val_adj < 0.1, na.rm = T)) %>% 83 | ungroup() %>% 84 | gather('fdr', 'n_genes', n_1:n_10) %>% 85 | mutate(fdr = paste0(gsub("^.*_", "", fdr), "%")) 86 | }) %>% 87 | bind_rows(.id = 'filename') %>% 88 | separate(filename, into = c( 89 | 'dataset', 90 | 'n_cells', 91 | 'de_prob', 92 | 'de_loc', 93 | 'n_reps', 94 | 'sample_idx', 95 | 'de_test', 96 | 'shuffle_replicates'), 97 | sep = '-') %>% 98 | mutate_at(vars(n_cells, de_prob, de_loc, n_reps, sample_idx, 99 | de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) 100 | 101 | # save results 102 | output_file = "data/analysis/simulations/null-n-DE-genes.rds" 103 | output_dir = dirname(output_file) 104 | if (!dir.exists(output_dir)) 105 | dir.create(output_dir, recursive = T) 106 | saveRDS(n_DE, output_file) 107 | -------------------------------------------------------------------------------- /R/analysis/time_RAM/summarise-time-RAM-downsample_cells.R: -------------------------------------------------------------------------------- 1 | # Summarize the time and RAM usage of the default DE analyses in datasets 2 | # downsampled to a fixed number of cells (used to test mixed models). 3 | setwd("~/git/DE-analysis") 4 | options(stringsAsFactors = F) 5 | library(tidyverse) 6 | library(magrittr) 7 | args = list(); source('R/functions/detect_system.R') 8 | 9 | # set up input directory 10 | input_dir = file.path(base_dir, "analysis/downsample_cells/DE") 11 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$') %>% 12 | extract(grepl("Angelidis|Hagai|CanoGamez|Reyfman", .)) 13 | 14 | # extract walltime and RAM from all input files 15 | dats = map(input_files, ~ { 16 | print(.) 17 | dat = readRDS(.) 
%>%
18 |     # remove empty data frames
19 |     extract(map_int(., nrow) > 0)
20 |   map(dat, ~ distinct(., runtime, mem_usage)) %>%
21 |     bind_rows(.id = 'comparison')
22 | }) %>%
23 |   setNames(basename(input_files))
24 | 
25 | # combine all results
26 | res = dats %>%
27 |   bind_rows(.id = 'filename') %>%
28 |   separate(filename, into = c('dataset', 'de_test', 'n_cells', 'sample_idx'),
29 |            sep = '-') %>%
30 |   mutate_at(vars(de_test, n_cells, sample_idx),
31 |             ~ gsub("^.*=|\\.rds$", "", .)) %>%
32 |   type_convert()
33 | 
34 | # print summary
35 | res %>%
36 |   group_by(de_test) %>%
37 |   summarise(mean_ram = mean(mem_usage / 1e3),
38 |             mean_time = mean(runtime / 60)) %>%
39 |   arrange(desc(mean_time))
40 | 
41 | # save results
42 | output_file = "data/analysis/downsample_cells/time_RAM.rds"
43 | output_dir = dirname(output_file)
44 | if (!dir.exists(output_dir))
45 |   dir.create(output_dir, recursive = T)
46 | saveRDS(res, output_file)
47 | 
--------------------------------------------------------------------------------
/R/analysis/time_RAM/summarise-time-RAM.R:
--------------------------------------------------------------------------------
1 | setwd("~/git/DE-analysis")
2 | options(stringsAsFactors = F)
3 | library(tidyverse)
4 | library(magrittr)
5 | args = list(); source("R/functions/detect_system.R")
6 | 
7 | # set up input
8 | input_dir = file.path(base_dir, "analysis/run_DE")
9 | input_files = list.files(input_dir, full.names = T, pattern ='*\\.rds$')
10 | 
11 | # extract walltime and RAM from all input files
12 | dats = map(input_files, ~ {
13 |   dat = readRDS(.)
14 |   map(dat, ~ distinct(., runtime, mem_usage)) %>%
15 |     bind_rows(.id = 'comparison')
16 | }) %>%
17 |   setNames(basename(input_files))
18 | 
19 | # combine all results
20 | res = dats %>%
21 |   bind_rows(.id = 'filename') %>%
22 |   separate(filename, into = c('dataset', 'de_test', 'shuffle_replicates'),
23 |            sep = '-') %>%
24 |   mutate_at(vars(de_test, shuffle_replicates), ~ gsub("^.*=|\\.rds$", "", .)) %>%
25 |   type_convert()
26 | 
27 | # print summary
28 | res %>%
29 |   group_by(de_test) %>%
30 |   summarise(mean_ram = mean(mem_usage / 1e3),
31 |             mean_time = mean(runtime / 60)) %>%
32 |   arrange(desc(mean_time))
33 | 
34 | # save results
35 | output_file = "data/analysis/run_DE/time_RAM.rds"
36 | output_dir = dirname(output_file)
37 | if (!dir.exists(output_dir))
38 |   dir.create(output_dir, recursive = T)
39 | saveRDS(res, output_file)
40 | 
--------------------------------------------------------------------------------
/R/functions/calculate_overlap.R:
--------------------------------------------------------------------------------
1 | ## function to score the overlap between bulk and single-cell DE
2 | calculate_overlap = function(bulk_de, sc_de,
3 |                              method = c('fcc', 'aucc'),
4 |                              k = NULL,
5 |                              cor_method = c('pearson', 'spearman')) {
6 |   method = match.arg(method)
7 |   if (method == 'fcc' & is.null(cor_method)) {
8 |     stop("if using method='fcc', you must set cor_method (pearson/spearman)")
9 |   }
10 |   if (method == 'aucc' & is.null(k)) {
11 |     stop("If using method='aucc', you must set k")
12 |   }
13 |   cor_method = match.arg(cor_method)
14 | 
15 |   # double check column names
16 |   colnames(bulk_de) %<>%
17 |     fct_recode('p_val' = 'p.value', ## DESeq2
18 |                'p_val' = 'pvalue', ## DESeq2
19 |                'p_val' = 'p.value', ## t/wilcox
20 |                'p_val' = 'P.Value', ## limma
21 |                'p_val' = 'PValue', ## edgeR
22 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
23 |                'p_val_adj' = 'adj.P.Val', ## limma
24 |                'p_val_adj' = 'FDR', ## edgeR
25 |                'avg_logFC' = 'log2FoldChange', ## 
DESeq2
26 |                'avg_logFC' = 'logFC', ## limma/edgeR
27 |                'test_statistic' = 'stat', ## DESeq2
28 |                'test_statistic' = 'F', ## edgeR
29 |                'test_statistic' = 't', ## limma
30 |                'test_statistic' = 'LR', ## edgeR LRT
31 |                'test_statistic' = 'statistic' ## t
32 |     ) %>%
33 |     as.character()
34 |   colnames(sc_de) %<>%
35 |     fct_recode('p_val' = 'p.value', ## DESeq2
36 |                'p_val' = 'pvalue', ## DESeq2
37 |                'p_val' = 'p.value', ## t/wilcox
38 |                'p_val' = 'P.Value', ## limma
39 |                'p_val' = 'PValue', ## edgeR
40 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
41 |                'p_val_adj' = 'adj.P.Val', ## limma
42 |                'p_val_adj' = 'FDR', ## edgeR
43 |                'avg_logFC' = 'log2FoldChange', ## DESeq2
44 |                'avg_logFC' = 'logFC', ## limma/edgeR
45 |                'test_statistic' = 'stat', ## DESeq2
46 |                'test_statistic' = 'F', ## edgeR
47 |                'test_statistic' = 'LR', ## edgeR LRT
48 |                'test_statistic' = 't', ## limma
49 |                'test_statistic' = 'statistic' ## t
50 |     ) %>%
51 |     as.character()
52 | 
53 |   # remove NAs
54 |   sc_de %<>% filter(!is.na(p_val), !is.na(p_val_adj), !is.na(test_statistic))
55 |   bulk_de %<>% filter(!is.na(p_val), !is.na(p_val_adj), !is.na(test_statistic))
56 | 
57 |   # replace p=0 with minimum p-value
58 |   sc_de_min = min(sc_de$p_val_adj[sc_de$p_val_adj > 0])
59 |   bulk_de_min = min(bulk_de$p_val_adj[bulk_de$p_val_adj > 0])
60 |   sc_de %<>%
61 |     mutate(p_val_adj = ifelse(p_val_adj <= sc_de_min, sc_de_min, p_val_adj))
62 |   bulk_de %<>%
63 |     mutate(p_val_adj = ifelse(p_val_adj <= bulk_de_min, bulk_de_min, p_val_adj))
64 |   ## repeat for raw p-values
65 |   sc_de_min = min(sc_de$p_val[sc_de$p_val > 0])
66 |   bulk_de_min = min(bulk_de$p_val[bulk_de$p_val > 0])
67 |   sc_de %<>%
68 |     mutate(p_val = ifelse(p_val <= sc_de_min, sc_de_min, p_val))
69 |   bulk_de %<>%
70 |     mutate(p_val = ifelse(p_val <= bulk_de_min, bulk_de_min, p_val))
71 | 
72 |   # filter to genes detected in both single-cell and bulk data
73 |   genes = intersect(bulk_de$gene, sc_de$gene)
74 |   sc_de %<>% filter(gene %in% genes) %>% arrange(gene)
75 |   bulk_de %<>% filter(gene %in% genes) %>% arrange(gene)
76 | 
77 |   if (method == 'fcc') {
78 |     # fold-change correlation: correlate the signed test statistics
79 |     # (the sign of the log-fold change times the absolute value of the
80 |     # test statistic); the two tables were already restricted to their
81 |     # shared genes and sorted by gene above, so the rows are aligned
82 |     return(cor(
83 |       bulk_de %>%
84 |         mutate(stat = sign(avg_logFC) * abs(test_statistic)) %>%
85 |         arrange(gene) %>%
86 |         pull(stat),
87 |       sc_de %>%
88 |         mutate(stat = sign(avg_logFC) * abs(test_statistic)) %>%
89 |         arrange(gene) %>%
90 |         pull(stat),
91 |       method = cor_method
92 |     ))
93 |   } else if (method == 'aucc') {
94 |     # area under the concordance curve
95 |     k = as.integer(k)
96 |     ## rank in descending order first by p_val
97 |     ## break ties by the abs() of the test_statistic
98 |     vec1 = bulk_de %>%
99 |       arrange(p_val, desc(abs(test_statistic))) %>%
100 |       pull(gene) %>%
101 |       head(k)
102 |     vec2 = sc_de %>%
103 |       arrange(p_val, desc(abs(test_statistic))) %>%
104 |       pull(gene) %>%
105 |       head(k)
106 | 
107 |     concordance_curve = map_dbl(seq_len(k), ~ {
108 |       v1 = vec1[seq_len(.)]
109 |       v2 = vec2[seq_len(.)]
110 |       length(intersect(v1, v2))
111 |     })
112 |     denom = k * (k + 1) / 2
113 |     aucc = sum(concordance_curve) / denom
114 |     return(aucc)
115 |   }
116 | }
117 | 
--------------------------------------------------------------------------------
/R/functions/datasets.R:
--------------------------------------------------------------------------------
1 | datasets = c(
2 |   "Angelidis2019",
3 |   "Arneson2018",
4 |   "Avey2018",
5 |   "Aztekin2019",
6 |   "Bhattacherjee2019",
7 |   "Brenner2020",
8 | 
"CanoGamez2020", 9 | "Cheng2019", 10 | "Co2020", 11 | "Crowell2019", 12 | "Davie2018", 13 | "Denisenko2020", 14 | "Der2019_kidney", 15 | "Der2019_skin", 16 | "Goldfarbmuren2020", 17 | "Grubman2019", 18 | "Gunner2019", 19 | "Haber2017_droplet", 20 | "Hagai2018_mouse", 21 | "Hagai2018_rat", 22 | "Hagai2018_pig", 23 | "Hagai2018_rabbit", 24 | "Hashimoto2019", 25 | "Hrvatin2018", 26 | "Hu2017", 27 | "Huang2020", 28 | "Jaitin2018_HFD", 29 | "Jakel2019", 30 | "Kotliarov2020", 31 | "Kang2018", 32 | "Kim2019", 33 | "Madissoon2020", 34 | "Mathys2019", 35 | "Nagy2020", 36 | "OrdovasMontanes2018", 37 | "Rault2020", 38 | "Reyes2020", 39 | "Reyfman2020", 40 | "Rossi2019", 41 | "Sathyamurthy2018", 42 | "Schirmer2019", 43 | "Schafflick2020_CSF", 44 | "Schafflick2020_PBMCs", 45 | "Skinnider2020", 46 | "Tran2019", 47 | "Wagner2018", 48 | "Wang2020", 49 | "Wilk2020", 50 | "Wirka2019", 51 | "Wu2017", 52 | "Ximerakis2019" 53 | ) 54 | 55 | bulk_datasets = c( 56 | "bulk_rnaseq/Angelidis2019_facsepi", 57 | "bulk_rnaseq/Angelidis2019_facsmac", 58 | "bulk_rnaseq/CanoGamez2020", 59 | "bulk_rnaseq/Hagai2018_mouse", 60 | "bulk_rnaseq/Hagai2018_rat", 61 | "bulk_rnaseq/Hagai2018_pig", 62 | "bulk_rnaseq/Hagai2018_rabbit", 63 | "proteomics/CanoGamez2020:proteomics" 64 | ) 65 | -------------------------------------------------------------------------------- /R/functions/get_bulk_comparisons.R: -------------------------------------------------------------------------------- 1 | get_bulk_comparisons = function(dataset, expr, meta) { 2 | # set up container 3 | comparisons = list() 4 | # handle each dataset appropriately 5 | if (dataset == 'Angelidis2019') { 6 | meta %<>% mutate(label = factor(label, levels = c("3m", "24m"))) 7 | results[[1]] = list(expr = expr, meta = meta) 8 | } else if (dataset == 'Hagai2018') { 9 | ## Hagai2018: two different binary comparisons 10 | for (comparison in c('LPS4', 'PIC4')) { 11 | message(' processing comparison: ', comparison, ' ...') 12 | meta0 = meta %>% 13 | rownames_to_column(var = 'sample') %>% 14 | filter(label %in% c('UNST', comparison)) %>% 15 | mutate(label = factor(label, levels = c("UNST", comparison))) 16 | expr0 = expr[, meta0$sample] 17 | results[[comparison]] = list(expr = expr0, meta = meta0) 18 | } 19 | } else if (dataset == 'CanoGamez2020') { 20 | ## CanoGamez2020: Two different cell types, 7 different cytokine conditions 21 | cytokines = c("IFNB", "Th17", "Resting", "Th2", "Th0", "iTreg", "Th1") 22 | cell_types = c('Naive', 'Memory') 23 | stimulation_times = c("16h", "5d") 24 | grid = tidyr::crossing(cytokine1 = cytokines, 25 | cytokine2 = cytokines, 26 | cell_type = cell_types, 27 | stimulation_time = stimulation_times) %>% 28 | filter(cytokine1 != cytokine2) %>% 29 | filter(cytokine1 == 'Resting') 30 | for (grid_idx in seq_len(nrow(grid))) { 31 | cytokine1 = grid$cytokine1[grid_idx] 32 | cytokine2 = grid$cytokine2[grid_idx] 33 | cell_type = grid$cell_type[grid_idx] 34 | stimulation_time = grid$stimulation_time[grid_idx] 35 | key = paste0(cytokine1, '|', cytokine2, "|", cell_type, "|", 36 | stimulation_time) 37 | message(' processing comparison: ', key, ' ...') 38 | 39 | meta0 = meta %>% 40 | mutate(idx = row_number()) %>% 41 | filter(stimulation_time == !!stimulation_time) %>% 42 | filter(grepl(!!cell_type, cell_type)) %>% 43 | filter(cytokine_condition %in% c(cytokine1, cytokine2)) %>% 44 | mutate(cytokine_condition = factor(cytokine_condition, 45 | levels = c(cytokine1, cytokine2)), 46 | label = cytokine_condition) 47 | expr0 = expr %>% extract(, meta0$idx) 48 | 
results[[key]] = list(expr = expr0, meta = meta0) 49 | } 50 | } else if (dataset == 'CanoGamez2020:proteomics') { 51 | ## CanoGamez2020: Two different cell types, 7 different cytokine conditions 52 | cytokines = c("IFNB", "Th17", "Resting", "Th2", "Th0", "iTreg", "Th1") 53 | cell_types = c('Naive', 'Memory') 54 | grid = tidyr::crossing(cytokine1 = cytokines, 55 | cytokine2 = cytokines, 56 | cell_type = cell_types) %>% 57 | filter(cytokine1 != cytokine2) %>% 58 | filter(cytokine1 == 'Resting') 59 | for (grid_idx in seq_len(nrow(grid))) { 60 | cell_type = grid$cell_type[grid_idx] 61 | cytokine1 = grid$cytokine1[grid_idx] 62 | cytokine2 = grid$cytokine2[grid_idx] 63 | key = paste0(cytokine1, '|', cytokine2, "|", cell_type) 64 | message(' processing comparison: ', key, ' ...') 65 | 66 | meta0 = meta %>% 67 | mutate(idx = row_number()) %>% 68 | filter(cell_type == !!cell_type) %>% 69 | filter(cytokine_condition %in% c(cytokine1, cytokine2)) %>% 70 | mutate(cytokine_condition = factor(cytokine_condition, 71 | levels = c(cytokine1, cytokine2)), 72 | label = cytokine_condition) 73 | expr0 = expr %>% extract(, meta0$idx) 74 | results[[key]] = list(expr = expr0, meta = meta0) 75 | } 76 | } else { 77 | stop("invalid dataset: ", dataset, " ...") 78 | } 79 | 80 | # drop all unused factor levels 81 | for (comparison_idx in seq_along(results)) { 82 | results[[comparison_idx]]$meta %<>% droplevels() 83 | } 84 | 85 | return(results) 86 | } 87 | -------------------------------------------------------------------------------- /R/functions/get_comparisons.R: -------------------------------------------------------------------------------- 1 | ## Get subset expression matrices containing all comparisons for a given 2 | ## dataset. 3 | get_comparisons = function(dataset, expr, meta) { 4 | # set up container 5 | results = list() 6 | # handle each dataset appropriately 7 | if (dataset %in% c('Arneson2018', 8 | 'Avey2018', 9 | 'Brenner2020', 10 | 'Cheng2019', 11 | 'Co2020', 12 | 'Crowell2019', 13 | 'Der2019_kidney', 14 | 'Der2019_skin', 15 | 'Grubman2019', 16 | 'Hashimoto2019', 17 | 'Hu2017', 18 | 'Jakel2019', 19 | 'Mathys2019', 20 | 'Nagy2020', 21 | 'OrdovasMontanes2018', 22 | 'Rault2020', 23 | 'Rossi2019', 24 | 'Sathyamurthy2018', 25 | 'Schafflick2020_CSF', 26 | 'Schafflick2020_PBMCs', 27 | 'Skinnider2020', 28 | 'Wang2020' 29 | )) { 30 | results[[1]] = list(expr = expr, meta = meta) 31 | } else if (dataset %in% c('Goldfarbmuren2020', 32 | 'Schirmer2019', 33 | 'Ximerakis2019')) { 34 | ## two cell type levels 35 | results[['cell_type']] = list(expr = expr, meta = meta) 36 | meta0 = meta %>% 37 | dplyr::select(-cell_type) %>% 38 | dplyr::rename(cell_type = global_cell_type) 39 | results[['global_cell_type']] = list(expr = expr, meta = meta0) 40 | } else if (dataset == 'Bhattacherjee2019') { 41 | ## Bhattacherjee2019: two possible levels of cell types, and 42 | ## three different timepoints 43 | timepoints = c('Maintenance', '48h', '15d') 44 | cell_types = c('cell_type', 'global_cell_type') 45 | grid = tidyr::crossing(timepoint = timepoints, cell_type = cell_types) 46 | for (grid_idx in seq_len(nrow(grid))) { 47 | timepoint = grid$timepoint[grid_idx] 48 | cell_type = grid$cell_type[grid_idx] 49 | key = paste0(timepoint, '|', cell_type) 50 | message(' processing comparison: ', key, ' ...') 51 | 52 | meta0 = meta %>% 53 | mutate(idx = row_number()) %>% 54 | filter(grepl(timepoint, label)) 55 | expr0 = expr %>% extract(, meta0$idx) 56 | if (cell_type == "global_cell_type") { 57 | meta0 %<>% 58 | 
dplyr::select(-cell_type) %>% 59 | dplyr::rename(cell_type = global_cell_type) 60 | } 61 | 62 | results[[key]] = list(expr = expr0, meta = meta0) 63 | } 64 | } else if (dataset == 'Huang2020') { 65 | ## Huang2020: two possible levels of cell types, and 66 | ## three different comparisons 67 | conditions = c('CD', 'colitis', 'UC') 68 | cell_types = c('cell_type', 'global_cell_type') 69 | grid = tidyr::crossing(condition = conditions, cell_type = cell_types) 70 | for (grid_idx in seq_len(nrow(grid))) { 71 | condition = grid$condition[grid_idx] 72 | cell_type = grid$cell_type[grid_idx] 73 | key = paste0(condition, '|', cell_type) 74 | message(' processing comparison: ', key, ' ...') 75 | 76 | meta0 = meta %>% 77 | mutate(idx = row_number()) %>% 78 | filter(label %in% c(condition, 'control')) 79 | expr0 = expr %>% extract(, meta0$idx) 80 | if (cell_type == "global_cell_type") { 81 | meta0 %<>% 82 | dplyr::select(-cell_type) %>% 83 | dplyr::rename(cell_type = global_cell_type) 84 | } 85 | 86 | results[[key]] = list(expr = expr0, meta = meta0) 87 | } 88 | } else if (dataset == 'Reyes2020') { 89 | ## Reyes2020: two possible levels of cell types, and 90 | ## three different comparisons 91 | cohorts = c('ICU-SEP vs. ICU-NoSEP', 92 | 'Sepsis vs. control', 93 | 'Sepsis vs. Leuk-UTI') 94 | cell_types = c('cell_type', 'global_cell_type') 95 | grid = tidyr::crossing(cohort = cohorts, cell_type = cell_types) 96 | for (grid_idx in seq_len(nrow(grid))) { 97 | cohort = grid$cohort[grid_idx] 98 | cell_type = grid$cell_type[grid_idx] 99 | key = paste0(cohort, '|', cell_type) 100 | message(' processing comparison: ', key, ' ...') 101 | 102 | if (cohort == 'ICU-SEP vs. ICU-NoSEP') { 103 | meta0 = meta %>% 104 | mutate(idx = row_number()) %>% 105 | filter(label %in% c("ICU-SEP", "ICU-NoSEP")) 106 | } else if (cohort == 'Sepsis vs. control') { 107 | meta0 = meta %>% 108 | mutate(idx = row_number()) %>% 109 | filter(label %in% c("Int-URO", "URO", "Bac-SEP", "ICU-SEP", 110 | "Control")) %>% 111 | mutate(label = ifelse(label == 'Control', label, 'Sepsis')) 112 | } else if (cohort == 'Sepsis vs. Leuk-UTI') { 113 | meta0 = meta %>% 114 | mutate(idx = row_number()) %>% 115 | filter(label %in% c("Int-URO", "URO", "Bac-SEP", "ICU-SEP", 116 | "Leuk-UTI")) %>% 117 | mutate(label = ifelse(label == 'Leuk-UTI', label, 'Sepsis')) 118 | } 119 | expr0 = expr %>% extract(, meta0$idx) 120 | if (cell_type == "global_cell_type") { 121 | meta0 %<>% 122 | dplyr::select(-cell_type) %>% 123 | dplyr::rename(cell_type = global_cell_type) 124 | } 125 | results[[key]] = list(expr = expr0, meta = meta0) 126 | } 127 | } else if (dataset == 'Wu2017') { 128 | ## Wu2017: two different binary comparisons 129 | for (comparison in c('stress', 'seizure')) { 130 | message(' processing comparison: ', comparison, ' ...') 131 | meta0 = meta %>% 132 | mutate(idx = row_number()) %>% 133 | filter(label %in% c('control', comparison)) %>% 134 | droplevels() 135 | expr0 = expr[, meta0$idx] 136 | results[[comparison]] = list(expr = expr0, meta = meta0) 137 | } 138 | } else if (dataset == 'Wagner2018') { 139 | ## Wagner2018: tyrosinase vs. chordin 140 | meta0 = meta %>% 141 | mutate(idx = row_number()) %>% 142 | filter(label != 'WT') 143 | expr0 = expr[, meta0$idx] 144 | results[[1]] = list(expr = expr0, meta = meta0) 145 | } else if (dataset == 'Gunner2019') { 146 | ## Gunner2019: control vs. 
lesion, in entire dataset or by genotype 147 | comparisons = c('entire_dataset', 'homozygous', 'heterozygous') 148 | for (comparison in comparisons) { 149 | message(' processing comparison: ', comparison, ' ...') 150 | if (comparison == 'entire_dataset') { 151 | meta0 = meta 152 | expr0 = expr 153 | } else { 154 | meta0 = meta %>% 155 | mutate(idx = row_number()) %>% 156 | filter(genotype == Hmisc::capitalize(comparison)) 157 | expr0 = expr[, meta0$idx] 158 | } 159 | results[[comparison]] = list(expr = expr0, meta = meta0) 160 | } 161 | } else if (dataset == 'Haber2017_droplet') { 162 | ## Haber2017: three binary comparisons vs. control 163 | comparisons = c('Hpoly.Day3', 'Hpoly.Day10', 'Salmonella') 164 | for (comparison in comparisons) { 165 | message(' processing comparison: ', comparison, ' ...') 166 | meta0 = meta %>% 167 | mutate(idx = row_number()) %>% 168 | filter(label %in% c('Control', comparison)) 169 | expr0 = expr[, meta0$idx] 170 | results[[comparison]] = list(expr = expr0, meta = meta0) 171 | } 172 | } else if (dataset == 'Aztekin2019') { 173 | ## Aztekin2019: amputation response 174 | grps = c("ST40_1", "ST40_0") 175 | meta0 = meta %>% 176 | mutate(idx = row_number()) %>% 177 | filter(label %in% grps) 178 | expr0 = expr %>% 179 | extract(, meta0$idx) 180 | results[[1]] = list(expr = expr0, meta = meta0) 181 | } else if (dataset == 'Kim2019') { 182 | ## Kim2019: aggression 183 | meta0 = meta %>% 184 | mutate(idx = row_number()) %>% 185 | filter(label %in% c('Control', 'Aggression')) 186 | expr0 = expr[, meta0$idx] 187 | results[[1]] = list(expr = expr0, meta = meta0) 188 | } else if (dataset == 'Wirka2019') { 189 | ## Wirka2019: 8w/0w in WT 190 | # subset metadata 191 | meta0 = meta %>% 192 | mutate(idx = row_number()) 193 | ## filter by genotype 194 | meta0 %<>% filter(phenotype == 'wt') 195 | ## filter by timepoint 196 | meta0 %<>% filter(label != '16wk') 197 | # subset expression 198 | expr0 = expr[, meta0$idx] 199 | results[[1]] = list(expr = expr0, meta = meta0) 200 | } else if (dataset == 'Jaitin2018_HFD') { 201 | ## Jaitin2019 (dataset 1): HFD vs. 
NC, 6w 202 | meta0 = meta %>% 203 | mutate(idx = row_number()) %>% 204 | filter(timepoint == 6) 205 | expr0 = expr[, meta0$idx] 206 | results[[1]] = list(expr = expr0, meta = meta0) 207 | } else if (dataset == 'CanoGamez2020') { 208 | ## CanoGamez2020: compare all cytokines to unstimulated 209 | comparisons = unique(meta$label) %>% 210 | setdiff('UNS') 211 | for (comparison in comparisons) { 212 | message(' processing comparison: ', comparison, ' ...') 213 | meta0 = meta %>% 214 | mutate(idx = row_number()) %>% 215 | filter(label %in% c('UNS', comparison)) %>% 216 | mutate(label = factor(label, levels = c('UNS', comparison))) 217 | expr0 = expr[, meta0$idx] 218 | results[[comparison]] = list(expr = expr0, meta = meta0) 219 | } 220 | } else if (dataset == 'Davie2018') { 221 | ## Davie2018: all combinations of age/sex/genotype 222 | ages = unique(meta$label) 223 | 224 | ### age 225 | comparisons = tidyr::crossing(age1 = ages, age2 = ages) %>% 226 | filter(age1 < age2) 227 | for (grid_idx in seq_len(nrow(comparisons))) { 228 | age1 = comparisons$age1[grid_idx] 229 | age2 = comparisons$age2[grid_idx] 230 | comparison = paste0(age1, '|', age2) 231 | message(' processing comparison: ', comparison, ' ...') 232 | meta0 = meta %>% 233 | mutate(idx = row_number()) %>% 234 | filter(label %in% c(age1, age2)) 235 | expr0 = expr[, meta0$idx] 236 | results[[comparison]] = list(expr = expr0, meta = meta0) 237 | } 238 | 239 | ### sex 240 | message(' processing comparison: sex ...') 241 | meta0 = meta %>% 242 | dplyr::select(-label) %>% 243 | dplyr::rename(label = gender) 244 | results[['sex']] = list(expr = expr, meta = meta0) 245 | 246 | ### genotype 247 | message(' processing comparison: genotype ...') 248 | meta0 = meta %>% 249 | dplyr::select(-label) %>% 250 | dplyr::rename(label = genotype) 251 | results[['genotype']] = list(expr = expr, meta = meta0) 252 | } else if (dataset == 'Wagner2018') { 253 | ## Wagner2018: tyrosinase vs. chordin 254 | meta0 = meta %>% 255 | mutate(idx = row_number()) %>% 256 | filter(label != 'WT') 257 | expr0 = expr[, meta0$idx] 258 | results[[1]] = list(expr = expr0, meta = meta0) 259 | } else if (dataset == 'Hrvatin2018') { 260 | ## Hrvatin2018: focus on 0h vs. 
4h 261 | meta0 = meta %>% 262 | mutate(idx = row_number()) %>% 263 | filter(label != '1h') 264 | expr0 = expr[, meta0$idx] 265 | results[[1]] = list(expr = expr0, meta = meta0) 266 | } else if (dataset == 'Madissoon2020') { 267 | ## Madissoon2020: compare all timepoints to 0 h 268 | comparisons = unique(meta$label) %>% 269 | setdiff('0h') 270 | for (comparison in comparisons) { 271 | message(' processing comparison: ', comparison, ' ...') 272 | meta0 = meta %>% 273 | mutate(idx = row_number()) %>% 274 | filter(label %in% c('0h', comparison)) %>% 275 | mutate(label = factor(label, levels = c('0h', comparison))) 276 | expr0 = expr[, meta0$idx] 277 | results[[comparison]] = list(expr = expr0, meta = meta0) 278 | } 279 | } else if (dataset == 'Tran2019') { 280 | ## Tran2019: compare all timepoints to control 281 | comparisons = unique(meta$label) %>% setdiff('Ctrl') 282 | for (comparison in comparisons) { 283 | message(' processing comparison: ', comparison, ' ...') 284 | meta0 = meta %>% 285 | mutate(idx = row_number()) %>% 286 | filter(label %in% c('Ctrl', comparison)) 287 | expr0 = expr[, meta0$idx] 288 | results[[comparison]] = list(expr = expr0, meta = meta0) 289 | } 290 | } else if (dataset == 'Cuomo2020') { 291 | ## Cuomo2020: compare all timepoints to day 0 292 | comparisons = unique(meta$label) %>% setdiff('day0') 293 | for (comparison in comparisons) { 294 | message(' processing comparison: ', comparison, ' ...') 295 | meta0 = meta %>% 296 | mutate(idx = row_number()) %>% 297 | filter(label %in% c('day0', comparison)) %>% 298 | mutate(label = factor(label, levels = c('day0', comparison))) 299 | expr0 = expr[, meta0$idx] 300 | results[[comparison]] = list(expr = expr0, meta = meta0) 301 | } 302 | } else if (dataset %in% c( 303 | "Hagai2018_mouse", 304 | "Hagai2018_rat", 305 | "Hagai2018_pig", 306 | "Hagai2018_rabbit" 307 | )) { 308 | ## Hagai2018: compare all timepoints to unstimulated 309 | comparisons = unique(meta$label) %>% setdiff('unst') 310 | for (comparison in comparisons) { 311 | message(' processing comparison: ', comparison, ' ...') 312 | meta0 = meta %>% 313 | mutate(idx = row_number()) %>% 314 | filter(label %in% c('unst', comparison)) %>% 315 | mutate(label = factor(label, levels = c('unst', comparison))) 316 | expr0 = expr[, meta0$idx] 317 | results[[comparison]] = list(expr = expr0, meta = meta0) 318 | } 319 | } else if (dataset == 'Wilk2020') { 320 | ## Wilk2020: Compare each vent status to control, all COVID to control 321 | # and vent to no vent covid 322 | ## Comparison 1: all COVID to healthy control 323 | ## Comparison 2: ventilated COVID to healthy control 324 | ## Comparison 3: non-ventilated COVID to healthy control 325 | ## Comparison 4: non-ventilated COVID to ventilated covid 326 | comparisons = c("Healthy_COVID", "Healthy_vCOVID", "Healthy_nvCOVID", 327 | "nvCOVID_vCOVID") 328 | for (comparison in comparisons) { 329 | message(' processing comparison: ', comparison, ' ...') 330 | if (comparison == "Healthy_COVID") { 331 | meta0 = meta %>% 332 | mutate(idx = row_number()) %>% 333 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 334 | expr0 = expr[, meta0$idx] 335 | results[[comparison]] = list(expr = expr0, meta = meta0) 336 | } 337 | if (comparison == "Healthy_vCOVID") { 338 | meta0 = meta %>% 339 | mutate(idx = row_number()) %>% 340 | filter(Ventilated %in% c("Healthy", "Vent")) %>% 341 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 342 | expr0 = expr[, meta0$idx] 343 | results[[comparison]] = list(expr = expr0, meta = 
meta0) 344 | } 345 | if (comparison == "Healthy_nvCOVID") { 346 | meta0 = meta %>% 347 | mutate(idx = row_number()) %>% 348 | filter(Ventilated %in% c("Healthy", "NonVent")) %>% 349 | mutate(label = factor(label, levels = c('Healthy', 'COVID'))) 350 | expr0 = expr[, meta0$idx] 351 | results[[comparison]] = list(expr = expr0, meta = meta0) 352 | } 353 | if (comparison == "nvCOVID_vCOVID") { 354 | meta0 = meta %>% 355 | mutate(idx = row_number()) %>% 356 | filter(label == 'COVID') %>% 357 | mutate(label = Ventilated) %>% 358 | mutate(label = factor(label, levels = c('NonVent', 'Vent'))) 359 | expr0 = expr[, meta0$idx] 360 | results[[comparison]] = list(expr = expr0, meta = meta0) 361 | } 362 | } 363 | } else if (dataset %in% c('Kotliarov2020')) { 364 | ## explicitly specify factor levels 365 | meta %<>% mutate(label = factor(label, levels = c('low', 'high'))) 366 | results[[1]] = list(expr = expr, meta = meta) 367 | } else if (dataset == 'Kang2018') { 368 | ## explicitly specify factor levels 369 | meta %<>% mutate(label = factor(label, levels = c('ctrl', 'stim'))) 370 | results[[1]] = list(expr = expr, meta = meta) 371 | } else if (dataset == 'Angelidis2019') { 372 | ## explicitly specify factor levels 373 | meta %<>% mutate(label = factor(label, levels = c('3m', '24m'))) 374 | results[[1]] = list(expr = expr, meta = meta) 375 | } else if (dataset == 'Reyfman2020') { 376 | ## explicitly specify factor levels 377 | meta %<>% mutate(label = factor(label, levels = c('Control', 378 | 'Pulmonary fibrosis'))) 379 | results[[1]] = list(expr = expr, meta = meta) 380 | } else if (dataset == 'Denisenko2020') { 381 | #' warm vs. cold (fresh): for comparison to bulk 382 | #' methanol vs. fresh, cryopreserved vs. fresh in warm/cold 383 | comparisons = c("warm_vs_cold", 384 | "methanol_warm", 385 | "methanol_cold", 386 | "cryopreserved_warm", 387 | "cryopreserved_cold") 388 | for (comparison in comparisons) { 389 | meta0 = meta %>% 390 | mutate(idx = row_number()) %>% 391 | # drop single-nucleus, v2/v3 comparison 392 | filter(!grepl("^SN|^SC", label)) 393 | if (grepl("methanol", comparison)) { 394 | temperature = gsub("^.*_", "", comparison) 395 | meta0 %<>% 396 | filter(is.na(label2) | label2 == 'MeOH') %>% 397 | replace_na(list(label2 = 'fresh')) %>% 398 | filter(label == temperature) %>% 399 | mutate(label = factor(label2, levels = c('fresh', 'MeOH'))) 400 | } else if (grepl("cryo", comparison)) { 401 | temperature = gsub("^.*_", "", comparison) 402 | meta0 %<>% 403 | filter(is.na(label2) | label2 == 'Cryo') %>% 404 | replace_na(list(label2 = 'fresh')) %>% 405 | filter(label == temperature) %>% 406 | mutate(label = factor(label2, levels = c('fresh', 'Cryo'))) 407 | } else { 408 | # warm v. 
cold
409 |         meta0 %<>%
410 |           filter(is.na(label2)) %>%
411 |           # set factor levels
412 |           mutate(label = factor(label, levels = c('cold', 'warm')))
413 |       }
414 |       expr0 = expr[, meta0$idx]
415 |       results[[comparison]] = list(expr = expr0, meta = meta0)
416 |     }
417 |     ##
418 |   } else if (dataset == "Hagai2018_plate") {
419 |     ## explicitly specify factor levels
420 |     meta %<>% mutate(label = factor(label, levels = c(2, 6)))
421 |     results[[1]] = list(expr = expr, meta = meta)
422 |   } else if (dataset == "Maniatis2019_mouse") {
423 |     meta0 = meta %>%
424 |       mutate(idx = row_number()) %>%
425 |       # rename replicate, label, and region for compatibility
426 |       dplyr::rename(replicate = isolate, label = breed, cell_type = region)
427 |     results[[1]] = list(expr = expr, meta = meta0)
428 |   } else {
429 |     stop("invalid dataset: ", dataset, " ...")
430 |   }
431 | 
432 |   # drop all unused factor levels
433 |   for (comparison_idx in seq_along(results)) {
434 |     results[[comparison_idx]]$meta %<>% droplevels()
435 |   }
436 | 
437 |   return(results)
438 | }
439 | 
--------------------------------------------------------------------------------
/R/functions/recode_colnames.R:
--------------------------------------------------------------------------------
1 | recode_colnames = function(DE) {
2 |   colnames(DE) %<>%
3 |     fct_recode('p_val' = 'p.value', ## DESeq2
4 |                'p_val' = 'pvalue', ## DESeq2
5 |                'p_val' = 'p.value', ## t/wilcox
6 |                'p_val' = 'P.Value', ## limma
7 |                'p_val' = 'PValue', ## edgeR
8 |                'p_val_adj' = 'padj', ## DESeq2/t/wilcox
9 |                'p_val_adj' = 'adj.P.Val', ## limma
10 |                'p_val_adj' = 'FDR', ## edgeR
11 |                'avg_logFC' = 'log2FoldChange', ## DESeq2
12 |                'avg_logFC' = 'logFC', ## limma/edgeR
13 |                'test_statistic' = 'stat', ## DESeq2
14 |                'test_statistic' = 'F', ## edgeR
15 |                'test_statistic' = 't', ## limma
16 |                'test_statistic' = 'LR', ## edgeR LRT
17 |                'test_statistic' = 'statistic' ## t
18 |     ) %>%
19 |     as.character()
20 |   return(DE)
21 | }
22 | 
--------------------------------------------------------------------------------
/R/functions/spatial_datasets.R:
--------------------------------------------------------------------------------
1 | datasets = c(
2 |   "Maniatis2019_mouse"
3 | )
4 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DE-analysis
2 | 
3 | This repository contains R source code used to conduct the analysis in our manuscript, "Confronting false discoveries in single-cell differential expression."
4 | 
5 | A brief overview of the main computational analyses that were conducted, and the location of the corresponding source code, is given below.
6 | 
7 | - First, differentially expressed genes were identified in single-cell and matching bulk datasets, respectively, using code in the directories `R/analysis/run_DE` and `R/analysis/run_bulk_DE`. Code in `R/analysis/run_spike_in_DE` was used to analyze a lone single-cell dataset in which the ERCC mixture of synthetic mRNAs was spiked in alongside each individual cell.
8 | - A list of all single-cell and bulk datasets analyzed in this study is provided in `R/functions/datasets.R`. Datasets containing multiple comparisons of two experimental groups were split into each possible comparison using the functions in `R/functions/get_comparisons.R` and `R/functions/get_bulk_comparisons.R` (see the usage sketch at the end of this document).
9 | - Code used to run differential expression analyses is provided in `R/functions/run_DE.R`.
10 | - The concordance between the single-cell and bulk DE results was then quantified using code in the `R/analysis/bulk_concordance` directory.
11 | - Code used to calculate the area under the concordance curve (AUCC) and the fold-change correlation is provided in `R/functions/calculate_overlap.R` (a toy illustration of the AUCC calculation is given at the end of this document).
12 | - Gene set enrichment analysis of differential expression results was performed for both the single-cell and bulk datasets using code in the `R/analysis/run_GSEA` directory.
13 | - False-positive and false-negative DE calls in the single-cell data were identified, using the bulk data as a reference, with code in the `R/analysis/extract_FPs` directory.
14 | - A number of summary statistics were obtained for each dataset (e.g., number of replicates, number of cell types) or each gene within each dataset (e.g., mean expression, delta-variance), using code in the directories `R/analysis/confounds`, `R/analysis/delta_variance`, and `R/analysis/expr_summary`.
15 | - The relationships between mean expression, the variance of gene expression, and the delta-variance in 'pseudo-replicates' were interrogated using code in `R/analysis/mean_variance`.
16 | - The effect of between-replicate variance was interrogated with simulation studies using the code in `R/analysis/simulations`. The code in this directory was used to generate synthetic gene expression data, perform DE analysis, and analyze the properties of DE genes.
17 | - DE analysis was performed between random groups of control samples using code in the `R/analysis/control_only` directory. This code was also used to analyze a spatial transcriptomics dataset.
18 | - The performance of generalized linear mixed models was assessed in downsampled datasets using code in the `R/analysis/downsample_cells` directory.
19 | - Finally, the computational resources (wall time, peak RAM usage) used by each method were extracted using code in the `R/analysis/time_RAM` directory.
20 | 
--------------------------------------------------------------------------------
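## Illustrative usage sketches

A minimal sketch of how the comparison helpers might be invoked, assuming a genes-by-cells expression matrix and a per-cell metadata data frame. The `.rds` paths below are hypothetical, but the calling convention and the return value (a named list of `list(expr, meta)` pairs, one per two-group comparison) follow `R/functions/get_comparisons.R`:

```r
# illustrative sketch only; the input paths are hypothetical
setwd("~/git/DE-analysis")
library(tidyverse)
library(magrittr)
source("R/functions/get_comparisons.R")

# expr: a genes-by-cells expression matrix; meta: one row per cell,
# with (at least) label and cell_type columns
expr = readRDS("data/expression/Hagai2018_mouse.rds")  # hypothetical path
meta = readRDS("data/metadata/Hagai2018_mouse.rds")    # hypothetical path

# split the dataset into its constituent two-group comparisons
# (for Hagai2018_mouse, each stimulation timepoint vs. 'unst')
comparisons = get_comparisons("Hagai2018_mouse", expr, meta)
names(comparisons)

# each element carries the subset expression matrix and metadata
str(comparisons[[1]], max.level = 1)
```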
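And a toy illustration of the AUCC metric computed in `R/functions/calculate_overlap.R`. The gene rankings are invented, but the calculation mirrors the `aucc` branch of `calculate_overlap()`: take the top-k genes from each ranking, trace the cumulative size of their intersection at each depth (the concordance curve), and normalize by the maximum possible area, k * (k + 1) / 2:

```r
library(purrr)

# toy AUCC, mirroring the 'aucc' branch of calculate_overlap()
aucc_toy = function(ranked1, ranked2, k = 3) {
  # top-k genes from each ranking
  v1 = head(ranked1, k)
  v2 = head(ranked2, k)
  # concordance curve: size of the top-list intersection at each depth 1..k
  curve = map_dbl(seq_len(k), ~
                    length(intersect(v1[seq_len(.x)], v2[seq_len(.x)])))
  # normalize by the maximum possible area
  sum(curve) / (k * (k + 1) / 2)
}

bulk_rank = c("geneA", "geneB", "geneC", "geneD")  # invented rankings
sc_rank   = c("geneB", "geneA", "geneD", "geneC")
aucc_toy(bulk_rank, sc_rank, k = 3)  # (0 + 2 + 2) / 6, i.e. ~0.67
```

Identical rankings give an AUCC of 1, and disjoint top-k lists give 0.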