├── .gitignore
├── README.md
├── bootstrap_analysis_examples.Rmd
├── bootstrap_analysis_examples.html
├── config
│   └── config.yml
├── resources
│   ├── crispr_data
│   │   └── EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz
│   ├── example
│   │   ├── ABC_K562_Fulco2019Genes_GRCh38.tsv.gz
│   │   ├── ABC_cell_type_mapping.txt
│   │   ├── EPCrisprBenchmark_Fulco2019_K562_GRCh38.tsv.gz
│   │   └── pred_config.txt
│   ├── genome_annotations
│   │   ├── CollapsedGeneBounds.hg38.TSS500bp.bed
│   │   └── CollapsedGeneBounds.hg38.bed
│   └── genomic_features
│       └── K562_expressed_genes.tsv
└── workflow
    ├── Snakefile
    ├── envs
    │   └── r_crispr_comparison.yml
    ├── rules
    │   └── crispr_comparison.smk
    └── scripts
        ├── annotateMergedData.R
        ├── comparePredictionsToExperiment.Rmd
        ├── createGenomeBrowserTracks.R
        ├── createPredConfig.R
        ├── crisprComparisonBootstrapFunctions.R
        ├── crisprComparisonLoadInputData.R
        ├── crisprComparisonMergeFunctions.R
        ├── crisprComparisonPlotFunctions.R
        ├── crisprComparisonSimplePredictors.R
        └── mergePredictionsWithExperiment.R

/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
*.Rproj
.Rhistory
.RData
.Ruserdata
.snakemake
results/
data/
slurm_logs
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![snakemake](https://img.shields.io/badge/snakemake-%E2%89%A55.10.0-brightgreen.svg)](https://snakemake.readthedocs.io/en/stable/index.html)

# Benchmark enhancer-gene prediction models against CRISPR data

This workflow is designed to evaluate the performance of an enhancer-gene linking model against
experimental data from CRISPR enhancer screens. It supports the evaluation of multiple predictors
against a single experimental data file. Comparing the performance of a single predictor against
multiple experiments is currently not supported.

Fundamentally, the idea is to overlap each experimentally tested element-gene-cell type tuple with a
predicted element-gene-cell type tuple. Diagnostic plots such as PR curves are then produced on this
overlapped dataset. Care must be taken when an experimentally tested element overlaps multiple
predicted elements, or when an experimentally tested element does not overlap any predicted
elements. See the configuration section below for how to handle these cases.

Other notes:

* There must be both experimental positives and negatives in the experimental data file in order to
  produce PR curves.
* The code currently overlaps based on gene symbols (not on gene TSS coordinates)

## Requirements
* Inputs (see below for formats):
  * One experimental data file
  * At least one predictions file
  * (optional) Configuration file describing metadata of predictors, including how to aggregate
    multiple predicted enhancers overlapping one experimental enhancer
  * (optional) Cell type mappings between cell types in predictions and experiment

### Dependencies
Running the workflow requires that
[snakemake (>=5.10.0)](https://snakemake.readthedocs.io/en/stable/index.html) and conda
(e.g. [miniconda](https://docs.conda.io/en/latest/miniconda.html)) are installed.

Other dependencies are automatically installed via conda when executing the workflow via snakemake
with the `--use-conda` flag (see below) for the first time.
Time to install dependencies depends on
your system, but shouldn't take more than 30 minutes. Conda environment files can be found in
`workflow/envs`. If the snakemake workflow is used without conda, the following R dependencies are
required:

```sh
# base R:
R (>=4.1.1)

# R packages:
R.utils (>=2.11.0)
data.table (>=1.14.2)
tidyverse (>=1.3.1)
cowplot (>=1.1.1)
ggpubr (>=0.4.0)
ggcorrplot (>=0.1.3)
UpSetR (>=1.4.0)
plotly (>=4.10.0)
ROCR (>=1.0-11)
caTools (>=1.18.2)
boot (>=1.3-28.1)
DT (>=0.22)
rmarkdown (>=2.13)
bookdown (>=0.25)
optparse (>=1.7.1)
GenomicRanges (>=1.46.1)
rtracklayer (>=1.54.0)
BiocParallel (>=1.28.3)
```

## Running an example comparison
A small example comparison can be performed using the following command. This also uses conda to
install all dependencies other than snakemake and conda itself.
```sh
# perform example comparison specified in config.yml (-n = dryrun, remove for execution)
snakemake --use-conda results/example/example_crispr_comparison.html -j1 -n
```

All generated output, including the main .html output file with the benchmarking results, is saved
to `results/example/`.

## Benchmarking other E-G predictive models
To benchmark the performance of E-G predictive models against CRISPR datasets, files containing
predictions and CRISPR data are required. The repository contains a K562 CRISPR benchmarking dataset
generated by combining results from different published experiments (see:
https://github.com/argschwind/ENCODE_CRISPR_data), which can be used to benchmark K562 E-G
predictions.

### Input file formats

* Experimental data
  * .
  * Use `resources/crispr_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz` as an example.

* Predictions
  *
  * See `resources/example/ABC_K562_Fulco2019Genes_GRCh38.tsv.gz` as an example.

Cross-validated ENCODE-rE2G predictions from
[Gschwind et al., 2023](https://www.biorxiv.org/content/10.1101/2023.11.09.563812v1) for
benchmarking against the provided CRISPR dataset can be found here:
https://www.synapse.org/#!Synapse:syn53018671

### Configuring the snakemake workflow
The `config/config.yml` file is used to specify the comparisons that should be performed. See the
comparison `"example"` in this file as a template. In addition to the predictions and experiment
input files, each comparison can take a prediction config file (pred_config) in .txt format as
input. This file specifies how each predictor should be handled; the behavior of the comparison
code for a given predictor depends on its content:

* pred_id: Short name for each predictor. Same as the names under 'pred' in the `config.yml` file.
* pred_col: Column name in the prediction file containing the predictor values. Following the
  aforementioned file format, this is typically 'Score'.
* boolean: (TRUE/FALSE) Is this predictor binary, or is this a quantitative prediction score?
* alpha: Predictor score cutoff for plots. Can be NA if not applicable or unknown.
* aggregate_function: In the case that an experimentally tested element overlaps multiple predicted
  elements, how the predicted elements should be aggregated to the level of the tested element (see
  the sketch after this list).
* fill_value: In the case that an experimentally tested element does not overlap any predicted
  elements, what value should be filled in.
* inverse_predictor: Set to TRUE if lower values of the predictor signify more confidence in the
  prediction. This is appropriate for predictors such as linear distance or p-value.
* pred_name_long: A pretty name (2-3 words) for the predictor to make plots and tables look nicer.
* color: R color name or hex code to use in plots for the predictor. If NA while colors for other
  predictors are set, this color will be gray. If all colors are NA, the pipeline will pick some
  colors.
* include: (optional column, TRUE/FALSE) Should this predictor be included in the benchmark?
  Different include columns can be specified using 'include_col' in the `config.yml` file. E.g.
  specifying `include_col: "MyPredictors"` in a comparison in the `config.yml` will include all
  predictors specified in the 'MyPredictors' column of the pred_config file.

See `resources/example/pred_config.txt` for an example. If this file is left out (`NULL` in
`config.yml`), a file with default values will be generated; however, these might not be
appropriate for the provided predictors.
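The following minimal R sketch illustrates what 'aggregate_function' and 'fill_value' mean when
overlapping predictions with tested elements (element names and scores are hypothetical):

```r
# illustration only: tested element e1 overlaps two predicted elements
library(dplyr)
preds <- data.frame(
  tested_element = c("e1", "e1"),
  Score          = c(0.03, 0.01)
)

# aggregate_function = "sum" collapses e1 to a single score of 0.04
preds %>%
  group_by(tested_element) %>%
  summarise(Score = sum(Score))

# a tested element without any overlapping predictions receives fill_value
# instead (e.g. 0, or Inf for an inverse predictor such as distance)
```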
### Running the workflow
The following command can be used to perform all specified comparisons. Run time per comparison
depends on the number and file size of the included predictions, but should typically be less than
30 minutes.
```sh
# perform all comparisons specified in config.yml (-n = dryrun, remove for execution)
snakemake --use-conda -j1 -n
```

All generated output, including the main .html document, is saved to `results/` in subdirectories
named after the comparison name as specified in the `config.yml` file.
--------------------------------------------------------------------------------
/bootstrap_analysis_examples.Rmd:
--------------------------------------------------------------------------------
---
title: "Bootstrapped performance comparisons"
author: "Andreas R. Gschwind"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
  html_document:
    toc: true
    toc_float: true
    code_folding: show
---

```{r setupDocument, include=FALSE}
# set output html chunk options
knitr::opts_chunk$set(eval = FALSE)
```

## Compare performance between predictors and datasets
The CRISPR benchmarking pipeline uses a bootstrapping approach to compute confidence intervals for
performance metrics in the standard analyses. These methods can also be used to perform customized
analyses to compare the performance of predictive models:

* Compute confidence intervals for AUPRC or precision at thresholds
* Compute statistical significance of pairwise differences in performance:
  * Between different models on the same benchmarking dataset
  * Between the same models on two different benchmarking datasets

The following examples show how these analyses can be performed on hypothetical data and models.

***

## Preparing data for bootstrapping
Running the benchmarking pipeline for a given comparison will produce a file called
`expt_pred_merged_annot.txt.gz`, which contains the scores of the benchmarked models merged with the
CRISPR ground truth dataset used in this comparison.
Together with the `pred_config` file, this is
the main input to any benchmarking analyses, including custom performance comparisons. The following
code shows how to load the data and prepare it for bootstrapping analyses.
```{r prepareData}
# required functions
source("workflow/scripts/crisprComparisonLoadInputData.R")
source("workflow/scripts/crisprComparisonPlotFunctions.R")
source("workflow/scripts/crisprComparisonBootstrapFunctions.R")

# load merged data
merged_file <- "results//expt_pred_merged_annot.txt.gz"
merged <- fread(merged_file, colClasses = c("ValidConnection" = "character"))

# load pred_config file
pred_config_file <- "path/to/pred_config.tsv"
pred_config <- importPredConfig(pred_config_file)

# process merged data for benchmarking analyses, including filtering for ValidConnection == TRUE
merged <- processMergedData(merged, pred_config = pred_config,
                            filter_valid_connections = TRUE)

# reformat merged data for bootstrapping
merged_bs <- convertMergedForBootstrap(merged, pred_config = pred_config)
```

***

## Compute confidence intervals
Bootstrapping can be used to compute empirical confidence intervals for AUPRC or precision at a
given threshold for predictors in the merged data. The `predictors` argument can be used to compute
confidence intervals only on a subset of models.
```{r computeAuprcCi}
# bootstrap AUPRC and compute confidence intervals for all predictors in the merged data
ci_auprc <- bootstrapPerformanceIntervals(merged_bs, metric = "auprc", R = 10000, conf = 0.95,
                                          ci_type = "perc", ncpus = 2)

# calculate AUPRC confidence intervals for a subset of predictors (use pred_uid to subset)
preds <- c("ABCdnase.ABC.Score", "ENCODE_rE2G.Score", "baseline.distToTSS")
ci_auprc <- bootstrapPerformanceIntervals(merged_bs, metric = "auprc", predictors = preds,
                                          R = 10000, conf = 0.95, ci_type = "perc", ncpus = 2)
```
The resulting tables contain bootstrapped performance metrics for the different predictors. Most
importantly, the `full` column contains the non-bootstrapped performance, while `lower` and `upper`
contain the lower and upper boundaries of the confidence interval. `min` and `max` contain the
minimum and maximum values obtained while bootstrapping.

When computing confidence intervals for precision, the user needs to provide predictor score
thresholds. If thresholds are provided in the pred_config file, the `getThresholdValues()` helper
function can be used to extract them. If providing thresholds from other sources, note that
thresholds for inverse predictors have to be inverted, i.e. multiplied by -1.
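For instance, a manually specified threshold set might look like this (values are hypothetical, and
a named vector keyed by predictor `pred_uid` is assumed here purely for illustration):
```{r manualThresholds}
# hypothetical manual thresholds; note the negated value for the inverse
# predictor distance to TSS (lower distance = higher confidence)
manual_thresholds <- c(
  "ABCdnase.ABC.Score" = 0.021,
  "ENCODE_rE2G.Score"  = 0.33,
  "baseline.distToTSS" = -100000
)
```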
```{r computePrecCi}
# extract specified predictor thresholds in pred_config to compute confidence intervals
thresholds <- getThresholdValues(pred_config, predictors = preds, threshold_col = "alpha")

# bootstrap precision at the given thresholds for a subset of predictors and compute CIs
ci_precision <- bootstrapPerformanceIntervals(merged_bs, metric = "precision",
                                              predictors = preds, thresholds = thresholds,
                                              R = 10000, conf = 0.95, ci_type = "perc", ncpus = 2)
```

***

## Compute significance of performance differences
The bootstrapping approach can also be used to compute the statistical significance of performance
differences between different predictors, or for the same predictors between two different
CRISPR benchmarking datasets.

### Differences between predictors
As with confidence intervals, we can calculate performance differences between predictors for
AUPRC or precision at a given threshold. This approach bootstraps the delta in AUPRC or precision
between two predictors, and uses the output to calculate a p-value for the delta being different
from 0.

By default, this function computes significance for all possible predictor pairs in the merged data.
```{r computeAuprcDelta}
# compute significant differences in AUPRC comparisons between all predictors in input data
delta_auprc <- bootstrapDeltaPerformance(merged_bs, metric = "auprc", R = 10000, conf = 0.95,
                                         ci_type = "perc", ncpus = 2)
```
The output table has the same format as for confidence intervals, but each row is now one pairwise
comparison of two predictors and the bootstrapped statistic is the delta in the chosen performance
metric. This table also contains an additional p-value column.

Computing all pairwise comparisons can be resource intensive, so it's possible to specify the
specific comparisons to compute. In the example below we compute the difference in precision at the
given thresholds for two models against distance to TSS.
```{r computePrecDelta}
# compute significant differences in precision at threshold for specific pairwise comparisons
comps <- list(c("ABCdnase.ABC.Score", "baseline.distToTSS"),
              c("ENCODE_rE2G.Score", "baseline.distToTSS"))
delta_precision <- bootstrapDeltaPerformance(merged_bs, metric = "precision", comparisons = comps,
                                             thresholds = thresholds, R = 10000, conf = 0.95,
                                             ci_type = "perc", ncpus = 2)
```

To visualize the results of these comparisons, we can plot the bootstrapped confidence intervals to
easily check whether they overlap 0.
```{r plotPrecDelta, fig.height=3, fig.width=7}
plotBootstrappedIntervals(delta_precision)
```
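A quick tabular check that is consistent with these plots is to list only the comparisons whose
confidence intervals exclude zero, using the `lower` and `upper` columns described above:
```{r filterSignificantDeltas}
# pairwise comparisons where the bootstrapped delta CI does not overlap 0
subset(delta_precision, lower > 0 | upper < 0)
```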
### Differences between datasets
The bootstrapping approach can be adapted to compute differences in the performance of a predictor
between two CRISPR benchmarking datasets. Here, for each bootstrap iteration a sample is drawn from
each of the two datasets and the delta in performance is calculated. Multiple bootstrap samples are
used to calculate confidence intervals and to test whether the delta differs significantly from 0.

First we need to load and reformat the merged data for the second benchmarking dataset.
```{r prepareSecondDataset}
# load merged data for the second benchmarking dataset
merged2_file <- "results//expt_pred_merged_annot.txt.gz"
merged2 <- fread(merged2_file, colClasses = c("ValidConnection" = "character"))

# process merged data for benchmarking analyses, including filtering for ValidConnection == TRUE
merged2 <- processMergedData(merged2, pred_config = pred_config, filter_valid_connections = TRUE)

# reformat merged data for bootstrapping
merged2_bs <- convertMergedForBootstrap(merged2, pred_config = pred_config)
```

We can now compute differences in AUPRC for all predictors between the two datasets.
```{r computeAuprcDiff2Datasets}
# compute bootstrapped differences in AUPRC for all models between the two datasets
delta_auprc <- bootstrapDeltaPerformanceDatasets(data1 = merged_bs, data2 = merged2_bs,
                                                 metric = "auprc", R = 10000, conf = 0.95,
                                                 ci_type = "perc", ncpus = 2)
```

As before, we can also compute differences in precision at chosen thresholds, subset the comparison
to selected predictors and plot the results.
```{r computePrecDiff2Datasets, fig.height=3.5, fig.width=6}
# compute bootstrapped performance differences for all models between training and held-out data
delta_prec <- bootstrapDeltaPerformanceDatasets(data1 = merged_bs, data2 = merged2_bs,
                                                metric = "precision", predictors = preds,
                                                thresholds = thresholds, R = 10000, conf = 0.95,
                                                ci_type = "perc", ncpus = 2)

# plot bootstrapped delta confidence intervals
plotBootstrappedIntervals(delta_prec)
```
--------------------------------------------------------------------------------
/config/config.yml:
--------------------------------------------------------------------------------

# comparisons to be performed by the pipeline, copy-paste example for a template
comparisons:
  example:
    pred:
      ABC: "resources/example/ABC_K562_Fulco2019Genes_GRCh38.tsv.gz"
    expt: "resources/example/EPCrisprBenchmark_Fulco2019_K562_GRCh38.tsv.gz"
    gene_universe: "resources/genome_annotations/CollapsedGeneBounds.hg38.bed"
    tss_universe: "resources/genome_annotations/CollapsedGeneBounds.hg38.TSS500bp.bed"
    pred_config: "resources/example/pred_config.txt"
    include_col: Null
    cell_type_mapping:
      ABC: "resources/example/ABC_cell_type_mapping.txt"
    dist_bins_kb: [0, 20, 100, 2500]
    expressed_genes: "resources/genomic_features/K562_expressed_genes.tsv"
    gene_features: Null
    enh_features: Null
    enh_assays: Null
--------------------------------------------------------------------------------
/resources/crispr_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EngreitzLab/CRISPR_comparison/af33e496e0deacb6112cfd47d4c9ddac1c715fef/resources/crispr_data/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz
--------------------------------------------------------------------------------
/resources/example/ABC_K562_Fulco2019Genes_GRCh38.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EngreitzLab/CRISPR_comparison/af33e496e0deacb6112cfd47d4c9ddac1c715fef/resources/example/ABC_K562_Fulco2019Genes_GRCh38.tsv.gz
--------------------------------------------------------------------------------
/resources/example/ABC_cell_type_mapping.txt:
--------------------------------------------------------------------------------
experiment	predictions
K562	K562_ID_2644
--------------------------------------------------------------------------------
/resources/example/EPCrisprBenchmark_Fulco2019_K562_GRCh38.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EngreitzLab/CRISPR_comparison/af33e496e0deacb6112cfd47d4c9ddac1c715fef/resources/example/EPCrisprBenchmark_Fulco2019_K562_GRCh38.tsv.gz
--------------------------------------------------------------------------------
/resources/example/pred_config.txt:
--------------------------------------------------------------------------------
pred_id	pred_col	boolean	alpha	aggregate_function	fill_value	inverse_predictor	pred_name_long	color
ABC	powerlaw.Score	FALSE	0.02	sum	0	FALSE	"ABC (powerlaw)"	"steelblue"
--------------------------------------------------------------------------------
/workflow/Snakefile:
--------------------------------------------------------------------------------

# config file specifying benchmarks to perform
configfile: "config/config.yml"

# rules for CRISPR comparisons
include: "rules/crispr_comparison.smk"

# perform all comparisons listed in config.yml
rule all:
    input:
        expand("results/{comparison}/{comparison}_crispr_comparison.html",
               comparison = config["comparisons"]),
        expand("results/{comparison}/genome_browser_tracks",
               comparison = config["comparisons"])
--------------------------------------------------------------------------------
/workflow/envs/r_crispr_comparison.yml:
--------------------------------------------------------------------------------
name: r_crispr_comparison
channels:
  - conda-forge
  - bioconda
  - defaults
  - r
dependencies:
  - python=3.9.1
  - pandoc=2.18
  - r-base=4.1.1
  - r-r.utils=2.12.2
  - r-data.table=1.14.8
  - r-tidyverse=1.3.2
  - r-cowplot=1.1.1
  - r-ggpubr=0.6.0
  - r-ggcorrplot=0.1.4
  - r-upsetr=1.4.0
  - r-rocr=1.0_11
  - r-catools=1.18.2
  - r-boot=1.3_28.1
  - r-dt=0.28
  - r-rmarkdown=2.21
  - r-bookdown=0.34
  - r-optparse=1.7.3
  - bioconductor-genomicranges=1.46.1
  - bioconductor-rtracklayer=1.54.0
  - bioconductor-biocparallel=1.28.3
--------------------------------------------------------------------------------
/workflow/rules/crispr_comparison.smk:
--------------------------------------------------------------------------------
# rules to perform comparisons of CRE predictions to CRISPR data

# get all prediction files and concatenate them into one array
def get_predictions(wildcards):
    preds = config["comparisons"][wildcards.comparison]["pred"]
    preds_array = []
    for value in preds.values():
        if isinstance(value, list):
            preds_array.extend(value)
        else:
            preds_array.append(value)
    return preds_array

# get pred_config file if specified in config, else create name for default pred_config file
def get_pred_config(wildcards):
    pred_config = config["comparisons"][wildcards.comparison]["pred_config"]
    if pred_config is None:
        comparison = wildcards.comparison
        pred_config = "results/" + comparison + "/pred_config.txt"
    return pred_config
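
# Example (illustrative): for a comparison with 'pred_config: Null' in
# config.yml, get_pred_config() returns "results/<comparison>/pred_config.txt",
# which is then generated with default values by the createPredConfig rule below.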
# get optional input parameters if they are specified in the config
def get_optional_parameter(wildcards, param, default=[]):
    try:
        param = config["comparisons"][wildcards.comparison][param]
    except KeyError:
        param = None
    if param is None:
        param = default
    else:
        if type(param) is dict:
            param = param.values()
    return param

## RULES -------------------------------------------------------------------------------------------

# create minimal pred_config file with default values
rule createPredConfig:
    output:
        "results/{comparison}/pred_config.txt"
    params:
        pred_names = lambda wildcards: config["comparisons"][wildcards.comparison]["pred"].keys()
    conda: "../envs/r_crispr_comparison.yml"
    script:
        "../../workflow/scripts/createPredConfig.R"

# merge predictions with experimental data
rule mergePredictionsWithExperiment:
    input:
        predictions = get_predictions,
        experiment = lambda wildcards: config["comparisons"][wildcards.comparison]["expt"],
        tss_universe = lambda wildcards: config["comparisons"][wildcards.comparison]["tss_universe"],
        gene_universe = lambda wildcards: config["comparisons"][wildcards.comparison]["gene_universe"],
        pred_config = get_pred_config,
        cell_type_mapping = lambda wildcards: get_optional_parameter(wildcards, "cell_type_mapping"),
        expressed_genes = lambda wildcards: get_optional_parameter(wildcards, "expressed_genes")
    output:
        merged = temp("results/{comparison}/expt_pred_merged.txt.gz")
    params:
        pos_col = "Regulated",
        include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None),
        pred_format = lambda wildcards: get_optional_parameter(wildcards, "pred_file_format", "ENCODE"),
        filter_tss = lambda wildcards: get_optional_parameter(wildcards, "filter_pred_tss", True),
        filter_include_col = False
    log: "results/{comparison}/logs/mergePredictionsWithExperiment.log"
    conda: "../envs/r_crispr_comparison.yml"
    resources:
        mem_mb = 32000
    script:
        "../../workflow/scripts/mergePredictionsWithExperiment.R"

# annotate enhancers in merged data with overlapping genomic features and assays
rule annotateEnhFeatures:
    input:
        merged = "results/{comparison}/expt_pred_merged.txt.gz",
        gene_features = lambda wildcards: get_optional_parameter(wildcards, "gene_features"),
        enh_features = lambda wildcards: get_optional_parameter(wildcards, "enh_features"),
        enh_assays = lambda wildcards: get_optional_parameter(wildcards, "enh_assays")
    output:
        "results/{comparison}/expt_pred_merged_annot.txt.gz"
    conda: "../envs/r_crispr_comparison.yml"
    resources:
        mem_mb = 32000
    script:
        "../../workflow/scripts/annotateMergedData.R"

# perform comparisons of predictions to experimental data
rule comparePredictionsToExperiment:
    input:
        merged = "results/{comparison}/expt_pred_merged_annot.txt.gz",
        pred_config = get_pred_config
    output: "results/{comparison}/{comparison}_crispr_comparison.html"
    params:
        pred_names = lambda wildcards: config["comparisons"][wildcards.comparison]["pred"].keys(),
        include_missing_predictions = True,
        pos_col = "Regulated",
        min_sensitivity = 0.7,
        dist_bins_kb = lambda wildcards: config["comparisons"][wildcards.comparison]["dist_bins_kb"],
        include_col = lambda wildcards: get_optional_parameter(wildcards, "include_col", None)
    conda: "../envs/r_crispr_comparison.yml"
    resources:
        mem_mb = 32000,
        runtime = "6h"
    script:
        "../../workflow/scripts/comparePredictionsToExperiment.Rmd"

# create genome browser tracks
rule createGenomeBrowserTracks:
    input:
        merged = "results/{comparison}/expt_pred_merged_annot.txt.gz"
    output: directory("results/{comparison}/genome_browser_tracks")
    conda: "../envs/r_crispr_comparison.yml"
    resources:
        mem_mb = 8000
    script:
        "../../workflow/scripts/createGenomeBrowserTracks.R"
--------------------------------------------------------------------------------
/workflow/scripts/annotateMergedData.R:
--------------------------------------------------------------------------------
## Overlap merged data with genomic features in .bed format and chromatin assays in .bam files.
## Adds one additional column per overlapped feature to merged data. Added columns are prefixed by
## feature type ("gene_feature_", "enh_feature_" or "enh_assay_")

# save.image("annot.rda")
# stop()

# required packages
suppressPackageStartupMessages({
  library(data.table)
  library(tidyverse)
  library(GenomicRanges)
  library(GenomicAlignments)
  library(rtracklayer)
})

## Define functions --------------------------------------------------------------------------------

# annotate EG pairs in merged data with gene features (list of data.frames)
annotate_gene_features <- function(merged, gene_features) {

  # check that gene features have distinct feature names
  all_colnames <- unlist(lapply(gene_features, FUN = colnames))
  all_feature_names <- all_colnames[all_colnames != "gene"]
  if (any(table(all_feature_names) > 1)) {
    stop("Gene feature files do not have unique feature names.", call. = FALSE)
  }

  # merge all gene features into one table
  all_gene_features <- purrr::reduce(gene_features, full_join, by = "gene")

  # set new column names
  colnames(all_gene_features)[-1] <- paste0("gene_feature_", colnames(all_gene_features)[-1])

  # add gene features to merged data
  output <- left_join(merged, all_gene_features, by = c("measuredGeneSymbol" = "gene"))

  return(output)

}
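
# Example input (illustrative): each gene feature table is expected to contain
# a 'gene' column plus uniquely named feature columns, e.g.
#
#   gene   expressed
#   GATA1  TRUE
#   HBE1   FALSE
#
# annotate_gene_features() joins these to the merged data via
# 'measuredGeneSymbol' and prefixes the added columns with "gene_feature_".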
# annotate merged data (data.frame) with overlapping enhancer features (list of data.frames)
annotate_enh_features <- function(merged, enh_features) {

  # create GRanges objects from features
  enh_features <- lapply(enh_features, FUN = makeGRangesFromDataFrame, seqnames.field = "V1",
                         start.field = "V2", end.field = "V3", strand.field = "V6",
                         starts.in.df.are.0based = TRUE)

  # convert to GRangesList
  enh_features <- GRangesList(enh_features)

  # create identifier for CREs in merged data
  merged <- unite(merged, col = "cre_id", chrom, chromStart, chromEnd, sep = "_", remove = FALSE)

  # extract CRE coordinates from merged data and create GRanges object
  cre_coords <- merged %>%
    select(chr = chrom, start = chromStart, end = chromEnd, cre_id) %>%
    distinct() %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)

  # get CREs that overlap with any features and convert to data frame
  overlaps <- lapply(enh_features, FUN = findOverlaps, query = cre_coords, ignore.strand = TRUE)

  # create data frame with all CREs with "empty" overlaps column
  all_cres <- data.table(cre_id = cre_coords$cre_id, overlap = FALSE)

  # fill in overlaps for every feature
  cre_overlaps <- lapply(overlaps, FUN = function(ovl, all_cres) {
    all_cres[unique(queryHits(ovl)), "overlap"] <- TRUE
    return(all_cres)
  }, all_cres = all_cres)

  # convert to one data frame
  cre_overlaps <- rbindlist(cre_overlaps, idcol = "feature")

  # convert to wide format and add prefix to feature columns
  cre_overlaps <- dcast(cre_overlaps, cre_id ~ feature, value.var = "overlap")
  colnames(cre_overlaps)[-1] <- paste0("enh_feature_", colnames(cre_overlaps)[-1])

  # add overlaps to original merged data to create output
  output <- merge(merged, cre_overlaps, by = "cre_id")
  output <- select(output, -cre_id)

  return(output)

}

# annotate enhancers by overlapping them with enhancer assay bam files
annotate_enh_assays <- function(merged, enh_assays, normalize = TRUE) {

  # create identifier for CREs in merged data
  merged <- unite(merged, col = "cre_id", chrom, chromStart, chromEnd, sep = "_", remove = FALSE)

  # extract CRE coordinates from merged data and create GRanges object
  cre_coords <- merged %>%
    select(chr = chrom, start = chromStart, end = chromEnd, cre_id) %>%
    distinct() %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)

  # count reads in each CRE
  cre_assay_reads <- enh_assays %>%
    lapply(FUN = countOverlaps, query = cre_coords, ignore.strand = TRUE) %>%
    bind_cols()

  # combine with enhancer coordinates and convert to long format
  cre_assay_reads <- cre_coords %>%
    data.frame(stringsAsFactors = FALSE) %>%
    select(-strand) %>%
    dplyr::rename(chrom = seqnames, chromStart = start, chromEnd = end) %>%
    bind_cols(cre_assay_reads) %>%
    pivot_longer(cols = -c(1:5), names_to = "assay", values_to = "reads")

  # normalize for sequencing depth
  if (normalize == TRUE) {

    # total reads per assay
    total_reads <- vapply(enh_assays, FUN = length, FUN.VALUE = integer(1))
    total_reads <- data.frame(assay = names(total_reads), total_reads = total_reads,
                              row.names = NULL, stringsAsFactors = FALSE)

    # add total reads to read counts and normalize by sequencing depth
    cre_assay_reads <- cre_assay_reads %>%
      left_join(total_reads, by = "assay") %>%
      mutate(reads = reads / (total_reads / 1e6)) %>%
      select(-total_reads)

  }

  # convert to wide format and add to merged data to create output
  output <- cre_assay_reads %>%
    mutate(assay = paste0("enh_assay_", assay)) %>%
    select(-c(chrom, chromStart, chromEnd, width)) %>%
    pivot_wider(names_from = "assay", values_from = "reads") %>%
    left_join(x = merged, y = ., by = "cre_id")

  # remove cre_id column
  output <- select(output, -cre_id)

  return(output)

}
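
# Worked example for the normalization above: with 20 million total reads in
# an assay, a CRE overlapped by 50 reads gets 50 / (20e6 / 1e6) = 2.5 reads
# per million (numbers hypothetical).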
## Annotate merged data ----------------------------------------------------------------------------

# get annotation features and assay files
config <- snakemake@config$comparisons[[snakemake@wildcards$comparison]]
gene_features_files <- config$gene_features
enh_features_files <- config$enh_features
enh_assays_files <- config$enh_assays

# if no annotation features are provided, simply copy the input file
if (is.null(c(gene_features_files, enh_features_files, enh_assays_files))) {

  message("No annotation features or assays provided.")
  invisible(file.copy(from = snakemake@input$merged, to = snakemake@output[[1]], overwrite = TRUE))

} else {

  # load merged data
  message("Loading merged data...")
  merged <- fread(snakemake@input$merged)

  # annotate E-G pairs based on gene features
  if (!is.null(gene_features_files)) {

    # load annotation features
    message("Loading gene features...")
    gene_features <- lapply(gene_features_files, FUN = fread)

    # annotate genes in merged data with gene features
    message("Annotating merged data with gene features...")
    merged <- annotate_gene_features(merged, gene_features = gene_features)

  }

  # annotate enhancers with overlapping genomic features if provided
  if (!is.null(enh_features_files)) {

    # load annotation features
    message("Loading enhancer features...")
    enh_features <- lapply(enh_features_files, FUN = fread)

    # overlap merged data with features
    message("Annotating merged data with enhancer features...")
    merged <- annotate_enh_features(merged, enh_features = enh_features)

  }

  # annotate enhancers with assay reads if provided
  if (!is.null(enh_assays_files)) {

    # load assay bam files
    message("Loading enhancer assay bam files...")
    assays <- lapply(enh_assays_files, FUN = readGAlignments)

    # annotate merged data with read counts from assay bam files
    message("Annotating merged data with enhancer assay reads...")
    merged <- annotate_enh_assays(merged, enh_assays = assays, normalize = TRUE)

  }

  # write to new output file
  message("Writing to output file...")
  fwrite(merged, file = snakemake@output[[1]], sep = "\t", na = "NA")

  message("Done!")

}
--------------------------------------------------------------------------------
/workflow/scripts/comparePredictionsToExperiment.Rmd:
--------------------------------------------------------------------------------
---
title: "Comparing E-G predictions with CRISPR data"
date: "`r format(Sys.time(), '%B %d, %Y')`"
params:
  rmd: "comparePredictionsToExperiment.Rmd"
output:
  html_document:
    toc: true
    toc_float: true
    code_folding: hide
---

```{r setupDocument, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```

```{r attachPackages}
# attach required packages and functions
library(tidyverse)
library(cowplot)
library(DT)
library(ROCR)
library(caTools)
library(UpSetR)
source(file.path(snakemake@scriptdir, "crisprComparisonLoadInputData.R"))
source(file.path(snakemake@scriptdir, "crisprComparisonPlotFunctions.R"))
source(file.path(snakemake@scriptdir, "crisprComparisonBootstrapFunctions.R"))
```

```{r prepareInputData}
# load merged data
merged <- fread(snakemake@input$merged, colClasses = c("ValidConnection" = "character"))

# import pred_config file
include_col <- ifelse(is.null(snakemake@params$include_col), "include", snakemake@params$include_col)
pred_config <- importPredConfig(snakemake@input$pred_config,
                                expr = "baseline.nearestExprTSS" %in% merged$pred_uid,
                                include_col = include_col)

# process merged data for benchmarking analyses, including filtering for ValidConnection == TRUE
merged <- processMergedData(merged, pred_config = pred_config, filter_valid_connections = TRUE,
                            include_missing_predictions = snakemake@params$include_missing_predictions)

# only retain pred_config entries also in merged data (important when not using an include column)
pred_config <- subset(pred_config, pred_uid %in% unique(merged$pred_uid))
check_unique_identifier(pred_config, col = "pred_name_long")

# extract colors for predictors in all plots
if (all(is.na(pred_config$color))) {
  pred_colors <- NULL
} else {
  pred_colors <- deframe(select(pred_config, pred_name_long, color))
}

# column identifying experimental positives
pos_col <- snakemake@params$pos_col

# directory to save plots in addition to adding them to report
plotdir <- file.path(dirname(snakemake@output[[1]]), "plots")
```

This is the output for the comparison **`r snakemake@wildcards$comparison`**. The following analyses
evaluate how well the experimental data agree with the predictions of CRE-gene pairs. The following
input files were used:

```{r}
# extract predictions and experiment input files from config in snakemake object
comp_config <- snakemake@config$comparisons[[snakemake@wildcards$comparison]]
expt_file <- basename(comp_config$expt)
pred_file <- basename(unlist(comp_config$pred))
```

Experimental data: **`r expt_file`**
Predictions: **`r pred_file`**

The following parameters in the config file **`r snakemake@input$pred_config`** were used to overlap
predictions with experimental data and to assess the performance of predictors. If no config file
was provided, this was generated using default values. It's strongly recommended to use a prediction
config file to control how predictors should be treated.
```{r printPredConfig}
options(htmlwidgets.TOJSON_ARGS = list(na = 'string'))
pred_config_print <- pred_config[, -c("include", "pred_uid")]
datatable(pred_config_print, options = list(pageLength = 20), autoHideNavigation = TRUE)
```

***

# Overlap between predictors and CRISPR data {.tabset .tabset-pills}
The number of CRISPR enhancer-gene pairs overlapping enhancer-gene pairs of each predictor is
counted. CRISPR enhancer-gene pairs that did not overlap any predicted pairs are considered not
predicted. Large fractions of CRISPR E-G pairs not overlapping predictions lead to poor performance.
```{r overlaps, fig.height=5, fig.width=7}
# count and plot number of CRISPR E-G pairs overlapping prediction E-G pairs
overlap_plots <- applyCellTypes(merged, .fun = plotOverlaps)

# save plots to files
savePlotList(overlap_plots, basename = "expt_pred_overlaps.pdf", path = plotdir, height = 3.5,
             width = 7)
```

```{r results='asis', fig.cap=cap, fig.height=3.5, fig.width=7}
# print plots for every cell type in tabs
printTabbedPlots(overlap_plots, section_level = "#")

# figure caption
cap <- paste("Number of CRISPR enhancer-gene pairs overlapping enhancer-gene pairs in predictions.")
```

***

# Precision-Recall performance
Precision-recall (PR) curves are used for comparing the performance of different predictors on the
experimental data. The area under the PR curve (AUPRC) provides a single metric of a predictor's
performance.
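As a reminder of the quantities behind these curves, precision and recall at a given score
threshold are computed from the confusion counts (toy numbers, not pipeline output):
```{r prToyExample, eval=FALSE}
# precision = TP / (TP + FP); recall (sensitivity) = TP / (TP + FN)
tp <- 30; fp <- 10; fn <- 20
tp / (tp + fp)  # precision: 0.75
tp / (tp + fn)  # recall: 0.6
```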
115 | 116 | ```{r computePRC} 117 | # compute precision-recall tables for all cell types 118 | pr <- applyCellTypes(merged, .fun = calcPRCurves, pred_config = pred_config, pos_col = pos_col) 119 | 120 | # combine pr tables into one table and save to file for other downstream analyses 121 | pr_table <- rbindlist(pr, idcol = "ExperimentCellType") 122 | write_tsv(pr_table, file = file.path(dirname(snakemake@output[[1]]), "pr_table.txt.gz")) 123 | 124 | # create performance summary tables 125 | perf_summary <- applyCellTypes(merged, .fun = makePRSummaryTableBS, pred_config = pred_config, 126 | pos_col = pos_col) 127 | 128 | # convert performance summaries to one table and add full predictor names to performance summary 129 | perf_summary <- perf_summary %>% 130 | bind_rows(.id = "cell_type") %>% 131 | left_join(select(pred_config, pred_uid, pred_name_long), by = "pred_uid") %>% 132 | relocate(pred_name_long, .after = pred_uid) 133 | 134 | # save performance summary to text file 135 | write_tsv(perf_summary, 136 | file = file.path(dirname(snakemake@output[[1]]), "performance_summary.txt")) 137 | ``` 138 | 139 | ```{r plotPRC} 140 | # calculate number and percentage of experimental true positives in the experimental dataset 141 | n_pos <- applyCellTypes(merged, .fun = calcNPos, pos_col = pos_col) 142 | pct_pos <- applyCellTypes(merged, .fun = calcPctPos, pos_col = pos_col) 143 | 144 | # get number of tested enhancer gene pairs in merged data and create title for PR plot 145 | n_pairs <- applyCellTypes(merged, .fun = function(df) n_distinct(df$name) ) 146 | pr_title <- paste0(snakemake@wildcards$comparison, " (", unlist(n_pairs[names(pr)]), " pairs)") 147 | 148 | # make PRC plots 149 | n_pos <- n_pos[names(pr)] 150 | pct_pos <- pct_pos[names(pr)] 151 | pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = pr, n_pos = n_pos, pct_pos = pct_pos, plot_name = pr_title, 152 | MoreArgs = list(pred_config = pred_config, 153 | min_sensitivity = snakemake@params$min_sensitivity, 154 | line_width = 0.8, point_size = 3, 155 | text_size = 13, colors = pred_colors), 156 | SIMPLIFY = FALSE) 157 | 158 | # save plots to files 159 | savePlotList(pr_plots, basename = "prc_full_expt_data.pdf", path = plotdir, height = 4.5, 160 | width = 8) 161 | ``` 162 | 163 | ## Precision-recall curves {.tabset .tabset-pills} 164 | 165 | ```{r results='asis', fig.cap=cap, fig.height=4.5, fig.width=8} 166 | # print plots for every cell type in tabs 167 | printTabbedPlots(pr_plots, section_level = "##") 168 | 169 | # figure caption 170 | cap <- paste("Precision-recall curves for all predictors in all matching experimental cell types.", 171 | "Dots represent alpha cutoff values as specified in pred_config file. If no alpha was", 172 | "set, the minium alpha in predictions was taken by default, respectively the maximum", 173 | "for inverse predictors. 

## Performance summary

```{r perfSummary, fig.cap=cap}
# pretty-print PR summary table for report
perf_summary_print <- perf_summary %>%
  select(-pred_uid) %>%
  rename(predictor = pred_name_long) %>%
  mutate(across(where(is.double), round, digits = 3)) %>%
  datatable(
    extensions = c("FixedColumns", "Buttons"),
    options = list(
      pageLength = 20,
      dom = 'Bfrtip',
      scrollX = TRUE,
      fixedColumns = list(leftColumns = 3),
      buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
    ),
    autoHideNavigation = TRUE
  )

# add vertical lines depending on whether performance at provided thresholds was computed, and print
if (ncol(perf_summary) == 14) {
  perf_summary_print %>%
    formatStyle(c(2, 5, 6, 9, 12), `border-right` = "solid 1px")
} else {
  perf_summary_print %>%
    formatStyle(c(2, 5, 6, 9, 12, 13, 14, 17), `border-right` = "solid 1px")
}

# figure caption
cap <- paste("Precision-recall performance summary for predictors. Table shows area under the PRC",
             "(AUPRC) and precision at specified thresholds (if provided) and at a minimum",
             "sensitivity (recall) of 0.7.")
```

***

# Receiver Operating Characteristic performance
Receiver Operating Characteristic (ROC) curves are an alternative method to compare performance by
computing true positive rates and false positive rates for each predictor.

```{r plotROC}
# make ROC curve plots for each cell type
roc_plots <- applyCellTypes(merged, .fun = plotROC, pos_col = pos_col,
                            pred_config = pred_config, line_width = 0.8, point_size = 3,
                            text_size = 13, colors = pred_colors)

# save plots to files
savePlotList(roc_plots, basename = "roc_full_expt_data.pdf", path = plotdir, height = 4.5,
             width = 8)
```

```{r results='asis', fig.cap=cap, fig.height=4.5, fig.width=8}
# print plots for every cell type in tabs
printTabbedPlots(roc_plots, section_level = "##")

# figure caption
cap <- paste("ROC curves for all predictors in all matching experimental cell types.",
             "Distance to TSS and nearest genes/TSS were added as baseline predictors and computed",
             "from the provided 'gene universe'.")
```

***

# Effect size vs predictors {.tabset .tabset-pills}
Each predictor listed in the prediction data is plotted against the effect size of enhancer
perturbations reported in the experimental data (e.g. percent change in expression). These plots
show how well a predictor is associated with effects observed in CRISPRi enhancer screens in an
intuitive way.
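The association in these plots is summarized with Spearman's rank correlation, which only assumes a
monotonic relationship; a standalone toy illustration (values are hypothetical, not pipeline data):
```{r spearmanToy, eval=FALSE}
# Spearman's rho is the Pearson correlation of ranks, making it robust to the
# skewed score distributions of many predictors
scores  <- c(0.01, 0.02, 0.05, 0.20)  # hypothetical predictor scores
effects <- c(-2, -5, -12, -40)        # hypothetical effect sizes (% change)
cor(scores, effects, method = "spearman")  # -1: perfectly monotonic decrease
```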

```{r effectSizeScatter, include=FALSE}
# make predictor vs effect size scatter plots
es_scatters <- applyCellTypes(merged, .fun = plotPredictorsVsEffectSize, pos_col = pos_col,
                              pred_names_col = "pred_name_long", point_size = 2, text_size = 13,
                              alpha_value = 1, label.x.npc = 0.7, ncol = 2)

# calculate plot dimensions based on rows and columns in plot
es_scatter_dims <- get_row_col(es_scatters[[1]])
plot_height <- ceiling(es_scatter_dims[1] * 2.5)
plot_width <- ceiling(es_scatter_dims[2] * 3.5)

# save scatter plots to files
savePlotList(es_scatters, basename = "EffectSize_scatter_plots.pdf", path = plotdir,
             height = plot_height, width = plot_width)
```

```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
# print plots for every cell type in tabs
printTabbedPlots(es_scatters, section_level = "#")

# figure caption
cap <- paste("Predictors versus CRISPRi effect size. Effect size is defined as percent change in",
             "target gene expression upon CRISPRi perturbation of an enhancer. Effect size values",
             "are taken from the 'EffectSize' column in the experimental data, while predictor",
             "scores correspond to scores from prediction files. Numbers show Spearman's rank",
             "correlation coefficient (rho) between effect size and predictor scores.")
```

***

# Predictor scores versus experimental outcome {.tabset .tabset-pills}
The scores of each predictor are compared between experimental positives and negatives to get
another assessment of how well they distinguish true enhancer-gene pairs from negatives.

```{r}
# create plots showing predictor values as a function of experimental outcome
pred_plots <- applyCellTypes(merged, .fun = plotPredictorsVsExperiment, pos_col = pos_col,
                             pred_names_col = "pred_name_long", text_size = 13)

# calculate plot dimensions based on number of predictors
pred_plots_dims <- get_row_col(pred_plots[[1]])
plot_height <- ceiling(pred_plots_dims[1] * 2.5)
plot_width <- ceiling(pred_plots_dims[2] * 3.5)

# save scatter plots to files
savePlotList(pred_plots, basename = "predictor_vs_experiment.pdf", path = plotdir,
             height = plot_height, width = plot_width)
```

```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
# print plots for every cell type in tabs
printTabbedPlots(pred_plots, section_level = "#")

# figure caption
cap <- paste("Predictor scores vs. experimental outcome for all predictors. Each point represents",
             "one E-G pair in the experimental data. Cases where the predictor value is 0 or",
             "infinite might correspond to E-G pairs that were not found in predictions, with",
             "predictor values filled in according to the prediction config file.")
```

***

# Performance as function of distance to TSS
Enhancer-gene pairs are binned based on their distance to TSS and predictor performance is assessed
for each bin.
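The bins are left-closed intervals created with `cut()`; a standalone illustration of the binning
logic used below, with the bin boundaries from the example config (distances are hypothetical):
```{r distanceBinsIllustration, eval=FALSE}
# pairs at 5, 50 and 500 kb fall into [0,20), [20,100) and [100,2500), respectively
cut(c(5, 50, 500), breaks = c(0, 20, 100, 2500), right = FALSE)
```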

```{r distanceBins, fig.width=4.5, fig.height=3}
# set distance bins to 4 equal-sized bins if no manual boundaries are provided
if (is.null(snakemake@params$dist_bins_kb)) {
  dist_breaks <- 4
} else {
  dist_breaks <- as.numeric(snakemake@params$dist_bins_kb)
}

# bin pairs by distance
dist_bins <- merged %>%
  filter(pred_uid == "baseline.distToTSS") %>%
  mutate(pred_value = pred_value / 1000) %>%
  mutate(`distToTSS (bins)` = cut(pred_value, breaks = dist_breaks, right = FALSE)) %>%
  select(name, `distToTSS (bins)`)

# add distance bins to merged
merged_bins <- merged %>%
  left_join(dist_bins, by = "name") %>%
  filter(!is.na(`distToTSS (bins)`))

# get unique experimentally tested pairs (TODO: implement count_pairs_subset() for this)
crispr_pairs <- distinct(select(merged_bins, name, `distToTSS (bins)`, all_of(pos_col)))

# plot number of pairs per distance bin
ggplot(crispr_pairs, aes(x = `distToTSS (bins)`, fill = get(pos_col))) +
  geom_bar() +
  labs(x = "Distance to TSS (kb)", y = "Number of E-G pairs",
       title = "CRISPR E-G pairs vs. distance", fill = pos_col) +
  scale_fill_manual(values = c("FALSE" = "darkgray", "TRUE" = "steelblue")) +
  theme_bw()

# save plot to file
ggsave(filename = file.path(plotdir, "eg_pairs_vs_distance.pdf"), width = 4.5, height = 3)
```

```{r filterBins}
# count CRISPR positives and negatives per bin
crispr_pos_neg_pairs <- crispr_pairs %>%
  group_by(`distToTSS (bins)`) %>%
  summarize(pos = sum(get(pos_col) == TRUE),
            neg = sum(get(pos_col) == FALSE))

# get any bins containing no positives or no negatives
invalid_bins <- crispr_pos_neg_pairs %>%
  filter(pos == 0 | neg == 0) %>%
  pull(`distToTSS (bins)`)

# remove these from merged data, since performance can't be assessed for them
merged_bins_filt <- merged_bins %>%
  filter(!`distToTSS (bins)` %in% invalid_bins) %>%
  mutate(`distToTSS (bins)` = droplevels(`distToTSS (bins)`))

# report if any bins are removed
if (length(invalid_bins) > 0) {
  message("Following distance bins are removed because they do not contain any CRISPR positives ",
          "or negatives: ", paste(invalid_bins, collapse = ", "))
}
```
distance to TSS") 388 | 389 | # save distance prc plots to files 390 | savePlotList(dist_auprc, basename = "distance_auprc.pdf", path = plotdir, height = 5, width = 8) 391 | ``` 392 | 393 | ```{r, results='asis', fig.cap=cap, fig.height=5, fig.width=8} 394 | # print plots for every cell type in tabs 395 | printTabbedPlots(dist_auprc, section_level = "##") 396 | 397 | # figure caption 398 | cap <- paste("Area under the Precision-Recall Curve (AUPRC) for different distance to TSS bins (kb).") 399 | ``` 400 | 401 | ## Precision-recall curves {.tabset .tabset-pills} 402 | ```{r PRCVsDist} 403 | # count the number of distance bins to set number of rows in plot grid 404 | n_bins <- n_distinct(merged_bins_filt$`distToTSS (bins)`) 405 | nrow <- ceiling(n_bins / 2) 406 | 407 | # create PR curves per distance bin for all cell types 408 | dist_prc_plots <- applyCellTypes(merged_bins_filt, .fun = makePRCurveSubsets, 409 | subset_cols = "distToTSS (bins)", pred_config = pred_config, 410 | pos_col = pos_col, 411 | min_sensitivity = snakemake@params$min_sensitivity, 412 | line_width = 0.8, point_size = 3, text_size = 13, nrow = nrow, 413 | colors = pred_colors) 414 | 415 | # calculate plot dimensions based on number of features 416 | plot_height <- nrow * 4 417 | plot_width <- ceiling(n_bins / nrow) * 7 418 | 419 | # save distance prc plots to files 420 | savePlotList(dist_prc_plots, basename = "distance_binned_prc.pdf", path = plotdir, 421 | height = plot_height, width = plot_width) 422 | ``` 423 | 424 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width} 425 | # print plots for every cell type in tabs 426 | printTabbedPlots(dist_prc_plots, section_level = "##") 427 | 428 | # figure caption 429 | cap <- paste("Precision-Recall curves for different distance to TSS bins (kb).") 430 | ``` 431 | 432 | ## Predictor scores versus experimental outcome {.tabset .tabset-pills} 433 | ```{r PredVsExpVsDist} 434 | # create plots showing predictor values as a function of experimental outcome 435 | dist_pred_plots <- applyCellTypes(merged_bins_filt, .fun = plotPredVsExperimentSubsets, 436 | subset_cols = "distToTSS (bins)", pos_col = pos_col, 437 | pred_names_col = "pred_name_long", text_size = 13) 438 | 439 | # calculate plot dimensions based on number of features and predictors 440 | plot_width <- ceiling(n_bins * 2.5) 441 | plot_height <- ceiling(n_distinct(merged_bins_filt$pred_name_long) * 1.5) 442 | 443 | # save distance predictor vs. 
# save distance predictor vs. experiment plots to files
savePlotList(dist_pred_plots, basename = "distance_binned_pred_vs_expt.pdf", path = plotdir,
             height = plot_height, width = plot_width)
```

```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
# print plots for every cell type in tabs
printTabbedPlots(dist_pred_plots, section_level = "##")

# figure caption
cap <- paste("Predictor scores versus experimental outcome for different distance to TSS bins",
             "(kb).")
```

## Effect size vs predictors {.tabset .tabset-pills}
```{r EsVsPredVsDist}
# create plots showing effect size as function of predictor scores
dist_es_plots <- applyCellTypes(merged_bins_filt, .fun = predVsEffectSizeSubsets,
                                subset_cols = "distToTSS (bins)", pos_col = pos_col,
                                pred_names_col = "pred_name_long", point_size = 2, text_size = 16,
                                label.x.npc = 0.65)

# calculate plot dimensions based on number of features and predictors
plot_height <- ceiling(n_bins * 2.5)
plot_width <- ceiling(n_distinct(merged_bins_filt$pred_name_long) * 2)

# save distance effect size vs. predictor plots to files
savePlotList(dist_es_plots, basename = "distance_binned_EffectSize_scatter.pdf", path = plotdir,
             height = plot_height, width = plot_width)
```

```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
# print plots for every cell type in tabs
printTabbedPlots(dist_es_plots, section_level = "##")

# figure caption
cap <- paste("CRISPR effect size vs predictor scores for different distance to TSS bins (kb).")
```

***

# Subset by gene and enhancer features
If any gene or enhancer features are provided, versions of the PR curves, predictor vs. experiment
and effect size plots faceted by these features are created.
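These feature columns are auto-detected via the column-name prefixes added in
`annotateMergedData.R`; an illustration of the matching (column names are hypothetical):
```{r featurePrefixExample, eval=FALSE}
cols <- c("enh_feature_H3K27ac", "gene_feature_expressed", "EffectSize")
grep("^gene_feature.+$", cols, value = TRUE)  # "gene_feature_expressed"
grep("^enh_feature_.+$", cols, value = TRUE)  # "enh_feature_H3K27ac"
```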
487 | 488 | ```{r featureCols} 489 | # columns in merged data containing gene and enhancer features 490 | gene_feat_cols <- grep(colnames(merged), pattern = "^gene_feature.+$", value = TRUE) 491 | enh_feat_cols <- grep(colnames(merged), pattern = "^enh_feature_.+$", value = TRUE) 492 | 493 | # plot the number of positive and negative pairs per gene and enhancer feature 494 | if (length(c(gene_feat_cols, enh_feat_cols)) > 0) { 495 | 496 | # count the number of positive and negative pairs per gene and enhancer feature 497 | n_pairs_features <- countPairsFeatures(merged, pos_col = pos_col) 498 | 499 | # plot the number of pairs per feature type 500 | n_pairs_plots <- plotPairsFeatures(n_pairs_features) 501 | 502 | } 503 | 504 | # calculate dimensions of plot based on number of rows and columns in faceted plots 505 | if (length(gene_feat_cols) > 0) { 506 | gene_plot_dims <- get_row_col(n_pairs_plots$Gene) 507 | gene_plot_height <- ceiling(gene_plot_dims[1] * 2.5) 508 | gene_plot_width <- ceiling(gene_plot_dims[2] * 2.5) 509 | } else { 510 | gene_plot_height <- 5 511 | gene_plot_width <- 7 512 | } 513 | 514 | if (length(enh_feat_cols) > 0) { 515 | enh_plot_dims <- get_row_col(n_pairs_plots$Enhancer) 516 | enh_plot_height <- ceiling(enh_plot_dims[1] * 2.5) 517 | enh_plot_width <- ceiling(enh_plot_dims[2] * 2.5) 518 | } else { 519 | enh_plot_height <- 5 520 | enh_plot_width <- 7 521 | } 522 | ``` 523 | 524 | ```{r, fig.height=gene_plot_height, fig.width=gene_plot_width} 525 | if (length(gene_feat_cols) > 0) { 526 | n_pairs_plots$Gene 527 | } 528 | ``` 529 | 530 | ```{r, fig.height=enh_plot_height, fig.width=enh_plot_width} 531 | if (length(enh_feat_cols) > 0) { 532 | n_pairs_plots$Enhancer 533 | } 534 | ``` 535 | 536 | ## Precision-recall curves 537 | 538 | ### Gene features {.tabset .tabset-pills} 539 | ```{r geneFeaturePRC} 540 | if (length(gene_feat_cols) > 0) { 541 | 542 | # create subset plots for all cell types 543 | gene_feat_plots <- applyCellTypes(merged, .fun = makePRCurveSubsets, subset_cols = gene_feat_cols, 544 | pred_config = pred_config, pos_col = pos_col, 545 | min_sensitivity = snakemake@params$min_sensitivity, 546 | line_width = 1.2, point_size = 3.5, text_size = 13, 547 | colors = pred_colors) 548 | 549 | # calculate plot dimensions based on number of features 550 | n_features <- length(gene_feat_cols) 551 | plot_height <- ceiling(n_features * 3.5) 552 | 553 | } 554 | ``` 555 | 556 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=10} 557 | if (length(gene_feat_cols) > 0) { 558 | 559 | # print plots for every cell type in tabs 560 | printTabbedPlots(gene_feat_plots, section_level = "###") 561 | 562 | # figure caption 563 | cap <- paste("Precision-Recall curves for subsets of the data based on provided gene features.") 564 | 565 | } 566 | ``` 567 | 568 | ### Enhancer features {.tabset .tabset-pills} 569 | ```{r enhFeaturePRC} 570 | if (length(enh_feat_cols) > 0) { 571 | 572 | # create subset plots for all cell types 573 | enh_feat_plots <- applyCellTypes(merged, .fun = makePRCurveSubsets, subset_cols = enh_feat_cols, 574 | pred_config = pred_config, pos_col = pos_col, 575 | min_sensitivity = snakemake@params$min_sensitivity, 576 | line_width = 1.2, point_size = 3.5, text_size = 13, 577 | colors = pred_colors) 578 | 579 | # calculate plot dimensions based on number of features 580 | n_features <- length(enh_feat_cols) 581 | plot_height <- ceiling(n_features * 3.5) 582 | 583 | } 584 | ``` 585 | 586 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, 
fig.width=10}
587 | if (length(enh_feat_cols) > 0) {
588 |   
589 |   # print plots for every cell type in tabs
590 |   printTabbedPlots(enh_feat_plots, section_level = "###")
591 |   
592 |   cap <- paste("Precision-Recall curves for subsets of the data based on provided enhancer",
593 |                "features.")
594 |   
595 | }
596 | ```
597 | 
598 | ## Predictor scores versus experimental outcome
599 | 
600 | ### Gene features {.tabset .tabset-pills}
601 | ```{r geneFeaturePredVsExp}
602 | if (length(gene_feat_cols) > 0) {
603 |   
604 |   # create plots showing predictor values as a function of experimental outcome
605 |   pred_plots <- applyCellTypes(merged, .fun = plotPredVsExperimentSubsets,
606 |                                subset_cols = gene_feat_cols, pos_col = pos_col,
607 |                                pred_names_col = "pred_name_long", text_size = 13)
608 |   
609 |   # calculate plot dimensions based on number of predictors
610 |   plot_height <- ceiling(n_distinct(merged$pred_name_long) * length(gene_feat_cols) * 1.5)
611 |   
612 | }
613 | ```
614 | 
615 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=7}
616 | if (length(gene_feat_cols) > 0) {
617 |   
618 |   # print plots for every cell type in tabs
619 |   printTabbedPlots(pred_plots, section_level = "###")
620 |   
621 |   cap <- paste("Predictor scores vs. experimental outcome for all predictors for subsets of the",
622 |                "data based on provided gene features.")
623 |   
624 | }
625 | ```
626 | 
627 | ### Enhancer features {.tabset .tabset-pills}
628 | ```{r enhFeaturePredVsExp}
629 | if (length(enh_feat_cols) > 0) {
630 |   
631 |   # create plots showing predictor values as a function of experimental outcome
632 |   pred_plots <- applyCellTypes(merged, .fun = plotPredVsExperimentSubsets,
633 |                                subset_cols = enh_feat_cols, pos_col = pos_col,
634 |                                pred_names_col = "pred_name_long", text_size = 13)
635 |   
636 |   # calculate plot dimensions based on number of predictors
637 |   plot_height <- ceiling(n_distinct(merged$pred_name_long) * length(enh_feat_cols) * 1.5)
638 |   plot_height <- min(c(plot_height, 170))
639 |   
640 | }
641 | ```
642 | 
643 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=7}
644 | if (length(enh_feat_cols) > 0) {
645 |   
646 |   # print plots for every cell type in tabs
647 |   printTabbedPlots(pred_plots, section_level = "###")
648 |   
649 |   # figure caption
650 |   cap <- paste("Predictor scores vs. experimental outcome for all predictors for subsets of the data",
651 |                "based on provided enhancer features.")
652 |   
653 | }
654 | ```
655 | 
656 | ## Effect size vs predictors
657 | 
658 | ### Gene features {.tabset .tabset-pills}
659 | ```{r geneFeatureEsScatter}
660 | if (length(gene_feat_cols) > 0) {
661 |   
662 |   # create plots showing effect size as function of predictor scores
663 |   es_plots <- applyCellTypes(merged, .fun = predVsEffectSizeSubsets, subset_cols = gene_feat_cols,
664 |                              pos_col = pos_col, pred_names_col = "pred_name_long", point_size = 2,
665 |                              text_size = 16, label.x.npc = 0.65)
666 |   
667 |   # calculate plot dimensions based on number of predictors
668 |   plot_height <- ceiling(length(gene_feat_cols) * 5.75)
669 |   plot_width <- ceiling(n_distinct(merged$pred_name_long) * 2)
670 |   
671 | }
672 | ```
673 | 
674 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
675 | if (length(gene_feat_cols) > 0) {
676 |   
677 |   # print plots for every cell type in tabs
678 |   printTabbedPlots(es_plots, section_level = "###")
679 |   
680 |   # figure caption
681 |   cap <- paste("Effect size vs.
predictor scores for all predictors for subsets of the data based on",
682 |                "provided gene features.")
683 |   
684 | }
685 | ```
686 | 
687 | ### Enhancer features {.tabset .tabset-pills}
688 | ```{r enhFeatureEsScatter}
689 | if (length(enh_feat_cols) > 0) {
690 |   
691 |   # create plots showing effect size as function of predictor scores
692 |   es_plots <- applyCellTypes(merged, .fun = predVsEffectSizeSubsets, subset_cols = enh_feat_cols,
693 |                              pos_col = pos_col, pred_names_col = "pred_name_long", point_size = 2,
694 |                              text_size = 16, label.x.npc = 0.65)
695 |   
696 |   # calculate plot dimensions based on number of predictors
697 |   plot_height <- ceiling(length(enh_feat_cols) * 5.75)
698 |   plot_width <- ceiling(n_distinct(merged$pred_name_long) * 2)
699 |   
700 | }
701 | ```
702 | 
703 | ```{r, results='asis', fig.cap=cap, fig.height=plot_height, fig.width=plot_width}
704 | if (length(enh_feat_cols) > 0) {
705 |   
706 |   # print plots for every cell type in tabs
707 |   printTabbedPlots(es_plots, section_level = "###")
708 |   
709 |   # figure caption
710 |   cap <- paste("Effect size vs. predictor scores for all predictors for subsets of the data based on",
711 |                "provided enhancer features.")
712 |   
713 | }
714 | ```
715 | 
716 | ***
717 | 
718 | # Correlation between predictors
719 | How well predictor scores correlate with each other for E-G pairs in the experimental data is
720 | investigated.
721 | ```{r predCorMatrix}
722 | # create correlation matrix plots for all cell types
723 | corr_matrix_plots <- applyCellTypes(merged, .fun = plotPredCorMatrix,
724 |                                     pred_names_col = "pred_name_long", method = "spearman")
725 | 
726 | # plot dimensions based on number of predictors
727 | n_pred <- n_distinct(pull(filter(merged, boolean == FALSE), pred_uid))
728 | dim <- 4 + (n_pred - 1) / 3
729 | 
730 | # save correlation matrix plots to pdfs
731 | savePlotList(corr_matrix_plots, basename = "predictor_correlation.pdf", path = plotdir,
732 |              height = dim, width = dim)
733 | ```
734 | 
735 | ```{r, results='asis', fig.cap=cap, fig.height=dim, fig.width=dim}
736 | # print plots for every cell type in tabs
737 | printTabbedPlots(corr_matrix_plots, section_level = "#")
738 | 
739 | # figure caption
740 | cap <- paste("Correlation of scores between predictors for experimental E-G pairs.")
741 | ```
742 | 
743 | ***
744 | 
745 | # Properties of the experimental dataset
746 | Different features of the experimental data are investigated.
747 | 
748 | ## Distance to TSS distribution {.tabset .tabset-pills}
749 | ```{r distToTSSDistr}
750 | # create distance to TSS distributions for all cell types
751 | dist_distr <- applyCellTypes(merged, .fun = plotDistanceDistribution, dist = "baseline.distToTSS",
752 |                              convert_dist_kb = TRUE, pos_col = pos_col, text_size = 13)
753 | 
754 | # save plots to pdfs
755 | savePlotList(dist_distr, basename = "distToTSS_distribution.pdf", path = plotdir, height = 5,
756 |              width = 7)
757 | ```
758 | 
759 | ```{r, results='asis', fig.cap=cap, fig.height=5, fig.width=7}
760 | # print plots for every cell type in tabs
761 | printTabbedPlots(dist_distr, section_level = "##")
762 | 
763 | # figure caption
764 | cap <- paste("Distance to TSS distributions for all E-G pairs in experimental data.
E-G pairs are", 765 | "partitioned according to whether they were identified as enhancer-gene interactions (positives)", 766 | "or negatives.") 767 | ``` 768 | 769 | ## Overlapping features {.tabset .tabset-pills} 770 | A plot showing the number of experimentally tested candidate enhancers overlapping provided genomic 771 | features. If no features were provided, this plot is not generated. 772 | 773 | ```{r enhFeatures} 774 | # columns in merged data containing information on overlapping enhancer features 775 | feature_cols <- grep(colnames(merged), pattern = "^enh_feature_.+$", value = TRUE) 776 | 777 | # create upset plots if merged data contains any enhancer feature columns 778 | if (length(feature_cols > 0)) { 779 | 780 | # make upset plots from features overlapping experimentally tested enhancers for each cell type 781 | overlap_plots <- applyCellTypes(merged, .fun = plotOverlappingFeatures, 782 | feature_cols = feature_cols) 783 | 784 | # save plots to pdfs 785 | for (i in names(overlap_plots)) { 786 | pdf(file.path(plotdir, paste0(i, ".overlappingFeatures.pdf")), height = 5, width = 7) 787 | print(overlap_plots[[i]]) 788 | dev.off() 789 | } 790 | } 791 | ``` 792 | 793 | ```{r, results='asis', fig.height=5, fig.width=7, fig.cap=cap} 794 | if (length(feature_cols > 0)) { 795 | 796 | # print plots for every cell type in tabs 797 | printTabbedPlots(overlap_plots, section_level = "##", plot_function = print) 798 | 799 | # figure caption 800 | cap <- paste("Genomic features overlapping experimentally tested enhancers.") 801 | 802 | } 803 | ``` 804 | 805 | *** 806 | 807 | # Sources 808 | * R Markdown source file (to produce this document) 810 | -------------------------------------------------------------------------------- /workflow/scripts/createGenomeBrowserTracks.R: -------------------------------------------------------------------------------- 1 | ## Create genome browser tracks for EPbenchmarking CRISPR dataset 2 | 3 | # save.image("tracks.rda") 4 | # stop() 5 | 6 | # required packages 7 | suppressPackageStartupMessages({ 8 | library(data.table) 9 | library(dplyr) 10 | }) 11 | 12 | ## Define functions -------------------------------------------------------------------------------- 13 | 14 | # function to create crispr tracks for one cell type 15 | create_crispr_tracks <- function(merged, outdir, cell_type) { 16 | 17 | # extract all crispr E-G pairs for the given cell type 18 | crispr <- merged %>% 19 | filter(ExperimentCellType == cell_type) %>% 20 | select(chrom, chromStart, chromEnd, chrTSS, startTSS, endTSS, name, EffectSize, Regulated) %>% 21 | distinct() 22 | 23 | # create bed track with all tested crispr elements 24 | elements_bed <- crispr %>% 25 | mutate(strand = ".", name = paste0(chrom, ":", chromStart, "-", chromEnd), score = 0) %>% 26 | select(chrom, chromStart, chromEnd, name, score, strand) %>% 27 | distinct() 28 | 29 | # create bedpe track for all E-G pairs 30 | bedpe <- crispr %>% 31 | mutate(strand1 = ".", strand2 = ".") %>% 32 | select(chrom1 = chrom, start1 = chromStart, end1 = chromEnd, chrom2 = chrTSS, start2 = startTSS, 33 | end2 = endTSS, name, score = EffectSize, strand1, strand2, Regulated) 34 | 35 | # split into tracks for positives and negatives 36 | pos_bedpe <- select(filter(bedpe, Regulated == TRUE), -Regulated) 37 | neg_bedpe <- select(filter(bedpe, Regulated == FALSE), -Regulated) 38 | 39 | # output file paths 40 | elements_bed_outfile <- file.path(outdir, cell_type, "crispr_elements.bed") 41 | pos_bedpe_outfile <- file.path(outdir, cell_type, 
"crispr_positives.bedpe") 42 | neg_bedpe_outfile <- file.path(outdir, cell_type, "crispr_negatives.bedpe") 43 | 44 | # write tracks to output files 45 | dir.create(file.path(outdir, cell_type), recursive = TRUE, showWarnings = FALSE) 46 | fwrite(elements_bed, file = elements_bed_outfile, sep = "\t", col.names = FALSE, quote = FALSE) 47 | fwrite(pos_bedpe, file = pos_bedpe_outfile, sep = "\t", col.names = FALSE, quote = FALSE) 48 | fwrite(neg_bedpe, file = neg_bedpe_outfile, sep = "\t", col.names = FALSE, quote = FALSE) 49 | 50 | } 51 | 52 | ## Create genome browser tracks -------------------------------------------------------------------- 53 | 54 | # load merged data 55 | merged <- fread(snakemake@input$merged) 56 | 57 | # create CRISPR genome browser tracks for all cell types 58 | for (i in unique(merged$ExperimentCellType)) { 59 | create_crispr_tracks(merged, outdir = snakemake@output[[1]], cell_type = i) 60 | } 61 | -------------------------------------------------------------------------------- /workflow/scripts/createPredConfig.R: -------------------------------------------------------------------------------- 1 | 2 | ## Create a pred_config.txt file with default parameters for comparisons where none is provided 3 | 4 | message("pred_config file not provided, generating one with default parameters.") 5 | 6 | # create pred_config table with default values for each predictor set. this assumes that the column 7 | # with the predictor value is called "Score" and it is treated like a quantitative score, where 8 | # higher values correspond to higher confidence. 9 | pred_config <- data.frame( 10 | pred_id = snakemake@params$pred_names, 11 | pred_col = "Score", 12 | boolean = FALSE, 13 | alpha = NA, 14 | aggregate_function = "sum", 15 | fill_value = 0, 16 | inverse_predictor = FALSE, 17 | pred_name_long = paste0(snakemake@params$pred_names, ".Score"), 18 | color = NA, 19 | plot_crispr = TRUE, 20 | stringsAsFactors = FALSE 21 | ) 22 | 23 | # save to output file 24 | write.table(pred_config, file = snakemake@output[[1]], row.names = FALSE, quote = FALSE, sep = "\t") 25 | -------------------------------------------------------------------------------- /workflow/scripts/crisprComparisonBootstrapFunctions.R: -------------------------------------------------------------------------------- 1 | ## Functions to compute bootstrapped performance metrics (AUPRC and precision @ threshold) and 2 | ## significance for pairwise differences in performance between predictors 3 | 4 | library(boot) 5 | #library(boot.pval) 6 | library(ROCR) 7 | library(caTools) 8 | library(data.table) 9 | library(dplyr) 10 | library(BiocParallel) 11 | 12 | #' Reformat CRISPR benchmarking data for bootstrapping 13 | #' 14 | #' Convert merged data from CRISPR benchmarking pipeline into format required for bootstrapping 15 | #' 16 | #' @param merged Merged data from CRISPR benchmarking pipeline 17 | #' @param pred_config pred_config table used to run CRISPR benchmarking pipeline 18 | #' @param rm_boolean (logical) Should boolean predictors be filtered out? 
(default: TRUE)
19 | convertMergedForBootstrap <- function(merged, pred_config, pos_col = "Regulated",
20 |                                       rm_boolean = TRUE) {
21 |   
22 |   # get inverse predictors
23 |   inverse_preds <- pred_config[pred_config$inverse_predictor == TRUE, ][["pred_uid"]]
24 |   
25 |   # extract relevant columns from merged data
26 |   merged <- select(merged, name, Regulated = all_of(pos_col), pred_uid, pred_value)
27 |   
28 |   # filter out boolean predictors
29 |   if (rm_boolean == TRUE) {
30 |     boolean_preds <- pred_config[pred_config$boolean == TRUE, ][["pred_uid"]]
31 |     merged <- filter(merged, !pred_uid %in% boolean_preds)
32 |   }
33 |   
34 |   # multiply inverse predictors by -1
35 |   merged <- merged %>% 
36 |     mutate(pred_value = if_else(pred_uid %in% inverse_preds,
37 |                                 true = pred_value * -1,
38 |                                 false = pred_value))
39 |   
40 |   # convert to wide format to create output
41 |   output <- pivot_wider(merged, names_from = pred_uid, values_from = pred_value)
42 |   
43 |   return(output)
44 |   
45 | }
46 | 
47 | #' Get predictor threshold values
48 | #' 
49 | #' Extract predictor thresholds from pred_config file and invert for inverse predictors.
50 | #' 
51 | #' @param pred_config pred_config table used to run CRISPR benchmarking pipeline.
52 | #' @param predictors Vector with predictors for which thresholds should be extracted. Default is
53 | #'   NULL, which will extract thresholds for all predictors if possible.
54 | #' @param threshold_col Column name in pred_config containing threshold values (Default: alpha).
55 | getThresholdValues <- function(pred_config, predictors = NULL, threshold_col = "alpha") {
56 |   
57 |   # filter for subset of predictors only if specified
58 |   if (!is.null(predictors)) {
59 |     pred_config <- pred_config[pred_config$pred_uid %in% predictors, ]
60 |   }
61 |   
62 |   # extract defined thresholds and "invert" (*-1) thresholds for inverse predictors
63 |   thresholds <- deframe(select(pred_config, pred_uid, all_of(threshold_col)))
64 |   thresholds[pred_config$inverse_predictor] <- thresholds[pred_config$inverse_predictor] * -1
65 |   
66 |   # only return non-NA thresholds
67 |   thresholds <- thresholds[!is.na(thresholds)]
68 |   
69 |   return(thresholds)
70 |   
71 | }
72 | 
73 | #' Bootstrapped performance confidence intervals
74 | #' 
75 | #' Run bootstraps to compute confidence intervals for performance metrics: AUPRC, or precision
76 | #' or recall at a given threshold.
77 | #' 
78 | #' @param data Data frame containing merged data in wide format. Needs to contain columns named
79 | #'   'name' with a unique identifier for each E-G pair and 'Regulated' identifying positives and
80 | #'   negatives for benchmarking. All other columns are considered to be scores for predictors. Note
81 | #'   that higher scores are assumed to rank higher. Inverse predictors (e.g. distance to TSS) need
82 | #'   to be multiplied by -1. To convert merged data from CRISPR benchmarking to the required format,
83 | #'   use the convertMergedForBootstrap() function.
84 | #' @param metric Performance metric to bootstrap. Can either be 'auprc' for area under the
85 | #'   precision-recall curve, or 'precision' or 'recall' for precision or recall
86 | #'   at the provided thresholds.
87 | #' @param predictors Predictors (columns in data) for which performance should be bootstrapped. A
88 | #'   simple vector or list with names of predictors for which performance should be bootstrapped.
89 | #' @param thresholds Named vector with thresholds for all predictors (e.g. at 70% recall).
90 | #'   Only required if metric is set to 'precision' or 'recall'.
91 | #' @param R Number of bootstrap replicates (default: 10000).
92 | #' @param conf Desired confidence levels for confidence intervals (default: 0.95).
93 | #' @param ci_type Confidence interval type. See ?boot.ci for more information.
94 | #' @param ncpus Specifies how many CPUs should be used for bootstrapping and computing confidence
95 | #'   intervals. If 1, no parallelization is used; if > 1, parallel computing with the specified
96 | #'   number of CPUs is used. Parts of the parallel computing rely on BiocParallel.
97 | bootstrapPerformanceIntervals <- function(data, metric = c("auprc", "precision", "recall"),
98 |                                           predictors = NULL, thresholds = NULL, R = 10000,
99 |                                           conf = 0.95, ncpus = 1,
100 |                                          ci_type = c("perc", "norm", "basic", "bca")) {
101 |   
102 |   # parse input arguments
103 |   metric <- match.arg(metric)
104 |   ci_type <- match.arg(ci_type)
105 |   
106 |   # check that thresholds are provided if precision or recall is bootstrapped
107 |   if (metric %in% c("precision", "recall") & is.null(thresholds)) {
108 |     stop("Thresholds required if bootstrapping precision or recall", call. = FALSE)
109 |   }
110 |   
111 |   # subset data to specified predictors if passed via arguments
112 |   if (!is.null(predictors)) {
113 |     data <- data[, c("name", "Regulated", predictors)]
114 |   }
115 |   
116 |   # set parallel argument for boot function
117 |   parallel <- ifelse(ncpus > 1, yes = "multicore", no = "no")
118 |   
119 |   # bootstrap performance
120 |   message("Running bootstraps...")
121 |   bs_perf <- boot(data, statistic = calculate_performance, metric = metric, R = R,
122 |                   parallel = parallel, ncpus = ncpus, thresholds = thresholds)
123 |   
124 |   # set up parallel backend for computing confidence intervals if specified
125 |   if (ncpus > 1) {
126 |     register(MulticoreParam(workers = ncpus))
127 |   } else {
128 |     register(SerialParam())
129 |   }
130 |   
131 |   # compute confidence intervals for all predictors
132 |   message("Computing confidence intervals...")
133 |   pred_indices <- seq_along(bs_perf$t0)[!is.na(bs_perf$t0)]  # indices of non-NA predictors
134 |   ci <- bplapply(pred_indices, FUN = boot.ci, boot.out = bs_perf, conf = conf,
135 |                  type = ci_type)
136 |   
137 |   # process boot.ci output to make pretty output table
138 |   output <- process_ci(ci, boot = bs_perf, metric = metric)
139 |   
140 |   return(output)
141 |   
142 | }
143 | 
144 | #' Bootstrapped pairwise performance comparisons
145 | #' 
146 | #' Run bootstraps to compute confidence intervals for delta AUPRC, or delta precision or recall at
147 | #' threshold for specified predictor pairs. delta is simply defined as performance predictor 1 -
148 | #' performance predictor 2.
149 | #' 
150 | #' @param data Data frame containing merged data in wide format. Needs to contain columns named
151 | #'   'name' with a unique identifier for each E-G pair and 'Regulated' identifying positives and
152 | #'   negatives for benchmarking. All other columns are considered to be scores for predictors. Note
153 | #'   that higher scores are assumed to rank higher. Inverse predictors (e.g. distance to TSS) need
154 | #'   to be multiplied by -1. To convert merged data from CRISPR benchmarking to the required format,
155 | #'   use the convertMergedForBootstrap() function.
156 | #' @param metric Performance metric to bootstrap. Can either be 'auprc' for area under the
157 | #'   precision-recall curve, or 'precision' or 'recall' for precision or recall
158 | #'   at the provided thresholds.
159 | #' @param comparisons List containing pairwise comparisons of predictors that should be computed
160 | #'   (one pair per element). If 'NULL', all pairwise comparisons between all predictors will be
161 | #'   tested.
162 | #' @param thresholds Named vector with thresholds for all predictors (e.g. at 70% recall).
163 | #'   Only required if metric is set to 'precision' or 'recall'.
164 | #' @param R Number of bootstrap replicates (default: 10000).
165 | #' @param conf Desired confidence levels for confidence intervals (default: 0.95).
166 | #' @param ci_type Confidence interval type. See ?boot.ci for more information.
167 | #' @param ncpus Specifies how many CPUs should be used for bootstrapping and computing confidence
168 | #'   intervals. If 1, no parallelization is used; if > 1, parallel computing with the specified
169 | #'   number of CPUs is used. Parts of the parallel computing rely on BiocParallel.
170 | bootstrapDeltaPerformance <- function(data, metric = c("auprc", "precision", "recall"),
171 |                                       comparisons = NULL, thresholds = NULL, R = 10000, conf = 0.95,
172 |                                       ci_type = c("perc", "norm", "basic", "bca"), ncpus = 1) {
173 |   
174 |   # parse input arguments
175 |   metric <- match.arg(metric)
176 |   ci_type <- match.arg(ci_type)
177 |   
178 |   # check that thresholds are provided if precision or recall is bootstrapped
179 |   if (metric %in% c("precision", "recall") & is.null(thresholds)) {
180 |     stop("Thresholds required if bootstrapping precision or recall", call. = FALSE)
181 |   }
182 |   
183 |   # subset data to predictors in comparisons if specified, else create all pairwise comparisons
184 |   if (!is.null(comparisons)) {
185 |     data <- data[, c("name", "Regulated", unique(unlist(comparisons)))]
186 |   } else {
187 |     comparisons <- combn(setdiff(colnames(data), c("name", "Regulated")), m = 2, simplify = FALSE)
188 |   }
189 |   
190 |   # set names for all comparisons
191 |   names(comparisons) <- vapply(comparisons, FUN = paste, FUN.VALUE = character(1), collapse = " | ")
192 |   
193 |   # set parallel argument for boot function
194 |   parallel <- ifelse(ncpus > 1, yes = "multicore", no = "no")
195 |   
196 |   # bootstrap performance
197 |   message("Running bootstraps...")
198 |   bs_delta <- boot(data, statistic = calc_delta_performance, metric = metric, R = R,
199 |                    parallel = parallel, ncpus = ncpus, thresholds = thresholds,
200 |                    comparisons = comparisons)
201 |   
202 |   # set up parallel backend for computing confidence intervals if specified (useful for 'bca')
203 |   if (ncpus > 1) {
204 |     register(MulticoreParam(workers = ncpus))
205 |   } else {
206 |     register(SerialParam())
207 |   }
208 |   
209 |   # compute confidence intervals for all predictors
210 |   message("Computing confidence intervals...")
211 |   ci <- bplapply(seq_along(bs_delta$t0), FUN = boot.ci, boot.out = bs_delta, conf = conf,
212 |                  type = ci_type)
213 |   
214 |   # process boot.ci output to make pretty output table
215 |   output <- process_ci(ci, boot = bs_delta, metric = paste0("delta_", metric))
216 |   
217 |   # compute p-values under the null hypothesis that delta is 0
218 |   message("Computing p-values...")
219 |   pvalues <- compute_pvalues(bs_delta, type = ci_type, theta_null = 0, pval_precision = NULL)
220 |   output$pvalue <- pvalues[output$id]
221 | 
222 |   return(output)
223 |   
224 | }
225 | 
226 | #' Bootstrapped performance differences between datasets
227 | #' 
228 | #' Run bootstraps to compute confidence intervals for delta AUPRC, or delta precision or recall at
229 | #' threshold between two benchmarking datasets for specified predictors.
delta is simply defined as
230 | #' performance predictor 1 - performance predictor 2.
231 | #' 
232 | #' @param data1,data2 Data frames containing merged data for the two benchmarking datasets in wide
233 | #'   format. Need to contain columns named 'name' with a unique identifier for each E-G pair and
234 | #'   'Regulated' identifying positives and negatives for benchmarking. All other columns are
235 | #'   considered to be scores for predictors. Note that higher scores are assumed to rank higher.
236 | #'   Inverse predictors (e.g. distance to TSS) need to be multiplied by -1. To convert merged data
237 | #'   from CRISPR benchmarking to the required format, use the convertMergedForBootstrap() function.
238 | #' @param metric Performance metric to bootstrap. Can either be 'auprc' for area under the
239 | #'   precision-recall curve, or 'precision' or 'recall' for precision or recall
240 | #'   at the provided thresholds.
241 | #' @param predictors Predictors (columns in data) for which performance differences should be
242 | #'   computed. A simple vector or list with names of predictors to include. If not specified, the
243 | #'   intersect between predictors in data1 and data2 will be used.
244 | #' @param thresholds Named vector with thresholds for all predictors (e.g. at 70% recall).
245 | #'   Only required if metric is set to 'precision' or 'recall'.
246 | #' @param R Number of bootstrap replicates (default: 10000).
247 | #' @param conf Desired confidence levels for confidence intervals (default: 0.95).
248 | #' @param ci_type Confidence interval type. See ?boot.ci for more information.
249 | #' @param ncpus Specifies how many CPUs should be used for bootstrapping and computing confidence
250 | #'   intervals. If 1, no parallelization is used; if > 1, parallel computing with the specified
251 | #'   number of CPUs is used. Parts of the parallel computing rely on BiocParallel.
252 | bootstrapDeltaPerformanceDatasets <- function(data1, data2,
253 |                                               metric = c("auprc", "precision", "recall"),
254 |                                               predictors = NULL, thresholds = NULL, R = 10000,
255 |                                               conf = 0.95, ncpus = 1,
256 |                                               ci_type = c("perc", "norm", "basic", "bca")) {
257 |   
258 |   # parse input arguments
259 |   metric <- match.arg(metric)
260 |   ci_type <- match.arg(ci_type)
261 |   
262 |   # check that thresholds are provided if precision or recall is bootstrapped
263 |   if (metric %in% c("precision", "recall") & is.null(thresholds)) {
264 |     stop("Thresholds required if bootstrapping precision or recall", call.
= FALSE) 265 | } 266 | 267 | # subset data to specified predictors if specified, else retain predictors shared across datasets 268 | if (is.null(predictors)) { 269 | predictors <- setdiff(intersect(colnames(data1), colnames(data2)), c("name", "Regulated")) 270 | } 271 | data1 <- data1[, c("name", "Regulated", predictors)] 272 | data2 <- data2[, c("name", "Regulated", predictors)] 273 | 274 | # combine both datasets into one table and add column specifying dataset for each E-G pair 275 | data <- bind_rows(data1, data2, .id = "dataset") 276 | 277 | # set parallel argument for boot function 278 | parallel <- ifelse(ncpus > 1, yes = "multicore", no = "no") 279 | 280 | # bootstrap performance 281 | message("Running bootstraps...") 282 | bs_delta <- boot(data, statistic = calc_delta_performance_datasets, metric = metric, R = R, 283 | strata = data$dataset, parallel = parallel, ncpus = ncpus, 284 | thresholds = thresholds) 285 | 286 | # set up parallel backend for computing confidence intervals if specified (useful for 'bca') 287 | if (ncpus > 1) { 288 | register(MulticoreParam(workers = ncpus)) 289 | } else { 290 | register(SerialParam()) 291 | } 292 | 293 | # compute confidence intervals for all predictors 294 | message("Computing confidence intervals...") 295 | ci <- bplapply(seq_along(bs_delta$t0), FUN = boot.ci, boot.out = bs_delta, conf = conf, 296 | type = ci_type) 297 | 298 | # process boot.ci output to make pretty output table 299 | output <- process_ci(ci, boot = bs_delta, metric = paste0("delta_", metric)) 300 | 301 | # compute p-values under the null hypothesis that delta is 0 302 | message("Computing p-values...") 303 | pvalues <- compute_pvalues(bs_delta, type = ci_type, theta_null = 0, pval_precision = NULL) 304 | output$pvalue <- pvalues[output$id] 305 | 306 | return(output) 307 | 308 | } 309 | 310 | #' Plot bootstrapped performance / delta performance 311 | #' 312 | #' @param results Data frame containing bootstrapped performance metrics or delta performance. 313 | #' @param title (optional) Main title for plot. 
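#' @examples
#' \dontrun{
#' # minimal usage sketch (object names are illustrative, not pipeline outputs):
#' bs_data <- convertMergedForBootstrap(merged, pred_config)
#' auprc_ci <- bootstrapPerformanceIntervals(bs_data, metric = "auprc", R = 1000)
#' plotBootstrappedIntervals(auprc_ci, title = "Bootstrapped AUPRC")
#' }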
314 | plotBootstrappedIntervals <- function(results, title = NULL) {
315 |   
316 |   # set id to factor ordered by full-data metric and add column specifying whether CI excludes 0
317 |   results <- results %>% 
318 |     mutate(id = fct_reorder(id, .x = full)) %>% 
319 |     mutate(diff_zero = if_else(lower <= 0 & upper >= 0, true = FALSE, false = TRUE))
320 |   
321 |   # create default main title
322 |   default_title <- switch(unique(results$metric), "auprc" = "AUPRC", "precision" = "Precision",
323 |                           "recall" = "Recall", "delta_auprc" = "Delta AUPRC", "delta_precision" = "Delta Precision", "delta_recall" = "Delta Recall")
324 |   
325 |   # create axis title
326 |   if (unique(results$metric) %in% c("delta_auprc", "delta_precision", "delta_recall")) {
327 |     axis_title <- paste(default_title, "[first predictor - second predictor]")
328 |   } else {
329 |     axis_title <- default_title
330 |   }
331 |   
332 |   # create title for color/fill legend
333 |   color_title <- paste0(unique(results$conf) * 100, "% interval\ndifferent from 0")
334 |   
335 |   # use default main title unless manually specified otherwise
336 |   if (is.null(title)) title <- default_title
337 |   
338 |   # plot mean delta, 95% intervals and range for all comparisons
339 |   ggplot(results, aes(x = id, y = full)) +
340 |     geom_hline(yintercept = 0, linetype = "dashed") +
341 |     geom_errorbar(aes(ymin = lower, ymax = upper, color = diff_zero), linewidth = 1.5, width = 0) +
342 |     geom_errorbar(aes(ymin = min, ymax = max, color = diff_zero), linewidth = 0.5, width = 0) +
343 |     geom_point(aes(color = diff_zero, fill = diff_zero), shape = 23, size = 2) +
344 |     labs(title = title, y = axis_title, color = color_title, fill = color_title) +
345 |     coord_flip() +
346 |     scale_color_manual(values = c("FALSE" = "gray55", "TRUE" = "black")) +
347 |     scale_fill_manual(values = c("FALSE" = "gray55", "TRUE" = "red")) +
348 |     theme_bw() +
349 |     theme(axis.title.y = element_blank())
350 |   
351 | }
352 | 
353 | ## FUNCTIONS TO COMPUTE PERFORMANCE AND DELTA PERFORMANCE ==========================================
354 | 
355 | # calculate performance: AUPRC, or precision or recall at threshold
356 | calculate_performance <- function(data, indices, metric, thresholds) {
357 |   
358 |   # select bootstrap sample
359 |   data <- data[indices, ]
360 |   
361 |   # get all predictors in input data
362 |   preds <- setdiff(colnames(data), c("name", "Regulated", "dataset"))
363 |   
364 |   # if thresholds are provided, get thresholds for these predictors else create NA thresholds
365 |   if (!is.null(thresholds)) {
366 |     thresholds <- thresholds[preds]
367 |   } else {
368 |     thresholds <- rep_len(NA_real_, length(preds))
369 |   }
370 |   
371 |   # calculate performance for all predictors
372 |   performance <- mapply(FUN = calculate_performance_one_pred, pred = preds, threshold = thresholds,
373 |                         MoreArgs = list(data = data, metric = metric), SIMPLIFY = TRUE)
374 |   
375 |   return(performance)
376 |   
377 | }
378 | 
379 | # function to calculate delta AUPRC, or precision or recall at threshold between pairwise
380 | # predictor combinations
381 | calc_delta_performance <- function(data, indices, metric, thresholds, comparisons) {
382 |   
383 |   # calculate bootstrapped performance
384 |   perf <- calculate_performance(data, indices = indices, metric = metric, thresholds = thresholds)
385 |   
386 |   # calculate delta performance for all specified comparisons
387 |   delta_perf <- vapply(comparisons, FUN = function(comp, perf) {
388 |     perf[[comp[[1]]]] - perf[[comp[[2]]]]
389 |   }, perf = perf, FUN.VALUE = numeric(1))
390 |   
391 |   return(delta_perf)
392 |   
393 | }
394 | 
395 | # function to calculate delta AUPRC, or
precision or recall at threshold between 2 datasets 396 | calc_delta_performance_datasets <- function(data, indices, metric, thresholds) { 397 | 398 | # select bootstrap sample 399 | data <- data[indices, ] 400 | 401 | # calculate bootstrapped performance for both stratifications 402 | data1 <- data[data$dataset == "1", ] 403 | data2 <- data[data$dataset == "2", ] 404 | perf1 <- calculate_performance(data1, indices = seq_len(nrow(data1)), metric = metric, 405 | thresholds = thresholds) 406 | perf2 <- calculate_performance(data2, indices = seq_len(nrow(data2)), metric = metric, 407 | thresholds = thresholds) 408 | 409 | # calculate delta performance for all predictors 410 | delta_perf <- perf1 - perf2 411 | 412 | return(delta_perf) 413 | 414 | } 415 | 416 | 417 | ## HELPER FUNCTIONS ================================================================================ 418 | 419 | # calculate performance auprc, or precision or recall at threshold for one predictor 420 | calculate_performance_one_pred <- function(data, pred, threshold, metric) { 421 | 422 | # return NA if 'Regulated' column does not contain at least one positive and negative 423 | if (length(unique(data$Regulated)) != 2) { 424 | warning("Not both positives and negatives ('Regulated') in bootstrap sample. Returning 'NA'.", 425 | call. = FALSE) 426 | return(NA_real_) 427 | } 428 | 429 | # compute precision-recall curve 430 | pr <- performance(prediction(data[[pred]], data$Regulated), measure = "prec", x.measure = "rec") 431 | 432 | # convert to data.frame 433 | pr <- data.frame( 434 | alpha = pr@alpha.values[[1]], 435 | precision = pr@y.values[[1]], 436 | recall = pr@x.values[[1]] 437 | ) 438 | 439 | # calculate AUPRC, or precision or recall at threshold performance 440 | if (metric %in% c("precision", "recall")) { 441 | performance <- calculate_performance_at_threshold(pr, threshold = threshold, metric = metric) 442 | } else if (metric == "auprc") { 443 | performance <- calculate_auprc(pr) 444 | } else { 445 | stop("Invalid 'metric' argument", call. = FALSE) 446 | } 447 | 448 | return(performance) 449 | 450 | } 451 | 452 | # calculate area-under-the-precision-recall-curve (AUPRC) 453 | calculate_auprc <- function(pr) { 454 | 455 | # the head() calls here remove the last element of the vector. 456 | # The point is that performance objects produced by ROCR always include a Recall = 100% point even 457 | # if the predictor cannot achieve a recall of 100%. This results in a straight line ending at 458 | # (1,0) on the PR curve. This should not be included in the performance computation. 
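  # e.g. with recall = c(0, 0.5, 0.8, 1) and precision = c(1, 0.6, 0.5, 0) (illustrative values),
  # dropping the final row removes the artificial (1, 0) end point before the area is computed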
459 |   pr <- head(pr, -1)
460 |   
461 |   # compute auprc
462 |   auprc <- compute_auc(x_vals = pr$recall, y_vals = pr$precision)
463 |   
464 |   return(auprc)
465 |   
466 | }
467 | 
468 | # try to compute area under the curve
469 | compute_auc <- function(x_vals, y_vals) {
470 |   good.idx <- which(!is.na(x_vals) & !is.na(y_vals))
471 |   if (length(good.idx) > 0) {
472 |     auc <- trapz(x_vals[good.idx], y_vals[good.idx])
473 |   } else {
474 |     auc <- NA_real_
475 |   }
476 |   return(auc)
477 | }
478 | 
479 | # calculate precision or recall at a given threshold
480 | calculate_performance_at_threshold <- function(pr, threshold, metric) {
481 |   
482 |   # get index of highest alpha value that is larger than or equal to the threshold
483 |   idx <- sum(pr$alpha >= threshold)
484 |   
485 |   # get precision or recall at this alpha value
486 |   perf_at_threshold <- pr[[metric]][[idx]]
487 |   
488 |   return(perf_at_threshold)
489 |   
490 | }
491 | 
492 | # function to extract the full value and upper and lower CI boundaries for a given bootci object
493 | extract_ci <- function(bootci) {
494 |   
495 |   # get full data metric
496 |   full <- bootci$t0
497 |   
498 |   # get CI data for given CI type
499 |   ci <- bootci[[4]]
500 |   
501 |   # assemble output data.frame
502 |   output <- data.frame(id = names(full), full = full, conf = ci[[1]], lower = ci[[length(ci) - 1]],
503 |                        upper = ci[[length(ci)]], row.names = NULL)
504 |   
505 |   return(output)
506 |   
507 | }
508 | 
509 | # function to extract full range of bootstrapped metric values
510 | extract_range <- function(boot) {
511 |   
512 |   # calculate range of bootstrapped metric
513 |   range <- apply(boot$t, MARGIN = 2, FUN = range, na.rm = TRUE)
514 |   rownames(range) <- c("min", "max")
515 |   colnames(range) <- names(boot$t0)
516 |   
517 |   # transpose and make names a column
518 |   range <- as.data.frame(t(range))
519 |   range$id <- rownames(range)
520 |   rownames(range) <- NULL
521 |   
522 |   return(range)
523 |   
524 | }
525 | 
526 | # process output from boot.ci function and calculate absolute range of values
527 | process_ci <- function(ci, boot, metric) {
528 |   
529 |   # extract relevant information for confidence intervals and create output data.frame
530 |   ci <- lapply(ci, FUN = extract_ci)
531 |   ci <- rbindlist(ci)
532 |   
533 |   # extract full range of bootstrapped metric
534 |   range <- extract_range(boot)
535 |   
536 |   # add range to confidence intervals to create output
537 |   output <- merge(ci, range, by = "id")
538 |   
539 |   # add metric and rearrange columns
540 |   output <- mutate(output, metric = metric, .after = 1)
541 |   
542 |   return(output)
543 |   
544 | }
545 | 
546 | # compute p-values from bootstrapping results
547 | compute_pvalues <- function(boot, type, theta_null, pval_precision) {
548 |   
549 |   # get indices for different bootstraps
550 |   indices <- structure(seq_along(boot$t0), names = names(boot$t0))
551 |   
552 |   # compute p-values from boot object
553 |   pvals <- bplapply(indices, FUN = boot.pval, boot_res = boot, type = type, theta_null = theta_null,
554 |                     pval_precision = pval_precision)
555 |   
556 |   # convert from list to simple vector
557 |   pvals <- unlist(pvals)
558 |   
559 |   return(pvals)
560 |   
561 | }
562 | 
563 | # function to compute p-values from boot objects. taken from the boot.pval package, which is not on
564 | # conda yet: https://github.com/mthulin/boot.pval/blob/main/R/boot.pval.R
565 | boot.pval <- function(boot_res,
566 |                       type = "perc",
567 |                       theta_null = 0,
568 |                       pval_precision = NULL,
569 |                       ...)
{
570 | 
571 |   if(is.null(pval_precision)) { pval_precision = 1/boot_res$R }
572 | 
573 |   # create a sequence of alphas:
574 |   alpha_seq <- seq(1e-16, 1-1e-16, pval_precision)
575 | 
576 |   # compute the 1-alpha confidence intervals, and extract their bounds:
577 |   ci <- suppressWarnings(boot::boot.ci(boot_res,
578 |                                        conf = 1- alpha_seq,
579 |                                        type = type,
580 |                                        ...))
581 | 
582 |   bounds <- switch(type,
583 |                    norm = ci$normal[,2:3],
584 |                    basic = ci$basic[,4:5],
585 |                    stud = ci$student[,4:5],
586 |                    perc = ci$percent[,4:5],
587 |                    bca = ci$bca[,4:5])
588 | 
589 |   # find the smallest alpha such that theta_null is not contained in the 1-alpha
590 |   # confidence interval:
591 |   alpha <- alpha_seq[which.min(theta_null >= bounds[,1] & theta_null <= bounds[,2])]
592 | 
593 |   # return the p-value:
594 |   return(alpha)
595 | 
596 | }
597 | 
--------------------------------------------------------------------------------
/workflow/scripts/crisprComparisonLoadInputData.R:
--------------------------------------------------------------------------------
 1 | ## Functions to load, process and QC benchmarking pipeline input data files
 2 | 
 3 | library(data.table)
 4 | library(dplyr)
 5 | 
 6 | #' Import predictor config
 7 | #' 
 8 | #' Import predictor config file specifying parameters for each predictor in the benchmark and add
 9 | #' all required columns and baseline predictors.
10 | #' 
11 | #' @param pred_config_file Path to the predictor config file.
12 | #' @param expr (logical) Should baseline predictors use information on whether genes are expressed?
13 | #'   Can later be switched using `toggleBaselinePredictors`.
14 | #' @param include_col Column containing information on which predictors to include in the benchmark
15 | #'   (TRUE/FALSE). Default column name is 'include', which is added if missing from the predictor
16 | #'   config file, with all predictors set to be included in the benchmark.
17 | #' @param filter (logical) Specifying whether the predictor config file should be filtered to
18 | #'   predictors included in the benchmark only (default: TRUE).
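#' @examples
#' \dontrun{
#' # minimal usage sketch (file path refers to the example config shipped in this repository):
#' pred_config <- importPredConfig("resources/example/pred_config.txt", expr = FALSE)
#' }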
19 | importPredConfig <- function(pred_config_file, expr = FALSE, include_col = "include",
20 |                              filter = TRUE) {
21 |   
22 |   # classes of standard columns in pred_config file
23 |   cols <- c("pred_id" = "character", "pred_col" = "character", "boolean" = "logical",
24 |             "alpha" = "numeric", "aggregate_function" = "character", "fill_value" = "numeric",
25 |             "inverse_predictor" = "logical", "pred_name_long" = "character", "color" = "character")
26 |   
27 |   # load pred_config file and subset to required columns
28 |   pred_config <- fread(pred_config_file, colClasses = cols)
29 |   select_cols <- c(names(cols), intersect(include_col, colnames(pred_config)))
30 |   pred_config <- unique(pred_config[, ..select_cols])
31 |   
32 |   # rename include_col or add default if missing from pred_config
33 |   if (include_col %in% colnames(pred_config)) {
34 |     colnames(pred_config)[colnames(pred_config) == include_col] <- "include"
35 |   } else {
36 |     pred_config$include <- TRUE
37 |   }
38 |   
39 |   # add unique identifier (pred_uid) for each predictor
40 |   pred_config$pred_uid <- paste(pred_config$pred_id, pred_config$pred_col, sep = ".")
41 |   
42 |   # create default baseline predictors config if needed
43 |   baseline_config <- create_baseline_pred_config(expr)
44 |   
45 |   # if no colors are specified for any of the predictors to include in the benchmark, set colors
46 |   # of all default baseline predictors to NA as well
47 |   include_preds <- pred_config[pred_config$include == TRUE, ]
48 |   if (all(is.na(include_preds$color)) == TRUE) {
49 |     baseline_config$color <- NA_character_
50 |   }
51 |   
52 |   # config for baseline predictors can be set in pred_config, so only add default values for
53 |   # baseline predictors not specified in pred config
54 |   add_baseline_preds <- setdiff(baseline_config$pred_uid, pred_config$pred_uid)
55 |   pred_config <- rbind(pred_config, subset(baseline_config, pred_uid %in% add_baseline_preds))
56 |   
57 |   # if specified, filter out any predictors not included in benchmark
58 |   if (filter == TRUE) {
59 |     pred_config <- pred_config[pred_config$include == TRUE, ]
60 |   }
61 |   
62 |   # check that generated pred_uid names are unique and all colors are valid color names/codes
63 |   check_unique_identifier(pred_config, col = "pred_uid")
64 |   check_colors(pred_config)
65 |   
66 |   return(pred_config)
67 |   
68 | }
69 | 
70 | #' Import experimental CRISPR data
71 | #' 
72 | #' Import CRISPR data used as ground truth for benchmarks.
73 | #' @param expt_file Path to experimental CRISPR data.
74 | loadExpt <- function(expt_file, showProgress = FALSE) {
75 |   
76 |   message("Reading CRISPR data in: ", expt_file)
77 |   
78 |   # load CRISPR data with correct class for 'ValidConnection' column
79 |   expt <- fread(file = expt_file, showProgress = showProgress,
80 |                 colClasses = c("ValidConnection" = "character"))
81 |   
82 |   message("\tLoaded CRISPR data for ", nrow(expt), " E-G pairs\n")
83 |   
84 |   return(expt)
85 |   
86 | }
87 | 
88 | #' Load predictions
89 | #' 
90 | #' Load input prediction files and create a list of tables containing data for all predictors.
91 | #' 
92 | #' @param pred_files Named list containing all input prediction files.
93 | #' @param format Predictor file format to select the correct import function (Default: generic).
94 | #'   Either a single string specifying the format of all files, or a vector of equal length as
95 | #'   pred_files specifying the format of each file individually.
96 | #' @param show_progress (logical) Should detailed loading progress be shown?
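#' @examples
#' \dontrun{
#' # minimal usage sketch (predictor name is illustrative; the file is the shipped example):
#' pred_files <- list(ABC = "resources/example/ABC_K562_Fulco2019Genes_GRCh38.tsv.gz")
#' preds <- loadPredictions(pred_files, format = "generic")
#' }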
97 | loadPredictions <- function(pred_files, format = "generic", show_progress = FALSE) { 98 | 99 | # check format argument 100 | possible_formats <- c("generic", "ENCODE", "IGVF") 101 | format <- process_format_arg(format, pred_files, possible_formats = possible_formats) 102 | 103 | # pick import functions based on specified predictors file format 104 | load_functions <- case_match( 105 | format, 106 | "generic" ~ "fread", 107 | "ENCODE" ~ "load_encode_pred_file", 108 | "IGVF" ~ "load_igvf_pred_file" 109 | ) 110 | 111 | message("Loading prediction files:") 112 | 113 | # import all pred_files as a list using the correct load function for each format 114 | pred_ids <- structure(names(pred_files), names = names(pred_files)) 115 | preds <- mapply(FUN = load_pred_files_predictor, pred_ids, load_functions, 116 | MoreArgs = list(pred_files = pred_files, show_progress = show_progress), 117 | SIMPLIFY = FALSE) 118 | 119 | return(preds) 120 | 121 | } 122 | 123 | #' Filter predictions for TSS overlaps 124 | #' 125 | #' Filter out any predictions, where elements overlap annotated gene TSS 126 | #' 127 | #' @param pred_list List containing predictions for all predictors. 128 | #' @param tss_annot Table containing TSS annotations from tss_universe file. 129 | #' @param summary_file Path to file where number of filtered out elements and E-G pairs will be 130 | #' reported. 131 | filterPredictionsTSS <- function(pred_list, tss_annot, summary_file) { 132 | 133 | # create a GRanges object containing TSS coordinates for filtering 134 | tss <- makeGRangesFromDataFrame(tss_annot, seqnames.field = "chrTSS", start.field = "startTSS", 135 | end.field = "endTSS", starts.in.df.are.0based = TRUE) 136 | 137 | # filter all predictions based on TSS overlaps 138 | message("Filtering out predictions overlapping gene TSS for:") 139 | pred_ids <- structure(names(pred_list), names = names(pred_list)) 140 | pred_list <- lapply(pred_ids, FUN = function(x, pred_list, tss) { 141 | message("\t", x) 142 | lapply(pred_list[[x]], FUN = filter_pred_tss, tss = tss) 143 | }, pred_list = pred_list, tss = tss) 144 | 145 | # extract TSS filtering results and combined into one table 146 | tss_filter <- lapply(pred_list, FUN = lapply, `[[`, 2) 147 | tss_filter <- rbindlist(lapply(tss_filter, FUN = rbindlist, idcol = "file"), idcol = "pred_id") 148 | 149 | # write TSS filtering stats to summary file 150 | fwrite(tss_filter, file = summary_file, sep = "\t", quote = FALSE, na = "NA") 151 | 152 | # remove filter from 'preds' list 153 | pred_list <- lapply(pred_list, FUN = lapply, `[[`, 1) 154 | 155 | return(pred_list) 156 | 157 | } 158 | 159 | #' Toggle baseline predictors on or off for benchmarks 160 | #' 161 | #' Set internal baseline predictors on predictor config table on or off to include in benchmarks. 162 | #' 163 | #' @param pred_config Table containing predictor config information. 
164 | #' @param on,off Vectors specifying which baseline predictors should be turned on or off 165 | toggleBaselinePredictors <- function(pred_config, on = NULL, off = NULL) { 166 | 167 | # parse input arguments 168 | preds <- c("distToTSS", "distToGene", "nearestTSS", "nearestGene", "within100kbTSS", 169 | "nearestExprTSS", "nearestExprGene", "within100kbExprTSS") 170 | if(!is.null(on)) on <- match.arg(on, choices = preds, several.ok = TRUE) 171 | if(!is.null(off)) off <- match.arg(off, choices = preds, several.ok = TRUE) 172 | 173 | # toggle on of off specific baseline predictors 174 | pred_config[pred_config$pred_col %in% on, "include"] <- TRUE 175 | pred_config[pred_config$pred_col %in% off, "include"] <- FALSE 176 | 177 | return(pred_config) 178 | 179 | } 180 | 181 | 182 | #' Load information on expressed genes 183 | #' 184 | #' Load files containing information on which genes are expressed in different cell types. 185 | #' 186 | #' @param ... Paths to one or more input files containing information on which genes are expressed. 187 | #' If more than one file are passed, these are simply concatenated. 188 | loadGeneExpr <- function(...) { 189 | expr <- rbindlist(lapply(..., FUN = fread)) 190 | return(expr) 191 | } 192 | 193 | ## Functions to load predictions from different formats -------------------------------------------- 194 | 195 | # load one predictor file in ENCODE format (same as generic with optional "#" for header row) 196 | load_encode_pred_file <- function(file, showProgress) { 197 | 198 | # load predictions and remove optional "#" in header row 199 | pred <- fread(file) 200 | colnames(pred)[[1]] <- sub("^#", "", colnames(pred)[[1]]) 201 | 202 | return(pred) 203 | 204 | } 205 | 206 | # load one predictor file in igvf format and convert to generic format 207 | load_igvf_pred_file <- function(file, showProgress) { 208 | 209 | # get header lines and extract biosample name (BiosampleTermName) 210 | header <- grep(readLines(file, n = 1000), pattern = "^#", value = TRUE) 211 | biosample <- grep(header, pattern = "BiosampleTermName", value = 1) 212 | biosample <- sub(".*BiosampleTermName:[ ]*", "", biosample) 213 | 214 | # load predictions table and skip header lines (cautious approach, fread() should skip '#' lines) 215 | pred <- fread(file, skip = length(header), showProgress = showProgress) 216 | 217 | # add BiosampleTermName column if missing 218 | if (!"BiosampleTermName" %in% colnames(pred)) { 219 | pred$BiosampleTermName <- biosample 220 | } 221 | 222 | # select relevant columns for benchmarking pipeline 223 | base_cols <- c("ElementChr", "ElementStart", "ElementEnd", "ElementName", "ElementClass", 224 | "GeneSymbol", "GeneEnsemblID", "GeneTSS", "BiosampleTermName") 225 | score_cols <- setdiff(colnames(pred), base_cols) 226 | select_cols <- c(base_cols[c(1:4, 6, 9)], score_cols) 227 | pred <- pred[, ..select_cols] 228 | 229 | # rename column to CRISPR benchmark internal columns 230 | colnames(pred)[1:6] <- c("chr", "start", "end", "name", "TargetGene", "CellType") 231 | 232 | return(pred) 233 | 234 | } 235 | 236 | ## Helper functions -------------------------------------------------------------------------------- 237 | 238 | # add baseline predictors to pred config table 239 | create_baseline_pred_config <- function(expr) { 240 | 241 | # all internal baseline identifiers 242 | baseline_preds <- c("distToTSS", "distToGene", "nearestTSS", "nearestGene", "within100kbTSS", 243 | "nearestExprTSS", "nearestExprGene", "within100kbExprTSS") 244 | baseline_preds_uid <- 
paste("baseline", baseline_preds, sep = ".") 245 | 246 | # create pred_config table 247 | baseline_config <- data.table( 248 | pred_id = "baseline", 249 | pred_col = baseline_preds, 250 | boolean = c(FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), 251 | alpha = c(NA_real_, 1, 1, 1, 1, 1, 1, 1), 252 | aggregate_function = c("mean", "max", "max", "max", "max", "max", "max", "max"), 253 | fill_value = c(Inf, 0, 0, 0, 0, 0, 0, 0), 254 | inverse_predictor = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), 255 | pred_name_long = c("Distance to TSS", "Distance to Gene", "Nearest TSS", "Nearest gene", 256 | "Within 100kb of TSS", "Nearest expr. TSS", "Nearest expr. gene", 257 | "Within 100kb of expr. TSS"), 258 | color = c("#ffa600", "#ffa600", "#595959", "#bebebe", "#000000", "#595959", "#bebebe", 259 | "#000000"), 260 | include = c(TRUE, FALSE, rep(NA, times = 6)), 261 | pred_uid = baseline_preds_uid 262 | ) 263 | 264 | # set default include column values based on expr argument, which specifies whether baseline 265 | # predictors based on expressed genes should be included or not 266 | expr_baselines <- c("nearestExprTSS", "nearestExprGene", "within100kbExprTSS") 267 | non_expr_baselines <- c("nearestTSS", "nearestGene", "within100kbTSS") 268 | if (expr == TRUE) { 269 | baseline_config <- toggleBaselinePredictors(baseline_config, on = expr_baselines, 270 | off = non_expr_baselines) 271 | } else { 272 | baseline_config <- toggleBaselinePredictors(baseline_config, on = non_expr_baselines, 273 | off = expr_baselines) 274 | } 275 | 276 | return(baseline_config) 277 | 278 | } 279 | 280 | # check that a given column is a unique identifier in a predictor config file 281 | check_unique_identifier <- function(pred_config, col) { 282 | if (any(table(pred_config[[col]]) > 1)) { 283 | stop("'", col, "' in pred_config_file is not a unique identifier.", call. = FALSE) 284 | } 285 | } 286 | 287 | # check if colors in pred_config file are valid color ids 288 | check_colors <- function(pred_config) { 289 | 290 | # check that colors are valid options 291 | valid_colors <- vapply(pred_config$color, FUN = function(col) { 292 | tryCatch(expr = is.matrix(col2rgb(col)), 293 | error = function(err) return(FALSE)) 294 | }, FUN.VALUE = logical(1)) 295 | 296 | # raise error if invalid color specification was found 297 | invalid_colors <- names(valid_colors[valid_colors == FALSE]) 298 | if (length(invalid_colors > 0)) { 299 | stop("Invalid color specification(s): ", paste(invalid_colors, collapse = ", "), call. = FALSE) 300 | } 301 | 302 | } 303 | 304 | # check if file format argument is valid 305 | process_format_arg <- function(format, pred_files, possible_formats) { 306 | 307 | # check that all format values are valid 308 | invalid_formats <- setdiff(format, possible_formats) 309 | if (length(invalid_formats) > 0) { 310 | stop("Invalid prediction file format(s): ", paste(invalid_formats, collapse = ", "), 311 | call. = FALSE) 312 | } 313 | 314 | # if format is a list of formats per file, check that number is correct, else create vector with 315 | # with the same format for each file 316 | if (length(format) == 1) { 317 | format <- rep_len(format, length(pred_files)) 318 | } else if (length(format) != length(pred_files)) { 319 | stop("'format' needs to be either one value or a vector of equal length as 'pred_files'", 320 | call. 
= FALSE) 321 | } 322 | 323 | return(format) 324 | 325 | } 326 | 327 | # load all prediction files for one predictor provided via pred_id 328 | load_pred_files_predictor <- function(pred_id, load_function, pred_files, show_progress) { 329 | 330 | message("\tLoading predictions for: ", pred_id) 331 | 332 | # get load function specified by load_function string 333 | load_function <- get(load_function) 334 | 335 | # load all files for the given predictor 336 | files <- structure(pred_files[[pred_id]], names = pred_files[[pred_id]]) 337 | preds <- lapply(files, FUN = function(file, show_progress) { 338 | message("\t\tReading: ", file) 339 | load_function(file, showProgress = show_progress) 340 | }, show_progress = show_progress) 341 | 342 | # report the total number of loaded E-G pairs 343 | n_pairs <- sum(vapply(preds, FUN = nrow, FUN.VALUE = integer(1))) 344 | message("\t\tLoaded predictions for ", n_pairs, " E-G pairs") 345 | 346 | return(preds) 347 | } 348 | 349 | # filter one prediction file for elements not overlapping any provided TSS 350 | filter_pred_tss <- function(pred, tss) { 351 | 352 | # get unique element coordinates and names for predictions 353 | element_cols <- c("chr", "start", "end", "name") 354 | elements <- unique(pred[, ..element_cols]) 355 | 356 | # create GRanges object for both elements and TSSs 357 | elements <- makeGRangesFromDataFrame(elements, seqnames.field = element_cols[[1]], 358 | keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE) 359 | 360 | # filter out elements that are overlapping any TSS 361 | elements_filt <- subsetByOverlaps(elements, tss, invert = TRUE) 362 | pred_filt <- pred[pred$name %in% elements_filt$name, ] 363 | 364 | # get the number of filtered out elements and predictions 365 | filter <- data.table(removed_elements = length(elements) - length(elements_filt), 366 | removed_predictions = nrow(pred) - nrow(pred_filt)) 367 | 368 | return(list(pred_filt, filter)) 369 | 370 | } 371 | -------------------------------------------------------------------------------- /workflow/scripts/crisprComparisonMergeFunctions.R: -------------------------------------------------------------------------------- 1 | 2 | # required packages for functions in this file 3 | library(data.table) 4 | library(GenomicRanges) 5 | library(dplyr) 6 | 7 | 8 | # check format of an experiment dataset and report any issues 9 | qcExperiment <- function(expt, pos_col, remove_na_pos = FALSE) { 10 | 11 | message("Running QC on experimental data:") 12 | 13 | # make sure that all column names are valid 14 | illegal_cols <- c("ExperimentCellType") 15 | illegal_expt_cols <- intersect(colnames(expt), illegal_cols) 16 | if (length(illegal_expt_cols) > 0) { 17 | stop("Illegal columns in experiment: ", paste(illegal_expt_cols, collapse = ", "), 18 | call. = FALSE) 19 | } 20 | 21 | # check for duplicate perturbation-gene pairs 22 | expt_filt <- subset(expt, ValidConnection == "TRUE") 23 | eg_id_cols <- c("CellType", "measuredGeneSymbol", "chrom", "chromStart", "chromEnd") 24 | duplicates <- duplicated(expt_filt[, ..eg_id_cols]) 25 | if (any(duplicates)) { 26 | stop("The experimental data file contains duplicate perturbation - gene pairs", call. = FALSE) 27 | } 28 | 29 | # check to make sure regulated column contains TRUE/FALSE 30 | pos_vals <- unique(expt[[pos_col]]) 31 | if (!all(pos_vals %in% c(FALSE, TRUE, NA))) { 32 | stop("The experimental data column must contain TRUE/FALSE or 1/0", call. 
= FALSE)
33 |   }
34 | 
35 |   # filter out any perturbation-gene pairs where pos_col is NA if specified
36 |   if (any(is.na(pos_vals)) & remove_na_pos == TRUE) {
37 |     na_pos_col <- is.na(expt[[pos_col]])
38 |     message("Filtering out ", sum(na_pos_col), " case(s) with ", pos_col, " == NA")
39 |     expt <- expt[!na_pos_col, ]
40 |   }
41 | 
42 |   # raise warning if all perturbation-gene pairs have the same pos_col value
43 |   if (sum(!is.na(pos_vals)) == 1) {
44 |     warning("All values in ", pos_col, " are either all positives or all negatives", call. = FALSE)
45 |   }
46 | 
47 |   message("Done")
48 |   return(expt)
49 | 
50 | }
51 | 
52 | # check format of a list of predictions and report any issues
53 | qcPredictions <- function(pred_list, pred_config, one_tss = TRUE) {
54 | 
55 |   message("Running QC on predictions:")
56 | 
57 |   # check that there is no 'baseline' prediction set as this is used internally
58 |   if ("baseline" %in% names(pred_list)) {
59 |     stop("Prediction set called 'baseline' not allowed. Please rename.", call. = FALSE)
60 |   }
61 | 
62 |   # make sure that minimum required columns are present
63 |   base_cols <- c("chr", "start", "end", "name", "TargetGene", "CellType")
64 |   invisible(lapply(names(pred_list), FUN = check_min_cols, pred_list = pred_list,
65 |                    pred_config = pred_config, base_cols = base_cols))
66 | 
67 |   # make sure all column names are valid
68 |   illegal_cols <- c("experiment", "MappedCellType", "PredCellType")
69 |   invisible(lapply(names(pred_list), FUN = check_illegal_cols, pred_list = pred_list,
70 |                    illegal_cols = illegal_cols))
71 | 
72 |   # check that pred_col formats are ok
73 |   invisible(lapply(names(pred_list), FUN = function(pred_name) {
74 |     pred <- pred_list[[pred_name]]
75 |     conf <- pred_config[pred_config$pred_id == pred_name, ]
76 |     invisible(mapply(FUN = check_pred_col, pred_col = conf$pred_col, boolean = conf$boolean,
77 |                      MoreArgs = list(df = pred)))
78 |   }))
79 | 
80 |   # check that predictions contain only one TSS per gene
81 |   if (one_tss == TRUE) {
82 |     invisible(lapply(names(pred_list), FUN = check_one_tss, pred_list = pred_list))
83 |   }
84 | 
85 |   message("Done")
86 |   return(pred_list)
87 | 
88 | }
89 | 
90 | # check for valid pred_config file
91 | qcPredConfig <- function(pred_config, pred_list) {
92 | 
93 |   message("Running QC on pred_config file")
94 | 
95 |   # check that all predictors in pred_list are found in the pred_config file
96 |   missing_preds <- setdiff(names(pred_list), pred_config$pred_id)
97 |   if (length(missing_preds) > 0) {
98 |     missing_preds <- paste(missing_preds, collapse = ", ")
99 |     stop("Not all predictors in prediction files are listed in pred_config: ", missing_preds,
100 |          call. = FALSE)
101 |   }
102 | 
103 |   message("Done")
104 | 
105 | }
106 | # check cell type mapping tables for duplicate cell type assignments
107 | qcCellMapping <- function(cell_mappings) {
108 | 
109 |   dummy <- lapply(cell_mappings, FUN = function(cm) {
110 |     dup_expt <- any(duplicated(cm$experiment))
111 |     dup_pred <- any(duplicated(cm$predictions))
112 |     if (any(c(dup_expt, dup_pred))) {
113 |       stop("Currently only unique cell type mappings allowed", call. = FALSE)
114 |     }
115 |   })
116 | 
117 | }
118 | 
119 | # add information on whether genes are expressed or not to experimental data
120 | addGeneExpression <- function(expt, expressed_genes) {
121 | 
122 |   # column order for output
123 |   output_cols <- c(colnames(expt), "expressed")
124 | 
125 |   # add expression data to experimental data
126 |   expt <- merge(expt, expressed_genes, by.x = c("CellType", "measuredGeneSymbol"),
127 |                 by.y = c("cell_type", "gene"), all.x = TRUE)
128 |   expt <- expt[, ..output_cols]
129 | 
130 |   # set any experimental genes not in the expressed_genes table to expressed = FALSE
131 |   missing_genes <- unique(expt[is.na(expt$expressed), ][["measuredGeneSymbol"]])
132 |   if (length(missing_genes) > 0) {
133 |     warning(length(missing_genes), " experimental genes missing from 'expressed_genes'. ",
134 |             "Treating these as non-expressed.", call. = FALSE)
135 |     expt[is.na(expt$expressed), "expressed"] <- FALSE
136 |   }
137 | 
138 |   return(expt)
139 | 
140 | }
141 | 
142 | # map cell types from predictions to cell types in experimental data
143 | mapCellTypes <- function(pred_list, cell_mappings) {
144 | 
145 |   # create empty data.table for predictions not having cell_mappings
146 |   missing_cell_mappings <- setdiff(names(pred_list), names(cell_mappings))
147 |   names(missing_cell_mappings) <- missing_cell_mappings
148 |   missing_cell_mappings <- lapply(missing_cell_mappings, FUN = function(i) data.table())
149 | 
150 |   # combine with provided cell mappings and sort
151 |   cell_mappings <- c(cell_mappings, missing_cell_mappings)[names(pred_list)]
152 | 
153 |   # map cell types
154 |   mapped_preds <- mapply(FUN = map_cell_type, pred = pred_list, ct_map = cell_mappings,
155 |                          SIMPLIFY = FALSE)
156 | 
157 |   return(mapped_preds)
158 | 
159 | }
160 | 
161 | # function to map cell types for one prediction set
162 | map_cell_type <- function(pred, ct_map) {
163 | 
164 |   # save column order for correct output columns later
165 |   pred_cols <- colnames(pred)
166 | 
167 |   # map cell types, depending on whether a cell type mapping is provided
168 |   if (nrow(ct_map) == 0) {
169 | 
170 |     # simply assume that experimental cell types are identical to prediction cell types
171 |     pred <- cbind(pred, ExperimentCellType = pred$CellType)
172 | 
173 |   } else {
174 | 
175 |     # rename columns in ct_map
176 |     colnames(ct_map)[colnames(ct_map) == "experiment"] <- "ExperimentCellType"
177 | 
178 |     # merge pred and cell type mapping
179 |     pred <- merge(x = pred, y = ct_map, by.x = "CellType", by.y = "predictions", all.x = TRUE,
180 |                   all.y = FALSE)
181 | 
182 |     # if no experimental cell type was assigned, use cell type from predictions
183 |     pred$ExperimentCellType <- fifelse(is.na(pred$ExperimentCellType),
184 |                                        yes = pred$CellType,
185 |                                        no = pred$ExperimentCellType)
186 | 
187 |   }
188 | 
189 |   # rename original CellType column in output
190 |   pred <- pred[, c(pred_cols, "ExperimentCellType"), with = FALSE]
191 |   colnames(pred)[colnames(pred) == "CellType"] <- "PredictionCellType"
192 | 
193 |   return(pred)
194 | 
195 | }
196 | 
197 | # check which genes in the experimental data also occur in predictions
198 | checkExistenceOfExperimentalGenesInPredictions <- function(expt, pred_list, summary_file) {
199 | 
200 |   # all genes in experimental data
201 |   expt_genes <- sort(unique(expt$measuredGeneSymbol))
202 | 
203 |   # check which genes occur in predictions
204 |   expt_genes_in_pred <- lapply(pred_list, function(pred) { expt_genes %in% unique(pred$TargetGene) })
205 | 
206 |   # create summary containing all experimental genes and their occurrence in each prediction set
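  # The summary is a wide table with one row per experimental gene and one TRUE/FALSE column per
  # prediction set, e.g. (illustrative values for a single prediction set named "ABC"):
  #   experimental_genes   ABC
  #   GATA1                TRUE
  #   MYC                  FALSE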
207 |   summary <- data.table(experimental_genes = expt_genes, as.data.table(expt_genes_in_pred))
208 | 
209 |   # write to output file
210 |   write.table(summary, file = summary_file, sep = "\t", quote = FALSE, row.names = FALSE)
211 | 
212 | }
213 | 
214 | # combine a list of predictions with experimental data
215 | combineAllExptPred <- function(expt, pred_list, config, outdir, fill_pred_na = TRUE) {
216 | 
217 |   # merge each set of predictions with experimental data
218 |   prediction_sets <- structure(names(pred_list), names = names(pred_list))
219 |   merged_list <- lapply(
220 |     prediction_sets,
221 |     function(p) {
222 |       combineSingleExptPred(expt = expt, pred = pred_list[[p]], pred_name = p, config = config,
223 |                             outdir = outdir, fill_pred_na = fill_pred_na)
224 |     })
225 | 
226 |   # combine merged data into one table
227 |   output <- rbindlist(merged_list, idcol = "pred_id")
228 | 
229 |   # add unique identifier for each predictor
230 |   output$pred_uid <- paste(output$pred_id, output$pred_col, sep = ".")
231 | 
232 |   # rearrange columns for output
233 |   left_cols <- colnames(output)[seq(2, ncol(output) - 5)]
234 |   output_col_order <- c(left_cols, "pred_elements", "pred_uid", "pred_id", "pred_col", "pred_value",
235 |                         "Prediction")
236 |   setcolorder(output, output_col_order)
237 | 
238 |   return(output)
239 | 
240 | }
241 | 
242 | # merge predictions with experimental data
243 | combineSingleExptPred <- function(expt, pred, pred_name, config, outdir, fill_pred_na) {
244 | 
245 |   message("Overlapping predictions with experimental data for: ", pred_name)
246 | 
247 |   # replace any NA in predictions with fill value to avoid NAs in output
248 |   if (fill_pred_na == TRUE) {
249 |     pred <- fill_pred_na(pred, pred_name = pred_name, config = config)
250 |   }
251 | 
252 |   # Step 1: merging overlapping enhancer - gene pairs ----------------------------------------------
253 | 
254 |   # subset config to specified predictions
255 |   config_pred <- subset(config, pred_id == pred_name)
256 |   if (nrow(config_pred) == 0) {
257 |     stop("Predictions ", pred_name, " missing from pred_config!", call. = FALSE)
258 |   }
259 | 
260 |   # subset config to prediction columns appearing in data
261 |   config_filt <- subset(config_pred, pred_col %in% colnames(pred))
262 |   missing_pred <- setdiff(config_pred$pred_col, config_filt$pred_col)
263 |   if (length(missing_pred) > 0) {
264 |     warning("Following predictor(s) specified in config file not found for ", pred_name, ": ",
265 |             paste(missing_pred, collapse = ", "), call. = FALSE)
266 |   }
267 | 
268 |   # create GenomicRanges for CRE-G links for both experimental data and predictions. this applies a
269 |   # trick with using the seqnames to restrict overlaps to E-G pairs involving the same genes and in
270 |   # the same cell type
271 |   expt_gr <- with(expt, GRanges(seqnames = paste0(CellType, ":", chrom, ":", measuredGeneSymbol),
272 |                                 ranges = IRanges(chromStart, chromEnd)))
273 |   pred_gr <- with(pred, GRanges(seqnames = paste0(ExperimentCellType, ":", chr, ":", TargetGene),
274 |                                 ranges = IRanges(start, end)))
275 | 
276 |   # make sure that seqnames are the same, else GRanges will report unnecessary warnings. this
277 |   # could be removed and replaced with suppressWarnings() when calling findOverlaps() for a small
278 |   # gain in computation time
279 |   seqlevels_all_pairs <- as.character(unique(c(seqnames(expt_gr), seqnames(pred_gr))))
280 |   seqlevels(expt_gr) <- seqlevels_all_pairs
281 |   seqlevels(pred_gr) <- seqlevels_all_pairs
282 | 
283 |   # find overlaps between predictions and experimental data
284 |   ovl <- findOverlaps(expt_gr, pred_gr)
285 | 
286 |   # merge predictions with experimental data
287 |   pred_cols <- c("PredictionCellType", "name", config_filt$pred_col)
288 |   pred_col_names <- c("PredictionCellType", "pred_elements", config_filt$pred_col)
289 |   merged <- cbind(expt[queryHits(ovl)],
290 |                   pred[subjectHits(ovl), setNames(.SD, pred_col_names), .SDcols = pred_cols])
291 | 
292 |   # Step 2: aggregating pairs with multiple overlaps -----------------------------------------------
293 | 
294 |   # sometimes perturbed elements will overlap multiple predicted elements (e.g. in the case of a
295 |   # large deletion). in these cases we need to summarize, e.g., sum ABC.Score across model elements
296 |   # overlapping the deletion. this requires a config file describing how each prediction column
297 |   # should be aggregated
298 |   agg_cols <- setdiff(colnames(merged), c("pred_elements", config_filt$pred_col))
299 |   merged <- collapseEnhancersOverlappingMultiplePredictions(merged, config = config_filt,
300 |                                                             agg_cols = agg_cols)
301 | 
302 |   # Step 3: experimental data missing predictions --------------------------------------------------
303 | 
304 |   # A tested enhancer element may not have a prediction. For ABC this is typically the case if the
305 |   # tested element does not overlap a DHS peak. In this case we need to fill the predictions table.
306 | 
307 |   # add 'Prediction' column to merged with value 1, indicating pairs that overlap with predictions
308 |   merged$Prediction <- 1
309 | 
310 |   # get pairs from experimental data that are missing from predictions
311 |   expt_missing_preds <- expt[setdiff(seq_len(nrow(expt)), queryHits(ovl)), ]
312 | 
313 |   # write these to a text file in the output sub-directory
314 |   expt_missing_pred_dir <- file.path(outdir, "experimentalDataMissingPredictions")
315 |   dir.create(expt_missing_pred_dir, recursive = TRUE, showWarnings = FALSE)
316 |   write.table(
317 |     x = expt_missing_preds[, -c("chrTSS", "startTSS", "endTSS")],
318 |     file = file.path(expt_missing_pred_dir, paste0(pred_name, "_expt_missing_predictions.txt")),
319 |     sep = "\t", quote = FALSE, row.names = FALSE
320 |   )
321 | 
322 |   # fill in missing values
323 |   expt_missing_preds$PredictionCellType <- NA_character_
324 |   expt_missing_preds$pred_elements <- NA_character_
325 |   expt_missing_preds <- fillMissingPredictions(expt_missing_preds, config = config_filt,
326 |                                                agg_cols = agg_cols)
327 | 
328 |   # add 'Prediction' column with value 0, indicating pairs that were not found in predictions
329 |   expt_missing_preds$Prediction <- 0
330 | 
331 |   # merge filled data with merged data
332 |   merged <- rbind(merged, expt_missing_preds[, colnames(merged), with = FALSE])
333 | 
334 |   # Step 4: create output --------------------------------------------------------------------------
335 | 
336 |   # convert to long format to generate output
337 |   output <- melt(merged, measure.vars = config_filt$pred_col, variable.name = "pred_col",
338 |                  value.name = "pred_value")
339 | 
340 |   # rename CellType column from experimental input to ExperimentCellType
341 |   colnames(output)[colnames(output) == "CellType"] <- "ExperimentCellType"
342 | 
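  # At this point each row of 'merged' holds all score columns for one tested element-gene pair,
  # e.g. (hypothetical values) ABC.Score = 0.8 and distToTSS = 25000; melt() turns this into one
  # row per predictor in long format:
  #   pred_col    pred_value
  #   ABC.Score   0.8
  #   distToTSS   25000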
343 | # sort output according to genomic coordinates of enhancers and target gene 344 | sortcols <- c("chrom", "chromStart", "chromEnd", "measuredGeneSymbol") 345 | setorderv(output, sortcols) 346 | 347 | return(output) 348 | 349 | } 350 | 351 | # aggregate experimental enhancers overlapping multiple predicted enhancers 352 | collapseEnhancersOverlappingMultiplePredictions <- function(df, config, agg_cols) { 353 | 354 | # function to concatenate element ids into one string 355 | cat_elements <- function(x) { 356 | out <- paste(x, collapse = ",") 357 | return(out) 358 | } 359 | 360 | # create vectors of all columns to process (predictor scores and elements) and aggregate functions 361 | process_cols <- c("pred_elements", config$pred_col) 362 | agg_functions <- c("cat_elements", config$aggregate_function) 363 | 364 | # summarize columns based on defined aggregation functions 365 | all_list <- mapply(FUN = function(pred_col, agg_func) { 366 | agg_func <- get(agg_func) # get function from string 367 | df[, setNames(.(agg_func(get(pred_col))), pred_col), by = agg_cols] 368 | }, pred_col = process_cols, agg_func = agg_functions, SIMPLIFY = FALSE) 369 | 370 | # merge all the aggregates together to make collapsed data.frame 371 | # TODO: AVOID INTERMEDIATE DATA.FRAME BY Reduce 372 | output <- Reduce(function(df1, df2) merge(df1, df2, by = agg_cols), all_list) 373 | output <- as.data.table(output) 374 | 375 | return(output) 376 | 377 | } 378 | 379 | # fill in prediction values for experimental pairs missing from prediction data 380 | fillMissingPredictions <- function(df, config, agg_cols) { 381 | 382 | # fill in missing predictions as described in the config file 383 | for (i in seq_len(nrow(config))) { 384 | df[, config$pred_col[[i]]] <- config$fill_value[i] 385 | } 386 | 387 | # fill in unknown columns (columns appearing in agg_cols, but not experiment or pred_cols) 388 | # unk_cols <- setdiff(c("class", agg_cols), unique(c(colnames(df), config$pred_col))) 389 | # df[, unk_cols] <- "Merge:UNKNOWN" 390 | 391 | return(df) 392 | 393 | } 394 | 395 | # filter experimental data for genes in gene universe and add TSS coordinates to experimental data 396 | filterExptGeneUniverse <- function(expt, genes, missing_file = NULL) { 397 | 398 | # remove any existing TSS annotations from expt data 399 | expt_cols <- colnames(expt) 400 | tss_cols <- expt_cols %in% c("chrTSS", "startTSS", "endTSS") 401 | expt <- expt[, !tss_cols, with = FALSE] 402 | 403 | # add gene TSSs from specified gene universe to experimental data 404 | expt <- merge(expt, genes, by.x = "measuredGeneSymbol", by.y = "gene", all.x = TRUE, sort = FALSE) 405 | expt <- expt[, ..expt_cols] 406 | 407 | # get experimental data from genes that do not appear in the gene universe 408 | expt_missing <- expt[is.na(expt$chrTSS), ] 409 | 410 | # if there are any, write these to output file (if specified) 411 | if (nrow(expt_missing) > 0) { 412 | message("cre-gene pairs in experimental data missing from gene universe: ", nrow(expt_missing)) 413 | if (!is.null(missing_file)) { 414 | write.table(expt_missing, file = missing_file, quote = FALSE, row.names = FALSE, sep = "\t") 415 | } 416 | } 417 | 418 | # filter out missing expt data, arrange columns and return as output 419 | expt_filt <- expt[!is.na(expt$chrTSS), ] 420 | 421 | return(expt_filt) 422 | 423 | } 424 | 425 | # create summary of experimental data 426 | writeExptSummary <- function(df, summary_file) { 427 | 428 | # create summary table 429 | df_summary <- as.data.frame(list( 430 | numConnections = 
nrow(df), 431 | numIncludeInModel = sum(df$IncludeInModel), 432 | numIncludeInModelRegulated = sum(df$IncludeInModel & df$Regulated) 433 | )) 434 | 435 | # write to specified file 436 | write.table(df_summary, file = summary_file, sep = "\t", quote = FALSE, row.names = FALSE) 437 | 438 | } 439 | 440 | ## HELPER FUNCTIONS ================================================================================ 441 | 442 | # check input -------------------------------------------------------------------------------------- 443 | 444 | # make sure that minimum required columns are present in a prediction set 445 | check_min_cols <- function(pred_list, pred_config, pred, base_cols) { 446 | 447 | # check for all required columns 448 | score_col <- pred_config[pred_config$pred_id == pred, ][["pred_col"]] # get score cols for pred 449 | missing_cols <- setdiff(c(base_cols, score_col), colnames(pred_list[[pred]])) 450 | if (length(missing_cols) > 0) { 451 | stop("Missing columns in predictions '", pred, "': ", paste(missing_cols, collapse = ", "), ".", 452 | call. = FALSE) 453 | } 454 | } 455 | 456 | # check if a prediction set contains illegal column names 457 | check_illegal_cols <- function(pred_list, pred, illegal_cols) { 458 | df <- pred_list[[pred]] 459 | wrong_pred_cols <- intersect(colnames(df), illegal_cols) 460 | if (length(wrong_pred_cols) > 0) { 461 | stop("Illegal columns in predictions '", pred, "': ", paste(wrong_pred_cols, collapse = ", "), 462 | ".", call. = FALSE) 463 | } 464 | } 465 | 466 | # check if a given predictor column has the correct format 467 | check_pred_col <- function(df, pred_col, boolean) { 468 | pred_values <- df[[pred_col]] 469 | if (boolean == TRUE) { 470 | if (any(!unique(as.numeric(pred_values)) %in% c(0, 1))) { 471 | stop("Incorrect format for boolean predictor. Must be either 1/0 or TRUE/FALSE.", 472 | call. = FALSE) 473 | } 474 | } else { 475 | if (!is.numeric(pred_values)) { 476 | stop("Continuous predictors must have numeric values.", call. = FALSE) 477 | } 478 | } 479 | } 480 | 481 | # check that a prediction set only uses one TSS per gene, i.e. each enhancer-gene pair occurs once 482 | check_one_tss <- function(pred_list, pred) { 483 | df <- pred_list[[pred]] 484 | total_pairs <- nrow(df) 485 | unique_pairs <- nrow(unique(df[, c("chr", "start", "end", "TargetGene")])) 486 | if (unique_pairs < total_pairs) { 487 | stop("Predictions '", pred, "' uses more than one TSS per gene.", call. 
= FALSE)
488 |   }
489 | }
490 | 
491 | # replace NAs in prediction scores with appropriate fill values
492 | fill_pred_na <- function(pred, pred_name, config) {
493 | 
494 |   # get fill values for columns containing prediction scores in pred
495 |   fill_values <- config[config$pred_id == pred_name, c("pred_col", "fill_value")]
496 |   fill_values <- as.list(structure(fill_values$fill_value, names = fill_values$pred_col))
497 | 
498 |   # convert any integer scores to numeric to avoid errors when replacing NAs
499 |   pred <- mutate(pred, across(where(is.integer) & names(fill_values), as.numeric))
500 | 
501 |   # replace NAs with fill values
502 |   pred <- tidyr::replace_na(pred, replace = fill_values)
503 | 
504 |   return(pred)
505 | 
506 | }
507 | 
--------------------------------------------------------------------------------
/workflow/scripts/crisprComparisonPlotFunctions.R:
--------------------------------------------------------------------------------
1 | ## Plotting functions for CRISPR comparison
2 | 
3 | library(tidyverse)
4 | library(data.table)
5 | library(ggpubr)
6 | library(ggcorrplot)
7 | library(ROCR); library(cowplot); library(caTools); library(UpSetR)  # prediction()/performance(), plot_grid(), trapz() and upset() are used below
8 | ## WORK IN PROGRESS CODE ===========================================================================
9 | 
10 | # calculate performance metrics using ROCR
11 | calculatePerformance <- function(merged, pos_col, pred_config, measure, x.measure = NULL) {
12 | 
13 |   # split into list for lapply
14 |   merged_split <- split(merged, f = merged$pred_uid)
15 | 
16 |   # get inverse predictors
17 |   inverse_predictors <- pred_config %>%
18 |     select(pred_uid, inverse_predictor) %>%
19 |     deframe()
20 | 
21 |   # multiply inverse predictors by -1 so that higher value corresponds to higher score
22 |   inverse_predictors <- inverse_predictors[names(merged_split)]  # same as predictors for cell type
23 |   merged_split <- mapply(FUN = function(pred, inv_pred) {
24 |     inv_multiplier <- ifelse(inv_pred, -1, 1)
25 |     pred$pred_value <- pred$pred_value * inv_multiplier
26 |     return(pred)
27 |   }, merged_split, inverse_predictors, SIMPLIFY = FALSE)
28 | 
29 |   # compute precision-recall performance for each predictor
30 |   perf <- lapply(merged_split, FUN = function(p) {
31 |     performance(prediction(p$pred_value, p[[pos_col]]), measure = measure, x.measure = x.measure)
32 |   })
33 | 
34 |   return(perf)
35 | 
36 | }
37 | 
38 | # convert a list of ROCR performance objects into a table
39 | # TODO: get rid of manual names and use internal names for ROCR performance metrics
40 | perfToTable <- function(perf_list, measure_name, x.measure_name) {
41 | 
42 |   # function to convert one performance object to a table
43 |   convert_perfToTable <- function(perf, measure_name, x.measure_name) {
44 |     df <- data.frame(list(
45 |       alpha = perf@alpha.values[[1]],
46 |       measure = perf@y.values[[1]],
47 |       x.measure = perf@x.values[[1]]
48 |     ))
49 |     return(df)
50 |   }
51 | 
52 |   # apply to input list
53 |   perf_list <- lapply(perf_list, convert_perfToTable)
54 | 
55 |   # convert list of tables into one table
56 |   perf_table <- rbindlist(perf_list, idcol = "pred_uid")
57 |   colnames(perf_table)[colnames(perf_table) == "measure"] <- measure_name
58 |   colnames(perf_table)[colnames(perf_table) == "x.measure"] <- x.measure_name
59 | 
60 |   return(perf_table)
61 | 
62 | }
63 | 
64 | # make a receiver operating characteristic (ROC) curve
65 | plotROC <- function(merged, pos_col, pred_config, colors, thresholds = NULL,
66 |                     plot_name = "ROC curve full experimental data", line_width = 1,
67 |                     point_size = 3, text_size = 15) {
68 | 
69 |   # compute ROC curve
70 |   roc <- calculatePerformance(merged, pos_col = pos_col, pred_config = pred_config,
71 |                               measure = "tpr", x.measure = "fpr")
72 | 
73 |   # make ROC table
74 |   roc <- perfToTable(roc, measure_name = "TPR", x.measure_name = "FPR")
75 | 
76 |   # add pretty predictor names to roc for plotting
77 |   roc <- left_join(roc, select(pred_config, pred_uid, pred_name_long), by = "pred_uid")
78 | 
79 |   # separate roc data into quantitative and boolean predictors
80 |   bool_preds <- pull(filter(pred_config, boolean == TRUE), pred_uid)
81 |   roc_quant <- filter(roc, !pred_uid %in% bool_preds)
82 |   roc_bool <- filter(roc, pred_uid %in% bool_preds)
83 | 
84 |   # get TPR and FPR for boolean predictors at alpha 1
85 |   roc_bool <- filter(roc_bool, alpha == 1)
86 | 
87 |   # create ROC plot (caution, this assumes that there is at least 1 quant and 1 bool predictor!)
88 |   ggplot(roc_quant, aes(x = FPR, y = TPR, color = pred_name_long)) +
89 |     geom_line(linewidth = line_width) +
90 |     geom_point(data = roc_bool, size = point_size) +
91 |     labs(title = plot_name, x = "False positive rate", y = "True positive rate",
92 |          color = "Predictor") +
93 |     coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
94 |     scale_color_manual(values = colors, breaks = names(colors)) +
95 |     theme_bw() +
96 |     theme(text = element_text(size = text_size))
97 | 
98 | }
99 | 
100 | # make performance (AUPRC) per subset plot
101 | plotPerfSubsets <- function(perf, pred_config, subset_name = NULL, title = NULL, order = NULL) {
102 | 
103 |   # create prettier labels for subsets in plots
104 |   perf <- perf %>%
105 |     mutate(subset_label = paste0(subset, "\n", pos, "\n", neg)) %>%
106 |     mutate(subset_label = fct_inorder(subset_label))
107 | 
108 |   # add pretty names for predictors
109 |   perf <- left_join(perf, select(pred_config, pred_uid, pred_name_long), by = c("id" = "pred_uid"))
110 | 
111 |   # order predictors according to AUPRC on whole dataset or based on provided order
112 |   if (is.null(order)) {
113 |     perf <- perf %>%
114 |       filter(subset == "All") %>%
115 |       select(id, full_all = full) %>%
116 |       left_join(perf, ., by = "id") %>%
117 |       mutate(pred_name_long = fct_reorder(pred_name_long, .x = full_all, .desc = TRUE))
118 |   } else {
119 |     perf <- perf %>%
120 |       mutate(pred_name_long = factor(pred_name_long, levels = order))
121 |   }
122 | 
123 |   # get color for each predictor
124 |   pred_colors <- deframe(select(pred_config, pred_name_long, color))
125 | 
126 |   # create title and x axis label
127 |   if (is.null(subset_name)) subset_name <- unique(perf$subset_col)
128 |   x_label <- paste0(subset_name, "\nCRISPRi positives\nCRISPRi negatives")
129 |   if (is.null(title)) title <- paste(unique(perf$metric), "vs.", subset_name)
130 | 
131 |   # make performance as function of distance plot
132 |   ggplot(perf, aes(x = subset_label, y = full, fill = pred_name_long)) +
133 |     geom_bar(stat = "identity", position = position_dodge()) +
134 |     geom_errorbar(aes(ymin = lower, ymax = upper), position = position_dodge(width = 0.9),
135 |                   color = "black", width = 0.25) +
136 |     labs(fill = "Predictor", x = x_label, y = unique(perf$metric), title = title) +
137 |     scale_fill_manual(values = pred_colors[levels(perf$pred_name_long)]) +
138 |     scale_y_continuous(limits = c(0, 1)) +
139 |     theme_bw() +
140 |     theme(legend.position = "bottom")
141 | 
142 | }
143 | 
144 | # compute bootstrapped performance on subsets (and whole dataset if all == TRUE)
145 | computePerfSubsets <- function(merged, pred_config, subset_col, metric = c("auprc", "precision"),
146 |                                thresholds = NULL, pos_col = "Regulated",
147 |                                bs_iter = 1000, all = TRUE) {
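  # Usage sketch (illustrative; assumes 'merged' comes from processMergedData() and contains a
  # subset column, here called "distance_bin", defining the groups to evaluate):
  #   perf <- computePerfSubsets(merged, pred_config, subset_col = "distance_bin",
  #                              metric = "auprc", bs_iter = 100)
  #   plotPerfSubsets(perf, pred_config)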
148 | 149 | # process metric argument 150 | metric <- match.arg(metric) 151 | 152 | # compute and bootstrap performance on subsets 153 | perf_subsets <- merged %>% 154 | split(., f = merged[[subset_col]]) %>% 155 | lapply(FUN = convertMergedForBootstrap, pred_config = pred_config, pos_col = pos_col) %>% 156 | lapply(FUN = bootstrapPerformanceIntervals, metric = metric, thresholds = thresholds, 157 | R = bs_iter) %>% 158 | bind_rows(.id = "subset") 159 | 160 | # compute and bootstrap performance on entire dataset 161 | if (all == TRUE) { 162 | perf_subsets <- merged %>% 163 | convertMergedForBootstrap(pred_config = pred_config, pos_col = pos_col) %>% 164 | bootstrapPerformanceIntervals(metric = metric, thresholds = thresholds, R = bs_iter) %>% 165 | mutate(subset = "All", .before = 1) %>% 166 | bind_rows(., perf_subsets) 167 | } 168 | 169 | # count the number of positive and negative E-G pair for each subset and add to performance table 170 | pairs_subsets <- count_pairs_subset(merged, subset_col = subset_col, pos_col = pos_col, all = all) 171 | perf_subsets <- left_join(perf_subsets, pairs_subsets, by = "subset") 172 | 173 | # add used subset column to table 174 | perf_subsets <- mutate(perf_subsets, subset_col = subset_col, .before = 1) 175 | 176 | return(perf_subsets) 177 | 178 | } 179 | 180 | # count the number of crispr positive and negatives in subsets (and whole dataset if all == TRUE) 181 | count_pairs_subset <- function(merged, subset_col, pos_col, all = TRUE) { 182 | 183 | # unique experimentally tested E-G pairs 184 | crispr_pairs <- merged %>% 185 | select(name, subset = all_of(subset_col), positive = all_of(pos_col)) %>% 186 | distinct() 187 | 188 | # count number of positive and negatives in each subset 189 | pairs_subsets <- crispr_pairs %>% 190 | group_by(subset) %>% 191 | summarize(pos = sum(positive == TRUE), 192 | neg = sum(positive == FALSE)) 193 | 194 | # add the number of positive and negatives in the entire dataset if specified 195 | if (all == TRUE) { 196 | 197 | pairs_subsets <- crispr_pairs %>% 198 | summarize(pos = sum(positive == TRUE), 199 | neg = sum(positive == FALSE)) %>% 200 | mutate(subset = "All", .before = 1) %>% 201 | bind_rows(., pairs_subsets) 202 | 203 | } 204 | 205 | return(pairs_subsets) 206 | 207 | } 208 | 209 | 210 | ## MAIN FUNCTIONS ================================================================================== 211 | 212 | # process merged data for benchmarking analyses 213 | processMergedData <- function(merged, pred_config, filter_valid_connections = TRUE, 214 | include_missing_predictions = TRUE) { 215 | 216 | # only retain predictors that are specified to be included in plots 217 | plot_preds <- pred_config[pred_config$include == TRUE, ][["pred_uid"]] 218 | merged <- merged[merged$pred_uid %in% plot_preds, ] 219 | 220 | # add long names and whether a predictor is boolean from pred_config to merged data 221 | merged <- pred_config %>% 222 | select(pred_uid, boolean, pred_name_long) %>% 223 | left_join(x = merged, y = ., by = "pred_uid") 224 | 225 | # filter merged data for valid connections if specified 226 | if (filter_valid_connections == TRUE) { 227 | merged <- subset(merged, ValidConnection == "TRUE") 228 | } 229 | 230 | # filter out CRE - gene pairs with missing predictions if specified 231 | if (include_missing_predictions == FALSE) { 232 | merged <- merged[merged$Prediction == 1, ] 233 | } 234 | 235 | return(merged) 236 | 237 | } 238 | 239 | # plot CRISPR E-G pairs overlapping prediction E-G pairs 240 | plotOverlaps <- 
function(merged, title = "E-G pairs in predictions part of CRISPR E-G universe") { 241 | 242 | # count number of total CRISPR E-G pairs and overlapping pairs per predictor 243 | n_pairs <- merged %>% 244 | group_by(pred_uid, pred_name_long) %>% 245 | summarize(`Overlaps predictions` = sum(Prediction == 1), 246 | `Not in predictions` = sum(Prediction == 0), 247 | .groups = "drop") %>% 248 | pivot_longer(cols = c(`Overlaps predictions`, `Not in predictions`), names_to = "Overlaps", 249 | values_to = "pairs") 250 | 251 | # plot number of CRISPR E-G pairs overlapping E-G pairs in predictions 252 | ggplot(n_pairs, aes(x = pred_name_long, y = pairs, fill = Overlaps)) + 253 | geom_bar(stat = "identity") + 254 | labs(y = "E-G pairs", x = "Predictor", title = title) + 255 | scale_fill_manual(values = c("Overlaps predictions" = "steelblue", 256 | "Not in predictions" = "darkgray")) + 257 | coord_flip() + 258 | theme_bw() 259 | 260 | } 261 | 262 | # compute PR curves for merged data 263 | calcPRCurves <- function(df, pred_config, pos_col) { 264 | 265 | # split into list for lapply 266 | df_split <- split(df, f = df$pred_uid) 267 | 268 | # get inverse predictors 269 | inverse_predictors <- pred_config %>% 270 | select(pred_uid, inverse_predictor) %>% 271 | deframe() 272 | 273 | # multiply inverse predictors by -1 so that higher value corresponds to higher score 274 | inverse_predictors <- inverse_predictors[names(df_split)] # same as predictors for cell type 275 | df_split <- mapply(FUN = function(pred, inv_pred) { 276 | inv_multiplier <- ifelse(inv_pred, -1, 1) 277 | pred$pred_value <- pred$pred_value * inv_multiplier 278 | return(pred) 279 | }, df_split, inverse_predictors, SIMPLIFY = FALSE) 280 | 281 | # compute precision-recall performance for each predictor 282 | pr <- lapply(df_split, FUN = function(p){ 283 | performance(prediction(p$pred_value, p[[pos_col]]), measure = "prec", x.measure = "rec") 284 | }) 285 | 286 | # convert to table and calculate F1 287 | pr_df <- pr2df(pr, calc_f1 = TRUE) 288 | 289 | return(pr_df) 290 | 291 | } 292 | 293 | # create bootstrapped performance summary table for all predictors in a PR table 294 | makePRSummaryTableBS <- function(merged, pred_config, pos_col, threshold_col = "alpha", 295 | min_sensitivity = 0.7, R = 1000, conf = 0.95, ncpus = 1) { 296 | 297 | # convert merged to wide format for bootstrapping 298 | merged_bs <- convertMergedForBootstrap(merged, pred_config = pred_config, pos_col = pos_col) 299 | 300 | # extract defined thresholds for provided predictors 301 | preds <- setdiff(colnames(merged_bs), c("name", "Regulated", "dataset")) 302 | thresholds <- getThresholdValues(pred_config, predictors = preds, threshold_col = threshold_col) 303 | 304 | # bootstrap overall performance (AUPRC) and reformat for performance summary table 305 | message("Bootstrapping AUPRC:") 306 | perf <- bootstrapPerformanceIntervals(merged_bs, metric = "auprc", R = R, conf = conf, 307 | ci_type = "perc", ncpus = ncpus) 308 | perf <- select(perf, pred_uid = id, AUPRC = full, AUPRC_lowerCi = lower, AUPRC_upperCi = upper) 309 | 310 | # get performance at minimum sensitivity 311 | pr <- calcPRCurves(merged, pred_config = pred_config, pos_col = pos_col) 312 | perf_min_sens <- pr %>% 313 | arrange(pred_uid, recall, desc(precision)) %>% 314 | split(., f = .$pred_uid) %>% 315 | lapply(FUN = computePerfGivenSensitivity, min_sensitivity = min_sensitivity) %>% 316 | bind_rows(.id = "pred_uid") 317 | 318 | # extract threshold at min sensitivity 319 | thresholds_min_sens <- 
deframe(select(perf_min_sens, pred_uid, alpha_at_min_sensitivity)) 320 | 321 | # bootstrap precision at minimum sensitivity 322 | message("Bootstrapping precision at minimum sensitivity:") 323 | prec_min_sens <- bootstrapPerformanceIntervals(merged_bs, metric = "precision", 324 | thresholds = thresholds_min_sens, R = R, 325 | conf = conf, ci_type = "perc", ncpus = ncpus) 326 | 327 | # select relevant columns and reformat names for output 328 | prec_min_sens <- prec_min_sens %>% 329 | select(pred_uid = id, PrecMinSens = full, PrecMinSens_lowerCi = lower, 330 | PrecMinSens_upperCi = upper) 331 | 332 | # bootstrap recall at minimum sensitivity (to get confidence intervals on min sensitivity) 333 | message("Bootstrapping recall at minimum sensitivity:") 334 | recall_min_sens <- bootstrapPerformanceIntervals(merged_bs, metric = "recall", 335 | thresholds = thresholds_min_sens, R = R, 336 | conf = conf, ci_type = "perc", ncpus = ncpus) 337 | 338 | # select relevant columns and reformat names for output 339 | recall_min_sens <- recall_min_sens %>% 340 | select(pred_uid = id, RecallMinSens = full, RecallMinSens_lowerCi = lower, 341 | RecallMinSens_upperCi = upper) 342 | 343 | # add to performance table 344 | perf <- perf %>% 345 | mutate(MinSensitivity = min_sensitivity) %>% 346 | left_join(prec_min_sens, by = "pred_uid") %>% 347 | left_join(recall_min_sens, by = "pred_uid") %>% 348 | left_join(enframe(thresholds_min_sens, name = "pred_uid", value = "ThresholdMinSens"), 349 | by = "pred_uid") 350 | 351 | # bootstrap precision and recall at defined thresholds (if there are any) 352 | if (length(thresholds) > 0) { 353 | 354 | # get data on predictors with thresholds 355 | merged_bs_thresh <- select(merged_bs, name, Regulated, any_of(names(thresholds))) 356 | 357 | # run precision bootstraps using defined thresholds 358 | message("Bootstrapping precision at threshold:") 359 | prec_thresh <- bootstrapPerformanceIntervals(merged_bs_thresh, metric = "precision", 360 | thresholds = thresholds, R = R, conf = conf, 361 | ci_type = "perc", ncpus = ncpus) 362 | 363 | # select relevant columns and reformat names for output 364 | prec_thresh <- prec_thresh %>% 365 | select(pred_uid = id, PrecThresh = full, PrecThresh_lowerCi = lower, 366 | PrecThresh_upperCi = upper) 367 | 368 | # run recall bootstraps using defined thresholds 369 | message("Bootstrapping recall at threshold:") 370 | recall_thresh <- bootstrapPerformanceIntervals(merged_bs_thresh, metric = "recall", 371 | thresholds = thresholds, R = R, conf = conf, 372 | ci_type = "perc", ncpus = ncpus) 373 | 374 | # select relevant columns and reformat names for output 375 | recall_thresh <- recall_thresh %>% 376 | select(pred_uid = id, RecallThresh = full, RecallThresh_lowerCi = lower, 377 | RecallThresh_upperCi = upper) 378 | 379 | # combine and add to performance table 380 | perf <- perf %>% 381 | left_join(enframe(thresholds, name = "pred_uid", value = "Threshold"), 382 | by = "pred_uid") %>% 383 | left_join(prec_thresh, by = "pred_uid") %>% 384 | left_join(recall_thresh, by = "pred_uid") 385 | 386 | } 387 | 388 | # sort according to overall performance for output 389 | perf <- arrange(perf, desc(AUPRC)) 390 | 391 | return(perf) 392 | 393 | } 394 | 395 | # make a PR curve plot for a set of provided predictors 396 | makePRCurvePlot <- function(pr_df, pred_config, n_pos, pct_pos, min_sensitivity = 0.7, 397 | plot_name = "PRC full experimental data", line_width = 1, 398 | point_size = 3, text_size = 15, colors = NULL, plot_thresholds = TRUE, 399 | na_color 
= "gray66", na_size_factor = 0.5) { 400 | 401 | # create performance summary 402 | perf_summary <- makePRSummaryTable(pr_df, pred_config = pred_config, 403 | min_sensitivity = min_sensitivity) 404 | 405 | # get PR values at threshold 406 | pr_threshold <- perf_summary %>% 407 | select(pred_name_long, precision = precision_at_cutoff, recall = sensitivity_at_cutoff) 408 | 409 | # add pretty predictor names to pr_df for plotting 410 | pr_df <- left_join(pr_df, select(pred_config, pred_uid, pred_name_long), by = "pred_uid") 411 | 412 | # separate pr data into quantitative and boolean predictors 413 | bool_preds <- pull(filter(pred_config, boolean == TRUE), pred_uid) 414 | pr_quant <- filter(pr_df, !pred_uid %in% bool_preds) 415 | pr_bool <- filter(pr_df, pred_uid %in% bool_preds) 416 | 417 | # get precision and recall for boolean predictor at alpha 1 418 | pr_bool <- filter(pr_bool, alpha == 1) 419 | 420 | # create default colors if none were specified 421 | if (is.null(colors)) { 422 | colors <- scales::hue_pal()(nrow(pred_config)) 423 | } 424 | 425 | # separate predictors into those with a set color and those with NA (useful against overplotting) 426 | na_col <- names(colors)[is.na(colors)] 427 | pr_quant_col <- filter(pr_quant, !pred_name_long %in% na_col) 428 | pr_quant_na_col <- filter(pr_quant, pred_name_long %in% na_col) 429 | pr_bool_col <- filter(pr_bool, !pred_name_long %in% na_col) 430 | pr_bool_na_col <- filter(pr_bool, pred_name_long %in% na_col) 431 | pr_threshold_col <- filter(pr_threshold, !pred_name_long %in% na_col) 432 | pr_threshold_na_col <- filter(pr_threshold, pred_name_long %in% na_col) 433 | 434 | # set NA colors to a lighter gray than the ggplot default 435 | colors[is.na(colors)] <- na_color 436 | 437 | # create PRC plot (caution, this assumes that there at least 1 quant and 1 bool predictor!) 438 | p <- ggplot(pr_quant, aes(x = recall, y = precision, color = pred_name_long)) + 439 | geom_line(data = pr_quant_na_col, linewidth = line_width * na_size_factor) 440 | 441 | if (plot_thresholds) p <- p + geom_point(data = pr_threshold_na_col, 442 | size = point_size * na_size_factor) 443 | 444 | p <- p + 445 | geom_point(data = pr_bool_na_col, size = point_size * na_size_factor) + 446 | geom_line(data = pr_quant_col, linewidth = line_width) 447 | 448 | if (plot_thresholds) p <- p + geom_point(data = pr_threshold_col, size = point_size) 449 | 450 | p + 451 | geom_point(data = pr_bool_col, size = point_size) + 452 | geom_hline(yintercept = pct_pos, linetype = "dashed", color = "black") + 453 | labs(title = plot_name, x = paste0("Recall (n=", n_pos, ")"), y = "Precision", 454 | color = "Predictor") + 455 | coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + 456 | scale_color_manual(values = colors, breaks = names(colors)) + 457 | theme_bw() + 458 | theme(text = element_text(size = text_size)) 459 | 460 | } 461 | 462 | # plot predictor scores vs effect sizes 463 | plotPredictorsVsEffectSize <- function(merged, pos_col = "Regulated", pred_names_col = "pred_uid", 464 | corr_groups = pos_col, point_size = 2, text_size = 13, 465 | title = "Predictor scores vs. 
CRISPRi effect sizes", 466 | alpha_value = 1, include_boolean_preds = FALSE, 467 | cor_method = "spearman", ncol = NULL, nrow = NULL, 468 | label.x.npc = 0.75) { 469 | 470 | # remove boolean predictors if specified 471 | if (include_boolean_preds == FALSE) { 472 | if ("boolean" %in% colnames(merged)) { 473 | merged <- subset(merged, boolean == FALSE) 474 | } else { 475 | warning("'boolean' column not in 'merged' data frame, can't filter for boolean predictors", 476 | call. = FALSE) 477 | } 478 | } 479 | 480 | # set color based on column identifying positive CRISPRi pairs 481 | if (!is.null(pos_col)) { 482 | values <- sort(unique(merged[[pos_col]])) 483 | colors <- structure(c("darkgray", "steelblue"), names = as.character(values)) 484 | } else { 485 | colors <- NULL 486 | } 487 | 488 | 489 | # create basic effect size vs predictors scatter plots 490 | p <- makeScatterPlots(merged, x_col = "pred_value", y_col = "EffectSize", color_col = pos_col, 491 | x_lab = "Predictor score", y_lab = "CRISPRi effect size", 492 | title = title, colors = colors, 493 | pred_names_col = pred_names_col, point_size = point_size, 494 | text_size = text_size, alpha_value = alpha_value, ncol = ncol, nrow = nrow) 495 | 496 | # add linear model fit and correlation coefficient 497 | if (is.null(corr_groups)) { 498 | p + 499 | geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") + 500 | stat_cor(aes(color = NULL, label = ..r.label..), method = cor_method, cor.coef.name = "rho", 501 | label.y.npc = "top", label.x.npc = label.x.npc, r.accuracy = 0.01, 502 | show.legend = FALSE) 503 | } else { 504 | p + 505 | geom_smooth(aes(color = !!sym(corr_groups)), method = "lm", formula = y ~ x, se = FALSE) + 506 | stat_cor(aes(label = ..r.label..), method = cor_method, cor.coef.name = "rho", 507 | label.y.npc = "top", label.x.npc = label.x.npc, r.accuracy = 0.01, 508 | show.legend = FALSE) 509 | } 510 | 511 | } 512 | 513 | # make violin plots showing scores for each predictor as a function of experimental outcome 514 | plotPredictorsVsExperiment <- function(merged, pos_col = "Regulated", pred_names_col = "pred_uid", 515 | text_size = 13, include_boolean_preds = FALSE, ncol = NULL, 516 | nrow = NULL) { 517 | 518 | # remove boolean predictors if specified 519 | if (include_boolean_preds == FALSE) { 520 | if ("boolean" %in% colnames(merged)) { 521 | merged <- subset(merged, boolean == FALSE) 522 | } else { 523 | warning("'boolean' column not in 'merged' data frame, can't filter for boolean predictors", 524 | call. 
= FALSE) 525 | } 526 | } 527 | 528 | # set color based on column identifying positive CRISPRi pairs 529 | values <- sort(unique(merged[[pos_col]])) 530 | colors <- structure(c("darkgray", "steelblue"), names = as.character(values)) 531 | 532 | # convert column identifying positive CRISPR hits from string to symbol for ggplot 533 | pos_col <- sym(pos_col) 534 | 535 | # plot scores for each predictor as a function of experimental outcome 536 | ggplot(merged, aes(x = !!pos_col, y = pred_value, color = !!pos_col, fill = !!pos_col)) + 537 | facet_wrap(~get(pred_names_col), scales = "free", ncol = ncol, nrow = nrow) + 538 | geom_violin() + 539 | geom_boxplot(width = 0.1, outlier.shape = NA, color = "black", fill = "NA") + 540 | scale_color_manual(values = colors) + 541 | scale_fill_manual(values = colors) + 542 | labs(title = "Predictors vs experimental outcome", x = "Experimental E-G pair", 543 | y = "Predictor value (sqrt)") + 544 | scale_y_sqrt() + 545 | theme_bw() + 546 | theme(legend.position = "none", text = element_text(size = text_size)) 547 | 548 | } 549 | 550 | # plot distance distributions for experimental positives and negatives 551 | plotDistanceDistribution <- function(merged, dist = "baseline.distToTSS", convert_dist_kb = TRUE, 552 | pos_col = "Regulated", text_size = 13) { 553 | 554 | # get data for distance and add label for faceting 555 | dist_data <- merged %>% 556 | filter(pred_uid == dist) %>% 557 | mutate(label = if_else(get(pos_col) == TRUE, true = "Positives", false = "Negatives")) %>% 558 | mutate(label = factor(label, levels = c("Positives", "Negatives"))) 559 | 560 | # convert distance from bp to kb if specified 561 | if (convert_dist_kb == TRUE) { 562 | dist_data$pred_value <- dist_data$pred_value / 1000 563 | } 564 | 565 | # plot distance distribution for all pairs in experimental data 566 | ggplot(dist_data, aes(x = pred_value, fill = label)) + 567 | facet_wrap(~label, ncol = 1, scales = "free_y") + 568 | geom_histogram(binwidth = 10) + 569 | labs(x = "Distance to TSS (kb)") + 570 | scale_fill_manual(values = c("Positives" = "steelblue", "Negatives" = "darkgray")) + 571 | theme_bw() + 572 | theme(legend.position = "none", text = element_text(size = text_size)) 573 | 574 | } 575 | 576 | # make an upset plot of overlapping features for a given cell type 577 | plotOverlappingFeatures <- function(merged, feature_cols) { 578 | 579 | # create table with unique enhancers and overlapping features 580 | enh_features <- merged %>% 581 | mutate(enh_id = paste0(chrom, ":", chromStart, "-", chromEnd)) %>% 582 | select(enh_id, all_of(feature_cols)) %>% 583 | distinct() %>% 584 | mutate(across(all_of(feature_cols), ~ .x * 1)) 585 | 586 | # set new column names 587 | colnames(enh_features) <- sub("enh_feature_", "", colnames(enh_features)) 588 | 589 | # create upset plot with overlapping features 590 | upset(enh_features, nsets = 10, order.by = "freq", number.angles = 30, point.size = 3.5, 591 | line.size = 2, mainbar.y.label = "Enhancers overlapping features", 592 | sets.x.label = "Overlapping sites", 593 | text.scale = c(1.5, 1.5, 1.2, 1.2, 1.5, 1.5)) 594 | 595 | } 596 | 597 | # plot correlation between predictors 598 | plotPredCorMatrix <- function(merged, pred_names_col = "pred_uid", include_boolean_preds = FALSE, 599 | method = c("spearman", "pearson", "kendall"), 600 | title = "Correlation of predictor scores") { 601 | 602 | # parse method argument 603 | method <- match.arg(method) 604 | 605 | # remove boolean predictors if specified 606 | if (include_boolean_preds == FALSE) 
{ 607 | if ("boolean" %in% colnames(merged)) { 608 | merged <- subset(merged, boolean == FALSE) 609 | } else { 610 | warning("'boolean' column not in 'merged' data frame, can't filter for boolean predictors", 611 | call. = FALSE) 612 | } 613 | } 614 | 615 | # extract predictor scores and transform to wide format 616 | pred_scores <- merged %>% 617 | select(name, all_of(pred_names_col), pred_value) %>% 618 | pivot_wider(names_from = all_of(pred_names_col), values_from = pred_value) 619 | 620 | # compute correlation coefficients between predictors 621 | cor_matrix <- cor(select(pred_scores, -name), method = method) 622 | 623 | # plot correlation matrix for scores of different predictors 624 | ggcorrplot(cor_matrix, hc.order = TRUE, type = "lower", lab = TRUE, title = title) 625 | 626 | } 627 | 628 | ## Make plots for subsets of the data based on gene or enhancer features --------------------------- 629 | 630 | # count the number of positive and negative pairs per gene and enhancer feature 631 | countPairsFeatures <- function(merged, feature_cols_pattern = "^(gene|enh)_feature_.+$", 632 | pos_col = "Regulated") { 633 | 634 | # get all feature columns 635 | feature_cols <- grep(colnames(merged), pattern = feature_cols_pattern, value = TRUE) 636 | 637 | # count the number of pairs per feature column 638 | output <- merged %>% 639 | select(name, all_of(c(pos_col, feature_cols))) %>% 640 | distinct() %>% 641 | pivot_longer(cols = all_of(feature_cols), names_to = "feature", values_to = "value", 642 | values_transform = as.character) %>% 643 | group_by(feature, value) %>% 644 | summarize("TRUE" = sum(get(pos_col) == TRUE), 645 | "FALSE" = sum(get(pos_col) == FALSE), 646 | .groups = "drop") %>% 647 | pivot_longer(cols = c("TRUE", "FALSE"), names_to = "outcome", values_to = "pairs") %>% 648 | separate(col = feature, into = c("feature_type", "feature"), sep = "_feature_") 649 | 650 | return(output) 651 | 652 | } 653 | 654 | # plot the number of pairs grouped by feature types (if feature_types has names, the names will be 655 | # used for pretty plots) 656 | plotPairsFeatures <- function(n_pairs_features, 657 | feature_types = c("Gene" = "gene", "Enhancer" = "enh")) { 658 | 659 | # split n_pairs_features by feature types 660 | n_pairs_features <- split(n_pairs_features, f = n_pairs_features$feature_type) 661 | 662 | # order and rename feature types (if specified) 663 | n_pairs_features <- n_pairs_features[feature_types] 664 | if (!is.null(names(feature_types))) names(n_pairs_features) <- names(feature_types) 665 | 666 | # create titles for plots 667 | titles <- paste(names(n_pairs_features), "features") 668 | 669 | # plot number of pairs per feature type 670 | feat_pair_plots <- mapply(FUN = function(set, title) { 671 | ggplot(set, aes(x = value, y = pairs, fill = as.logical(outcome))) + 672 | facet_wrap(~feature) + 673 | geom_bar(stat = "identity") + 674 | labs(title = title, y = "Number of E-G pairs", fill = pos_col) + 675 | scale_fill_manual(values = c("FALSE" = "darkgray", "TRUE" = "steelblue")) + 676 | theme_bw() + 677 | theme(axis.title.x = element_blank(), 678 | axis.text.x = element_text(angle = 45, hjust = 1)) 679 | }, set = n_pairs_features, title = titles, SIMPLIFY = FALSE) 680 | 681 | return(feat_pair_plots) 682 | 683 | } 684 | 685 | # calculate and plot PR curves for a given subset 686 | makePRCurveSubset <- function(merged, subset_col, pred_config, pos_col, min_sensitivity = 0.7, 687 | line_width = 1, point_size = 3, text_size = 15, nrow = 1, 688 | colors = NULL) { 689 | 690 | # split df 
into subsets based on provided column 691 | merged_split <- split(merged, f = merged[[subset_col]]) 692 | subsets <- names(merged_split) # used later 693 | 694 | # get subsets that do not contain both positives and negatives 695 | one_class_only <- vapply(merged_split, FUN.VALUE = logical(1), FUN = function(x){ 696 | n_distinct(x$Regulated) != 2 697 | }) 698 | 699 | # filter out any subsets that do not have both 700 | if (all(one_class_only)) { 701 | stop("No subsets contain both positive and negatives.", call. = FALSE) 702 | } else if (any(one_class_only)) { 703 | warning("Not all subsets contain both positive and negatives. These will be removed from plots.", 704 | call. = FALSE) 705 | merged_split <- merged_split[!one_class_only] 706 | } else { 707 | merged_split <- merged_split 708 | } 709 | 710 | # compute PR curve 711 | prc <- lapply(merged_split, FUN = calcPRCurves, pred_config = pred_config, pos_col = pos_col) 712 | 713 | # calculate number and percentage of positives for all splits 714 | n_pos <- lapply(merged_split, FUN = calcNPos, pos_col = pos_col) 715 | pct_pos <- lapply(merged_split, FUN = calcPctPos, pos_col = pos_col) 716 | 717 | # create plot titles based on feature name and number of pairs for that feature 718 | feature_name <- sub(".+_feature_", "", subset_col) 719 | pairs <- vapply(merged_split, FUN = function(x) length(unique(x$name)), FUN.VALUE = integer(1)) 720 | titles <- paste0(feature_name, " = " , names(prc), " (", pairs, " pairs)") 721 | 722 | # plot PR curves 723 | pr_plots <- mapply(FUN = makePRCurvePlot, pr_df = prc, n_pos = n_pos, pct_pos = pct_pos, plot_name = titles, 724 | MoreArgs = list(pred_config = pred_config, min_sensitivity = min_sensitivity, 725 | line_width = line_width, point_size = point_size, 726 | text_size = text_size, colors = colors), 727 | SIMPLIFY = FALSE) 728 | 729 | # add empty plots for subsets without both positives and negatives 730 | empty <- setdiff(subsets, names(pr_plots)) 731 | for (i in empty) pr_plots[[i]] <- ggplot(NULL) 732 | 733 | # create title for this comparison 734 | title <- ggdraw() + 735 | draw_label(feature_name, fontface = "bold", x = 0, hjust = 0, size = text_size * 1.5) + 736 | theme(plot.margin = margin(0, 0, 0, 7)) # align title with left edge of first plot 737 | 738 | # combine into one plot 739 | pr_plots_row <- plot_grid(plotlist = pr_plots, nrow = nrow) 740 | plot_grid(title, pr_plots_row, ncol = 1, rel_heights = c(0.1, 1)) 741 | 742 | } 743 | 744 | # calculate and plot PR curves for several subset columns and arrange plots into one figure for one 745 | # cell type 746 | makePRCurveSubsets <- function(merged, subset_cols, pred_config, pos_col, cell_type = "combined", 747 | min_sensitivity = 0.7, line_width = 1, point_size = 3, 748 | text_size = 15, nrow = 1, colors = NULL) { 749 | 750 | # return NULL if no subsets are available 751 | if (length(subset_cols) == 0) { 752 | return(NULL) 753 | } 754 | 755 | # create PR curve plots for each subset 756 | pr_plots <- lapply(subset_cols, FUN = makePRCurveSubset, merged = merged, 757 | pred_config = pred_config, pos_col = pos_col, 758 | min_sensitivity = min_sensitivity, line_width = line_width, 759 | point_size = point_size, text_size = text_size, nrow = nrow, colors = colors) 760 | 761 | # create one figure with all plots 762 | plot_grid(plotlist = pr_plots, nrow = length(subset_cols)) 763 | 764 | } 765 | 766 | # make violin plots showing scores for each predictor vs experimental outcome for a given subset 767 | plotPredVsExperimentSubset <- function(merged, 
subset_col, pos_col, pred_names_col = "pred_uid", 768 | text_size = 13) { 769 | 770 | # plot scores for each predictor as a function of experimental outcome 771 | p <- plotPredictorsVsExperiment(merged, pos_col = pos_col, pred_names_col = pred_names_col, 772 | text_size = text_size) 773 | 774 | # change title and add faceting based on subset column 775 | p + 776 | labs(title = sub(".+_feature_", "", subset_col)) + 777 | facet_grid(get(pred_names_col) ~ get(subset_col), scales = "free") 778 | 779 | } 780 | 781 | # make violin plots showing scores for each predictor vs experimental outcome for a given subset 782 | plotPredVsExperimentSubsets <- function(merged, subset_cols, pos_col, pred_names_col = "pred_uid", 783 | text_size = 13, label.x.npc = 0.5) { 784 | 785 | # return NULL if no subsets are available 786 | if (length(subset_cols) == 0) { 787 | return(NULL) 788 | } 789 | 790 | # create predictor vs experiment plots for each subset 791 | pe_plots <- lapply(subset_cols, FUN = plotPredVsExperimentSubset, merged = merged, 792 | pos_col = pos_col, pred_names_col = pred_names_col, text_size = text_size) 793 | 794 | # create one figure with all plots 795 | plot_grid(plotlist = pe_plots, nrow = length(subset_cols)) 796 | 797 | } 798 | 799 | # make violin plots showing scores for each predictor vs experimental outcome for a given subset 800 | predVsEffectSizeSubset <- function(merged, subset_col, pos_col, pred_names_col = "pred_uid", 801 | corr_groups = pos_col, point_size = 2, text_size = 13, 802 | alpha_value = 1, label.x.npc = 0.75) { 803 | 804 | # plot each predictor against effect size for the given subset 805 | p <- plotPredictorsVsEffectSize(merged, pos_col = pos_col, pred_names_col = pred_names_col, 806 | corr_groups = corr_groups, point_size = point_size, 807 | text_size = text_size, alpha_value = alpha_value, 808 | label.x.npc = label.x.npc) 809 | 810 | # change title and add faceting based on subset column 811 | p + 812 | labs(title = sub(".+_feature_", "", subset_col)) + 813 | facet_grid(get(subset_col) ~ get(pred_names_col), scales = "free") 814 | 815 | } 816 | 817 | # make violin plots showing scores for each predictor vs experimental outcome for a given subset 818 | predVsEffectSizeSubsets <- function(merged, subset_cols, pos_col, pred_names_col = "pred_uid", 819 | corr_groups = pos_col, point_size = 2, text_size = 13, 820 | alpha_value = 1, label.x.npc = 0.75) { 821 | 822 | # return NULL if no subsets are available 823 | if (length(subset_cols) == 0) { 824 | return(NULL) 825 | } 826 | 827 | # create predictor vs experiment scatter plots for each subset 828 | es_plots <- lapply(subset_cols, FUN = predVsEffectSizeSubset, merged = merged, pos_col = pos_col, 829 | pred_names_col = pred_names_col, corr_groups = corr_groups, 830 | point_size = point_size, text_size = text_size, alpha_value = alpha_value, 831 | label.x.npc = label.x.npc) 832 | 833 | # create one figure with all plots 834 | plot_grid(plotlist = es_plots, nrow = length(subset_cols)) 835 | 836 | } 837 | 838 | ## HELPER FUNCTIONS ================================================================================ 839 | 840 | # apply analyses across all cell types in merged data ---------------------------------------------- 841 | 842 | # get all cell types in merged data. 
if there are multiple cell types and if combined = TRUE add 843 | # "combined" to include a set of all cell types combined 844 | getCellTypes <- function(merged, cell_types_col, combined) { 845 | 846 | # get all cell types in merged data (remove any NAs) 847 | cell_types <- unique(merged[[cell_types_col]]) 848 | cell_types <- cell_types[!is.na(cell_types)] 849 | cell_types <- structure(as.list(cell_types), names = cell_types) 850 | 851 | # add combined set if specified 852 | if (length(cell_types) > 1 & combined == TRUE) { 853 | cell_types <- c(cell_types, list(combined = as.character(cell_types))) 854 | } 855 | 856 | return(cell_types) 857 | 858 | } 859 | 860 | # get data for a specified cell type or all if cell_type == "combined" 861 | getCellTypeData <- function(df, cell_type, cell_types_col) { 862 | df[df[[cell_types_col]] %in% cell_type, ] 863 | } 864 | 865 | # helper function to apply a function to one cell type in merged data 866 | applyCellType <- function(cell_type, merged, .fun, ..., cell_types_col) { 867 | 868 | # get data for the given cell type 869 | merged_cell_type <- getCellTypeData(merged, cell_type = cell_type, cell_types_col = cell_types_col) 870 | 871 | # try to apply the specified function and capture any errors and warnings 872 | output <- tryCatch( 873 | withCallingHandlers({ 874 | .fun(merged_cell_type, ...) 875 | }, warning = function(w) { 876 | message("For cell type ", cell_type, ": ", w) 877 | invokeRestart("muffleWarning") 878 | }), error = function(e) { 879 | message("For cell type ", cell_type, ": ", e) 880 | return(NULL) 881 | }) 882 | 883 | return(output) 884 | 885 | } 886 | 887 | # simple wrapper to apply a function to all cell types in merged data 888 | applyCellTypes <- function(merged, .fun, ..., cell_types_col = "ExperimentCellType", 889 | combined = TRUE, remove_failed = TRUE) { 890 | 891 | # get all cell types in merged data 892 | cell_types <- getCellTypes(merged, cell_types_col = cell_types_col, combined = combined) 893 | 894 | # apply function to all cell types 895 | output <- lapply(cell_types, FUN = applyCellType, merged = merged, .fun = .fun, ..., 896 | cell_types_col = cell_types_col) 897 | 898 | # remove output for any cell types where function failed 899 | if (remove_failed == TRUE) { 900 | output <- output[!vapply(output, FUN = is.null, FUN.VALUE = logical(1))] 901 | } 902 | 903 | return(output) 904 | 905 | } 906 | 907 | # plotting functions ------------------------------------------------------------------------------- 908 | 909 | # make scatter plots of two columns (e.g. 
907 | # plotting functions -------------------------------------------------------------------------------
908 | 
909 | # make scatter plots of two columns (e.g. effect size vs predictor score) across all predictors
910 | makeScatterPlots <- function(df, x_col, y_col, color_col = NULL, x_lab = NULL, y_lab = NULL,
911 |                              title = NULL, colors = NULL, pred_names_col = "pred_uid",
912 |                              point_size = 2, text_size = 13, alpha_value = 1, ncol = NULL,
913 |                              nrow = NULL) {
914 | 
915 |   # create labels and title based on input parameters
916 |   if (is.null(x_lab)) x_lab <- x_col
917 |   if (is.null(y_lab)) y_lab <- y_col
918 |   if (is.null(title)) title <- paste(x_col, "vs", y_col)
919 | 
920 |   # convert column names to symbols for ggplot
921 |   x_col <- sym(x_col)
922 |   y_col <- sym(y_col)
923 |   if (!is.null(color_col)) color_col <- sym(color_col)
924 | 
925 |   # plot x_col vs y_col across predictors
926 |   p <- ggplot(df, aes(x = !!x_col, y = !!y_col, color = !!color_col)) +
927 |     facet_wrap(as.formula(paste("~", pred_names_col)), scales = "free", ncol = ncol, nrow = nrow) +
928 |     geom_point(size = point_size, alpha = alpha_value) +
929 |     labs(title = title, x = x_lab, y = y_lab) +
930 |     theme_bw() +
931 |     theme(legend.position = "bottom", text = element_text(size = text_size))
932 | 
933 |   # set colors if provided
934 |   if (!is.null(colors)) {
935 |     p <- p + scale_color_manual(values = colors)
936 |   }
937 | 
938 |   return(p)
939 | 
940 | }
941 | 
942 | # save a list of plots to output files (... can be any other argument for ggsave)
943 | savePlotList <- function(plot_list, basename, path = ".", ...) {
944 | 
945 |   # output filenames for plots
946 |   outfiles <- paste(rep(path, times = length(plot_list)), names(plot_list),
947 |                     rep(basename, times = length(plot_list)), sep = "/")
948 | 
949 |   # create required directories if needed
950 |   for (i in dirname(outfiles)) {
951 |     dir.create(i, recursive = TRUE, showWarnings = FALSE)
952 |   }
953 | 
954 |   # save list of plots to output files
955 |   invisible(mapply(FUN = ggsave, outfiles, plot_list, MoreArgs = list(limitsize = FALSE, ...)))
956 | 
957 | }
958 | 
959 | # print a plot list with one tab per plot
960 | printTabbedPlots <- function(plots, section_level = "#", plot_function = plot) {
961 |   for (i in names(plots)) {
962 |     cat(paste0(section_level, "#"), i, '{.unlisted .unnumbered}', '\n', '
', '\n')
963 |     plot_function(plots[[i]])
964 |     cat('\n', '
', '\n\n')
965 |   }
966 |   cat(section_level, "{.unlisted .unnumbered}")
967 | }
968 | 
969 | # other helper functions ---------------------------------------------------------------------------
970 | 
971 | # add label for each pair based on whether it's significant and activates or represses its target
972 | labelPairs <- function(df, sig_col = "Significant") {
973 | 
974 |   df$scatterplot_color <- with(df,
975 |     ifelse(get(sig_col) == FALSE,
976 |            yes = "Not Significant",
977 |            no = ifelse(EffectSize > 0,
978 |                        yes = "Repressive",
979 |                        no = "Activating")
980 |     )
981 |   )
982 | 
983 |   return(df)
984 | 
985 | }
986 | 
987 | # convert a list of ROCR performance objects into a table and calculate F1 metric
988 | pr2df <- function(pr, calc_f1 = TRUE) {
989 | 
990 |   # function to convert one performance object to a table
991 |   convert_pr2df <- function(this_pr) {
992 |     df <- as.data.frame(list(
993 |       alpha = this_pr@alpha.values[[1]],
994 |       precision = this_pr@y.values[[1]],
995 |       recall = this_pr@x.values[[1]]
996 |     ))
997 |     return(df)
998 |   }
999 | 
1000 |   # apply to input list
1001 |   pr_list <- lapply(pr, convert_pr2df)
1002 | 
1003 |   # convert list of tables into one table
1004 |   pr_df <- rbindlist(pr_list, idcol = "pred_uid")
1005 | 
1006 |   # calculate F1 metric (harmonic mean of precision and recall) if specified
1007 |   if (calc_f1 == TRUE) {
1008 |     pr_df$F1 <- with(pr_df, 2 / ((1 / precision) + (1 / recall)))
1009 |   }
1010 | 
1011 |   return(pr_df)
1012 | 
1013 | }
1014 | 
1015 | # try to compute AUC (trapezoidal integration over non-NA points)
1016 | computeAUC <- function(x_vals, y_vals) {
1017 |   good.idx <- which(!is.na(x_vals) & !is.na(y_vals))
1018 |   if (length(good.idx) > 0) {
1019 |     auc <- trapz(x_vals[good.idx], y_vals[good.idx])
1020 |   } else {
1021 |     auc <- NA_real_
1022 |   }
1023 |   return(auc)
1024 | }
1025 | 
1026 | # try to compute maximum F1 from a prc table
1027 | computeMaxF1 <- function(pr_df) {
1028 |   if (any(!is.na(pr_df$F1))) {
1029 |     maxF1 <- max(pr_df$F1, na.rm = TRUE)
1030 |   } else {
1031 |     maxF1 <- NA_real_
1032 |   }
1033 |   return(maxF1)
1034 | }
1035 | 
1036 | # compute performance given a minimum sensitivity (recall)
1037 | computePerfGivenSensitivity <- function(pr_df, min_sensitivity) {
1038 | 
1039 |   # get required values from pr_df
1040 |   prec <- pr_df$precision
1041 |   sens <- pr_df$recall
1042 |   alpha <- pr_df$alpha
1043 | 
1044 |   # get sensitivity value that is closest (at least) to minimum sensitivity
1045 |   cutoff_sens <- min(sens[sens >= min_sensitivity])
1046 | 
1047 |   # get indices of all sensitivities equal to that cutoff (can be more than 1)
1048 |   idx <- which(sens == cutoff_sens)
1049 | 
1050 |   # pick the one with the maximum precision
1051 |   idx2 <- idx[which.max(prec[idx])]
1052 | 
1053 |   # get cutoff (alpha), precision and sensitivity for that point on the PR curve
1054 |   alpha_at_sensitivity <- alpha[idx2]
1055 |   prec_at_sensitivity <- prec[idx2]
1056 |   sens_at_sensitivity <- sens[idx2]
1057 | 
1058 |   # create output data.frame row
1059 |   output <- data.frame(min_sensitivity = min_sensitivity,
1060 |                        alpha_at_min_sensitivity = alpha_at_sensitivity,
1061 |                        precision_at_min_sensitivity = prec_at_sensitivity,
1062 |                        sensitivity_at_min_sensitivity = sens_at_sensitivity,
1063 |                        stringsAsFactors = FALSE)
1064 | 
1065 |   return(output)
1066 | 
1067 | }
1068 | 
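# Example (illustrative sketch, not part of the workflow): computePerfGivenSensitivity() picks the
# point on the PR curve with the smallest recall >= min_sensitivity. With a small hypothetical PR
# table:
if (FALSE) {
  pr_example <- data.frame(alpha = c(0.9, 0.5, 0.2),
                           precision = c(1.0, 0.8, 0.5),
                           recall = c(0.2, 0.6, 1.0))
  computePerfGivenSensitivity(pr_example, min_sensitivity = 0.5)
  # returns the recall = 0.6 point: alpha 0.5, precision 0.8
}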
1069 | # compute performance given a specified alpha cutoff
1070 | computePerfGivenCutoff <- function(pr_df, alpha_cutoff) {
1071 | 
1072 |   # get sensitivity (recall), precision and alpha values
1073 |   sens <- pr_df$recall
1074 |   prec <- pr_df$precision
1075 |   alpha <- pr_df$alpha
1076 | 
1077 |   # set alpha cutoff to min alpha if it was NA
1078 |   if (is.na(alpha_cutoff)) alpha_cutoff <- min(alpha)
1079 | 
1080 |   # get index of highest alpha value that is greater than or equal to alpha_cutoff
1081 |   idx <- sum(alpha >= alpha_cutoff)
1082 | 
1083 |   # get sensitivity and precision at that cutoff
1084 |   sens_at_cutoff <- sens[idx]
1085 |   prec_at_cutoff <- prec[idx]
1086 |   alpha_at_cutoff <- alpha[idx]
1087 | 
1088 |   # create output data.frame row
1089 |   output <- data.frame(alpha_cutoff = alpha_cutoff,
1090 |                        alpha_at_cutoff = alpha_at_cutoff,
1091 |                        sensitivity_at_cutoff = sens_at_cutoff,
1092 |                        precision_at_cutoff = prec_at_cutoff,
1093 |                        stringsAsFactors = FALSE)
1094 | 
1095 |   return(output)
1096 | 
1097 | }
1098 | 
1099 | # assign prediction class labels (TP, FP, TN, FN)
1100 | addPredictionClassLabels <- function(df, perf_summary, pos_col = "Regulated") {
1101 | 
1102 |   # add alpha thresholds to df
1103 |   df <- perf_summary %>%
1104 |     select(cell_type, pred_uid, alpha_cutoff, inverse_predictor) %>%
1105 |     left_join(x = df, y = ., by = c("ExperimentCellType" = "cell_type", "pred_uid"))
1106 | 
1107 |   # invert inverse predictor values and generate labels (TP, FP, TN, FN)
1108 |   pos_col <- sym(pos_col)  ## create symbol from column name for tidy evaluation
1109 |   df <- df %>%
1110 |     mutate(pred_value_inv = if_else(inverse_predictor == TRUE, true = pred_value * -1,
1111 |                                     false = pred_value)) %>%
1112 |     mutate(pred_class = case_when(
1113 |       pred_value_inv > alpha_cutoff & !!pos_col == TRUE ~ "TP",
1114 |       pred_value_inv > alpha_cutoff & !!pos_col == FALSE ~ "FP",
1115 |       pred_value_inv <= alpha_cutoff & !!pos_col == TRUE ~ "FN",
1116 |       pred_value_inv <= alpha_cutoff & !!pos_col == FALSE ~ "TN"
1117 |     )) %>%
1118 |     as.data.table()
1119 | 
1120 |   return(df)
1121 | 
1122 | }
1123 | 
1124 | # DEPRECATED: assign prediction class labels for one predictor
1125 | # addOneLabel <- function(df, cutoff, score_col, pos_col) {
1126 | #
1127 | #   label_name <- paste0(score_col, ".pred.class")
1128 | #   df[, label_name] <- "NA"
1129 | #
1130 | #   df[which(!is.na(df[, ..score_col]) & df[, ..score_col] > cutoff & df[, ..pos_col]), label_name] <- "TP"
1131 | #   df[which(!is.na(df[, ..score_col]) & df[, ..score_col] <= cutoff & !df[, ..pos_col]), label_name] <- "TN"
1132 | #   df[which(!is.na(df[, ..score_col]) & df[, ..score_col] > cutoff & !df[, ..pos_col]), label_name] <- "FP"
1133 | #   df[which(!is.na(df[, ..score_col]) & df[, ..score_col] <= cutoff & df[, ..pos_col]), label_name] <- "FN"
1134 | #
1135 | #   return(df)
1136 | # }
1137 | 
1138 | # compute the fraction of experimental positives
1139 | calcPctPos <- function(df, pos_col = "Regulated") {
1140 |   mean(df[[pos_col]])
1141 | }
1142 | 
1143 | # compute the number of unique experimental positives
1144 | calcNPos <- function(df, pos_col = "Regulated") {
1145 | 
1146 |   expt_pairs <- df %>%
1147 |     select(chrom, chromStart, chromEnd, measuredGeneSymbol, all_of(pos_col)) %>%
1148 |     distinct()
1149 | 
1150 |   sum(expt_pairs[[pos_col]])
1151 | 
1152 | }
1153 | 
1154 | # get precision at a specific sensitivity (recall)
1155 | getPrecisionAtRecall <- function(pr_df, col_list, min_sensitivity) {
1156 | 
1157 |   thresholds <- data.frame(matrix(ncol = 3, nrow = length(col_list)))
1158 |   colnames(thresholds) <- c("precision", "recall", "pred_col")
1159 |   thresholds$pred_col <- col_list
1160 |   thresholds$recall <- min_sensitivity
1161 | 
1162 |   prec <- lapply(col_list, function(s) {
1163 |     recall <- as.numeric(unlist(pr_df[pr_df$pred_col == s, "recall"]))
1164 |     precision <- as.numeric(unlist(pr_df[pr_df$pred_col == s, "precision"]))
1165 |     approxfun(x = recall, y = precision)(min_sensitivity)
1166 |   })
1167 | 
1168 |   thresholds$precision <- as.numeric(unlist(prec))
1169 |   return(thresholds)
1170 | 
1171 | }
1172 | 
1173 | # get the number of facet rows and columns from a ggplot object
1174 | get_row_col <- function(p) {
1175 |   n <- length(unique(ggplot_build(p)$data[[1]]$PANEL))
1176 |   par <- ggplot_build(p)$layout$facet$params
1177 |   wrap_dims(n, par$nrow, par$ncol)
1178 | }
1179 | 
1180 | # get default alpha values for all predictors in pred_config that have alpha == NA
1181 | getDefaultAlpha <- function(pred_config, merged) {
1182 | 
1183 |   # get predictors with missing alpha
1184 |   missing_alpha <- filter(pred_config, is.na(alpha))
1185 | 
1186 |   # get default alpha for these (if there are any)
1187 |   if (nrow(missing_alpha) > 0) {
1188 |     alpha <- mapply(FUN = get_alpha_min, predictor = missing_alpha$pred_uid,
1189 |                     inverse = missing_alpha$inverse_predictor, MoreArgs = list(merged = merged),
1190 |                     SIMPLIFY = FALSE)
1191 |     alpha <- unlist(alpha)
1192 | 
1193 |     # set alpha in pred_config to default alpha for these predictors
1194 |     pred_config$alpha <- replace(pred_config$alpha, list = is.na(pred_config$alpha), values = alpha)
1195 | 
1196 |   }
1197 | 
1198 |   return(pred_config)
1199 | 
1200 | }
1201 | 
1202 | # get minimum (or maximum) predictor value from merged data for default alpha values
1203 | get_alpha_min <- function(merged, predictor, inverse = FALSE) {
1204 |   merged %>%
1205 |     filter(pred_uid == predictor, Prediction == 1) %>%
1206 |     summarize(alpha = if_else(inverse == TRUE, max(pred_value), min(pred_value))) %>%
1207 |     pull(alpha)
1208 | }
1209 | 
1210 | ## DEPRECATED ======================================================================================
1211 | 
1212 | # create performance summary table for all predictors in a PR table
1213 | makePRSummaryTable <- function(pr_df, pred_config, min_sensitivity = 0.7) {
1214 | 
1215 |   # remove any boolean predictors since the following metrics don't make sense for them
1216 |   bool_preds <- pull(filter(pred_config, boolean == TRUE), pred_uid)
1217 |   pr_df <- filter(pr_df, !pred_uid %in% bool_preds)
1218 | 
1219 |   # compute performance summary for each predictor
1220 |   perf_summary <- pr_df %>%
1221 |     group_split(pred_uid) %>%
1222 |     lapply(calcPerfSummaryOnePred, pred_config = pred_config, min_sensitivity = min_sensitivity) %>%
1223 |     bind_rows()
1224 | 
1225 |   # add predictor information from pred_config
1226 |   perf_summary <- pred_config %>%
1227 |     select(pred_uid, pred_id, pred_col, inverse_predictor, pred_name_long) %>%
1228 |     left_join(x = perf_summary, y = ., by = "pred_uid")
1229 | 
1230 |   # set alpha to alpha at minimum sensitivity if threshold was set to NA
1231 |   perf_summary <- perf_summary %>%
1232 |     left_join(select(pred_config, pred_uid, alpha), by = "pred_uid") %>%
1233 |     mutate(alpha_cutoff = if_else(is.na(alpha), alpha_at_min_sensitivity, alpha_cutoff),
1234 |            alpha_at_cutoff = if_else(is.na(alpha), alpha_at_min_sensitivity, alpha_at_cutoff),
1235 |            sensitivity_at_cutoff = if_else(is.na(alpha), sensitivity_at_min_sensitivity, sensitivity_at_cutoff),
1236 |            precision_at_cutoff = if_else(is.na(alpha), precision_at_min_sensitivity, precision_at_cutoff)) %>%
1237 |     select(-alpha)
1238 | 
1239 |   return(perf_summary)
1240 | 
1241 | }
1242 | 
1243 | # create performance summary for one predictor
1244 | calcPerfSummaryOnePred <- function(pr_df, pred_config, min_sensitivity) {
1245 | 
1246 |   # check that pr_df contains data on only one predictor
1247 |   predictor <- unique(pr_df$pred_uid)
1248 |   if (length(predictor) > 1) {
1249 |     stop("Input pr_df contains data for more than one unique predictor.", call. = FALSE)
1250 |   }
1251 | 
1252 |   # make sure that input is sorted according to recall and precision
1253 |   pr_df <- arrange(pr_df, recall, desc(precision))
1254 | 
1255 |   # compute AUC and maximum F1
1256 |   # the head() call here removes the last element of the vector.
1257 |   # The point is that performance objects produced by ROCR always include a Recall = 100% point even
1258 |   # if the predictor cannot achieve a recall of 100%. This results in a straight line ending at
1259 |   # (1,0) on the PR curve. This should not be included in the AUC computation.
1260 |   auprc <- pr_df %>%
1261 |     head(-1) %>%
1262 |     summarize(AUPRC = computeAUC(x_vals = recall, y_vals = precision),
1263 |               max_F1 = computeMaxF1(.))
1264 | 
1265 |   # compute performance at min sensitivity
1266 |   perf_min_sens <- computePerfGivenSensitivity(pr_df, min_sensitivity = min_sensitivity)
1267 | 
1268 |   # get cutoff specified in pred_config for given predictor
1269 |   cutoff <- pred_config %>%
1270 |     filter(pred_uid == predictor) %>%
1271 |     mutate(alpha = if_else(inverse_predictor == TRUE, true = alpha * -1, false = alpha)) %>%
1272 |     pull(alpha)
1273 | 
1274 |   # compute performance at specified alpha cutoff
1275 |   perf_alpha_cutoff <- computePerfGivenCutoff(pr_df, alpha_cutoff = cutoff)
1276 | 
1277 |   # create output table
1278 |   perf_summary <- data.frame(pred_uid = predictor, auprc, perf_min_sens, perf_alpha_cutoff)
1279 | 
1280 |   # if AUC couldn't be computed, set all performance metrics to NA, since this indicates a PRC
1281 |   # without any real points except the starting and end points added by the ROCR package
1282 |   # TODO: find better solution for this
1283 |   if (all(is.na(auprc))) {
1284 |     perf_cols <- !colnames(perf_summary) %in% c("pred_uid", "min_sensitivity", "alpha_cutoff")
1285 |     perf_summary[, perf_cols] <- NA_real_
1286 |   }
1287 | 
1288 |   return(perf_summary)
1289 | 
1290 | }
1291 | 
1292 | # convert distance to TSS baseline predictor from bp to kb (TODO: REMOVE IF NOT USED)
1293 | convert_dist_to_kb <- function(merged) {
1294 |   merged %>%
1295 |     mutate(pred_value = if_else(pred_uid == "baseline.distToTSS", true = pred_value / 1000,
1296 |                                 false = pred_value)) %>%
1297 |     mutate(pred_name_long = if_else(pred_uid == "baseline.distToTSS",
1298 |                                     true = "Distance to TSS (kb)", false = pred_name_long))
1299 | }
1300 | 
--------------------------------------------------------------------------------
/workflow/scripts/crisprComparisonSimplePredictors.R:
--------------------------------------------------------------------------------
1 | ## A couple of simple baseline predictors
2 | 
3 | # function to compute specified baseline predictors
4 | computeBaselinePreds <- function(expt, tss_annot = NULL, gene_annot = NULL, expressed_genes = NULL,
5 |                                  preds = c("distToTSS", "distToGene", "nearestTSS", "nearestGene",
6 |                                            "within100kbTSS", "within100kbGene", "nearestExprTSS",
7 |                                            "nearestExprGene", "within100kbExprTSS",
8 |                                            "within100kbExprGene")) {
9 | 
10 |   # parse preds argument
11 |   preds <- match.arg(preds, several.ok = TRUE)
12 | 
13 |   # get categories of specified predictors to check input data
14 |   tss_preds <- intersect(preds, c("distToTSS", "nearestTSS", "nearestExprTSS", "within100kbTSS",
15 |                                   "within100kbExprTSS"))
16 |   gene_preds <- intersect(preds, c("distToGene", "nearestGene", "nearestExprGene",
17 |                                    "within100kbGene", "within100kbExprGene"))
18 |   expr_preds <- intersect(preds, c("nearestExprTSS", "nearestExprGene", "within100kbExprTSS",
19 |                                    "within100kbExprGene"))
20 | 
21 |   # abort if required input data is not provided for all specified predictors
22 |   if (length(tss_preds) > 0 && is.null(tss_annot)) {
23 |     stop("tss_annot required to compute: ", paste(tss_preds, collapse = ", "), call. = FALSE)
24 |   }
25 |   if (length(gene_preds) > 0 && is.null(gene_annot)) {
26 |     stop("gene_annot required to compute: ", paste(gene_preds, collapse = ", "), call. = FALSE)
27 |   }
28 |   if (length(expr_preds) > 0 && is.null(expressed_genes)) {
29 |     stop("expressed_genes required to compute: ", paste(expr_preds, collapse = ", "), call. = FALSE)
30 |   }
31 | 
32 |   # compute baseline predictors for each cell type in experimental data
33 |   cell_types <- unique(expt$CellType)
34 |   baseline_preds <- lapply(cell_types, FUN = compute_baseline_predictors_cell_type, preds = preds,
35 |                            expt = expt, tss_annot = tss_annot, gene_annot = gene_annot,
36 |                            expr_genes = expressed_genes)
37 | 
38 |   # combine into one table and reformat for output
39 |   baseline_preds <- rbindlist(baseline_preds)
40 |   colnames(baseline_preds)[colnames(baseline_preds) == "CellType"] <- "ExperimentCellType"
41 | 
42 |   return(baseline_preds)
43 | 
44 | }
45 | 
46 | ## HELPER FUNCTIONS ================================================================================
47 | 
48 | # compute all baseline predictors for one cell type (required for 'expressed genes' predictors)
49 | compute_baseline_predictors_cell_type <- function(x, preds, expt, tss_annot, gene_annot,
50 |                                                   expr_genes) {
51 | 
52 |   # extract experimental data for given cell type
53 |   expt <- expt[CellType == x, ]
54 | 
55 |   # extract expressed gene names if expr_genes is provided
56 |   if (!is.null(expr_genes)) {
57 |     expr_genes <- expr_genes[cell_type == x & expressed == TRUE, ][["gene"]]
58 |   }
59 | 
60 |   # compute baseline predictors
61 |   baseline_preds <- lapply(preds, FUN = compute_baseline_predictor, expt = expt,
62 |                            tss_annot = tss_annot, gene_annot = gene_annot, expr_genes = expr_genes)
63 | 
64 |   # combine into one table
65 |   baseline_preds <- rbindlist(baseline_preds)
66 | 
67 |   return(baseline_preds)
68 | 
69 | }
70 | 
71 | # function to compute one baseline predictor
72 | compute_baseline_predictor <- function(pred, expt, tss_annot, gene_annot, expr_genes) {
73 | 
74 |   # compute baseline predictors
75 |   output <- switch(
76 |     pred,
77 |     "distToTSS" = computeDistToGene(expt, annot = tss_annot, name = pred, fix_annot = "center"),
78 |     "distToGene" = computeDistToGene(expt, annot = gene_annot, name = pred, fix_annot = "none"),
79 |     "nearestTSS" = nearestFeaturePred(expt, features = tss_annot, name = pred),
80 |     "nearestGene" = nearestFeaturePred(expt, features = gene_annot, name = pred),
81 |     "within100kbTSS" = withinDistFeature(expt, features = tss_annot, dist = 1e+05, name = pred),
82 |     "within100kbGene" = withinDistFeature(expt, features = gene_annot, dist = 1e+05, name = pred),
83 |     "nearestExprTSS" = nearestFeaturePred(
84 |       expt, features = tss_annot[tss_annot$gene %in% expr_genes, ], name = pred),
85 |     "nearestExprGene" = nearestFeaturePred(
86 |       expt, features = gene_annot[gene_annot$gene %in% expr_genes, ], name = pred),
87 |     "within100kbExprTSS" = withinDistFeature(
88 |       expt, features = tss_annot[tss_annot$gene %in% expr_genes, ], dist = 1e+05, name = pred),
89 |     "within100kbExprGene" = withinDistFeature(
90 |       expt, features = gene_annot[gene_annot$gene %in% expr_genes, ], dist = 1e+05, name = pred)
91 |   )
92 | 
93 |   return(output)
94 | 
95 | }
96 | 
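# Example (illustrative sketch, not called directly by the workflow): computing a subset of
# baseline predictors for a hypothetical experimental data table `expt`, using the TSS and gene
# annotations that mergePredictionsWithExperiment.R loads from the configured universe files.
if (FALSE) {
  baseline <- computeBaselinePreds(expt, tss_annot = tss_annot, gene_annot = gene_annot,
                                   preds = c("distToTSS", "nearestTSS"))
}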
97 | # compute baseline distance to any gene annotation (TSS or gene)
98 | computeDistToGene <- function(expt, annot, name, fix_annot = c("none", "center", "start", "end")) {
99 | 
100 |   # parse fix argument
101 |   fix_annot <- match.arg(fix_annot)
102 | 
103 |   # assume first three columns of annot are chr, start, end and convert annot to GRanges object
104 |   colnames(annot)[1:3] <- c("chr", "start", "end")
105 |   annot <- makeGRangesFromDataFrame(annot, keep.extra.columns = TRUE)
106 |   names(annot) <- annot$gene
107 | 
108 |   # convert gene annotations to 1bp coordinates if specified by fix_annot (e.g. for distance to TSS)
109 |   if (fix_annot != "none") {
110 |     annot <- resize(annot, width = 1, fix = fix_annot)
111 |   }
112 | 
113 |   # GRanges object for E-G pairs in expt using centers of candidate enhancers as coordinates
114 |   eg_pairs <- makeGRangesFromDataFrame(expt, seqnames.field = "chrom", start.field = "chromStart",
115 |                                        end.field = "chromEnd", keep.extra.columns = TRUE)
116 |   eg_pairs <- resize(eg_pairs, width = 1, fix = "center")
117 | 
118 |   # combine into a paired GRanges object containing enhancer and TSS/gene annotations for each pair
119 |   eg_pairs <- Pairs(first = eg_pairs, second = annot[expt$measuredGeneSymbol])
120 | 
121 |   # compute distance between enhancers and provided TSS / gene annotations
122 |   distance_pred <- distance(eg_pairs)
123 | 
124 |   # create output table
125 |   output <- data.table(
126 |     expt,
127 |     PredictionCellType = NA_character_,
128 |     pred_elements = with(expt, paste0(chrom, ":", chromStart, "-", chromEnd)),
129 |     pred_uid = paste("baseline", name, sep = "."),
130 |     pred_id = "baseline",
131 |     pred_col = name,
132 |     pred_value = distance_pred,
133 |     Prediction = 1
134 |   )
135 | 
136 |   return(output)
137 | 
138 | }
139 | 
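# Example (illustrative sketch, assuming GenomicRanges is attached): the distance() call above
# returns 0 for overlapping or adjacent ranges and the gap size otherwise. With a hypothetical
# enhancer center and gene TSS:
if (FALSE) {
  enh <- GRanges("chr1", IRanges(start = 1500, end = 1500))  # candidate enhancer center
  tss <- GRanges("chr1", IRanges(start = 2001, end = 2001))  # gene TSS
  distance(enh, tss)  # 500 bp gap between the two positions
}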
140 | # nearest genomic feature (e.g. TSS or gene body) to the candidate enhancer. features needs to be a
141 | # data frame in bed style format, i.e. chr, start, end, name, etc.
142 | nearestFeaturePred <- function(expt, features, name) {
143 | 
144 |   # initial column names in expt
145 |   expt_cols <- colnames(expt)
146 | 
147 |   # create GRanges object containing feature coordinates
148 |   features <- features[, 1:4]
149 |   colnames(features) <- c("chr", "start", "stop", "name")
150 |   features <- makeGRangesFromDataFrame(features, keep.extra.columns = TRUE)
151 | 
152 |   # get closest TSS to every CRE in expt
153 |   expt <- nearest_genomic_feature(expt, features = features)
154 | 
155 |   # create predictor whether the gene of each pair is the closest TSS
156 |   output <- data.table(
157 |     expt[, ..expt_cols],
158 |     PredictionCellType = NA_character_,
159 |     pred_elements = with(expt, paste0(chrom, ":", chromStart, "-", chromEnd)),
160 |     pred_uid = paste("baseline", name, sep = "."),
161 |     pred_id = "baseline",
162 |     pred_col = name,
163 |     pred_value = as.numeric(expt$nearest_feature == expt$measuredGeneSymbol),
164 |     Prediction = 1
165 |   )
166 | 
167 | }
168 | 
169 | # find the closest genomic feature for every enhancer in experimental data
170 | nearest_genomic_feature <- function(expt, features, ignore.strand = TRUE) {
171 | 
172 |   # add unique identifier for every E-G pair in expt
173 |   expt$uid <- paste0("eg_pair_", seq_len(nrow(expt)))
174 | 
175 |   # create genomic ranges for CREs
176 |   cres <- makeGRangesFromDataFrame(expt, seqnames.field = "chrom", start.field = "chromStart",
177 |                                    end.field = "chromEnd", keep.extra.columns = TRUE)
178 | 
179 |   # find nearest feature for every CRE
180 |   nearest_feat <- nearest(cres, subject = features, ignore.strand = ignore.strand)
181 | 
182 |   # get name of nearest feature for each CRE
183 |   nearest_feat_names <- data.frame(uid = cres$uid, nearest_feature = features[nearest_feat]$name)
184 | 
185 |   # add this to expt data
186 |   output <- merge(expt, nearest_feat_names, by = "uid", all.x = TRUE)
187 | 
188 |   return(output[, -"uid"])
189 | 
190 | }
191 | 
192 | # candidate enhancers within a certain distance from gene TSS or body baseline predictor
193 | withinDistFeature <- function(expt, features, dist, name) {
194 | 
195 |   # initial column names in expt
196 |   expt_cols <- colnames(expt)
197 | 
198 |   # create GRanges object containing feature coordinates
199 |   features <- features[, 1:4]
200 |   colnames(features) <- c("chr", "start", "stop", "name")
201 |   features <- makeGRangesFromDataFrame(features, keep.extra.columns = TRUE)
202 | 
203 |   # create unique CRE identifiers
204 |   expt$cre_id <- with(expt, paste0(chrom, ":", chromStart, "-", chromEnd))
205 | 
206 |   # get unique CREs and create GRanges object containing CRE coordinates
207 |   cres <- unique(expt[, c("chrom", "chromStart", "chromEnd", "cre_id")])
208 |   cres <- makeGRangesFromDataFrame(cres, seqnames.field = "chrom", start.field = "chromStart",
209 |                                    end.field = "chromEnd", keep.extra.columns = TRUE)
210 | 
211 |   # create windows of +/- dist around CRE centers for overlapping
212 |   cres <- resize(cres, width = dist * 2, fix = "center")
213 | 
214 |   # find all features within the specified distance from each CRE
215 |   ovl <- findOverlaps(cres, features)
216 | 
217 |   # get names of CREs and feature pairs within specified distance
218 |   within_dist_pairs <- data.table(cre_id = cres[queryHits(ovl)]$cre_id,
219 |                                   measuredGeneSymbol = features[subjectHits(ovl)]$name,
220 |                                   pred_value = 1)
221 | 
222 |   # add this to expt data
223 |   expt <- merge(expt, within_dist_pairs, by = c("cre_id", "measuredGeneSymbol"), all.x = TRUE)
224 |   expt$pred_value[is.na(expt$pred_value)] <- 0
225 | 
226 |   # reformat for output
227 |   output <- data.table(
228 |     expt[, ..expt_cols],
229 |     PredictionCellType = NA_character_,
230 |     pred_elements = with(expt, paste0(chrom, ":", chromStart, "-", chromEnd)),
231 |     pred_uid = paste("baseline", name, sep = "."),
232 |     pred_id = "baseline",
233 |     pred_col = name,
234 |     pred_value = expt$pred_value,
235 |     Prediction = 1
236 |   )
237 | 
238 | }
239 | 
--------------------------------------------------------------------------------
/workflow/scripts/mergePredictionsWithExperiment.R:
--------------------------------------------------------------------------------
1 | ## Merge predictions with experimental data for downstream comparisons of predictions with CRISPR
2 | ## CRE perturbation data
3 | 
4 | # save.image("merge.rda")
5 | # stop()
6 | 
7 | # open log file to collect all messages, warnings and errors
8 | log <- file(snakemake@log[[1]], open = "wt")
9 | sink(log)
10 | sink(log, type = "message")
11 | 
12 | # attach required packages and functions
13 | suppressPackageStartupMessages({
14 |   source(file.path(snakemake@scriptdir, "crisprComparisonLoadInputData.R"))
15 |   source(file.path(snakemake@scriptdir, "crisprComparisonMergeFunctions.R"))
16 |   source(file.path(snakemake@scriptdir, "crisprComparisonSimplePredictors.R"))
17 | })
18 | 
19 | ## load data ---------------------------------------------------------------------------------------
20 | 
21 | # directory for all output
22 | outdir <- dirname(snakemake@output$merged)
23 | 
24 | # config entry for this comparison is used to load named list of input files
25 | config <- snakemake@config$comparisons[[snakemake@wildcards$comparison]]
26 | 
27 | # load pred_config file
28 | include_col <- ifelse(is.null(snakemake@params$include_col), "include", snakemake@params$include_col)
29 | pred_config <- importPredConfig(snakemake@input$pred_config,
30 |                                 expr = !is.null(snakemake@input$expressed_genes),
31 |                                 include_col = include_col,
32 |                                 filter = snakemake@params$filter_include_col)
33 | 
34 | # load experimental data
35 | expt <- loadExpt(snakemake@input$experiment, showProgress = FALSE)
36 | 
37 | # load tss and gene universe files
38 | tss_annot <- fread(snakemake@input$tss_universe, select = 1:6,
39 |                    col.names = c("chrTSS", "startTSS", "endTSS", "gene", "score", "strandTSS"))
40 | gene_annot <- fread(snakemake@input$gene_universe, select = 1:6,
41 |                     col.names = c("chr", "start", "end", "gene", "score", "strand"))
42 | 
43 | # get file formats for all prediction files to load
44 | if (length(snakemake@params$pred_format) > 1) {
45 |   format <- unlist(config$pred_file_format)[names(config$pred)]
46 | } else {
47 |   format <- snakemake@params$pred_format
48 | }
49 | 
50 | # load all prediction files
51 | pred_list <- loadPredictions(config$pred, format = format, show_progress = FALSE)
52 | 
53 | # if specified, filter out any predictions where elements overlap annotated gene TSS
54 | if (snakemake@params$filter_tss == TRUE) {
55 |   tss_filt_file <- file.path(outdir, "filter_predictions_tss_results.txt")
56 |   pred_list <- filterPredictionsTSS(pred_list, tss_annot = tss_annot, summary_file = tss_filt_file)
57 | }
58 | 
59 | # combine files per predictor, if files for multiple cell types were provided
60 | pred_list <- lapply(pred_list, FUN = rbindlist)
61 | 
62 | # load optional cell mapping files if provided
63 | ct_map_files <- config$cell_type_mapping
64 | if (!is.null(ct_map_files)) {
65 |   cell_mappings <- lapply(ct_map_files, FUN = fread)
66 |   qcCellMapping(cell_mappings)
67 | } else {
68 |   cell_mappings <- list()
69 | }
70 | 
71 | # load expressed genes files if provided
72 | if (!is.null(snakemake@input$expressed_genes)) {
73 |   expressed_genes <- loadGeneExpr(snakemake@input$expressed_genes)
74 | } else {
75 |   expressed_genes <- NULL
76 | }
77 | 
78 | # QC pred_config file
79 | qcPredConfig(pred_config, pred_list = pred_list)
80 | 
81 | # QC predictions and experimental data
82 | pred_list <- qcPredictions(pred_list, pred_config = pred_config, one_tss = FALSE)
83 | expt <- qcExperiment(expt, pos_col = snakemake@params$pos_col, remove_na_pos = TRUE)
84 | 
85 | ## process input data ------------------------------------------------------------------------------
86 | 
87 | # filter experimental data for genes in gene universe
88 | missing_file <- file.path(outdir, "expt_missing_from_gene_universe.txt")
89 | expt <- filterExptGeneUniverse(expt, genes = tss_annot, missing_file = missing_file)
90 | 
91 | # add expression information to experimental data if specified
92 | if (!is.null(snakemake@input$expressed_genes)) {
93 |   expt <- addGeneExpression(expt, expressed_genes = expressed_genes)
94 | }
95 | 
96 | # cell type matching and filter predictions for cell types in experimental data
97 | message("Mapping cell types in predictions to cell types in experimental data")
98 | pred_list <- mapCellTypes(pred_list, cell_mappings = cell_mappings)
99 | pred_list <- lapply(pred_list, FUN = function(p) p[p$ExperimentCellType %in% expt$CellType, ])
100 | 
101 | # check whether bad cell type matching left no data for some predictors
102 | pred_rows <- vapply(pred_list, FUN = nrow, FUN.VALUE = integer(1))
103 | if (any(pred_rows == 0)) {
104 |   stop("No predictions left for ", paste(names(pred_rows[pred_rows == 0]), collapse = ", "),
105 |        " after cell type matching. Check that cell type mapping files are correct.", call. = FALSE)
106 | }
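# Example (illustrative sketch): a cell type mapping file relates cell type names in a predictions
# file to cell type names in the experimental data, so that e.g. predictions labeled "K562" can be
# matched to CRISPR data labeled "K562". The exact column layout follows the example file in
# resources/example/ABC_cell_type_mapping.txt; the column names below are hypothetical.
if (FALSE) {
  cell_mapping <- data.table(PredictionCellType = "K562", ExperimentCellType = "K562")
}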
107 | 
108 | ## overlap experimental data with predictions ------------------------------------------------------
109 | 
110 | # check if genes in experimental data are also found in predictions and write to file
111 | # TODO: make this per cell type
112 | genes_summary_file <- file.path(outdir, "experimental_genes_in_predictions.txt")
113 | checkExistenceOfExperimentalGenesInPredictions(expt, pred_list, summary_file = genes_summary_file)
114 | 
115 | # merge experimental data with predictions
116 | message("\nMerging experimental data and predictions:")
117 | merged <- combineAllExptPred(expt = expt,
118 |                              pred_list = pred_list,
119 |                              config = pred_config,
120 |                              outdir = outdir,
121 |                              fill_pred_na = TRUE)
122 | 
123 | ## compute baseline predictors ---------------------------------------------------------------------
124 | 
125 | # get all simple baseline predictors to compute
126 | baseline_pred_ids <- c("distToTSS", "distToGene", "nearestTSS", "nearestGene", "within100kbTSS")
127 | if (!is.null(expressed_genes)) {
128 |   baseline_pred_ids <- c(baseline_pred_ids,
129 |                          c("nearestExprTSS", "nearestExprGene", "within100kbExprTSS"))
130 | }
131 | 
132 | # only retain baseline predictors to include in benchmark
133 | baseline_pred_ids <- intersect(baseline_pred_ids, pred_config$pred_col)
134 | 
135 | # compute and add baseline predictors
136 | message("Computing baseline predictors:\n\t", paste(baseline_pred_ids, collapse = "\n\t"))
137 | baseline_preds <- computeBaselinePreds(expt, preds = baseline_pred_ids, tss_annot = tss_annot,
138 |                                        gene_annot = gene_annot, expressed_genes = expressed_genes)
139 | merged <- rbind(merged, baseline_preds)
140 | 
141 | ## write to file -----------------------------------------------------------------------------------
142 | 
143 | # write merged data to main output file
144 | fwrite(merged, file = snakemake@output$merged, sep = "\t", quote = FALSE, na = "NA")
145 | 
146 | message("\nAll done!")
147 | 
148 | # close log file connection
149 | sink()
150 | sink(type = "message")
151 | 
--------------------------------------------------------------------------------
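# Example (illustrative sketch): the merged output written above is a tab-delimited table with one
# row per tested E-G pair and predictor, including columns such as pred_uid, pred_value and
# Prediction, and can be inspected with data.table after a comparison has run.
if (FALSE) {
  merged <- fread(merged_file)  # merged_file: the path given by snakemake@output$merged (hypothetical name)
  table(merged$pred_uid)        # number of rows per predictor, e.g. baseline.distToTSS
}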