├── .gitignore
├── scripts
│   ├── findPeaks.sh
│   ├── convertBEDtoTagAlign.sh
│   ├── convertWigToBigWig.sh
│   ├── convertBEDPEtoTagAlign.sh
│   ├── getSignalTrack.sh
│   ├── averageSignalTrack.sh
│   ├── filterAndConvertBAMs_SE.sh
│   ├── filterAndConvertBAMs.sh
│   └── subsampleBEDPEs.sh
├── copyData.py
├── setup.py
├── runGMExperiments.py
├── dataNormalizer.py
├── README.md
├── diConstants.py
├── modelTemplates.py
├── evaluations.py
├── PRROC.R
├── dataset.py
├── models.py
└── prepData.py
/.gitignore: -------------------------------------------------------------------------------- 1 | bds.pid* 2 | *bed 3 | *txt 4 | .ipynb* 5 | *.pyc 6 | chipseq.bds* 7 | *.csv 8 | /logs 9 | .Rproj.user 10 | -------------------------------------------------------------------------------- /scripts/findPeaks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bds ${1}/chipseq.bds \ 4 | -out_dir ${2} \ 5 | -histone \ 6 | -tag1 ${3} \ 7 | -ctl_tag1 ${4} \ 8 | -callpeak macs2 \ 9 | -species ${5} \ 10 | -nth 2 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/convertBEDtoTagAlign.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BED file from SE reads 3 | # and outputs tagAlign 4 | 5 | BEDPath=$1 6 | tagAlignPath=$2 7 | 8 | awkProg=' 9 | BEGIN {OFS = "\t"} 10 | { 11 | printf "%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$6 12 | } 13 | ' 14 | 15 | awk -F'\t' "${awkProg}" ${BEDPath}| \ 16 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /scripts/convertWigToBigWig.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a wig file 3 | # Outputs a bigWig file 4 | # Then deletes the wig file 5 | 6 | wigPath=$1 7 | bigWigPath=$2 8 | chromSizesPath=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load ucsc_tools/3.0.9 12 | 13 | wigToBigWig ${wigPath} ${chromSizesPath} ${bigWigPath} 14 | 15 | rm ${wigPath} 16 | -------------------------------------------------------------------------------- /scripts/convertBEDPEtoTagAlign.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BEDPE file 3 | # and outputs tagAlign 4 | 5 | BEDPath=$1 6 | tagAlignPath=$2 7 | 8 | awkProg=' 9 | BEGIN {OFS = "\t"} 10 | { 11 | printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10 12 | } 13 | ' 14 | 15 | awk -F'\t' "${awkProg}" ${BEDPath}| \ 16 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /scripts/getSignalTrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Runs the BDS ChIP-seq pipeline on a given tagAlign file.
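# An example invocation, as a sketch (the paths below are illustrative only;
# the four positional arguments are the AQUAS install directory, the input
# tagAlign, the output directory, and the species):
#
#   bash scripts/getSignalTrack.sh ~/TF_chipseq_pipeline \
#       data/GM12878_H3K27AC.tagAlign.gz signal/GM12878_H3K27AC hg19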
3 | 4 | pipelineDir=$1 5 | tagAlignPath=$2 6 | outputDir=$3 7 | species=$4 8 | 9 | bds ${pipelineDir}/chipseq.bds \ 10 | -out_dir ${outputDir} \ 11 | -histone \ 12 | -input tag \ 13 | -final_stage xcor \ 14 | -tag1 ${tagAlignPath} \ 15 | -tag2bw \ 16 | -species ${species} \ 17 | -nth 2 18 | -------------------------------------------------------------------------------- /scripts/averageSignalTrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a bigWig file and BED file 3 | # Gets the average of the bigWig signal over the BED intervals 4 | # Outputs it to outputPath 5 | 6 | bigWigPath=$1 7 | BEDPath=$2 8 | outputPath=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load ucsc_tools/3.0.9 12 | 13 | # bigWigAverageOverBed has no option to output to stdout, so we need a temp file 14 | bigWigAverageOverBed ${bigWigPath} ${BEDPath} ${outputPath}.temp 15 | 16 | cut -f5 ${outputPath}.temp > ${outputPath} 17 | rm ${outputPath}.temp 18 | 19 | 20 | -------------------------------------------------------------------------------- /scripts/filterAndConvertBAMs_SE.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BAM file with single-end reads 3 | # Filters it for properly mapping reads above a certain MAPQ threshold 4 | # Returns BED 5 | 6 | BAMPath=$1 7 | BEDPath=$2 8 | mapQThreshold=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load samtools/1.2 12 | # module load bedtools/2.23.0 13 | 14 | samtools view -F 1804 -q ${mapQThreshold} -u ${BAMPath} | \ 15 | samtools sort -m 10000M -O bam -n -T ${BAMPath} - | \ 16 | samtools view -F 1804 -u - | \ 17 | bedtools bamtobed -i stdin > ${BEDPath} -------------------------------------------------------------------------------- /scripts/filterAndConvertBAMs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BAM file 3 | # Filters it for properly mapping reads above a certain MAPQ threshold 4 | # Returns BEDPE 5 | 6 | BAMPath=$1 7 | BEDPath=$2 8 | mapQThreshold=$3 9 | 10 | # . 
/etc/profile.d/modules.sh 11 | # module load samtools/1.2 12 | # module load bedtools/2.23.0 13 | 14 | samtools view -F 1804 -f 2 -q ${mapQThreshold} -u ${BAMPath} | \ 15 | samtools sort -m 10000M -O bam -n -T ${BAMPath} - | \ 16 | samtools fixmate -r -O 'bam' - - | \ 17 | samtools view -F 1804 -f 2 -u - | \ 18 | bedtools bamtobed -bedpe -i stdin > ${BEDPath} -------------------------------------------------------------------------------- /scripts/subsampleBEDPEs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BEDPE file 3 | # Subsamples it and outputs tagAlign 4 | # numSamplePairs is measured in pairs of reads 5 | 6 | get_seeded_random() 7 | { 8 | seed="$1" 9 | openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ 10 | </dev/zero 2>/dev/null 11 | } 12 | 13 | BEDPath=$1 14 | tagAlignPath=$2 15 | numSamplePairs=$3 16 | 17 | awkProg=' 18 | BEGIN {OFS = "\t"} 19 | { 20 | printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10 21 | } 22 | ' 23 | 24 | shuf -n ${numSamplePairs} --random-source=<(get_seeded_random 42) ${BEDPath} | \ 25 | awk -F'\t' "${awkProg}" | \ 26 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /copyData.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import call 3 | 4 | import diConstants as di 5 | 6 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/hg19_blacklist.bed', shell=True) 7 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/hg19.chrom.sizes', shell=True) 8 | call('mv hg19_blacklist.bed %s' % di.HG19_BLACKLIST_FILE, shell=True) 9 | call('mv hg19.chrom.sizes %s' % di.HG19_CHROM_SIZES_PATH, shell=True) 10 | 11 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/low_seq_depth_processed_files.tar.gz', shell=True) 12 | call('tar -xvf low_seq_depth_processed_files.tar.gz', shell=True) 13 | call('mv *metadata *npz %s' % di.BASE_ROOT, shell=True) 14 | call('mv *gappedPeaks* %s' % di.PEAK_GAPPED_DIR, shell=True) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import diConstants as di 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from keras.models import Sequential, model_from_json 7 | from scipy.stats.stats import pearsonr 8 | from sklearn.metrics import precision_score 9 | import rpy2.robjects as robjects 10 | from rpy2.robjects.packages import STAP 11 | import h5py 12 | 13 | from subprocess import call 14 | 15 | call('mkdir %s' % di.DATA_ROOT, shell=True) 16 | call('mkdir %s' % di.MODELS_ROOT, shell=True) 17 | call('mkdir %s' % di.RESULTS_ROOT, shell=True) 18 | 19 | call('mkdir %s' % di.RAW_ROOT, shell=True) 20 | call('mkdir %s' % di.MERGED_ROOT, shell=True) 21 | call('mkdir %s' % di.SUBSAMPLED_ROOT, shell=True) 22 | call('mkdir %s' % di.BIGWIGS_ROOT, shell=True) 23 | call('mkdir %s' % di.INTERVALS_ROOT, shell=True) 24 | call('mkdir %s' % di.NUMPY_ROOT, shell=True) 25 | call('mkdir %s' % di.PEAK_BASE_DIR, shell=True) 26 | call('mkdir -p %s' % di.PEAK_GAPPED_DIR, shell=True) 27 | call('mkdir %s' % di.DATASETS_ROOT, shell=True) 28 | call('mkdir %s' % di.BASE_ROOT, shell=True) 29 | call('mkdir %s' % di.BASE_BIGWIG_ROOT, shell=True) 30 | call('mkdir %s' % di.SEQ_ROOT, shell=True) 31 | call('mkdir %s' % di.WEIGHTS_ROOT, shell=True) 32 | call('mkdir
%s' % di.LOSS_ROOT, shell=True) 33 | call('mkdir %s' % di.HIST_ROOT, shell=True) 34 | call('mkdir %s' % di.EVAL_ROOT, shell=True) 35 | -------------------------------------------------------------------------------- /runGMExperiments.py: -------------------------------------------------------------------------------- 1 | # Run on different "full" depths 2 | # Re-run roadmap experiments 3 | # Map all scRNA stuff 4 | 5 | import os 6 | import copy 7 | import tempfile 8 | import json 9 | from subprocess import call 10 | from diConstants import (HG19_ALL_CHROMS, MM9_ALL_CHROMS, 11 | HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, 12 | VALID_CHROMS, TEST_CHROMS) 13 | 14 | import models 15 | import modelTemplates 16 | 17 | def run_model(model_params): 18 | m = models.SeqModel.instantiate_model(model_params) 19 | m.compile_and_train_model() 20 | results = m.evaluate_model() 21 | return results 22 | 23 | GM_MARKS = ['H3K27AC', 'H3K4ME1', 'H3K4ME3', 'H3K27ME3', 'H3K36ME3'] 24 | 25 | 26 | def test_GM18526(): 27 | 28 | for test_cell_line in ['GM18526']: 29 | for subsample_target_string in ['0.5e6']: 30 | for predict_binary_output in [True, False]: 31 | for output_mark in GM_MARKS: 32 | 33 | model_params = modelTemplates.make_model_params( 34 | model_library='keras', 35 | model_class='SeqToPoint', 36 | model_type='cnn', 37 | model_specific_params={ 38 | 'num_filters': 6, 39 | 'filter_length': 51 40 | }, 41 | compile_params={ 42 | 'optimizer': 'adagrad' 43 | }, 44 | dataset_params={ 45 | 'train_dataset_name': 'GM12878_5+1marks-K4me3_all', 46 | 'test_dataset_name': '%s_5+1marks-K4me3_all' % test_cell_line, 47 | 'num_train_examples': 100000, 48 | 'seq_length': 1001, 49 | 'peak_fraction': 0.5, 50 | 'train_X_subsample_target_string': subsample_target_string, 51 | 'num_bins_to_test': None, 52 | 'train_chroms': HG19_ALL_CHROMS, 53 | 'test_chroms': HG19_ALL_CHROMS, 54 | 'only_chr1': True 55 | }, 56 | output_marks=[output_mark], 57 | train_params={ 58 | 'nb_epoch': 30, 59 | 'batch_size': 100 60 | }, 61 | predict_binary_output=predict_binary_output, 62 | zero_out_non_bins=True, 63 | generate_bigWig=True) 64 | 65 | run_model(model_params) 66 | 67 | 68 | if __name__ == '__main__': 69 | 70 | test_GM18526() -------------------------------------------------------------------------------- /dataNormalizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class DataNormalizer(object): 4 | """ 5 | This class learns input scaling parameters and uses those parameters to apply input scaling 6 | to given data. It contains the fit() and transform() methods. 7 | 8 | We support four types of input scaling: 9 | 1) 'ZCA': Zero-mean, features linearly transformed to have unit covariance 10 | 2) 'Z': Zero-mean, each feature independently scaled to unit variance 11 | 3) '01': Zero-mean, each feature independently scaled to lie within [-1, 1] 12 | 4) 'identity': Nothing happens to the input 13 | 14 | If not using 'ZCA', the transform() method does not create a copy. Instead, it modifies the 15 | argument passed to it. If using 'ZCA', this behavior is ambiguous because of the use of 16 | np.reshape, and you should assume that the argument passed to it could randomly be either 17 | transformed or not (and so you should not make further use of the argument). 18 | 19 | All of these transformations are affine transformations.
To represent them, each instance of the 20 | class has two variables, W and b, which roughly correspond to the scale and translation factors 21 | for the different transformations. 22 | 23 | Sample usage: 24 | normalizer = DataNormalizer('01') 25 | normalizer.fit(X_train) 26 | X_train = normalizer.transform(X_train) 27 | X_test = normalizer.transform(X_test) 28 | """ 29 | 30 | def __init__(self, mode): 31 | self.b = None 32 | self.W = None 33 | self.mode = mode 34 | if mode not in ['ZCA', 'Z', '01', 'identity']: 35 | raise ValueError, "mode=%s must be 'ZCA', 'Z', '01', or 'identity'" % mode 36 | 37 | 38 | def fit(self, X_orig): 39 | """ 40 | Learns scaling parameters on the X_orig dataset. Does not modify X_orig. 41 | """ 42 | if len(X_orig.shape) != 2 and len(X_orig.shape) != 3: 43 | raise ValueError, "X must be either a 3-tensor of shape num_examples x seq_length x \ 44 | num_input_marks, or a 2-tensor of shape num_examples x num_input_marks" 45 | if self.mode == 'identity': 46 | return None 47 | 48 | X = np.copy(X_orig) 49 | num_input_marks = X.shape[-1] 50 | 51 | # If X is a 3-tensor, reshape X such that it is a 2-tensor of shape 52 | # (num_examples * seq_length) x num_input_marks. 53 | if len(X.shape) == 3: 54 | X = np.reshape(X, (-1, num_input_marks)) 55 | 56 | self.b = np.mean(X, axis=0) 57 | 58 | X -= self.b 59 | 60 | if self.mode == 'ZCA': 61 | sigma = np.dot(X.T, X) / X.shape[0] 62 | U, S, V = np.linalg.svd(sigma) 63 | self.W = np.dot( 64 | np.dot(U, np.diag(1 / np.sqrt(S + 1e-5))), 65 | U.T) 66 | elif self.mode == 'Z': 67 | self.W = np.empty(num_input_marks) 68 | for idx in range(num_input_marks): 69 | self.W[idx] = np.std(X[:, idx]) 70 | elif self.mode == '01': 71 | self.W = np.empty(num_input_marks) 72 | for idx in range(num_input_marks): 73 | self.W[idx] = np.max(np.abs(X[:, idx])) 74 | 75 | return None 76 | 77 | 78 | def transform(self, X): 79 | if len(X.shape) != 2 and len(X.shape) != 3: 80 | raise ValueError, "X must be either a 3-tensor of shape num_examples x seq_length x \ 81 | num_input_marks, or a 2-tensor of shape num_examples x num_input_marks" 82 | 83 | if self.mode == 'identity': 84 | return X 85 | 86 | assert self.b is not None 87 | assert self.W is not None 88 | 89 | num_input_marks = X.shape[-1] 90 | orig_shape = X.shape 91 | 92 | if self.mode == 'ZCA': 93 | X = np.reshape(X, (-1, num_input_marks)) 94 | if self.W.shape[1] != X.shape[1]: 95 | raise ValueError, "When doing a ZCA transform, X and W must have the same number of columns." 96 | X = np.dot( 97 | X - self.b, 98 | self.W.T) 99 | X = np.reshape(X, orig_shape) 100 | elif self.mode in ['Z', '01']: 101 | if (len(self.b) != num_input_marks) or (len(self.W) != num_input_marks): 102 | print("X.shape: ", X.shape) 103 | print("b.shape: ", self.b.shape) 104 | print("W.shape: ", self.W.shape) 105 | raise ValueError, "The shapes of X, b, and W must all share the same last dimension." 106 | for idx in range(num_input_marks): 107 | X[..., idx] = (X[..., idx] - self.b[idx]) / self.W[idx] 108 | 109 | return X 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coda: a convolutional denoising algorithm for genome-wide ChIP-seq data 2 | 3 | Coda uses convolutional neural networks to learn a mapping from noisy to high-quality ChIP-seq data. 4 | These trained networks can then be used to remove noise and improve the quality of new ChIP-seq data. 
5 | For more details, please refer to our paper 6 | 7 | Koh PW, Pierson E, Kundaje A, Denoising genome-wide histone ChIP-seq with convolutional neural networks. Bioinformatics (2017) 33 (14): i225-i233 URL:https://doi.org/10.1093/bioinformatics/btx243 (ISMB 2017 Proceedings) 8 | 9 | bioRxiv doi: https://doi.org/10.1101/052118 10 | 11 | 12 | ## Dependencies 13 | The code is written in Python 2.7 and requires the following Python packages to run: 14 | - Numpy (1.11.1) 15 | - Scipy (0.18.0) 16 | - Scikit-learn (0.17.1) 17 | - Pandas (0.18.1) 18 | - h5py (2.6.0) 19 | - rpy2 (2.8.1) 20 | - Keras (1.0.7) 21 | 22 | In addition, if you want to process your own data, you will need: 23 | - AQUAS ChIP-seq pipeline 24 | - SAMtools (1.2) 25 | - BEDtools (2.23) 26 | - ucsc_tools (3.0.9) 27 | 28 | ## Training and testing a model with pre-processed data 29 | The fastest way to get started is to download data that has already been pre-processed. 30 | We have uploaded processed ChIP-seq data from lymphoblastoid cell lines GM12878 and GM18526, 31 | taken from [1]. Each cell line has two sets of ChIP-seq data, one derived from 1M reads per mark and 32 | the other from 100M+ reads per mark. The instructions below will train a model to recover high-depth 33 | data from low-depth data on GM12878, and then apply it to low-depth data on GM18526, evaluating the 34 | model output against high-depth data on GM18526: 35 | 36 | 1) Clone the repo and install the dependencies above. 37 | 38 | 2) Edit `diConstants.py` to reflect the paths where you want to store the data, code, results, etc. 39 | 40 | 3) Run `setup.py`. This runs a few test imports to make sure you have the required libraries, and sets 41 | up the directory structure as specified in `diConstants.py`. 42 | 43 | 4) Run `copyData.py`. This copies the required data (including hg19 blacklist and chromosome sizes) to 44 | the appropriate folders. Note that the data is 6GB in size, so please run this script in a location 45 | where there's enough space! 46 | 47 | 5) Finally, run `python runGMExperiments.py` to get the experiments going. Numerical results will be 48 | written to `RESULTS_ROOT`. Output tracks (reconstructed signal and peak calls) will be written to `RESULTS_BIGWIG_ROOT`. 49 | We make use of the R 'PRROC' package, written by Jan Grau and Jens Keilwagen, to evaluate peak calls. 50 | 51 | ## Processing your own data 52 | We use the AQUAS ChIP-seq pipeline (https://github.com/kundajelab/TF_chipseq_pipeline) 53 | to process raw ChIP-seq data. The script `prepData.py` (and the contents of the `scripts` folder) 54 | contains wrapper functions that call the AQUAS pipeline for you. 55 | 56 | Please install the AQUAS pipeline before proceeding. Note that this pipeline is still under 57 | some development and might be changing in non-backwards-compatible ways. Our code has been tested with 58 | commit 7b7dd27d42d46ac52f5687f80904c576d1b6595d of the AQUAS pipeline. 59 | 60 | To create the processed data that we provided above, you may run the following steps: 61 | 62 | 1) Follow steps 1-3 of the above section. 63 | 64 | 2) Download the files corresponding to GM12878 and GM18526: 65 | http://gbsc-share.stanford.edu/chromovar/rawdata/mapped/bam/personal/reconcile/dedup/ 66 | 67 | 3) Run `python prepData.py make_intervals hg19`. You only need to do this once. 68 | 69 | 4) Run `python prepData.py run_GM_pipeline`. 70 | 71 | This code assumes that you've downloaded the files to a shared location 72 | (`REMOTE_ROOT`, specified in diConstants.py). 
It makes copies of the files in a 73 | local directory, `RAW_ROOT`, before proceeding. This setup is useful if `REMOTE_ROOT` 74 | is shared across multiple machines and `RAW_ROOT` is local to the machine that you're 75 | running the code on, because there will be a lot of IO operations that will be faster 76 | if done locally. If you do not need this, modify `merge_BAMs()` in `prepData.py` 77 | to remove the copying. 78 | 79 | To process your own data, simply modify the paths in `diConstants.py` or copy your 80 | data to the right directories. While we start from BAM files in this example, the AQUAS 81 | pipeline can start from a variety of input files (e.g., FASTQ, tagAligns). Edit 82 | `scripts/getSignalTrack.sh` and `scripts/findPeaks.sh` if you want to change the parameters that 83 | are passed into AQUAS. 84 | 85 | ## Contact 86 | If you have any questions, please contact: 87 | - Pang Wei Koh 88 | - Emma Pierson 89 | - Anshul Kundaje 90 | 91 | ## References 92 | [1] Kasowski M, Kyriazopoulou-Panagiotopoulou S, Grubert F, Zaugg JB, Kundaje A, Liu Y, et al. Extensive variation in chromatin states across humans. Science. 2013;342(6159):750–752. -------------------------------------------------------------------------------- /diConstants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ### Variables to set 4 | 5 | # The remote directory that the GM12878 and GM18526 data have been downloaded to 6 | REMOTE_ROOT = "/mnt/data/chromatinVariation1/rawdata/mapped/bam/personal/reconcile/dedup" 7 | 8 | # Where the AQUAS pipeline is installed 9 | PIPELINE_ROOT = "/users/pangwei/TF_chipseq_pipeline/" 10 | 11 | # Where the code is 12 | CODE_ROOT = "/users/pangwei/deepimpute_pub" 13 | 14 | # Where the bulk of the storage will be (intermediate/processed files, etc.)
15 | DISK_ROOT = "/srv/scratch/pangwei/deepimpute_pub/" 16 | 17 | # Where output bigwigs will be written to 18 | RESULTS_BIGWIG_ROOT = "/srv/www/kundaje/deepimpute/model-bw" 19 | 20 | HG19_BLACKLIST_FILE = '/srv/www/kundaje/pangwei/coda_denoising/hg19_blacklist.bed' 21 | MM9_BLACKLIST_FILE = '/srv/www/kundaje/pangwei/coda_denoising/mm9_blacklist.bed' 22 | HG19_CHROM_SIZES_PATH = '/srv/www/kundaje/pangwei/coda_denoising/hg19.chrom.sizes' 23 | MM9_CHROM_SIZES_PATH = '/srv/www/kundaje/pangwei/coda_denoising/mm9.male.chrom.sizes' 24 | 25 | MAPQ_THRESHOLD = 30 26 | 27 | ###### 28 | 29 | 30 | DATA_ROOT = os.path.join(DISK_ROOT, 'data') 31 | MODELS_ROOT = os.path.join(DISK_ROOT, 'models') 32 | RESULTS_ROOT = os.path.join(DISK_ROOT, 'results') 33 | 34 | RAW_ROOT = os.path.join(DATA_ROOT, 'raw') 35 | MERGED_ROOT = os.path.join(DATA_ROOT, 'merged') 36 | SUBSAMPLED_ROOT = os.path.join(DATA_ROOT, 'subsampled') 37 | BIGWIGS_ROOT = os.path.join(DATA_ROOT, 'bigWigs') 38 | INTERVALS_ROOT = os.path.join(DATA_ROOT, 'intervals') 39 | NUMPY_ROOT = os.path.join(DATA_ROOT, 'numpy') 40 | PEAK_BASE_DIR = os.path.join(DATA_ROOT, 'peaks') 41 | PEAK_GAPPED_DIR = os.path.join(PEAK_BASE_DIR, 'peak', 'macs2', 'rep1') 42 | DATASETS_ROOT = os.path.join(DATA_ROOT, 'datasets') 43 | BASE_ROOT = os.path.join(DATASETS_ROOT, 'base') 44 | BASE_BIGWIG_ROOT = os.path.join(BASE_ROOT, 'bigWigs') 45 | SEQ_ROOT = os.path.join(DATASETS_ROOT, 'processed-seq') 46 | 47 | WEIGHTS_ROOT = os.path.join(MODELS_ROOT, 'weights') 48 | 49 | LOSS_ROOT = os.path.join(RESULTS_ROOT, 'loss') 50 | HIST_ROOT = os.path.join(RESULTS_ROOT, 'hist') 51 | EVAL_ROOT = os.path.join(RESULTS_ROOT, 'eval') 52 | 53 | 54 | HG19_CHROM_SIZES = { 55 | 'chr1': 249250621, 56 | 'chr2': 243199373, 57 | 'chr3': 198022430, 58 | 'chr4': 191154276, 59 | 'chr5': 180915260, 60 | 'chr6': 171115067, 61 | 'chr7': 159138663, 62 | 'chr8': 146364022, 63 | 'chr9': 141213431, 64 | 'chr10': 135534747, 65 | 'chr11': 135006516, 66 | 'chr12': 133851895, 67 | 'chr13': 115169878, 68 | 'chr14': 107349540, 69 | 'chr15': 102531392, 70 | 'chr16': 90354753, 71 | 'chr17': 81195210, 72 | 'chr18': 78077248, 73 | 'chr19': 59128983, 74 | 'chr20': 63025520, 75 | 'chr21': 48129895, 76 | 'chr22': 51304566, 77 | } 78 | 79 | MM9_CHROM_SIZES = { 80 | 'chr1': 197195432, 81 | 'chr2': 181748087, 82 | 'chr3': 159599783, 83 | 'chr4': 155630120, 84 | 'chr5': 152537259, 85 | 'chr6': 149517037, 86 | 'chr7': 152524553, 87 | 'chr8': 131738871, 88 | 'chr9': 124076172, 89 | 'chr10': 129993255, 90 | 'chr11': 121843856, 91 | 'chr12': 121257530, 92 | 'chr13': 120284312, 93 | 'chr14': 125194864, 94 | 'chr15': 103494974, 95 | 'chr16': 98319150, 96 | 'chr17': 95272651, 97 | 'chr18': 90772031, 98 | 'chr19': 61342430 99 | } 100 | BIN_SIZE = 25 101 | GENOME_BATCH_SIZE = 50000 102 | NUM_BASES = 4 103 | 104 | GM_CELL_LINES = ['GM12878', 'GM19239', 'GM10847', 'GM18505', 'GM18526', 'GM18951', 'GM2610'] 105 | GM_FACTORS = ['H3K27AC','H3K27ME3', 'H3K36ME3','H3K4ME1', 'H3K4ME3', 'INPUT'] 106 | SUBSAMPLE_TARGETS = ['0.1e6','0.25e6', '0.5e6','1e6', '2.5e6', '5e6','7.5e6', '10e6','30e6','20e6', None] 107 | 108 | GM_DATASET_NAME_TEMPLATE = '%s_5+1marks-K4me3_all' 109 | ROADMAP_DATASET_NAME_TEMPLATE = '%s_6+1marks_all' 110 | ULI_DATASET_NAME_TEMPLATE = '%s_3marks_all' 111 | MOW_DATASET_NAME_TEMPLATE = '%s_2marks_all' 112 | 113 | 114 | HG19_ALL_CHROMS = [ 115 | 'chr1', 116 | 'chr2', 117 | 'chr3', 118 | 'chr4', 119 | 'chr5', 120 | 'chr6', 121 | 'chr7', 122 | 'chr8', 123 | 'chr9', 124 | 'chr10', 125 | 'chr11', 126 | 'chr12', 127 | 'chr13', 128 
| 'chr14', 129 | 'chr15', 130 | 'chr16', 131 | 'chr17', 132 | 'chr18', 133 | 'chr19', 134 | 'chr20', 135 | 'chr21', 136 | 'chr22', 137 | ] 138 | 139 | MM9_ALL_CHROMS = [ 140 | 'chr1', 141 | 'chr2', 142 | 'chr3', 143 | 'chr4', 144 | 'chr5', 145 | 'chr6', 146 | 'chr7', 147 | 'chr8', 148 | 'chr9', 149 | 'chr10', 150 | 'chr11', 151 | 'chr12', 152 | 'chr13', 153 | 'chr14', 154 | 'chr15', 155 | 'chr16', 156 | 'chr17', 157 | 'chr18', 158 | 'chr19' 159 | ] 160 | 161 | 162 | TEST_CHROMS = [ 163 | 'chr1', 164 | 'chr2', 165 | ] 166 | 167 | VALID_CHROMS = [ 168 | 'chr3', 169 | 'chr4' 170 | ] 171 | 172 | HG19_TRAIN_CHROMS = [ 173 | 'chr5', 174 | 'chr6', 175 | 'chr7', 176 | 'chr8', 177 | 'chr9', 178 | 'chr10', 179 | 'chr11', 180 | 'chr12', 181 | 'chr13', 182 | 'chr14', 183 | 'chr15', 184 | 'chr16', 185 | 'chr17', 186 | 'chr18', 187 | 'chr19', 188 | 'chr20', 189 | 'chr21', 190 | 'chr22', 191 | ] 192 | 193 | MM9_TRAIN_CHROMS = [ 194 | 'chr5', 195 | 'chr6', 196 | 'chr7', 197 | 'chr8', 198 | 'chr9', 199 | 'chr10', 200 | 'chr11', 201 | 'chr12', 202 | 'chr13', 203 | 'chr14', 204 | 'chr15', 205 | 'chr16', 206 | 'chr17', 207 | 'chr18', 208 | 'chr19' 209 | ] -------------------------------------------------------------------------------- /modelTemplates.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataset import Dataset, get_species_from_dataset_name 3 | from diConstants import (HG19_ALL_CHROMS, MM9_ALL_CHROMS, 4 | HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, 5 | VALID_CHROMS, TEST_CHROMS) 6 | 7 | def make_dataset_params(num_train_examples, 8 | seq_length, 9 | train_dataset_name='GM12878_5+1marks-K4me3_all', 10 | test_dataset_name='GM19239_5+1marks-K4me3_all', 11 | train_X_subsample_target_string='5e6', 12 | train_Y_subsample_target_string=None, 13 | test_X_subsample_target_string=None, 14 | test_Y_subsample_target_string=None, 15 | random_seed=0, 16 | num_test_examples=None, 17 | normalization='arcsinh', 18 | peak_fraction=0.5, 19 | only_chr1=True, 20 | num_bins_to_test=1000000, 21 | train_chroms=None, 22 | test_chroms=None): 23 | """ 24 | only_chr1 controls whether genome-wide prediction is done on the whole genome, or just 25 | on chr1 for speed. 26 | 27 | num_bins_to_test controls how many bins of each chromosome should be tested. If num_bins_to_test 28 | == 1000000, for example, then only the first 1M bins of each chromosome (or of chr1, if only_chr1 is 29 | True) will be tested. Set num_bins_to_test to None to test the whole chromosome. 
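Sample usage (a minimal sketch; num_train_examples and seq_length are the
only required arguments, and the values below mirror those used in
runGMExperiments.py):

    dataset_params = make_dataset_params(
        num_train_examples=100000,
        seq_length=1001,
        train_X_subsample_target_string='1e6')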
30 | """ 31 | 32 | if num_test_examples is None: 33 | num_test_examples = num_train_examples 34 | 35 | if test_X_subsample_target_string is None: 36 | test_X_subsample_target_string = train_X_subsample_target_string 37 | 38 | if test_Y_subsample_target_string is None: 39 | test_Y_subsample_target_string = train_Y_subsample_target_string 40 | 41 | if train_chroms is None: 42 | if get_species_from_dataset_name(train_dataset_name) == 'mm9': 43 | train_chroms = MM9_ALL_CHROMS 44 | else: 45 | train_chroms = HG19_ALL_CHROMS 46 | 47 | if test_chroms is None: 48 | if get_species_from_dataset_name(test_dataset_name) == 'mm9': 49 | test_chroms = MM9_ALL_CHROMS 50 | else: 51 | test_chroms = HG19_ALL_CHROMS 52 | 53 | return { 54 | 'train_dataset': Dataset( 55 | dataset_name=train_dataset_name, 56 | num_examples=num_train_examples, 57 | X_subsample_target_string=train_X_subsample_target_string, 58 | Y_subsample_target_string=train_Y_subsample_target_string, 59 | random_seed=random_seed, 60 | normalization=normalization, 61 | peak_fraction=peak_fraction, 62 | chroms=train_chroms), 63 | 'test_datasets': [Dataset( 64 | dataset_name=test_dataset_name, 65 | num_examples=num_test_examples, 66 | X_subsample_target_string=test_X_subsample_target_string, 67 | Y_subsample_target_string=test_Y_subsample_target_string, 68 | random_seed=random_seed, 69 | normalization=normalization, 70 | peak_fraction=peak_fraction, 71 | chroms=test_chroms)], 72 | 'seq_length': seq_length, 73 | 'num_bins_to_test': num_bins_to_test, 74 | 'only_chr1': only_chr1, 75 | } 76 | 77 | 78 | def make_model_params(model_library, 79 | model_class, 80 | model_type, 81 | dataset_params, 82 | scale_input='01', 83 | model_specific_params=None, 84 | compile_params=None, 85 | train_params=None, 86 | input_marks=None, 87 | output_marks=None, 88 | random_seed=0, 89 | generate_bigWig=False, 90 | predict_binary_output=False, 91 | zero_out_non_bins=False): 92 | """ 93 | input_marks is a list of histone marks that the model will take in as input. 94 | 95 | output_marks is a list of all the marks that we want the model to learn to output. 96 | If we're training a single multi-task model, this is either a list of length 5 or 6, 97 | depending on whether we're doing classification or regression (if we're doing classification, 98 | we don't predict INPUT). 99 | If we're training a separate model for each mark, then output_marks is just a list of length 1. 100 | 101 | scale_input is one of 'ZCA', 'Z', '01', or 'identity'. 102 | 103 | zero_out_non_bins is only used when predict_binary_output is True. It specifies whether 104 | we should zero out the -log10 p values of bins that are not in the corresponding gappedPeak file. 105 | This is used for baseline evaluations. 
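Sample usage (abridged from the call in runGMExperiments.py; the
dataset_params dict must supply at least num_train_examples and seq_length,
which are forwarded to make_dataset_params above):

    model_params = make_model_params(
        model_library='keras',
        model_class='SeqToPoint',
        model_type='cnn',
        dataset_params={'num_train_examples': 100000, 'seq_length': 1001},
        output_marks=['H3K27AC'],
        predict_binary_output=False,
        generate_bigWig=True)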
106 | """ 107 | 108 | params = { 109 | 'model_library': model_library, 110 | 'model_class': model_class, 111 | 'model_type': model_type, 112 | 'scale_input': scale_input, 113 | 'random_seed': random_seed, 114 | 'generate_bigWig': generate_bigWig, 115 | 'predict_binary_output': predict_binary_output, 116 | 'zero_out_non_bins': zero_out_non_bins 117 | } 118 | 119 | params['dataset_params'] = make_dataset_params(**dataset_params) 120 | 121 | # Defaults for compile_params 122 | if compile_params is None: 123 | compile_params = {} 124 | if model_library == 'keras': 125 | if predict_binary_output: 126 | compile_params_defaults = { 127 | 'loss': 'binary_crossentropy', 128 | 'optimizer': 'adagrad' 129 | } 130 | else: 131 | compile_params_defaults = { 132 | 'loss': 'MSE', 133 | 'optimizer': 'adagrad' 134 | } 135 | for key in compile_params_defaults: 136 | if key not in compile_params: 137 | compile_params[key] = compile_params_defaults[key] 138 | params['compile_params'] = compile_params 139 | 140 | # Defaults for train_params 141 | if train_params is None: 142 | train_params = {} 143 | if model_library == 'keras': 144 | train_params_defaults = { 145 | 'nb_epoch': 50, 146 | 'batch_size': 2000, 147 | 'validation_split': 0.2 148 | } 149 | for key in train_params_defaults: 150 | if key not in train_params: 151 | train_params[key] = train_params_defaults[key] 152 | params['train_params'] = train_params 153 | 154 | # If input_marks is not set, then set it to all the marks in the training dataset 155 | if input_marks is None: 156 | input_marks = params['dataset_params']['train_dataset'].marks_in_dataset 157 | 158 | # Default for output_marks is to output all of the input_marks 159 | # Unless we're doing classification, in which case we don't output INPUT 160 | if output_marks is None: 161 | output_marks = copy.copy(input_marks) 162 | if predict_binary_output and 'INPUT' in output_marks: 163 | output_marks.remove('INPUT') 164 | 165 | # Make sure that input_marks and output_marks are both contained within 166 | # marks_in_train_dataset and marks_in_test_dataset 167 | for mark in input_marks + output_marks: 168 | assert mark in params['dataset_params']['train_dataset'].marks_in_dataset 169 | for test_dataset in params['dataset_params']['test_datasets']: 170 | assert mark in test_dataset.marks_in_dataset 171 | 172 | params['input_marks'] = input_marks 173 | params['output_marks'] = output_marks 174 | 175 | if model_specific_params is None: 176 | model_specific_params = {} 177 | for key in model_specific_params: 178 | if key in params: 179 | raise ValueError, 'model_specific_params cannot overwrite existing model params' 180 | params[key] = model_specific_params[key] 181 | 182 | return params 183 | 184 | -------------------------------------------------------------------------------- /evaluations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats.stats import pearsonr 3 | from sklearn.metrics import precision_recall_curve 4 | import rpy2.robjects as robjects 5 | from rpy2.robjects.packages import STAP 6 | import datetime 7 | from random import sample 8 | 9 | def get_MSE(pred_Y, test_Y): 10 | """ 11 | Returns mean squared error calculated across all dimensions. 12 | """ 13 | assert pred_Y.shape == test_Y.shape 14 | return np.mean((pred_Y - test_Y) ** 2) 15 | 16 | def get_pearsonR(pred_Y, test_Y): 17 | """ 18 | Returns Pearson correlation for a single mark. 19 | 20 | Only takes in vectors. 
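Sample usage (toy vectors, purely for illustration):

    r = get_pearsonR(np.array([0.1, 0.5, 0.9]), np.array([0.2, 0.4, 1.0]))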
21 | """ 22 | 23 | assert pred_Y.shape == test_Y.shape 24 | assert len(pred_Y.shape) == 1 25 | 26 | return pearsonr(pred_Y, test_Y)[0] 27 | 28 | def is_binary(M): 29 | unique_elements = list(set(M.flatten())) 30 | return all([elem in [0, 1] for elem in unique_elements]) 31 | 32 | def downsample_curve(vals): 33 | """ 34 | Downsamples vals by a factor of 10 if len(vals) > 1000 (used to keep precision / recall curves from getting too long) 35 | """ 36 | n = len(vals) 37 | 38 | if n > 1000: 39 | new_vals = [] 40 | for i in range(int(n / 10)): 41 | new_vals.append(vals[i * 10]) 42 | return new_vals 43 | else: 44 | return list(vals) 45 | 46 | def compute_recalls_at_precision(precisions, recalls): 47 | """ 48 | Computes recalls at 10%, 20%, ... 90% precision. 49 | Does not interpolate. 50 | """ 51 | precision_increment = .1 52 | desired_precision = precision_increment 53 | desired_precisions = [] 54 | recalls_at_precision = [] 55 | for i in range(len(precisions)): 56 | while precisions[i] > desired_precision: 57 | desired_precisions.append(desired_precision) 58 | recalls_at_precision.append(recalls[i]) 59 | desired_precision += precision_increment 60 | 61 | return desired_precisions, recalls_at_precision 62 | 63 | 64 | def compare(pred_Y, test_Y, predict_binary_output, peaks=None, 65 | save_curves=True, save_data=False): 66 | """ 67 | Evaluates performance for predictions pred_Y relative to true labels test_Y. 68 | If predict_binary_output, pred_Y should be a set of scores and test_Y should be 0, 1 labels. 69 | Otherwise, both pred_Y and test_Y should be continuous values. 70 | Returns squared error and Pearson correlation between the predicted output and the actual output. 71 | 72 | Both pred_Y and test_Y must be matrices of shape num_examples x num_histone_marks, 73 | or they must both be matrices of shape num_examples x seq_length x num_histone_marks. 74 | If the latter, examples are concatenated together before correlations are computed. 75 | 76 | peaks is a list. Each element of this list corresponds to one mark and is a N x 2 matrix 77 | where each row contains the (start, end) coordinates of a peak in that mark. 78 | If passing in peaks, make sure the coordinate system matches that of pred_Y and test_Y! 79 | For example, if your peaks start at the start of the chromosome, then pred_Y and test_Y have 80 | to start at the start of the chromosome as well. 81 | 82 | If save_curves is True, it saves the full precision-recall curve. save_curves cannot be True if 83 | predict_binary_output is False. Right now it saves recalls @10, 20...90% precision. 84 | 85 | If save_data is True, it saves the first mark of pred_Y and test_Y. 86 | 87 | Returns results, a dictionary containing: 88 | 'AUC' (if predict_binary_output) 89 | 'AUPRC' (if predict_binary_output) 90 | 'precision_curves' (if save_curves) 91 | 'recall_curves' (if save_curves) 92 | 'Y_pos_frac' (if predict_binary_output) 93 | 'MSE' (if not predict_binary_output) 94 | 'true_var' (if not predict_binary_output) 95 | 'pearsonR' (if not predict_binary_output) 96 | 'pred_Y' (if save_data) 97 | 'test_Y' (if save_data) 98 | 99 | AUC, AUPRC, Y_pos_frac, MSE, true_var, and pearsonR are each vectors of length num_histone_marks. 100 | true_var is the variance of the true data; it is useful for interpreting whether a given 101 | MSE is good or bad.
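Sample usage (a regression-mode sketch on random toy data; note that
compare() reads PRROC.R from the working directory, so run it from the
repo root):

    pred_Y = np.random.rand(1000, 1)
    test_Y = np.random.rand(1000, 1)
    results = compare(pred_Y, test_Y, predict_binary_output=False)
    print(results['MSE'], results['pearsonR'])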
102 | """ 103 | 104 | # save_curves has to be False if predict_binary_output is also False 105 | if not predict_binary_output: save_curves = False 106 | 107 | pred_Y_is_binary = is_binary(pred_Y) 108 | test_Y_is_binary = is_binary(test_Y) 109 | assert pred_Y.shape == test_Y.shape, \ 110 | "pred_Y.shape = %s doesn't match test_Y.shape = %s" % (str(pred_Y.shape), str(test_Y.shape)) 111 | assert test_Y_is_binary == predict_binary_output 112 | 113 | #test_Y (the true labels) ought to be binary IFF we're predicting binary output. 114 | #pred_Y should be a set of continuous scores, regardless of whether we're predicting binary output. 115 | assert len(pred_Y.shape) == 2 or len(pred_Y.shape) == 3 116 | 117 | # If peaks is not None, then there should be one element in peaks for each mark in pred_Y. 118 | if peaks: 119 | assert len(peaks) == pred_Y.shape[-1] 120 | 121 | # If the input matrices are 3D, then squash the first two dimensions together 122 | if len(pred_Y.shape) == 3: 123 | pred_Y = np.reshape(pred_Y, [pred_Y.shape[0] * pred_Y.shape[1], pred_Y.shape[2]]) 124 | test_Y = np.reshape(test_Y, [test_Y.shape[0] * test_Y.shape[1], test_Y.shape[2]]) 125 | 126 | num_histone_marks = pred_Y.shape[len(pred_Y.shape) - 1] 127 | 128 | true_var = [] 129 | MSE = [] 130 | pearsonR = [] 131 | 132 | precision_curves = [] 133 | recall_curves = [] 134 | threshold_curves = [] 135 | auc = [] 136 | auprc = [] 137 | Y_pos_frac = [] 138 | 139 | with open('PRROC.R', 'r') as f:#load in the R code. 140 | r_fxn_string = f.read() 141 | r_auc_func = STAP(r_fxn_string, "auc_func") 142 | 143 | for mark_idx in range(num_histone_marks): 144 | ### Sub-select only peak regions 145 | if peaks: 146 | # If peaks exists but peaks[mark_idx] is set to None, we should skip this mark. 147 | # This mark should correspond to INPUT, which has no peaks of its own. 
148 | if peaks[mark_idx] is None: 149 | if predict_binary_output: 150 | precision_curves.append(None) 151 | recall_curves.append(None) 152 | threshold_curves.append(None) 153 | auprc.append(None) 154 | auc.append(None) 155 | else: 156 | true_var.append(None) 157 | MSE.append(None) 158 | pearsonR.append(None) 159 | continue 160 | 161 | # Initialize peak_idxs to all False 162 | num_bins = pred_Y.shape[0] 163 | peak_idxs = np.zeros( 164 | num_bins, 165 | dtype=bool) 166 | 167 | # Set peak_idxs such that it is True in each peak 168 | # (bins outside every peak are then excluded from the evaluation) 169 | for peak_counter, peak in enumerate(peaks[mark_idx]): 170 | # We have to check for this, because pred_Y and test_Y might only represent 171 | # a fraction of any given chromosome 172 | if peak[1] > num_bins: 173 | continue 174 | 175 | peak_idxs[peak[0]:peak[1]] = True 176 | 177 | pred_Y_mark = pred_Y[peak_idxs, mark_idx] 178 | test_Y_mark = test_Y[peak_idxs, mark_idx] 179 | else: 180 | pred_Y_mark = pred_Y[:, mark_idx] 181 | test_Y_mark = test_Y[:, mark_idx] 182 | 183 | ### Run evaluations on (selected) regions 184 | if predict_binary_output: 185 | precisions, recalls, thresholds = precision_recall_curve(test_Y_mark, pred_Y_mark) 186 | precisions, recalls = compute_recalls_at_precision(precisions, recalls) 187 | 188 | precision_curves.append(list(precisions)) 189 | recall_curves.append(list(recalls)) 190 | 191 | if len(test_Y_mark) < 100000: 192 | downsample_idxs = range(len(test_Y_mark)) 193 | else: 194 | downsample_idxs = sample(range(len(test_Y_mark)), 100000) 195 | 196 | r_auprc_results = r_auc_func.pr_curve(scores_class0 = robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]), weights_class0 = robjects.vectors.FloatVector(test_Y_mark[downsample_idxs])) 197 | 198 | auprc.append(float(r_auprc_results.rx('auc.davis.goadrich')[0][0])) 199 | r_auc_results = r_auc_func.roc_curve(scores_class0 = robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]), weights_class0 = robjects.vectors.FloatVector(test_Y_mark[downsample_idxs])) 200 | auc.append(float(r_auc_results.rx('auc')[0][0])) 201 | Y_pos_frac.append(test_Y_mark.mean()) 202 | print("AUC %2.3f; AUPRC %2.3f" % (auc[mark_idx], auprc[mark_idx])) 203 | else: 204 | true_var.append(np.var(test_Y_mark)) 205 | MSE.append(get_MSE(pred_Y_mark, test_Y_mark)) 206 | pearsonR.append(get_pearsonR(pred_Y_mark, test_Y_mark)) 207 | 208 | print("MSE %2.3f (true var %2.3f), pearsonR %2.3f" % 209 | (MSE[mark_idx], true_var[mark_idx], pearsonR[mark_idx])) 210 | 211 | if predict_binary_output: 212 | assert((len(precisions) > 0) and (len(recalls) > 0)) 213 | results = { 214 | 'AUC':auc, 215 | 'AUPRC':auprc, 216 | 'Y_pos_frac':Y_pos_frac 217 | } 218 | results['precision_curves'] = precision_curves 219 | results['recall_curves'] = recall_curves 220 | 221 | else: 222 | results = { 223 | 'MSE': MSE, 224 | 'true_var': true_var, 225 | 'pearsonR': pearsonR 226 | } 227 | 228 | if save_data: 229 | results['pred_Y'] = list(pred_Y[..., 0]) 230 | results['test_Y'] = list(test_Y[..., 0]) 231 | 232 | return results 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /PRROC.R: -------------------------------------------------------------------------------- 1 | # Taken from the PRROC R package, https://cran.r-project.org/web/packages/PRROC/PRROC.pdf. 2 | # Written by Jan Grau and Jens Keilwagen.
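# A minimal usage sketch (the scores and labels below are made up; in this
# repo these functions are called from evaluations.py through rpy2's STAP):
#
#   pr  <- pr_curve(c(0.9, 0.2, 0.7), weights_class0 = c(1, 0, 1))
#   roc <- roc_curve(c(0.9, 0.2, 0.7), weights.class0 = c(1, 0, 1))
#   pr$auc.davis.goadrich; roc$auc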
3 | 4 | pr_curve<-function( scores_class0, scores.class1=scores.class0, weights_class0=NULL, 5 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, sorted = FALSE, curve = FALSE, 6 | minStepSize=min(1,ifelse(is.null(weights.class0),1,sum(weights.class0)/100)), 7 | max.compute=F, min.compute=F, rand.compute=F){ 8 | scores.class0 = scores_class0 9 | weights.class0 = weights_class0 10 | if(!sorted){ 11 | o0<-order(scores.class0); 12 | scores.class0<-scores.class0[o0]; 13 | if(!is.null(weights.class0)){ 14 | weights.class0<-weights.class0[o0]; 15 | } 16 | o1<-order(scores.class1); 17 | scores.class1<-scores.class1[o1]; 18 | if(!is.null(weights.class1)){ 19 | weights.class1<-weights.class1[o1]; 20 | } 21 | } 22 | compute.pr(scores.class0,scores.class1,weights.class0,weights.class1,curve,minStepSize,max.compute,min.compute,rand.compute); 23 | } 24 | 25 | 26 | roc_curve<-function( scores.class0, scores.class1=scores.class0, weights.class0=NULL, 27 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, sorted = FALSE, curve = FALSE, 28 | max.compute=F, min.compute=F, rand.compute=F){ 29 | if(!sorted){ 30 | o0<-order(scores.class0); 31 | scores.class0<-scores.class0[o0]; 32 | if(!is.null(weights.class0)){ 33 | weights.class0<-weights.class0[o0]; 34 | } 35 | o1<-order(scores.class1); 36 | scores.class1<-scores.class1[o1]; 37 | if(!is.null(weights.class1)){ 38 | weights.class1<-weights.class1[o1]; 39 | } 40 | } 41 | compute.roc(scores.class0,scores.class1,weights.class0,weights.class1,curve,max.compute,min.compute,rand.compute); 42 | } 43 | 44 | 45 | check <- function( n, weights ) { 46 | if( !is.null( weights ) ) { 47 | if( n != length(weights) ) { 48 | stop( "The weights must have the same length as the scores." ); 49 | } 50 | if( sum( weights < 0 ) != 0 ) { 51 | stop( "The weights must be non-negative." 
); 52 | } 53 | } 54 | 55 | 56 | compute.pr <- function( sorted.scores.class0, sorted.scores.class1=sorted.scores.class0, weights.class0 = NULL, 57 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, curve = FALSE, 58 | minStepSize=min(1,ifelse(is.null(weights.class0),1,sum(weights.class0)/100)), 59 | max.compute=F, min.compute=F, rand.compute=F ){ 60 | 61 | check( length(sorted.scores.class0), weights.class0 ); 62 | check( length(sorted.scores.class1), weights.class1 ); 63 | 64 | if( !is.null(sorted.scores.class1) & ( length(sorted.scores.class0) != length(sorted.scores.class1) | 65 | suppressWarnings( sum(sorted.scores.class0 != sorted.scores.class1) > 0 ) 66 | ) & is.null(weights.class0) & is.null(weights.class1) ){ 67 | weights.class0<-c(rep(1,length(sorted.scores.class0)),rep(0,length(sorted.scores.class1))); 68 | sorted.scores.class0<-c(sorted.scores.class0,sorted.scores.class1); 69 | o0<-order(sorted.scores.class0); 70 | sorted.scores.class0<-sorted.scores.class0[o0]; 71 | weights.class0<-weights.class0[o0]; 72 | weights.class1<-1-weights.class0; 73 | sorted.scores.class1<-sorted.scores.class0; 74 | } 75 | 76 | davis.and.goadrich <- ( length(sorted.scores.class0) == length(sorted.scores.class1) & 77 | suppressWarnings( sum( sorted.scores.class0 != sorted.scores.class1 ) == 0 ) & 78 | length(weights.class0) == length(weights.class1) & 79 | suppressWarnings( sum( weights.class0 != (1 - weights.class1) ) == 0 ) & 80 | sum(weights.class0 != 0 & weights.class0 != 1)==0); 81 | 82 | #( is.null( weights.class0 ) | sum( weights.class0 != 1 ) == 0 ) & ( is.null( weights.class1 ) | sum( weights.class1 != 1 ) == 0 ); 83 | 84 | i.old <- 0; j.old <- 0; i <- 0; j <- 0; d <- length( sorted.scores.class1 ); m <- length( sorted.scores.class0 ); 85 | help1 <- 0; help2 <- 0; 86 | auc.GD <- ifelse(davis.and.goadrich,0,NA); auc.integral <- 0; fn <- 0; tn <- 0; 87 | 88 | nw0 <- is.null( weights.class0 ); 89 | nw1 <- is.null( weights.class1 ); 90 | 91 | pos <- ifelse( nw0, m, sum( weights.class0 ) ); 92 | neg <- ifelse( nw1, d, sum( weights.class1 ) ); 93 | 94 | while( ( j < d ) & sorted.scores.class0[ i + 1 ] > sorted.scores.class1[ j + 1 ] ){ 95 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 96 | j <- j + 1; 97 | } 98 | p <- c( ( pos - fn ) / pos, ( pos - fn ) / ( pos - fn + neg - tn ), sorted.scores.class0[ i + 1 ] ); 99 | ci <- 1; 100 | if( curve ){ 101 | list.curve <- create.curve( length( sorted.scores.class0 ) + length( sorted.scores.class1 ) ); 102 | list.curve <- append.to.curve( list.curve, p, ci ); 103 | ci <- ci + 1; 104 | }else{ 105 | list.curve <- NULL; 106 | } 107 | 108 | unique <- !( j < d & sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ); 109 | from.motif <- unique; 110 | 111 | while( i< m & j < d ){ 112 | i.old <- i; 113 | j.old <- j; 114 | tn.old <- tn; 115 | fn.old <- fn; 116 | 117 | if( !unique || from.motif ){ 118 | while( i + 1 < m & sorted.scores.class0[ i + 1 ] == sorted.scores.class0[ i + 2 ] ){ 119 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 120 | i <- i + 1; 121 | } 122 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 123 | i <- i + 1; 124 | } 125 | if( !unique || !from.motif ){ 126 | while( j + 1 < d & sorted.scores.class1[ j + 1 ] == sorted.scores.class1[ j + 2 ] ){ 127 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 128 | j <- j + 1; 129 | } 130 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 131 | j <- j + 1; 132 | } 133 | score<-0; 134 | if( i < m & j < d ){ 135 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j +
1 ] ){ 136 | unique <- F; 137 | score <- sorted.scores.class0[ i + 1 ]; 138 | }else{ 139 | unique <- T; 140 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 141 | from.motif <- T; 142 | score <- sorted.scores.class0[ i + 1 ]; 143 | }else{ 144 | from.motif <- F; 145 | score <- sorted.scores.class1[ j + 1 ]; 146 | } 147 | } 148 | } else { 149 | if( i < m ) { 150 | score <- sorted.scores.class0[ i + 1 ]; 151 | } else if( j < d ) { 152 | score <- sorted.scores.class1[ j + 1 ]; 153 | } else { 154 | #i=m, j=d 155 | max = max(sorted.scores.class0[ m ],sorted.scores.class1[ d ]); 156 | score = max; #+ 0.01*( max - min(sorted.scores.class0[ 1 ],sorted.scores.class1[ 1 ]) ); #max + arbitrary offset 157 | } 158 | } 159 | 160 | if( fn == fn.old ) {#i == i.old ){ 161 | old.p<-p; 162 | p <- c( p[ 1 ], ( pos - fn ) / ( pos - fn + neg - tn ), score ); 163 | if(is.nan(p[2])){ 164 | p<-old.p; 165 | } 166 | if( curve ){ 167 | list.curve <- append.to.curve( list.curve, p, ci ); 168 | ci <- ci + 1; 169 | } 170 | }else{ 171 | p.b <- p[ 1 ]; 172 | p.a <- ( pos - fn ) / pos; 173 | 174 | if( davis.and.goadrich ){ 175 | if( i < m | j < d ){# TODO 176 | prop.term <- ( tn - tn.old ) / ( fn - fn.old ); 177 | h1 <- p[ 1 ]; h2 <- p[ 2 ]; 178 | c <- fn.old + 1; 179 | help.j <- tn.old + prop.term; 180 | while( c <= fn ){ 181 | help1 <- (pos - c) / pos; 182 | help2 <- (pos - c) / ( pos - c + neg - help.j ); 183 | help.j <- help.j + prop.term; 184 | auc.GD <- auc.GD + ( h2 + help2 ) / 2 * ( h1 - help1 ); 185 | #print(c(1,auc.GD,i=i.v,m=pos,j=j.v,d=neg,c=c,i.old=i.v.o,j.old=j.v.o)) 186 | h1 <- help1; 187 | h2 <- help2; 188 | c <- c + 1; 189 | } 190 | }else{ 191 | auc.GD <- auc.GD + p[ 2 ] * p[ 1 ]; 192 | } 193 | } 194 | 195 | h <- ( tn - tn.old ) / ( fn - fn.old ); 196 | a <- 1 + h; 197 | b <- ( neg - tn - h * ( pos - fn ) ) / pos; 198 | 199 | if( !isTRUE(all.equal(b, 0)) ){ 200 | auc.integral <- auc.integral + ( p.b - p.a - b / a * ( log( a * p.b + b ) - log( a * p.a + b ) ) ) / a; 201 | }else{ 202 | auc.integral <- auc.integral + ( p.b - p.a ) / a; 203 | } 204 | 205 | prop.term <- min( ( fn - fn.old ) / ( i - i.old ), minStepSize ); 206 | h <- h*prop.term; 207 | help.i <- fn.old + prop.term; 208 | i.old <- i.old + 1; 209 | help.j <- tn.old + h; 210 | k=1; 211 | while( help.i < fn ){ 212 | p <- c( ( pos - help.i ) / pos, ( pos - help.i ) / ( pos - help.i + neg - help.j ), score );#interpolate score? 
213 | if( curve ){ 214 | list.curve <- append.to.curve( list.curve, p, ci ); 215 | ci <- ci + 1; 216 | } 217 | k=k+1; 218 | help.j <- tn.old + k*h; 219 | help.i <- fn.old + k*prop.term; 220 | } 221 | if( p.a != p[ 1 ] ){ 222 | temp <- ( pos - fn ) / ( pos - fn + neg - tn ); 223 | if(is.nan(temp)){ 224 | temp <- p[2]; 225 | } 226 | p <- c( p.a, temp, score ); 227 | if( curve ){ 228 | list.curve <- append.to.curve( list.curve, p, ci ); 229 | ci <- ci + 1; 230 | } 231 | } 232 | } 233 | } 234 | 235 | if( i < m ){ 236 | help1 <- 0; 237 | if( davis.and.goadrich ){ 238 | auc.GD <- auc.GD + p[ 2 ] * ( p[ 1 ] - help1 ); 239 | } 240 | 241 | auc.integral <- auc.integral + p[ 2 ] * ( p[ 1 ] - help1 ); 242 | 243 | p <- c( help1, p[ 2 ], sorted.scores.class0[ i + 1 ] ); 244 | if( curve ){ 245 | list.curve <- append.to.curve( list.curve, p, ci ); 246 | ci <- ci + 1; 247 | } 248 | } 249 | if(curve){ 250 | list.curve<-shrink.curve( list.curve ); 251 | # list.curve<-rbind(c(list.curve[1,1],list.curve[1,2],min(sorted.scores.class0,sorted.scores.class1)), 252 | # list.curve, 253 | # c(list.curve[nrow(list.curve),1],list.curve[nrow(list.curve),2],max(sorted.scores.class0,sorted.scores.class1))) 254 | } 255 | res<-list( type = "PR", auc.integral = auc.integral, auc.davis.goadrich = auc.GD, curve=list.curve ); 256 | 257 | if(max.compute){ 258 | scores0<-NULL; 259 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 260 | scores0<-rep(1,length(sorted.scores.class0)); 261 | }else{ 262 | scores0<-weights.class0; 263 | } 264 | scores1<-NULL; 265 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 266 | scores1<-rep(0,length(sorted.scores.class1)); 267 | }else{ 268 | scores1<-weights.class0; 269 | } 270 | 271 | max.res<-pr.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 272 | weights.class1=weights.class1,curve=curve,minStepSize=minStepSize); 273 | res<-c(res,list(max=max.res)); 274 | } 275 | 276 | if(min.compute){ 277 | scores0<-NULL; 278 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 279 | scores0<-rep(0,length(sorted.scores.class0)); 280 | }else{ 281 | scores0<-(-weights.class0); 282 | } 283 | scores1<-NULL; 284 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 285 | scores1<-rep(1,length(sorted.scores.class1)); 286 | }else{ 287 | scores1<-(-weights.class0); 288 | } 289 | 290 | min.res<-pr.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 291 | weights.class1=weights.class1,curve=curve,minStepSize=minStepSize); 292 | res<-c(res,list(min=min.res)); 293 | } 294 | if(rand.compute){ 295 | rand.auc<-NULL; 296 | if(is.null(weights.class0)){ 297 | rand.auc<-length(sorted.scores.class0)/(length(sorted.scores.class0)+length(sorted.scores.class1)); 298 | }else{ 299 | rand.auc<-sum(weights.class0)/sum(weights.class0+weights.class1); 300 | } 301 | rand.curve<-create.curve( 2 ); 302 | rand.curve<-append.to.curve( rand.curve, c(0,rand.auc,0), 1 ); 303 | rand.curve<-append.to.curve( rand.curve, c(1,rand.auc,0), 2 ); 304 | rand.result<-list( type = "PR", auc.integral = rand.auc, auc.davis.goadrich = rand.auc, 
curve=rand.curve ); 305 | class(rand.result)<-"PRROC"; 306 | 307 | res<-c(res,list(rand=rand.result)); 308 | } 309 | 310 | class(res)<-"PRROC"; 311 | res 312 | } 313 | 314 | compute.roc<-function( sorted.scores.class0, sorted.scores.class1=sorted.scores.class0, weights.class0 = NULL, 315 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, curve = FALSE, 316 | max.compute=F, min.compute=F, rand.compute=F){ 317 | 318 | if( !is.null(sorted.scores.class1) & ( length(sorted.scores.class0) != length(sorted.scores.class1) | 319 | suppressWarnings( sum(sorted.scores.class0 != sorted.scores.class1) > 0 ) 320 | ) & is.null(weights.class0) & is.null(weights.class1) ){ 321 | weights.class0<-c(rep(1,length(sorted.scores.class0)),rep(0,length(sorted.scores.class1))); 322 | sorted.scores.class0<-c(sorted.scores.class0,sorted.scores.class1); 323 | o0<-order(sorted.scores.class0); 324 | sorted.scores.class0<-sorted.scores.class0[o0]; 325 | weights.class0<-weights.class0[o0]; 326 | weights.class1<-1-weights.class0; 327 | sorted.scores.class1<-sorted.scores.class0; 328 | } 329 | 330 | i <- 0; j <- 0; d <- length( sorted.scores.class1 ); m <- length( sorted.scores.class0 ); 331 | fn <- 0; tn <- 0; 332 | 333 | nw0 <- is.null( weights.class0 ); 334 | nw1 <- is.null( weights.class1 ); 335 | 336 | pos <- ifelse( nw0, m, sum( weights.class0 ) ); 337 | neg <- ifelse( nw1, d, sum( weights.class1 ) ); 338 | 339 | erg <- 0; 340 | ci <- 1; 341 | p <- c( 1, 1, min(sorted.scores.class0,sorted.scores.class1) ); 342 | if( curve ){ 343 | list.curve <- create.curve( length( sorted.scores.class0 ) + length( sorted.scores.class1 ) ); 344 | list.curve <- append.to.curve( list.curve, p, ci ); 345 | ci <- ci + 1; 346 | }else{ 347 | list.curve <- NULL; 348 | } 349 | 350 | unique <- F; from.motif <- F; 351 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ){ 352 | unique <- F; 353 | }else{ 354 | unique <- T; 355 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 356 | from.motif <- T; 357 | }else{ 358 | from.motif <- F; 359 | } 360 | } 361 | 362 | while( i < m & j < d ){ 363 | score <- 0; 364 | if( unique ){ 365 | if( from.motif ){ 366 | while( i < m & sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 367 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 368 | score <- sorted.scores.class0[ i + 1 ]; 369 | i <- i + 1; 370 | } 371 | 372 | }else{ 373 | while( j < d & sorted.scores.class0[ i + 1 ] > sorted.scores.class1[ j + 1 ] ){ 374 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 375 | score <- sorted.scores.class1[ j + 1 ]; 376 | j <- j + 1; 377 | } 378 | #score <- sorted.scores.class0[ i + 1 ]; 379 | } 380 | }else{ 381 | while( i + 1 < m & sorted.scores.class0[ i + 1 ] == sorted.scores.class0[ i + 2 ] ){ 382 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 383 | i <- i + 1; 384 | } 385 | while( j + 1 < d & sorted.scores.class1[ j + 1 ] == sorted.scores.class1[ j + 2 ]){ 386 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 387 | j <- j + 1; 388 | } 389 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 390 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 391 | i <- i + 1; 392 | j <- j + 1; 393 | score <- sorted.scores.class0[ i ]; 394 | } 395 | 396 | help1 <- ( neg - tn ) / neg; 397 | help2 <- ( pos - fn ) / pos; 398 | erg <- erg + ( p[ 2 ] + help2 ) / 2 * ( p[ 1 ] - help1 ); 399 | p <- c( help1, help2, score ); 400 | if(curve){ 401 | list.curve <- append.to.curve( list.curve, p, ci ); 402 | ci <- ci + 1; 403 | } 
404 | 405 | if( i < m & j < d ){ 406 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ){ 407 | unique <- F; 408 | }else{ 409 | unique <- T; 410 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 411 | from.motif <- T; 412 | }else{ 413 | from.motif <- F; 414 | } 415 | } 416 | } 417 | } 418 | 419 | if(curve){ 420 | p <- c( 0, 0, max( sorted.scores.class0, sorted.scores.class1 ) ); 421 | list.curve <- append.to.curve( list.curve, p, ci ); 422 | ci <- ci + 1; 423 | list.curve<-shrink.curve( list.curve ); 424 | list.curve<-rbind(c(list.curve[1,1],list.curve[1,2],min(sorted.scores.class0,sorted.scores.class1)), 425 | list.curve, 426 | c(list.curve[nrow(list.curve),1],list.curve[nrow(list.curve),2],max(sorted.scores.class0,sorted.scores.class1))) 427 | } 428 | res<-list( type = "ROC", auc = erg, curve=list.curve ); 429 | 430 | 431 | if(max.compute){ 432 | scores0<-NULL; 433 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 434 | scores0<-rep(1,length(sorted.scores.class0)); 435 | }else{ 436 | scores0<-weights.class0; 437 | } 438 | scores1<-NULL; 439 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 440 | scores1<-rep(0,length(sorted.scores.class1)); 441 | }else{ 442 | scores1<-weights.class0; 443 | } 444 | 445 | max.res<-roc.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 446 | weights.class1=weights.class1,curve=curve); 447 | res<-c(res,list(max=max.res)); 448 | } 449 | 450 | if(min.compute){ 451 | scores0<-NULL; 452 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 453 | scores0<-rep(0,length(sorted.scores.class0)); 454 | }else{ 455 | scores0<-(-weights.class0); 456 | } 457 | scores1<-NULL; 458 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 459 | scores1<-rep(1,length(sorted.scores.class1)); 460 | }else{ 461 | scores1<-(-weights.class0); 462 | } 463 | 464 | min.res<-roc.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 465 | weights.class1=weights.class1,curve=curve); 466 | res<-c(res,list(min=min.res)); 467 | } 468 | if(rand.compute){ 469 | rand.auc<-0.5; 470 | rand.curve<-create.curve( 2 ); 471 | rand.curve<-append.to.curve( rand.curve, c(0,0,0), 1 ); 472 | rand.curve<-append.to.curve( rand.curve, c(1,1,0), 2 ); 473 | rand.result<-list( type = "ROC", auc=rand.auc, curve=rand.curve ); 474 | class(rand.result)<-"PRROC"; 475 | 476 | res<-c(res,list(rand=rand.result)); 477 | } 478 | 479 | class(res)<-"PRROC"; 480 | res 481 | } 482 | 483 | shrink.curve <- function( curve ){ 484 | if( is.null( curve ) ){ 485 | curve; 486 | }else{ 487 | curve[ !is.na( curve[ , 1 ] ), ]; 488 | } 489 | } 490 | 491 | create.curve <- function( n ){ 492 | m <- matrix( NA, nrow=n, ncol=3 ); 493 | m 494 | } 495 | 496 | append.to.curve <- function( curve, p, row ){ 497 | if( row>=nrow( curve ) ){ 498 | curve2 <- matrix( NA, nrow=nrow( curve ) * 2, ncol=3 ); 499 | curve2[ 1:nrow( curve ), ] <- curve; 500 | curve <- curve2; 501 | } 502 | curve[ row, ] <- p; 503 | # print(c(row,p)) 504 | # if(is.nan(p[2])){ 505 | # traceback(0) 506 | # } 507 | curve 508 | } 509 | 510 | 
print.PRROC<-function(x,...){ 511 | if(x$type == "PR"){ 512 | cat("\n Precision-recall curve\n"); 513 | cat("\n Area under curve (Integral):\n"); 514 | cat(" ",x$auc.integral,"\n"); 515 | if( !is.null(x$max) & !is.null(x$min) ){ 516 | cat("\n Relative area under curve (Integral):\n"); 517 | cat(" ",(x$auc.integral - x$min$auc.integral)/(x$max$auc.integral-x$min$auc.integral),"\n"); 518 | } 519 | cat("\n Area under curve (Davis & Goadrich):\n"); 520 | if(!is.null(x$auc.davis.goadrich) & !is.na(x$auc.davis.goadrich)){ 521 | cat(" ",x$auc.davis.goadrich,"\n"); 522 | if( !is.null(x$max) & !is.null(x$min) ){ 523 | cat("\n Relative area under curves (Davis & Goadrich):\n"); 524 | cat(" ",(x$auc.davis.goadrich - x$min$auc.davis.goadrich)/(x$max$auc.davis.goadrich-x$min$auc.davis.goadrich),"\n"); 525 | } 526 | }else{ 527 | cat(" cannot be computed for weighted data\n"); 528 | } 529 | 530 | }else{ 531 | cat("\n ROC curve\n"); 532 | cat("\n Area under curve:\n"); 533 | cat(" ",x$auc,"\n"); 534 | if( !is.null(x$max) & !is.null(x$min) ){ 535 | cat("\n Relative area under curve:\n"); 536 | cat(" ",(x$auc - x$min$auc)/(x$max$auc-x$min$auc),"\n"); 537 | } 538 | } 539 | 540 | if(!is.null(x$curve)){ 541 | cat("\n Curve for scores from ",min(x$curve[,3])," to ",max(x$curve[,3]),"\n"); 542 | cat(" ( can be plotted with plot(x) )\n\n"); 543 | }else{ 544 | cat("\n Curve not computed ( can be done by using curve=TRUE )\n"); 545 | } 546 | 547 | if(!is.null(x$max)){ 548 | cat("\n\n Maximum AUC:\n"); 549 | if(x$type == "PR"){ 550 | cat(" ",x$max$auc.integral," ",x$max$auc.davis.goadrich,"\n"); 551 | }else{ 552 | cat(" ",x$max$auc,"\n"); 553 | } 554 | } 555 | 556 | if(!is.null(x$min)){ 557 | cat("\n\n Minimum AUC:\n"); 558 | if(x$type == "PR"){ 559 | cat(" ",x$min$auc.integral," ",x$min$auc.davis.goadrich,"\n"); 560 | }else{ 561 | cat(" ",x$min$auc,"\n"); 562 | } 563 | } 564 | 565 | if(!is.null(x$rand)){ 566 | cat("\n\n AUC of a random classifier:\n"); 567 | if(x$type == "PR"){ 568 | cat(" ",x$rand$auc.integral," ",x$rand$auc.davis.goadrich,"\n"); 569 | }else{ 570 | cat(" ",x$rand$auc,"\n"); 571 | } 572 | } 573 | } 574 | 575 | 576 | plot.PRROC<-function(x, xlim=c(0,1), ylim=c(0,1), auc.main=TRUE, auc.type=c("integral","davis.goadrich"), 577 | legend=ifelse(is.logical(color) & color==TRUE,4,NA), xlab=NULL, ylab=NULL, main=NULL, color=TRUE, lwd=3, 578 | add=FALSE, scale.color=hsv(h=seq(0,1,length=100)*0.8, s=1, v=1), 579 | max.plot = FALSE, min.plot = FALSE, rand.plot = FALSE, fill.area = (max.plot & min.plot), 580 | maxminrand.col = grey(0.5), fill.color = grey(0.95), 581 | ...){ 582 | auc.type<-match.arg(auc.type); 583 | if(is.null(x$curve)){ 584 | stop("Curve is NULL. 
Use curve=T in pr.curve or roc.curve to obtain one."); 585 | } 586 | if(ncol(x$curve) != 3){ 587 | stop("Curve has wrong dimension"); 588 | } 589 | if(is.null(xlab)){ 590 | my.xlab<-ifelse(x$type=="PR","Recall","FPR"); 591 | }else{ 592 | my.xlab<-xlab; 593 | } 594 | if(is.null(ylab)){ 595 | my.ylab<-ifelse(x$type=="PR","Precision","Sensitivity"); 596 | }else{ 597 | my.ylab<-ylab; 598 | } 599 | 600 | if(is.null(main)){ 601 | my.main<-paste(x$type," curve",sep="",collapse=""); 602 | }else{ 603 | my.main<-main; 604 | } 605 | if(auc.main){ 606 | my.main<-paste(my.main,"\nAUC = ",format(ifelse(x$type=="PR",ifelse(auc.type=="integral",x$auc.integral,x$auc.davis.goadrich),x$auc)),sep="",collapse=""); 607 | } 608 | 609 | 610 | max.curve<-NULL; 611 | if(!is.null(x$max) & !is.null(x$max$curve)){ 612 | max.curve<-x$max$curve; 613 | } 614 | min.curve<-NULL; 615 | if(!is.null(x$min) & !is.null(x$min$curve)){ 616 | min.curve<-x$min$curve; 617 | } 618 | rand.curve<-NULL; 619 | if(!is.null(x$rand) & !is.null(x$rand$curve)){ 620 | rand.curve<-x$rand$curve; 621 | } 622 | 623 | x<-x$curve; 624 | 625 | cols<-1; 626 | segment=F; 627 | plotscale.color=F; 628 | if( is.logical(color) ){ 629 | if(color){ 630 | min<-min(x[,3]); 631 | max<-max(x[,3]); 632 | 633 | cols<-getColor( scale.color, x[,3], min, max ); 634 | plotscale.color=T; 635 | segment=T; 636 | }else{ 637 | cols<-1; 638 | segment<-F; 639 | } 640 | }else { 641 | cols<-color; 642 | segment<-F; 643 | } 644 | 645 | if(!add & !is.na(legend) & (is.numeric(legend) | suppressWarnings(legend==TRUE)) & plotscale.color ){ 646 | if(is.logical(legend)){ 647 | legend<-4; 648 | } 649 | m<-NULL;widths<-rep(1,2);heights<-rep(1,2) 650 | if(legend == 1){ 651 | m<-matrix(c(1,2),nrow=2); 652 | heights<-c(4,lcm(2)); 653 | }else if(legend==2){ 654 | m<-matrix(c(2,1),nrow=1); 655 | widths=c(lcm(2.5),4); 656 | }else if(legend==3){ 657 | m<-matrix(c(2,1),nrow=2); 658 | heights=c(lcm(2),4); 659 | }else{ 660 | m<-matrix(c(1,2),nrow=1); 661 | widths=c(4,lcm(2.5)); 662 | } 663 | layout(mat = m,widths = widths,heights = heights); 664 | 665 | }#else if(!add){ 666 | # layout(1); 667 | #} 668 | 669 | if(!add){ 670 | plot(0,xlim=xlim,ylim=ylim,col=0,xlab=my.xlab,ylab=my.ylab,main=my.main,...); 671 | } 672 | 673 | if( !add ){ 674 | if( fill.area & !is.null(max.curve) & !is.null(min.curve)){ 675 | xs<-c(min.curve[,1],max.curve[nrow(max.curve):1,1],min.curve[1,1]); 676 | ys<-c(min.curve[,2],max.curve[nrow(max.curve):1,2],min.curve[1,2]); 677 | polygon( x = xs, y = ys, density = -1, border = NA, col = fill.color ); 678 | } 679 | 680 | if(max.plot & !is.null(max.curve)){ 681 | lines(max.curve[,1],max.curve[,2],col=maxminrand.col, lty="dashed", ...); 682 | } 683 | 684 | if(min.plot & !is.null(min.curve)){ 685 | lines(min.curve[,1],min.curve[,2],col=maxminrand.col, lty="dotted", ...); 686 | } 687 | 688 | if(rand.plot & !is.null(rand.curve)){ 689 | lines(rand.curve[,1],rand.curve[,2],col=maxminrand.col, lty="dotdash", ...); 690 | } 691 | } 692 | 693 | d=nrow(x); 694 | if( segment ) { 695 | segments( x[1:(d-1),1], x[1:(d-1),2], x[2:d,1], x[2:d,2], col=cols, lwd=lwd, ...); 696 | } else { 697 | lines( x[,1], x[,2], col=cols, lwd=lwd, ...); 698 | } 699 | 700 | if(!add & legend & !is.numeric(color) & color == TRUE){ 701 | scale<-seq( min, max, length = 100 ); 702 | cols<-getColor( scale.color, scale, min, max ); 703 | bak<-par("mar"); 704 | on.exit(par(mar=bak)); 705 | if(legend==2 | legend==4){ 706 | if(legend==4){par(mar=c(5,1,4,2)+0.1);}else{par(mar=c(5,2,4,1)+0.1);} 707 | 
image(c(1),scale,matrix(scale,nrow=1),col=cols,xlab="",ylab="",axes=F) 708 | }else{ 709 | if(legend==1){par(mar=c(2,4,0,2)+0.1);}else{par(mar=c(0,4,2,2)+0.1);} 710 | image(scale,c(1),matrix(scale,ncol=1),col=cols,xlab="",ylab="",axes=F) 711 | } 712 | axis(legend) 713 | layout(1) 714 | } 715 | 716 | 717 | } 718 | 719 | getColor <- function( scale, x, min=min(x), max=max(x) ) { 720 | return( scale[round(1 + (length(scale)-1) * (x - min)/(max-min))] ); 721 | } 722 | 723 | 724 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | from IPython import embed 7 | from collections import Counter 8 | 9 | from diConstants import ( 10 | SEQ_ROOT, BIN_SIZE, NUM_BASES, 11 | HG19_ALL_CHROMS, MM9_ALL_CHROMS, HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, VALID_CHROMS, TEST_CHROMS, 12 | HG19_CHROM_SIZES, MM9_CHROM_SIZES) 13 | from prepData import get_metadata_path, input_not_before_end, get_base_path, get_peaks, get_blacklisted_locs 14 | 15 | 16 | def load_chrom(data_path, chrom): 17 | """ 18 | Loads the .npz file in data_path and returns data for a given chrom. 19 | chrom is like "chr1" 20 | """ 21 | m = np.load(data_path) 22 | return m[chrom] 23 | 24 | 25 | def get_species_from_dataset_name(dataset_name): 26 | if 'ULI' in dataset_name or 'MOUSE' in dataset_name: 27 | return 'mm9' 28 | else: 29 | return 'hg19' 30 | 31 | 32 | class DatasetEncoder(json.JSONEncoder): 33 | """ 34 | Encodes Dataset objects in JSON. 35 | """ 36 | def default(self, obj): 37 | if isinstance(obj, Dataset): 38 | return obj.__dict__ 39 | else: 40 | return super(DatasetEncoder, self).default(obj) 41 | 42 | 43 | class Dataset(object): 44 | """ 45 | Dataset objects have the following fields: 46 | dataset_name 47 | num_examples 48 | X_subsample_target_string (string like "5e6" or None) 49 | Y_subsample_target_string (string like "5e6" or None) 50 | random_seed 51 | normalization 52 | peak_fraction 53 | chroms 54 | chroms_string 55 | 56 | They support the following methods: 57 | get_subsample_target_string(self, X_or_Y) 58 | get_seq_dataset_path(self, seq_length, factor_for_peaks) 59 | load_seq_dataset(self, seq_length, input_marks, output_marks) 60 | load_binary_genome(self, X_or_Y, marks, only_chr1=False) 61 | load_genome(self, X_or_Y, marks, only_chr1=False, peaks=False) 62 | 63 | And the static method process_subsample_target_string. 
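    A usage sketch (hypothetical names and values; real dataset names, marks, and
    chrom lists come from diConstants and the metadata written by prepData):

        ds = Dataset(dataset_name='GM12878_5marks', num_examples=100000,
                     X_subsample_target_string='5e6', Y_subsample_target_string=None,
                     random_seed=0, normalization='arcsinh', peak_fraction=0.5,
                     chroms=HG19_TRAIN_CHROMS)
        X, Y, pvX, pvY, binX, binY = ds.load_seq_dataset(
            seq_length=1001, input_marks=['H3K27AC', 'INPUT'], output_marks=['H3K27AC'])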
64 | """ 65 | 66 | @staticmethod 67 | def process_subsample_target_string(subsample_target_string): 68 | if subsample_target_string is None: 69 | return subsample_target_string 70 | elif subsample_target_string == 'None': 71 | return None 72 | else: 73 | return str(subsample_target_string) 74 | 75 | 76 | def __init__(self, dataset_name, num_examples, 77 | X_subsample_target_string, Y_subsample_target_string, 78 | random_seed, normalization, peak_fraction, chroms): 79 | self.dataset_name = dataset_name 80 | self.num_examples = num_examples 81 | self.X_subsample_target_string = Dataset.process_subsample_target_string(X_subsample_target_string) 82 | self.Y_subsample_target_string = Dataset.process_subsample_target_string(Y_subsample_target_string) 83 | self.random_seed = random_seed 84 | self.normalization = normalization 85 | self.peak_fraction = peak_fraction 86 | self.chroms = chroms 87 | 88 | self.species = get_species_from_dataset_name(self.dataset_name) 89 | if self.species == 'hg19': 90 | all_chroms = HG19_ALL_CHROMS 91 | train_chroms = HG19_TRAIN_CHROMS 92 | else: 93 | all_chroms = MM9_ALL_CHROMS 94 | train_chroms = MM9_TRAIN_CHROMS 95 | 96 | if self.chroms == all_chroms: 97 | self.chroms_string = "" 98 | elif self.chroms == TEST_CHROMS: 99 | self.chroms_string = "_chroms-test" 100 | elif self.chroms == train_chroms: 101 | self.chroms_string = "_chroms-train" 102 | elif self.chroms == VALID_CHROMS: 103 | self.chroms_string = "_chroms-valid" 104 | else: 105 | raise ValueError, "chroms must be ALL_CHROMS, TEST_CHROMS, TRAIN_CHROMS, or VALID_CHROMS" 106 | 107 | if (self.normalization not in ['arcsinh', 'log', None]): 108 | raise ValueError, "normalization must be 'arcsinh', 'log', or None" 109 | 110 | peak_fraction = float(peak_fraction) 111 | if peak_fraction < 0.0 or peak_fraction > 1.0: 112 | raise ValueError, "peak_fraction must be in [0, 1]" 113 | 114 | try: 115 | metadata_path = get_metadata_path(self.dataset_name, self.X_subsample_target_string, self.normalization) 116 | with open(metadata_path, 'r') as f: 117 | metadata = json.loads(f.read()) 118 | self.marks_in_dataset = metadata['factors_to_include'] 119 | self.cell_line = metadata['cell_line'] 120 | except IOError: 121 | raise IOError, "Dataset %s doesn't exist." % metadata_path 122 | 123 | try: 124 | # Sanity check to make sure that metadata is consistent with different subsample target string 125 | metadata_path = get_metadata_path(self.dataset_name, self.Y_subsample_target_string, self.normalization) 126 | with open(metadata_path, 'r') as f: 127 | metadata = json.loads(f.read()) 128 | assert self.marks_in_dataset == metadata['factors_to_include'] 129 | assert self.cell_line == metadata['cell_line'] 130 | except IOError: 131 | raise IOError, "Dataset %s doesn't exist." % metadata_path 132 | 133 | 134 | def get_subsample_target_string(self, X_or_Y): 135 | assert X_or_Y in ["X", "Y"] 136 | if X_or_Y == "X": 137 | return self.X_subsample_target_string 138 | else: 139 | return self.Y_subsample_target_string 140 | 141 | 142 | def get_seq_dataset_path(self, seq_length, factor_for_peaks): 143 | """ 144 | If factor_for_peaks is INPUT, 145 | that means that all marks in the dataset are used for peak enrichment, 146 | but that the Y matrices in the dataset only contain the INPUT mark. 147 | This is used for training a separate model that only outputs INPUT. 148 | 149 | In contrast, if factor_for_peaks is None, all marks in dataset are similarly 150 | used for peak enrichment, but the Y matrices in the dataset contain all marks. 
151 | This is used for training a single, multi-task model that outputs all marks. 152 | 153 | Y_subsample_target_string is normally set to None, unless we are intentionally 154 | trying to use a certain subsampling depth as the "full" data. 155 | """ 156 | 157 | if factor_for_peaks is None: 158 | dataset_path = os.path.join( 159 | SEQ_ROOT, "%s_subsample-%s-%s_rS-%s_numEx-%s_seqLen-%s_peakFrac-%s_norm-%s%s.npz" % \ 160 | (self.dataset_name, self.X_subsample_target_string, self.Y_subsample_target_string, 161 | self.random_seed, self.num_examples, 162 | seq_length, self.peak_fraction, self.normalization, self.chroms_string)) 163 | else: 164 | dataset_path = os.path.join( 165 | SEQ_ROOT, "%s_subsample-%s-%s_rS-%s_numEx-%s_seqLen-%s_peakFrac-%s_peaksFac-%s_norm-%s%s.npz" % \ 166 | (self.dataset_name, self.X_subsample_target_string, self.Y_subsample_target_string, 167 | self.random_seed, self.num_examples, 168 | seq_length, self.peak_fraction, factor_for_peaks, self.normalization, self.chroms_string)) 169 | 170 | return dataset_path 171 | 172 | 173 | def load_seq_dataset(self, seq_length, input_marks, output_marks): 174 | """ 175 | Reads in (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) from a previously created .npz file, 176 | where X is the input (subsampled) and Y is the output (full) and 177 | peakPValueX, peakPValueY contain the -log10 pvalues for the called peaks (bin by bin) 178 | peakBinaryX, peakBinaryY contain the binarized peak signal for the called peaks (bin by bin) 179 | X is of shape num_examples x seq_length x len(input_marks). 180 | Y is of shape num_examples x seq_length x len(output_marks). 181 | 182 | peakPValueX is of similar shape to X, except that it does not contain an INPUT track, so it is of shape 183 | num_examples x seq_length x (len(input_marks) - ('INPUT' in input_marks)). 184 | peakPValueY is of similar shape to Y, except that it does not contain an INPUT track, so it is of shape 185 | num_examples x seq_length x (len(output_marks) - ('INPUT' in output_marks)). 186 | 187 | input_marks is a list of marks that will be used as input to the model. 188 | output_marks is a list of marks that will be used as output from the model. It can be of length 1-6, depending 189 | on whether we're training separate models or one single model, and on whether we're doing classification 190 | or regression. 191 | 192 | If the .npz file doesn't exist, it will create it by calling extract_seq_dataset. 193 | """ 194 | assert(input_not_before_end(output_marks)) 195 | assert(input_not_before_end(input_marks)) 196 | 197 | for input_mark in input_marks: 198 | if input_mark not in self.marks_in_dataset: 199 | raise ValueError, "input_marks must be in marks_in_dataset" 200 | 201 | for output_mark in output_marks: 202 | if output_mark not in self.marks_in_dataset: 203 | raise ValueError, "output_marks must be in marks_in_dataset" 204 | 205 | # Construct an identifying string for this dataset based on what the output marks are. 206 | # If all marks in marks_in_dataset are present, then for brevity we omit output_marks_string. 
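        # For example (hypothetical marks), output_marks = ['H3K27AC', 'INPUT'] in a
        # dataset with more marks than these would give output_marks_string = 'H3K27AC-INPUT'.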
207 |         if len(output_marks) == len(self.marks_in_dataset):
208 |             output_marks_string = None
209 |         else:
210 |             output_marks_string = '-'.join(output_marks)
211 | 
212 |         dataset_path = self.get_seq_dataset_path(seq_length, output_marks_string)
213 | 
214 |         try:
215 |             with np.load(dataset_path) as data:
216 |                 X = data['X'].astype('float32')
217 |                 Y = data['Y'].astype('float32')
218 |                 peakPValueX = data['peakPValueX'].astype('float32')
219 |                 peakPValueY = data['peakPValueY'].astype('float32')
220 |                 peakBinaryX = data['peakBinaryX'].astype('int8')
221 |                 peakBinaryY = data['peakBinaryY'].astype('int8')
222 | 
223 |         except (IOError, KeyError):
224 |             print("Dataset %s doesn't exist or is missing a required matrix. Creating..." % dataset_path)
225 | 
226 |             X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = self.extract_seq_dataset(
227 |                 seq_length,
228 |                 output_marks,
229 |                 dataset_path)
230 | 
231 |         # Only select the input marks that we want
232 |         marks_idx = []
233 |         peak_marks_idx = []
234 |         for mark in input_marks:
235 |             marks_idx.append(self.marks_in_dataset.index(mark))
236 | 
237 |         # We don't want to have INPUT inside peakPValueX
238 |         factors_without_input = copy.copy(self.marks_in_dataset)
239 |         if 'INPUT' in factors_without_input:
240 |             factors_without_input.remove('INPUT')
241 | 
242 |         for mark in input_marks:
243 |             if mark == 'INPUT':
244 |                 continue
245 |             peak_marks_idx.append(factors_without_input.index(mark))
246 | 
247 |         X = X[..., marks_idx]
248 |         peakPValueX = peakPValueX[..., peak_marks_idx]
249 |         peakBinaryX = peakBinaryX[..., peak_marks_idx]
250 | 
251 |         assert(np.all(peakPValueX >= 0) & np.all(peakPValueY >= 0))
252 | 
253 |         if (X.shape[0], X.shape[1]) != (Y.shape[0], Y.shape[1]):
254 |             raise Exception, ("First two dimensions of X and Y shapes "
255 |                 "(num_examples, seq_length) need to agree.")
256 |         if (peakPValueX.shape[0], peakPValueX.shape[1]) != (peakPValueY.shape[0], peakPValueY.shape[1]):
257 |             raise Exception, ("First two dimensions of peakPValueX and peakPValueY shapes "
258 |                 "(num_examples, seq_length) need to agree.")
259 |         if len(peakPValueX) != len(X):
260 |             raise Exception, "peakPValueX and X must have same length."
261 | 
262 |         if ((seq_length != X.shape[1]) or (seq_length != peakPValueX.shape[1])):
263 |             raise Exception, "seq_length between model and data needs to agree"
264 | 
265 |         return X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY
266 | 
267 | 
268 |     def extract_seq_dataset(self, seq_length, output_marks, dataset_path):
269 |         """
270 |         Returns (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY), where X is the input (subsampled), Y is the output (full),
271 |         and peakPValueX and peakPValueY are the -log10 p-value scores for peaks called using MACS on X and Y respectively.
272 |         Both X and Y are of shape num_examples x seq_length x num_factors.
273 |         peakPValueX and peakPValueY are of shape num_examples x seq_length x (num_factors - 1), since no peaks are called for INPUT.
274 | 
275 |         Also writes all matrices to a compressed .npz file.
276 | 
277 |         peak_fraction is the fraction of examples that should be centered on a peak that exists in the full data.
278 |         For example, if peak_fraction = 0.5, then half of the examples will have a peak at the
279 |         center of the sequence, and the other half will not.
280 | 
281 |         factor_for_peaks determines which factor is used to decide whether a given location is
282 |         counted as a 'peak' or not, since it could be a peak in one factor but not another.
283 |         It should be a string, like 'H3K27AC'.
284 |         If it is None (the singleton, not a string), then a location is counted as having a peak
285 |         so long as there's a peak in any factor.
286 | 
287 |         This function sets the numpy random seed.
288 |         """
289 | 
290 |         def sanity_check():
291 |             """
292 |             Sanity checks on the full and subsampled data.
293 | 
294 |             Uses full_path and sub_path as defined in the main function.
295 |             """
296 |             assert os.path.isfile(full_path), "%s does not exist" % full_path
297 |             assert os.path.isfile(sub_path), "%s does not exist" % sub_path
298 |             assert os.path.isfile(full_peak_path), "%s does not exist" % full_peak_path
299 |             assert os.path.isfile(sub_peak_path), "%s does not exist" % sub_peak_path
300 | 
301 |             with np.load(full_path) as full_data:
302 |                 with np.load(sub_path) as sub_data:
303 | 
304 |                     full_chroms = full_data.keys()
305 |                     sub_chroms = sub_data.keys()
306 | 
307 |                     assert set(full_chroms) == set(sub_chroms), \
308 |                         "Full and subsampled data must have exactly the same chromosomes."
309 | 
310 |                     assert full_chroms == sub_chroms, \
311 |                         ("Technically this is ok, but it's weird that the chromosomes in the "
312 |                          "full and subsampled data are not in the same order.")
313 | 
314 |                     for chrom in full_chroms:
315 |                         assert full_data[chrom].shape == sub_data[chrom].shape, \
316 |                             ("Each chromosome should have exactly the same number of bins "
317 |                              "and factors in both datasets.")
318 | 
319 |                     assert len(set([full_data[chrom].shape[1] for chrom in full_chroms])) == 1, \
320 |                         "Number of factors should be constant across all chromosomes."
321 | 
322 |         def get_start_positions(data_path, cell_line, chroms):
323 |             """
324 |             Returns a dictionary where each chromosome is a key, and each value is an array
325 |             of start positions in that chromosome from which we can extract an example.
326 |             Chromosomes are chosen uniformly at random, so longer chromosomes are not sampled
327 |             more often than shorter ones.
328 | 
329 |             Start positions are chosen to be enriched in peaks in the full data, as specified
330 |             by the peak_fraction parameter to the main extract_seq_dataset function.
331 | 
332 |             Uses seq_length and num_examples from the main function parameters.
333 |             """
334 |             assert(seq_length % 2 == 1)
335 |             with np.load(data_path) as data:
336 |                 # Make sure all chroms are present in the data
337 |                 assert all([chrom in data.keys() for chrom in chroms])
338 | 
339 |                 # How long is each chromosome?
340 |                 num_bins = {chrom: data[chrom].shape[0] for chrom in chroms}
341 | 
342 |             # Filter out blacklisted bins. Add a bit of buffer to be safe.
343 |             blacklist_buffer = 5
344 |             blacklisted_locs = get_blacklisted_locs(cell_line)
345 |             non_blacklisted_bins = {}
346 |             for chrom in chroms:
347 |                 good_locs = np.ones(num_bins[chrom] - seq_length + 1, dtype=bool)
348 |                 print('Prior to filtering out bad locations for chromosome %s, %i bins available' % (chrom, len(good_locs)))
349 |                 for bad_range in blacklisted_locs[chrom]:
350 |                     left = max(bad_range[0] - seq_length - blacklist_buffer, 0)
351 |                     right = max(bad_range[1] + blacklist_buffer, 0)
352 |                     good_locs[left:right] = 0
353 |                 print('After filtering out bad locations for chromosome %s, %i bins available' % (chrom, good_locs.sum()))
354 |                 non_blacklisted_bins[chrom] = np.flatnonzero(good_locs).tolist()
355 | 
356 |             # Which chromosome? Sample uniformly at random
357 |             # without caring about chromosome length.
358 |             # Then count how many samples we are getting from each chromosome.
359 | chrom_samples = list( 360 | np.random.choice( 361 | chroms, 362 | num_examples, 363 | replace=True)) 364 | num_samples = {chrom: chrom_samples.count(chrom) for chrom in chroms} 365 | 366 | # Load all peaks into memory 367 | print("Preparing peaks...") 368 | peaks = {} 369 | 370 | # We want to enrich the data with parts of the genome that have peaks in the output marks 371 | # We don't have INPUT peaks, so we remove it 372 | if output_marks == ['INPUT']: 373 | factors_for_peaks = copy.copy(marks_in_dataset) 374 | else: 375 | factors_for_peaks = copy.copy(output_marks) 376 | if 'INPUT' in factors_for_peaks: 377 | factors_for_peaks.remove('INPUT') 378 | 379 | # Get peaks that correspond to the "full" data as specified by Y_subsample_target_string 380 | for factor in factors_for_peaks: 381 | peaks[factor], _ = get_peaks(cell_line, factor, Y_subsample_target_string) 382 | 383 | # Get start positions from each chromosome 384 | start_positions = {} 385 | for chrom in chroms: 386 | 387 | print("Calculating start positions for %s" % chrom) 388 | # We shift each peak such that the peak will be in the middle of the example, 389 | # i.e., the start position is (seq_length - 1)/2 bins before the actual peak 390 | # Unless doing so would move the starting position off the actual chromosome 391 | # For example, say seq_length = 101 (so shift = 50) and there is a peak at position 1050. 392 | # We would include position 1000 = 1050 - shift as a start position. 393 | # A sequence that starts at position 1000 would go to position 1100 (inclusive) 394 | # and the midpoint of that sequence, position 1050, would have a peak. 395 | shift = int((seq_length - 1) / 2) 396 | peak_bins = np.zeros(max(non_blacklisted_bins[chrom]), dtype=bool) 397 | 398 | for factor in factors_for_peaks: 399 | num_peaks = 0 400 | for peak in peaks[factor][chrom]: 401 | 402 | left = max(int(peak[0] - shift), 0) 403 | right = max(int(peak[1] - shift), 0) 404 | num_peaks += right - left 405 | 406 | # A "1" in peak_bins means that starting at that location will result 407 | # in a sequence whose center is a peak. 408 | peak_bins[left:right] = 1 409 | print(" %s peaks for %s" % (num_peaks, factor)) 410 | peak_bins_binarized = np.copy(peak_bins) 411 | peak_bins = set(np.flatnonzero(peak_bins).tolist()) 412 | 413 | # Remove blacklisted bins, and create nonpeak_bins 414 | all_bins = set(non_blacklisted_bins[chrom]) 415 | peak_bins = peak_bins.intersection(all_bins) 416 | nonpeak_bins = all_bins.difference(peak_bins) 417 | peak_bins = list(peak_bins) 418 | nonpeak_bins = list(nonpeak_bins) 419 | print(" Total after blacklisting: %s peaks and %s non-peaks" % (len(peak_bins), len(nonpeak_bins))) 420 | 421 | # Get samples of peak and non-peak locations 422 | peak_samples = np.round(num_samples[chrom] * peak_fraction).astype(int) 423 | nonpeak_samples = num_samples[chrom] - peak_samples 424 | 425 | start_positions[chrom] = np.random.choice( 426 | nonpeak_bins, 427 | nonpeak_samples, 428 | replace=False) 429 | 430 | 431 | # There is a potential problem here if we are trying to draw more peak_samples 432 | # than there are peak locations on the chromosome. 433 | # If so, np.random.choice will error out. 
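                # (A possible guard, not implemented here: pass
                # replace=(peak_samples > len(peak_bins)) to np.random.choice below,
                # trading duplicate examples for robustness in that corner case.)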
434 | start_positions[chrom] = np.concatenate([ 435 | start_positions[chrom], 436 | np.random.choice( 437 | peak_bins, 438 | peak_samples, 439 | replace=False)]) 440 | 441 | # Sort in the hopes that it makes memory access in extract_single_dataset faster 442 | start_positions[chrom].sort() 443 | 444 | 445 | return start_positions 446 | 447 | 448 | def extract_single_dataset(data_path, start_positions, marks): 449 | """ 450 | From the data in data_path, extracts num_examples subsequences of length seq_length 451 | from the start positions in start_positions. 452 | 453 | Returns a matrix of size num_examples x seq_length x num_marks. 454 | 455 | Uses seq_length and num_examples from the main function parameters. 456 | 457 | Used to load both the continuous signal and the peak p-values. 458 | """ 459 | print('Extracting samples from %s...' % data_path) 460 | 461 | num_marks = len(marks) 462 | marks_idx = [] 463 | for mark in marks: 464 | marks_idx.append( 465 | marks_in_dataset.index(mark)) 466 | 467 | return_dataset = np.empty([num_examples, seq_length, num_marks]) 468 | first_empty_row = 0 469 | 470 | with np.load(data_path) as data: 471 | # Get required samples from each chromosome 472 | for chrom in start_positions.keys(): 473 | 474 | data_chrom = data[chrom] 475 | 476 | for start_pos in start_positions[chrom]: 477 | return_dataset[first_empty_row, :, :] = data_chrom[ 478 | start_pos : start_pos+seq_length, 479 | marks_idx] 480 | first_empty_row += 1 481 | 482 | print("At sample number %s..." % first_empty_row) 483 | 484 | assert first_empty_row == num_examples 485 | 486 | # Note: this dataset has not been randomized yet. 487 | # So it has consecutive elements from the same chromosome. 488 | # We will randomize both X and Y datasets together later. 489 | return return_dataset 490 | 491 | 492 | def extract_binary_peak_dataset(full_path, subsample_target_string_to_extract, start_positions, 493 | cell_line, marks): 494 | """ 495 | Method for returning Y with peak information. A 1 denotes a peak. 496 | From the data in data_path, extracts num_examples subsequences of length seq_length 497 | from the start positions in start_positions. 498 | 499 | Returns binary_peak_matrix, a matrix of size num_examples x seq_length x num_marks, 500 | where a 1 denotes a peak 501 | """ 502 | 503 | shift = int((seq_length - 1) / 2) 504 | peak_pval_matrix = np.empty([ 505 | num_examples, 506 | seq_length, 507 | (len(marks) - ('INPUT' in marks)) 508 | ]) 509 | 510 | factor_idx = 0 511 | for factor in marks: 512 | if factor == 'INPUT': 513 | continue 514 | first_empty_row = 0 515 | peak_dict, peak_log_pvalue_dict = get_peaks( 516 | cell_line, 517 | factor, 518 | subsample_target_string=subsample_target_string_to_extract) 519 | for chrom in start_positions: 520 | peak_vector_length = max( 521 | np.max(peak_dict[chrom]), 522 | np.max(start_positions[chrom]) + seq_length) + 1 523 | peak_pval_vector = np.zeros([peak_vector_length,]) 524 | for peak_idx, peak in enumerate(peak_dict[chrom]): 525 | peak_pval_vector[peak[0]:peak[1]] = peak_log_pvalue_dict[chrom][peak_idx] 526 | is_peak = (peak_pval_vector > 0) 527 | print(factor, chrom, is_peak[start_positions[chrom] + shift].mean()) 528 | for start_pos in start_positions[chrom]: 529 | peak_pval_matrix[first_empty_row, :, factor_idx] = peak_pval_vector[start_pos : (start_pos+seq_length)] 530 | first_empty_row += 1 531 | factor_idx += 1 532 | binary_peak_matrix = (peak_pval_matrix > 0) * 1. 
533 | assert np.all(peak_pval_matrix >= 0) 534 | return binary_peak_matrix 535 | 536 | 537 | def extract_single_sequence_dataset(start_positions): 538 | """ 539 | Extracts num_examples subsequences of length seq_length, at positions start_positions, 540 | from the hg19 sequence. 541 | 542 | Returns a matrix of size num_examples x (seq_length * BIN_SIZE) x NUM_BASES. 543 | 544 | Uses seq_length and num_examples from the main function parameters. 545 | """ 546 | print('Extracting sequences...') 547 | 548 | return_dataset = np.empty([num_examples, BIN_SIZE*seq_length, NUM_BASES]) 549 | first_empty_row = 0 550 | 551 | # Get required samples from each chromosome 552 | for chrom in start_positions.keys(): 553 | 554 | data_chrom = load_seq_for_chrom(chrom) 555 | 556 | for start_pos in start_positions[chrom]: 557 | return_dataset[first_empty_row, :, :] = data_chrom[ 558 | start_pos*BIN_SIZE : (start_pos+seq_length)*BIN_SIZE, :] 559 | first_empty_row += 1 560 | 561 | print("At sample number %s..." % first_empty_row) 562 | 563 | assert first_empty_row == num_examples 564 | 565 | # Make sure each base has at most one 1, and that at least one base is not N 566 | assert np.max(np.sum(return_dataset, axis=2)) == 1 567 | 568 | return return_dataset 569 | 570 | 571 | ### Main function code starts here 572 | 573 | # Read dataset metadata 574 | dataset_name = self.dataset_name 575 | X_subsample_target_string = self.X_subsample_target_string 576 | Y_subsample_target_string = self.Y_subsample_target_string 577 | random_seed = self.random_seed 578 | num_examples = self.num_examples 579 | peak_fraction = self.peak_fraction 580 | normalization = self.normalization 581 | marks_in_dataset = self.marks_in_dataset 582 | cell_line = self.cell_line 583 | chroms = self.chroms 584 | 585 | # We always prepare dataset files with the full set of input_marks 586 | input_marks = copy.copy(marks_in_dataset) 587 | 588 | np.random.seed(random_seed) 589 | 590 | full_path = get_base_path(dataset_name, Y_subsample_target_string, normalization) 591 | sub_path = get_base_path(dataset_name, X_subsample_target_string, normalization) 592 | full_peak_path = get_base_path(dataset_name, Y_subsample_target_string, normalization=None, peaks=True) 593 | sub_peak_path = get_base_path(dataset_name, X_subsample_target_string, normalization=None, peaks=True) 594 | 595 | print('input', input_marks) 596 | print('output', output_marks) 597 | print('sub path', sub_path) 598 | print('full path', full_path) 599 | print('sub peak path', sub_peak_path) 600 | print('full peak path', full_peak_path) 601 | 602 | # Sanity check the input 603 | sanity_check() 604 | 605 | # Get a shared list of start positions for both X and Y 606 | # then extract the datasets 607 | start_positions = get_start_positions(full_path, cell_line, chroms) 608 | X = extract_single_dataset(sub_path, start_positions, input_marks) 609 | peakPValueX = extract_single_dataset( 610 | sub_peak_path, 611 | start_positions, 612 | [a for a in input_marks if a != 'INPUT']) 613 | peakBinaryX = extract_binary_peak_dataset( 614 | sub_path, 615 | X_subsample_target_string, 616 | start_positions, 617 | cell_line, 618 | [a for a in input_marks if a != 'INPUT']) 619 | 620 | Y = extract_single_dataset(full_path, start_positions, output_marks) 621 | peakPValueY = extract_single_dataset( 622 | full_peak_path, 623 | start_positions, 624 | [a for a in output_marks if a != 'INPUT']) 625 | peakBinaryY = extract_binary_peak_dataset( 626 | full_path, 627 | Y_subsample_target_string, 628 | start_positions, 629 
| cell_line,
630 |             [a for a in output_marks if a != 'INPUT'])
631 | 
632 | 
633 |         # Sanity check the output
634 |         assert (X.shape[0], X.shape[1]) == (Y.shape[0], Y.shape[1])
635 |         assert (peakPValueX.shape[0], peakPValueX.shape[1]) == (peakPValueY.shape[0], peakPValueY.shape[1])
636 |         assert X.shape[2] == len(input_marks)
637 |         assert peakPValueX.shape[2] + ('INPUT' in input_marks) == len(input_marks)
638 |         assert Y.shape[2] == len(output_marks)
639 |         assert peakPValueY.shape[2] + ('INPUT' in output_marks) == len(output_marks)
640 |         assert(peakPValueY.shape == peakBinaryY.shape)
641 |         assert(peakPValueX.shape == peakBinaryX.shape)
642 | 
643 |         assert X.shape[0] == num_examples
644 |         assert X.shape[1] == seq_length
645 |         assert peakPValueX.shape[0] == num_examples
646 |         assert peakPValueX.shape[1] == seq_length
647 | 
648 |         assert np.all(peakPValueX >= 0)
649 |         assert np.all(peakPValueY >= 0)
650 | 
651 |         # If we only have one output mark, make sure the observed peak fraction is close to the target.
652 |         if len(output_marks) == 1 and output_marks != ['INPUT']:
653 |             midpoint = (seq_length - 1) / 2
654 |             true_peak_fraction = peakBinaryY[:, midpoint, 0].mean()
655 | 
656 |             assert np.abs(true_peak_fraction - peak_fraction) < 1e-2, 'Error: true peak fraction is %2.3f, desired fraction is %2.3f' % (true_peak_fraction, peak_fraction)
657 | 
658 | 
659 |         # Randomize the ordering of the examples so we don't see consecutive elements
660 |         # from the same chromosome
661 |         random_ordering = np.random.permutation(X.shape[0])
662 |         X = X[random_ordering]
663 |         Y = Y[random_ordering]
664 |         peakPValueX = peakPValueX[random_ordering]
665 |         peakPValueY = peakPValueY[random_ordering]
666 |         peakBinaryX = peakBinaryX[random_ordering]
667 |         peakBinaryY = peakBinaryY[random_ordering]
668 | 
669 |         # Downcast to smaller dtypes to save memory and disk space
670 |         X = X.astype('float32')
671 |         Y = Y.astype('float32')
672 |         peakPValueX = peakPValueX.astype('float32')
673 |         peakPValueY = peakPValueY.astype('float32')
674 |         peakBinaryX = peakBinaryX.astype('int8')
675 |         peakBinaryY = peakBinaryY.astype('int8')
676 | 
677 | 
678 | 
679 |         # Write output to disk
680 |         print("Writing output to %s" % dataset_path)
681 | 
682 |         np.savez_compressed(
683 |             dataset_path,
684 |             X=X,
685 |             Y=Y,
686 |             peakPValueX=peakPValueX,
687 |             peakPValueY=peakPValueY,
688 |             peakBinaryX=peakBinaryX,
689 |             peakBinaryY=peakBinaryY)
690 | 
691 |         return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY)
692 | 
693 | 
694 |     def load_genome(self, X_or_Y, marks, only_chr1=False, peaks=False):
695 |         """
696 |         Loads a genome with the appropriate normalization, selecting only chroms present
697 |         in self.chroms.
698 | 
699 |         The only_chr1 flag is provided for convenience, so that code runs faster when we are only
700 |         looking at chr1.
701 | 
702 |         If peaks = True, loads the peak p-values instead.
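        A sketch of typical use (hypothetical mark name):

            signal = dataset.load_genome("X", marks=['H3K27AC'], only_chr1=True)
            # signal['chr1'] is then a num_bins x 1 array of normalized signal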
703 | """ 704 | subsample_target_string = self.get_subsample_target_string(X_or_Y) 705 | 706 | data_path = get_base_path(self.dataset_name, subsample_target_string, self.normalization, peaks=peaks) 707 | 708 | # We only want to return the tracks corresponding to marks 709 | # The genome file has all factors in marks_in_dataset, so we iterate through marks 710 | # to pick out the correct indices 711 | if peaks and ('INPUT' in self.marks_in_dataset): 712 | marks_in_dataset = copy.copy(self.marks_in_dataset) 713 | marks_in_dataset.remove('INPUT') 714 | marks_idx = [] 715 | 716 | for mark in marks: 717 | assert mark in self.marks_in_dataset 718 | marks_idx.append(self.marks_in_dataset.index(mark)) 719 | 720 | with np.load(data_path) as data: 721 | # We have to create a new dictionary for the returned data 722 | # because data is a NpzFile object that does not support item assignment 723 | # We index with marks_idx so that only the correct tracks are returned. 724 | return_data = {} 725 | if only_chr1 == False: 726 | for key in self.chroms: 727 | return_data[key] = data[key][..., marks_idx] 728 | else: 729 | return_data['chr1'] = data['chr1'][..., marks_idx] 730 | 731 | for key in return_data: 732 | assert len(return_data[key].shape) == 2 733 | assert return_data[key].shape[1] == len(marks) 734 | 735 | return return_data 736 | 737 | 738 | def load_binary_genome(self, X_or_Y, marks, only_chr1=False): 739 | """ 740 | Loads a binary genome, selecting only chroms present in self.chroms. 741 | Returns peak_matrices, peak_pval_matrices 742 | where peak_pval_matrices is a dictionary where each key is a chromosome, value is a matrix 743 | which is chrom_length x len(marks) with a zero if there's no peak and a -log10 pvalue otherwise. 744 | peak_matrices is the same but with a 1, not a p-value, for peaks; returned for convenience. 745 | 746 | normalization is passed in only to get the correct metadata. 747 | """ 748 | subsample_target_string = self.get_subsample_target_string(X_or_Y) 749 | 750 | peak_dict = {} 751 | peak_pval_dict = {} 752 | 753 | for mark in marks: 754 | peak_dict[mark], peak_pval_dict[mark] = get_peaks(self.cell_line, mark, subsample_target_string) 755 | 756 | peak_matrices = {} 757 | peak_pval_matrices = {} 758 | 759 | if self.species == 'hg19': 760 | chrom_sizes = HG19_CHROM_SIZES 761 | else: 762 | chrom_sizes = MM9_CHROM_SIZES 763 | chroms_to_use = self.chroms if not only_chr1 else ['chr1'] 764 | 765 | for chromosome in chroms_to_use: 766 | n_bins_in_chrom = int(chrom_sizes[chromosome] / 25.) 
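            # (Note: the hard-coded 25 above presumably mirrors BIN_SIZE from
            # diConstants; using the imported constant would keep the two in sync.)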
767 | peak_matrices[chromosome] = np.zeros([n_bins_in_chrom, len(marks)]) 768 | peak_pval_matrices[chromosome] = np.zeros([n_bins_in_chrom, len(marks)]) 769 | for mark_idx, mark in enumerate(marks): 770 | for i, peak in enumerate(peak_dict[mark][chromosome]): 771 | peak_matrices[chromosome][peak[0]:peak[1], mark_idx] = 1 772 | peak_pval_matrices[chromosome][peak[0]:peak[1], mark_idx] = peak_pval_dict[mark][chromosome][i] 773 | 774 | return peak_matrices, peak_pval_matrices 775 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | 6 | import os 7 | import datetime 8 | import json 9 | import copy 10 | import math 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from keras.models import Sequential, model_from_json 16 | from keras.layers.core import TimeDistributedDense, Activation, Dense, Flatten, Merge 17 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 18 | from keras.layers.recurrent import SimpleRNN, GRU, LSTM 19 | from keras.callbacks import ModelCheckpoint, EarlyStopping 20 | from keras.constraints import maxnorm 21 | from keras.regularizers import l2, activity_l2 22 | 23 | from prepData import generate_bigWig, get_peaks, perform_denormalization, input_not_before_end 24 | from dataset import DatasetEncoder 25 | import evaluations 26 | from dataNormalizer import DataNormalizer 27 | 28 | from diConstants import (BASE_ROOT, MODELS_ROOT, WEIGHTS_ROOT, 29 | RESULTS_ROOT, LOSS_ROOT, HIST_ROOT, EVAL_ROOT, RESULTS_BIGWIG_ROOT, 30 | BIN_SIZE, GENOME_BATCH_SIZE, NUM_BASES) 31 | 32 | 33 | 34 | def pad_sequence_with_zeros(X, padding): 35 | """ 36 | Takes in a matrix X of shape num_bins x num_histone_marks and adds zero padding to the left end 37 | and to the right end. Returns a matrix of shape (num_bins + 2 * padding) x num_histone_marks 38 | """ 39 | 40 | assert len(X.shape) == 2 41 | assert padding >= 0 42 | 43 | num_bins, num_histone_marks = X.shape 44 | P = np.zeros([ 45 | num_bins + 2 * padding, 46 | num_histone_marks]) 47 | 48 | # Say we want to add a padding of 2 on each side of an X that is 101 x 6 49 | # We want P[2:103, :] to be X 50 | P[padding : (num_bins+padding), :] = X 51 | 52 | return P 53 | 54 | 55 | 56 | class SeqModel(object): 57 | """ 58 | Base class from which SeqToPoint derives. (We used this base class to prototype other 59 | approaches; the paper is based only on SeqToPoint.) 60 | 61 | SeqToPoint: 62 | X is 3D with shape num_examples x seq_length x num_input_marks 63 | Y is 2D with shape num_examples x 1 x num_output_marks 64 | 65 | SeqModel implements instance methods: 66 | load_model() 67 | save_model_params() 68 | get_unprocessed_data() 69 | get_processed_data() 70 | train_single_model() 71 | compile_and_train_model() 72 | evaluate_model() 73 | test_model_on_samples() 74 | test_model_on_genome() 75 | 76 | Static method: 77 | instantiate_model() 78 | This is a static method because the __init__ function expects model_params, 79 | and if we're loading a model from a file, we don't know those model_params before we 80 | call this method. 
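    Typical lifecycle (a sketch): SeqModel.instantiate_model(model_params) builds the
    appropriate subclass (whose __init__ saves its params via save_model_params()),
    after which compile_and_train_model() trains it and test_model_on_samples() /
    test_model_on_genome() evaluate it.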
81 | 
82 |     Abstract methods:
83 |         process_X()
84 |         process_Y()
85 |         predict_samples()
86 |         predict_sequence()
87 |     """
88 | 
89 |     def __init__(self, model_params):
90 | 
91 |         # We set the random seed two times in this file:
92 |         # Once here, and once right after loading the training data but before training the model.
93 |         # The reason is that when we try to load the training data, if the dataset doesn't exist yet,
94 |         # we generate and save the training data on the fly. However, the dataset generation code
95 |         # also sets the numpy random seed, so we need to reset it after loading the data.
96 |         # We set a random seed of model_params['random_seed'] here
97 |         # and a random seed of model_params['random_seed'] + 42 right after loading the data.
98 |         np.random.seed(model_params['random_seed'])
99 | 
100 |         self.model_library = model_params['model_library']
101 |         if not (self.model_library in ['keras']):
102 |             raise ValueError, "model_library must be 'keras'"
103 | 
104 |         self.model = None
105 |         self.model_params = model_params
106 |         self.dataset_params = model_params['dataset_params']
107 |         self.train_dataset = model_params['dataset_params']['train_dataset']
108 |         self.test_datasets = model_params['dataset_params']['test_datasets']
109 | 
110 | 
111 |         self.normalizer = DataNormalizer(self.model_params['scale_input'])
112 | 
113 |         # self.model_stamp is the unique identifier for this particular model
114 |         # It looks like "RNN-20150911-175345976535", where the numbers are the date and time
115 |         # that the model was saved, down to the microsecond to avoid race conditions.
116 |         # It is set when the model is saved, and can be read from the filename that
117 |         # the model is saved in.
118 |         self.model_stamp = None
119 | 
120 |         # self.model_path is where the model was saved on disk.
121 |         # It should be in MODELS_ROOT with filename [model_stamp].json
122 |         self.model_path = None
123 | 
124 |         self.final_train_error = None
125 |         self.final_valid_error = None
126 | 
127 |         self.hist = None
128 | 
129 |         self.input_marks = model_params['input_marks']
130 |         self.num_input_marks = len(self.input_marks)
131 | 
132 |         self.output_marks = model_params['output_marks']
133 |         self.num_output_marks = len(self.output_marks)
134 | 
135 |         assert(input_not_before_end(model_params['output_marks']))
136 |         assert(input_not_before_end(model_params['input_marks']))
137 | 
138 |         self.is_output_in_input = True
139 | 
140 | 
141 |         for output_mark in self.output_marks:
142 |             if output_mark not in self.input_marks:
143 |                 self.is_output_in_input = False
144 |                 break
145 | 
146 |         if (self.model_params['predict_binary_output']) and ('INPUT' in self.output_marks):
147 |             raise ValueError, "Cannot predict peaks on INPUT."
148 | 
149 |         self.verbose = True
150 | 
151 |         print("Initialized model with parameters:")
152 |         print(json.dumps(model_params, indent=4, cls=DatasetEncoder))
153 | 
154 | 
155 |     @staticmethod
156 |     def instantiate_model(model_params):
157 |         """
158 |         Given model_params, looks at the model_class in it and
159 |         returns an instance of the appropriate subclass of SeqModel.
160 |         """
161 | 
162 |         if model_params['model_class'] == 'SeqToSeq':
163 |             m = SeqToSeq(model_params)
164 |         elif model_params['model_class'] == 'SeqToPoint':
165 |             m = SeqToPoint(model_params)
166 |         elif model_params['model_class'] == 'PointToPoint':
167 |             m = PointToPoint(model_params)
168 |         else:
169 |             raise ValueError, "model_class must be 'SeqToSeq', 'SeqToPoint', or 'PointToPoint'"
170 |         return m
171 | 
172 |     def load_model(self, model_path):
173 |         """
174 |         Loads a Keras model from disk.
175 | 
176 |         This only works on Keras models.
177 | 178 | The model will need to be compiled before it can be used for training. 179 | 180 | This is currently a weird function: because it's an instance method, 181 | it expects a SeqModel object to already exist. Worse, the SeqModel object 182 | must already be pre-initialized with fake model_params, since the SeqModel constructor 183 | needs model_params to be passed in. 184 | 185 | This should be rewritten when we actually need to use it. 186 | Thankfully, it is not super useful right now - we will only need it 187 | if the model init code changes such that we cannot recover previous models with 188 | current code plus model_params. 189 | """ 190 | 191 | assert self.model_library == 'keras' 192 | 193 | model_JSON_str = open(model_path).read() 194 | model_JSON = json.loads(model_JSON_str) 195 | 196 | self.model = model_from_json(model_JSON_str) 197 | self.model_params = model_JSON['_modelParams'] 198 | self.dataset_params = self.model_params['dataset_params'] 199 | assert self.model_params['model_library'] == 'keras' 200 | 201 | self.num_input_marks = self.model_params['num_input_marks'] 202 | self.num_output_marks = self.model_params['num_output_marks'] 203 | 204 | self.final_train_error = None 205 | self.final_valid_error = None 206 | 207 | self.hist = None 208 | 209 | # self.model_stamp is the unique identifier for this particular model 210 | # It looks like "RNN-20150911-175345976535", where the numbers are the date and time 211 | # that the model was saved, down to the microsecond to avoid race conditions. 212 | self.model_stamp = os.path.splitext( 213 | os.path.basename(model_path))[0] 214 | 215 | self.model_path = model_path 216 | 217 | return None 218 | 219 | 220 | def get_unprocessed_data(self, dataset): 221 | """ 222 | Loads the train or test dataset (as specified in train_or_test) found in self.dataset_params 223 | in its original seq-to-seq form, as returned by extractDataset.load_seq_dataset. 224 | 225 | This function resets the random seed. 226 | """ 227 | X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = dataset.load_seq_dataset( 228 | seq_length=self.dataset_params['seq_length'], 229 | input_marks=self.input_marks, 230 | output_marks=self.output_marks) 231 | 232 | if self.model_params['zero_out_non_bins']: 233 | peakPValueX = peakPValueX * peakBinaryX 234 | peakPValueY = peakPValueY * peakBinaryY 235 | 236 | if ((self.num_input_marks != X.shape[2]) or 237 | (self.num_input_marks != peakPValueX.shape[2] + ('INPUT' in self.input_marks))): 238 | raise Exception, "num_input_marks between model and data needs to agree" 239 | if ((self.num_output_marks != Y.shape[2]) or 240 | (self.num_output_marks != peakPValueY.shape[2] + ('INPUT' in self.output_marks))): 241 | raise Exception, "num_output_marks between model and data needs to agree" 242 | 243 | # See comment in __init__ about random seeds 244 | np.random.seed(self.model_params['random_seed'] + 42) 245 | 246 | return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) 247 | 248 | 249 | def get_processed_data(self, dataset): 250 | """ 251 | Returns the train or test dataset (as specified in train_or_test) found in 252 | self.dataset_params, transformed into a format that the model can directly use. 253 | 254 | Helper functions process_X and process_Y are implemented in subclasses because 255 | different models need differently formatted data, e.g., seq-to-seq vs. seq-to-point. 
256 | 257 | Seq-to-seq takes in: 258 | X: num_examples x seq_length x num_input_marks 259 | Y: num_examples x seq_length x num_output_marks 260 | 261 | Seq-to-point takes in: 262 | X: num_examples x seq_length x num_input_marks 263 | Y: num_examples x 1 x num_output_marks 264 | 265 | Point-to-point takes in: 266 | X: num_examples x (seq_length * num_input_marks) 267 | Y: num_examples x num_output_marks 268 | 269 | """ 270 | 271 | X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = self.get_unprocessed_data(dataset) 272 | 273 | X = self.process_X(X) 274 | Y = self.process_Y(Y) 275 | peakPValueX = self.process_X(peakPValueX) 276 | peakPValueY = self.process_Y(peakPValueY) 277 | peakBinaryX = self.process_X(peakBinaryX) 278 | peakBinaryY = self.process_Y(peakBinaryY) 279 | 280 | return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) 281 | 282 | 283 | def compile_and_train_model(self): 284 | """ 285 | Trains the model specified by self.model and self.model_params 286 | on the training data given by self.dataset_params. 287 | 288 | If self.model is a Keras model, it also writes out model weights and 289 | training history to disk. 290 | """ 291 | 292 | assert self.model 293 | assert self.model_params 294 | 295 | # Train model 296 | (train_X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) = self.get_processed_data( 297 | self.train_dataset) 298 | 299 | self.normalizer.fit(train_X) 300 | train_X = self.normalizer.transform(train_X) 301 | train_inputs_X = train_X 302 | 303 | if self.model_params['predict_binary_output']: 304 | train_Y = peakBinaryY 305 | else: 306 | train_Y = Y 307 | 308 | 309 | 310 | if self.model_library == 'keras': 311 | 312 | # Compiles model: this sets the optimizer and loss function 313 | self.model.compile(**self.model_params['compile_params']) 314 | 315 | # ModelCheckpoint() is a Keras callback that saves the weights of the model while 316 | # it's being trained. 317 | # save_best_only means that the model weights will be saved after every epoch 318 | # in which the validation error improves. 319 | checkpointer = ModelCheckpoint( 320 | filepath=os.path.join(WEIGHTS_ROOT, '%s-weights.hdf5' % self.model_stamp), 321 | verbose=1, 322 | save_best_only=True) 323 | 324 | # EarlyStopping() is a Keras callback that stops training once the validation loss 325 | # of the model has not improved for [patience] epochs in a row. 326 | earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=0) 327 | 328 | self.hist = self.model.fit( 329 | train_inputs_X, 330 | train_Y, 331 | callbacks=[checkpointer, earlystopper], 332 | **self.model_params['train_params']) 333 | 334 | # Store training history for Keras models 335 | # Note that the "final training error" in self.hist.history is only approximate: 336 | # it is averaged over all minibatches in the final epoch. So it's not exactly the 337 | # training error with the final weights. The final validation error is accurate. 338 | 339 | hist_path = os.path.join( 340 | HIST_ROOT, 341 | "%s.hist" % self.model_stamp) 342 | 343 | with open(hist_path, 'w') as f: 344 | f.write(json.dumps(self.hist.history)) 345 | 346 | return None 347 | 348 | 349 | def save_model_params(self): 350 | """ 351 | Writes model to disk, initializing model_stamp and model_path in the process. 352 | This function is called in the __init__ method of derived classes. 353 | 354 | For Keras models, this saves compilation parameters separately 355 | without actually compiling the model to save time. 
Keras model weights are
356 |         saved during training through the ModelCheckpoint() callback, so we can reconstruct
357 |         trained models by separately loading the saved params and the weights. See
358 |         http://keras.io/faq/#how-can-i-save-a-keras-model for more details.
359 |         """
360 | 
361 |         assert self.model
362 |         assert self.model_params
363 | 
364 |         # If it's a Keras model, we save not only model_params but the actual
365 |         # architecture of the model, since the code that constructs models from model_params
366 |         # might change over time.
367 |         if self.model_library == 'keras':
368 |             model_JSON = self.model_params
369 |             model_JSON['_keras_model_params'] = json.loads(self.model.to_json())
370 |             model_JSON_str = json.dumps(model_JSON, cls=DatasetEncoder)
371 | 
372 |             timeStr = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
373 |             self.model_stamp = "%s-%s" % (self.model_params['model_type'], timeStr)
374 |             self.model_path = os.path.join(MODELS_ROOT, "%s.json" % self.model_stamp)
375 | 
376 |             assert os.path.isfile(self.model_path) == False
377 | 
378 |             with open(self.model_path, 'w') as model_file:
379 |                 model_file.write(model_JSON_str)
380 | 
381 |         return None
382 | 
383 | 
384 |     def test_model_on_samples(self, dataset, train_or_test):
385 |         """
386 |         Evaluates the model on samples drawn from dataset.
387 |         Returns a dictionary with keys 'orig' and 'dn', with values obtained
388 |         from evaluations.compare.
389 | 
390 |         The train_or_test param is just for display.
391 |         """
392 |         assert self.model
393 |         assert train_or_test == 'train' or train_or_test == 'test'
394 | 
395 |         (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) = self.get_unprocessed_data(dataset)
396 |         binaryY = peakBinaryY
397 | 
398 |         Y = self.process_Y(Y)
399 |         peakPValueY = self.process_Y(peakPValueY)
400 |         binaryY = self.process_Y(binaryY)
401 | 
402 |         if not self.model_params['predict_binary_output']:
403 |             print('Bias-only MSE is ', np.mean((Y - np.mean(Y)) ** 2))
404 | 
405 |         # First, compare the true data with the subsampled data
406 |         # To get the "original" error, we just make the prediction that Y = X.
407 |         # Before doing this, we have to call process_Y on X to get it into the right form.
408 |         # This is not a typo! We have to process X in the way that we'd normally process Y.
409 |         # This is needed for SeqToPoint and PointToPoint models, since in those models
410 |         # the X and Y returned from self.get_processed_data have different shapes.
411 |         # Since input_marks might not equal output_marks, we also have to subset the right
412 |         # parts of X to compare.
413 |         # If we're doing de novo imputation then output marks will not be in input marks; if
414 |         # so, we just skip this step.
415 |         orig_results = None
416 | 
417 |         if self.is_output_in_input:
418 |             output_marks_idx = [self.input_marks.index(output_mark) for output_mark in self.output_marks]
419 |             if self.model_params['predict_binary_output']:
420 |                 print("%s samples - Original peaks vs. 
true peaks:" % train_or_test) 421 | orig_results = evaluations.compare( 422 | self.process_Y(peakPValueX[..., output_marks_idx]), 423 | binaryY, 424 | predict_binary_output=True) 425 | else: 426 | print("%s samples - Original:" % train_or_test) 427 | orig_results = evaluations.compare( 428 | self.process_Y(X[..., output_marks_idx]), 429 | Y, 430 | predict_binary_output=False) 431 | 432 | # Then compare the true data with the output of the model 433 | # Process the data properly 434 | X = self.process_X(X) 435 | X = self.normalizer.transform(X) 436 | 437 | # We have to batch the prediction so that the GPU doesn't run out of memory 438 | if 'batch_size' in self.model_params['train_params']: 439 | batch_size = self.model_params['train_params']['batch_size'] 440 | else: 441 | batch_size = 10000 442 | num_examples = X.shape[0] 443 | num_batches = int(math.ceil(1.0 * num_examples / batch_size)) 444 | 445 | # If predict_binary_output is true, then INPUT cannot be in output_marks, so 446 | # Y will have the same shape as binaryY. 447 | # This is not necessarily true if predict_binary_output is false. 448 | # There's no need to branch separately here to initialize Y_pred = np.empty(binaryY.shape) 449 | # if predict_binary_output is true. 450 | Y_pred = np.empty(Y.shape) 451 | 452 | for batch in range(num_batches): 453 | start_idx = batch * batch_size 454 | end_idx = min((batch + 1) * batch_size, num_examples) 455 | Y_pred[start_idx : end_idx] = self.predict_samples(X[start_idx : end_idx]) 456 | 457 | if self.model_params['predict_binary_output']: 458 | print("%s samples - Predicted peaks vs. true peaks:" % train_or_test) 459 | denoised_results = evaluations.compare(Y_pred, binaryY, predict_binary_output=True) 460 | else: 461 | print("%s samples - Denoised:" % train_or_test) 462 | denoised_results = evaluations.compare(Y_pred, Y, predict_binary_output=False) 463 | 464 | samples_results = { 465 | 'orig': orig_results, 466 | 'dn': denoised_results 467 | } 468 | return samples_results 469 | 470 | 471 | def test_model_on_genome(self, dataset): 472 | """ 473 | Evaluates the model on the entire genome in dataset. 474 | Returns a dictionary with keys orig_results and denoised_results, with values obtained 475 | from evaluations.compare. 476 | 477 | This function generates genome-wide predictions for each chromosome in the 478 | test dataset. Blacklisted regions have previously been zero-ed out in prepData. 479 | """ 480 | 481 | assert self.model 482 | 483 | # only_chr1 controls whether genome-wide prediction is done on the whole genome, or just 484 | # on chr1 for speed. 485 | only_chr1 = self.dataset_params['only_chr1'] 486 | # Load data 487 | test_X_all = dataset.load_genome( 488 | "X", 489 | marks=self.input_marks, 490 | only_chr1=only_chr1, 491 | peaks=False) 492 | 493 | if self.model_params['predict_binary_output']: 494 | #if binary, want to use binary peak matrix as Y. 495 | #and noisy peak p-values as baseline. 
496 | assert('INPUT' not in self.output_marks) 497 | test_Y_all, _ = dataset.load_binary_genome( 498 | "Y", 499 | marks=self.output_marks, 500 | only_chr1=only_chr1) 501 | 502 | noisy_peak_pvals_all = dataset.load_genome( 503 | "X", 504 | marks=self.output_marks, 505 | only_chr1=only_chr1, 506 | peaks=True) 507 | 508 | if self.model_params['zero_out_non_bins']: 509 | noisy_peaks_all, _ = dataset.load_binary_genome( 510 | "X", 511 | marks=self.output_marks, 512 | only_chr1=only_chr1) 513 | assert(set(noisy_peak_pvals_all.keys()) == set(noisy_peaks_all.keys())) 514 | for chrom in noisy_peak_pvals_all: 515 | noisy_peak_pvals_all[chrom] = noisy_peak_pvals_all[chrom] * noisy_peaks_all[chrom] 516 | 517 | else: 518 | #otherwise, use continuous non-subsampled signal as Y. 519 | test_Y_all = dataset.load_genome( 520 | "Y", 521 | marks=self.output_marks, 522 | only_chr1=only_chr1, 523 | peaks=False) 524 | 525 | # Load peaks from test cell line 526 | peak_locs_all = {} 527 | for factor in dataset.marks_in_dataset: 528 | if factor == 'INPUT': continue 529 | peak_locs, _ = get_peaks( 530 | dataset.cell_line, 531 | factor, 532 | subsample_target_string=dataset.Y_subsample_target_string) 533 | peak_locs_all[factor] = peak_locs 534 | 535 | 536 | if (test_Y_all.keys() != test_X_all.keys()): 537 | raise Exception, "Subsampled and full data must have the same chroms" 538 | 539 | 540 | chroms = sorted(test_X_all.keys()) 541 | 542 | 543 | ### Compute results separately for each chromosome 544 | 545 | orig_results_all = {} 546 | denoised_results_all = {} 547 | orig_results_peaks = {} 548 | denoised_results_peaks = {} 549 | 550 | preds = {} 551 | 552 | # Warning: the peak comparison code relies on the sequence starting at the start 553 | # of the chromosome. If this is not true, we'd have to offset the peak coordinates before 554 | # passing them into compare(). 555 | if only_chr1: 556 | chroms = ['chr1'] 557 | 558 | for chrom in chroms: 559 | test_X = test_X_all[chrom] 560 | test_Y = test_Y_all[chrom] 561 | if self.model_params['predict_binary_output']: 562 | noisy_peak_pvals = noisy_peak_pvals_all[chrom] 563 | 564 | 565 | if self.dataset_params['num_bins_to_test']: 566 | num_bins_to_test = self.dataset_params['num_bins_to_test'] 567 | assert num_bins_to_test > 0 568 | test_X = test_X[:num_bins_to_test] 569 | test_Y = test_Y[:num_bins_to_test] 570 | if self.model_params['predict_binary_output']: 571 | noisy_peak_pvals = noisy_peak_pvals[:num_bins_to_test] 572 | 573 | assert test_X.shape[0] == test_Y.shape[0], \ 574 | "Subsampled and full data must have the same length" 575 | 576 | if self.model_params['predict_binary_output']: 577 | assert(list(noisy_peak_pvals.shape) == list(test_Y.shape)) 578 | 579 | assert test_X.shape[1] == self.num_input_marks 580 | assert test_Y.shape[1] == self.num_output_marks 581 | 582 | chrom_length = test_X.shape[0] 583 | 584 | ### Get a list of peaks for this chromosome 585 | peaks = [] 586 | if not self.model_params['predict_binary_output']: 587 | for factor in self.output_marks: 588 | # For INPUT, we calculate MSE across peaks of all other marks in the test dataset, 589 | # since we want to get INPUT right whenever there's a peak in some other mark. 590 | # Note that we're purely concatenating peaks from different marks here, 591 | # so there'll be some overlapping peaks. 592 | # This is fine right now but might break later depending on what evaluation code we 593 | # write, so watch out. 
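
# Because the INPUT baseline below just concatenates the peak lists of every
# other mark, the resulting intervals can overlap. If downstream evaluation
# code ever needs disjoint intervals, a standard merge over [start, end)
# bin-index pairs would look something like this (a sketch, not part of this
# repo; assumes the interval format produced by get_peaks in prepData.py):

import numpy as np

def merge_intervals(intervals):
    """Merges overlapping/touching [start, end) intervals. intervals: (n, 2) array."""
    if len(intervals) == 0:
        return np.zeros((0, 2), dtype=int)
    intervals = intervals[np.argsort(intervals[:, 0])]
    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start <= merged[-1][1]:  # overlaps or touches the previous interval
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return np.array(merged)

# merge_intervals(np.array([[5, 10], [8, 20], [25, 50]])) -> [[5, 20], [25, 50]]
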
594 | if factor == 'INPUT':
595 | peak_factor = []
596 | for other_factor in dataset.marks_in_dataset:
597 | if other_factor == 'INPUT': continue
598 | peak_factor.extend(peak_locs_all[other_factor][chrom])
599 | peak_factor = np.array(peak_factor)
600 | else:
601 | peak_factor = peak_locs_all[factor][chrom]
602 | peaks.append(peak_factor)
603 | 
604 | ### Do comparisons between original (subsampled) and full data
605 | # The original comparison is only done if the output mark is actually in the input data
606 | if self.is_output_in_input:
607 | if not self.model_params['predict_binary_output']:
608 | 
609 | output_marks_idx = [self.input_marks.index(output_mark) for output_mark in self.output_marks]
610 | 
611 | print("Test %s, %.2E bins - Original, all signal:" % (chrom, chrom_length))
612 | orig_results_all[chrom] = evaluations.compare(
613 | test_X[:, output_marks_idx],
614 | test_Y,
615 | predict_binary_output=False)
616 | 
617 | print("Test %s, %.2E bins - Original, only peaks:" % (chrom, chrom_length))
618 | orig_results_peaks[chrom] = evaluations.compare(
619 | test_X[:, output_marks_idx],
620 | test_Y,
621 | predict_binary_output=False,
622 | peaks=peaks)
623 | 
624 | elif self.model_params['predict_binary_output']:
625 | print("Test %s, %.2E bins - Original:" % (chrom, chrom_length))
626 | orig_results_all[chrom] = evaluations.compare(
627 | noisy_peak_pvals,
628 | test_Y,
629 | predict_binary_output=True)
630 | 
631 | 
632 | ### Do comparisons between model output and full data
633 | # We have to batch this up so that the GPU doesn't run out of memory
634 | # Assume a fixed batch size of 5M bins
635 | num_batches = int(math.ceil(1.0 * chrom_length / GENOME_BATCH_SIZE))
636 | 
637 | test_Y_pred = np.empty(test_Y.shape)
638 | test_X = self.normalizer.transform(test_X)
639 | 
640 | for batch in range(num_batches):
641 | start_idx = batch * GENOME_BATCH_SIZE
642 | end_idx = min((batch + 1) * GENOME_BATCH_SIZE, chrom_length)
643 | test_Y_pred[start_idx : end_idx] = self.predict_sequence(
644 | test_X[start_idx : end_idx])
645 | 
646 | print("Test %s, %.2E bins - Denoised, all signal:" % (chrom, chrom_length))
647 | denoised_results_all[chrom] = evaluations.compare(
648 | test_Y_pred,
649 | test_Y,
650 | predict_binary_output=self.model_params['predict_binary_output'])
651 | 
652 | if not self.model_params['predict_binary_output']:
653 | print("Test %s, %.2E bins - Denoised, only peaks:" % (chrom, chrom_length))
654 | denoised_results_peaks[chrom] = evaluations.compare(
655 | test_Y_pred,
656 | test_Y,
657 | predict_binary_output=False,
658 | peaks=peaks)
659 | 
660 | 
661 | 
662 | # If we're generating a bigWig file from the output, we need to save the results
663 | # If we're doing regression, we first denormalize the outputs so that they can be viewed
664 | # correctly in the genome browser
665 | if self.model_params['generate_bigWig']:
666 | 
667 | if self.model_params['predict_binary_output']:
668 | preds[chrom] = test_Y_pred
669 | else:
670 | preds[chrom] = perform_denormalization(
671 | test_Y_pred,
672 | dataset.normalization)
673 | 
674 | # Write bigWig file to disk
675 | if self.model_params['generate_bigWig']:
676 | if self.model_params['predict_binary_output']:
677 | suffix = 'peaks'
678 | else:
679 | suffix = 'signal'
680 | 
681 | generate_bigWig(
682 | preds,
683 | self.output_marks,
684 | '%s_%s_subsample-%s_%s' % (
685 | self.model_stamp,
686 | dataset.cell_line,
687 | dataset.X_subsample_target_string,
688 | suffix),
689 | RESULTS_BIGWIG_ROOT)
690 | 
691 | # Construct dict of results 
692 | if self.model_params['predict_binary_output']:
693 | test_genome_results = {
694 | 'orig_all': orig_results_all,
695 | 'dn_all': denoised_results_all
696 | }
697 | else:
698 | test_genome_results = {
699 | 'orig_all': orig_results_all,
700 | 'dn_all': denoised_results_all,
701 | 'orig_peaks': orig_results_peaks,
702 | 'dn_peaks': denoised_results_peaks,
703 | }
704 | 
705 | 
706 | 
707 | print('final results', test_genome_results)
708 | return test_genome_results
709 | 
710 | 
711 | def evaluate_model(self):
712 | """
713 | Evaluates the model on the train and test datasets specified in self.dataset_params.
714 | Writes the results to disk in EVAL_ROOT.
715 | """
716 | # We need to write our own JSON encoder for numpy.float32s
717 | # because the built-in JSON encoder only knows how to encode normal floats
718 | class NumpyEncoder(json.JSONEncoder):
719 | def default(self, obj):
720 | if isinstance(obj, np.floating):
721 | return float(obj)
722 | else:
723 | return super(NumpyEncoder, self).default(obj)
724 | 
725 | # Evaluate model on training data
726 | train_samples_results = self.test_model_on_samples(self.train_dataset, 'train')
727 | train_results = {
728 | 'samples': train_samples_results
729 | }
730 | 
731 | train_eval_path = os.path.join(
732 | EVAL_ROOT,
733 | "%s-train.eval" % self.model_stamp)
734 | 
735 | with open(train_eval_path, 'w') as f:
736 | f.write(json.dumps(train_results, cls=NumpyEncoder))
737 | 
738 | # Evaluate model on testing data
739 | all_test_results = []
740 | for dataset_idx, test_dataset in enumerate(self.test_datasets):
741 | test_samples_results = self.test_model_on_samples(test_dataset, 'test')
742 | 
743 | try:
744 | test_genome_results = self.test_model_on_genome(test_dataset)
745 | except NotImplementedError:
746 | print("Genome-wide prediction hasn't been implemented for this type of model. Skipping...")
747 | test_genome_results = None
748 | 
749 | test_results = {
750 | 'samples': test_samples_results,
751 | 'genome': test_genome_results
752 | }
753 | 
754 | test_eval_path = os.path.join(
755 | EVAL_ROOT,
756 | "%s-test-%s.eval" % (self.model_stamp, dataset_idx))
757 | with open(test_eval_path, 'w') as f:
758 | f.write(json.dumps(test_results, cls=NumpyEncoder))
759 | 
760 | all_test_results.append(test_results)
761 | 
762 | results = {
763 | 'train_samples': train_samples_results,
764 | 'test_results': all_test_results
765 | }
766 | 
767 | return results
768 | 
769 | 
770 | def process_X(self, X):
771 | """
772 | Takes in a matrix X of shape num_examples x seq_length x num_histone_marks,
773 | returned from extractDataset.load_seq_dataset, and processes it as necessary
774 | for the type of model. X should be the input data that is fed to the model.
775 | 
776 | This is implemented in subclasses because different models need differently
777 | formatted data, e.g., seq-to-seq vs. seq-to-point.
778 | """
779 | 
780 | raise NotImplementedError
781 | 
782 | 
783 | def process_Y(self, Y):
784 | """
785 | Takes in a matrix Y of shape num_examples x seq_length x num_histone_marks,
786 | returned from extractDataset.load_seq_dataset, and processes it as necessary
787 | for the type of model. Y should represent the desired output of the model.
788 | 
789 | This is implemented in subclasses because different models need differently
790 | formatted data, e.g., seq-to-seq vs. seq-to-point. 
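
For example, with seq_length = 101, a seq-to-point subclass reduces Y from
shape (num_examples, 101, num_histone_marks) to (num_examples, 1, num_histone_marks)
by keeping only the middle bin, Y[:, 50:51, :] (see SeqToPoint.process_Y below).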
791 | """ 792 | 793 | raise NotImplementedError 794 | 795 | 796 | def SeqToX_predict_samples(self, signalX): 797 | """ 798 | Common code used in the predict_samples() method defined in SeqToSeq and SeqToPoint subclasses. 799 | """ 800 | 801 | num_examples = signalX.shape[0] 802 | 803 | assert len(signalX.shape) == 3 804 | assert signalX.shape[0] == num_examples 805 | assert signalX.shape[1] == self.dataset_params['seq_length'] 806 | assert signalX.shape[2] == self.num_input_marks 807 | 808 | 809 | Y = self.model.predict(signalX) 810 | 811 | assert Y.shape[0] == num_examples 812 | assert Y.shape[2] == self.num_output_marks 813 | 814 | return Y 815 | 816 | 817 | def predict_samples(self, signalX): 818 | """ 819 | Takes in input signalX of whatever dimensions are needed for the model, which 820 | is subclass-dependent. It passes it through the model and returns the output matrix. 821 | """ 822 | 823 | raise NotImplementedError 824 | 825 | 826 | def predict_sequence(self, signalX): 827 | """ 828 | Takes in input matrix signalX of dimensions num_bins x num_input_marks 829 | and passes it through the model, 830 | returning an output matrix of num_bins x num_output_marks. 831 | """ 832 | 833 | raise NotImplementedError 834 | 835 | 836 | 837 | 838 | class SeqToPoint(SeqModel): 839 | 840 | def __init__(self, model_params): 841 | """ 842 | Initializes the correct model based on model_params. 843 | """ 844 | 845 | super(SeqToPoint, self).__init__(model_params) 846 | 847 | assert self.dataset_params['seq_length'] % 2 == 1, "seq_length must be odd for SeqToPoint models." 848 | 849 | if model_params['model_type'] == 'cnn': 850 | 851 | num_filters = model_params['num_filters'] 852 | filter_length = model_params['filter_length'] 853 | 854 | model = Sequential() 855 | 856 | # border_mode='same' makes the length of the output 857 | # the same size as the length of the input 858 | # by adding just the right amount of zero padding to each side. 859 | model.add( 860 | Convolution1D( 861 | num_filters, 862 | filter_length, 863 | input_dim=self.num_input_marks, 864 | init='uniform', 865 | border_mode='same')) 866 | 867 | model.add(Activation('relu')) 868 | 869 | # See below for documentation on border_mode='valid' 870 | # We are essentially replicating the "dense" layer here, but with a convolutional layer 871 | # so that later we can do genome-wide prediction. 872 | model.add( 873 | Convolution1D( 874 | self.num_output_marks, # output_dim, 875 | self.dataset_params['seq_length'], 876 | init='uniform', 877 | border_mode='valid')) 878 | 879 | if model_params['predict_binary_output']: 880 | model.add(Activation('sigmoid')) 881 | else: 882 | model.add(Activation('relu')) 883 | 884 | # 'lrnn' stands for linear regression neural network 885 | # It is a single convolutional layer with filters that span the entire seq length. 886 | # Essentially, this replicates linear or logistic regression in the Keras framework. 887 | # border_mode='valid' means that it only does convolutions where the whole filter can fit in the sequence 888 | # so effectively it is only doing one convolution/feedforward operation during training. 889 | # We make it convolutional so that we can easily do genome-wide predictions later. 890 | # It has as many neurons as there are histone marks, that is, there is one filter per histone mark. 891 | # This way, each histone mark gets seq_length * num_input_marks parameters to make a linear prediction. 
892 | elif model_params['model_type'] == 'lrnn':
893 | model = Sequential()
894 | 
895 | model.add(
896 | Convolution1D(
897 | self.num_output_marks, # nb_filter: one filter per histone mark
898 | self.dataset_params['seq_length'], # filter_length
899 | input_dim=self.num_input_marks,
900 | border_mode='valid'))
901 | 
902 | if model_params['predict_binary_output']:
903 | model.add(Activation('sigmoid'))
904 | 
905 | else:
906 | raise Exception, "Model type not recognized"
907 | 
908 | self.model = model
909 | self.save_model_params()
910 | 
911 | 
912 | def process_X(self, X):
913 | """
914 | See documentation in SeqModel.
915 | Input to seq-to-point models needs no further processing from load_seq_dataset.
916 | """
917 | 
918 | return X
919 | 
920 | 
921 | def process_Y(self, Y):
922 | """
923 | See documentation in SeqModel.
924 | Takes in matrix Y of shape num_examples x seq_length x num_histone_marks
925 | and returns matrix of shape num_examples x 1 x num_histone_marks, selecting the
926 | middle of the sequence.
927 | 
928 | We want the singleton dimension so that we can avoid flattening the output of the
929 | model in Keras. This doesn't matter in training, but it does in testing when we
930 | are trying to do genome-wide predictions.
931 | """
932 | 
933 | # If seq_length is 101
934 | # then the array goes from 0 to 100
935 | # and we want to pick mid = 50
936 | mid = (self.dataset_params['seq_length'] - 1) / 2
937 | 
938 | # Y = np.squeeze(Y[:, mid, :])
939 | # return Y
940 | 
941 | return Y[:, mid:mid+1, :]
942 | 
943 | 
944 | def predict_samples(self, signalX):
945 | """
946 | Takes in input matrix signalX, of shape num_examples x seq_length x num_input_marks
947 | and feeds it through the model, returning an output matrix of shape
948 | num_examples x 1 x num_output_marks.
949 | """
950 | Y = self.SeqToX_predict_samples(signalX)
951 | assert Y.shape[1] == 1
952 | 
953 | return Y
954 | 
955 | 
956 | def predict_sequence(self, signalX):
957 | """
958 | Takes in input matrix signalX of dimensions num_bins x num_input_marks
959 | and passes it through the model,
960 | returning an output matrix of num_bins x num_output_marks.
961 | """
962 | if ('lrnn' not in self.model_params['model_type']) and ('cnn' not in self.model_params['model_type']):
963 | raise NotImplementedError
964 | 
965 | # We have to do some zero-padding on the input sequences before we pass them to the
966 | # convolutional models defined in SeqToPoint.
967 | # This is because the final layer of these conv nets is a 'valid' convolution with
968 | # filter_length = seq_length. This means that the output of that layer, and therefore the
969 | # model, will be (seq_length - 1) shorter than the input to that layer. This is necessary
970 | # for training, since in training the input is a sequence whereas the output is a single
971 | # bin in the middle of the sequence. However, when trying to do genome-wide prediction,
972 | # we need the output shape to match the input shape.
973 | 
974 | # Warning: this code assumes that the final layer of the conv net is a 'valid' conv with
975 | # filter_length = seq_length.
976 | num_bins = signalX.shape[0]
977 | 
978 | # Initially, the shape of signalX is num_bins x num_input_marks.
979 | # We add (seq_length - 1) / 2 zeroes to both sides of the input, so that the
980 | # resulting shape of the padded input is (num_bins + seq_length - 1) x num_input_marks.
981 | # The shape of the output will then be exactly num_bins x num_output_marks. 
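
# pad_sequence_with_zeros is defined elsewhere in the repo and is not shown in
# this excerpt. Assuming it simply zero-pads along the bin axis (which is what
# the shape arithmetic above requires), a minimal np.pad equivalent would be:

import numpy as np

def pad_sequence_with_zeros_sketch(signal, padding):
    """Zero-pads a (num_bins, num_marks) array with `padding` bins on each side."""
    return np.pad(signal, ((padding, padding), (0, 0)), mode='constant')

# e.g. a (10, 3) array with padding = 50 becomes (110, 3)
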
982 | assert len(signalX.shape) == 2 983 | assert signalX.shape[1] == self.num_input_marks 984 | signalX_pad = pad_sequence_with_zeros( 985 | signalX, 986 | padding=(self.dataset_params['seq_length'] - 1) / 2) 987 | 988 | # After padding, we reshape the input to fit the Keras predict() API, 989 | # which requires a 3-tensor where the first dimension is the number of examples. 990 | # In our case, the number of examples is always 1 when doing genome-wide prediction. 991 | signalX = np.reshape( 992 | signalX_pad, 993 | [1, signalX_pad.shape[0], signalX_pad.shape[1]]) 994 | 995 | Y = self.model.predict(signalX) 996 | Y = Y[0] 997 | 998 | assert Y.shape[0] == num_bins 999 | assert Y.shape[1] == self.num_output_marks 1000 | 1001 | return Y 1002 | 1003 | 1004 | 1005 | -------------------------------------------------------------------------------- /prepData.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | 6 | from subprocess import call, check_output 7 | import os 8 | import json 9 | from traceback import print_exc 10 | import signal 11 | import sys 12 | import pandas as pd 13 | from time import time, sleep 14 | import numpy as np 15 | import multiprocessing 16 | import thread 17 | import gzip 18 | 19 | import IPython 20 | 21 | from diConstants import (PIPELINE_ROOT, CODE_ROOT, DATA_ROOT, RAW_ROOT, MERGED_ROOT, REMOTE_ROOT, 22 | SUBSAMPLED_ROOT, BIGWIGS_ROOT, INTERVALS_ROOT, NUMPY_ROOT, BASE_ROOT, BASE_BIGWIG_ROOT, 23 | RESULTS_BIGWIG_ROOT, MODELS_ROOT, 24 | HG19_BLACKLIST_FILE, MM9_BLACKLIST_FILE, 25 | BIN_SIZE, HG19_CHROM_SIZES, HG19_CHROM_SIZES_PATH, MM9_CHROM_SIZES, MM9_CHROM_SIZES_PATH, 26 | PEAK_BASE_DIR, COMBINED_PEAK_DIR, SUBSAMPLE_TARGETS, 27 | GM_CELL_LINES, GM_FACTORS, GM_DATASET_NAME_TEMPLATE, 28 | HG19_ALL_CHROMS, MM9_ALL_CHROMS, 29 | MAPQ_THRESHOLD) 30 | 31 | 32 | def perform_normalization(X, normalization): 33 | """ 34 | Normalizes a dataset using a method in ['log', 'arcsinh', None]. If none, just returns original dataset. 35 | """ 36 | assert(normalization in ['log', 'arcsinh', None]) 37 | if set(X.flatten()) == set([1.0, 0.0]): 38 | assert(normalization is None) 39 | 40 | if normalization in ['arcsinh', 'log']: 41 | if normalization == 'arcsinh': 42 | X = np.arcsinh(X) 43 | else: 44 | X = np.log(X + 1) 45 | print('Normalization: took %s of data. Mean is now %2.3f, max %2.3f' % (normalization, np.mean(X), np.max(X))) 46 | return X 47 | 48 | 49 | def perform_denormalization(X, normalization): 50 | """ 51 | Denormalizes a dataset using a method in ['log', 'arcsinh', None]. If none, just returns original dataset. 52 | """ 53 | 54 | assert(normalization in ['log', 'arcsinh', None]) 55 | if set(X.flatten()) == set([1.0, 0.0]): 56 | assert(normalization is None) 57 | if normalization in ['arcsinh', 'log']: 58 | if normalization == 'arcsinh': 59 | X = np.sinh(X) 60 | else: 61 | X = np.exp(X) - 1 62 | print('Denormalization: took inverse %s of data. Mean is now %2.3f, max %2.3f' % (normalization, np.mean(X), np.max(X))) 63 | return X 64 | 65 | 66 | def check_npz_files(): 67 | """ 68 | This confirms that we can load all the .npz files in BASE_DIR (for some reason they were getting corrupted.) 
69 | """ 70 | desired_keys = ['chr' + str(i) for i in range(1, 23)] 71 | n_successes = n_errors = 0 72 | for f in os.listdir(BASE_ROOT): 73 | if '.npz' not in f: 74 | continue 75 | try: 76 | d = np.load(os.path.join(BASE_ROOT, f)) 77 | n_successes += 1 78 | assert(sorted(d.keys()) == sorted(HG19_ALL_CHROMS)) # This check will fail on mouse 79 | except: 80 | n_errors += 1 81 | os.remove(os.path.join(BASE_ROOT, f)) 82 | print('Error with ' + f) 83 | continue 84 | print('successes', n_successes, 'errors', n_errors) 85 | 86 | 87 | def get_peaks(cell_line, factor, subsample_target_string): 88 | """ 89 | chrs_to_peaks: a dictionary whose keys are chromosomes which map to an array of bin starts and ends 90 | indices (not chromosome locations) which are peaks. 91 | Eg, {'chr1':[[5, 10], [25, 50]]} means bins 5 - 9 and 25 - 49 on chromosome 1 are peaks. 92 | When computing peak boundaries, rounds (ie, a peak beginning at bin .6 = a bin beginning at bin 1.) 93 | peak_log_pvalues: a dictionary whose keys are chromosomes which map to an array of peak log pvalues 94 | in the same order as the peaks in chrs_to_peaks. 95 | Eg, {'chr1':[99, 104]} means the peaks in chr1 have log10 pvalues 99 and 104, respectively. 96 | """ 97 | 98 | peak_path = get_peak_path(cell_line, factor, subsample_target_string) 99 | if not os.path.isfile(peak_path): 100 | raise ValueError, "%s does not exist." % peak_path 101 | 102 | d = pd.read_csv(peak_path, sep = '\t', header = None) 103 | d = d[[0, 1, 2, 13]] 104 | 105 | d.columns = ['chr', 'start', 'end', 'log10_pvalue'] 106 | chrs = list(set(d['chr'])) 107 | chrs_to_peaks = {} 108 | peak_log_pvalues = {} 109 | for chrom in chrs: 110 | idxs = d['chr'] == chrom 111 | chrs_to_peaks[chrom] = np.array(zip(list(d.loc[idxs]['start']), list(d.loc[idxs]['end']))) 112 | chrs_to_peaks[chrom] = np.around(chrs_to_peaks[chrom] / BIN_SIZE).astype(int) 113 | peak_log_pvalues[chrom] = np.array(d.loc[idxs]['log10_pvalue']) 114 | assert(len(peak_log_pvalues[chrom]) == len(chrs_to_peaks[chrom])) 115 | return chrs_to_peaks, peak_log_pvalues 116 | 117 | 118 | 119 | def generate_bigWig(data, marks, bigWig_prefix, bigWig_folder): 120 | """ 121 | Takes in data, a dictionary with keys corresponding to chromosomes 122 | and each chromosome being a matrix of shape num_bins x num_histone_marks 123 | and outputs bigWigs generated from that data in bigWig_folder, 124 | one for each factor in FACTORS_TO_INCLUDE 125 | """ 126 | 127 | assert data[data.keys()[0]].shape[1] == len(marks) 128 | chrom_sizes_path = HG19_CHROM_SIZES_PATH 129 | 130 | for (factorIdx, factor) in enumerate(marks): 131 | 132 | wig_path = os.path.join(bigWig_folder, '%s_%s.wig' % (bigWig_prefix, factor)) 133 | bigWig_path = os.path.join(bigWig_folder, '%s_%s.bw' % (bigWig_prefix, factor)) 134 | 135 | with open(wig_path, 'w') as f: 136 | for chrom in data: 137 | 138 | f.write('fixedStep chrom=%s start=1 step=%s span=%d\n' % (chrom, BIN_SIZE, BIN_SIZE)) 139 | 140 | for i in data[chrom][:, factorIdx]: 141 | f.write('%s\n' % str(i)) 142 | 143 | call('bash scripts/convertWigToBigWig.sh %s %s %s' % (wig_path, bigWig_path, chrom_sizes_path), 144 | shell=True) 145 | 146 | return None 147 | 148 | 149 | def get_blacklisted_locs(cell_line): 150 | """ 151 | Returns a dictionary whose keys are chromosomes which map to an array of bin starts and ends 152 | indices (not chromosome locations) to exclude: does not include upper end of range (in line with numpy indexing conventions). 
153 | Eg, {'chr1':[[5, 10], [25, 50]]} means we should exclude bins 5 - 9 and 25 - 49 on chromosome 1. 154 | """ 155 | if get_species(cell_line) == 'mm9': 156 | blacklist_file = MM9_BLACKLIST_FILE 157 | else: 158 | blacklist_file = HG19_BLACKLIST_FILE 159 | 160 | d = pd.read_csv(blacklist_file, sep = "\t") 161 | blacklist_dictionary = {} 162 | for i in range(len(d)): 163 | chrom = d.iloc[i]['chromosome'] 164 | start = d.iloc[i]['start'] 165 | end = d.iloc[i]['end'] 166 | if chrom not in blacklist_dictionary: 167 | blacklist_dictionary[chrom] = [] 168 | blacklist_dictionary[chrom].append([int(1.*start / BIN_SIZE), int(1. * end / BIN_SIZE) + 1]) 169 | 170 | return blacklist_dictionary 171 | 172 | 173 | def get_merged_BAM_path(cell_line, factor): 174 | """ 175 | Returns the path to the BAM file that contains all merged replicates 176 | for a given cell_line and factor. 177 | """ 178 | 179 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bam' % (cell_line, factor)) 180 | 181 | 182 | def get_merged_BED_SE_path(cell_line, factor): 183 | """ 184 | Returns the path to the BED file that contains all merged replicates 185 | for a given cell_line and factor. This is for single-end reads. 186 | These BED files have already been filtered for MAPQ. 187 | """ 188 | 189 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bed' % (cell_line, factor)) 190 | 191 | 192 | def get_merged_BED_path(cell_line, factor): 193 | """ 194 | Returns the path to the BEDPE file that contains all merged replicates 195 | for a given cell_line and factor. 196 | These BEDPE files have already been filtered for MAPQ and properly paired reads. 197 | """ 198 | 199 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bedpe' % (cell_line, factor)) 200 | 201 | 202 | def get_tagAlign_path(cell_line, factor, subsample_target_string = None): 203 | """ 204 | Returns the path to the tagAlign file that contains all merged replicates 205 | for a given cell_line and factor. 206 | These tagAlign files have already been filtered for MAPQ and properly paired reads. 207 | 208 | If subsample_target_string is specified, return a subsampled tagAlign instead. 209 | """ 210 | 211 | if subsample_target_string: 212 | return os.path.join(SUBSAMPLED_ROOT, '%s-%s_subsample-%s.tagAlign.gz' % (cell_line, factor, subsample_target_string)) 213 | else: 214 | return os.path.join(MERGED_ROOT, '%s-%s_merged.tagAlign.gz' % (cell_line, factor)) 215 | 216 | 217 | def get_bigWig_folder(cell_line, factor, subsample_target_string = None): 218 | """ 219 | Returns the name of the output folder where bigWigs for a given cell_line, factor, 220 | and optionally subsample_target_string should be placed. 221 | This output folder is passed to the ENCODE CHiP-seq pipeline. 
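
For example (hypothetical values, assuming BIGWIGS_ROOT is '/data/bigwigs'):

>>> get_bigWig_folder('GM12878', 'H3K27AC', '0.5e6')
'/data/bigwigs/GM12878-H3K27AC_subsample-0.5e6'
>>> get_bigWig_folder('GM12878', 'H3K27AC')
'/data/bigwigs/GM12878-H3K27AC_merged'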
222 | """ 223 | 224 | if subsample_target_string: 225 | return os.path.join(BIGWIGS_ROOT, '%s-%s_subsample-%s' % (cell_line, factor, subsample_target_string)) 226 | else: 227 | return os.path.join(BIGWIGS_ROOT, '%s-%s_merged' % (cell_line, factor)) 228 | 229 | 230 | def get_peak_path(cell_line, factor, subsample_target_string): 231 | assert(factor != 'INPUT') 232 | if subsample_target_string: 233 | subsample_output_string = "subsample-%s" % subsample_target_string 234 | else: 235 | subsample_output_string = "merged" 236 | 237 | return os.path.join( 238 | PEAK_BASE_DIR, 239 | 'peak', 240 | 'macs2', 241 | 'rep1', 242 | '%s-%s_%s' % (cell_line, factor, subsample_output_string) + 243 | '.tagAlign_x_%s-INPUT_%s.tagAlign.gappedPeak.gz' % (cell_line, subsample_output_string)) 244 | 245 | 246 | def get_peak_bigWig_path(cell_line, factor, subsample_target_string = None): 247 | """ 248 | Returns the path to the bigWig file that contains the peak p-values 249 | for a given cell_line, factor, and optionally 250 | subsample_target_string. 251 | """ 252 | if subsample_target_string: 253 | subsample_output_string = "subsample-%s" % subsample_target_string 254 | else: 255 | subsample_output_string = "merged" 256 | 257 | return os.path.join( 258 | PEAK_BASE_DIR, 259 | 'signal', 260 | 'macs2', 261 | 'rep1', 262 | '%s-%s_%s' % (cell_line, factor, subsample_output_string) + 263 | '.tagAlign_x_%s-INPUT_%s.tagAlign.pval.signal.bw' % (cell_line, subsample_output_string)) 264 | 265 | 266 | def get_bigWig_path(cell_line, factor, subsample_target_string = None): 267 | """ 268 | Returns the path to the bigWig file that contains the output of align2rawsignal 269 | (from the ENCODE CHiP-seq pipeline) for a given cell_line, factor, and optionally 270 | subsample_target_string. 271 | """ 272 | 273 | if subsample_target_string: 274 | return os.path.join( 275 | BIGWIGS_ROOT, 276 | '%s-%s_subsample-%s' % (cell_line, factor, subsample_target_string), 277 | 'signal', 278 | 'tag2bw', 279 | 'rep1', 280 | '%s-%s_subsample-%s.bigwig' % (cell_line, factor, subsample_target_string)) 281 | 282 | else: 283 | return os.path.join( 284 | BIGWIGS_ROOT, 285 | '%s-%s_merged' % (cell_line, factor), 286 | 'signal', 287 | 'tag2bw', 288 | 'rep1', 289 | '%s-%s_merged.bigwig' % (cell_line, factor)) 290 | 291 | 292 | def get_intervals_path(chrom, species): 293 | """ 294 | Returns the path to the intervals BED file for a given chromosome. 295 | 296 | This BED file contains equally spaced intervals at BIN_SIZE.""" 297 | 298 | assert species in ['hg19', 'mm9'] 299 | return os.path.join(INTERVALS_ROOT, '%s_%s_%s.bed' % (species, chrom, BIN_SIZE)) 300 | 301 | 302 | def get_numpy_path(cell_line, factor, chrom, subsample_target_string=None): 303 | """ 304 | Returns the path of the numpy array containing the binned signal for a given cell_line, factor, 305 | and optionally subsample_target_string. 306 | """ 307 | 308 | if subsample_target_string: 309 | return os.path.join(NUMPY_ROOT, '%s-%s-%s_subsample-%s.npy' % (cell_line, factor, chrom, subsample_target_string)) 310 | 311 | else: 312 | return os.path.join(NUMPY_ROOT, '%s-%s-%s_merged.npy' % (cell_line, factor, chrom)) 313 | 314 | def get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string=None): 315 | """ 316 | Returns the path of the numpy array containing the binned peak p-value signal for a given cell_line, factor, 317 | and optionally subsample_target_string. 
318 | """ 319 | assert(factor != 'INPUT') 320 | if subsample_target_string: 321 | return os.path.join(NUMPY_ROOT, 'peak_pvals_by_bin_%s-%s-%s_subsample-%s.npy' % (cell_line, factor, chrom, subsample_target_string)) 322 | 323 | else: 324 | return os.path.join(NUMPY_ROOT, 'peak_pvals_by_bin_%s-%s-%s_merged.npy' % (cell_line, factor, chrom)) 325 | 326 | def get_base_path(dataset_name, subsample_target_string, normalization, peaks=False): 327 | """ 328 | If peaks is True, returns the base path for the peak pvals; otherwise, returns base path for continuous signal. 329 | 330 | Normalization is always set to None if peaks is True. 331 | """ 332 | if peaks: 333 | return os.path.join(BASE_ROOT, 'peak_pvals_by_bin_%s_subsample-%s_norm-None.npz' % 334 | (dataset_name, subsample_target_string)) 335 | else: 336 | return os.path.join(BASE_ROOT, '%s_subsample-%s_norm-%s.npz' % 337 | (dataset_name, subsample_target_string, normalization)) 338 | 339 | 340 | def get_metadata_path(dataset_name, subsample_target_string, normalization): 341 | return os.path.join(BASE_ROOT, '%s_subsample-%s_norm-%s.metadata' % 342 | (dataset_name, subsample_target_string, normalization)) 343 | 344 | 345 | def merge_BAMs(cell_lines_to_use, factors_to_use): 346 | """ 347 | Takes a remote directory (REMOTE_ROOT) containing several different cell lines, marks, and 348 | replicates, copies the data over to a local directory (RAW_ROOT), then combines all replicates 349 | for each pair of cell lines and marks. Outputs to MERGED_ROOT. 350 | Only looks at cell lines that are in cell_lines_to_use and marks that are in factors_to_use. 351 | 352 | Operates on raw data available at http://gbsc-share.stanford.edu/chromovar/rawdata/ 353 | """ 354 | 355 | cell_mark_pairs = set() 356 | cell_mark_name_triples = [] 357 | all_cmds = [[]] 358 | 359 | # First, copy files over from REMOTE_ROOT (/mnt/data...) to RAW_ROOT 360 | for f in os.listdir(REMOTE_ROOT): 361 | if (os.path.isfile(os.path.join(REMOTE_ROOT, f)) and f.startswith('SNYDER_HG19_') 362 | and f.endswith('.dedup.bam')): 363 | 364 | spl = f.split('_') 365 | cell_line = spl[2] 366 | if cell_line not in cell_lines_to_use: 367 | continue 368 | 369 | mark = spl[3] 370 | if mark not in factors_to_use: 371 | continue 372 | 373 | all_cmds[0].append('cp %s %s' % (os.path.join(REMOTE_ROOT, f), RAW_ROOT)) 374 | 375 | cell_mark_pairs.add((cell_line, mark)) 376 | cell_mark_name_triples.append((cell_line, mark, f)) 377 | 378 | # Then process all files in RAW_ROOT 379 | for (cell, mark) in cell_mark_pairs: 380 | 381 | # How many replicates does this (cell, mark) pair have? 382 | count = 0 383 | filename = '' 384 | for (c, m, f) in cell_mark_name_triples: 385 | if cell == c and mark == m: 386 | count += 1 387 | filename = f 388 | assert count > 0 389 | 390 | if count == 1: 391 | print("%s-%s has no replicates. Copying straight..." % (cell, mark)) 392 | all_cmds[-1].append("cp %s %s;" % (os.path.join(RAW_ROOT, filename), get_merged_BAM_path(cell, mark))) 393 | 394 | else: 395 | print("%s-%s has %s replicates. Merging..." 
% (cell, mark, count)) 396 | all_cmds[-1].append("samtools merge %s %s/*%s_%s*.bam" % \ 397 | (get_merged_BAM_path(cell, mark), RAW_ROOT, cell, mark)) 398 | return all_cmds 399 | 400 | 401 | def filter_and_convert_BAMs(cell_lines_to_use, factors_to_use): 402 | """ 403 | Looks at all merged BAM files in MERGED_ROOT, and for each BAM file, 404 | filters out all reads below MAPQ 30 and all reads that aren't paired properly, 405 | and then outputs a tagAlign.gz file with only the filtered reads 406 | in the same MERGED_ROOT folder. 407 | """ 408 | all_cmds = [[], []] 409 | for cell_line in cell_lines_to_use: 410 | for factor in factors_to_use: 411 | 412 | BAM_path = get_merged_BAM_path(cell_line, factor) 413 | tagAlign_path = get_tagAlign_path(cell_line, factor) 414 | 415 | if os.path.isfile(BAM_path): 416 | BED_path = get_merged_BED_path(cell_line, factor) 417 | all_cmds[0].append("bash scripts/filterAndConvertBAMs.sh %s %s %s" % (BAM_path, BED_path, MAPQ_THRESHOLD)) 418 | all_cmds[1].append("bash scripts/convertBEDPEtoTagAlign.sh %s %s" % (BED_path, tagAlign_path)) 419 | else: 420 | print("Warning: %s does not exist. Skipping..." % BAM_path) 421 | return all_cmds 422 | 423 | 424 | def subsample_BAMs(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 425 | """ 426 | For each cell_line and factor, subsamples the corresponding BEDPE file to 427 | the desired depths. Outputs in SUBSAMPLED_ROOT a tagAlign.gz file for each 428 | (cell_line, factor, subsample_target) combination. 429 | """ 430 | all_cmds = [[]] 431 | for cell_line in cell_lines_to_use: 432 | for factor in factors_to_use: 433 | 434 | subsample_input = get_merged_BED_path(cell_line, factor) 435 | full_reads = int(float(check_output('wc -l %s' % subsample_input, shell=True).split(' ')[0])) 436 | # subsample_command = "" 437 | 438 | for subsample_target_string in subsample_targets_to_use: 439 | 440 | if subsample_target_string == None: 441 | continue 442 | 443 | subsample_target = int(float(subsample_target_string)) 444 | 445 | if full_reads < subsample_target: 446 | print("Warning: %s-%s only has %s read pairs, less than subsampling target of %s. Skipping..." % 447 | (cell_line, factor, full_reads, subsample_target_string)) 448 | continue 449 | 450 | print("Subsampling %s-%s: %s read pairs from %s read pairs" % (cell_line, factor, subsample_target_string, full_reads)) 451 | 452 | subsample_output = get_tagAlign_path(cell_line, factor, subsample_target_string) 453 | 454 | # if subsample_command != "": 455 | # subsample_command += '; ' 456 | 457 | cmd = "bash scripts/subsampleBEDPEs.sh %s %s %s" % (subsample_input, subsample_output, subsample_target) 458 | # subsample_command += cmd 459 | 460 | all_cmds[0].append(cmd) 461 | 462 | # subsample_command = "(" + subsample_command + ") &" 463 | #call(subsample_command, shell=True) 464 | return all_cmds 465 | 466 | 467 | def get_chrom_sizes(cell_line): 468 | if get_species(cell_line) == 'mm9': 469 | return MM9_CHROM_SIZES 470 | else: 471 | return HG19_CHROM_SIZES 472 | 473 | def get_species(cell_line): 474 | if 'MOUSE' in cell_line: 475 | return 'mm9' 476 | else: 477 | return 'hg19' 478 | 479 | def get_signal_tracks(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 480 | """ 481 | Calls the ENCODE CHiP-seq pipeline on the tagAlign files for all 482 | cell lines, factors, and subsample targets (including the full data). 483 | Outputs in BIGWIGS_ROOT a .bigWig file for each 484 | (cell_line, factor, subsample_target) combination. 
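
Each generated command invokes scripts/getSignalTrack.sh; a representative
command (with hypothetical paths) looks like

bash scripts/getSignalTrack.sh /path/to/pipeline /data/merged/GM12878-H3K27AC_merged.tagAlign.gz /data/bigwigs/GM12878-H3K27AC_merged hg19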
485 | """ 486 | all_cmds = [[]] 487 | for cell_line in cell_lines_to_use: 488 | species = get_species(cell_line) 489 | for factor in factors_to_use: 490 | 491 | chrom_sizes = get_chrom_sizes(cell_line) 492 | 493 | # This gets signal tracks from both full and subsampled data 494 | # because None is an element of SUBSAMPLE_TARGETS 495 | 496 | signal_command = "" 497 | 498 | for subsample_target_string in subsample_targets_to_use: 499 | tagAlign_path = get_tagAlign_path(cell_line, factor, subsample_target_string) 500 | bigWig_folder = get_bigWig_folder(cell_line, factor, subsample_target_string) 501 | 502 | if os.path.isfile(tagAlign_path): 503 | files_already_exist = check_whether_BW_files_exist( 504 | cell_line, 505 | factor, 506 | subsample_target_string, 507 | average_peaks=False) 508 | 509 | if files_already_exist: 510 | print('Bigwig files already exist for %s; skipping.' % bigWig_folder) 511 | else: 512 | print('Bigwig files DO NOT exist for %s; adding to tasks.' % bigWig_folder) 513 | if signal_command != "": 514 | signal_command += '; ' 515 | cmd = "bash scripts/getSignalTrack.sh %s %s %s %s" % (PIPELINE_ROOT, tagAlign_path, bigWig_folder, species) 516 | signal_command += cmd 517 | all_cmds[0].append(cmd) 518 | else: 519 | print("Warning: %s does not exist. Skipping..." % tagAlign_path) 520 | 521 | signal_command = "(" + signal_command + ") &" 522 | 523 | #call(signal_command, shell=True) 524 | return all_cmds 525 | 526 | def make_intervals(species): 527 | """ 528 | Constructs BED files, one for each chromosome, each containing equally 529 | spaced intervals at BIN_SIZE. 530 | 531 | The third column of the BED file is exclusive, i.e., the interval is 532 | actually [start, end). So for a BIN_SIZE of size 25 the intervals will look like 533 | chr1 0 25 534 | chr2 25 50 535 | ... 536 | 537 | For convenience, here is the official documentation: 538 | 539 | chromEnd - The ending position of the feature in the chromosome or scaffold. 540 | The chromEnd base is not included in the display of the feature. 541 | For example, the first 100 bases of a chromosome are defined as 542 | chromStart=0, chromEnd=100, and span the bases numbered 0-99. 543 | 544 | The fourth column (name) is added because bigWigAverageOverBed only accepts 545 | BED files with 4 columns. 546 | 547 | We just truncate the end of the chromosome if it's not cleanly divisible 548 | by BIN_SIZE. 549 | """ 550 | 551 | if species == 'hg19': 552 | chrom_sizes = HG19_CHROM_SIZES 553 | elif species == 'mm9': 554 | chrom_sizes = MM9_CHROM_SIZES 555 | else: 556 | raise ValueError, 'species must be hg19 or mm9' 557 | 558 | for chrom, chrom_size in chrom_sizes.items(): 559 | print("Generating BED file for %s" % chrom) 560 | BED_path = get_intervals_path(chrom, species) 561 | 562 | with open(BED_path, 'w') as f: 563 | for start in range(0, chrom_size - BIN_SIZE + 1, BIN_SIZE): 564 | end = start + BIN_SIZE 565 | name = "%s-%s" % (chrom, start) 566 | f.write("%s\t%s\t%s\t%s\n" % (chrom, start, end, name)) 567 | 568 | def check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks): 569 | """ 570 | Checks whether bigwig files + the corresponding interval paths exist. 
571 | """ 572 | 573 | allFilesExist = True 574 | 575 | if average_peaks: 576 | bigWig_path = get_peak_bigWig_path(cell_line, factor, subsample_target_string) 577 | else: 578 | bigWig_path = get_bigWig_path(cell_line, factor, subsample_target_string) 579 | if not (os.path.isfile(bigWig_path)): 580 | allFilesExist = False 581 | 582 | species = get_species(cell_line) 583 | chrom_sizes = get_chrom_sizes(cell_line) 584 | for chrom in chrom_sizes.keys(): 585 | BED_path = get_intervals_path(chrom, species) 586 | if not os.path.isfile(BED_path): 587 | allFilesExist = False 588 | 589 | return allFilesExist 590 | 591 | def get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = False): 592 | """ 593 | Averages the signal in the .bigWig files in BIGWIGS_ROOT into bins of BIN_SIZE. 594 | Outputs a .npy file in NUMPY_ROOT for each (cell_line, factor, subsample_target) 595 | combination. 596 | 597 | This calls the bigWigAverageOverBed tool from UCSC tools and takes the mean0 column. 598 | 599 | This function does nothing if the .npy file in NUMPY_ROOT already exists. 600 | """ 601 | all_cmds = [[], [], []] 602 | assert(input_not_before_end(factors_to_use)) 603 | for cell_line in cell_lines_to_use: 604 | for factor in factors_to_use: 605 | if average_peaks and factor == 'INPUT': 606 | continue 607 | 608 | chrom_sizes = get_chrom_sizes(cell_line) 609 | species = get_species(cell_line) 610 | # This averages signal tracks from both full and subsampled data 611 | # because None is an element of subsample_targets_to_use 612 | for subsample_target_string in subsample_targets_to_use: 613 | allFilesExist = check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks) 614 | if allFilesExist: 615 | print('All files exist for %s, %s, %s, average_peaks = %s; averaging signal over intervals' % (cell_line, factor, subsample_target_string, average_peaks)) 616 | for chrom in chrom_sizes.keys(): 617 | BED_path = get_intervals_path(chrom, species) 618 | if average_peaks: 619 | bigWig_path = get_peak_bigWig_path(cell_line, factor, subsample_target_string) 620 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 621 | else: 622 | bigWig_path = get_bigWig_path(cell_line, factor, subsample_target_string) 623 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 624 | output_path = bigWig_path + '-%s_binned.out' % chrom 625 | if os.path.isfile(numpy_path):#we've already done everything. 626 | print("Warning: %s already exists. Skipping..." % numpy_path) 627 | else: 628 | print("Numpy file does not exist; creating %s" % (numpy_path)) 629 | if os.path.isfile(output_path): 630 | print("Warning: %s already exists. Skipping..." % output_path) 631 | else: 632 | cmd = "bash scripts/averageSignalTrack.sh %s %s %s" % (bigWig_path, BED_path, output_path) 633 | all_cmds[0].append(cmd) 634 | all_cmds[1].append('python prepData.py turn_into_numpy %s %s' % (output_path, numpy_path)) 635 | # Clean up intermediate output 636 | all_cmds[2].append("rm -rf %s" % output_path) 637 | 638 | else: 639 | print('Warning: not all files exist for %s, %s, %s, average_peaks = %s' % (cell_line, factor, subsample_target_string, average_peaks)) 640 | 641 | return all_cmds 642 | 643 | def turn_into_numpy(output_path, numpy_path): 644 | """ 645 | Saves the output_path as a numpy_path. 
646 | """ 647 | df = pd.read_csv(output_path, header = None) 648 | np.save(numpy_path, np.array(df)) 649 | 650 | 651 | def prep_dataset(dataset_name, cell_line, factors_to_include, chroms_to_include, 652 | subsample_targets, normalization, peak_dataset = False): 653 | """ 654 | Cobbles together a single .npz file containing binned signals for a given cell_line, 655 | list of factors, and list of chromosomes. There is one .npz file per 656 | (cell_line, subsample_target, normalization) triplet. 657 | 658 | Output is a single .npz file in BASE_ROOT with name dataset_name. 659 | This .npz file contains one matrix for each chromosome. 660 | Each matrix is of dimensions num_bins x num_factors, 661 | where num_bins is roughly floor(length of chromosome / BIN_SIZE), 662 | and num_factors is the length of factors_to_include. 663 | 664 | If peak_dataset = True, loads a peak dataset instead. 665 | """ 666 | 667 | if peak_dataset: 668 | assert(normalization is None) 669 | 670 | assert(input_not_before_end(factors_to_include)) 671 | if peak_dataset: 672 | factors_to_include = np.copy(factors_to_include) 673 | if factors_to_include[-1] == 'INPUT': 674 | factors_to_include = factors_to_include[:-1] 675 | 676 | for subsample_target_string in subsample_targets: 677 | 678 | output_path = get_base_path(dataset_name, subsample_target_string, normalization) 679 | if os.path.isfile(output_path): 680 | print('Output file %s exists' % output_path) 681 | continue 682 | print("Preparing %s %s" % (dataset_name, subsample_target_string)) 683 | # First make sure that all the numpy files we need exist 684 | do_files_exist = True 685 | for chrom in chroms_to_include: 686 | for factor in factors_to_include: 687 | if peak_dataset: 688 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 689 | else: 690 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 691 | if not os.path.isfile(numpy_path): 692 | print('Warning: %s does not exist' % numpy_path) 693 | do_files_exist = False 694 | break 695 | 696 | if not do_files_exist: 697 | print("Warning: not all .npy files are ready to make dataset %s for %s %s" % (dataset_name, cell_line, subsample_target_string)) 698 | continue 699 | 700 | 701 | # Write dataset metadata to disk 702 | if not peak_dataset: 703 | metadata = { 704 | 'dataset_name': dataset_name, 705 | 'cell_line': cell_line, 706 | 'factors_to_include': factors_to_include, 707 | 'chroms_to_include': chroms_to_include, 708 | 'subsample_targets': subsample_targets, 709 | 'normalization': normalization 710 | } 711 | metadata_path = get_metadata_path(dataset_name, subsample_target_string, normalization) 712 | with open(metadata_path, 'w') as f: 713 | f.write(json.dumps(metadata)) 714 | 715 | 716 | # Construct output matrix 717 | 718 | num_factors = len(factors_to_include) 719 | matrices = {} 720 | unnormalized_matrices = {} 721 | blacklist_buffer = 5 722 | blacklisted_locs = get_blacklisted_locs(cell_line) 723 | 724 | for chrom in chroms_to_include: 725 | 726 | print("... 
packing %s" % chrom) 727 | 728 | first_factor = True 729 | 730 | for (idx, factor) in enumerate(factors_to_include): 731 | if peak_dataset: 732 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 733 | else: 734 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 735 | 736 | assert os.path.isfile(numpy_path), "Error: %s is missing" % numpy_path 737 | 738 | # Each individual chrom-factor is a column vector 739 | arr = np.load(numpy_path) 740 | 741 | if first_factor: 742 | first_factor = False 743 | num_bins = len(arr) 744 | chrom_matrix = np.empty([num_bins, num_factors]) 745 | 746 | chrom_matrix[:, idx] = arr[:, 0] 747 | 748 | # Zero out blacklist regions. Add a bit of buffer to be safe. 749 | print('Before blacklisting %s, average signal is %s' % (chrom, np.mean(chrom_matrix))) 750 | for bad_range in blacklisted_locs[chrom]: 751 | chrom_matrix[bad_range[0]-blacklist_buffer : bad_range[1]+blacklist_buffer, :] = 0 752 | print('After blacklisting %s, average signal is %s' % (chrom, np.mean(chrom_matrix))) 753 | 754 | # Save matrix for this chrom 755 | unnormalized_matrices[chrom] = chrom_matrix 756 | matrices[chrom] = perform_normalization(chrom_matrix, normalization) 757 | np.savez_compressed(output_path, **matrices) 758 | 759 | # Always save unnormalized bigWigs even if the actual data is normalized 760 | # because we don't want to view normalized bigWigs on the genome browser 761 | generate_bigWig( 762 | unnormalized_matrices, 763 | factors_to_include, 764 | '%s_subsample-%s_norm-None' % (dataset_name, subsample_target_string), 765 | BASE_BIGWIG_ROOT) 766 | 767 | 768 | def prep_dataset_wrapper(dataset_name, cell_line, factors_string, subsample_target, normalization, peak_dataset): 769 | """ 770 | This is just a wrapper to allow prep dataset to be called from the command line. 771 | """ 772 | 773 | if normalization == 'None': 774 | normalization = None 775 | if subsample_target == 'None': 776 | subsample_target = None 777 | assert(peak_dataset in ['True', 'False']) 778 | peak_dataset = peak_dataset == 'True' 779 | 780 | if get_species(cell_line) == 'mm9': 781 | all_chroms = MM9_ALL_CHROMS 782 | else: 783 | all_chroms = HG19_ALL_CHROMS 784 | prep_dataset(dataset_name, cell_line, factors_string.split('-'), all_chroms, 785 | [subsample_target], normalization, peak_dataset) 786 | 787 | 788 | def generate_datasets(cell_lines_to_use, dataset_name_template, factors_to_use, subsample_targets_to_use): 789 | """ 790 | Calls prep_dataset on each cell_line, factor, and subsample_target; 791 | Each dataset uses data from chr1-22 and all factors in factors_to_use. 792 | Also creates peak datasets. 793 | 794 | Output is in BASE_ROOT. 
795 | """ 796 | all_cmds = [[]] 797 | factors_string = '-'.join(factors_to_use) 798 | for cell_line in cell_lines_to_use: 799 | for subsample_target in subsample_targets_to_use: 800 | all_cmds[0].append('python prepData.py ' \ 801 | + ' prep_dataset_wrapper peak_pvals_by_bin_%s %s %s %s None True' % \ 802 | (dataset_name_template % cell_line, cell_line, factors_string, subsample_target)) 803 | all_cmds[0].append('python prepData.py ' \ 804 | + ' prep_dataset_wrapper %s %s %s %s arcsinh False' % \ 805 | (dataset_name_template % cell_line, cell_line, factors_string, subsample_target)) 806 | return all_cmds 807 | 808 | 809 | def call_all_peaks(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 810 | """ 811 | Calls the ENCODE CHiP-seq pipeline on the tagAlign files for all 812 | cell lines, factors, and subsample targets (including the full data). 813 | Outputs in PEAK_BASE_DIR/peaks_macs2/true_replicates a gappedPeak.gz file for each 814 | (cellLine, factor, subsampleTarget) combination. 815 | """ 816 | print('calling all peaks!!') 817 | all_cmds = [[]] 818 | for cell_line in cell_lines_to_use: 819 | species = get_species(cell_line) 820 | for factor in factors_to_use: 821 | if factor == 'INPUT': 822 | continue 823 | 824 | controls_and_inputs = [] 825 | 826 | for subsample_target_string in subsample_targets_to_use: 827 | if check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks = True): 828 | print('%-8s %-8s %-8s peak files already exist, not regenerating' % (cell_line, factor, subsample_target_string)) 829 | continue 830 | 831 | else: 832 | input_file = get_tagAlign_path(cell_line, factor, subsample_target_string = subsample_target_string) 833 | control_input_file = get_tagAlign_path(cell_line, 'INPUT', subsample_target_string = subsample_target_string) 834 | if os.path.exists(input_file) and os.path.exists(control_input_file): 835 | print('%-8s %-8s %-8s peak files DO NOT exist, regenerating' % (cell_line, factor, subsample_target_string)) 836 | controls_and_inputs.append([input_file, control_input_file]) 837 | else: 838 | print('%-8s %-8s %-8s input files DO NOT exist, cannot call peaks' % (cell_line, factor, subsample_target_string)) 839 | continue 840 | 841 | for input_file, control_input_file in controls_and_inputs: 842 | 843 | if os.path.isfile(input_file) and os.path.isfile(control_input_file): 844 | 845 | if os.path.isfile(control_input_file): 846 | cmd = "bash scripts/findPeaks.sh %s %s %s %s %s" % (PIPELINE_ROOT, PEAK_BASE_DIR, input_file, control_input_file, species) 847 | 848 | all_cmds[0].append(cmd) 849 | 850 | print('Running command ', cmd) 851 | else: 852 | print("Warning: input file %s or %s does not exist. Skipping..." % (input_file, control_input_file)) 853 | 854 | 855 | return all_cmds 856 | 857 | 858 | def input_not_before_end(list_of_marks): 859 | """ 860 | Makes sure that INPUT does not occur before the last element of a list of marks. 861 | """ 862 | return ('INPUT' not in list_of_marks[:-1]) 863 | def callCommand(cmd): 864 | call(cmd, shell = True) 865 | sleep(3) 866 | 867 | def fork_and_wait(n_proc, target, args=[]): 868 | """ 869 | Fork n_proc processes, run target(*args) in each, and wait to finish. 870 | This is Nathan's method. 
871 | """ 872 | if n_proc == 1: 873 | target(*args) 874 | return 875 | else: 876 | pids = [] 877 | for i in xrange(n_proc): 878 | pid = os.fork() 879 | if pid == 0: 880 | try: 881 | signal.signal(signal.SIGINT, handle_interrupt_signal) 882 | target(*args) 883 | os._exit(os.EX_OK) 884 | except Exception, inst: 885 | print_exc() 886 | config.log_statement( "Uncaught exception in subprocess\n" 887 | + traceback.format_exc(), log=True) 888 | os._exit(os.EX_SOFTWARE) 889 | else: 890 | pids.append(pid) 891 | try: 892 | while len(pids) > 0: 893 | ret_pid, error_code = os.wait() 894 | if ret_pid in pids: 895 | pids.remove(ret_pid) 896 | if error_code != os.EX_OK: 897 | raise OSError, "Process '{}' returned error code '{}'".format( 898 | ret_pid, error_code) 899 | except KeyboardInterrupt: 900 | for pid in pids: 901 | try: os.kill(pid, signal.SIGHUP) 902 | except: pass 903 | raise 904 | except OSError: 905 | for pid in pids: 906 | try: os.kill(pid, signal.SIGHUP) 907 | except: pass 908 | raise 909 | return 910 | 911 | class Counter(object): 912 | """ 913 | Nathan's implementation of the Counter class; used for running multiple threads simultaneously. 914 | """ 915 | def __init__(self, initval=0): 916 | self.val = multiprocessing.Value('i', initval) 917 | self.lock = multiprocessing.Lock() 918 | 919 | def return_and_increment(self): 920 | with self.lock: 921 | rv = self.val.value 922 | self.val.value += 1 923 | return rv 924 | def handle_interrupt_signal(signum, frame): 925 | os._exit(os.EX_TEMPFAIL) 926 | 927 | 928 | def run_in_parallel(task_name, n_proc, target, all_args): 929 | """ 930 | Run target on each item in items. 931 | all_args should be a list of lists (where each element is one argument set). 932 | """ 933 | if len(all_args) == 0: 934 | print("No tasks to run!") 935 | return 936 | curr_item = Counter() 937 | def worker(): 938 | index = curr_item.return_and_increment() 939 | while index < len(all_args): 940 | args = all_args[index] 941 | sys.stdout.write('Now running %s, command %i / %i with %i processes; commands are %s\n' % (task_name, index + 1, len(all_args), n_proc, args)) 942 | sleep(2) 943 | sys.stdout.flush() 944 | sys.stderr.flush() 945 | target(*args) 946 | index = curr_item.return_and_increment() 947 | return 948 | 949 | fork_and_wait(n_proc, worker) 950 | 951 | def callCommand(cmd): 952 | call(cmd, shell = True) 953 | sleep(3) 954 | 955 | 956 | def run_pipeline_commands(cell_lines_to_use, factors_to_use, subsample_targets_to_use, 957 | dataset_name_template, n_processes = 8, steps_to_skip = []): 958 | 959 | """ 960 | Runs the full pipeline using n_processes. 961 | Skips steps in steps_to_skip. 962 | 963 | Each method returns a list of lists: each element in the outside list is a list of bash commands that can be run in parallel. 
964 | """ 965 | 966 | # GM-specific processing 967 | if cell_lines_to_use[0].startswith('GM'): 968 | if 'merge_bam' not in steps_to_skip: 969 | merge_bam_cmds = merge_BAMs(cell_lines_to_use, factors_to_use) 970 | for cmd_set in merge_bam_cmds: 971 | run_in_parallel('Merge BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 972 | if 'filter_bam' not in steps_to_skip: 973 | filter_bam_cmds = filter_and_convert_BAMs(cell_lines_to_use, factors_to_use) 974 | for cmd_set in filter_bam_cmds: 975 | run_in_parallel('Filter BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 976 | if 'subsample_bam' not in steps_to_skip: 977 | subsample_bam_cmds = subsample_BAMs(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 978 | for cmd_set in subsample_bam_cmds: 979 | run_in_parallel('Subsample BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 980 | 981 | # Common processing 982 | if 'get_signal_tracks' not in steps_to_skip: 983 | signal_track_cmds = get_signal_tracks(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 984 | for cmd_set in signal_track_cmds: 985 | run_in_parallel('Get signal track', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 986 | if 'call_peaks' not in steps_to_skip: 987 | call_peak_cmds = call_all_peaks(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 988 | for cmd_set in call_peak_cmds: 989 | run_in_parallel('Call peak', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 990 | 991 | if 'get_average_signal' not in steps_to_skip: 992 | 993 | get_average_signal_peaks_cmds = get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = True) 994 | get_average_signal_cmds = get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = False) 995 | 996 | for cmd_set in get_average_signal_cmds + get_average_signal_peaks_cmds: 997 | 998 | run_in_parallel('Average signal', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 999 | 1000 | generate_all_dataset_cmds = generate_datasets(cell_lines_to_use, dataset_name_template, 1001 | factors_to_use, subsample_targets_to_use) 1002 | for cmd_set in generate_all_dataset_cmds: 1003 | run_in_parallel('Generate dataset', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 1004 | 1005 | 1006 | def run_GM_pipeline(): 1007 | """ 1008 | Runs the full pipeline (starting from subsampling) to get many subsample targets for GM12878 1009 | and GM18526, and one subsample target for the other cell lines. 1010 | """ 1011 | 1012 | try: 1013 | run_pipeline_commands( 1014 | ['GM12878', 'GM18526'], 1015 | GM_FACTORS, 1016 | ['0.5e6', None], 1017 | GM_DATASET_NAME_TEMPLATE, 1018 | steps_to_skip=['merge_bam', 'filter_bam', 'subsample_bam', 'get_signal_tracks', 'call_peaks'], 1019 | n_processes=12) 1020 | 1021 | except: 1022 | print_exc() 1023 | sys.stdout.flush() 1024 | sys.stderr.flush() 1025 | 1026 | 1027 | if __name__ == '__main__': 1028 | """ 1029 | Calls a method using arguments from command line. Eg, 1030 | 1031 | python prepData.py run_in_parallel a b c 1032 | 1033 | calls run_in_parallel(a, b, c) 1034 | """ 1035 | 1036 | args = sys.argv 1037 | fxn_args = args[2:] 1038 | print('Calling %s with arguments' % args[1], args[2:]) 1039 | locals()[args[1]](*args[2:]) 1040 | 1041 | 1042 | --------------------------------------------------------------------------------