├── .gitignore
├── scripts
│   ├── findPeaks.sh
│   ├── convertBEDtoTagAlign.sh
│   ├── convertWigToBigWig.sh
│   ├── convertBEDPEtoTagAlign.sh
│   ├── getSignalTrack.sh
│   ├── averageSignalTrack.sh
│   ├── filterAndConvertBAMs_SE.sh
│   ├── filterAndConvertBAMs.sh
│   └── subsampleBEDPEs.sh
├── copyData.py
├── setup.py
├── runGMExperiments.py
├── dataNormalizer.py
├── README.md
├── diConstants.py
├── modelTemplates.py
├── evaluations.py
├── PRROC.R
├── dataset.py
├── models.py
└── prepData.py
/.gitignore: -------------------------------------------------------------------------------- 1 | bds.pid* 2 | *bed 3 | *txt 4 | .ipynb* 5 | *.pyc 6 | chipseq.bds* 7 | *.csv 8 | /logs 9 | .Rproj.user 10 | -------------------------------------------------------------------------------- /scripts/findPeaks.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bds ${1}/chipseq.bds \ 4 | -out_dir ${2} \ 5 | -histone \ 6 | -tag1 ${3} \ 7 | -ctl_tag1 ${4} \ 8 | -callpeak macs2 \ 9 | -species ${5} \ 10 | -nth 2 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/convertBEDtoTagAlign.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BED file from SE reads 3 | # and outputs tagAlign 4 | 5 | BEDPath=$1 6 | tagAlignPath=$2 7 | 8 | awkProg=' 9 | BEGIN {OFS = "\t"} 10 | { 11 | printf "%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$6 12 | } 13 | ' 14 | 15 | awk -F'\t' "${awkProg}" ${BEDPath}| \ 16 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /scripts/convertWigToBigWig.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a wig file 3 | # Outputs a bigWig file 4 | # Then deletes the wig file 5 | 6 | wigPath=$1 7 | bigWigPath=$2 8 | chromSizesPath=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load ucsc_tools/3.0.9 12 | 13 | wigToBigWig ${wigPath} ${chromSizesPath} ${bigWigPath} 14 | 15 | rm ${wigPath} 16 | -------------------------------------------------------------------------------- /scripts/convertBEDPEtoTagAlign.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BEDPE file 3 | # and outputs tagAlign 4 | 5 | BEDPath=$1 6 | tagAlignPath=$2 7 | 8 | awkProg=' 9 | BEGIN {OFS = "\t"} 10 | { 11 | printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10 12 | } 13 | ' 14 | 15 | awk -F'\t' "${awkProg}" ${BEDPath}| \ 16 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /scripts/getSignalTrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Runs the BDS ChIP-seq pipeline on a given tagAlign file.
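# An example invocation, as a sketch (the paths below are illustrative only;
# the four positional arguments are the AQUAS install directory, the input
# tagAlign, the output directory, and the species):
#
#   bash scripts/getSignalTrack.sh ~/TF_chipseq_pipeline \
#       data/GM12878_H3K27AC.tagAlign.gz signal/GM12878_H3K27AC hg19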
3 | 4 | pipelineDir=$1 5 | tagAlignPath=$2 6 | outputDir=$3 7 | species=$4 8 | 9 | bds ${pipelineDir}/chipseq.bds \ 10 | -out_dir ${outputDir} \ 11 | -histone \ 12 | -input tag \ 13 | -final_stage xcor \ 14 | -tag1 ${tagAlignPath} \ 15 | -tag2bw \ 16 | -species ${species} \ 17 | -nth 2 18 | -------------------------------------------------------------------------------- /scripts/averageSignalTrack.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a bigWig file and BED file 3 | # Gets the average of the bigWig signal over the BED intervals 4 | # Outputs it to outputPath 5 | 6 | bigWigPath=$1 7 | BEDPath=$2 8 | outputPath=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load ucsc_tools/3.0.9 12 | 13 | # bigWigAverageOverBed has no option to output to stdout, so we need a temp file 14 | bigWigAverageOverBed ${bigWigPath} ${BEDPath} ${outputPath}.temp 15 | 16 | cut -f5 ${outputPath}.temp > ${outputPath} 17 | rm ${outputPath}.temp 18 | 19 | 20 | -------------------------------------------------------------------------------- /scripts/filterAndConvertBAMs_SE.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BAM file with single-end reads 3 | # Filters it for properly mapping reads above a certain MAPQ threshold 4 | # Returns BED 5 | 6 | BAMPath=$1 7 | BEDPath=$2 8 | mapQThreshold=$3 9 | 10 | # . /etc/profile.d/modules.sh 11 | # module load samtools/1.2 12 | # module load bedtools/2.23.0 13 | 14 | samtools view -F 1804 -q ${mapQThreshold} -u ${BAMPath} | \ 15 | samtools sort -m 10000M -O bam -n -T ${BAMPath} - | \ 16 | samtools view -F 1804 -u - | \ 17 | bedtools bamtobed -i stdin > ${BEDPath} -------------------------------------------------------------------------------- /scripts/filterAndConvertBAMs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BAM file 3 | # Filters it for properly mapping reads above a certain MAPQ threshold 4 | # Returns BEDPE 5 | 6 | BAMPath=$1 7 | BEDPath=$2 8 | mapQThreshold=$3 9 | 10 | # . 
/etc/profile.d/modules.sh 11 | # module load samtools/1.2 12 | # module load bedtools/2.23.0 13 | 14 | samtools view -F 1804 -f 2 -q ${mapQThreshold} -u ${BAMPath} | \ 15 | samtools sort -m 10000M -O bam -n -T ${BAMPath} - | \ 16 | samtools fixmate -r -O 'bam' - - | \ 17 | samtools view -F 1804 -f 2 -u - | \ 18 | bedtools bamtobed -bedpe -i stdin > ${BEDPath} -------------------------------------------------------------------------------- /scripts/subsampleBEDPEs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Takes in a BEDPE file 3 | # Subsamples it and outputs tagAlign 4 | # numSamplePairs is measured in pairs of reads 5 | 6 | get_seeded_random() 7 | { 8 | seed="$1" 9 | openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ 10 | </dev/zero 2>/dev/null 11 | } 12 | 13 | BEDPath=$1 14 | tagAlignPath=$2 15 | numSamplePairs=$3 16 | 17 | awkProg=' 18 | BEGIN {OFS = "\t"} 19 | { 20 | printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10 21 | } 22 | ' 23 | 24 | shuf -n ${numSamplePairs} --random-source=<(get_seeded_random 42) ${BEDPath} | \ 25 | awk -F'\t' "${awkProg}" | \ 26 | gzip -c > ${tagAlignPath} -------------------------------------------------------------------------------- /copyData.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import call 3 | 4 | import diConstants as di 5 | 6 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/hg19_blacklist.bed', shell=True) 7 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/hg19.chrom.sizes', shell=True) 8 | call('mv hg19_blacklist.bed %s' % di.HG19_BLACKLIST_FILE, shell=True) 9 | call('mv hg19.chrom.sizes %s' % di.HG19_CHROM_SIZES_PATH, shell=True) 10 | 11 | call('wget http://mitra.stanford.edu/kundaje/pangwei/coda_denoising/low_seq_depth_processed_files.tar.gz', shell=True) 12 | call('tar -xvf low_seq_depth_processed_files.tar.gz', shell=True) 13 | call('mv *metadata *npz %s' % di.BASE_ROOT, shell=True) 14 | call('mv *gappedPeaks* %s' % di.PEAK_GAPPED_DIR, shell=True) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import diConstants as di 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from keras.models import Sequential, model_from_json 7 | from scipy.stats.stats import pearsonr 8 | from sklearn.metrics import precision_score 9 | import rpy2.robjects as robjects 10 | from rpy2.robjects.packages import STAP 11 | import h5py 12 | 13 | from subprocess import call 14 | 15 | call('mkdir %s' % di.DATA_ROOT, shell=True) 16 | call('mkdir %s' % di.MODELS_ROOT, shell=True) 17 | call('mkdir %s' % di.RESULTS_ROOT, shell=True) 18 | 19 | call('mkdir %s' % di.RAW_ROOT, shell=True) 20 | call('mkdir %s' % di.MERGED_ROOT, shell=True) 21 | call('mkdir %s' % di.SUBSAMPLED_ROOT, shell=True) 22 | call('mkdir %s' % di.BIGWIGS_ROOT, shell=True) 23 | call('mkdir %s' % di.INTERVALS_ROOT, shell=True) 24 | call('mkdir %s' % di.NUMPY_ROOT, shell=True) 25 | call('mkdir %s' % di.PEAK_BASE_DIR, shell=True) 26 | call('mkdir -p %s' % di.PEAK_GAPPED_DIR, shell=True) 27 | call('mkdir %s' % di.DATASETS_ROOT, shell=True) 28 | call('mkdir %s' % di.BASE_ROOT, shell=True) 29 | call('mkdir %s' % di.BASE_BIGWIG_ROOT, shell=True) 30 | call('mkdir %s' % di.SEQ_ROOT, shell=True) 31 | call('mkdir %s' % di.WEIGHTS_ROOT, shell=True) 32 | call('mkdir
%s' % di.LOSS_ROOT, shell=True) 33 | call('mkdir %s' % di.HIST_ROOT, shell=True) 34 | call('mkdir %s' % di.EVAL_ROOT, shell=True) 35 | -------------------------------------------------------------------------------- /runGMExperiments.py: -------------------------------------------------------------------------------- 1 | # Run on different "full" depths 2 | # Re-run roadmap experiments 3 | # Map all scRNA stuff 4 | 5 | import os 6 | import copy 7 | import tempfile 8 | import json 9 | from subprocess import call 10 | from diConstants import (HG19_ALL_CHROMS, MM9_ALL_CHROMS, 11 | HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, 12 | VALID_CHROMS, TEST_CHROMS) 13 | 14 | import models 15 | import modelTemplates 16 | 17 | def run_model(model_params): 18 | m = models.SeqModel.instantiate_model(model_params) 19 | m.compile_and_train_model() 20 | results = m.evaluate_model() 21 | return results 22 | 23 | GM_MARKS = ['H3K27AC', 'H3K4ME1', 'H3K4ME3', 'H3K27ME3', 'H3K36ME3'] 24 | 25 | 26 | def test_GM18526(): 27 | 28 | for test_cell_line in ['GM18526']: 29 | for subsample_target_string in ['0.5e6']: 30 | for predict_binary_output in [True, False]: 31 | for output_mark in GM_MARKS: 32 | 33 | model_params = modelTemplates.make_model_params( 34 | model_library='keras', 35 | model_class='SeqToPoint', 36 | model_type='cnn', 37 | model_specific_params={ 38 | 'num_filters': 6, 39 | 'filter_length': 51 40 | }, 41 | compile_params={ 42 | 'optimizer': 'adagrad' 43 | }, 44 | dataset_params={ 45 | 'train_dataset_name': 'GM12878_5+1marks-K4me3_all', 46 | 'test_dataset_name': '%s_5+1marks-K4me3_all' % test_cell_line, 47 | 'num_train_examples': 100000, 48 | 'seq_length': 1001, 49 | 'peak_fraction': 0.5, 50 | 'train_X_subsample_target_string': subsample_target_string, 51 | 'num_bins_to_test': None, 52 | 'train_chroms': HG19_ALL_CHROMS, 53 | 'test_chroms': HG19_ALL_CHROMS, 54 | 'only_chr1': True 55 | }, 56 | output_marks=[output_mark], 57 | train_params={ 58 | 'nb_epoch': 30, 59 | 'batch_size': 100 60 | }, 61 | predict_binary_output=predict_binary_output, 62 | zero_out_non_bins=True, 63 | generate_bigWig=True) 64 | 65 | run_model(model_params) 66 | 67 | 68 | if __name__ == '__main__': 69 | 70 | test_GM18526() -------------------------------------------------------------------------------- /dataNormalizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class DataNormalizer(object): 4 | """ 5 | This class learns input scaling parameters and uses those parameters to apply input scaling 6 | to given data. It contains the fit() and transform() methods. 7 | 8 | We support four types of input scaling: 9 | 1) 'ZCA': Zero-mean, features linearly transformed to have unit covariance 10 | 2) 'Z': Zero-mean, each feature independently scaled to unit variance 11 | 3) '01': Zero-mean, each feature independently scaled to lie within [-1, 1] 12 | 4) 'identity': Nothing happens to the input 13 | 14 | If not using 'ZCA', the transform() method does not create a copy. Instead, it modifies the 15 | argument passed to it. If using 'ZCA', this behavior is ambiguous because of the use of 16 | np.reshape, and you should assume that the argument passed to it could randomly be either 17 | transformed or not (and so you should not make further use of the argument). 18 | 19 | All of these transformations are affine transformations.
To represent them, each instance of the 20 | class has two variables, W and b, which roughly correspond to the scale and translation factors 21 | for the different transformations. 22 | 23 | Sample usage: 24 | normalizer = DataNormalizer('01') 25 | normalizer.fit(X_train) 26 | X_train = normalizer.transform(X_train) 27 | X_test = normalizer.transform(X_test) 28 | """ 29 | 30 | def __init__(self, mode): 31 | self.b = None 32 | self.W = None 33 | self.mode = mode 34 | if mode not in ['ZCA', 'Z', '01', 'identity']: 35 | raise ValueError, "mode=%s must be 'ZCA', 'Z', '01', or 'identity'" % mode 36 | 37 | 38 | def fit(self, X_orig): 39 | """ 40 | Learns scaling parameters on the X_orig dataset. Does not modify X_orig. 41 | """ 42 | if len(X_orig.shape) != 2 and len(X_orig.shape) != 3: 43 | raise ValueError, "X must be either a 3-tensor of shape num_examples x seq_length x \ 44 | num_input_marks, or a 2-tensor of shape num_examples x num_input_marks" 45 | if self.mode == 'identity': 46 | return None 47 | 48 | X = np.copy(X_orig) 49 | num_input_marks = X.shape[-1] 50 | 51 | # If X is a 3-tensor, reshape X such that it is a 2-tensor of shape 52 | # (num_examples * seq_length) x num_input_marks. 53 | if len(X.shape) == 3: 54 | X = np.reshape(X, (-1, num_input_marks)) 55 | 56 | self.b = np.mean(X, axis=0) 57 | 58 | X -= self.b 59 | 60 | if self.mode == 'ZCA': 61 | sigma = np.dot(X.T, X) / X.shape[0] 62 | U, S, V = np.linalg.svd(sigma) 63 | self.W = np.dot( 64 | np.dot(U, np.diag(1 / np.sqrt(S + 1e-5))), 65 | U.T) 66 | elif self.mode == 'Z': 67 | self.W = np.empty(num_input_marks) 68 | for idx in range(num_input_marks): 69 | self.W[idx] = np.std(X[:, idx]) 70 | elif self.mode == '01': 71 | self.W = np.empty(num_input_marks) 72 | for idx in range(num_input_marks): 73 | self.W[idx] = np.max(np.abs(X[:, idx])) 74 | 75 | return None 76 | 77 | 78 | def transform(self, X): 79 | if len(X.shape) != 2 and len(X.shape) != 3: 80 | raise ValueError, "X must be either a 3-tensor of shape num_examples x seq_length x \ 81 | num_input_marks, or a 2-tensor of shape num_examples x num_input_marks" 82 | 83 | if self.mode == 'identity': 84 | return X 85 | 86 | assert self.b is not None 87 | assert self.W is not None 88 | 89 | num_input_marks = X.shape[-1] 90 | orig_shape = X.shape 91 | 92 | if self.mode == 'ZCA': 93 | X = np.reshape(X, (-1, num_input_marks)) 94 | if self.W.shape[1] != X.shape[1]: 95 | raise ValueError, "When doing a ZCA transform, X and W must have the same number of columns." 96 | X = np.dot( 97 | X - self.b, 98 | self.W.T) 99 | X = np.reshape(X, orig_shape) 100 | elif self.mode in ['Z', '01']: 101 | if (len(self.b) != num_input_marks) or (len(self.W) != num_input_marks): 102 | print("X.shape: ", X.shape) 103 | print("b.shape: ", self.b.shape) 104 | print("W.shape: ", self.W.shape) 105 | raise ValueError, "The shapes of X, b, and W must all share the same last dimension." 106 | for idx in range(num_input_marks): 107 | X[..., idx] = (X[..., idx] - self.b[idx]) / self.W[idx] 108 | 109 | return X 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Coda: a convolutional denoising algorithm for genome-wide ChIP-seq data 2 | 3 | Coda uses convolutional neural networks to learn a mapping from noisy to high-quality ChIP-seq data. 4 | These trained networks can then be used to remove noise and improve the quality of new ChIP-seq data. 
5 | For more details, please refer to our paper 6 | 7 | Koh PW, Pierson E, Kundaje A, Denoising genome-wide histone ChIP-seq with convolutional neural networks. Bioinformatics (2017) 33 (14): i225-i233 URL:https://doi.org/10.1093/bioinformatics/btx243 (ISMB 2017 Proceedings) 8 | 9 | bioRxiv doi: https://doi.org/10.1101/052118 10 | 11 | 12 | ## Dependencies 13 | The code is written in Python 2.7 and requires the following Python packages to run: 14 | - Numpy (1.11.1) 15 | - Scipy (0.18.0) 16 | - Scikit-learn (0.17.1) 17 | - Pandas (0.18.1) 18 | - h5py (2.6.0) 19 | - rpy2 (2.8.1) 20 | - Keras (1.0.7) 21 | 22 | In addition, if you want to process your own data, you will need: 23 | - AQUAS ChIP-seq pipeline 24 | - SAMtools (1.2) 25 | - BEDtools (2.23) 26 | - ucsc_tools (3.0.9) 27 | 28 | ## Training and testing a model with pre-processed data 29 | The fastest way to get started is to download data that has already been pre-processed. 30 | We have uploaded processed ChIP-seq data from lymphoblastoid cell lines GM12878 and GM18526, 31 | taken from [1]. Each cell line has two sets of ChIP-seq data, one derived from 1M reads per mark and 32 | the other from 100M+ reads per mark. The instructions below will train a model to recover high-depth 33 | data from low-depth data on GM12878, and then apply it to low-depth data on GM18526, evaluating the 34 | model output against high-depth data on GM18526: 35 | 36 | 1) Clone the repo and install the dependencies above. 37 | 38 | 2) Edit `diConstants.py` to reflect the paths where you want to store the data, code, results, etc. 39 | 40 | 3) Run `setup.py`. This runs a few test imports to make sure you have the required libraries, and sets 41 | up the directory structure as specified in `diConstants.py`. 42 | 43 | 4) Run `copyData.py`. This copies the required data (including hg19 blacklist and chromosome sizes) to 44 | the appropriate folders. Note that the data is 6GB in size, so please run this script in a location 45 | where there's enough space! 46 | 47 | 5) Finally, run `python runGMExperiments.py` to get the experiments going. Numerical results will be 48 | written to `RESULTS_ROOT`. Output tracks (reconstructed signal and peak calls) will be written to `RESULTS_BIGWIG_ROOT`. 49 | We make use of the R 'PRROC' package, written by Jan Grau and Jens Keilwagen, to evaluate peak calls. 50 | 51 | ## Processing your own data 52 | We use the AQUAS ChIP-seq pipeline (https://github.com/kundajelab/TF_chipseq_pipeline) 53 | to process raw ChIP-seq data. The script `prepData.py` (and the contents of the `scripts` folder) 54 | contains wrapper functions that call the AQUAS pipeline for you. 55 | 56 | Please install the AQUAS pipeline before proceeding. Note that this pipeline is still under 57 | some development and might be changing in non-backwards-compatible ways. Our code has been tested with 58 | commit 7b7dd27d42d46ac52f5687f80904c576d1b6595d of the AQUAS pipeline. 59 | 60 | To create the processed data that we provided above, you may run the following steps: 61 | 62 | 1) Follow steps 1-3 of the above section. 63 | 64 | 2) Download the files corresponding to GM12878 and GM18526: 65 | http://gbsc-share.stanford.edu/chromovar/rawdata/mapped/bam/personal/reconcile/dedup/ 66 | 67 | 3) Run `python prepData.py make_intervals hg19`. You only need to do this once. 68 | 69 | 4) Run `python prepData.py run_GM_pipeline`. 70 | 71 | This code assumes that you've downloaded the files to a shared location 72 | (`REMOTE_ROOT`, specified in diConstants.py). 
It makes copies of the files in a 73 | local directory, `RAW_ROOT`, before proceeding. This setup is useful if `REMOTE_ROOT` 74 | is shared across multiple machines and `RAW_ROOT` is local to the machine that you're 75 | running the code on, because there will be a lot of IO operations that will be faster 76 | if done locally. If you do not need this, modify `merge_BAMs()` in `prepData.py` 77 | to remove the copying. 78 | 79 | To process your own data, simply modify the paths in `diConstants.py` or copy your 80 | data to the right directories. While we start from BAM files in this example, the AQUAS 81 | pipeline can start from a variety of input files (e.g., FASTQ, tagAligns). Edit 82 | `scripts/getSignalTrack.sh` and `scripts/findPeaks.sh` if you want to change the parameters that 83 | are passed into AQUAS. 84 | 85 | ## Contact 86 | If you have any questions, please contact: 87 | - Pang Wei Koh 88 | - Emma Pierson 89 | - Anshul Kundaje 90 | 91 | ## References 92 | [1] Kasowski M, Kyriazopoulou-Panagiotopoulou S, Grubert F, Zaugg JB, Kundaje A, Liu Y, et al. Extensive variation in chromatin states across humans. Science. 2013;342(6159):750–752. -------------------------------------------------------------------------------- /diConstants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ### Variables to set 4 | 5 | # The remote directory that the GM12878 and GM18526 data have been downloaded to 6 | REMOTE_ROOT = "/mnt/data/chromatinVariation1/rawdata/mapped/bam/personal/reconcile/dedup" 7 | 8 | # Where the AQUAS pipeline is installed 9 | PIPELINE_ROOT = "/users/pangwei/TF_chipseq_pipeline/" 10 | 11 | # Where the code is 12 | CODE_ROOT = "/users/pangwei/deepimpute_pub" 13 | 14 | # Where the bulk of the storage will be (intermediate/processed files, etc.)
15 | DISK_ROOT = "/srv/scratch/pangwei/deepimpute_pub/" 16 | 17 | # Where output bigwigs will be written to 18 | RESULTS_BIGWIG_ROOT = "/srv/www/kundaje/deepimpute/model-bw" 19 | 20 | HG19_BLACKLIST_FILE = '/srv/www/kundaje/pangwei/coda_denoising/hg19_blacklist.bed' 21 | MM9_BLACKLIST_FILE = '/srv/www/kundaje/pangwei/coda_denoising/mm9_blacklist.bed' 22 | HG19_CHROM_SIZES_PATH = '/srv/www/kundaje/pangwei/coda_denoising/hg19.chrom.sizes' 23 | MM9_CHROM_SIZES_PATH = '/srv/www/kundaje/pangwei/coda_denoising/mm9.male.chrom.sizes' 24 | 25 | MAPQ_THRESHOLD = 30 26 | 27 | ###### 28 | 29 | 30 | DATA_ROOT = os.path.join(DISK_ROOT, 'data') 31 | MODELS_ROOT = os.path.join(DISK_ROOT, 'models') 32 | RESULTS_ROOT = os.path.join(DISK_ROOT, 'results') 33 | 34 | RAW_ROOT = os.path.join(DATA_ROOT, 'raw') 35 | MERGED_ROOT = os.path.join(DATA_ROOT, 'merged') 36 | SUBSAMPLED_ROOT = os.path.join(DATA_ROOT, 'subsampled') 37 | BIGWIGS_ROOT = os.path.join(DATA_ROOT, 'bigWigs') 38 | INTERVALS_ROOT = os.path.join(DATA_ROOT, 'intervals') 39 | NUMPY_ROOT = os.path.join(DATA_ROOT, 'numpy') 40 | PEAK_BASE_DIR = os.path.join(DATA_ROOT, 'peaks') 41 | PEAK_GAPPED_DIR = os.path.join(PEAK_BASE_DIR, 'peak', 'macs2', 'rep1') 42 | DATASETS_ROOT = os.path.join(DATA_ROOT, 'datasets') 43 | BASE_ROOT = os.path.join(DATASETS_ROOT, 'base') 44 | BASE_BIGWIG_ROOT = os.path.join(BASE_ROOT, 'bigWigs') 45 | SEQ_ROOT = os.path.join(DATASETS_ROOT, 'processed-seq') 46 | 47 | WEIGHTS_ROOT = os.path.join(MODELS_ROOT, 'weights') 48 | 49 | LOSS_ROOT = os.path.join(RESULTS_ROOT, 'loss') 50 | HIST_ROOT = os.path.join(RESULTS_ROOT, 'hist') 51 | EVAL_ROOT = os.path.join(RESULTS_ROOT, 'eval') 52 | 53 | 54 | HG19_CHROM_SIZES = { 55 | 'chr1': 249250621, 56 | 'chr2': 243199373, 57 | 'chr3': 198022430, 58 | 'chr4': 191154276, 59 | 'chr5': 180915260, 60 | 'chr6': 171115067, 61 | 'chr7': 159138663, 62 | 'chr8': 146364022, 63 | 'chr9': 141213431, 64 | 'chr10': 135534747, 65 | 'chr11': 135006516, 66 | 'chr12': 133851895, 67 | 'chr13': 115169878, 68 | 'chr14': 107349540, 69 | 'chr15': 102531392, 70 | 'chr16': 90354753, 71 | 'chr17': 81195210, 72 | 'chr18': 78077248, 73 | 'chr19': 59128983, 74 | 'chr20': 63025520, 75 | 'chr21': 48129895, 76 | 'chr22': 51304566, 77 | } 78 | 79 | MM9_CHROM_SIZES = { 80 | 'chr1': 197195432, 81 | 'chr2': 181748087, 82 | 'chr3': 159599783, 83 | 'chr4': 155630120, 84 | 'chr5': 152537259, 85 | 'chr6': 149517037, 86 | 'chr7': 152524553, 87 | 'chr8': 131738871, 88 | 'chr9': 124076172, 89 | 'chr10': 129993255, 90 | 'chr11': 121843856, 91 | 'chr12': 121257530, 92 | 'chr13': 120284312, 93 | 'chr14': 125194864, 94 | 'chr15': 103494974, 95 | 'chr16': 98319150, 96 | 'chr17': 95272651, 97 | 'chr18': 90772031, 98 | 'chr19': 61342430 99 | } 100 | BIN_SIZE = 25 101 | GENOME_BATCH_SIZE = 50000 102 | NUM_BASES = 4 103 | 104 | GM_CELL_LINES = ['GM12878', 'GM19239', 'GM10847', 'GM18505', 'GM18526', 'GM18951', 'GM2610'] 105 | GM_FACTORS = ['H3K27AC','H3K27ME3', 'H3K36ME3','H3K4ME1', 'H3K4ME3', 'INPUT'] 106 | SUBSAMPLE_TARGETS = ['0.1e6','0.25e6', '0.5e6','1e6', '2.5e6', '5e6','7.5e6', '10e6','30e6','20e6', None] 107 | 108 | GM_DATASET_NAME_TEMPLATE = '%s_5+1marks-K4me3_all' 109 | ROADMAP_DATASET_NAME_TEMPLATE = '%s_6+1marks_all' 110 | ULI_DATASET_NAME_TEMPLATE = '%s_3marks_all' 111 | MOW_DATASET_NAME_TEMPLATE = '%s_2marks_all' 112 | 113 | 114 | HG19_ALL_CHROMS = [ 115 | 'chr1', 116 | 'chr2', 117 | 'chr3', 118 | 'chr4', 119 | 'chr5', 120 | 'chr6', 121 | 'chr7', 122 | 'chr8', 123 | 'chr9', 124 | 'chr10', 125 | 'chr11', 126 | 'chr12', 127 | 'chr13', 128 
| 'chr14', 129 | 'chr15', 130 | 'chr16', 131 | 'chr17', 132 | 'chr18', 133 | 'chr19', 134 | 'chr20', 135 | 'chr21', 136 | 'chr22', 137 | ] 138 | 139 | MM9_ALL_CHROMS = [ 140 | 'chr1', 141 | 'chr2', 142 | 'chr3', 143 | 'chr4', 144 | 'chr5', 145 | 'chr6', 146 | 'chr7', 147 | 'chr8', 148 | 'chr9', 149 | 'chr10', 150 | 'chr11', 151 | 'chr12', 152 | 'chr13', 153 | 'chr14', 154 | 'chr15', 155 | 'chr16', 156 | 'chr17', 157 | 'chr18', 158 | 'chr19' 159 | ] 160 | 161 | 162 | TEST_CHROMS = [ 163 | 'chr1', 164 | 'chr2', 165 | ] 166 | 167 | VALID_CHROMS = [ 168 | 'chr3', 169 | 'chr4' 170 | ] 171 | 172 | HG19_TRAIN_CHROMS = [ 173 | 'chr5', 174 | 'chr6', 175 | 'chr7', 176 | 'chr8', 177 | 'chr9', 178 | 'chr10', 179 | 'chr11', 180 | 'chr12', 181 | 'chr13', 182 | 'chr14', 183 | 'chr15', 184 | 'chr16', 185 | 'chr17', 186 | 'chr18', 187 | 'chr19', 188 | 'chr20', 189 | 'chr21', 190 | 'chr22', 191 | ] 192 | 193 | MM9_TRAIN_CHROMS = [ 194 | 'chr5', 195 | 'chr6', 196 | 'chr7', 197 | 'chr8', 198 | 'chr9', 199 | 'chr10', 200 | 'chr11', 201 | 'chr12', 202 | 'chr13', 203 | 'chr14', 204 | 'chr15', 205 | 'chr16', 206 | 'chr17', 207 | 'chr18', 208 | 'chr19' 209 | ] -------------------------------------------------------------------------------- /modelTemplates.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from dataset import Dataset, get_species_from_dataset_name 3 | from diConstants import (HG19_ALL_CHROMS, MM9_ALL_CHROMS, 4 | HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, 5 | VALID_CHROMS, TEST_CHROMS) 6 | 7 | def make_dataset_params(num_train_examples, 8 | seq_length, 9 | train_dataset_name='GM12878_5+1marks-K4me3_all', 10 | test_dataset_name='GM19239_5+1marks-K4me3_all', 11 | train_X_subsample_target_string='5e6', 12 | train_Y_subsample_target_string=None, 13 | test_X_subsample_target_string=None, 14 | test_Y_subsample_target_string=None, 15 | random_seed=0, 16 | num_test_examples=None, 17 | normalization='arcsinh', 18 | peak_fraction=0.5, 19 | only_chr1=True, 20 | num_bins_to_test=1000000, 21 | train_chroms=None, 22 | test_chroms=None): 23 | """ 24 | only_chr1 controls whether genome-wide prediction is done on the whole genome, or just 25 | on chr1 for speed. 26 | 27 | num_bins_to_test controls how many bins of each chromosome should be tested. If num_bins_to_test 28 | == 1000000, for example, then only the first 1M bins of each chromosome (or of chr1, if only_chr1 is 29 | True) will be tested. Set num_bins_to_test to None to test the whole chromosome. 
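Sample usage (a minimal sketch; num_train_examples and seq_length are the
only required arguments, and the values below mirror those used in
runGMExperiments.py):

    dataset_params = make_dataset_params(
        num_train_examples=100000,
        seq_length=1001,
        train_X_subsample_target_string='1e6')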
30 | """ 31 | 32 | if num_test_examples is None: 33 | num_test_examples = num_train_examples 34 | 35 | if test_X_subsample_target_string is None: 36 | test_X_subsample_target_string = train_X_subsample_target_string 37 | 38 | if test_Y_subsample_target_string is None: 39 | test_Y_subsample_target_string = train_Y_subsample_target_string 40 | 41 | if train_chroms is None: 42 | if get_species_from_dataset_name(train_dataset_name) == 'mm9': 43 | train_chroms = MM9_ALL_CHROMS 44 | else: 45 | train_chroms = HG19_ALL_CHROMS 46 | 47 | if test_chroms is None: 48 | if get_species_from_dataset_name(test_dataset_name) == 'mm9': 49 | test_chroms = MM9_ALL_CHROMS 50 | else: 51 | test_chroms = HG19_ALL_CHROMS 52 | 53 | return { 54 | 'train_dataset': Dataset( 55 | dataset_name=train_dataset_name, 56 | num_examples=num_train_examples, 57 | X_subsample_target_string=train_X_subsample_target_string, 58 | Y_subsample_target_string=train_Y_subsample_target_string, 59 | random_seed=random_seed, 60 | normalization=normalization, 61 | peak_fraction=peak_fraction, 62 | chroms=train_chroms), 63 | 'test_datasets': [Dataset( 64 | dataset_name=test_dataset_name, 65 | num_examples=num_test_examples, 66 | X_subsample_target_string=test_X_subsample_target_string, 67 | Y_subsample_target_string=test_Y_subsample_target_string, 68 | random_seed=random_seed, 69 | normalization=normalization, 70 | peak_fraction=peak_fraction, 71 | chroms=test_chroms)], 72 | 'seq_length': seq_length, 73 | 'num_bins_to_test': num_bins_to_test, 74 | 'only_chr1': only_chr1, 75 | } 76 | 77 | 78 | def make_model_params(model_library, 79 | model_class, 80 | model_type, 81 | dataset_params, 82 | scale_input='01', 83 | model_specific_params=None, 84 | compile_params=None, 85 | train_params=None, 86 | input_marks=None, 87 | output_marks=None, 88 | random_seed=0, 89 | generate_bigWig=False, 90 | predict_binary_output=False, 91 | zero_out_non_bins=False): 92 | """ 93 | input_marks is a list of histone marks that the model will take in as input. 94 | 95 | output_marks is a list of all the marks that we want the model to learn to output. 96 | If we're training a single multi-task model, this is either a list of length 5 or 6, 97 | depending on whether we're doing classification or regression (if we're doing classification, 98 | we don't predict INPUT). 99 | If we're training a separate model for each mark, then output_marks is just a list of length 1. 100 | 101 | scale_input is one of 'ZCA', 'Z', '01', or 'identity'. 102 | 103 | zero_out_non_bins is only used when predict_binary_output is True. It specifies whether 104 | we should zero out the -log10 p values of bins that are not in the corresponding gappedPeak file. 105 | This is used for baseline evaluations. 
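Sample usage (abridged from the call in runGMExperiments.py; the
dataset_params dict must supply at least num_train_examples and seq_length,
which are forwarded to make_dataset_params above):

    model_params = make_model_params(
        model_library='keras',
        model_class='SeqToPoint',
        model_type='cnn',
        dataset_params={'num_train_examples': 100000, 'seq_length': 1001},
        output_marks=['H3K27AC'],
        predict_binary_output=False,
        generate_bigWig=True)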
106 | """ 107 | 108 | params = { 109 | 'model_library': model_library, 110 | 'model_class': model_class, 111 | 'model_type': model_type, 112 | 'scale_input': scale_input, 113 | 'random_seed': random_seed, 114 | 'generate_bigWig': generate_bigWig, 115 | 'predict_binary_output': predict_binary_output, 116 | 'zero_out_non_bins': zero_out_non_bins 117 | } 118 | 119 | params['dataset_params'] = make_dataset_params(**dataset_params) 120 | 121 | # Defaults for compile_params 122 | if compile_params is None: 123 | compile_params = {} 124 | if model_library == 'keras': 125 | if predict_binary_output: 126 | compile_params_defaults = { 127 | 'loss': 'binary_crossentropy', 128 | 'optimizer': 'adagrad' 129 | } 130 | else: 131 | compile_params_defaults = { 132 | 'loss': 'MSE', 133 | 'optimizer': 'adagrad' 134 | } 135 | for key in compile_params_defaults: 136 | if key not in compile_params: 137 | compile_params[key] = compile_params_defaults[key] 138 | params['compile_params'] = compile_params 139 | 140 | # Defaults for train_params 141 | if train_params is None: 142 | train_params = {} 143 | if model_library == 'keras': 144 | train_params_defaults = { 145 | 'nb_epoch': 50, 146 | 'batch_size': 2000, 147 | 'validation_split': 0.2 148 | } 149 | for key in train_params_defaults: 150 | if key not in train_params: 151 | train_params[key] = train_params_defaults[key] 152 | params['train_params'] = train_params 153 | 154 | # If input_marks is not set, then set it to all the marks in the training dataset 155 | if input_marks is None: 156 | input_marks = params['dataset_params']['train_dataset'].marks_in_dataset 157 | 158 | # Default for output_marks is to output all of the input_marks 159 | # Unless we're doing classification, in which case we don't output INPUT 160 | if output_marks is None: 161 | output_marks = copy.copy(input_marks) 162 | if predict_binary_output and 'INPUT' in output_marks: 163 | output_marks.remove('INPUT') 164 | 165 | # Make sure that input_marks and output_marks are both contained within 166 | # marks_in_train_dataset and marks_in_test_dataset 167 | for mark in input_marks + output_marks: 168 | assert mark in params['dataset_params']['train_dataset'].marks_in_dataset 169 | for test_dataset in params['dataset_params']['test_datasets']: 170 | assert mark in test_dataset.marks_in_dataset 171 | 172 | params['input_marks'] = input_marks 173 | params['output_marks'] = output_marks 174 | 175 | if model_specific_params is None: 176 | model_specific_params = {} 177 | for key in model_specific_params: 178 | if key in params: 179 | raise ValueError, 'model_specific_params cannot overwrite existing model params' 180 | params[key] = model_specific_params[key] 181 | 182 | return params 183 | 184 | -------------------------------------------------------------------------------- /evaluations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats.stats import pearsonr 3 | from sklearn.metrics import precision_recall_curve 4 | import rpy2.robjects as robjects 5 | from rpy2.robjects.packages import STAP 6 | import datetime 7 | from random import sample 8 | 9 | def get_MSE(pred_Y, test_Y): 10 | """ 11 | Returns mean squared error calculated across all dimensions. 12 | """ 13 | assert pred_Y.shape == test_Y.shape 14 | return np.mean((pred_Y - test_Y) ** 2) 15 | 16 | def get_pearsonR(pred_Y, test_Y): 17 | """ 18 | Returns Pearson correlation for a single mark. 19 | 20 | Only takes in vectors. 
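Sample usage (toy vectors, purely for illustration):

    r = get_pearsonR(np.array([0.1, 0.5, 0.9]), np.array([0.2, 0.4, 1.0]))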
21 | """ 22 | 23 | assert pred_Y.shape == test_Y.shape 24 | assert len(pred_Y.shape) == 1 25 | 26 | return pearsonr(pred_Y, test_Y)[0] 27 | 28 | def is_binary(M): 29 | unique_elements = list(set(M.flatten())) 30 | return all([elem in [0, 1] for elem in unique_elements]) 31 | 32 | def downsample_curve(vals): 33 | """ 34 | Downsamples vals by a factor of 10 if len(vals) > 1000 (used to keep precision / recall curves from getting too long) 35 | """ 36 | n = len(vals) 37 | 38 | if n > 1000: 39 | new_vals = [] 40 | for i in range(int(n / 10)): 41 | new_vals.append(vals[i * 10]) 42 | return new_vals 43 | else: 44 | return list(vals) 45 | 46 | def compute_recalls_at_precision(precisions, recalls): 47 | """ 48 | Computes recalls at 10%, 20%, ... 90% precision. 49 | Does not interpolate. 50 | """ 51 | precision_increment = .1 52 | desired_precision = precision_increment 53 | desired_precisions = [] 54 | recalls_at_precision = [] 55 | for i in range(len(precisions)): 56 | while precisions[i] > desired_precision: 57 | desired_precisions.append(desired_precision) 58 | recalls_at_precision.append(recalls[i]) 59 | desired_precision += precision_increment 60 | 61 | return desired_precisions, recalls_at_precision 62 | 63 | 64 | def compare(pred_Y, test_Y, predict_binary_output, peaks=None, 65 | save_curves=True, save_data=False): 66 | """ 67 | Evaluates performance for predictions pred_Y relative to true labels test_Y. 68 | If predict_binary_output, pred_Y should be a set of scores and test_Y should be 0, 1 labels. 69 | Otherwise, both pred_Y and test_Y should be continuous values. 70 | Returns squared error and Pearson correlation between the predicted output and the actual output. 71 | 72 | Both pred_Y and test_Y must be matrices of shape num_examples x num_histone_marks, 73 | or they must both be matrices of shape num_examples x seq_length x num_histone_marks. 74 | If the latter, examples are concatenated together before correlations are computed. 75 | 76 | peaks is a list. Each element of this list corresponds to one mark and is a N x 2 matrix 77 | where each row contains the (start, end) coordinates of a peak in that mark. 78 | If passing in peaks, make sure the coordinate system matches that of pred_Y and test_Y! 79 | For example, if your peaks start at the start of the chromosome, then pred_Y and test_Y have 80 | to start at the start of the chromosome as well. 81 | 82 | If save_curves is True, it saves the full precision-recall curve. save_curves cannot be True if 83 | predict_binary_output is False. Right now it saves recalls @10, 20...90% precision. 84 | 85 | If save_data is True, it saves the first mark of pred_Y and test_Y. 86 | 87 | Returns results, a dictionary containing: 88 | 'AUC' (if predict_binary_output) 89 | 'AUPRC' (if predict_binary_output) 90 | 'precision_curves' (if save_curves) 91 | 'recall_curves' (if save_curves) 92 | 'Y_pos_frac' (if predict_binary_output) 93 | 'MSE' (if not predict_binary_output) 94 | 'true_var' (if not predict_binary_output) 95 | 'pearsonR' (if not predict_binary_output) 96 | 'pred_Y' (if save_data) 97 | 'test_Y' (if save_data) 98 | 99 | AUC, AUPRC, Y_pos_frac, MSE, true_var, and pearsonR are each vectors of length num_histone_marks. 100 | true_var is the variance of the true data; it is useful for interpreting whether a given 101 | MSE is good or bad.
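Sample usage (a regression-mode sketch on random toy data; note that
compare() reads PRROC.R from the working directory, so run it from the
repo root):

    pred_Y = np.random.rand(1000, 1)
    test_Y = np.random.rand(1000, 1)
    results = compare(pred_Y, test_Y, predict_binary_output=False)
    print(results['MSE'], results['pearsonR'])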
102 | """ 103 | 104 | # save_curves has to be False if predict_binary_output is also False 105 | if not predict_binary_output: save_curves = False 106 | 107 | pred_Y_is_binary = is_binary(pred_Y) 108 | test_Y_is_binary = is_binary(test_Y) 109 | assert pred_Y.shape == test_Y.shape, \ 110 | "pred_Y.shape = %s doesn't match test_Y.shape = %s" % (str(pred_Y.shape), str(test_Y.shape)) 111 | assert test_Y_is_binary == predict_binary_output 112 | 113 | #test_Y (the true labels) ought to be binary IFF we're predicting binary output. 114 | #pred_Y should be a set of continuous scores, regardless of whether we're predicting binary output. 115 | assert len(pred_Y.shape) == 2 or len(pred_Y.shape) == 3 116 | 117 | # If peaks is not None, then there should be one element in peaks for each mark in pred_Y. 118 | if peaks: 119 | assert len(peaks) == pred_Y.shape[-1] 120 | 121 | # If the input matrices are 3D, then squash the first two dimensions together 122 | if len(pred_Y.shape) == 3: 123 | pred_Y = np.reshape(pred_Y, [pred_Y.shape[0] * pred_Y.shape[1], pred_Y.shape[2]]) 124 | test_Y = np.reshape(test_Y, [test_Y.shape[0] * test_Y.shape[1], test_Y.shape[2]]) 125 | 126 | num_histone_marks = pred_Y.shape[len(pred_Y.shape) - 1] 127 | 128 | true_var = [] 129 | MSE = [] 130 | pearsonR = [] 131 | 132 | precision_curves = [] 133 | recall_curves = [] 134 | threshold_curves = [] 135 | auc = [] 136 | auprc = [] 137 | Y_pos_frac = [] 138 | 139 | with open('PRROC.R', 'r') as f:#load in the R code. 140 | r_fxn_string = f.read() 141 | r_auc_func = STAP(r_fxn_string, "auc_func") 142 | 143 | for mark_idx in range(num_histone_marks): 144 | ### Sub-select only peak regions 145 | if peaks: 146 | # If peaks exists but peaks[mark_idx] is set to None, we should skip this mark. 147 | # This mark should correspond to INPUT, which has no peaks of its own. 
148 | if peaks[mark_idx] is None: 149 | if predict_binary_output: 150 | precision_curves.append(None) 151 | recall_curves.append(None) 152 | threshold_curves.append(None) 153 | auprc.append(None) 154 | auc.append(None) 155 | else: 156 | true_var.append(None) 157 | MSE.append(None) 158 | pearsonR.append(None) 159 | continue 160 | 161 | # Initialize peak_idxs to all False 162 | num_bins = pred_Y.shape[0] 163 | peak_idxs = np.zeros( 164 | num_bins, 165 | dtype=bool) 166 | 167 | # Set peak_idxs such that it is True in each peak 168 | # (bins outside every peak are then excluded from the evaluation) 169 | for peak_counter, peak in enumerate(peaks[mark_idx]): 170 | # We have to check for this, because pred_Y and test_Y might only represent 171 | # a fraction of any given chromosome 172 | if peak[1] > num_bins: 173 | continue 174 | 175 | peak_idxs[peak[0]:peak[1]] = True 176 | 177 | pred_Y_mark = pred_Y[peak_idxs, mark_idx] 178 | test_Y_mark = test_Y[peak_idxs, mark_idx] 179 | else: 180 | pred_Y_mark = pred_Y[:, mark_idx] 181 | test_Y_mark = test_Y[:, mark_idx] 182 | 183 | ### Run evaluations on (selected) regions 184 | if predict_binary_output: 185 | precisions, recalls, thresholds = precision_recall_curve(test_Y_mark, pred_Y_mark) 186 | precisions, recalls = compute_recalls_at_precision(precisions, recalls) 187 | 188 | precision_curves.append(list(precisions)) 189 | recall_curves.append(list(recalls)) 190 | 191 | if len(test_Y_mark) < 100000: 192 | downsample_idxs = range(len(test_Y_mark)) 193 | else: 194 | downsample_idxs = sample(range(len(test_Y_mark)), 100000) 195 | 196 | r_auprc_results = r_auc_func.pr_curve(scores_class0 = robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]), weights_class0 = robjects.vectors.FloatVector(test_Y_mark[downsample_idxs])) 197 | 198 | auprc.append(float(r_auprc_results.rx('auc.davis.goadrich')[0][0])) 199 | r_auc_results = r_auc_func.roc_curve(scores_class0 = robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]), weights_class0 = robjects.vectors.FloatVector(test_Y_mark[downsample_idxs])) 200 | auc.append(float(r_auc_results.rx('auc')[0][0])) 201 | Y_pos_frac.append(test_Y_mark.mean()) 202 | print("AUC %2.3f; AUPRC %2.3f" % (auc[mark_idx], auprc[mark_idx])) 203 | else: 204 | true_var.append(np.var(test_Y_mark)) 205 | MSE.append(get_MSE(pred_Y_mark, test_Y_mark)) 206 | pearsonR.append(get_pearsonR(pred_Y_mark, test_Y_mark)) 207 | 208 | print("MSE %2.3f (true var %2.3f), pearsonR %2.3f" % 209 | (MSE[mark_idx], true_var[mark_idx], pearsonR[mark_idx])) 210 | 211 | if predict_binary_output: 212 | assert((len(precisions) > 0) and (len(recalls) > 0)) 213 | results = { 214 | 'AUC':auc, 215 | 'AUPRC':auprc, 216 | 'Y_pos_frac':Y_pos_frac 217 | } 218 | results['precision_curves'] = precision_curves 219 | results['recall_curves'] = recall_curves 220 | 221 | else: 222 | results = { 223 | 'MSE': MSE, 224 | 'true_var': true_var, 225 | 'pearsonR': pearsonR 226 | } 227 | 228 | if save_data: 229 | results['pred_Y'] = list(pred_Y[..., 0]) 230 | results['test_Y'] = list(test_Y[..., 0]) 231 | 232 | return results 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /PRROC.R: -------------------------------------------------------------------------------- 1 | # Taken from the PRROC R package, https://cran.r-project.org/web/packages/PRROC/PRROC.pdf. 2 | # Written by Jan Grau and Jens Keilwagen.
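# A minimal usage sketch (the scores and labels below are made up; in this
# repo these functions are called from evaluations.py through rpy2's STAP):
#
#   pr  <- pr_curve(c(0.9, 0.2, 0.7), weights_class0 = c(1, 0, 1))
#   roc <- roc_curve(c(0.9, 0.2, 0.7), weights.class0 = c(1, 0, 1))
#   pr$auc.davis.goadrich; roc$auc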
3 | 4 | pr_curve<-function( scores_class0, scores.class1=scores.class0, weights_class0=NULL, 5 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, sorted = FALSE, curve = FALSE, 6 | minStepSize=min(1,ifelse(is.null(weights.class0),1,sum(weights.class0)/100)), 7 | max.compute=F, min.compute=F, rand.compute=F){ 8 | scores.class0 = scores_class0 9 | weights.class0 = weights_class0 10 | if(!sorted){ 11 | o0<-order(scores.class0); 12 | scores.class0<-scores.class0[o0]; 13 | if(!is.null(weights.class0)){ 14 | weights.class0<-weights.class0[o0]; 15 | } 16 | o1<-order(scores.class1); 17 | scores.class1<-scores.class1[o1]; 18 | if(!is.null(weights.class1)){ 19 | weights.class1<-weights.class1[o1]; 20 | } 21 | } 22 | compute.pr(scores.class0,scores.class1,weights.class0,weights.class1,curve,minStepSize,max.compute,min.compute,rand.compute); 23 | } 24 | 25 | 26 | roc_curve<-function( scores.class0, scores.class1=scores.class0, weights.class0=NULL, 27 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, sorted = FALSE, curve = FALSE, 28 | max.compute=F, min.compute=F, rand.compute=F){ 29 | if(!sorted){ 30 | o0<-order(scores.class0); 31 | scores.class0<-scores.class0[o0]; 32 | if(!is.null(weights.class0)){ 33 | weights.class0<-weights.class0[o0]; 34 | } 35 | o1<-order(scores.class1); 36 | scores.class1<-scores.class1[o1]; 37 | if(!is.null(weights.class1)){ 38 | weights.class1<-weights.class1[o1]; 39 | } 40 | } 41 | compute.roc(scores.class0,scores.class1,weights.class0,weights.class1,curve,max.compute,min.compute,rand.compute); 42 | } 43 | 44 | 45 | check <- function( n, weights ) { 46 | if( !is.null( weights ) ) { 47 | if( n != length(weights) ) { 48 | stop( "The weights must have the same length as the scores." ); 49 | } 50 | if( sum( weights < 0 ) != 0 ) { 51 | stop( "The weights must be non-negative." 
); 52 | } 53 | } 54 | 55 | 56 | compute.pr <- function( sorted.scores.class0, sorted.scores.class1=sorted.scores.class0, weights.class0 = NULL, 57 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, curve = FALSE, 58 | minStepSize=min(1,ifelse(is.null(weights.class0),1,sum(weights.class0)/100)), 59 | max.compute=F, min.compute=F, rand.compute=F ){ 60 | 61 | check( length(sorted.scores.class0), weights.class0 ); 62 | check( length(sorted.scores.class1), weights.class1 ); 63 | 64 | if( !is.null(sorted.scores.class1) & ( length(sorted.scores.class0) != length(sorted.scores.class1) | 65 | suppressWarnings( sum(sorted.scores.class0 != sorted.scores.class1) > 0 ) 66 | ) & is.null(weights.class0) & is.null(weights.class1) ){ 67 | weights.class0<-c(rep(1,length(sorted.scores.class0)),rep(0,length(sorted.scores.class1))); 68 | sorted.scores.class0<-c(sorted.scores.class0,sorted.scores.class1); 69 | o0<-order(sorted.scores.class0); 70 | sorted.scores.class0<-sorted.scores.class0[o0]; 71 | weights.class0<-weights.class0[o0]; 72 | weights.class1<-1-weights.class0; 73 | sorted.scores.class1<-sorted.scores.class0; 74 | } 75 | 76 | davis.and.goadrich <- ( length(sorted.scores.class0) == length(sorted.scores.class1) & 77 | suppressWarnings( sum( sorted.scores.class0 != sorted.scores.class1 ) == 0 ) & 78 | length(weights.class0) == length(weights.class1) & 79 | suppressWarnings( sum( weights.class0 != (1 - weights.class1) ) == 0 ) & 80 | sum(weights.class0 != 0 & weights.class0 != 1)==0); 81 | 82 | #( is.null( weights.class0 ) | sum( weights.class0 != 1 ) == 0 ) & ( is.null( weights.class1 ) | sum( weights.class1 != 1 ) == 0 ); 83 | 84 | i.old <- 0; j.old <- 0; i <- 0; j <- 0; d <- length( sorted.scores.class1 ); m <- length( sorted.scores.class0 ); 85 | help1 <- 0; help2 <- 0; 86 | auc.GD <- ifelse(davis.and.goadrich,0,NA); auc.integral <- 0; fn <- 0; tn <- 0; 87 | 88 | nw0 <- is.null( weights.class0 ); 89 | nw1 <- is.null( weights.class1 ); 90 | 91 | pos <- ifelse( nw0, m, sum( weights.class0 ) ); 92 | neg <- ifelse( nw1, d, sum( weights.class1 ) ); 93 | 94 | while( ( j < d ) & sorted.scores.class0[ i + 1 ] > sorted.scores.class1[ j + 1 ] ){ 95 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 96 | j <- j + 1; 97 | } 98 | p <- c( ( pos - fn ) / pos, ( pos - fn ) / ( pos - fn + neg - tn ), sorted.scores.class0[ i + 1 ] ); 99 | ci <- 1; 100 | if( curve ){ 101 | list.curve <- create.curve( length( sorted.scores.class0 ) + length( sorted.scores.class1 ) ); 102 | list.curve <- append.to.curve( list.curve, p, ci ); 103 | ci <- ci + 1; 104 | }else{ 105 | list.curve <- NULL; 106 | } 107 | 108 | unique <- !( j < d & sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ); 109 | from.motif <- unique; 110 | 111 | while( i< m & j < d ){ 112 | i.old <- i; 113 | j.old <- j; 114 | tn.old <- tn; 115 | fn.old <- fn; 116 | 117 | if( !unique || from.motif ){ 118 | while( i + 1 < m & sorted.scores.class0[ i + 1 ] == sorted.scores.class0[ i + 2 ] ){ 119 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 120 | i <- i + 1; 121 | } 122 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 123 | i <- i + 1; 124 | } 125 | if( !unique || !from.motif ){ 126 | while( j + 1 < d & sorted.scores.class1[ j + 1 ] == sorted.scores.class1[ j + 2 ] ){ 127 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 128 | j <- j + 1; 129 | } 130 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 131 | j <- j + 1; 132 | } 133 | score<-0; 134 | if( i < m & j < d ){ 135 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j +
1 ] ){ 136 | unique <- F; 137 | score <- sorted.scores.class0[ i + 1 ]; 138 | }else{ 139 | unique <- T; 140 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 141 | from.motif <- T; 142 | score <- sorted.scores.class0[ i + 1 ]; 143 | }else{ 144 | from.motif <- F; 145 | score <- sorted.scores.class1[ j + 1 ]; 146 | } 147 | } 148 | } else { 149 | if( i < m ) { 150 | score <- sorted.scores.class0[ i + 1 ]; 151 | } else if( j < d ) { 152 | score <- sorted.scores.class1[ j + 1 ]; 153 | } else { 154 | #i=m, j=d 155 | max = max(sorted.scores.class0[ m ],sorted.scores.class1[ d ]); 156 | score = max; #+ 0.01*( max - min(sorted.scores.class0[ 1 ],sorted.scores.class1[ 1 ]) ); #max + arbitrary offset 157 | } 158 | } 159 | 160 | if( fn == fn.old ) {#i == i.old ){ 161 | old.p<-p; 162 | p <- c( p[ 1 ], ( pos - fn ) / ( pos - fn + neg - tn ), score ); 163 | if(is.nan(p[2])){ 164 | p<-old.p; 165 | } 166 | if( curve ){ 167 | list.curve <- append.to.curve( list.curve, p, ci ); 168 | ci <- ci + 1; 169 | } 170 | }else{ 171 | p.b <- p[ 1 ]; 172 | p.a <- ( pos - fn ) / pos; 173 | 174 | if( davis.and.goadrich ){ 175 | if( i < m | j < d ){# TODO 176 | prop.term <- ( tn - tn.old ) / ( fn - fn.old ); 177 | h1 <- p[ 1 ]; h2 <- p[ 2 ]; 178 | c <- fn.old + 1; 179 | help.j <- tn.old + prop.term; 180 | while( c <= fn ){ 181 | help1 <- (pos - c) / pos; 182 | help2 <- (pos - c) / ( pos - c + neg - help.j ); 183 | help.j <- help.j + prop.term; 184 | auc.GD <- auc.GD + ( h2 + help2 ) / 2 * ( h1 - help1 ); 185 | #print(c(1,auc.GD,i=i.v,m=pos,j=j.v,d=neg,c=c,i.old=i.v.o,j.old=j.v.o)) 186 | h1 <- help1; 187 | h2 <- help2; 188 | c <- c + 1; 189 | } 190 | }else{ 191 | auc.GD <- auc.GD + p[ 2 ] * p[ 1 ]; 192 | } 193 | } 194 | 195 | h <- ( tn - tn.old ) / ( fn - fn.old ); 196 | a <- 1 + h; 197 | b <- ( neg - tn - h * ( pos - fn ) ) / pos; 198 | 199 | if( !isTRUE(all.equal(b, 0)) ){ 200 | auc.integral <- auc.integral + ( p.b - p.a - b / a * ( log( a * p.b + b ) - log( a * p.a + b ) ) ) / a; 201 | }else{ 202 | auc.integral <- auc.integral + ( p.b - p.a ) / a; 203 | } 204 | 205 | prop.term <- min( ( fn - fn.old ) / ( i - i.old ), minStepSize ); 206 | h <- h*prop.term; 207 | help.i <- fn.old + prop.term; 208 | i.old <- i.old + 1; 209 | help.j <- tn.old + h; 210 | k=1; 211 | while( help.i < fn ){ 212 | p <- c( ( pos - help.i ) / pos, ( pos - help.i ) / ( pos - help.i + neg - help.j ), score );#interpolate score? 
213 | if( curve ){ 214 | list.curve <- append.to.curve( list.curve, p, ci ); 215 | ci <- ci + 1; 216 | } 217 | k=k+1; 218 | help.j <- tn.old + k*h; 219 | help.i <- fn.old + k*prop.term; 220 | } 221 | if( p.a != p[ 1 ] ){ 222 | temp <- ( pos - fn ) / ( pos - fn + neg - tn ); 223 | if(is.nan(temp)){ 224 | temp <- p[2]; 225 | } 226 | p <- c( p.a, temp, score ); 227 | if( curve ){ 228 | list.curve <- append.to.curve( list.curve, p, ci ); 229 | ci <- ci + 1; 230 | } 231 | } 232 | } 233 | } 234 | 235 | if( i < m ){ 236 | help1 <- 0; 237 | if( davis.and.goadrich ){ 238 | auc.GD <- auc.GD + p[ 2 ] * ( p[ 1 ] - help1 ); 239 | } 240 | 241 | auc.integral <- auc.integral + p[ 2 ] * ( p[ 1 ] - help1 ); 242 | 243 | p <- c( help1, p[ 2 ], sorted.scores.class0[ i + 1 ] ); 244 | if( curve ){ 245 | list.curve <- append.to.curve( list.curve, p, ci ); 246 | ci <- ci + 1; 247 | } 248 | } 249 | if(curve){ 250 | list.curve<-shrink.curve( list.curve ); 251 | # list.curve<-rbind(c(list.curve[1,1],list.curve[1,2],min(sorted.scores.class0,sorted.scores.class1)), 252 | # list.curve, 253 | # c(list.curve[nrow(list.curve),1],list.curve[nrow(list.curve),2],max(sorted.scores.class0,sorted.scores.class1))) 254 | } 255 | res<-list( type = "PR", auc.integral = auc.integral, auc.davis.goadrich = auc.GD, curve=list.curve ); 256 | 257 | if(max.compute){ 258 | scores0<-NULL; 259 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 260 | scores0<-rep(1,length(sorted.scores.class0)); 261 | }else{ 262 | scores0<-weights.class0; 263 | } 264 | scores1<-NULL; 265 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 266 | scores1<-rep(0,length(sorted.scores.class1)); 267 | }else{ 268 | scores1<-weights.class0; 269 | } 270 | 271 | max.res<-pr.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 272 | weights.class1=weights.class1,curve=curve,minStepSize=minStepSize); 273 | res<-c(res,list(max=max.res)); 274 | } 275 | 276 | if(min.compute){ 277 | scores0<-NULL; 278 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 279 | scores0<-rep(0,length(sorted.scores.class0)); 280 | }else{ 281 | scores0<-(-weights.class0); 282 | } 283 | scores1<-NULL; 284 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 285 | scores1<-rep(1,length(sorted.scores.class1)); 286 | }else{ 287 | scores1<-(-weights.class0); 288 | } 289 | 290 | min.res<-pr.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 291 | weights.class1=weights.class1,curve=curve,minStepSize=minStepSize); 292 | res<-c(res,list(min=min.res)); 293 | } 294 | if(rand.compute){ 295 | rand.auc<-NULL; 296 | if(is.null(weights.class0)){ 297 | rand.auc<-length(sorted.scores.class0)/(length(sorted.scores.class0)+length(sorted.scores.class1)); 298 | }else{ 299 | rand.auc<-sum(weights.class0)/sum(weights.class0+weights.class1); 300 | } 301 | rand.curve<-create.curve( 2 ); 302 | rand.curve<-append.to.curve( rand.curve, c(0,rand.auc,0), 1 ); 303 | rand.curve<-append.to.curve( rand.curve, c(1,rand.auc,0), 2 ); 304 | rand.result<-list( type = "PR", auc.integral = rand.auc, auc.davis.goadrich = rand.auc, 
curve=rand.curve ); 305 | class(rand.result)<-"PRROC"; 306 | 307 | res<-c(res,list(rand=rand.result)); 308 | } 309 | 310 | class(res)<-"PRROC"; 311 | res 312 | } 313 | 314 | compute.roc<-function( sorted.scores.class0, sorted.scores.class1=sorted.scores.class0, weights.class0 = NULL, 315 | weights.class1 = {if(is.null(weights.class0)){NULL}else{1-weights.class0}}, curve = FALSE, 316 | max.compute=F, min.compute=F, rand.compute=F){ 317 | 318 | if( !is.null(sorted.scores.class1) & ( length(sorted.scores.class0) != length(sorted.scores.class1) | 319 | suppressWarnings( sum(sorted.scores.class0 != sorted.scores.class1) > 0 ) 320 | ) & is.null(weights.class0) & is.null(weights.class1) ){ 321 | weights.class0<-c(rep(1,length(sorted.scores.class0)),rep(0,length(sorted.scores.class1))); 322 | sorted.scores.class0<-c(sorted.scores.class0,sorted.scores.class1); 323 | o0<-order(sorted.scores.class0); 324 | sorted.scores.class0<-sorted.scores.class0[o0]; 325 | weights.class0<-weights.class0[o0]; 326 | weights.class1<-1-weights.class0; 327 | sorted.scores.class1<-sorted.scores.class0; 328 | } 329 | 330 | i <- 0; j <- 0; d <- length( sorted.scores.class1 ); m <- length( sorted.scores.class0 ); 331 | fn <- 0; tn <- 0; 332 | 333 | nw0 <- is.null( weights.class0 ); 334 | nw1 <- is.null( weights.class1 ); 335 | 336 | pos <- ifelse( nw0, m, sum( weights.class0 ) ); 337 | neg <- ifelse( nw1, d, sum( weights.class1 ) ); 338 | 339 | erg <- 0; 340 | ci <- 1; 341 | p <- c( 1, 1, min(sorted.scores.class0,sorted.scores.class1) ); 342 | if( curve ){ 343 | list.curve <- create.curve( length( sorted.scores.class0 ) + length( sorted.scores.class1 ) ); 344 | list.curve <- append.to.curve( list.curve, p, ci ); 345 | ci <- ci + 1; 346 | }else{ 347 | list.curve <- NULL; 348 | } 349 | 350 | unique <- F; from.motif <- F; 351 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ){ 352 | unique <- F; 353 | }else{ 354 | unique <- T; 355 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 356 | from.motif <- T; 357 | }else{ 358 | from.motif <- F; 359 | } 360 | } 361 | 362 | while( i < m & j < d ){ 363 | score <- 0; 364 | if( unique ){ 365 | if( from.motif ){ 366 | while( i < m & sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 367 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 368 | score <- sorted.scores.class0[ i + 1 ]; 369 | i <- i + 1; 370 | } 371 | 372 | }else{ 373 | while( j < d & sorted.scores.class0[ i + 1 ] > sorted.scores.class1[ j + 1 ] ){ 374 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 375 | score <- sorted.scores.class1[ j + 1 ]; 376 | j <- j + 1; 377 | } 378 | #score <- sorted.scores.class0[ i + 1 ]; 379 | } 380 | }else{ 381 | while( i + 1 < m & sorted.scores.class0[ i + 1 ] == sorted.scores.class0[ i + 2 ] ){ 382 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 383 | i <- i + 1; 384 | } 385 | while( j + 1 < d & sorted.scores.class1[ j + 1 ] == sorted.scores.class1[ j + 2 ]){ 386 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 387 | j <- j + 1; 388 | } 389 | fn <- fn + ifelse( nw0, 1, weights.class0[ i + 1 ] ); 390 | tn <- tn + ifelse( nw1, 1, weights.class1[ j + 1 ] ); 391 | i <- i + 1; 392 | j <- j + 1; 393 | score <- sorted.scores.class0[ i ]; 394 | } 395 | 396 | help1 <- ( neg - tn ) / neg; 397 | help2 <- ( pos - fn ) / pos; 398 | erg <- erg + ( p[ 2 ] + help2 ) / 2 * ( p[ 1 ] - help1 ); 399 | p <- c( help1, help2, score ); 400 | if(curve){ 401 | list.curve <- append.to.curve( list.curve, p, ci ); 402 | ci <- ci + 1; 403 | } 
404 | 405 | if( i < m & j < d ){ 406 | if( sorted.scores.class0[ i + 1 ] == sorted.scores.class1[ j + 1 ] ){ 407 | unique <- F; 408 | }else{ 409 | unique <- T; 410 | if( sorted.scores.class0[ i + 1 ] < sorted.scores.class1[ j + 1 ] ){ 411 | from.motif <- T; 412 | }else{ 413 | from.motif <- F; 414 | } 415 | } 416 | } 417 | } 418 | 419 | if(curve){ 420 | p <- c( 0, 0, max( sorted.scores.class0, sorted.scores.class1 ) ); 421 | list.curve <- append.to.curve( list.curve, p, ci ); 422 | ci <- ci + 1; 423 | list.curve<-shrink.curve( list.curve ); 424 | list.curve<-rbind(c(list.curve[1,1],list.curve[1,2],min(sorted.scores.class0,sorted.scores.class1)), 425 | list.curve, 426 | c(list.curve[nrow(list.curve),1],list.curve[nrow(list.curve),2],max(sorted.scores.class0,sorted.scores.class1))) 427 | } 428 | res<-list( type = "ROC", auc = erg, curve=list.curve ); 429 | 430 | 431 | if(max.compute){ 432 | scores0<-NULL; 433 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 434 | scores0<-rep(1,length(sorted.scores.class0)); 435 | }else{ 436 | scores0<-weights.class0; 437 | } 438 | scores1<-NULL; 439 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 440 | scores1<-rep(0,length(sorted.scores.class1)); 441 | }else{ 442 | scores1<-weights.class0; 443 | } 444 | 445 | max.res<-roc.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 446 | weights.class1=weights.class1,curve=curve); 447 | res<-c(res,list(max=max.res)); 448 | } 449 | 450 | if(min.compute){ 451 | scores0<-NULL; 452 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 453 | scores0<-rep(0,length(sorted.scores.class0)); 454 | }else{ 455 | scores0<-(-weights.class0); 456 | } 457 | scores1<-NULL; 458 | if(is.null(weights.class0) | length(sorted.scores.class0)!=length(sorted.scores.class1) | suppressWarnings(sum(sorted.scores.class0!=sorted.scores.class1)) > 0){ 459 | scores1<-rep(1,length(sorted.scores.class1)); 460 | }else{ 461 | scores1<-(-weights.class0); 462 | } 463 | 464 | min.res<-roc.curve( scores.class0=scores0, scores.class1=scores1,weights.class0=weights.class0, 465 | weights.class1=weights.class1,curve=curve); 466 | res<-c(res,list(min=min.res)); 467 | } 468 | if(rand.compute){ 469 | rand.auc<-0.5; 470 | rand.curve<-create.curve( 2 ); 471 | rand.curve<-append.to.curve( rand.curve, c(0,0,0), 1 ); 472 | rand.curve<-append.to.curve( rand.curve, c(1,1,0), 2 ); 473 | rand.result<-list( type = "ROC", auc=rand.auc, curve=rand.curve ); 474 | class(rand.result)<-"PRROC"; 475 | 476 | res<-c(res,list(rand=rand.result)); 477 | } 478 | 479 | class(res)<-"PRROC"; 480 | res 481 | } 482 | 483 | shrink.curve <- function( curve ){ 484 | if( is.null( curve ) ){ 485 | curve; 486 | }else{ 487 | curve[ !is.na( curve[ , 1 ] ), ]; 488 | } 489 | } 490 | 491 | create.curve <- function( n ){ 492 | m <- matrix( NA, nrow=n, ncol=3 ); 493 | m 494 | } 495 | 496 | append.to.curve <- function( curve, p, row ){ 497 | if( row>=nrow( curve ) ){ 498 | curve2 <- matrix( NA, nrow=nrow( curve ) * 2, ncol=3 ); 499 | curve2[ 1:nrow( curve ), ] <- curve; 500 | curve <- curve2; 501 | } 502 | curve[ row, ] <- p; 503 | # print(c(row,p)) 504 | # if(is.nan(p[2])){ 505 | # traceback(0) 506 | # } 507 | curve 508 | } 509 | 510 | 
print.PRROC<-function(x,...){ 511 | if(x$type == "PR"){ 512 | cat("\n Precision-recall curve\n"); 513 | cat("\n Area under curve (Integral):\n"); 514 | cat(" ",x$auc.integral,"\n"); 515 | if( !is.null(x$max) & !is.null(x$min) ){ 516 | cat("\n Relative area under curve (Integral):\n"); 517 | cat(" ",(x$auc.integral - x$min$auc.integral)/(x$max$auc.integral-x$min$auc.integral),"\n"); 518 | } 519 | cat("\n Area under curve (Davis & Goadrich):\n"); 520 | if(!is.null(x$auc.davis.goadrich) & !is.na(x$auc.davis.goadrich)){ 521 | cat(" ",x$auc.davis.goadrich,"\n"); 522 | if( !is.null(x$max) & !is.null(x$min) ){ 523 | cat("\n Relative area under curves (Davis & Goadrich):\n"); 524 | cat(" ",(x$auc.davis.goadrich - x$min$auc.davis.goadrich)/(x$max$auc.davis.goadrich-x$min$auc.davis.goadrich),"\n"); 525 | } 526 | }else{ 527 | cat(" cannot be computed for weighted data\n"); 528 | } 529 | 530 | }else{ 531 | cat("\n ROC curve\n"); 532 | cat("\n Area under curve:\n"); 533 | cat(" ",x$auc,"\n"); 534 | if( !is.null(x$max) & !is.null(x$min) ){ 535 | cat("\n Relative area under curve:\n"); 536 | cat(" ",(x$auc - x$min$auc)/(x$max$auc-x$min$auc),"\n"); 537 | } 538 | } 539 | 540 | if(!is.null(x$curve)){ 541 | cat("\n Curve for scores from ",min(x$curve[,3])," to ",max(x$curve[,3]),"\n"); 542 | cat(" ( can be plotted with plot(x) )\n\n"); 543 | }else{ 544 | cat("\n Curve not computed ( can be done by using curve=TRUE )\n"); 545 | } 546 | 547 | if(!is.null(x$max)){ 548 | cat("\n\n Maximum AUC:\n"); 549 | if(x$type == "PR"){ 550 | cat(" ",x$max$auc.integral," ",x$max$auc.davis.goadrich,"\n"); 551 | }else{ 552 | cat(" ",x$max$auc,"\n"); 553 | } 554 | } 555 | 556 | if(!is.null(x$min)){ 557 | cat("\n\n Minimum AUC:\n"); 558 | if(x$type == "PR"){ 559 | cat(" ",x$min$auc.integral," ",x$min$auc.davis.goadrich,"\n"); 560 | }else{ 561 | cat(" ",x$min$auc,"\n"); 562 | } 563 | } 564 | 565 | if(!is.null(x$rand)){ 566 | cat("\n\n AUC of a random classifier:\n"); 567 | if(x$type == "PR"){ 568 | cat(" ",x$rand$auc.integral," ",x$rand$auc.davis.goadrich,"\n"); 569 | }else{ 570 | cat(" ",x$rand$auc,"\n"); 571 | } 572 | } 573 | } 574 | 575 | 576 | plot.PRROC<-function(x, xlim=c(0,1), ylim=c(0,1), auc.main=TRUE, auc.type=c("integral","davis.goadrich"), 577 | legend=ifelse(is.logical(color) & color==TRUE,4,NA), xlab=NULL, ylab=NULL, main=NULL, color=TRUE, lwd=3, 578 | add=FALSE, scale.color=hsv(h=seq(0,1,length=100)*0.8, s=1, v=1), 579 | max.plot = FALSE, min.plot = FALSE, rand.plot = FALSE, fill.area = (max.plot & min.plot), 580 | maxminrand.col = grey(0.5), fill.color = grey(0.95), 581 | ...){ 582 | auc.type<-match.arg(auc.type); 583 | if(is.null(x$curve)){ 584 | stop("Curve is NULL. 
Use curve=T in pr.curve or roc.curve to obtain one."); 585 | } 586 | if(ncol(x$curve) != 3){ 587 | stop("Curve has wrong dimension"); 588 | } 589 | if(is.null(xlab)){ 590 | my.xlab<-ifelse(x$type=="PR","Recall","FPR"); 591 | }else{ 592 | my.xlab<-xlab; 593 | } 594 | if(is.null(ylab)){ 595 | my.ylab<-ifelse(x$type=="PR","Precision","Sensitivity"); 596 | }else{ 597 | my.ylab<-ylab; 598 | } 599 | 600 | if(is.null(main)){ 601 | my.main<-paste(x$type," curve",sep="",collapse=""); 602 | }else{ 603 | my.main<-main; 604 | } 605 | if(auc.main){ 606 | my.main<-paste(my.main,"\nAUC = ",format(ifelse(x$type=="PR",ifelse(auc.type=="integral",x$auc.integral,x$auc.davis.goadrich),x$auc)),sep="",collapse=""); 607 | } 608 | 609 | 610 | max.curve<-NULL; 611 | if(!is.null(x$max) & !is.null(x$max$curve)){ 612 | max.curve<-x$max$curve; 613 | } 614 | min.curve<-NULL; 615 | if(!is.null(x$min) & !is.null(x$min$curve)){ 616 | min.curve<-x$min$curve; 617 | } 618 | rand.curve<-NULL; 619 | if(!is.null(x$rand) & !is.null(x$rand$curve)){ 620 | rand.curve<-x$rand$curve; 621 | } 622 | 623 | x<-x$curve; 624 | 625 | cols<-1; 626 | segment=F; 627 | plotscale.color=F; 628 | if( is.logical(color) ){ 629 | if(color){ 630 | min<-min(x[,3]); 631 | max<-max(x[,3]); 632 | 633 | cols<-getColor( scale.color, x[,3], min, max ); 634 | plotscale.color=T; 635 | segment=T; 636 | }else{ 637 | cols<-1; 638 | segment<-F; 639 | } 640 | }else { 641 | cols<-color; 642 | segment<-F; 643 | } 644 | 645 | if(!add & !is.na(legend) & (is.numeric(legend) | suppressWarnings(legend==TRUE)) & plotscale.color ){ 646 | if(is.logical(legend)){ 647 | legend<-4; 648 | } 649 | m<-NULL;widths<-rep(1,2);heights<-rep(1,2) 650 | if(legend == 1){ 651 | m<-matrix(c(1,2),nrow=2); 652 | heights<-c(4,lcm(2)); 653 | }else if(legend==2){ 654 | m<-matrix(c(2,1),nrow=1); 655 | widths=c(lcm(2.5),4); 656 | }else if(legend==3){ 657 | m<-matrix(c(2,1),nrow=2); 658 | heights=c(lcm(2),4); 659 | }else{ 660 | m<-matrix(c(1,2),nrow=1); 661 | widths=c(4,lcm(2.5)); 662 | } 663 | layout(mat = m,widths = widths,heights = heights); 664 | 665 | }#else if(!add){ 666 | # layout(1); 667 | #} 668 | 669 | if(!add){ 670 | plot(0,xlim=xlim,ylim=ylim,col=0,xlab=my.xlab,ylab=my.ylab,main=my.main,...); 671 | } 672 | 673 | if( !add ){ 674 | if( fill.area & !is.null(max.curve) & !is.null(min.curve)){ 675 | xs<-c(min.curve[,1],max.curve[nrow(max.curve):1,1],min.curve[1,1]); 676 | ys<-c(min.curve[,2],max.curve[nrow(max.curve):1,2],min.curve[1,2]); 677 | polygon( x = xs, y = ys, density = -1, border = NA, col = fill.color ); 678 | } 679 | 680 | if(max.plot & !is.null(max.curve)){ 681 | lines(max.curve[,1],max.curve[,2],col=maxminrand.col, lty="dashed", ...); 682 | } 683 | 684 | if(min.plot & !is.null(min.curve)){ 685 | lines(min.curve[,1],min.curve[,2],col=maxminrand.col, lty="dotted", ...); 686 | } 687 | 688 | if(rand.plot & !is.null(rand.curve)){ 689 | lines(rand.curve[,1],rand.curve[,2],col=maxminrand.col, lty="dotdash", ...); 690 | } 691 | } 692 | 693 | d=nrow(x); 694 | if( segment ) { 695 | segments( x[1:(d-1),1], x[1:(d-1),2], x[2:d,1], x[2:d,2], col=cols, lwd=lwd, ...); 696 | } else { 697 | lines( x[,1], x[,2], col=cols, lwd=lwd, ...); 698 | } 699 | 700 | if(!add & legend & !is.numeric(color) & color == TRUE){ 701 | scale<-seq( min, max, length = 100 ); 702 | cols<-getColor( scale.color, scale, min, max ); 703 | bak<-par("mar"); 704 | on.exit(par(mar=bak)); 705 | if(legend==2 | legend==4){ 706 | if(legend==4){par(mar=c(5,1,4,2)+0.1);}else{par(mar=c(5,2,4,1)+0.1);} 707 | 
image(c(1),scale,matrix(scale,nrow=1),col=cols,xlab="",ylab="",axes=F) 708 | }else{ 709 | if(legend==1){par(mar=c(2,4,0,2)+0.1);}else{par(mar=c(0,4,2,2)+0.1);} 710 | image(scale,c(1),matrix(scale,ncol=1),col=cols,xlab="",ylab="",axes=F) 711 | } 712 | axis(legend) 713 | layout(1) 714 | } 715 | 716 | 717 | } 718 | 719 | getColor <- function( scale, x, min=min(x), max=max(x) ) { 720 | return( scale[round(1 + (length(scale)-1) * (x - min)/(max-min))] ); 721 | } 722 | 723 | 724 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | from IPython import embed 7 | from collections import Counter 8 | 9 | from diConstants import ( 10 | SEQ_ROOT, BIN_SIZE, NUM_BASES, 11 | HG19_ALL_CHROMS, MM9_ALL_CHROMS, HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS, VALID_CHROMS, TEST_CHROMS, 12 | HG19_CHROM_SIZES, MM9_CHROM_SIZES) 13 | from prepData import get_metadata_path, input_not_before_end, get_base_path, get_peaks, get_blacklisted_locs 14 | 15 | 16 | def load_chrom(data_path, chrom): 17 | """ 18 | Loads the .npz file in data_path and returns data for a given chrom. 19 | chrom is like "chr1" 20 | """ 21 | m = np.load(data_path) 22 | return m[chrom] 23 | 24 | 25 | def get_species_from_dataset_name(dataset_name): 26 | if 'ULI' in dataset_name or 'MOUSE' in dataset_name: 27 | return 'mm9' 28 | else: 29 | return 'hg19' 30 | 31 | 32 | class DatasetEncoder(json.JSONEncoder): 33 | """ 34 | Encodes Dataset objects in JSON. 35 | """ 36 | def default(self, obj): 37 | if isinstance(obj, Dataset): 38 | return obj.__dict__ 39 | else: 40 | return super(DatasetEncoder, self).default(obj) 41 | 42 | 43 | class Dataset(object): 44 | """ 45 | Dataset objects have the following fields: 46 | dataset_name 47 | num_examples 48 | X_subsample_target_string (string like "5e6" or None) 49 | Y_subsample_target_string (string like "5e6" or None) 50 | random_seed 51 | normalization 52 | peak_fraction 53 | chroms 54 | chroms_string 55 | 56 | They support the following methods: 57 | get_subsample_target_string(self, X_or_Y) 58 | get_seq_dataset_path(self, seq_length, factor_for_peaks) 59 | load_seq_dataset(self, seq_length, input_marks, output_marks) 60 | load_binary_genome(self, X_or_Y, marks, only_chr1=False) 61 | load_genome(self, X_or_Y, marks, only_chr1=False, peaks=False) 62 | 63 | And the static method process_subsample_target_string. 
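    A usage sketch (hypothetical names and values; real dataset names, marks, and
    chrom lists come from diConstants and the metadata written by prepData):

        ds = Dataset(dataset_name='GM12878_5marks', num_examples=100000,
                     X_subsample_target_string='5e6', Y_subsample_target_string=None,
                     random_seed=0, normalization='arcsinh', peak_fraction=0.5,
                     chroms=HG19_TRAIN_CHROMS)
        X, Y, pvX, pvY, binX, binY = ds.load_seq_dataset(
            seq_length=1001, input_marks=['H3K27AC', 'INPUT'], output_marks=['H3K27AC'])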
64 | """ 65 | 66 | @staticmethod 67 | def process_subsample_target_string(subsample_target_string): 68 | if subsample_target_string is None: 69 | return subsample_target_string 70 | elif subsample_target_string == 'None': 71 | return None 72 | else: 73 | return str(subsample_target_string) 74 | 75 | 76 | def __init__(self, dataset_name, num_examples, 77 | X_subsample_target_string, Y_subsample_target_string, 78 | random_seed, normalization, peak_fraction, chroms): 79 | self.dataset_name = dataset_name 80 | self.num_examples = num_examples 81 | self.X_subsample_target_string = Dataset.process_subsample_target_string(X_subsample_target_string) 82 | self.Y_subsample_target_string = Dataset.process_subsample_target_string(Y_subsample_target_string) 83 | self.random_seed = random_seed 84 | self.normalization = normalization 85 | self.peak_fraction = peak_fraction 86 | self.chroms = chroms 87 | 88 | self.species = get_species_from_dataset_name(self.dataset_name) 89 | if self.species == 'hg19': 90 | all_chroms = HG19_ALL_CHROMS 91 | train_chroms = HG19_TRAIN_CHROMS 92 | else: 93 | all_chroms = MM9_ALL_CHROMS 94 | train_chroms = MM9_TRAIN_CHROMS 95 | 96 | if self.chroms == all_chroms: 97 | self.chroms_string = "" 98 | elif self.chroms == TEST_CHROMS: 99 | self.chroms_string = "_chroms-test" 100 | elif self.chroms == train_chroms: 101 | self.chroms_string = "_chroms-train" 102 | elif self.chroms == VALID_CHROMS: 103 | self.chroms_string = "_chroms-valid" 104 | else: 105 | raise ValueError, "chroms must be ALL_CHROMS, TEST_CHROMS, TRAIN_CHROMS, or VALID_CHROMS" 106 | 107 | if (self.normalization not in ['arcsinh', 'log', None]): 108 | raise ValueError, "normalization must be 'arcsinh', 'log', or None" 109 | 110 | peak_fraction = float(peak_fraction) 111 | if peak_fraction < 0.0 or peak_fraction > 1.0: 112 | raise ValueError, "peak_fraction must be in [0, 1]" 113 | 114 | try: 115 | metadata_path = get_metadata_path(self.dataset_name, self.X_subsample_target_string, self.normalization) 116 | with open(metadata_path, 'r') as f: 117 | metadata = json.loads(f.read()) 118 | self.marks_in_dataset = metadata['factors_to_include'] 119 | self.cell_line = metadata['cell_line'] 120 | except IOError: 121 | raise IOError, "Dataset %s doesn't exist." % metadata_path 122 | 123 | try: 124 | # Sanity check to make sure that metadata is consistent with different subsample target string 125 | metadata_path = get_metadata_path(self.dataset_name, self.Y_subsample_target_string, self.normalization) 126 | with open(metadata_path, 'r') as f: 127 | metadata = json.loads(f.read()) 128 | assert self.marks_in_dataset == metadata['factors_to_include'] 129 | assert self.cell_line == metadata['cell_line'] 130 | except IOError: 131 | raise IOError, "Dataset %s doesn't exist." % metadata_path 132 | 133 | 134 | def get_subsample_target_string(self, X_or_Y): 135 | assert X_or_Y in ["X", "Y"] 136 | if X_or_Y == "X": 137 | return self.X_subsample_target_string 138 | else: 139 | return self.Y_subsample_target_string 140 | 141 | 142 | def get_seq_dataset_path(self, seq_length, factor_for_peaks): 143 | """ 144 | If factor_for_peaks is INPUT, 145 | that means that all marks in the dataset are used for peak enrichment, 146 | but that the Y matrices in the dataset only contain the INPUT mark. 147 | This is used for training a separate model that only outputs INPUT. 148 | 149 | In contrast, if factor_for_peaks is None, all marks in dataset are similarly 150 | used for peak enrichment, but the Y matrices in the dataset contain all marks. 
151 | This is used for training a single, multi-task model that outputs all marks. 152 | 153 | Y_subsample_target_string is normally set to None, unless we are intentionally 154 | trying to use a certain subsampling depth as the "full" data. 155 | """ 156 | 157 | if factor_for_peaks is None: 158 | dataset_path = os.path.join( 159 | SEQ_ROOT, "%s_subsample-%s-%s_rS-%s_numEx-%s_seqLen-%s_peakFrac-%s_norm-%s%s.npz" % \ 160 | (self.dataset_name, self.X_subsample_target_string, self.Y_subsample_target_string, 161 | self.random_seed, self.num_examples, 162 | seq_length, self.peak_fraction, self.normalization, self.chroms_string)) 163 | else: 164 | dataset_path = os.path.join( 165 | SEQ_ROOT, "%s_subsample-%s-%s_rS-%s_numEx-%s_seqLen-%s_peakFrac-%s_peaksFac-%s_norm-%s%s.npz" % \ 166 | (self.dataset_name, self.X_subsample_target_string, self.Y_subsample_target_string, 167 | self.random_seed, self.num_examples, 168 | seq_length, self.peak_fraction, factor_for_peaks, self.normalization, self.chroms_string)) 169 | 170 | return dataset_path 171 | 172 | 173 | def load_seq_dataset(self, seq_length, input_marks, output_marks): 174 | """ 175 | Reads in (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) from a previously created .npz file, 176 | where X is the input (subsampled) and Y is the output (full) and 177 | peakPValueX, peakPValueY contain the -log10 pvalues for the called peaks (bin by bin) 178 | peakBinaryX, peakBinaryY contain the binarized peak signal for the called peaks (bin by bin) 179 | X is of shape num_examples x seq_length x len(input_marks). 180 | Y is of shape num_examples x seq_length x len(output_marks). 181 | 182 | peakPValueX is of similar shape to X, except that it does not contain an INPUT track, so it is of shape 183 | num_examples x seq_length x (len(input_marks) - ('INPUT' in input_marks)). 184 | peakPValueY is of similar shape to Y, except that it does not contain an INPUT track, so it is of shape 185 | num_examples x seq_length x (len(output_marks) - ('INPUT' in output_marks)). 186 | 187 | input_marks is a list of marks that will be used as input to the model. 188 | output_marks is a list of marks that will be used as output from the model. It can be of length 1-6, depending 189 | on whether we're training separate models or one single model, and on whether we're doing classification 190 | or regression. 191 | 192 | If the .npz file doesn't exist, it will create it by calling extract_seq_dataset. 193 | """ 194 | assert(input_not_before_end(output_marks)) 195 | assert(input_not_before_end(input_marks)) 196 | 197 | for input_mark in input_marks: 198 | if input_mark not in self.marks_in_dataset: 199 | raise ValueError, "input_marks must be in marks_in_dataset" 200 | 201 | for output_mark in output_marks: 202 | if output_mark not in self.marks_in_dataset: 203 | raise ValueError, "output_marks must be in marks_in_dataset" 204 | 205 | # Construct an identifying string for this dataset based on what the output marks are. 206 | # If all marks in marks_in_dataset are present, then for brevity we omit output_marks_string. 
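        # For example (hypothetical marks), output_marks = ['H3K27AC', 'INPUT'] in a
        # dataset with more marks than these would give output_marks_string = 'H3K27AC-INPUT'.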
207 |         if len(output_marks) == len(self.marks_in_dataset):
208 |             output_marks_string = None
209 |         else:
210 |             output_marks_string = '-'.join(output_marks)
211 | 
212 |         dataset_path = self.get_seq_dataset_path(seq_length, output_marks_string)
213 | 
214 |         try:
215 |             with np.load(dataset_path) as data:
216 |                 X = data['X'].astype('float32')
217 |                 Y = data['Y'].astype('float32')
218 |                 peakPValueX = data['peakPValueX'].astype('float32')
219 |                 peakPValueY = data['peakPValueY'].astype('float32')
220 |                 peakBinaryX = data['peakBinaryX'].astype('int8')
221 |                 peakBinaryY = data['peakBinaryY'].astype('int8')
222 | 
223 |         except (IOError, KeyError):
224 |             print("Dataset %s doesn't exist or is missing a required matrix. Creating..." % dataset_path)
225 | 
226 |             X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = self.extract_seq_dataset(
227 |                 seq_length,
228 |                 output_marks,
229 |                 dataset_path)
230 | 
231 |         # Only select the input marks that we want
232 |         marks_idx = []
233 |         peak_marks_idx = []
234 |         for mark in input_marks:
235 |             marks_idx.append(self.marks_in_dataset.index(mark))
236 | 
237 |         # We don't want to have INPUT inside peakPValueX
238 |         factors_without_input = copy.copy(self.marks_in_dataset)
239 |         if 'INPUT' in factors_without_input:
240 |             factors_without_input.remove('INPUT')
241 | 
242 |         for mark in input_marks:
243 |             if mark == 'INPUT':
244 |                 continue
245 |             peak_marks_idx.append(factors_without_input.index(mark))
246 | 
247 |         X = X[..., marks_idx]
248 |         peakPValueX = peakPValueX[..., peak_marks_idx]
249 |         peakBinaryX = peakBinaryX[..., peak_marks_idx]
250 | 
251 |         assert(np.all(peakPValueX >= 0) & np.all(peakPValueY >= 0))
252 | 
253 |         if (X.shape[0], X.shape[1]) != (Y.shape[0], Y.shape[1]):
254 |             raise Exception, ("First two dimensions of X and Y shapes "
255 |                 "(num_examples, seq_length) need to agree.")
256 |         if (peakPValueX.shape[0], peakPValueX.shape[1]) != (peakPValueY.shape[0], peakPValueY.shape[1]):
257 |             raise Exception, ("First two dimensions of peakPValueX and peakPValueY shapes "
258 |                 "(num_examples, seq_length) need to agree.")
259 |         if len(peakPValueX) != len(X):
260 |             raise Exception, "peakPValueX and X must have same length."
261 | 
262 |         if ((seq_length != X.shape[1]) or (seq_length != peakPValueX.shape[1])):
263 |             raise Exception, "seq_length between model and data needs to agree"
264 | 
265 |         return X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY
266 | 
267 | 
268 |     def extract_seq_dataset(self, seq_length, output_marks, dataset_path):
269 |         """
270 |         Returns (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY), where X is the input (subsampled), Y is the output (full),
271 |         and peakPValueX and peakPValueY are the -log10 p-value scores for peaks called using MACS on X and Y respectively.
272 |         Both X and Y are of shape num_examples x seq_length x num_factors.
273 |         peakPValueX and peakPValueY are of shape num_examples x seq_length x (num_factors - 1), since no peaks are called for INPUT.
274 | 
275 |         Also writes all matrices to a compressed .npz file.
276 | 
277 |         peak_fraction is the fraction of examples that should be centered on a peak that exists in the full data.
278 |         For example, if peak_fraction = 0.5, then half of the examples will have a peak at the
279 |         center of the sequence, and the other half will not.
280 | 
281 |         factor_for_peaks determines which factor is used to decide whether a given location is
282 |         counted as a 'peak' or not, since it could be a peak in one factor but not another.
283 |         It should be a string, like 'H3K27AC'.
284 |         If it is None (the singleton, not a string), then a location is counted as having a peak
285 |         so long as there's a peak in any factor.
286 | 
287 |         This function sets the numpy random seed.
288 |         """
289 | 
290 |         def sanity_check():
291 |             """
292 |             Sanity checks on the full and subsampled data.
293 | 
294 |             Uses full_path and sub_path as defined in the main function.
295 |             """
296 |             assert os.path.isfile(full_path), "%s does not exist" % full_path
297 |             assert os.path.isfile(sub_path), "%s does not exist" % sub_path
298 |             assert os.path.isfile(full_peak_path), "%s does not exist" % full_peak_path
299 |             assert os.path.isfile(sub_peak_path), "%s does not exist" % sub_peak_path
300 | 
301 |             with np.load(full_path) as full_data:
302 |                 with np.load(sub_path) as sub_data:
303 | 
304 |                     full_chroms = full_data.keys()
305 |                     sub_chroms = sub_data.keys()
306 | 
307 |                     assert set(full_chroms) == set(sub_chroms), \
308 |                         "Full and subsampled data must have exactly the same chromosomes."
309 | 
310 |                     assert full_chroms == sub_chroms, \
311 |                         ("Technically this is ok, but it's weird that the chromosomes in the "
312 |                          "full and subsampled data are not in the same order.")
313 | 
314 |                     for chrom in full_chroms:
315 |                         assert full_data[chrom].shape == sub_data[chrom].shape, \
316 |                             ("Each chromosome should have exactly the same number of bins "
317 |                              "and factors in both datasets.")
318 | 
319 |                     assert len(set([full_data[chrom].shape[1] for chrom in full_chroms])) == 1, \
320 |                         "Number of factors should be constant across all chromosomes."
321 | 
322 |         def get_start_positions(data_path, cell_line, chroms):
323 |             """
324 |             Returns a dictionary where each chromosome is a key, and each value is an array
325 |             of start positions in that chromosome from which we can extract an example.
326 |             Chromosomes are chosen uniformly at random, so longer chromosomes are not sampled
327 |             more often than shorter ones.
328 | 
329 |             Start positions are chosen to be enriched in peaks in the full data, as specified
330 |             by the peak_fraction parameter to the main extract_seq_dataset function.
331 | 
332 |             Uses seq_length and num_examples from the main function parameters.
333 |             """
334 |             assert(seq_length % 2 == 1)
335 |             with np.load(data_path) as data:
336 |                 # Make sure all chroms are present in the data
337 |                 assert all([chrom in data.keys() for chrom in chroms])
338 | 
339 |                 # How long is each chromosome?
340 |                 num_bins = {chrom: data[chrom].shape[0] for chrom in chroms}
341 | 
342 |             # Filter out blacklisted bins. Add a bit of buffer to be safe.
343 |             blacklist_buffer = 5
344 |             blacklisted_locs = get_blacklisted_locs(cell_line)
345 |             non_blacklisted_bins = {}
346 |             for chrom in chroms:
347 |                 good_locs = np.ones(num_bins[chrom] - seq_length + 1, dtype=bool)
348 |                 print('Prior to filtering out bad locations for chromosome %s, %i bins available' % (chrom, len(good_locs)))
349 |                 for bad_range in blacklisted_locs[chrom]:
350 |                     left = max(bad_range[0] - seq_length - blacklist_buffer, 0)
351 |                     right = max(bad_range[1] + blacklist_buffer, 0)
352 |                     good_locs[left:right] = 0
353 |                 print('After filtering out bad locations for chromosome %s, %i bins available' % (chrom, good_locs.sum()))
354 |                 non_blacklisted_bins[chrom] = np.flatnonzero(good_locs).tolist()
355 | 
356 |             # Which chromosome? Sample uniformly at random
357 |             # without caring about chromosome length.
358 |             # Then count how many samples we are getting from each chromosome.
359 | chrom_samples = list( 360 | np.random.choice( 361 | chroms, 362 | num_examples, 363 | replace=True)) 364 | num_samples = {chrom: chrom_samples.count(chrom) for chrom in chroms} 365 | 366 | # Load all peaks into memory 367 | print("Preparing peaks...") 368 | peaks = {} 369 | 370 | # We want to enrich the data with parts of the genome that have peaks in the output marks 371 | # We don't have INPUT peaks, so we remove it 372 | if output_marks == ['INPUT']: 373 | factors_for_peaks = copy.copy(marks_in_dataset) 374 | else: 375 | factors_for_peaks = copy.copy(output_marks) 376 | if 'INPUT' in factors_for_peaks: 377 | factors_for_peaks.remove('INPUT') 378 | 379 | # Get peaks that correspond to the "full" data as specified by Y_subsample_target_string 380 | for factor in factors_for_peaks: 381 | peaks[factor], _ = get_peaks(cell_line, factor, Y_subsample_target_string) 382 | 383 | # Get start positions from each chromosome 384 | start_positions = {} 385 | for chrom in chroms: 386 | 387 | print("Calculating start positions for %s" % chrom) 388 | # We shift each peak such that the peak will be in the middle of the example, 389 | # i.e., the start position is (seq_length - 1)/2 bins before the actual peak 390 | # Unless doing so would move the starting position off the actual chromosome 391 | # For example, say seq_length = 101 (so shift = 50) and there is a peak at position 1050. 392 | # We would include position 1000 = 1050 - shift as a start position. 393 | # A sequence that starts at position 1000 would go to position 1100 (inclusive) 394 | # and the midpoint of that sequence, position 1050, would have a peak. 395 | shift = int((seq_length - 1) / 2) 396 | peak_bins = np.zeros(max(non_blacklisted_bins[chrom]), dtype=bool) 397 | 398 | for factor in factors_for_peaks: 399 | num_peaks = 0 400 | for peak in peaks[factor][chrom]: 401 | 402 | left = max(int(peak[0] - shift), 0) 403 | right = max(int(peak[1] - shift), 0) 404 | num_peaks += right - left 405 | 406 | # A "1" in peak_bins means that starting at that location will result 407 | # in a sequence whose center is a peak. 408 | peak_bins[left:right] = 1 409 | print(" %s peaks for %s" % (num_peaks, factor)) 410 | peak_bins_binarized = np.copy(peak_bins) 411 | peak_bins = set(np.flatnonzero(peak_bins).tolist()) 412 | 413 | # Remove blacklisted bins, and create nonpeak_bins 414 | all_bins = set(non_blacklisted_bins[chrom]) 415 | peak_bins = peak_bins.intersection(all_bins) 416 | nonpeak_bins = all_bins.difference(peak_bins) 417 | peak_bins = list(peak_bins) 418 | nonpeak_bins = list(nonpeak_bins) 419 | print(" Total after blacklisting: %s peaks and %s non-peaks" % (len(peak_bins), len(nonpeak_bins))) 420 | 421 | # Get samples of peak and non-peak locations 422 | peak_samples = np.round(num_samples[chrom] * peak_fraction).astype(int) 423 | nonpeak_samples = num_samples[chrom] - peak_samples 424 | 425 | start_positions[chrom] = np.random.choice( 426 | nonpeak_bins, 427 | nonpeak_samples, 428 | replace=False) 429 | 430 | 431 | # There is a potential problem here if we are trying to draw more peak_samples 432 | # than there are peak locations on the chromosome. 433 | # If so, np.random.choice will error out. 
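                # (A possible guard, not implemented here: pass
                # replace=(peak_samples > len(peak_bins)) to np.random.choice below,
                # trading duplicate examples for robustness in that corner case.)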
434 | start_positions[chrom] = np.concatenate([ 435 | start_positions[chrom], 436 | np.random.choice( 437 | peak_bins, 438 | peak_samples, 439 | replace=False)]) 440 | 441 | # Sort in the hopes that it makes memory access in extract_single_dataset faster 442 | start_positions[chrom].sort() 443 | 444 | 445 | return start_positions 446 | 447 | 448 | def extract_single_dataset(data_path, start_positions, marks): 449 | """ 450 | From the data in data_path, extracts num_examples subsequences of length seq_length 451 | from the start positions in start_positions. 452 | 453 | Returns a matrix of size num_examples x seq_length x num_marks. 454 | 455 | Uses seq_length and num_examples from the main function parameters. 456 | 457 | Used to load both the continuous signal and the peak p-values. 458 | """ 459 | print('Extracting samples from %s...' % data_path) 460 | 461 | num_marks = len(marks) 462 | marks_idx = [] 463 | for mark in marks: 464 | marks_idx.append( 465 | marks_in_dataset.index(mark)) 466 | 467 | return_dataset = np.empty([num_examples, seq_length, num_marks]) 468 | first_empty_row = 0 469 | 470 | with np.load(data_path) as data: 471 | # Get required samples from each chromosome 472 | for chrom in start_positions.keys(): 473 | 474 | data_chrom = data[chrom] 475 | 476 | for start_pos in start_positions[chrom]: 477 | return_dataset[first_empty_row, :, :] = data_chrom[ 478 | start_pos : start_pos+seq_length, 479 | marks_idx] 480 | first_empty_row += 1 481 | 482 | print("At sample number %s..." % first_empty_row) 483 | 484 | assert first_empty_row == num_examples 485 | 486 | # Note: this dataset has not been randomized yet. 487 | # So it has consecutive elements from the same chromosome. 488 | # We will randomize both X and Y datasets together later. 489 | return return_dataset 490 | 491 | 492 | def extract_binary_peak_dataset(full_path, subsample_target_string_to_extract, start_positions, 493 | cell_line, marks): 494 | """ 495 | Method for returning Y with peak information. A 1 denotes a peak. 496 | From the data in data_path, extracts num_examples subsequences of length seq_length 497 | from the start positions in start_positions. 498 | 499 | Returns binary_peak_matrix, a matrix of size num_examples x seq_length x num_marks, 500 | where a 1 denotes a peak 501 | """ 502 | 503 | shift = int((seq_length - 1) / 2) 504 | peak_pval_matrix = np.empty([ 505 | num_examples, 506 | seq_length, 507 | (len(marks) - ('INPUT' in marks)) 508 | ]) 509 | 510 | factor_idx = 0 511 | for factor in marks: 512 | if factor == 'INPUT': 513 | continue 514 | first_empty_row = 0 515 | peak_dict, peak_log_pvalue_dict = get_peaks( 516 | cell_line, 517 | factor, 518 | subsample_target_string=subsample_target_string_to_extract) 519 | for chrom in start_positions: 520 | peak_vector_length = max( 521 | np.max(peak_dict[chrom]), 522 | np.max(start_positions[chrom]) + seq_length) + 1 523 | peak_pval_vector = np.zeros([peak_vector_length,]) 524 | for peak_idx, peak in enumerate(peak_dict[chrom]): 525 | peak_pval_vector[peak[0]:peak[1]] = peak_log_pvalue_dict[chrom][peak_idx] 526 | is_peak = (peak_pval_vector > 0) 527 | print(factor, chrom, is_peak[start_positions[chrom] + shift].mean()) 528 | for start_pos in start_positions[chrom]: 529 | peak_pval_matrix[first_empty_row, :, factor_idx] = peak_pval_vector[start_pos : (start_pos+seq_length)] 530 | first_empty_row += 1 531 | factor_idx += 1 532 | binary_peak_matrix = (peak_pval_matrix > 0) * 1. 
533 | assert np.all(peak_pval_matrix >= 0) 534 | return binary_peak_matrix 535 | 536 | 537 | def extract_single_sequence_dataset(start_positions): 538 | """ 539 | Extracts num_examples subsequences of length seq_length, at positions start_positions, 540 | from the hg19 sequence. 541 | 542 | Returns a matrix of size num_examples x (seq_length * BIN_SIZE) x NUM_BASES. 543 | 544 | Uses seq_length and num_examples from the main function parameters. 545 | """ 546 | print('Extracting sequences...') 547 | 548 | return_dataset = np.empty([num_examples, BIN_SIZE*seq_length, NUM_BASES]) 549 | first_empty_row = 0 550 | 551 | # Get required samples from each chromosome 552 | for chrom in start_positions.keys(): 553 | 554 | data_chrom = load_seq_for_chrom(chrom) 555 | 556 | for start_pos in start_positions[chrom]: 557 | return_dataset[first_empty_row, :, :] = data_chrom[ 558 | start_pos*BIN_SIZE : (start_pos+seq_length)*BIN_SIZE, :] 559 | first_empty_row += 1 560 | 561 | print("At sample number %s..." % first_empty_row) 562 | 563 | assert first_empty_row == num_examples 564 | 565 | # Make sure each base has at most one 1, and that at least one base is not N 566 | assert np.max(np.sum(return_dataset, axis=2)) == 1 567 | 568 | return return_dataset 569 | 570 | 571 | ### Main function code starts here 572 | 573 | # Read dataset metadata 574 | dataset_name = self.dataset_name 575 | X_subsample_target_string = self.X_subsample_target_string 576 | Y_subsample_target_string = self.Y_subsample_target_string 577 | random_seed = self.random_seed 578 | num_examples = self.num_examples 579 | peak_fraction = self.peak_fraction 580 | normalization = self.normalization 581 | marks_in_dataset = self.marks_in_dataset 582 | cell_line = self.cell_line 583 | chroms = self.chroms 584 | 585 | # We always prepare dataset files with the full set of input_marks 586 | input_marks = copy.copy(marks_in_dataset) 587 | 588 | np.random.seed(random_seed) 589 | 590 | full_path = get_base_path(dataset_name, Y_subsample_target_string, normalization) 591 | sub_path = get_base_path(dataset_name, X_subsample_target_string, normalization) 592 | full_peak_path = get_base_path(dataset_name, Y_subsample_target_string, normalization=None, peaks=True) 593 | sub_peak_path = get_base_path(dataset_name, X_subsample_target_string, normalization=None, peaks=True) 594 | 595 | print('input', input_marks) 596 | print('output', output_marks) 597 | print('sub path', sub_path) 598 | print('full path', full_path) 599 | print('sub peak path', sub_peak_path) 600 | print('full peak path', full_peak_path) 601 | 602 | # Sanity check the input 603 | sanity_check() 604 | 605 | # Get a shared list of start positions for both X and Y 606 | # then extract the datasets 607 | start_positions = get_start_positions(full_path, cell_line, chroms) 608 | X = extract_single_dataset(sub_path, start_positions, input_marks) 609 | peakPValueX = extract_single_dataset( 610 | sub_peak_path, 611 | start_positions, 612 | [a for a in input_marks if a != 'INPUT']) 613 | peakBinaryX = extract_binary_peak_dataset( 614 | sub_path, 615 | X_subsample_target_string, 616 | start_positions, 617 | cell_line, 618 | [a for a in input_marks if a != 'INPUT']) 619 | 620 | Y = extract_single_dataset(full_path, start_positions, output_marks) 621 | peakPValueY = extract_single_dataset( 622 | full_peak_path, 623 | start_positions, 624 | [a for a in output_marks if a != 'INPUT']) 625 | peakBinaryY = extract_binary_peak_dataset( 626 | full_path, 627 | Y_subsample_target_string, 628 | start_positions, 629 
| cell_line,
630 |             [a for a in output_marks if a != 'INPUT'])
631 | 
632 | 
633 |         # Sanity check the output
634 |         assert (X.shape[0], X.shape[1]) == (Y.shape[0], Y.shape[1])
635 |         assert (peakPValueX.shape[0], peakPValueX.shape[1]) == (peakPValueY.shape[0], peakPValueY.shape[1])
636 |         assert X.shape[2] == len(input_marks)
637 |         assert peakPValueX.shape[2] + ('INPUT' in input_marks) == len(input_marks)
638 |         assert Y.shape[2] == len(output_marks)
639 |         assert peakPValueY.shape[2] + ('INPUT' in output_marks) == len(output_marks)
640 |         assert(peakPValueY.shape == peakBinaryY.shape)
641 |         assert(peakPValueX.shape == peakBinaryX.shape)
642 | 
643 |         assert X.shape[0] == num_examples
644 |         assert X.shape[1] == seq_length
645 |         assert peakPValueX.shape[0] == num_examples
646 |         assert peakPValueX.shape[1] == seq_length
647 | 
648 |         assert np.all(peakPValueX >= 0)
649 |         assert np.all(peakPValueY >= 0)
650 | 
651 |         # If we only have one output mark, make sure the observed peak fraction is close to the target.
652 |         if len(output_marks) == 1 and output_marks != ['INPUT']:
653 |             midpoint = (seq_length - 1) / 2
654 |             true_peak_fraction = peakBinaryY[:, midpoint, 0].mean()
655 | 
656 |             assert np.abs(true_peak_fraction - peak_fraction) < 1e-2, 'Error: true peak fraction is %2.3f, desired fraction is %2.3f' % (true_peak_fraction, peak_fraction)
657 | 
658 | 
659 |         # Randomize the ordering of the examples so we don't see consecutive elements
660 |         # from the same chromosome
661 |         random_ordering = np.random.permutation(X.shape[0])
662 |         X = X[random_ordering]
663 |         Y = Y[random_ordering]
664 |         peakPValueX = peakPValueX[random_ordering]
665 |         peakPValueY = peakPValueY[random_ordering]
666 |         peakBinaryX = peakBinaryX[random_ordering]
667 |         peakBinaryY = peakBinaryY[random_ordering]
668 | 
669 |         # Downcast to smaller dtypes to save memory and disk space
670 |         X = X.astype('float32')
671 |         Y = Y.astype('float32')
672 |         peakPValueX = peakPValueX.astype('float32')
673 |         peakPValueY = peakPValueY.astype('float32')
674 |         peakBinaryX = peakBinaryX.astype('int8')
675 |         peakBinaryY = peakBinaryY.astype('int8')
676 | 
677 | 
678 | 
679 |         # Write output to disk
680 |         print("Writing output to %s" % dataset_path)
681 | 
682 |         np.savez_compressed(
683 |             dataset_path,
684 |             X=X,
685 |             Y=Y,
686 |             peakPValueX=peakPValueX,
687 |             peakPValueY=peakPValueY,
688 |             peakBinaryX=peakBinaryX,
689 |             peakBinaryY=peakBinaryY)
690 | 
691 |         return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY)
692 | 
693 | 
694 |     def load_genome(self, X_or_Y, marks, only_chr1=False, peaks=False):
695 |         """
696 |         Loads a genome with the appropriate normalization, selecting only chroms present
697 |         in self.chroms.
698 | 
699 |         The only_chr1 flag is provided for convenience, so that code runs faster when we are only
700 |         looking at chr1.
701 | 
702 |         If peaks = True, loads the peak p-values instead.
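        A sketch of typical use (hypothetical mark name):

            signal = dataset.load_genome("X", marks=['H3K27AC'], only_chr1=True)
            # signal['chr1'] is then a num_bins x 1 array of normalized signal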
703 | """ 704 | subsample_target_string = self.get_subsample_target_string(X_or_Y) 705 | 706 | data_path = get_base_path(self.dataset_name, subsample_target_string, self.normalization, peaks=peaks) 707 | 708 | # We only want to return the tracks corresponding to marks 709 | # The genome file has all factors in marks_in_dataset, so we iterate through marks 710 | # to pick out the correct indices 711 | if peaks and ('INPUT' in self.marks_in_dataset): 712 | marks_in_dataset = copy.copy(self.marks_in_dataset) 713 | marks_in_dataset.remove('INPUT') 714 | marks_idx = [] 715 | 716 | for mark in marks: 717 | assert mark in self.marks_in_dataset 718 | marks_idx.append(self.marks_in_dataset.index(mark)) 719 | 720 | with np.load(data_path) as data: 721 | # We have to create a new dictionary for the returned data 722 | # because data is a NpzFile object that does not support item assignment 723 | # We index with marks_idx so that only the correct tracks are returned. 724 | return_data = {} 725 | if only_chr1 == False: 726 | for key in self.chroms: 727 | return_data[key] = data[key][..., marks_idx] 728 | else: 729 | return_data['chr1'] = data['chr1'][..., marks_idx] 730 | 731 | for key in return_data: 732 | assert len(return_data[key].shape) == 2 733 | assert return_data[key].shape[1] == len(marks) 734 | 735 | return return_data 736 | 737 | 738 | def load_binary_genome(self, X_or_Y, marks, only_chr1=False): 739 | """ 740 | Loads a binary genome, selecting only chroms present in self.chroms. 741 | Returns peak_matrices, peak_pval_matrices 742 | where peak_pval_matrices is a dictionary where each key is a chromosome, value is a matrix 743 | which is chrom_length x len(marks) with a zero if there's no peak and a -log10 pvalue otherwise. 744 | peak_matrices is the same but with a 1, not a p-value, for peaks; returned for convenience. 745 | 746 | normalization is passed in only to get the correct metadata. 747 | """ 748 | subsample_target_string = self.get_subsample_target_string(X_or_Y) 749 | 750 | peak_dict = {} 751 | peak_pval_dict = {} 752 | 753 | for mark in marks: 754 | peak_dict[mark], peak_pval_dict[mark] = get_peaks(self.cell_line, mark, subsample_target_string) 755 | 756 | peak_matrices = {} 757 | peak_pval_matrices = {} 758 | 759 | if self.species == 'hg19': 760 | chrom_sizes = HG19_CHROM_SIZES 761 | else: 762 | chrom_sizes = MM9_CHROM_SIZES 763 | chroms_to_use = self.chroms if not only_chr1 else ['chr1'] 764 | 765 | for chromosome in chroms_to_use: 766 | n_bins_in_chrom = int(chrom_sizes[chromosome] / 25.) 
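            # (Note: the hard-coded 25 above presumably mirrors BIN_SIZE from
            # diConstants; using the imported constant would keep the two in sync.)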
767 | peak_matrices[chromosome] = np.zeros([n_bins_in_chrom, len(marks)]) 768 | peak_pval_matrices[chromosome] = np.zeros([n_bins_in_chrom, len(marks)]) 769 | for mark_idx, mark in enumerate(marks): 770 | for i, peak in enumerate(peak_dict[mark][chromosome]): 771 | peak_matrices[chromosome][peak[0]:peak[1], mark_idx] = 1 772 | peak_pval_matrices[chromosome][peak[0]:peak[1], mark_idx] = peak_pval_dict[mark][chromosome][i] 773 | 774 | return peak_matrices, peak_pval_matrices 775 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | 6 | import os 7 | import datetime 8 | import json 9 | import copy 10 | import math 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from keras.models import Sequential, model_from_json 16 | from keras.layers.core import TimeDistributedDense, Activation, Dense, Flatten, Merge 17 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 18 | from keras.layers.recurrent import SimpleRNN, GRU, LSTM 19 | from keras.callbacks import ModelCheckpoint, EarlyStopping 20 | from keras.constraints import maxnorm 21 | from keras.regularizers import l2, activity_l2 22 | 23 | from prepData import generate_bigWig, get_peaks, perform_denormalization, input_not_before_end 24 | from dataset import DatasetEncoder 25 | import evaluations 26 | from dataNormalizer import DataNormalizer 27 | 28 | from diConstants import (BASE_ROOT, MODELS_ROOT, WEIGHTS_ROOT, 29 | RESULTS_ROOT, LOSS_ROOT, HIST_ROOT, EVAL_ROOT, RESULTS_BIGWIG_ROOT, 30 | BIN_SIZE, GENOME_BATCH_SIZE, NUM_BASES) 31 | 32 | 33 | 34 | def pad_sequence_with_zeros(X, padding): 35 | """ 36 | Takes in a matrix X of shape num_bins x num_histone_marks and adds zero padding to the left end 37 | and to the right end. Returns a matrix of shape (num_bins + 2 * padding) x num_histone_marks 38 | """ 39 | 40 | assert len(X.shape) == 2 41 | assert padding >= 0 42 | 43 | num_bins, num_histone_marks = X.shape 44 | P = np.zeros([ 45 | num_bins + 2 * padding, 46 | num_histone_marks]) 47 | 48 | # Say we want to add a padding of 2 on each side of an X that is 101 x 6 49 | # We want P[2:103, :] to be X 50 | P[padding : (num_bins+padding), :] = X 51 | 52 | return P 53 | 54 | 55 | 56 | class SeqModel(object): 57 | """ 58 | Base class from which SeqToPoint derives. (We used this base class to prototype other 59 | approaches; the paper is based only on SeqToPoint.) 60 | 61 | SeqToPoint: 62 | X is 3D with shape num_examples x seq_length x num_input_marks 63 | Y is 2D with shape num_examples x 1 x num_output_marks 64 | 65 | SeqModel implements instance methods: 66 | load_model() 67 | save_model_params() 68 | get_unprocessed_data() 69 | get_processed_data() 70 | train_single_model() 71 | compile_and_train_model() 72 | evaluate_model() 73 | test_model_on_samples() 74 | test_model_on_genome() 75 | 76 | Static method: 77 | instantiate_model() 78 | This is a static method because the __init__ function expects model_params, 79 | and if we're loading a model from a file, we don't know those model_params before we 80 | call this method. 
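    Typical lifecycle (a sketch): SeqModel.instantiate_model(model_params) builds the
    appropriate subclass (whose __init__ saves its params via save_model_params()),
    after which compile_and_train_model() trains it and test_model_on_samples() /
    test_model_on_genome() evaluate it.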
81 | 
82 |     Abstract methods:
83 |         process_X()
84 |         process_Y()
85 |         predict_samples()
86 |         predict_sequence()
87 |     """
88 | 
89 |     def __init__(self, model_params):
90 | 
91 |         # We set the random seed two times in this file:
92 |         # Once here, and once right after loading the training data but before training the model.
93 |         # The reason is that when we try to load the training data, if the dataset doesn't exist yet,
94 |         # we generate and save the training data on the fly. However, the dataset generation code
95 |         # also sets the numpy random seed, so we need to reset it after loading the data.
96 |         # We set a random seed of model_params['random_seed'] here
97 |         # and a random seed of model_params['random_seed'] + 42 right after loading the data.
98 |         np.random.seed(model_params['random_seed'])
99 | 
100 |         self.model_library = model_params['model_library']
101 |         if not (self.model_library in ['keras']):
102 |             raise ValueError, "model_library must be 'keras'"
103 | 
104 |         self.model = None
105 |         self.model_params = model_params
106 |         self.dataset_params = model_params['dataset_params']
107 |         self.train_dataset = model_params['dataset_params']['train_dataset']
108 |         self.test_datasets = model_params['dataset_params']['test_datasets']
109 | 
110 | 
111 |         self.normalizer = DataNormalizer(self.model_params['scale_input'])
112 | 
113 |         # self.model_stamp is the unique identifier for this particular model
114 |         # It looks like "RNN-20150911-175345976535", where the numbers are the date and time
115 |         # that the model was saved, down to the microsecond to avoid race conditions.
116 |         # It is set when the model is saved, and can be read from the filename that
117 |         # the model is saved in.
118 |         self.model_stamp = None
119 | 
120 |         # self.model_path is where the model was saved on disk.
121 |         # It should be in MODELS_ROOT with filename [model_stamp].json
122 |         self.model_path = None
123 | 
124 |         self.final_train_error = None
125 |         self.final_valid_error = None
126 | 
127 |         self.hist = None
128 | 
129 |         self.input_marks = model_params['input_marks']
130 |         self.num_input_marks = len(self.input_marks)
131 | 
132 |         self.output_marks = model_params['output_marks']
133 |         self.num_output_marks = len(self.output_marks)
134 | 
135 |         assert(input_not_before_end(model_params['output_marks']))
136 |         assert(input_not_before_end(model_params['input_marks']))
137 | 
138 |         self.is_output_in_input = True
139 | 
140 | 
141 |         for output_mark in self.output_marks:
142 |             if output_mark not in self.input_marks:
143 |                 self.is_output_in_input = False
144 |                 break
145 | 
146 |         if (self.model_params['predict_binary_output']) and ('INPUT' in self.output_marks):
147 |             raise ValueError, "Cannot predict peaks on INPUT."
148 | 
149 |         self.verbose = True
150 | 
151 |         print("Initialized model with parameters:")
152 |         print(json.dumps(model_params, indent=4, cls=DatasetEncoder))
153 | 
154 | 
155 |     @staticmethod
156 |     def instantiate_model(model_params):
157 |         """
158 |         Given model_params, looks at the model_class in it and
159 |         returns an instance of the appropriate subclass of SeqModel.
160 |         """
161 | 
162 |         if model_params['model_class'] == 'SeqToSeq':
163 |             m = SeqToSeq(model_params)
164 |         elif model_params['model_class'] == 'SeqToPoint':
165 |             m = SeqToPoint(model_params)
166 |         elif model_params['model_class'] == 'PointToPoint':
167 |             m = PointToPoint(model_params)
168 |         else:
169 |             raise ValueError, "model_class must be 'SeqToSeq', 'SeqToPoint', or 'PointToPoint'"
170 |         return m
171 | 
172 |     def load_model(self, model_path):
173 |         """
174 |         Loads a Keras model from disk.
175 | 
176 |         This only works on Keras models.
177 | 178 | The model will need to be compiled before it can be used for training. 179 | 180 | This is currently a weird function: because it's an instance method, 181 | it expects a SeqModel object to already exist. Worse, the SeqModel object 182 | must already be pre-initialized with fake model_params, since the SeqModel constructor 183 | needs model_params to be passed in. 184 | 185 | This should be rewritten when we actually need to use it. 186 | Thankfully, it is not super useful right now - we will only need it 187 | if the model init code changes such that we cannot recover previous models with 188 | current code plus model_params. 189 | """ 190 | 191 | assert self.model_library == 'keras' 192 | 193 | model_JSON_str = open(model_path).read() 194 | model_JSON = json.loads(model_JSON_str) 195 | 196 | self.model = model_from_json(model_JSON_str) 197 | self.model_params = model_JSON['_modelParams'] 198 | self.dataset_params = self.model_params['dataset_params'] 199 | assert self.model_params['model_library'] == 'keras' 200 | 201 | self.num_input_marks = self.model_params['num_input_marks'] 202 | self.num_output_marks = self.model_params['num_output_marks'] 203 | 204 | self.final_train_error = None 205 | self.final_valid_error = None 206 | 207 | self.hist = None 208 | 209 | # self.model_stamp is the unique identifier for this particular model 210 | # It looks like "RNN-20150911-175345976535", where the numbers are the date and time 211 | # that the model was saved, down to the microsecond to avoid race conditions. 212 | self.model_stamp = os.path.splitext( 213 | os.path.basename(model_path))[0] 214 | 215 | self.model_path = model_path 216 | 217 | return None 218 | 219 | 220 | def get_unprocessed_data(self, dataset): 221 | """ 222 | Loads the train or test dataset (as specified in train_or_test) found in self.dataset_params 223 | in its original seq-to-seq form, as returned by extractDataset.load_seq_dataset. 224 | 225 | This function resets the random seed. 226 | """ 227 | X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = dataset.load_seq_dataset( 228 | seq_length=self.dataset_params['seq_length'], 229 | input_marks=self.input_marks, 230 | output_marks=self.output_marks) 231 | 232 | if self.model_params['zero_out_non_bins']: 233 | peakPValueX = peakPValueX * peakBinaryX 234 | peakPValueY = peakPValueY * peakBinaryY 235 | 236 | if ((self.num_input_marks != X.shape[2]) or 237 | (self.num_input_marks != peakPValueX.shape[2] + ('INPUT' in self.input_marks))): 238 | raise Exception, "num_input_marks between model and data needs to agree" 239 | if ((self.num_output_marks != Y.shape[2]) or 240 | (self.num_output_marks != peakPValueY.shape[2] + ('INPUT' in self.output_marks))): 241 | raise Exception, "num_output_marks between model and data needs to agree" 242 | 243 | # See comment in __init__ about random seeds 244 | np.random.seed(self.model_params['random_seed'] + 42) 245 | 246 | return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) 247 | 248 | 249 | def get_processed_data(self, dataset): 250 | """ 251 | Returns the train or test dataset (as specified in train_or_test) found in 252 | self.dataset_params, transformed into a format that the model can directly use. 253 | 254 | Helper functions process_X and process_Y are implemented in subclasses because 255 | different models need differently formatted data, e.g., seq-to-seq vs. seq-to-point. 
256 | 257 | Seq-to-seq takes in: 258 | X: num_examples x seq_length x num_input_marks 259 | Y: num_examples x seq_length x num_output_marks 260 | 261 | Seq-to-point takes in: 262 | X: num_examples x seq_length x num_input_marks 263 | Y: num_examples x 1 x num_output_marks 264 | 265 | Point-to-point takes in: 266 | X: num_examples x (seq_length * num_input_marks) 267 | Y: num_examples x num_output_marks 268 | 269 | """ 270 | 271 | X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = self.get_unprocessed_data(dataset) 272 | 273 | X = self.process_X(X) 274 | Y = self.process_Y(Y) 275 | peakPValueX = self.process_X(peakPValueX) 276 | peakPValueY = self.process_Y(peakPValueY) 277 | peakBinaryX = self.process_X(peakBinaryX) 278 | peakBinaryY = self.process_Y(peakBinaryY) 279 | 280 | return (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) 281 | 282 | 283 | def compile_and_train_model(self): 284 | """ 285 | Trains the model specified by self.model and self.model_params 286 | on the training data given by self.dataset_params. 287 | 288 | If self.model is a Keras model, it also writes out model weights and 289 | training history to disk. 290 | """ 291 | 292 | assert self.model 293 | assert self.model_params 294 | 295 | # Train model 296 | (train_X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) = self.get_processed_data( 297 | self.train_dataset) 298 | 299 | self.normalizer.fit(train_X) 300 | train_X = self.normalizer.transform(train_X) 301 | train_inputs_X = train_X 302 | 303 | if self.model_params['predict_binary_output']: 304 | train_Y = peakBinaryY 305 | else: 306 | train_Y = Y 307 | 308 | 309 | 310 | if self.model_library == 'keras': 311 | 312 | # Compiles model: this sets the optimizer and loss function 313 | self.model.compile(**self.model_params['compile_params']) 314 | 315 | # ModelCheckpoint() is a Keras callback that saves the weights of the model while 316 | # it's being trained. 317 | # save_best_only means that the model weights will be saved after every epoch 318 | # in which the validation error improves. 319 | checkpointer = ModelCheckpoint( 320 | filepath=os.path.join(WEIGHTS_ROOT, '%s-weights.hdf5' % self.model_stamp), 321 | verbose=1, 322 | save_best_only=True) 323 | 324 | # EarlyStopping() is a Keras callback that stops training once the validation loss 325 | # of the model has not improved for [patience] epochs in a row. 326 | earlystopper = EarlyStopping(monitor='val_loss', patience=3, verbose=0) 327 | 328 | self.hist = self.model.fit( 329 | train_inputs_X, 330 | train_Y, 331 | callbacks=[checkpointer, earlystopper], 332 | **self.model_params['train_params']) 333 | 334 | # Store training history for Keras models 335 | # Note that the "final training error" in self.hist.history is only approximate: 336 | # it is averaged over all minibatches in the final epoch. So it's not exactly the 337 | # training error with the final weights. The final validation error is accurate. 338 | 339 | hist_path = os.path.join( 340 | HIST_ROOT, 341 | "%s.hist" % self.model_stamp) 342 | 343 | with open(hist_path, 'w') as f: 344 | f.write(json.dumps(self.hist.history)) 345 | 346 | return None 347 | 348 | 349 | def save_model_params(self): 350 | """ 351 | Writes model to disk, initializing model_stamp and model_path in the process. 352 | This function is called in the __init__ method of derived classes. 353 | 354 | For Keras models, this saves compilation parameters separately 355 | without actually compiling the model to save time. 
Keras model weights are
356 |         saved during training through the ModelCheckpoint() callback, so we can reconstruct
357 |         trained models by separately loading the saved params and the weights. See
358 |         http://keras.io/faq/#how-can-i-save-a-keras-model for more details.
359 |         """
360 | 
361 |         assert self.model
362 |         assert self.model_params
363 | 
364 |         # If it's a Keras model, we save not only model_params but the actual
365 |         # architecture of the model, since the code that constructs models from model_params
366 |         # might change over time.
367 |         if self.model_library == 'keras':
368 |             model_JSON = self.model_params
369 |             model_JSON['_keras_model_params'] = json.loads(self.model.to_json())
370 |             model_JSON_str = json.dumps(model_JSON, cls=DatasetEncoder)
371 | 
372 |             timeStr = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")
373 |             self.model_stamp = "%s-%s" % (self.model_params['model_type'], timeStr)
374 |             self.model_path = os.path.join(MODELS_ROOT, "%s.json" % self.model_stamp)
375 | 
376 |             assert os.path.isfile(self.model_path) == False
377 | 
378 |             with open(self.model_path, 'w') as model_file:
379 |                 model_file.write(model_JSON_str)
380 | 
381 |         return None
382 | 
383 | 
384 |     def test_model_on_samples(self, dataset, train_or_test):
385 |         """
386 |         Evaluates the model on samples drawn from dataset.
387 |         Returns a dictionary with keys 'orig' and 'dn', with values obtained
388 |         from evaluations.compare.
389 | 
390 |         The train_or_test param is just for display.
391 |         """
392 |         assert self.model
393 |         assert train_or_test == 'train' or train_or_test == 'test'
394 | 
395 |         (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) = self.get_unprocessed_data(dataset)
396 |         binaryY = peakBinaryY
397 | 
398 |         Y = self.process_Y(Y)
399 |         peakPValueY = self.process_Y(peakPValueY)
400 |         binaryY = self.process_Y(binaryY)
401 | 
402 |         if not self.model_params['predict_binary_output']:
403 |             print('Bias-only MSE is ', np.mean((Y - np.mean(Y)) ** 2))
404 | 
405 |         # First, compare the true data with the subsampled data
406 |         # To get the "original" error, we just make the prediction that Y = X.
407 |         # Before doing this, we have to call process_Y on X to get it into the right form.
408 |         # This is not a typo! We have to process X in the way that we'd normally process Y.
409 |         # This is needed for SeqToPoint and PointToPoint models, since in those models
410 |         # the X and Y returned from self.get_processed_data have different shapes.
411 |         # Since input_marks might not equal output_marks, we also have to subset the right
412 |         # parts of X to compare.
413 |         # If we're doing de novo imputation then output marks will not be in input marks; if
414 |         # so, we just skip this step.
415 |         orig_results = None
416 | 
417 |         if self.is_output_in_input:
418 |             output_marks_idx = [self.input_marks.index(output_mark) for output_mark in self.output_marks]
419 |             if self.model_params['predict_binary_output']:
420 |                 print("%s samples - Original peaks vs. 
true peaks:" % train_or_test) 421 | orig_results = evaluations.compare( 422 | self.process_Y(peakPValueX[..., output_marks_idx]), 423 | binaryY, 424 | predict_binary_output=True) 425 | else: 426 | print("%s samples - Original:" % train_or_test) 427 | orig_results = evaluations.compare( 428 | self.process_Y(X[..., output_marks_idx]), 429 | Y, 430 | predict_binary_output=False) 431 | 432 | # Then compare the true data with the output of the model 433 | # Process the data properly 434 | X = self.process_X(X) 435 | X = self.normalizer.transform(X) 436 | 437 | # We have to batch the prediction so that the GPU doesn't run out of memory 438 | if 'batch_size' in self.model_params['train_params']: 439 | batch_size = self.model_params['train_params']['batch_size'] 440 | else: 441 | batch_size = 10000 442 | num_examples = X.shape[0] 443 | num_batches = int(math.ceil(1.0 * num_examples / batch_size)) 444 | 445 | # If predict_binary_output is true, then INPUT cannot be in output_marks, so 446 | # Y will have the same shape as binaryY. 447 | # This is not necessarily true if predict_binary_output is false. 448 | # There's no need to branch separately here to initialize Y_pred = np.empty(binaryY.shape) 449 | # if predict_binary_output is true. 450 | Y_pred = np.empty(Y.shape) 451 | 452 | for batch in range(num_batches): 453 | start_idx = batch * batch_size 454 | end_idx = min((batch + 1) * batch_size, num_examples) 455 | Y_pred[start_idx : end_idx] = self.predict_samples(X[start_idx : end_idx]) 456 | 457 | if self.model_params['predict_binary_output']: 458 | print("%s samples - Predicted peaks vs. true peaks:" % train_or_test) 459 | denoised_results = evaluations.compare(Y_pred, binaryY, predict_binary_output=True) 460 | else: 461 | print("%s samples - Denoised:" % train_or_test) 462 | denoised_results = evaluations.compare(Y_pred, Y, predict_binary_output=False) 463 | 464 | samples_results = { 465 | 'orig': orig_results, 466 | 'dn': denoised_results 467 | } 468 | return samples_results 469 | 470 | 471 | def test_model_on_genome(self, dataset): 472 | """ 473 | Evaluates the model on the entire genome in dataset. 474 | Returns a dictionary with keys orig_results and denoised_results, with values obtained 475 | from evaluations.compare. 476 | 477 | This function generates genome-wide predictions for each chromosome in the 478 | test dataset. Blacklisted regions have previously been zero-ed out in prepData. 479 | """ 480 | 481 | assert self.model 482 | 483 | # only_chr1 controls whether genome-wide prediction is done on the whole genome, or just 484 | # on chr1 for speed. 485 | only_chr1 = self.dataset_params['only_chr1'] 486 | # Load data 487 | test_X_all = dataset.load_genome( 488 | "X", 489 | marks=self.input_marks, 490 | only_chr1=only_chr1, 491 | peaks=False) 492 | 493 | if self.model_params['predict_binary_output']: 494 | #if binary, want to use binary peak matrix as Y. 495 | #and noisy peak p-values as baseline. 
496 | assert('INPUT' not in self.output_marks) 497 | test_Y_all, _ = dataset.load_binary_genome( 498 | "Y", 499 | marks=self.output_marks, 500 | only_chr1=only_chr1) 501 | 502 | noisy_peak_pvals_all = dataset.load_genome( 503 | "X", 504 | marks=self.output_marks, 505 | only_chr1=only_chr1, 506 | peaks=True) 507 | 508 | if self.model_params['zero_out_non_bins']: 509 | noisy_peaks_all, _ = dataset.load_binary_genome( 510 | "X", 511 | marks=self.output_marks, 512 | only_chr1=only_chr1) 513 | assert(set(noisy_peak_pvals_all.keys()) == set(noisy_peaks_all.keys())) 514 | for chrom in noisy_peak_pvals_all: 515 | noisy_peak_pvals_all[chrom] = noisy_peak_pvals_all[chrom] * noisy_peaks_all[chrom] 516 | 517 | else: 518 | #otherwise, use continuous non-subsampled signal as Y. 519 | test_Y_all = dataset.load_genome( 520 | "Y", 521 | marks=self.output_marks, 522 | only_chr1=only_chr1, 523 | peaks=False) 524 | 525 | # Load peaks from test cell line 526 | peak_locs_all = {} 527 | for factor in dataset.marks_in_dataset: 528 | if factor == 'INPUT': continue 529 | peak_locs, _ = get_peaks( 530 | dataset.cell_line, 531 | factor, 532 | subsample_target_string=dataset.Y_subsample_target_string) 533 | peak_locs_all[factor] = peak_locs 534 | 535 | 536 | if (test_Y_all.keys() != test_X_all.keys()): 537 | raise Exception, "Subsampled and full data must have the same chroms" 538 | 539 | 540 | chroms = sorted(test_X_all.keys()) 541 | 542 | 543 | ### Compute results separately for each chromosome 544 | 545 | orig_results_all = {} 546 | denoised_results_all = {} 547 | orig_results_peaks = {} 548 | denoised_results_peaks = {} 549 | 550 | preds = {} 551 | 552 | # Warning: the peak comparison code relies on the sequence starting at the start 553 | # of the chromosome. If this is not true, we'd have to offset the peak coordinates before 554 | # passing them into compare(). 555 | if only_chr1: 556 | chroms = ['chr1'] 557 | 558 | for chrom in chroms: 559 | test_X = test_X_all[chrom] 560 | test_Y = test_Y_all[chrom] 561 | if self.model_params['predict_binary_output']: 562 | noisy_peak_pvals = noisy_peak_pvals_all[chrom] 563 | 564 | 565 | if self.dataset_params['num_bins_to_test']: 566 | num_bins_to_test = self.dataset_params['num_bins_to_test'] 567 | assert num_bins_to_test > 0 568 | test_X = test_X[:num_bins_to_test] 569 | test_Y = test_Y[:num_bins_to_test] 570 | if self.model_params['predict_binary_output']: 571 | noisy_peak_pvals = noisy_peak_pvals[:num_bins_to_test] 572 | 573 | assert test_X.shape[0] == test_Y.shape[0], \ 574 | "Subsampled and full data must have the same length" 575 | 576 | if self.model_params['predict_binary_output']: 577 | assert(list(noisy_peak_pvals.shape) == list(test_Y.shape)) 578 | 579 | assert test_X.shape[1] == self.num_input_marks 580 | assert test_Y.shape[1] == self.num_output_marks 581 | 582 | chrom_length = test_X.shape[0] 583 | 584 | ### Get a list of peaks for this chromosome 585 | peaks = [] 586 | if not self.model_params['predict_binary_output']: 587 | for factor in self.output_marks: 588 | # For INPUT, we calculate MSE across peaks of all other marks in the test dataset, 589 | # since we want to get INPUT right whenever there's a peak in some other mark. 590 | # Note that we're purely concatenating peaks from different marks here, 591 | # so there'll be some overlapping peaks. 592 | # This is fine right now but might break later depending on what evaluation code we 593 | # write, so watch out. 
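
# Because the INPUT baseline below just concatenates the peak lists of every
# other mark, the resulting intervals can overlap. If downstream evaluation
# code ever needs disjoint intervals, a standard merge over [start, end)
# bin-index pairs would look something like this (a sketch, not part of this
# repo; assumes the interval format produced by get_peaks in prepData.py):

import numpy as np

def merge_intervals(intervals):
    """Merges overlapping/touching [start, end) intervals. intervals: (n, 2) array."""
    if len(intervals) == 0:
        return np.zeros((0, 2), dtype=int)
    intervals = intervals[np.argsort(intervals[:, 0])]
    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start <= merged[-1][1]:  # overlaps or touches the previous interval
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return np.array(merged)

# merge_intervals(np.array([[5, 10], [8, 20], [25, 50]])) -> [[5, 20], [25, 50]]
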
594 | if factor == 'INPUT':
595 | peak_factor = []
596 | for other_factor in dataset.marks_in_dataset:
597 | if other_factor == 'INPUT': continue
598 | peak_factor.extend(peak_locs_all[other_factor][chrom])
599 | peak_factor = np.array(peak_factor)
600 | else:
601 | peak_factor = peak_locs_all[factor][chrom]
602 | peaks.append(peak_factor)
603 | 
604 | ### Do comparisons between original (subsampled) and full data
605 | # The original comparison is only done if the output mark is actually in the input data
606 | if self.is_output_in_input:
607 | if not self.model_params['predict_binary_output']:
608 | 
609 | output_marks_idx = [self.input_marks.index(output_mark) for output_mark in self.output_marks]
610 | 
611 | print("Test %s, %.2E bins - Original, all signal:" % (chrom, chrom_length))
612 | orig_results_all[chrom] = evaluations.compare(
613 | test_X[:, output_marks_idx],
614 | test_Y,
615 | predict_binary_output=False)
616 | 
617 | print("Test %s, %.2E bins - Original, only peaks:" % (chrom, chrom_length))
618 | orig_results_peaks[chrom] = evaluations.compare(
619 | test_X[:, output_marks_idx],
620 | test_Y,
621 | predict_binary_output=False,
622 | peaks=peaks)
623 | 
624 | elif self.model_params['predict_binary_output']:
625 | print("Test %s, %.2E bins - Original:" % (chrom, chrom_length))
626 | orig_results_all[chrom] = evaluations.compare(
627 | noisy_peak_pvals,
628 | test_Y,
629 | predict_binary_output=True)
630 | 
631 | 
632 | ### Do comparisons between model output and full data
633 | # We have to batch this up so that the GPU doesn't run out of memory
634 | # Assume a fixed batch size of 5M bins
635 | num_batches = int(math.ceil(1.0 * chrom_length / GENOME_BATCH_SIZE))
636 | 
637 | test_Y_pred = np.empty(test_Y.shape)
638 | test_X = self.normalizer.transform(test_X)
639 | 
640 | for batch in range(num_batches):
641 | start_idx = batch * GENOME_BATCH_SIZE
642 | end_idx = min((batch + 1) * GENOME_BATCH_SIZE, chrom_length)
643 | test_Y_pred[start_idx : end_idx] = self.predict_sequence(
644 | test_X[start_idx : end_idx])
645 | 
646 | print("Test %s, %.2E bins - Denoised, all signal:" % (chrom, chrom_length))
647 | denoised_results_all[chrom] = evaluations.compare(
648 | test_Y_pred,
649 | test_Y,
650 | predict_binary_output=self.model_params['predict_binary_output'])
651 | 
652 | if not self.model_params['predict_binary_output']:
653 | print("Test %s, %.2E bins - Denoised, only peaks:" % (chrom, chrom_length))
654 | denoised_results_peaks[chrom] = evaluations.compare(
655 | test_Y_pred,
656 | test_Y,
657 | predict_binary_output=False,
658 | peaks=peaks)
659 | 
660 | 
661 | 
662 | # If we're generating a bigWig file from the output, we need to save the results
663 | # If we're doing regression, we first denormalize the outputs so that they can be viewed
664 | # correctly in the genome browser
665 | if self.model_params['generate_bigWig']:
666 | 
667 | if self.model_params['predict_binary_output']:
668 | preds[chrom] = test_Y_pred
669 | else:
670 | preds[chrom] = perform_denormalization(
671 | test_Y_pred,
672 | dataset.normalization)
673 | 
674 | # Write bigWig file to disk
675 | if self.model_params['generate_bigWig']:
676 | if self.model_params['predict_binary_output']:
677 | suffix = 'peaks'
678 | else:
679 | suffix = 'signal'
680 | 
681 | generate_bigWig(
682 | preds,
683 | self.output_marks,
684 | '%s_%s_subsample-%s_%s' % (
685 | self.model_stamp,
686 | dataset.cell_line,
687 | dataset.X_subsample_target_string,
688 | suffix),
689 | RESULTS_BIGWIG_ROOT)
690 | 
691 | # Construct dict of results 
692 | if self.model_params['predict_binary_output']:
693 | test_genome_results = {
694 | 'orig_all': orig_results_all,
695 | 'dn_all': denoised_results_all
696 | }
697 | else:
698 | test_genome_results = {
699 | 'orig_all': orig_results_all,
700 | 'dn_all': denoised_results_all,
701 | 'orig_peaks': orig_results_peaks,
702 | 'dn_peaks': denoised_results_peaks,
703 | }
704 | 
705 | 
706 | 
707 | print('final results', test_genome_results)
708 | return test_genome_results
709 | 
710 | 
711 | def evaluate_model(self):
712 | """
713 | Evaluates the model on the train and test datasets specified in self.dataset_params.
714 | Writes the results to disk in EVAL_ROOT.
715 | """
716 | # We need to write our own JSON encoder for numpy.float32s
717 | # because the built-in JSON encoder only knows how to encode normal floats
718 | class NumpyEncoder(json.JSONEncoder):
719 | def default(self, obj):
720 | if isinstance(obj, np.floating):
721 | return float(obj)
722 | else:
723 | return super(NumpyEncoder, self).default(obj)
724 | 
725 | # Evaluate model on training data
726 | train_samples_results = self.test_model_on_samples(self.train_dataset, 'train')
727 | train_results = {
728 | 'samples': train_samples_results
729 | }
730 | 
731 | train_eval_path = os.path.join(
732 | EVAL_ROOT,
733 | "%s-train.eval" % self.model_stamp)
734 | 
735 | with open(train_eval_path, 'w') as f:
736 | f.write(json.dumps(train_results, cls=NumpyEncoder))
737 | 
738 | # Evaluate model on testing data
739 | all_test_results = []
740 | for dataset_idx, test_dataset in enumerate(self.test_datasets):
741 | test_samples_results = self.test_model_on_samples(test_dataset, 'test')
742 | 
743 | try:
744 | test_genome_results = self.test_model_on_genome(test_dataset)
745 | except NotImplementedError:
746 | print("Genome-wide prediction hasn't been implemented for this type of model. Skipping...")
747 | test_genome_results = None
748 | 
749 | test_results = {
750 | 'samples': test_samples_results,
751 | 'genome': test_genome_results
752 | }
753 | 
754 | test_eval_path = os.path.join(
755 | EVAL_ROOT,
756 | "%s-test-%s.eval" % (self.model_stamp, dataset_idx))
757 | with open(test_eval_path, 'w') as f:
758 | f.write(json.dumps(test_results, cls=NumpyEncoder))
759 | 
760 | all_test_results.append(test_results)
761 | 
762 | results = {
763 | 'train_samples': train_samples_results,
764 | 'test_results': all_test_results
765 | }
766 | 
767 | return results
768 | 
769 | 
770 | def process_X(self, X):
771 | """
772 | Takes in a matrix X of shape num_examples x seq_length x num_histone_marks,
773 | returned from extractDataset.load_seq_dataset, and processes it as necessary
774 | for the type of model. X should be the input data that is fed to the model.
775 | 
776 | This is implemented in subclasses because different models need differently
777 | formatted data, e.g., seq-to-seq vs. seq-to-point.
778 | """
779 | 
780 | raise NotImplementedError
781 | 
782 | 
783 | def process_Y(self, Y):
784 | """
785 | Takes in a matrix Y of shape num_examples x seq_length x num_histone_marks,
786 | returned from extractDataset.load_seq_dataset, and processes it as necessary
787 | for the type of model. Y should represent the desired output of the model.
788 | 
789 | This is implemented in subclasses because different models need differently
790 | formatted data, e.g., seq-to-seq vs. seq-to-point. 
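
For example, with seq_length = 101, a seq-to-point subclass reduces Y from
shape (num_examples, 101, num_histone_marks) to (num_examples, 1, num_histone_marks)
by keeping only the middle bin, Y[:, 50:51, :] (see SeqToPoint.process_Y below).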
791 | """ 792 | 793 | raise NotImplementedError 794 | 795 | 796 | def SeqToX_predict_samples(self, signalX): 797 | """ 798 | Common code used in the predict_samples() method defined in SeqToSeq and SeqToPoint subclasses. 799 | """ 800 | 801 | num_examples = signalX.shape[0] 802 | 803 | assert len(signalX.shape) == 3 804 | assert signalX.shape[0] == num_examples 805 | assert signalX.shape[1] == self.dataset_params['seq_length'] 806 | assert signalX.shape[2] == self.num_input_marks 807 | 808 | 809 | Y = self.model.predict(signalX) 810 | 811 | assert Y.shape[0] == num_examples 812 | assert Y.shape[2] == self.num_output_marks 813 | 814 | return Y 815 | 816 | 817 | def predict_samples(self, signalX): 818 | """ 819 | Takes in input signalX of whatever dimensions are needed for the model, which 820 | is subclass-dependent. It passes it through the model and returns the output matrix. 821 | """ 822 | 823 | raise NotImplementedError 824 | 825 | 826 | def predict_sequence(self, signalX): 827 | """ 828 | Takes in input matrix signalX of dimensions num_bins x num_input_marks 829 | and passes it through the model, 830 | returning an output matrix of num_bins x num_output_marks. 831 | """ 832 | 833 | raise NotImplementedError 834 | 835 | 836 | 837 | 838 | class SeqToPoint(SeqModel): 839 | 840 | def __init__(self, model_params): 841 | """ 842 | Initializes the correct model based on model_params. 843 | """ 844 | 845 | super(SeqToPoint, self).__init__(model_params) 846 | 847 | assert self.dataset_params['seq_length'] % 2 == 1, "seq_length must be odd for SeqToPoint models." 848 | 849 | if model_params['model_type'] == 'cnn': 850 | 851 | num_filters = model_params['num_filters'] 852 | filter_length = model_params['filter_length'] 853 | 854 | model = Sequential() 855 | 856 | # border_mode='same' makes the length of the output 857 | # the same size as the length of the input 858 | # by adding just the right amount of zero padding to each side. 859 | model.add( 860 | Convolution1D( 861 | num_filters, 862 | filter_length, 863 | input_dim=self.num_input_marks, 864 | init='uniform', 865 | border_mode='same')) 866 | 867 | model.add(Activation('relu')) 868 | 869 | # See below for documentation on border_mode='valid' 870 | # We are essentially replicating the "dense" layer here, but with a convolutional layer 871 | # so that later we can do genome-wide prediction. 872 | model.add( 873 | Convolution1D( 874 | self.num_output_marks, # output_dim, 875 | self.dataset_params['seq_length'], 876 | init='uniform', 877 | border_mode='valid')) 878 | 879 | if model_params['predict_binary_output']: 880 | model.add(Activation('sigmoid')) 881 | else: 882 | model.add(Activation('relu')) 883 | 884 | # 'lrnn' stands for linear regression neural network 885 | # It is a single convolutional layer with filters that span the entire seq length. 886 | # Essentially, this replicates linear or logistic regression in the Keras framework. 887 | # border_mode='valid' means that it only does convolutions where the whole filter can fit in the sequence 888 | # so effectively it is only doing one convolution/feedforward operation during training. 889 | # We make it convolutional so that we can easily do genome-wide predictions later. 890 | # It has as many neurons as there are histone marks, that is, there is one filter per histone mark. 891 | # This way, each histone mark gets seq_length * num_input_marks parameters to make a linear prediction. 
892 | elif model_params['model_type'] == 'lrnn':
893 | model = Sequential()
894 | 
895 | model.add(
896 | Convolution1D(
897 | self.num_output_marks, # nb_filter: one filter per histone mark
898 | self.dataset_params['seq_length'], # filter_length
899 | input_dim=self.num_input_marks,
900 | border_mode='valid'))
901 | 
902 | if model_params['predict_binary_output']:
903 | model.add(Activation('sigmoid'))
904 | 
905 | else:
906 | raise Exception, "Model type not recognized"
907 | 
908 | self.model = model
909 | self.save_model_params()
910 | 
911 | 
912 | def process_X(self, X):
913 | """
914 | See documentation in SeqModel.
915 | Input to seq-to-point models needs no further processing from load_seq_dataset.
916 | """
917 | 
918 | return X
919 | 
920 | 
921 | def process_Y(self, Y):
922 | """
923 | See documentation in SeqModel.
924 | Takes in matrix Y of shape num_examples x seq_length x num_histone_marks
925 | and returns matrix of shape num_examples x 1 x num_histone_marks, selecting the
926 | middle of the sequence.
927 | 
928 | We want the singleton dimension so that we can avoid flattening the output of the
929 | model in Keras. This doesn't matter in training, but it does in testing when we
930 | are trying to do genome-wide predictions.
931 | """
932 | 
933 | # If seq_length is 101
934 | # then the array goes from 0 to 100
935 | # and we want to pick mid = 50
936 | mid = (self.dataset_params['seq_length'] - 1) / 2
937 | 
938 | # Y = np.squeeze(Y[:, mid, :])
939 | # return Y
940 | 
941 | return Y[:, mid:mid+1, :]
942 | 
943 | 
944 | def predict_samples(self, signalX):
945 | """
946 | Takes in input matrix signalX, of shape num_examples x seq_length x num_input_marks
947 | and feeds it through the model, returning an output matrix of shape
948 | num_examples x 1 x num_output_marks.
949 | """
950 | Y = self.SeqToX_predict_samples(signalX)
951 | assert Y.shape[1] == 1
952 | 
953 | return Y
954 | 
955 | 
956 | def predict_sequence(self, signalX):
957 | """
958 | Takes in input matrix signalX of dimensions num_bins x num_input_marks
959 | and passes it through the model,
960 | returning an output matrix of num_bins x num_output_marks.
961 | """
962 | if ('lrnn' not in self.model_params['model_type']) and ('cnn' not in self.model_params['model_type']):
963 | raise NotImplementedError
964 | 
965 | # We have to do some zero-padding on the input sequences before we pass them to the
966 | # convolutional models defined in SeqToPoint.
967 | # This is because the final layer of these conv nets is a 'valid' convolution with
968 | # filter_length = seq_length. This means that the output of that layer, and therefore the
969 | # model, will be (seq_length - 1) shorter than the input to that layer. This is necessary
970 | # for training, since in training the input is a sequence whereas the output is a single
971 | # bin in the middle of the sequence. However, when trying to do genome-wide prediction,
972 | # we need the output shape to match the input shape.
973 | 
974 | # Warning: this code assumes that the final layer of the conv net is a 'valid' conv with
975 | # filter_length = seq_length.
976 | num_bins = signalX.shape[0]
977 | 
978 | # Initially, the shape of signalX is num_bins x num_input_marks.
979 | # We add (seq_length - 1) / 2 zeroes to both sides of the input, so that the
980 | # resulting shape of the padded input is (num_bins + seq_length - 1) x num_input_marks.
981 | # The shape of the output will then be exactly num_bins x num_output_marks. 
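
# pad_sequence_with_zeros is defined elsewhere in the repo and is not shown in
# this excerpt. Assuming it simply zero-pads along the bin axis (which is what
# the shape arithmetic above requires), a minimal np.pad equivalent would be:

import numpy as np

def pad_sequence_with_zeros_sketch(signal, padding):
    """Zero-pads a (num_bins, num_marks) array with `padding` bins on each side."""
    return np.pad(signal, ((padding, padding), (0, 0)), mode='constant')

# e.g. a (10, 3) array with padding = 50 becomes (110, 3)
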
982 | assert len(signalX.shape) == 2 983 | assert signalX.shape[1] == self.num_input_marks 984 | signalX_pad = pad_sequence_with_zeros( 985 | signalX, 986 | padding=(self.dataset_params['seq_length'] - 1) / 2) 987 | 988 | # After padding, we reshape the input to fit the Keras predict() API, 989 | # which requires a 3-tensor where the first dimension is the number of examples. 990 | # In our case, the number of examples is always 1 when doing genome-wide prediction. 991 | signalX = np.reshape( 992 | signalX_pad, 993 | [1, signalX_pad.shape[0], signalX_pad.shape[1]]) 994 | 995 | Y = self.model.predict(signalX) 996 | Y = Y[0] 997 | 998 | assert Y.shape[0] == num_bins 999 | assert Y.shape[1] == self.num_output_marks 1000 | 1001 | return Y 1002 | 1003 | 1004 | 1005 | -------------------------------------------------------------------------------- /prepData.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | from __future__ import unicode_literals 5 | 6 | from subprocess import call, check_output 7 | import os 8 | import json 9 | from traceback import print_exc 10 | import signal 11 | import sys 12 | import pandas as pd 13 | from time import time, sleep 14 | import numpy as np 15 | import multiprocessing 16 | import thread 17 | import gzip 18 | 19 | import IPython 20 | 21 | from diConstants import (PIPELINE_ROOT, CODE_ROOT, DATA_ROOT, RAW_ROOT, MERGED_ROOT, REMOTE_ROOT, 22 | SUBSAMPLED_ROOT, BIGWIGS_ROOT, INTERVALS_ROOT, NUMPY_ROOT, BASE_ROOT, BASE_BIGWIG_ROOT, 23 | RESULTS_BIGWIG_ROOT, MODELS_ROOT, 24 | HG19_BLACKLIST_FILE, MM9_BLACKLIST_FILE, 25 | BIN_SIZE, HG19_CHROM_SIZES, HG19_CHROM_SIZES_PATH, MM9_CHROM_SIZES, MM9_CHROM_SIZES_PATH, 26 | PEAK_BASE_DIR, COMBINED_PEAK_DIR, SUBSAMPLE_TARGETS, 27 | GM_CELL_LINES, GM_FACTORS, GM_DATASET_NAME_TEMPLATE, 28 | HG19_ALL_CHROMS, MM9_ALL_CHROMS, 29 | MAPQ_THRESHOLD) 30 | 31 | 32 | def perform_normalization(X, normalization): 33 | """ 34 | Normalizes a dataset using a method in ['log', 'arcsinh', None]. If none, just returns original dataset. 35 | """ 36 | assert(normalization in ['log', 'arcsinh', None]) 37 | if set(X.flatten()) == set([1.0, 0.0]): 38 | assert(normalization is None) 39 | 40 | if normalization in ['arcsinh', 'log']: 41 | if normalization == 'arcsinh': 42 | X = np.arcsinh(X) 43 | else: 44 | X = np.log(X + 1) 45 | print('Normalization: took %s of data. Mean is now %2.3f, max %2.3f' % (normalization, np.mean(X), np.max(X))) 46 | return X 47 | 48 | 49 | def perform_denormalization(X, normalization): 50 | """ 51 | Denormalizes a dataset using a method in ['log', 'arcsinh', None]. If none, just returns original dataset. 52 | """ 53 | 54 | assert(normalization in ['log', 'arcsinh', None]) 55 | if set(X.flatten()) == set([1.0, 0.0]): 56 | assert(normalization is None) 57 | if normalization in ['arcsinh', 'log']: 58 | if normalization == 'arcsinh': 59 | X = np.sinh(X) 60 | else: 61 | X = np.exp(X) - 1 62 | print('Denormalization: took inverse %s of data. Mean is now %2.3f, max %2.3f' % (normalization, np.mean(X), np.max(X))) 63 | return X 64 | 65 | 66 | def check_npz_files(): 67 | """ 68 | This confirms that we can load all the .npz files in BASE_DIR (for some reason they were getting corrupted.) 
69 | """ 70 | desired_keys = ['chr' + str(i) for i in range(1, 23)] 71 | n_successes = n_errors = 0 72 | for f in os.listdir(BASE_ROOT): 73 | if '.npz' not in f: 74 | continue 75 | try: 76 | d = np.load(os.path.join(BASE_ROOT, f)) 77 | n_successes += 1 78 | assert(sorted(d.keys()) == sorted(HG19_ALL_CHROMS)) # This check will fail on mouse 79 | except: 80 | n_errors += 1 81 | os.remove(os.path.join(BASE_ROOT, f)) 82 | print('Error with ' + f) 83 | continue 84 | print('successes', n_successes, 'errors', n_errors) 85 | 86 | 87 | def get_peaks(cell_line, factor, subsample_target_string): 88 | """ 89 | chrs_to_peaks: a dictionary whose keys are chromosomes which map to an array of bin starts and ends 90 | indices (not chromosome locations) which are peaks. 91 | Eg, {'chr1':[[5, 10], [25, 50]]} means bins 5 - 9 and 25 - 49 on chromosome 1 are peaks. 92 | When computing peak boundaries, rounds (ie, a peak beginning at bin .6 = a bin beginning at bin 1.) 93 | peak_log_pvalues: a dictionary whose keys are chromosomes which map to an array of peak log pvalues 94 | in the same order as the peaks in chrs_to_peaks. 95 | Eg, {'chr1':[99, 104]} means the peaks in chr1 have log10 pvalues 99 and 104, respectively. 96 | """ 97 | 98 | peak_path = get_peak_path(cell_line, factor, subsample_target_string) 99 | if not os.path.isfile(peak_path): 100 | raise ValueError, "%s does not exist." % peak_path 101 | 102 | d = pd.read_csv(peak_path, sep = '\t', header = None) 103 | d = d[[0, 1, 2, 13]] 104 | 105 | d.columns = ['chr', 'start', 'end', 'log10_pvalue'] 106 | chrs = list(set(d['chr'])) 107 | chrs_to_peaks = {} 108 | peak_log_pvalues = {} 109 | for chrom in chrs: 110 | idxs = d['chr'] == chrom 111 | chrs_to_peaks[chrom] = np.array(zip(list(d.loc[idxs]['start']), list(d.loc[idxs]['end']))) 112 | chrs_to_peaks[chrom] = np.around(chrs_to_peaks[chrom] / BIN_SIZE).astype(int) 113 | peak_log_pvalues[chrom] = np.array(d.loc[idxs]['log10_pvalue']) 114 | assert(len(peak_log_pvalues[chrom]) == len(chrs_to_peaks[chrom])) 115 | return chrs_to_peaks, peak_log_pvalues 116 | 117 | 118 | 119 | def generate_bigWig(data, marks, bigWig_prefix, bigWig_folder): 120 | """ 121 | Takes in data, a dictionary with keys corresponding to chromosomes 122 | and each chromosome being a matrix of shape num_bins x num_histone_marks 123 | and outputs bigWigs generated from that data in bigWig_folder, 124 | one for each factor in FACTORS_TO_INCLUDE 125 | """ 126 | 127 | assert data[data.keys()[0]].shape[1] == len(marks) 128 | chrom_sizes_path = HG19_CHROM_SIZES_PATH 129 | 130 | for (factorIdx, factor) in enumerate(marks): 131 | 132 | wig_path = os.path.join(bigWig_folder, '%s_%s.wig' % (bigWig_prefix, factor)) 133 | bigWig_path = os.path.join(bigWig_folder, '%s_%s.bw' % (bigWig_prefix, factor)) 134 | 135 | with open(wig_path, 'w') as f: 136 | for chrom in data: 137 | 138 | f.write('fixedStep chrom=%s start=1 step=%s span=%d\n' % (chrom, BIN_SIZE, BIN_SIZE)) 139 | 140 | for i in data[chrom][:, factorIdx]: 141 | f.write('%s\n' % str(i)) 142 | 143 | call('bash scripts/convertWigToBigWig.sh %s %s %s' % (wig_path, bigWig_path, chrom_sizes_path), 144 | shell=True) 145 | 146 | return None 147 | 148 | 149 | def get_blacklisted_locs(cell_line): 150 | """ 151 | Returns a dictionary whose keys are chromosomes which map to an array of bin starts and ends 152 | indices (not chromosome locations) to exclude: does not include upper end of range (in line with numpy indexing conventions). 
153 | Eg, {'chr1':[[5, 10], [25, 50]]} means we should exclude bins 5 - 9 and 25 - 49 on chromosome 1. 154 | """ 155 | if get_species(cell_line) == 'mm9': 156 | blacklist_file = MM9_BLACKLIST_FILE 157 | else: 158 | blacklist_file = HG19_BLACKLIST_FILE 159 | 160 | d = pd.read_csv(blacklist_file, sep = "\t") 161 | blacklist_dictionary = {} 162 | for i in range(len(d)): 163 | chrom = d.iloc[i]['chromosome'] 164 | start = d.iloc[i]['start'] 165 | end = d.iloc[i]['end'] 166 | if chrom not in blacklist_dictionary: 167 | blacklist_dictionary[chrom] = [] 168 | blacklist_dictionary[chrom].append([int(1.*start / BIN_SIZE), int(1. * end / BIN_SIZE) + 1]) 169 | 170 | return blacklist_dictionary 171 | 172 | 173 | def get_merged_BAM_path(cell_line, factor): 174 | """ 175 | Returns the path to the BAM file that contains all merged replicates 176 | for a given cell_line and factor. 177 | """ 178 | 179 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bam' % (cell_line, factor)) 180 | 181 | 182 | def get_merged_BED_SE_path(cell_line, factor): 183 | """ 184 | Returns the path to the BED file that contains all merged replicates 185 | for a given cell_line and factor. This is for single-end reads. 186 | These BED files have already been filtered for MAPQ. 187 | """ 188 | 189 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bed' % (cell_line, factor)) 190 | 191 | 192 | def get_merged_BED_path(cell_line, factor): 193 | """ 194 | Returns the path to the BEDPE file that contains all merged replicates 195 | for a given cell_line and factor. 196 | These BEDPE files have already been filtered for MAPQ and properly paired reads. 197 | """ 198 | 199 | return os.path.join(MERGED_ROOT, '%s-%s_merged.bedpe' % (cell_line, factor)) 200 | 201 | 202 | def get_tagAlign_path(cell_line, factor, subsample_target_string = None): 203 | """ 204 | Returns the path to the tagAlign file that contains all merged replicates 205 | for a given cell_line and factor. 206 | These tagAlign files have already been filtered for MAPQ and properly paired reads. 207 | 208 | If subsample_target_string is specified, return a subsampled tagAlign instead. 209 | """ 210 | 211 | if subsample_target_string: 212 | return os.path.join(SUBSAMPLED_ROOT, '%s-%s_subsample-%s.tagAlign.gz' % (cell_line, factor, subsample_target_string)) 213 | else: 214 | return os.path.join(MERGED_ROOT, '%s-%s_merged.tagAlign.gz' % (cell_line, factor)) 215 | 216 | 217 | def get_bigWig_folder(cell_line, factor, subsample_target_string = None): 218 | """ 219 | Returns the name of the output folder where bigWigs for a given cell_line, factor, 220 | and optionally subsample_target_string should be placed. 221 | This output folder is passed to the ENCODE CHiP-seq pipeline. 
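
For example (hypothetical values, assuming BIGWIGS_ROOT is '/data/bigwigs'):

>>> get_bigWig_folder('GM12878', 'H3K27AC', '0.5e6')
'/data/bigwigs/GM12878-H3K27AC_subsample-0.5e6'
>>> get_bigWig_folder('GM12878', 'H3K27AC')
'/data/bigwigs/GM12878-H3K27AC_merged'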
222 | """ 223 | 224 | if subsample_target_string: 225 | return os.path.join(BIGWIGS_ROOT, '%s-%s_subsample-%s' % (cell_line, factor, subsample_target_string)) 226 | else: 227 | return os.path.join(BIGWIGS_ROOT, '%s-%s_merged' % (cell_line, factor)) 228 | 229 | 230 | def get_peak_path(cell_line, factor, subsample_target_string): 231 | assert(factor != 'INPUT') 232 | if subsample_target_string: 233 | subsample_output_string = "subsample-%s" % subsample_target_string 234 | else: 235 | subsample_output_string = "merged" 236 | 237 | return os.path.join( 238 | PEAK_BASE_DIR, 239 | 'peak', 240 | 'macs2', 241 | 'rep1', 242 | '%s-%s_%s' % (cell_line, factor, subsample_output_string) + 243 | '.tagAlign_x_%s-INPUT_%s.tagAlign.gappedPeak.gz' % (cell_line, subsample_output_string)) 244 | 245 | 246 | def get_peak_bigWig_path(cell_line, factor, subsample_target_string = None): 247 | """ 248 | Returns the path to the bigWig file that contains the peak p-values 249 | for a given cell_line, factor, and optionally 250 | subsample_target_string. 251 | """ 252 | if subsample_target_string: 253 | subsample_output_string = "subsample-%s" % subsample_target_string 254 | else: 255 | subsample_output_string = "merged" 256 | 257 | return os.path.join( 258 | PEAK_BASE_DIR, 259 | 'signal', 260 | 'macs2', 261 | 'rep1', 262 | '%s-%s_%s' % (cell_line, factor, subsample_output_string) + 263 | '.tagAlign_x_%s-INPUT_%s.tagAlign.pval.signal.bw' % (cell_line, subsample_output_string)) 264 | 265 | 266 | def get_bigWig_path(cell_line, factor, subsample_target_string = None): 267 | """ 268 | Returns the path to the bigWig file that contains the output of align2rawsignal 269 | (from the ENCODE CHiP-seq pipeline) for a given cell_line, factor, and optionally 270 | subsample_target_string. 271 | """ 272 | 273 | if subsample_target_string: 274 | return os.path.join( 275 | BIGWIGS_ROOT, 276 | '%s-%s_subsample-%s' % (cell_line, factor, subsample_target_string), 277 | 'signal', 278 | 'tag2bw', 279 | 'rep1', 280 | '%s-%s_subsample-%s.bigwig' % (cell_line, factor, subsample_target_string)) 281 | 282 | else: 283 | return os.path.join( 284 | BIGWIGS_ROOT, 285 | '%s-%s_merged' % (cell_line, factor), 286 | 'signal', 287 | 'tag2bw', 288 | 'rep1', 289 | '%s-%s_merged.bigwig' % (cell_line, factor)) 290 | 291 | 292 | def get_intervals_path(chrom, species): 293 | """ 294 | Returns the path to the intervals BED file for a given chromosome. 295 | 296 | This BED file contains equally spaced intervals at BIN_SIZE.""" 297 | 298 | assert species in ['hg19', 'mm9'] 299 | return os.path.join(INTERVALS_ROOT, '%s_%s_%s.bed' % (species, chrom, BIN_SIZE)) 300 | 301 | 302 | def get_numpy_path(cell_line, factor, chrom, subsample_target_string=None): 303 | """ 304 | Returns the path of the numpy array containing the binned signal for a given cell_line, factor, 305 | and optionally subsample_target_string. 306 | """ 307 | 308 | if subsample_target_string: 309 | return os.path.join(NUMPY_ROOT, '%s-%s-%s_subsample-%s.npy' % (cell_line, factor, chrom, subsample_target_string)) 310 | 311 | else: 312 | return os.path.join(NUMPY_ROOT, '%s-%s-%s_merged.npy' % (cell_line, factor, chrom)) 313 | 314 | def get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string=None): 315 | """ 316 | Returns the path of the numpy array containing the binned peak p-value signal for a given cell_line, factor, 317 | and optionally subsample_target_string. 
318 | """ 319 | assert(factor != 'INPUT') 320 | if subsample_target_string: 321 | return os.path.join(NUMPY_ROOT, 'peak_pvals_by_bin_%s-%s-%s_subsample-%s.npy' % (cell_line, factor, chrom, subsample_target_string)) 322 | 323 | else: 324 | return os.path.join(NUMPY_ROOT, 'peak_pvals_by_bin_%s-%s-%s_merged.npy' % (cell_line, factor, chrom)) 325 | 326 | def get_base_path(dataset_name, subsample_target_string, normalization, peaks=False): 327 | """ 328 | If peaks is True, returns the base path for the peak pvals; otherwise, returns base path for continuous signal. 329 | 330 | Normalization is always set to None if peaks is True. 331 | """ 332 | if peaks: 333 | return os.path.join(BASE_ROOT, 'peak_pvals_by_bin_%s_subsample-%s_norm-None.npz' % 334 | (dataset_name, subsample_target_string)) 335 | else: 336 | return os.path.join(BASE_ROOT, '%s_subsample-%s_norm-%s.npz' % 337 | (dataset_name, subsample_target_string, normalization)) 338 | 339 | 340 | def get_metadata_path(dataset_name, subsample_target_string, normalization): 341 | return os.path.join(BASE_ROOT, '%s_subsample-%s_norm-%s.metadata' % 342 | (dataset_name, subsample_target_string, normalization)) 343 | 344 | 345 | def merge_BAMs(cell_lines_to_use, factors_to_use): 346 | """ 347 | Takes a remote directory (REMOTE_ROOT) containing several different cell lines, marks, and 348 | replicates, copies the data over to a local directory (RAW_ROOT), then combines all replicates 349 | for each pair of cell lines and marks. Outputs to MERGED_ROOT. 350 | Only looks at cell lines that are in cell_lines_to_use and marks that are in factors_to_use. 351 | 352 | Operates on raw data available at http://gbsc-share.stanford.edu/chromovar/rawdata/ 353 | """ 354 | 355 | cell_mark_pairs = set() 356 | cell_mark_name_triples = [] 357 | all_cmds = [[]] 358 | 359 | # First, copy files over from REMOTE_ROOT (/mnt/data...) to RAW_ROOT 360 | for f in os.listdir(REMOTE_ROOT): 361 | if (os.path.isfile(os.path.join(REMOTE_ROOT, f)) and f.startswith('SNYDER_HG19_') 362 | and f.endswith('.dedup.bam')): 363 | 364 | spl = f.split('_') 365 | cell_line = spl[2] 366 | if cell_line not in cell_lines_to_use: 367 | continue 368 | 369 | mark = spl[3] 370 | if mark not in factors_to_use: 371 | continue 372 | 373 | all_cmds[0].append('cp %s %s' % (os.path.join(REMOTE_ROOT, f), RAW_ROOT)) 374 | 375 | cell_mark_pairs.add((cell_line, mark)) 376 | cell_mark_name_triples.append((cell_line, mark, f)) 377 | 378 | # Then process all files in RAW_ROOT 379 | for (cell, mark) in cell_mark_pairs: 380 | 381 | # How many replicates does this (cell, mark) pair have? 382 | count = 0 383 | filename = '' 384 | for (c, m, f) in cell_mark_name_triples: 385 | if cell == c and mark == m: 386 | count += 1 387 | filename = f 388 | assert count > 0 389 | 390 | if count == 1: 391 | print("%s-%s has no replicates. Copying straight..." % (cell, mark)) 392 | all_cmds[-1].append("cp %s %s;" % (os.path.join(RAW_ROOT, filename), get_merged_BAM_path(cell, mark))) 393 | 394 | else: 395 | print("%s-%s has %s replicates. Merging..." 
% (cell, mark, count)) 396 | all_cmds[-1].append("samtools merge %s %s/*%s_%s*.bam" % \ 397 | (get_merged_BAM_path(cell, mark), RAW_ROOT, cell, mark)) 398 | return all_cmds 399 | 400 | 401 | def filter_and_convert_BAMs(cell_lines_to_use, factors_to_use): 402 | """ 403 | Looks at all merged BAM files in MERGED_ROOT, and for each BAM file, 404 | filters out all reads below MAPQ 30 and all reads that aren't paired properly, 405 | and then outputs a tagAlign.gz file with only the filtered reads 406 | in the same MERGED_ROOT folder. 407 | """ 408 | all_cmds = [[], []] 409 | for cell_line in cell_lines_to_use: 410 | for factor in factors_to_use: 411 | 412 | BAM_path = get_merged_BAM_path(cell_line, factor) 413 | tagAlign_path = get_tagAlign_path(cell_line, factor) 414 | 415 | if os.path.isfile(BAM_path): 416 | BED_path = get_merged_BED_path(cell_line, factor) 417 | all_cmds[0].append("bash scripts/filterAndConvertBAMs.sh %s %s %s" % (BAM_path, BED_path, MAPQ_THRESHOLD)) 418 | all_cmds[1].append("bash scripts/convertBEDPEtoTagAlign.sh %s %s" % (BED_path, tagAlign_path)) 419 | else: 420 | print("Warning: %s does not exist. Skipping..." % BAM_path) 421 | return all_cmds 422 | 423 | 424 | def subsample_BAMs(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 425 | """ 426 | For each cell_line and factor, subsamples the corresponding BEDPE file to 427 | the desired depths. Outputs in SUBSAMPLED_ROOT a tagAlign.gz file for each 428 | (cell_line, factor, subsample_target) combination. 429 | """ 430 | all_cmds = [[]] 431 | for cell_line in cell_lines_to_use: 432 | for factor in factors_to_use: 433 | 434 | subsample_input = get_merged_BED_path(cell_line, factor) 435 | full_reads = int(float(check_output('wc -l %s' % subsample_input, shell=True).split(' ')[0])) 436 | # subsample_command = "" 437 | 438 | for subsample_target_string in subsample_targets_to_use: 439 | 440 | if subsample_target_string == None: 441 | continue 442 | 443 | subsample_target = int(float(subsample_target_string)) 444 | 445 | if full_reads < subsample_target: 446 | print("Warning: %s-%s only has %s read pairs, less than subsampling target of %s. Skipping..." % 447 | (cell_line, factor, full_reads, subsample_target_string)) 448 | continue 449 | 450 | print("Subsampling %s-%s: %s read pairs from %s read pairs" % (cell_line, factor, subsample_target_string, full_reads)) 451 | 452 | subsample_output = get_tagAlign_path(cell_line, factor, subsample_target_string) 453 | 454 | # if subsample_command != "": 455 | # subsample_command += '; ' 456 | 457 | cmd = "bash scripts/subsampleBEDPEs.sh %s %s %s" % (subsample_input, subsample_output, subsample_target) 458 | # subsample_command += cmd 459 | 460 | all_cmds[0].append(cmd) 461 | 462 | # subsample_command = "(" + subsample_command + ") &" 463 | #call(subsample_command, shell=True) 464 | return all_cmds 465 | 466 | 467 | def get_chrom_sizes(cell_line): 468 | if get_species(cell_line) == 'mm9': 469 | return MM9_CHROM_SIZES 470 | else: 471 | return HG19_CHROM_SIZES 472 | 473 | def get_species(cell_line): 474 | if 'MOUSE' in cell_line: 475 | return 'mm9' 476 | else: 477 | return 'hg19' 478 | 479 | def get_signal_tracks(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 480 | """ 481 | Calls the ENCODE CHiP-seq pipeline on the tagAlign files for all 482 | cell lines, factors, and subsample targets (including the full data). 483 | Outputs in BIGWIGS_ROOT a .bigWig file for each 484 | (cell_line, factor, subsample_target) combination. 
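
Each generated command invokes scripts/getSignalTrack.sh; a representative
command (with hypothetical paths) looks like

bash scripts/getSignalTrack.sh /path/to/pipeline /data/merged/GM12878-H3K27AC_merged.tagAlign.gz /data/bigwigs/GM12878-H3K27AC_merged hg19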
485 | """ 486 | all_cmds = [[]] 487 | for cell_line in cell_lines_to_use: 488 | species = get_species(cell_line) 489 | for factor in factors_to_use: 490 | 491 | chrom_sizes = get_chrom_sizes(cell_line) 492 | 493 | # This gets signal tracks from both full and subsampled data 494 | # because None is an element of SUBSAMPLE_TARGETS 495 | 496 | signal_command = "" 497 | 498 | for subsample_target_string in subsample_targets_to_use: 499 | tagAlign_path = get_tagAlign_path(cell_line, factor, subsample_target_string) 500 | bigWig_folder = get_bigWig_folder(cell_line, factor, subsample_target_string) 501 | 502 | if os.path.isfile(tagAlign_path): 503 | files_already_exist = check_whether_BW_files_exist( 504 | cell_line, 505 | factor, 506 | subsample_target_string, 507 | average_peaks=False) 508 | 509 | if files_already_exist: 510 | print('Bigwig files already exist for %s; skipping.' % bigWig_folder) 511 | else: 512 | print('Bigwig files DO NOT exist for %s; adding to tasks.' % bigWig_folder) 513 | if signal_command != "": 514 | signal_command += '; ' 515 | cmd = "bash scripts/getSignalTrack.sh %s %s %s %s" % (PIPELINE_ROOT, tagAlign_path, bigWig_folder, species) 516 | signal_command += cmd 517 | all_cmds[0].append(cmd) 518 | else: 519 | print("Warning: %s does not exist. Skipping..." % tagAlign_path) 520 | 521 | signal_command = "(" + signal_command + ") &" 522 | 523 | #call(signal_command, shell=True) 524 | return all_cmds 525 | 526 | def make_intervals(species): 527 | """ 528 | Constructs BED files, one for each chromosome, each containing equally 529 | spaced intervals at BIN_SIZE. 530 | 531 | The third column of the BED file is exclusive, i.e., the interval is 532 | actually [start, end). So for a BIN_SIZE of size 25 the intervals will look like 533 | chr1 0 25 534 | chr2 25 50 535 | ... 536 | 537 | For convenience, here is the official documentation: 538 | 539 | chromEnd - The ending position of the feature in the chromosome or scaffold. 540 | The chromEnd base is not included in the display of the feature. 541 | For example, the first 100 bases of a chromosome are defined as 542 | chromStart=0, chromEnd=100, and span the bases numbered 0-99. 543 | 544 | The fourth column (name) is added because bigWigAverageOverBed only accepts 545 | BED files with 4 columns. 546 | 547 | We just truncate the end of the chromosome if it's not cleanly divisible 548 | by BIN_SIZE. 549 | """ 550 | 551 | if species == 'hg19': 552 | chrom_sizes = HG19_CHROM_SIZES 553 | elif species == 'mm9': 554 | chrom_sizes = MM9_CHROM_SIZES 555 | else: 556 | raise ValueError, 'species must be hg19 or mm9' 557 | 558 | for chrom, chrom_size in chrom_sizes.items(): 559 | print("Generating BED file for %s" % chrom) 560 | BED_path = get_intervals_path(chrom, species) 561 | 562 | with open(BED_path, 'w') as f: 563 | for start in range(0, chrom_size - BIN_SIZE + 1, BIN_SIZE): 564 | end = start + BIN_SIZE 565 | name = "%s-%s" % (chrom, start) 566 | f.write("%s\t%s\t%s\t%s\n" % (chrom, start, end, name)) 567 | 568 | def check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks): 569 | """ 570 | Checks whether bigwig files + the corresponding interval paths exist. 
571 | """ 572 | 573 | allFilesExist = True 574 | 575 | if average_peaks: 576 | bigWig_path = get_peak_bigWig_path(cell_line, factor, subsample_target_string) 577 | else: 578 | bigWig_path = get_bigWig_path(cell_line, factor, subsample_target_string) 579 | if not (os.path.isfile(bigWig_path)): 580 | allFilesExist = False 581 | 582 | species = get_species(cell_line) 583 | chrom_sizes = get_chrom_sizes(cell_line) 584 | for chrom in chrom_sizes.keys(): 585 | BED_path = get_intervals_path(chrom, species) 586 | if not os.path.isfile(BED_path): 587 | allFilesExist = False 588 | 589 | return allFilesExist 590 | 591 | def get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = False): 592 | """ 593 | Averages the signal in the .bigWig files in BIGWIGS_ROOT into bins of BIN_SIZE. 594 | Outputs a .npy file in NUMPY_ROOT for each (cell_line, factor, subsample_target) 595 | combination. 596 | 597 | This calls the bigWigAverageOverBed tool from UCSC tools and takes the mean0 column. 598 | 599 | This function does nothing if the .npy file in NUMPY_ROOT already exists. 600 | """ 601 | all_cmds = [[], [], []] 602 | assert(input_not_before_end(factors_to_use)) 603 | for cell_line in cell_lines_to_use: 604 | for factor in factors_to_use: 605 | if average_peaks and factor == 'INPUT': 606 | continue 607 | 608 | chrom_sizes = get_chrom_sizes(cell_line) 609 | species = get_species(cell_line) 610 | # This averages signal tracks from both full and subsampled data 611 | # because None is an element of subsample_targets_to_use 612 | for subsample_target_string in subsample_targets_to_use: 613 | allFilesExist = check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks) 614 | if allFilesExist: 615 | print('All files exist for %s, %s, %s, average_peaks = %s; averaging signal over intervals' % (cell_line, factor, subsample_target_string, average_peaks)) 616 | for chrom in chrom_sizes.keys(): 617 | BED_path = get_intervals_path(chrom, species) 618 | if average_peaks: 619 | bigWig_path = get_peak_bigWig_path(cell_line, factor, subsample_target_string) 620 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 621 | else: 622 | bigWig_path = get_bigWig_path(cell_line, factor, subsample_target_string) 623 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 624 | output_path = bigWig_path + '-%s_binned.out' % chrom 625 | if os.path.isfile(numpy_path):#we've already done everything. 626 | print("Warning: %s already exists. Skipping..." % numpy_path) 627 | else: 628 | print("Numpy file does not exist; creating %s" % (numpy_path)) 629 | if os.path.isfile(output_path): 630 | print("Warning: %s already exists. Skipping..." % output_path) 631 | else: 632 | cmd = "bash scripts/averageSignalTrack.sh %s %s %s" % (bigWig_path, BED_path, output_path) 633 | all_cmds[0].append(cmd) 634 | all_cmds[1].append('python prepData.py turn_into_numpy %s %s' % (output_path, numpy_path)) 635 | # Clean up intermediate output 636 | all_cmds[2].append("rm -rf %s" % output_path) 637 | 638 | else: 639 | print('Warning: not all files exist for %s, %s, %s, average_peaks = %s' % (cell_line, factor, subsample_target_string, average_peaks)) 640 | 641 | return all_cmds 642 | 643 | def turn_into_numpy(output_path, numpy_path): 644 | """ 645 | Saves the output_path as a numpy_path. 
646 | """ 647 | df = pd.read_csv(output_path, header = None) 648 | np.save(numpy_path, np.array(df)) 649 | 650 | 651 | def prep_dataset(dataset_name, cell_line, factors_to_include, chroms_to_include, 652 | subsample_targets, normalization, peak_dataset = False): 653 | """ 654 | Cobbles together a single .npz file containing binned signals for a given cell_line, 655 | list of factors, and list of chromosomes. There is one .npz file per 656 | (cell_line, subsample_target, normalization) triplet. 657 | 658 | Output is a single .npz file in BASE_ROOT with name dataset_name. 659 | This .npz file contains one matrix for each chromosome. 660 | Each matrix is of dimensions num_bins x num_factors, 661 | where num_bins is roughly floor(length of chromosome / BIN_SIZE), 662 | and num_factors is the length of factors_to_include. 663 | 664 | If peak_dataset = True, loads a peak dataset instead. 665 | """ 666 | 667 | if peak_dataset: 668 | assert(normalization is None) 669 | 670 | assert(input_not_before_end(factors_to_include)) 671 | if peak_dataset: 672 | factors_to_include = np.copy(factors_to_include) 673 | if factors_to_include[-1] == 'INPUT': 674 | factors_to_include = factors_to_include[:-1] 675 | 676 | for subsample_target_string in subsample_targets: 677 | 678 | output_path = get_base_path(dataset_name, subsample_target_string, normalization) 679 | if os.path.isfile(output_path): 680 | print('Output file %s exists' % output_path) 681 | continue 682 | print("Preparing %s %s" % (dataset_name, subsample_target_string)) 683 | # First make sure that all the numpy files we need exist 684 | do_files_exist = True 685 | for chrom in chroms_to_include: 686 | for factor in factors_to_include: 687 | if peak_dataset: 688 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 689 | else: 690 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 691 | if not os.path.isfile(numpy_path): 692 | print('Warning: %s does not exist' % numpy_path) 693 | do_files_exist = False 694 | break 695 | 696 | if not do_files_exist: 697 | print("Warning: not all .npy files are ready to make dataset %s for %s %s" % (dataset_name, cell_line, subsample_target_string)) 698 | continue 699 | 700 | 701 | # Write dataset metadata to disk 702 | if not peak_dataset: 703 | metadata = { 704 | 'dataset_name': dataset_name, 705 | 'cell_line': cell_line, 706 | 'factors_to_include': factors_to_include, 707 | 'chroms_to_include': chroms_to_include, 708 | 'subsample_targets': subsample_targets, 709 | 'normalization': normalization 710 | } 711 | metadata_path = get_metadata_path(dataset_name, subsample_target_string, normalization) 712 | with open(metadata_path, 'w') as f: 713 | f.write(json.dumps(metadata)) 714 | 715 | 716 | # Construct output matrix 717 | 718 | num_factors = len(factors_to_include) 719 | matrices = {} 720 | unnormalized_matrices = {} 721 | blacklist_buffer = 5 722 | blacklisted_locs = get_blacklisted_locs(cell_line) 723 | 724 | for chrom in chroms_to_include: 725 | 726 | print("... 
packing %s" % chrom) 727 | 728 | first_factor = True 729 | 730 | for (idx, factor) in enumerate(factors_to_include): 731 | if peak_dataset: 732 | numpy_path = get_peak_numpy_path(cell_line, factor, chrom, subsample_target_string) 733 | else: 734 | numpy_path = get_numpy_path(cell_line, factor, chrom, subsample_target_string) 735 | 736 | assert os.path.isfile(numpy_path), "Error: %s is missing" % numpy_path 737 | 738 | # Each individual chrom-factor is a column vector 739 | arr = np.load(numpy_path) 740 | 741 | if first_factor: 742 | first_factor = False 743 | num_bins = len(arr) 744 | chrom_matrix = np.empty([num_bins, num_factors]) 745 | 746 | chrom_matrix[:, idx] = arr[:, 0] 747 | 748 | # Zero out blacklist regions. Add a bit of buffer to be safe. 749 | print('Before blacklisting %s, average signal is %s' % (chrom, np.mean(chrom_matrix))) 750 | for bad_range in blacklisted_locs[chrom]: 751 | chrom_matrix[bad_range[0]-blacklist_buffer : bad_range[1]+blacklist_buffer, :] = 0 752 | print('After blacklisting %s, average signal is %s' % (chrom, np.mean(chrom_matrix))) 753 | 754 | # Save matrix for this chrom 755 | unnormalized_matrices[chrom] = chrom_matrix 756 | matrices[chrom] = perform_normalization(chrom_matrix, normalization) 757 | np.savez_compressed(output_path, **matrices) 758 | 759 | # Always save unnormalized bigWigs even if the actual data is normalized 760 | # because we don't want to view normalized bigWigs on the genome browser 761 | generate_bigWig( 762 | unnormalized_matrices, 763 | factors_to_include, 764 | '%s_subsample-%s_norm-None' % (dataset_name, subsample_target_string), 765 | BASE_BIGWIG_ROOT) 766 | 767 | 768 | def prep_dataset_wrapper(dataset_name, cell_line, factors_string, subsample_target, normalization, peak_dataset): 769 | """ 770 | This is just a wrapper to allow prep dataset to be called from the command line. 771 | """ 772 | 773 | if normalization == 'None': 774 | normalization = None 775 | if subsample_target == 'None': 776 | subsample_target = None 777 | assert(peak_dataset in ['True', 'False']) 778 | peak_dataset = peak_dataset == 'True' 779 | 780 | if get_species(cell_line) == 'mm9': 781 | all_chroms = MM9_ALL_CHROMS 782 | else: 783 | all_chroms = HG19_ALL_CHROMS 784 | prep_dataset(dataset_name, cell_line, factors_string.split('-'), all_chroms, 785 | [subsample_target], normalization, peak_dataset) 786 | 787 | 788 | def generate_datasets(cell_lines_to_use, dataset_name_template, factors_to_use, subsample_targets_to_use): 789 | """ 790 | Calls prep_dataset on each cell_line, factor, and subsample_target; 791 | Each dataset uses data from chr1-22 and all factors in factors_to_use. 792 | Also creates peak datasets. 793 | 794 | Output is in BASE_ROOT. 
795 | """ 796 | all_cmds = [[]] 797 | factors_string = '-'.join(factors_to_use) 798 | for cell_line in cell_lines_to_use: 799 | for subsample_target in subsample_targets_to_use: 800 | all_cmds[0].append('python prepData.py ' \ 801 | + ' prep_dataset_wrapper peak_pvals_by_bin_%s %s %s %s None True' % \ 802 | (dataset_name_template % cell_line, cell_line, factors_string, subsample_target)) 803 | all_cmds[0].append('python prepData.py ' \ 804 | + ' prep_dataset_wrapper %s %s %s %s arcsinh False' % \ 805 | (dataset_name_template % cell_line, cell_line, factors_string, subsample_target)) 806 | return all_cmds 807 | 808 | 809 | def call_all_peaks(cell_lines_to_use, factors_to_use, subsample_targets_to_use): 810 | """ 811 | Calls the ENCODE CHiP-seq pipeline on the tagAlign files for all 812 | cell lines, factors, and subsample targets (including the full data). 813 | Outputs in PEAK_BASE_DIR/peaks_macs2/true_replicates a gappedPeak.gz file for each 814 | (cellLine, factor, subsampleTarget) combination. 815 | """ 816 | print('calling all peaks!!') 817 | all_cmds = [[]] 818 | for cell_line in cell_lines_to_use: 819 | species = get_species(cell_line) 820 | for factor in factors_to_use: 821 | if factor == 'INPUT': 822 | continue 823 | 824 | controls_and_inputs = [] 825 | 826 | for subsample_target_string in subsample_targets_to_use: 827 | if check_whether_BW_files_exist(cell_line, factor, subsample_target_string, average_peaks = True): 828 | print('%-8s %-8s %-8s peak files already exist, not regenerating' % (cell_line, factor, subsample_target_string)) 829 | continue 830 | 831 | else: 832 | input_file = get_tagAlign_path(cell_line, factor, subsample_target_string = subsample_target_string) 833 | control_input_file = get_tagAlign_path(cell_line, 'INPUT', subsample_target_string = subsample_target_string) 834 | if os.path.exists(input_file) and os.path.exists(control_input_file): 835 | print('%-8s %-8s %-8s peak files DO NOT exist, regenerating' % (cell_line, factor, subsample_target_string)) 836 | controls_and_inputs.append([input_file, control_input_file]) 837 | else: 838 | print('%-8s %-8s %-8s input files DO NOT exist, cannot call peaks' % (cell_line, factor, subsample_target_string)) 839 | continue 840 | 841 | for input_file, control_input_file in controls_and_inputs: 842 | 843 | if os.path.isfile(input_file) and os.path.isfile(control_input_file): 844 | 845 | if os.path.isfile(control_input_file): 846 | cmd = "bash scripts/findPeaks.sh %s %s %s %s %s" % (PIPELINE_ROOT, PEAK_BASE_DIR, input_file, control_input_file, species) 847 | 848 | all_cmds[0].append(cmd) 849 | 850 | print('Running command ', cmd) 851 | else: 852 | print("Warning: input file %s or %s does not exist. Skipping..." % (input_file, control_input_file)) 853 | 854 | 855 | return all_cmds 856 | 857 | 858 | def input_not_before_end(list_of_marks): 859 | """ 860 | Makes sure that INPUT does not occur before the last element of a list of marks. 861 | """ 862 | return ('INPUT' not in list_of_marks[:-1]) 863 | def callCommand(cmd): 864 | call(cmd, shell = True) 865 | sleep(3) 866 | 867 | def fork_and_wait(n_proc, target, args=[]): 868 | """ 869 | Fork n_proc processes, run target(*args) in each, and wait to finish. 870 | This is Nathan's method. 
871 | """ 872 | if n_proc == 1: 873 | target(*args) 874 | return 875 | else: 876 | pids = [] 877 | for i in xrange(n_proc): 878 | pid = os.fork() 879 | if pid == 0: 880 | try: 881 | signal.signal(signal.SIGINT, handle_interrupt_signal) 882 | target(*args) 883 | os._exit(os.EX_OK) 884 | except Exception, inst: 885 | print_exc() 886 | config.log_statement( "Uncaught exception in subprocess\n" 887 | + traceback.format_exc(), log=True) 888 | os._exit(os.EX_SOFTWARE) 889 | else: 890 | pids.append(pid) 891 | try: 892 | while len(pids) > 0: 893 | ret_pid, error_code = os.wait() 894 | if ret_pid in pids: 895 | pids.remove(ret_pid) 896 | if error_code != os.EX_OK: 897 | raise OSError, "Process '{}' returned error code '{}'".format( 898 | ret_pid, error_code) 899 | except KeyboardInterrupt: 900 | for pid in pids: 901 | try: os.kill(pid, signal.SIGHUP) 902 | except: pass 903 | raise 904 | except OSError: 905 | for pid in pids: 906 | try: os.kill(pid, signal.SIGHUP) 907 | except: pass 908 | raise 909 | return 910 | 911 | class Counter(object): 912 | """ 913 | Nathan's implementation of the Counter class; used for running multiple threads simultaneously. 914 | """ 915 | def __init__(self, initval=0): 916 | self.val = multiprocessing.Value('i', initval) 917 | self.lock = multiprocessing.Lock() 918 | 919 | def return_and_increment(self): 920 | with self.lock: 921 | rv = self.val.value 922 | self.val.value += 1 923 | return rv 924 | def handle_interrupt_signal(signum, frame): 925 | os._exit(os.EX_TEMPFAIL) 926 | 927 | 928 | def run_in_parallel(task_name, n_proc, target, all_args): 929 | """ 930 | Run target on each item in items. 931 | all_args should be a list of lists (where each element is one argument set). 932 | """ 933 | if len(all_args) == 0: 934 | print("No tasks to run!") 935 | return 936 | curr_item = Counter() 937 | def worker(): 938 | index = curr_item.return_and_increment() 939 | while index < len(all_args): 940 | args = all_args[index] 941 | sys.stdout.write('Now running %s, command %i / %i with %i processes; commands are %s\n' % (task_name, index + 1, len(all_args), n_proc, args)) 942 | sleep(2) 943 | sys.stdout.flush() 944 | sys.stderr.flush() 945 | target(*args) 946 | index = curr_item.return_and_increment() 947 | return 948 | 949 | fork_and_wait(n_proc, worker) 950 | 951 | def callCommand(cmd): 952 | call(cmd, shell = True) 953 | sleep(3) 954 | 955 | 956 | def run_pipeline_commands(cell_lines_to_use, factors_to_use, subsample_targets_to_use, 957 | dataset_name_template, n_processes = 8, steps_to_skip = []): 958 | 959 | """ 960 | Runs the full pipeline using n_processes. 961 | Skips steps in steps_to_skip. 962 | 963 | Each method returns a list of lists: each element in the outside list is a list of bash commands that can be run in parallel. 
964 | """ 965 | 966 | # GM-specific processing 967 | if cell_lines_to_use[0].startswith('GM'): 968 | if 'merge_bam' not in steps_to_skip: 969 | merge_bam_cmds = merge_BAMs(cell_lines_to_use, factors_to_use) 970 | for cmd_set in merge_bam_cmds: 971 | run_in_parallel('Merge BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 972 | if 'filter_bam' not in steps_to_skip: 973 | filter_bam_cmds = filter_and_convert_BAMs(cell_lines_to_use, factors_to_use) 974 | for cmd_set in filter_bam_cmds: 975 | run_in_parallel('Filter BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 976 | if 'subsample_bam' not in steps_to_skip: 977 | subsample_bam_cmds = subsample_BAMs(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 978 | for cmd_set in subsample_bam_cmds: 979 | run_in_parallel('Subsample BAM', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 980 | 981 | # Common processing 982 | if 'get_signal_tracks' not in steps_to_skip: 983 | signal_track_cmds = get_signal_tracks(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 984 | for cmd_set in signal_track_cmds: 985 | run_in_parallel('Get signal track', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 986 | if 'call_peaks' not in steps_to_skip: 987 | call_peak_cmds = call_all_peaks(cell_lines_to_use, factors_to_use, subsample_targets_to_use) 988 | for cmd_set in call_peak_cmds: 989 | run_in_parallel('Call peak', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 990 | 991 | if 'get_average_signal' not in steps_to_skip: 992 | 993 | get_average_signal_peaks_cmds = get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = True) 994 | get_average_signal_cmds = get_average_signal_over_intervals(cell_lines_to_use, factors_to_use, subsample_targets_to_use, average_peaks = False) 995 | 996 | for cmd_set in get_average_signal_cmds + get_average_signal_peaks_cmds: 997 | 998 | run_in_parallel('Average signal', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 999 | 1000 | generate_all_dataset_cmds = generate_datasets(cell_lines_to_use, dataset_name_template, 1001 | factors_to_use, subsample_targets_to_use) 1002 | for cmd_set in generate_all_dataset_cmds: 1003 | run_in_parallel('Generate dataset', n_processes, callCommand, [[cmd] for cmd in cmd_set]) 1004 | 1005 | 1006 | def run_GM_pipeline(): 1007 | """ 1008 | Runs the full pipeline (starting from subsampling) to get many subsample targets for GM12878 1009 | and GM18526, and one subsample target for the other cell lines. 1010 | """ 1011 | 1012 | try: 1013 | run_pipeline_commands( 1014 | ['GM12878', 'GM18526'], 1015 | GM_FACTORS, 1016 | ['0.5e6', None], 1017 | GM_DATASET_NAME_TEMPLATE, 1018 | steps_to_skip=['merge_bam', 'filter_bam', 'subsample_bam', 'get_signal_tracks', 'call_peaks'], 1019 | n_processes=12) 1020 | 1021 | except: 1022 | print_exc() 1023 | sys.stdout.flush() 1024 | sys.stderr.flush() 1025 | 1026 | 1027 | if __name__ == '__main__': 1028 | """ 1029 | Calls a method using arguments from command line. Eg, 1030 | 1031 | python prepData.py run_in_parallel a b c 1032 | 1033 | calls run_in_parallel(a, b, c) 1034 | """ 1035 | 1036 | args = sys.argv 1037 | fxn_args = args[2:] 1038 | print('Calling %s with arguments' % args[1], args[2:]) 1039 | locals()[args[1]](*args[2:]) 1040 | 1041 | 1042 | --------------------------------------------------------------------------------