├── .config.yaml ├── 1-map-to-genome.py ├── 2-sanitize-bam.py ├── 3-call-peaks.py ├── LICENSE ├── README.md ├── _logshim.py ├── _script_helpers.py ├── bamliquidatorbatch ├── __init__.py ├── bamliquidator_batch.py ├── flattener.py ├── normalize_plot_and_summarize.py └── test.py ├── blacklists ├── ce10-blacklist.bed ├── dm3-blacklist.bed ├── mm10-blacklist-via-mm9.bed ├── mm9-blacklist.bed ├── source-info │ ├── URLs.txt │ └── hg19-blacklist-README.pdf └── wgEncodeDacMapabilityConsensusExcludable.bed ├── demo-data ├── sample-get-SuperEnhancers-output │ ├── 0-enhancer-stats.txt │ ├── 0-se-population.R.bed │ ├── 0-stretch-population.R.bed │ ├── 0-stretch-se-population.R.bed │ ├── 0-te-population.R.bed │ ├── se-cutoff.R.png │ ├── se-size-histogram.R.png │ ├── se-te-stretch-vs-nonstretch-count-pie.R.png │ ├── se-vs-te-count-pie.R.png │ ├── se-vs-te-signal-pie.R.png │ ├── stretch-vs-nonstretch-count-pie.R.png │ └── te-size-histogram.R.png └── sample-mm10-CD4.bed ├── dist ├── README.md ├── bedtools-2.22.0 ├── bedtools-2.23.0 ├── samtools-0.1.19 ├── samtools-1.1 └── samtools-1.2 ├── get-SuperEnhancers.R ├── helper-scripts ├── 0-merge-fastq.py ├── 3-merge-bam-rmdup.py ├── README.md └── hdf5_to_counts_table.py ├── refseq ├── .gitignore ├── UPDATED-06-11-2014 ├── hg18.ucsc.RefSeq.refGene.tsv.gz ├── hg19.ucsc.RefSeq.refGene.tsv.gz ├── hg38.ucsc.RefSeq.refGene.tsv.gz ├── mm10.ucsc.RefSeq.refGene.tsv.gz └── mm9.ucsc.RefSeq.refGene.tsv.gz ├── requirements.txt ├── riesling.py └── statistics.py /.config.yaml: -------------------------------------------------------------------------------- 1 | general: 2 | custom_tmp_dir: /tmp/ 3 | 4 | # Good source for standard builds & paths is the Illumina iGenomes collection: 5 | # http://support.illumina.com/sequencing/sequencing_software/igenome.html 6 | bowtie2_genomes: 7 | mm9: Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome 8 | mm10: Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome 9 | hg18: Homo_sapiens/UCSC/hg18/Sequence/Bowtie2Index/genome 10 | hg19: Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/genome 11 | 12 | # Binaries -- some distributed with this package 13 | # We include these to try to standardize on sane versions of software. 14 | binaries: 15 | samtools: dist/samtools-1.2 16 | samtools_legacy: dist/samtools-0.1.19 17 | bedtools: dist/bedtools-2.23.0 18 | findPeaks: dist/findPeaks-4.7 # via http://homer.salk.edu/homer/ 19 | 20 | # Blacklisting paths 21 | # From the ENCODE blacklist of known overcalled/false-positive regions.
22 | blacklists: 23 | mm9: blacklists/mm9-blacklist.bed 24 | mm10: blacklists/mm10-blacklist-via-mm9.bed 25 | hg19: blacklists/wgEncodeDacMapabilityConsensusExcludable.bed 26 | ce10: blacklists/ce10-blacklist.bed 27 | dm3: blacklists/dm3-blacklist.bed 28 | 29 | # RefSeq maps, for the RIESLING code, via UCSC 30 | refseq: 31 | mm9: refseq/mm9.ucsc.RefSeq.refGene.tsv.gz 32 | mm10: refseq/mm10.ucsc.RefSeq.refGene.tsv.gz 33 | hg18: refseq/hg18.ucsc.RefSeq.refGene.tsv.gz 34 | hg19: refseq/hg19.ucsc.RefSeq.refGene.tsv.gz 35 | hg38: refseq/hg38.ucsc.RefSeq.refGene.tsv.gz 36 | 37 | # Gene info, for final annotation steps of RIESLING 38 | gffs: 39 | mm9: /dev/null 40 | mm10: genomes/mm10genes.transcript.gtf 41 | hg18: /dev/null 42 | hg19: /dev/null 43 | -------------------------------------------------------------------------------- /1-map-to-genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A simple wrapper to map paired-end .fastq/.fastq.gz files to a genome with bowtie2. 5 | # 6 | # 7 | # Copyright (c) 2014-2016 Nick Semenkovich <semenko@alum.mit.edu>. 8 | # https://nick.semenkovich.com/ 9 | # 10 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 11 | # http://gordonlab.wustl.edu/ 12 | # 13 | # This software is released under the MIT License: 14 | # http://opensource.org/licenses/MIT 15 | # 16 | # Source: https://github.com/GordonLab/riesling-pipeline 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | __author__ = 'Nick Semenkovich <semenko@alum.mit.edu>' 21 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 22 | __license__ = 'MIT' 23 | __version__ = '1.0.3' 24 | 25 | from collections import OrderedDict 26 | import _logshim 27 | import _script_helpers 28 | import argparse 29 | import glob 30 | import os 31 | import pprint 32 | import re 33 | 34 | 35 | def find_paired_ends(input_path, verbose=False): 36 | """ 37 | Given an input path, find paired-end .fastq/.fastq.gz files and group them by sample prefix. 38 | 39 | :param input_path: directory containing *.PE1.*/*.PE2.* .fastq or .fastq.gz files 40 | :return: dict mapping each sample prefix to its paired-end files 41 | """ 42 | find_pe_logger = _logshim.getLogger('find_paired_ends') 43 | 44 | # TODO: Modularize all this! 45 | 46 | if not os.path.isdir(input_path): 47 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 48 | 49 | all_files = glob.glob(input_path + "/*.PE1.fastq.gz") # Must have .PE1./.PE2. in the filename 50 | all_files.extend(glob.glob(input_path + "/*.PE2.fastq.gz")) 51 | all_files.extend(glob.glob(input_path + "/*.PE1.fastq")) 52 | all_files.extend(glob.glob(input_path + "/*.PE2.fastq")) 53 | 54 | if len(all_files) == 0: 55 | raise ValueError("No .PE1/.PE2 .fastq(.gz) files found in the input directory!") 56 | 57 | 58 | # Given paired ends, we must always have an even number of input files. 59 | if len(all_files) % 2 != 0: 60 | raise ValueError("Input directory contains an odd number of files.") 61 | 62 | re_pattern = re.compile(r'^(.*)\.PE(\d)(\.fastq|\.fastq\.gz)$') 63 | 64 | file_dict = OrderedDict() 65 | 66 | prefixes_seen = [] 67 | pe_seen = [] 68 | for file in sorted(all_files): 69 | if not os.access(file, os.R_OK): 70 | raise OSError("Cannot read file: %s" % (file)) 71 | 72 | filename_only = file.rsplit('/', 1)[-1] 73 | result = re.match(re_pattern, filename_only) 74 | 75 | file_dict[file] = {'prefix': str(result.group(1)), 76 | 'PE': int(result.group(2))} 77 | 78 | prefixes_seen.append(file_dict[file]['prefix']) 79 | pe_seen.append(file_dict[file]['PE']) 80 | 81 | if len(set(pe_seen)) != 2: 82 | raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!"
% (len(set(pe_seen)))) 83 | 84 | if pe_seen.count(1) != pe_seen.count(2): 85 | raise ValueError("Uneven pairing of paired ends (are you missing a file)? PE1 count: %d, PE2 count: %d" % 86 | (pe_seen.count(1), pe_seen.count(2))) 87 | 88 | find_pe_logger.info("Files seen: %d" % (len(all_files))) 89 | find_pe_logger.info("Samples seen: %d" % (len(set(prefixes_seen)))) 90 | 91 | merge_strategy = {} 92 | 93 | find_pe_logger.info("Sample IDs:") 94 | for prefix in sorted(set(prefixes_seen)): 95 | find_pe_logger.info(" %s" % (prefix)) 96 | 97 | for file in file_dict.iterkeys(): 98 | merge_strategy.setdefault(file_dict[file]['prefix'], []).append(file) 99 | 100 | if verbose: 101 | find_pe_logger.debug("Merge strategy is:") 102 | find_pe_logger.debug(pprint.pformat(merge_strategy)) 103 | 104 | return merge_strategy 105 | 106 | def run_bowtie2(paired_end_mapping, genome, output_path, disable_parallel=False): 107 | bowtie2_logger = _logshim.getLogger('run_bowtie2') 108 | 109 | # Import the config file to get genome locations 110 | config = _script_helpers.get_config() 111 | 112 | if disable_parallel: 113 | shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger) 114 | else: 115 | shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger, delay_seconds=60) 116 | 117 | for output_prefix, paired_ends in paired_end_mapping.iteritems(): 118 | bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' % (output_prefix)) 119 | for filename in paired_ends: 120 | assert(" " not in filename) 121 | assert(";" not in filename) # Vague sanity testing for input filenames 122 | bowtie2_logger.debug(' Input: %s' % (filename)) 123 | 124 | # bowtie2 options: 125 | # --end-to-end: this is the default, but let's explicitly specify it 126 | # --sensitive: again, the default (consider switching to --fast?) 127 | # --no-unal: Suppress unaligned reads from the output .sam 128 | # --no-discordant: These are paired-end reads. We expect them to be non-discordant. 129 | # --mm: mmap MAP_SHARED (other processes can use our genome, cool!) 130 | # --met-stderr: Write metrics to stderr 131 | # --time: output the time things took 132 | # -x: target genome 133 | command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time -x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s" 134 | 135 | shell_job_runner.run(command % (config['bowtie2_genomes'][genome], 136 | paired_ends[0], 137 | paired_ends[1], 138 | output_path + "/" + output_prefix + ".bt2.log", 139 | output_path + "/" + output_prefix + ".bt2.bam")) 140 | 141 | shell_job_runner.finish() 142 | 143 | 144 | def main(): 145 | # Parse & interpret command line flags. 146 | parser = argparse.ArgumentParser(description='Given paired-end .fastq/.fastq.gz files, map to a genome.', 147 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 148 | "Washington University in St. 
Louis: https://gordonlab.wustl.edu.", 149 | usage='%(prog)s [options]', 150 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 151 | 152 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 153 | help='Input path.', required=True) 154 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 155 | help='Output path.', required=True) 156 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 157 | choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for bowtie2', required=True) 158 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 159 | help='Disable parallel job spawning.') 160 | 161 | 162 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 163 | 164 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 165 | help="Do not create a log file.") 166 | 167 | args = parser.parse_args() 168 | 169 | output_path = _script_helpers.setup_output_path(args.output_path) 170 | 171 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 172 | 173 | 174 | paired_end_mapping = find_paired_ends(args.input_path, verbose=args.verbose) 175 | 176 | run_bowtie2(paired_end_mapping, args.genome, output_path) 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /2-sanitize-bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Do very simple cleaning of .bam files for ATAC-seq data pre-processing. 5 | # 6 | # This script: 7 | # * Drops low mapping quality reads (<10) 8 | # * Removes chrM mitochondrial reads 9 | # * Removes PCR duplicates (using samtools) 10 | # * Removes blacklisted regions (from ENCODE or custom blacklists) 11 | # 12 | # 13 | # Copyright (c) 2014-2016 Nick Semenkovich . 14 | # https://nick.semenkovich.com/ 15 | # 16 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 17 | # https://gordonlab.wustl.edu/ 18 | # 19 | # This software is released under the MIT License: 20 | # http://opensource.org/licenses/MIT 21 | # 22 | # Source: https://github.com/GordonLab/riesling-pipeline 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | __author__ = 'Nick Semenkovich ' 27 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 28 | __license__ = 'MIT' 29 | __version__ = '1.0.3' 30 | 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import glob 35 | import os 36 | import tempfile 37 | 38 | # A parameter needed by samtools to sort in-memory. 
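# NOTE: samtools sort treats -m as a *per-thread* memory limit, so the final
# sort below (-@ 8 -m 50G) can peak near 8 x 50G = 400G of RAM. If your machine
# has less, shrink this value accordingly, e.g. (illustrative setting only):
#   MAX_MEM = "6G"  # ~48G peak across 8 sort threads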
39 | MAX_MEM = "50G" 40 | 41 | # Load our config files 42 | CONFIG = _script_helpers.get_config() 43 | 44 | 45 | def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False): 46 | primary_logger = _logshim.getLogger('first_pass') 47 | 48 | output_suffix = ".tmp" 49 | 50 | if disable_parallel: # Doesn't change parallelism in last samtools sort 51 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 52 | else: 53 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60) 54 | 55 | # We do a few things here: 56 | # - View only mapping quality >= 10 57 | # - Remove chrM 58 | # - Sort by name for fixmate 59 | # We don't parallelize here (-@ #) because fixmate blocks & parallel seems to only help for compressed. 60 | # - Fixmate (needed for rmdup) 61 | # - Re-sort by position 62 | tempfiles = [] 63 | for filename in input_files: 64 | primary_logger.debug('Working on: %s' % (filename)) 65 | command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \ 66 | '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s' 67 | 68 | # A super evil user could modify TMPDIR and make this generate evil strings. That's evil. 69 | temporary_file = tempfile.mkstemp('.tmp.bam') 70 | tempfiles.append(temporary_file) 71 | 72 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], 73 | filename, 74 | CONFIG['binaries']['samtools'], 75 | CONFIG['binaries']['samtools'], 76 | MAX_MEM, 77 | temporary_file[1], 78 | CONFIG['binaries']['samtools'], 79 | CONFIG['binaries']['samtools'], 80 | MAX_MEM, 81 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix)) 82 | 83 | shell_job_runner.finish() 84 | 85 | # Clean up our temporary files. 86 | primary_logger.info('Removing temporary files ...') 87 | for fd, fname in tempfiles: 88 | os.close(fd) 89 | os.unlink(fname) 90 | 91 | primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.') 92 | 93 | 94 | def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False): 95 | primary_logger = _logshim.getLogger('rmdup_blacklist') 96 | 97 | if disable_parallel: 98 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 99 | else: 100 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20) 101 | 102 | for filename in input_files: 103 | primary_logger.debug('Working on: %s' % (filename)) 104 | # This is extremely fast and has minimal memory usage. Yay! 105 | # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions? 106 | command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s" 107 | 108 | shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'], # TODO: Update this when samtools is fixed.
109 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam', # TODO: CLEAN THIS 110 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log', 111 | CONFIG['binaries']['bedtools'], 112 | os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome], # TODO: CLEAN THIS 113 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam')) 114 | 115 | shell_job_runner.finish() 116 | 117 | primary_logger.info('Removing temporary files from stage 1 ...') 118 | for filename in input_files: 119 | os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam') 120 | 121 | primary_logger.info('Completed rmdup and blacklist') 122 | 123 | 124 | def main(): 125 | # Parse & interpret command line flags. 126 | parser = argparse.ArgumentParser(description='Given input .bam files, fix mate pairs, remove duplicates, blacklist bad ' 127 | 'regions, and sort the output.', 128 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 129 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 130 | usage='%(prog)s [options]', 131 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 132 | 133 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 134 | help='Input path.', required=True) 135 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 136 | help='Output path.', required=True) 137 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 138 | choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for blacklisting.', required=True) 139 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 140 | help='Disable parallel job spawning.') 141 | 142 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 143 | 144 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 145 | help="Do not create a log file.") 146 | 147 | args = parser.parse_args() 148 | 149 | output_path = _script_helpers.setup_output_path(args.output_path) 150 | 151 | log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 152 | 153 | 154 | # Samtools requires a temp directory for sorting /sometimes/. 155 | # This seems to only matter if it exceeds the in-ram limits set by the MAX_MEM parameter. 156 | # Sanity check the /tmp directory has a bit of space. 157 | temppath = tempfile.gettempdir() 158 | s = os.statvfs(temppath) 159 | if ((s.f_bavail * s.f_frsize) / (1024 * 1024)) < 10000: # ~10 G, not for any good reason though 160 | log_main.warn('Temp directory %s doesn\'t have a lot of free space!' % (temppath)) 161 | 162 | 163 | input_files = glob.glob(args.input_path + "/*.bam") # Take ALL of the .bams. 164 | 165 | large_filter_fixmate_and_sort(input_files, args.genome, output_path, disable_parallel=args.no_parallel) 166 | rmdup_and_blacklist(input_files, args.genome, output_path, disable_parallel=args.no_parallel) 167 | 168 | 169 | 170 | if __name__ == '__main__': 171 | main() 172 | -------------------------------------------------------------------------------- /3-call-peaks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Automate the execution of peak calling on .bam files.
5 | # Originally designed for ATAC-seq data, but will work with any directory of .bam files. 6 | # 7 | # By default, this runs both macs14 and macs2 for comparison. 8 | # 9 | # 10 | # Copyright (c) 2014-2016 Nick Semenkovich . 11 | # https://nick.semenkovich.com/ 12 | # 13 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 14 | # https://gordonlab.wustl.edu/ 15 | # 16 | # This software is released under the MIT License: 17 | # http://opensource.org/licenses/MIT 18 | # 19 | # Source: https://github.com/GordonLab/riesling-pipeline 20 | 21 | from __future__ import absolute_import, division, print_function, unicode_literals 22 | 23 | __author__ = 'Nick Semenkovich ' 24 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 25 | __license__ = 'MIT' 26 | __version__ = '1.0.3' 27 | 28 | import _logshim 29 | import _script_helpers 30 | import argparse 31 | import os 32 | 33 | # Load our config files 34 | CONFIG = _script_helpers.get_config() 35 | 36 | 37 | def generate_index(input_files, output_path, disable_parallel=False): 38 | """ 39 | Many peak pickers want indexed .bams. Let's build indexes! (yay!) 40 | 41 | :param input_files: 42 | :param output_path: 43 | :param disable_parallel: 44 | :return: 45 | """ 46 | primary_logger = _logshim.getLogger('index') 47 | 48 | if disable_parallel: 49 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 50 | else: 51 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=10) 52 | 53 | for filename in input_files: 54 | primary_logger.debug('Working on: %s' % (filename)) 55 | command = "%s index %s" 56 | 57 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], filename)) 58 | 59 | shell_job_runner.finish() 60 | 61 | 62 | 63 | def run_macs14(input_files, output_path, genome, disable_parallel=False): 64 | macs14_log = _logshim.getLogger('run_macs14') 65 | 66 | macs14_log.info('Spawning MACS14 jobs...') 67 | 68 | if disable_parallel: 69 | shell_job_runner = _script_helpers.ShellJobRunner(macs14_log) 70 | else: 71 | shell_job_runner = _script_helpers.ShellJobRunner(macs14_log, delay_seconds=20) 72 | 73 | for filename in input_files: 74 | macs14_log.debug('Working on: %s' % (filename)) 75 | 76 | # macs14 is old, but we try it anyway, since it's sometimes useful. 77 | # -t: input 78 | # -n: output name 79 | # -f: format 80 | # -g: genome 81 | # -p: pvalue for peak cutoff 82 | # --wig: save .wig outputs 83 | # --single-profile: make one single wiggle 84 | # --space=50: wiggle resolution (default: 10) 85 | # 86 | # Note: This CD hack is because MACS1.4 can't specify an output path :( 87 | command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s" 88 | 89 | filename_without_extension = os.path.splitext(filename)[0] + '.macs14' 90 | 91 | shell_job_runner.run(command % (output_path, # for cd hack 92 | 'macs14', # This must be pre-installed by the user. It's a big, complex package. 93 | os.getcwd() + '/' + filename, # input file # TODO: Fix this path hack. 
MACS14 cannot specify an output path :/ 94 | os.path.basename(filename_without_extension), 95 | genome, # for genome size 96 | os.path.basename(filename_without_extension) + '.macs14.log')) 97 | 98 | shell_job_runner.finish() 99 | 100 | macs14_log.info('MACS14 peak calling complete.') 101 | 102 | 103 | def run_macs2(input_files, output_path, genome, disable_parallel=False): 104 | macs2_log = _logshim.getLogger('run_macs2') 105 | 106 | macs2_log.info('Spawning MACS2 jobs...') 107 | 108 | if disable_parallel: 109 | shell_job_runner = _script_helpers.ShellJobRunner(macs2_log) 110 | else: 111 | shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1) 112 | 113 | for filename in input_files: 114 | macs2_log.debug('Working on: %s' % (filename)) 115 | 116 | # --bdg: generate .bed graph output 117 | # --nomodel: We'll be shifting manually! 118 | # --extsize 200: See long discussion at: @@@ 119 | # --shift -100: As per above. 120 | # --slocal: Look at a local window of 20kb to build peak models 121 | # --keep-dup: We already removed duplicates with samtools. 122 | # TODO: Consider allowing tweaks to these settings with flags? 123 | command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s" 124 | 125 | filename_without_extension = os.path.splitext(filename)[0] + '.macs2' 126 | 127 | shell_job_runner.run(command % ('macs2', # This must be pre-installed by the user. It's a big, complex package. 128 | filename, # input file 129 | os.path.basename(filename_without_extension), 130 | output_path, 131 | genome, # for genome size, unclear if this actually matters with nolambda/nomodel 132 | output_path + "/" + os.path.basename(filename_without_extension) + '.log')) 133 | 134 | shell_job_runner.finish() 135 | 136 | macs2_log.info('MACS2 peak calling complete.') 137 | 138 | 139 | def main(): 140 | # Parse & interpret command line flags. 141 | parser = argparse.ArgumentParser(description='Run a number of standard peak calling algorithms for ATAC-seq data. ' 142 | 'Expects de-duplicated, sorted, merged, ChrM-removed data.', 143 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 144 | "Washington University in St. Louis: https://gordonlab.wustl.edu.", 145 | usage='%(prog)s [options]', 146 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 147 | 148 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 149 | help='Input path (or a specific .bam file).', required=True) 150 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 151 | help='Output path.', required=True) 152 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 153 | choices=['hs', 'mm', 'ce', 'dm'], help='Genome size to pass to MACS.', required=True) # TODO: Consider using mm9/mm10, etc. for uniformity?
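# (For reference, MACS interprets these shorthands as effective genome sizes:
# hs ~2.7e9, mm ~1.87e9, ce ~9.0e7, dm ~1.2e8.)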
154 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 155 | help='Disable parallel job spawning.') 156 | 157 | parser.add_argument('--skip-bam-indexing', dest="skip_bam_indexing", action='store_true', 158 | help='Skip bam indexing (You must have generated indexes independently for peak callers to work!).', required=False) 159 | 160 | parser.add_argument('--skip-macs14', dest="skip_macs14", action='store_true', 161 | help='Skip MACS v1.4 peak calling.', required=False) 162 | parser.add_argument('--skip-macs2', dest="skip_macs2", action='store_true', 163 | help='Skip MACS v2 peak calling.', required=False) 164 | 165 | 166 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 167 | 168 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 169 | help="Do not create a log file.") 170 | 171 | args = parser.parse_args() 172 | 173 | output_path = _script_helpers.setup_output_path(args.output_path) 174 | 175 | log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 176 | 177 | input_files = _script_helpers.validate_input_files(args.input_path) 178 | 179 | 180 | # Generate BAM indexes 181 | if not args.skip_bam_indexing: 182 | generate_index(input_files, output_path, disable_parallel=args.no_parallel) 183 | else: 184 | log_main.warn("Skipping bam index .bai generation as requested.") 185 | log_main.warn("You must have generated these separately, otherwise peak callers will fail.") 186 | 187 | if not args.skip_macs14: 188 | # Start with old-school MACS 1.4 189 | run_macs14(input_files, output_path, args.genome, disable_parallel=args.no_parallel) 190 | 191 | if not args.skip_macs2: 192 | # Now new MACS 2 193 | # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000 194 | run_macs2(input_files, output_path, args.genome, disable_parallel=args.no_parallel) 195 | 196 | 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2016 Nick Semenkovich | semenko@alum.mit.edu 4 | https://nick.semenkovich.com 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## The RIESLING ATAC-seq Pipeline [![Build Status](https://travis-ci.org/GordonLab/riesling-pipeline.svg?branch=master)](https://travis-ci.org/GordonLab/riesling-pipeline) 2 | 3 | The RIESLING (Rapid Identification of EnhancerS LInked to Nearby Genes) ATAC-seq pipeline is designed to be an efficient set of standalone scripts for quickly analyzing ATAC-seq data. 4 | 5 | You may find it particularly useful for identifying and stratifying super-enhancers, though it can also be leveraged for differential accessibility analysis using DESeq2. 6 | 7 | Since this was originally developed in 2014-2015, a number of other packages have been developed, notably the [Kundaje lab pipeline](https://github.com/kundajelab/atac_dnase_pipelines) and the [Ren lab single-cell ATAC pipeline](https://github.com/r3fang/scATAC). The Kundaje lab pipeline in particular includes other features (e.g. IDR analysis) which you may find more useful if you aren't interested in super-enhancers / enhancer clusters. 8 | 9 | ================ 10 | 11 | ## Getting started 12 | 1. Clone this repo: `git clone https://github.com/GordonLab/riesling-pipeline.git` 13 | 2. `cd riesling-pipeline` 14 | 3. Install the Python dependencies: `pip install --user -U -r requirements.txt` 15 | 16 | ================ 17 | 18 | ### Simple Hacks: Call super-enhancers on a .bed 19 | 20 | If you already have a .bed of putative enhancers, you can rapidly derive the super-enhancer population and statistics using `get-SuperEnhancers.R`. It will *not* filter blacklisted regions, stitch large regions together, remove TSSes, etc. -- use the full pipeline (detailed below) for that. 21 | 22 | A quick example, using the [demo-data/sample-mm10-CD4.bed](demo-data/sample-mm10-CD4.bed) file, which contains signal intensity in the 7th column: 23 | 24 | ``` 25 | $ git clone https://github.com/GordonLab/riesling-pipeline/ 26 | ... 27 | $ cd riesling-pipeline/ 28 | $ Rscript get-SuperEnhancers.R demo-data/sample-mm10-CD4.bed demo-data/sample-get-SuperEnhancers-output/ 29 | 30 | [1] "Working on: demo-data/sample-mm10-CD4.bed" 31 | [1] "Output dir: demo-data/sample-get-SuperEnhancers-output/" 32 | [1] "Current directory is: /Users/semenko/git/riesling-pipeline" 33 | [1] "Setting output directory to: demo-data/sample-get-SuperEnhancers-output/" 34 | [1] "Inflection at entry: 24795" 35 | [1] "Corresponding cutoff score: 8105.366159055" 36 | 37 | $ cat demo-data/sample-get-SuperEnhancers-output/0-enhancer-stats.txt 38 | Statistics for: demo-data/sample-mm10-CD4.bed 39 | SE Signal %: 38 40 | TE Signal %: 62 41 | SE Count: 1329 42 | TE Count: 24794 43 | SE Count %: 5.09 44 | TE Count %: 94.91 45 | Mean SE Size: 35846.22 46 | Mean TE Size: 5104.87 47 | Median SE Size: 31833 48 | Median TE Size: 892.5 49 | ``` 50 | 51 | Graphical & .bed results are now in [demo-data/sample-get-SuperEnhancers-output/](demo-data/sample-get-SuperEnhancers-output/), and will include these figures and more: 52 | 53 | *Figures: Super-enhancer Cutoff Hockeystick; Super-enhancer Size Distribution; Super vs Traditional vs Stretch Enhancers.* 64 |
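The "Inflection at entry" cutoff above is the classic rank-ordered "hockey stick" cut: enhancers are sorted by signal, and the super-enhancer population sits past the curve's inflection. A minimal Python sketch of one common way to pick that point -- scale both axes to the unit square and cut where the slope first exceeds 1 (the exact inflection method inside `get-SuperEnhancers.R` may differ, and the .bed-parsing lines are illustrative only):

```python
import numpy as np

def hockey_stick_cutoff(signals):
    """Signal value where the rank-ordered, unit-scaled curve first climbs steeper than slope 1."""
    s = np.sort(np.asarray(signals, dtype=float))
    x = np.linspace(0.0, 1.0, len(s))        # scaled rank
    y = (s - s.min()) / (s.max() - s.min())  # scaled signal
    slopes = np.diff(y) / np.diff(x)         # piecewise slopes
    return s[int(np.argmax(slopes > 1.0))]   # first point steeper than 1

# Illustrative use with a 7th-column signal, as in the demo .bed:
# scores = [float(line.split('\t')[6]) for line in open('demo-data/sample-mm10-CD4.bed')]
# print(hockey_stick_cutoff(scores))
```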
65 | 66 | Again, this may not be appropriate on non-preprocessed (blacklisted, TSS-filtered, etc.) data. You more likely want to use the full pipeline, detailed below. 67 | 68 | 69 | 70 | 75 | 76 | ## Expected Inputs & Pre-processing Data 77 | 78 | The heart of RIESLING and related code expects: 79 | 80 | * A .bam file of your aligned ATAC-seq data (e.g. from bowtie2) 81 | * A .bed of peaks (from MACS, HOMER, or any other standard peak caller) 82 | 83 | There are lots of ways to pre-process data -- please use whatever approach you prefer. A basic set of standalone 84 | scripts to help pre-process paired-end sequencing data is provided, detailed below under 'Pre-processing Tools'. 85 | 86 | 96 | 97 | ## Pre-processing Tools 98 | 99 | There are lots of valid ways to pre-process data. These standalone scripts may be of use, but please pay careful attention 100 | to the peak calling settings you use -- as the defaults may not be applicable to your experimental approach. 101 | 102 | 103 | * ./1-map-to-genome.py: Run bowtie2 across a folder of paired-end sequence data. 104 | 105 | * ./2-sanitize-bam.py: Clean up .bam files, including ATAC-specific fixes. 106 | This includes: chrM removal, quality filtering, ENCODE blacklist removal, and PCR duplicate removal 107 | 108 | * ./3-call-peaks.py: Run both macs14 and macs2 on .bam files. 109 | 110 | By default, this runs both macs14 and macs2, and operates on directories of .bam files. 111 | 112 | ================ 113 | -------------------------------------------------------------------------------- /_logshim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Clean, colorized logging for scripts. 3 | 4 | Based on https://stackoverflow.com/questions/384076/how-can-i-color-python-logging-output 5 | with many adaptations by Nick Semenkovich <semenko@alum.mit.edu> 6 | 7 | License: MIT 8 | Author: Nick Semenkovich <semenko@alum.mit.edu> 9 | """ 10 | 11 | from __future__ import absolute_import, division, print_function, unicode_literals 12 | 13 | __author__ = 'Nick Semenkovich <semenko@alum.mit.edu>' 14 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 15 | 16 | import atexit 17 | import datetime 18 | import logging 19 | import os 20 | import platform 21 | import sys 22 | import time 23 | 24 | if __name__ == '__main__': 25 | print("Do not run this as a standalone script.") 26 | exit() 27 | 28 | # Very pretty error reporting, where available 29 | try: 30 | from IPython.core import ultratb 31 | sys.excepthook = ultratb.FormattedTB(mode='Context', color_scheme='Linux') 32 | except ImportError: 33 | pass 34 | 35 | # Log our execution time, used by log_execution_time below. 36 | STARTTIME = time.time() 37 | 38 | ## Define our core colors and resets. 39 | 40 | RESET = "\x1b[0m" 41 | BOLD = "\033[1m" 42 | BLUE = "\x1b[34;01m" 43 | CYAN = "\x1b[36;01m" 44 | GREEN = "\x1b[32;01m" 45 | RED = "\x1b[31;01m" 46 | GRAY = "\x1b[37;01m" 47 | YELLOW = "\x1b[33;01m" 48 | 49 | # TODO: replace TMP with gettmp or whatever 50 | 51 | def startLogger(verbose=False, noFileLog=False, initialLoggerName='main', outPath='/tmp'): 52 | """ 53 | Set logging if called. 54 | TODO: Make this a class, extending the logging module. 55 | """ 56 | datestamp = datetime.datetime.now().strftime("%Y%m%d-%H:%M:%S") 57 | results_path = outPath + '/' + datestamp + '.'
+ platform.node() + '.log' 58 | 59 | if not noFileLog: 60 | fileHandler = logging.FileHandler(results_path, 'w') 61 | fileFormatter = logging.Formatter('%(asctime)s: %(name)-25s: %(levelno)-3s: ' 62 | '(%(filename)s:%(lineno)d, %(funcName)s) : %(message)s') 63 | fileHandler.setFormatter(fileFormatter) 64 | fileHandler.setLevel(logging.DEBUG) 65 | 66 | # Sneakily add colors directly as the log-level name. 67 | logging.addLevelName(logging.DEBUG, CYAN + 'DEBUG') 68 | logging.addLevelName(logging.INFO, GREEN + 'INFO') 69 | logging.addLevelName(logging.WARNING, YELLOW + 'WARNING') 70 | logging.addLevelName(logging.ERROR, RED + 'ERROR') 71 | logging.addLevelName(logging.CRITICAL, RED + 'CRITICAL') 72 | 73 | consoleHandler = logging.StreamHandler() 74 | consoleFormatter = logging.Formatter(BOLD + "%(name)-25s" + RESET + ": %(levelname)-17s" + RESET + 75 | ": %(message)-80s (" + BOLD + "%(filename)s" + RESET + ":%(lineno)d)") 76 | consoleHandler.setFormatter(consoleFormatter) 77 | 78 | consoleHandler.setLevel(logging.INFO) 79 | if verbose: 80 | consoleHandler.setLevel(logging.DEBUG) 81 | 82 | # Give out a logger! 83 | rootlog = logging.getLogger() 84 | rootlog.setLevel(logging.DEBUG) 85 | 86 | rootlog.addHandler(consoleHandler) 87 | if not noFileLog: 88 | rootlog.addHandler(fileHandler) 89 | rootlog.info('>> Logging to %s <<' % (results_path)) 90 | rootlog.info('Running: %s [full command saved to log file]' % (os.path.basename(sys.argv[0]))) 91 | rootlog.info(' Using python version: %s' % (sys.version.split('\n')[0])) 92 | rootlog.debug('%s' % (' '.join(sys.argv))) 93 | rootlog.info('Written by %s' % (__author__)) 94 | rootlog.info(' Developed for the %s' % (__copyright__)) 95 | 96 | return logging.getLogger(initialLoggerName) 97 | 98 | 99 | def getLogger(name): 100 | """ 101 | Returns a logger! 102 | """ 103 | return logging.getLogger(name) 104 | 105 | def log_execution_time(): 106 | """ 107 | Print the total elapsed execution time. 108 | 109 | :return: 110 | """ 111 | logging.getLogger('root').info("Execution took: %0.2f secs." % (time.time() - STARTTIME)) 112 | 113 | # Register an exit handler to print execution time. 114 | atexit.register(log_execution_time) 115 | -------------------------------------------------------------------------------- /_script_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared helper functions. 3 | 4 | License: MIT 5 | Author: Nick Semenkovich 6 | """ 7 | 8 | from __future__ import print_function 9 | import glob 10 | import os 11 | import shlex 12 | import time 13 | import tempfile 14 | import yaml 15 | from distutils import spawn 16 | from subprocess import Popen, PIPE 17 | 18 | 19 | if __name__ == '__main__': 20 | print("Do not run this as a standalone script.") 21 | exit() 22 | 23 | ################### 24 | ## Early sanity checks. 25 | ################### 26 | 27 | THISPATH = os.path.dirname(os.path.realpath(__file__)) 28 | 29 | ## TODO: Given our new dist, consider dropping some of these or modifying this. 30 | for cmd in ['samtools', 'grep', 'bedtools', 'bowtie2']: 31 | if spawn.find_executable(cmd) is None: 32 | raise OSError("Software missing, unable to find: %s" % (cmd)) 33 | 34 | 35 | def get_config(): 36 | """ 37 | Load the yaml config based on our true root path. 
38 | 39 | :return: YAML config object 40 | """ 41 | with open(THISPATH + '/.config.yaml') as yamlfile: 42 | config = yaml.load(yamlfile) 43 | 44 | # Append our true path to each binary 45 | binaries_with_paths = {} 46 | for binary_name in config['binaries']: 47 | binaries_with_paths[binary_name] = THISPATH + '/' + config['binaries'][binary_name] 48 | 49 | config['binaries'] = binaries_with_paths 50 | 51 | return config 52 | 53 | def setup_output_path(path_or_file): 54 | """ 55 | Make sure our output directory is writeable. Create it if necessary. 56 | """ 57 | if os.path.isfile(path_or_file): 58 | raise ValueError("Output path appears to be a file. Please specify a directory.") 59 | 60 | output_path = path_or_file 61 | 62 | output_path = os.path.normpath(os.path.normcase(output_path)) 63 | 64 | try: 65 | os.mkdir(output_path) 66 | except OSError: 67 | if not os.access(output_path, os.W_OK): 68 | raise OSError("Output path couldn't be created or isn't writeable: %s" % (output_path)) 69 | 70 | return output_path 71 | 72 | 73 | def validate_input_files(input_path_or_file, mask='.bam'): 74 | """ 75 | Given an input arg (either a specific file, or a path), return it as a list of files. 76 | Also check that files are readable. 77 | """ 78 | # TODO: Make this handle a list of raw files, too. (e.g. for 3-OPTIONAL...) 79 | if os.path.isfile(input_path_or_file): 80 | # We got a single file! 81 | if not input_path_or_file.endswith(mask): 82 | raise ValueError("Expected a %s input (or a directory with %s). You gave: %s" 83 | % (mask, mask, input_path_or_file)) 84 | file_list = [input_path_or_file] 85 | else: 86 | # It's not a file. Must be a directory. 87 | if not os.path.isdir(input_path_or_file): 88 | raise ValueError("Input not found (or not a file/folder): %s" % ((input_path_or_file))) 89 | file_list = glob.glob((input_path_or_file) + "/*" + mask) 90 | 91 | if len(file_list) == 0: 92 | raise ValueError("Input was empty!") 93 | 94 | for filename in file_list: 95 | if not os.access(filename, os.R_OK): 96 | raise OSError("Cannot read file: %s" % (filename)) 97 | 98 | return file_list 99 | 100 | 101 | class ShellJobRunner(): 102 | """ 103 | Run shell jobs and make sure they complete. 104 | 105 | This is dangerous to run on untrusted inputs! 106 | """ 107 | def __init__(self, logger, delay_seconds=False): 108 | self.logger = logger 109 | self.delay_seconds = delay_seconds 110 | self.process_list = [] 111 | if delay_seconds is False: 112 | self.logger.info('Created a NON-parallel job runner.') 113 | else: 114 | self.logger.info('Created a parallel job runner with %i second delay between jobs.' % (delay_seconds)) 115 | 116 | def run(self, command): 117 | """ 118 | Run a given command. May be blocking (default) or non-blocking if delay_seconds is set. 119 | """ 120 | 121 | self.logger.debug('Running: %s' % (command)) 122 | 123 | # TODO: Clean this up? We shouldn't be spawning sh to spawn bash to set pipefail ... 124 | process = Popen('nice bash -c "set -o pipefail; (%s)"' % command, shell=True) 125 | self.logger.debug('Spawned PID: %i' % (process.pid)) 126 | self.process_list.append(process) 127 | 128 | if self.delay_seconds is False: 129 | self.logger.info('* Parallelism disabled. Waiting for job to complete.') 130 | runtime_process_status = process.wait() 131 | else: 132 | self.logger.info('* Waiting %i seconds to spawn next job.' 
% (self.delay_seconds)) 133 | time.sleep(self.delay_seconds) 134 | runtime_process_status = process.poll() 135 | 136 | if runtime_process_status is None: 137 | # Not done yet, that's cool! 138 | # Since delay_seconds was set, we'll return now. The user better call finish() later! 139 | pass 140 | elif runtime_process_status == 0: 141 | # We're done already? That was suspiciously fast (or delay_seconds is too high). 142 | self.logger.warn('This task finished in less than %d seconds.' % (self.delay_seconds)) 143 | self.logger.warn('This is OK if your input files are small, otherwise, this is suspicious.') 144 | 145 | if runtime_process_status > 0: 146 | self.logger.critical('The last command failed!') 147 | self.logger.critical('Fault occurred in: %s' % (command)) 148 | raise ValueError('Process failed with exit code: %i' % (runtime_process_status)) 149 | 150 | 151 | def finish(self): 152 | """ 153 | Close out / block for processes. 154 | """ 155 | self.logger.info('Waiting for all %i processes to complete...' % (len(self.process_list))) 156 | 157 | # TODO: Consider more granular failure info here? 158 | exit_codes = [p.wait() for p in self.process_list] 159 | if sum(exit_codes) != 0: 160 | self.logger.critical('A process died! Cannot continue.') 161 | raise ValueError("One of the processes failed! Are you out of RAM (or hitting a system limit?)") 162 | 163 | self.logger.info('All processes done! Yay!') 164 | 165 | 166 | class IntelligentRunner(): 167 | """ 168 | Run the input command string (echo | grep | cut ...) via subprocess, and 169 | catch / discard known false-positive/annoying errors. 170 | """ 171 | known_ignorable_stderr = {'[samopen] SAM header is present:'} 172 | stderr_fp = tempfile.SpooledTemporaryFile() 173 | 174 | def __init__(self, command_string): 175 | self.command_string = command_string 176 | self.command_list = ','.join(shlex.split(command_string)).split(",|,") 177 | 178 | def _check_for_errors(self): 179 | self.stderr_fp.seek(0) 180 | for stdout_line in self.stderr_fp.readlines(): 181 | if stdout_line.strip() not in self.known_ignorable_stderr: 182 | if len(filter(stdout_line.strip().startswith, self.known_ignorable_stderr)) == 0: 183 | print("Was running: %s" % (self.command_string)) 184 | raise ValueError("Unignorable STDERR output: %s" % (stdout_line.strip())) 185 | print(stdout_line.strip()) 186 | 187 | 188 | def run(self): 189 | print(self.command_list) 190 | last_process = Popen(self.command_list[0].split(','), stdout=PIPE, stderr=self.stderr_fp) 191 | for command in self.command_list: 192 | print(command) 193 | last_process = Popen(command.split(','), stdin=last_process.stdout, stdout=PIPE, stderr=self.stderr_fp) 194 | 195 | # Grab the output 196 | output = last_process.communicate()[0] 197 | 198 | self._check_for_errors() 199 | 200 | return output 201 | -------------------------------------------------------------------------------- /bamliquidatorbatch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/bamliquidatorbatch/__init__.py -------------------------------------------------------------------------------- /bamliquidatorbatch/bamliquidator_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import normalize_plot_and_summarize as nps 4 | from flattener import write_tab_for_all 5 | 6 | import argparse 7 | import csv 8 | import datetime 9 | 
import errno 10 | import os 11 | import subprocess 12 | import tables 13 | import logging 14 | import sys 15 | import abc 16 | import collections 17 | import numpy 18 | 19 | from time import time 20 | from os.path import basename 21 | from os.path import dirname 22 | 23 | __version__ = '1.2.0' 24 | 25 | default_black_list = ["chrUn", "_random", "Zv9_", "_hap"] 26 | 27 | def create_files_table(h5file): 28 | class Files(tables.IsDescription): 29 | key = tables.UInt32Col( pos=0) # is there an easier way to assign keys? 30 | length = tables.UInt64Col( pos=2) 31 | # file_name would be included here, but pytables doesn't support variable length strings as table column 32 | # so it is instead in a vlarray "file_names" 33 | 34 | table = h5file.create_table("/", "files", Files, "File keys and reference sequence lengths corresponding " 35 | "to the counts table") 36 | table.flush() 37 | 38 | return table 39 | 40 | def create_file_names_array(h5file): 41 | # vlarray of strings only supports a single column, so the file_key is implicitly the array index 42 | array = h5file.create_vlarray("/", "file_names", tables.VLStringAtom(), 43 | "File names with index corresponding to Files table key") 44 | array.append("*") # index/key 0 is reserved for this 45 | array.flush() 46 | 47 | return array 48 | 49 | def all_bam_file_paths_in_directory(bam_directory): 50 | bam_file_paths = [] 51 | for dirpath, _, files in os.walk(bam_directory, followlinks=True): 52 | for file_ in files: 53 | if file_.endswith(".bam"): 54 | bam_file_paths.append(os.path.join(dirpath, file_)) 55 | return bam_file_paths 56 | 57 | def bam_file_paths_with_no_file_entries(file_names, bam_file_paths): 58 | with_no_counts = [] 59 | 60 | for bam_file_path in bam_file_paths: 61 | if basename(bam_file_path) not in file_names: 62 | with_no_counts.append(bam_file_path) 63 | 64 | return with_no_counts 65 | 66 | # BaseLiquidator is an abstract base class, with concrete classes BinLiquidator and RegionLiquidator 67 | # that implement the abstract methods. 68 | class BaseLiquidator(object): 69 | __metaclass__ = abc.ABCMeta 70 | 71 | @abc.abstractmethod 72 | def liquidate(self, bam_file_path, extension, sense = None): 73 | pass 74 | 75 | @abc.abstractmethod 76 | def normalize(self): 77 | pass 78 | 79 | @abc.abstractmethod 80 | def create_counts_table(self, h5file): 81 | pass 82 | 83 | def __init__(self, executable, counts_table_name, output_directory, bam_file_path, 84 | include_cpp_warnings_in_stderr = True, counts_file_path = None, number_of_threads = 0): 85 | # clear all memoized values from any prior runs 86 | nps.file_keys_memo = {} 87 | 88 | self.timings = collections.OrderedDict() 89 | 90 | self.output_directory = output_directory 91 | self.counts_file_path = counts_file_path 92 | self.include_cpp_warnings_in_stderr = include_cpp_warnings_in_stderr 93 | self.number_of_threads = number_of_threads 94 | self.chromosome_patterns_to_skip = [] 95 | 96 | # This script may be run by either a developer install from a git pipeline checkout, 97 | # or from a user install so that the executable is on the path. First we try to 98 | # find the executable for a developer install, and if that fails we look on the 99 | # standard path.
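# For example, a developer checkout is expected to look like (illustrative
# layout, inferred from the path checks below):
#   .../bamliquidator_internal/bamliquidatorbatch/bamliquidator_batch.py  <- this script
#   .../bamliquidator_internal/bamliquidator_bins                         <- compiled executable
# while a user install just needs the executable somewhere on $PATH.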
100 | if basename(dirname(dirname(os.path.realpath(__file__)))) == 'bamliquidator_internal': 101 | # look for developer executable location 102 | self.executable_path = os.path.join(dirname(dirname(os.path.realpath(__file__))), executable) 103 | if not os.path.isfile(self.executable_path): 104 | exit("%s is missing -- try cd'ing into the directory and running 'make'" % self.executable_path) 105 | else: 106 | # just look on standard path 107 | self.executable_path = executable 108 | 109 | mkdir_if_not_exists(output_directory) 110 | 111 | if self.counts_file_path is None: 112 | self.counts_file_path = os.path.join(output_directory, "counts.h5") 113 | 114 | counts_file = tables.open_file(self.counts_file_path, mode = "w", 115 | title = 'bam liquidator genome read counts - version %s' % __version__) 116 | else: 117 | counts_file = tables.open_file(self.counts_file_path, "r+") 118 | 119 | try: 120 | counts = counts_file.get_node("/", counts_table_name) 121 | files = counts_file.root.files 122 | file_names = counts_file.root.file_names 123 | except: 124 | counts = self.create_counts_table(counts_file) 125 | files = create_files_table(counts_file) 126 | file_names = create_file_names_array(counts_file) 127 | 128 | if os.path.isdir(bam_file_path): 129 | self.bam_file_paths = all_bam_file_paths_in_directory(bam_file_path) 130 | else: 131 | self.bam_file_paths = [bam_file_path] 132 | 133 | self.bam_file_paths = bam_file_paths_with_no_file_entries(file_names, self.bam_file_paths) 134 | 135 | self.preprocess(files, file_names) 136 | 137 | counts_file.close() # bamliquidator_bins/bamliquidator_regions will open this file and modify 138 | # it, so it is probably best that we not hold an out of sync reference 139 | 140 | 141 | # adds files being liquidated to the files table and populates the following member dictionaries: 142 | # 1) file_name -> [(chromosome, sequence length), ...] 143 | # 2) file_name -> total mapped count 144 | # 3) file_name -> file key number 145 | def preprocess(self, files, file_names): 146 | self.file_to_chromosome_length_pairs = {} 147 | self.file_to_count = {} 148 | self.file_to_key = {} 149 | 150 | chr_col = 0 151 | length_col = 1 152 | mapped_read_col = 2 153 | 154 | # bam file keys start at 1. 155 | # key 0 is special and denotes "no specific file", which 156 | # is used in normalized_counts tables to mean an average or total for all bam files 157 | # of a specific cell type. 158 | next_file_key = 0 # see += 1 below 159 | for file_record in files: 160 | next_file_key = max(next_file_key, file_record["key"]) 161 | next_file_key += 1 162 | 163 | for bam_file_path in self.bam_file_paths: 164 | args = ["samtools", "idxstats", bam_file_path] 165 | output = subprocess.check_output(args) 166 | # skip last two lines: the unmapped chromosome line and the empty line 167 | reader = csv.reader(output.split('\n')[:-2], delimiter='\t') 168 | file_name = basename(bam_file_path) 169 | file_count = 0 170 | 171 | chromosome_length_pairs = [] 172 | for row in reader: 173 | chromosome = row[chr_col] 174 | if len(chromosome) >= nps.chromosome_name_length: 175 | raise RuntimeError('Chromosome name "%s" exceeds the max supported chromosome name length (%d). ' 176 | 'This max chromosome length may be updated in the code if necessary -- please ' 177 | 'contact the bamliquidator developers for additional assistance.'
178 | % (chromosome, nps.chromosome_name_length)) 179 | 180 | file_count += int(row[mapped_read_col]) 181 | chromosome_length_pairs.append((chromosome, int(row[length_col]))) 182 | 183 | files.row["key"] = next_file_key 184 | files.row["length"] = file_count 185 | files.row.append() 186 | file_names.append(file_name) 187 | 188 | self.file_to_chromosome_length_pairs[file_name] = chromosome_length_pairs 189 | self.file_to_count[file_name] = file_count 190 | self.file_to_key[file_name] = next_file_key 191 | 192 | next_file_key += 1 193 | 194 | files.flush() 195 | file_names.flush() 196 | assert(len(file_names) - 1 == len(files)) 197 | assert(len(file_names) == next_file_key) 198 | 199 | def batch(self, extension, sense): 200 | for i, bam_file_path in enumerate(self.bam_file_paths): 201 | logging.info("Liquidating %s (file %d of %d)", bam_file_path, i+1, len(self.bam_file_paths)) 202 | 203 | return_code = self.liquidate(bam_file_path, extension, sense) 204 | if return_code != 0: 205 | raise Exception("%s failed with exit code %d" % (self.executable_path, return_code)) 206 | 207 | start = time() 208 | self.normalize() 209 | duration = time() - start 210 | logging.info("Post liquidation processing took %f seconds", duration) 211 | self.log_time('post_liquidation', duration) 212 | 213 | def flatten(self): 214 | logging.info("Flattening HDF5 tables into text files") 215 | start = time() 216 | 217 | with tables.open_file(self.counts_file_path, mode = "r") as counts_file: 218 | write_tab_for_all(counts_file, self.output_directory) 219 | 220 | duration = time() - start 221 | logging.info("Flattening took %f seconds" % duration) 222 | self.log_time('flattening', duration) 223 | 224 | def chromosome_args(self, bam_file_name, skip_non_canonical): 225 | args = [] 226 | for chromosome, length in self.file_to_chromosome_length_pairs[bam_file_name]: 227 | if skip_non_canonical: 228 | if any(pattern in chromosome for pattern in self.chromosome_patterns_to_skip): 229 | continue 230 | args.append(chromosome) 231 | args.append(str(length)) 232 | return args 233 | 234 | def logging_cpp_args(self): 235 | return [os.path.join(self.output_directory, "log.txt"), "1" if self.include_cpp_warnings_in_stderr else "0"] 236 | 237 | def log_time(self, title, seconds): 238 | self.timings[title] = seconds 239 | 240 | def write_timings_to_junit_xml(self): 241 | with open(os.path.join(self.output_directory, 'timings.xml'), 'w') as xml: 242 | xml.write('<testsuite tests="%d">\n' % len(self.timings.keys())) 243 | for title in self.timings: 244 | xml.write('\t<testcase name="%s" time="%f"/>\n' % (title, self.timings[title])) 245 | xml.write('</testsuite>\n') 246 | 247 | class BinLiquidator(BaseLiquidator): 248 | def __init__(self, bin_size, output_directory, bam_file_path, 249 | counts_file_path = None, extension = 0, sense = '.', skip_plot = False, 250 | include_cpp_warnings_in_stderr = True, number_of_threads = 0, blacklist = default_black_list): 251 | self.bin_size = bin_size 252 | self.skip_plot = skip_plot 253 | super(BinLiquidator, self).__init__("bamliquidator_bins", "bin_counts", output_directory, bam_file_path, 254 | include_cpp_warnings_in_stderr, counts_file_path, number_of_threads) 255 | self.chromosome_patterns_to_skip = blacklist 256 | self.batch(extension, sense) 257 | 258 | def liquidate(self, bam_file_path, extension, sense = None): 259 | if sense is None: sense = '.'
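# Note: the bam's parent directory name doubles as its cell type label below,
# e.g. (illustrative path) data/cd4/sample.bam -> cell type "cd4"; a bam with
# no parent directory falls back to "-".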
260 | 261 | cell_type = basename(dirname(bam_file_path)) 262 | if cell_type == '': 263 | cell_type = '-' 264 | bam_file_name = basename(bam_file_path) 265 | args = [self.executable_path, str(self.number_of_threads), cell_type, str(self.bin_size), str(extension), sense, bam_file_path, 266 | str(self.file_to_key[bam_file_name]), self.counts_file_path] 267 | args.extend(self.logging_cpp_args()) 268 | args.extend(self.chromosome_args(bam_file_name, skip_non_canonical=True)) 269 | 270 | start = time() 271 | return_code = subprocess.call(args) 272 | duration = time() - start 273 | 274 | reads = self.file_to_count[bam_file_name] 275 | rate = reads / (10**6) / duration 276 | logging.info("Liquidation completed: %f seconds, %d reads, %f millions of reads per second", duration, reads, rate) 277 | self.log_time('liquidation', duration) 278 | 279 | return return_code 280 | 281 | def normalize(self): 282 | with tables.open_file(self.counts_file_path, mode = "r+") as counts_file: 283 | nps.normalize_plot_and_summarize(counts_file, self.output_directory, self.bin_size, self.skip_plot) 284 | 285 | def create_counts_table(self, h5file): 286 | class BinCount(tables.IsDescription): 287 | bin_number = tables.UInt32Col( pos=0) 288 | cell_type = tables.StringCol(16, pos=1) 289 | chromosome = tables.StringCol(nps.chromosome_name_length, pos=2) 290 | count = tables.UInt64Col( pos=3) 291 | file_key = tables.UInt32Col( pos=4) 292 | 293 | table = h5file.create_table("/", "bin_counts", BinCount, "bin counts") 294 | table.flush() 295 | return table 296 | 297 | class RegionLiquidator(BaseLiquidator): 298 | def __init__(self, regions_file, output_directory, bam_file_path, 299 | region_format=None, counts_file_path = None, extension = 0, sense = '.', 300 | include_cpp_warnings_in_stderr = True, number_of_threads = 0): 301 | self.regions_file = regions_file 302 | self.region_format = region_format 303 | if self.region_format is None: 304 | _, self.region_format = os.path.splitext(regions_file) 305 | if len(self.region_format) > 0 and self.region_format[0] == '.': 306 | self.region_format = self.region_format[1:] 307 | if self.region_format not in ("gff", "bed"): 308 | raise RuntimeError("Only bed and gff region file formats are supported -- %s format specified" 309 | % str(self.region_format)) 310 | 311 | super(RegionLiquidator, self).__init__("bamliquidator_regions", "region_counts", output_directory, 312 | bam_file_path, include_cpp_warnings_in_stderr, counts_file_path, number_of_threads) 313 | 314 | self.batch(extension, sense) 315 | 316 | def liquidate(self, bam_file_path, extension, sense = None): 317 | bam_file_name = basename(bam_file_path) 318 | args = [self.executable_path, str(self.number_of_threads), self.regions_file, str(self.region_format), str(extension), bam_file_path, 319 | str(self.file_to_key[bam_file_name]), self.counts_file_path] 320 | args.extend(self.logging_cpp_args()) 321 | if sense is None: 322 | args.append('_') # _ means use strand specified in region file (or . 
if none specified) 323 | else: 324 | args.append(sense) 325 | args.extend(self.chromosome_args(bam_file_name, skip_non_canonical=False)) 326 | 327 | start = time() 328 | return_code = subprocess.call(args) 329 | duration = time() - start 330 | 331 | logging.info("Liquidation completed: %f seconds", duration) 332 | self.log_time('liquidation', duration) 333 | 334 | return return_code 335 | 336 | def normalize(self): 337 | with tables.open_file(self.counts_file_path, mode = "r+") as counts_file: 338 | nps.normalize_regions(counts_file.root.region_counts, counts_file.root.files) 339 | 340 | def create_counts_table(self, h5file): 341 | class Region(tables.IsDescription): 342 | file_key = tables.UInt32Col( pos=0) 343 | chromosome = tables.StringCol(nps.chromosome_name_length, pos=1) 344 | region_name = tables.StringCol(64, pos=2) 345 | start = tables.UInt64Col( pos=3) 346 | stop = tables.UInt64Col( pos=4) 347 | strand = tables.StringCol(1, pos=5) 348 | count = tables.UInt64Col( pos=6) 349 | normalized_count = tables.Float64Col( pos=7) 350 | 351 | table = h5file.create_table("/", "region_counts", Region, "region counts") 352 | table.flush() 353 | return table 354 | 355 | def write_bamToGff_matrix(output_file_path, h5_region_counts_file_path): 356 | with tables.open_file(h5_region_counts_file_path, "r") as counts_file: 357 | with open(output_file_path, "w") as output: 358 | file_keys = [] 359 | 360 | output.write("GENE_ID\tlocusLine") 361 | for file_record in counts_file.root.files: 362 | file_key = file_record["key"] 363 | file_keys.append(file_key) 364 | output.write("\tbin_1_%s" % counts_file.root.file_names[file_key]) 365 | output.write("\n") 366 | 367 | number_of_files = len(file_keys) 368 | number_of_regions = counts_file.root.region_counts.nrows / number_of_files 369 | 370 | # first loop through all but the last file index, storing those counts 371 | prior_region_counts = numpy.zeros((number_of_regions, number_of_files - 1)) 372 | for col, file_key in enumerate(file_keys[:-1]): 373 | for row, region in enumerate(counts_file.root.region_counts.where("file_key == %d" % file_key)): 374 | prior_region_counts[row, col] = region["normalized_count"] 375 | 376 | # then loop through the last index, 377 | # printing the region columns and the counts for the prior files, 378 | # along with the count for the last index 379 | for row, region in enumerate(counts_file.root.region_counts.where("file_key == %d" % file_keys[-1])): 380 | output.write("%s\t%s(%s):%d-%d" % (region["region_name"], region["chromosome"], 381 | region["strand"], region["start"], region["stop"])) 382 | for col in range(0, number_of_files-1): 383 | output.write("\t%s" % round(prior_region_counts[row, col], 4)) 384 | output.write("\t%s\n" % round(region["normalized_count"], 4)) 385 | 386 | def configure_logging(args): 387 | # Using root logger so we can just do logging.info/warn/error in this and other files. 388 | # If people start using bamliquidator_batch as an imported module, then we should probably 389 | # change this logging to not use the root logger directly. 
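    # With the file handler configured below, an entry in log.txt looks roughly
    # like this (illustrative example, not captured output; the tab comes from
    # the '%(levelname)s\t%(message)s' portion of the format):
    #
    #   2016-01-01 12:00:00 INFO	Liquidating /data/mm1s/sample.bam (file 1 of 3)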
390 |     logger = logging.getLogger()
391 |     logger.setLevel(logging.INFO)
392 | 
393 |     file_handler = logging.FileHandler(os.path.join(args.output_directory, 'log.txt'))
394 |     file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s\t%(message)s',
395 |                                                 datefmt='%Y-%m-%d %H:%M:%S'))
396 | 
397 |     logger.addHandler(file_handler)
398 |     # todo: add bamliquidator version to the starting log message
399 |     logging.info("Starting %s %s with args %s", basename(sys.argv[0]), __version__, vars(args))
400 | 
401 |     # Adding console handler after writing the startup log entry. The startup log could be useful
402 |     # in a file that is being appended to from a prior run, but would be annoying on stderr.
403 | 
404 |     console_handler = logging.StreamHandler()
405 |     if args.quiet:
406 |         console_handler.setLevel(logging.ERROR)
407 |     else:
408 |         console_handler.setLevel(logging.INFO)
409 | 
410 |     class FormatterNotFormattingInfo(logging.Formatter):
411 |         def __init__(self, fmt):
412 |             logging.Formatter.__init__(self, fmt)
413 | 
414 |         def format(self, record):
415 |             if record.levelno == logging.INFO:
416 |                 return record.getMessage()
417 |             return logging.Formatter.format(self, record)
418 | 
419 |     console_handler.setFormatter(FormatterNotFormattingInfo('%(levelname)s\t%(message)s'))
420 |     logger.addHandler(console_handler)
421 | 
422 | def mkdir_if_not_exists(directory):
423 |     try:
424 |         os.mkdir(directory)
425 |     except OSError as exception:
426 |         if exception.errno != errno.EEXIST:
427 |             raise
428 | 
429 | def main():
430 |     parser = argparse.ArgumentParser(description='Count the number of base pair reads in each bin or region '
431 |                                      'in the bam file(s) at the given directory, and then normalize, plot bins, '
432 |                                      'and summarize the counts in the output directory. For additional '
433 |                                      'help, please see https://github.com/BradnerLab/pipeline/wiki')
434 | 
435 |     mut_exclusive_group = parser.add_mutually_exclusive_group()
436 |     mut_exclusive_group.add_argument('-b', '--bin_size', type=int, default=100000,
437 |         help="Number of base pairs in each bin -- the smaller the bin size the longer the runtime and "
438 |              "the larger the data files (default is 100000)")
439 |     mut_exclusive_group.add_argument('-r', '--regions_file',
440 |         help='a region file in either .gff or .bed format')
441 | 
442 |     parser.add_argument('-o', '--output_directory', default='output',
443 |         help='Directory to output the h5, log, gff, tab, and/or html files to. Creates directory if necessary. '
444 |              'May overwrite prior run results if present. Default is "./output".')
445 |     parser.add_argument('-c', '--counts_file', default=None,
446 |         help='HDF5 counts file from a prior run to be appended to. If unspecified, defaults to '
447 |              'creating a new file "counts.h5" in the output directory.')
448 |     parser.add_argument('-f', '--flatten', action='store_true',
449 |         help='flatten all HDF5 tables into tab delimited text files in the output directory, one for each '
450 |              'chromosome (note that HDF5 files can be efficiently queried and used directly -- e.g. please '
451 |              'see http://www.pytables.org/ for easy to use Python APIs and '
452 |              'http://www.hdfgroup.org/products/java/hdf-java-html/hdfview/ for an easy to use GUI for '
453 |              'browsing HDF5 files)')
454 |     parser.add_argument('-e', '--extension', type=int, default=0,
455 |         help='Extends reads by n bp (default is 0)')
456 |     parser.add_argument('--sense', default=None, choices=['+', '-', '.'],
457 |         help="Map to '+' (forward), '-' (reverse) or '.' (both) strands.
For gff regions, default is to use " 458 | "the sense specified by the gff file; otherwise, default maps to both.") 459 | parser.add_argument('-m', '--match_bamToGFF', default=False, action='store_true', 460 | help="match bamToGFF_turbo.py matrix output format, storing the result as matrix.txt in the output folder") 461 | parser.add_argument('--region_format', default=None, choices=['gff', 'bed'], 462 | help="Interpret region file as having the given format. Default is to deduce format from file extension.") 463 | parser.add_argument('--skip_plot', action='store_true', help='Skip generating plots. (This can speed up execution.)') 464 | parser.add_argument('--black_list', nargs='+', type=str, default=default_black_list, 465 | help='One or more (space separated) chromosome patterns to skip during bin liquidation. Default is ' 466 | 'to skip any chromosomes that contain any of the following substrings: %s. ' % " ".join(default_black_list)) 467 | parser.add_argument('-q', '--quiet', action='store_true', 468 | help='Informational and warning output is suppressed so only errors are written to the console (stderr). ' 469 | 'All bamliquidator logs are still written to log.txt in the output directory. This also disables ' 470 | 'samtools error messages to stderr, but a corresponding bamliquidator message should still be logged ' 471 | 'in log.txt.') 472 | parser.add_argument('-n', '--number_of_threads', type=int, default=0, 473 | help='Number of threads to run concurrently during liquidation. Defaults to the total number of logical ' 474 | 'cpus on the system.') 475 | parser.add_argument('--xml_timings', action='store_true', 476 | help='Write performance timings to junit style timings.xml in output folder, which is useful for ' 477 | 'tracking performance over time with automatically generated Jenkins graphs') 478 | parser.add_argument('--version', action='version', version='%s %s' % (basename(sys.argv[0]), __version__)) 479 | parser.add_argument('bam_file_path', 480 | help='The directory to recursively search for .bam files for counting. Every .bam file must ' 481 | 'have a corresponding .bai file at the same location. To count just a single file, ' 482 | 'provide the .bam file path instead of a directory. The parent directory (up to 16 char) of each ' 483 | '.bam file is interpreted as the cell type (e.g. mm1s might be an appropriate directory ' 484 | 'name). Bam files in the same directory are grouped together for plotting. Plots use ' 485 | 'normalized counts, such that all .bam files in the same directory have bin ' 486 | 'counts that add up to 1 for each chromosome. If your .bam files are not in this ' 487 | 'directory format, please consider creating a directory of sym links to your actual ' 488 | '.bam and .bai files. 
If the .bam file already has 1 or more reads in the HDF5 counts file, ' 489 | 'then that .bam file is skipped from liquidation, but is still included in normalization, ' 490 | 'plotting, and summaries.') 491 | 492 | args = parser.parse_args() 493 | 494 | assert(tables.__version__ >= '3.0.0') 495 | 496 | mkdir_if_not_exists(args.output_directory) 497 | 498 | configure_logging(args) 499 | 500 | if args.regions_file is None: 501 | liquidator = BinLiquidator(args.bin_size, args.output_directory, args.bam_file_path, 502 | args.counts_file, args.extension, args.sense, args.skip_plot, 503 | not args.quiet, args.number_of_threads, args.black_list) 504 | else: 505 | if args.counts_file: 506 | raise Exception("Appending to a prior regions counts.h5 file is not supported at this time -- " 507 | "please email the developer if you need this feature") 508 | # non-exhaustive list of items that would need to be handled to get this working: 509 | ## review matrix output, specifically the assumption that each file has the exact same regions in the same order 510 | liquidator = RegionLiquidator(args.regions_file, args.output_directory, args.bam_file_path, 511 | args.region_format, args.counts_file, args.extension, args.sense, 512 | not args.quiet, args.number_of_threads) 513 | 514 | if args.flatten: 515 | liquidator.flatten() 516 | 517 | if args.match_bamToGFF: 518 | if args.regions_file is None: 519 | logging.warning("Ignoring match_bamToGFF argument (this is only supported if a regions file is provided)") 520 | else: 521 | logging.info("Writing bamToGff style matrix.txt file") 522 | start = time() 523 | write_bamToGff_matrix(os.path.join(args.output_directory, "matrix.txt"), liquidator.counts_file_path) 524 | duration = time() - start 525 | logging.info("Writing matrix.txt took %f seconds" % duration) 526 | liquidator.log_time('matrix', duration) 527 | 528 | if args.xml_timings: 529 | liquidator.write_timings_to_junit_xml() 530 | 531 | if __name__ == "__main__": 532 | main() 533 | 534 | ''' 535 | The MIT License (MIT) 536 | 537 | Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 538 | 539 | Permission is hereby granted, free of charge, to any person obtaining a copy 540 | of this software and associated documentation files (the "Software"), to deal 541 | in the Software without restriction, including without limitation the rights 542 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 543 | copies of the Software, and to permit persons to whom the Software is 544 | furnished to do so, subject to the following conditions: 545 | 546 | The above copyright notice and this permission notice shall be included in 547 | all copies or substantial portions of the Software. 548 | 549 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 550 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 551 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 552 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 553 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 554 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 555 | THE SOFTWARE. 
556 | '''
--------------------------------------------------------------------------------
/bamliquidatorbatch/flattener.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import argparse
4 | import csv
5 | import os
6 | import tables
7 | 
8 | def write_tab(table, file_names, output_directory, log=False):
9 |     chromosome_to_file_writer_pair = {}
10 | 
11 |     columns = [col for col in table.colnames if col != "chromosome"]
12 |     columns = [col if col != "file_key" else "file_name" for col in columns]
13 | 
14 |     for row in table:
15 |         chromosome = row["chromosome"]
16 |         if chromosome not in chromosome_to_file_writer_pair:
17 |             tab_file_path = os.path.join(output_directory, table.name + "_" + chromosome + ".tab")
18 |             if log:
19 |                 print "Writing", tab_file_path
20 | 
21 |             tab_file = open(tab_file_path, 'wb')
22 |             writer = csv.writer(tab_file, delimiter='\t')
23 |             writer.writerow(columns)
24 |             chromosome_to_file_writer_pair[chromosome] = (tab_file, writer)
25 |         else:
26 |             _, writer = chromosome_to_file_writer_pair[chromosome]
27 | 
28 |         # translate the file_key column into the corresponding file_name
29 |         row_list = []
30 |         for col in columns:
31 |             if col == "file_name":
32 |                 row_list.append(file_names[row["file_key"]])
33 |             else:
34 |                 row_list.append(row[col])
35 | 
36 |         writer.writerow(row_list)
37 | 
38 |     for tab_file, _ in chromosome_to_file_writer_pair.values():
39 |         tab_file.close()
40 | 
41 | def write_tab_for_all(h5_file, output_directory, log=False):
42 |     for table in h5_file.root:
43 |         if table.name not in ("files", "file_names"):
44 |             write_tab(table, h5_file.root.file_names, output_directory, log)
45 | 
46 | def main():
47 |     parser = argparse.ArgumentParser(description='Writes bamliquidator_batch.py hdf5 tables into tab delimited '
48 |         'text files, one for each chromosome. Note that this is provided as a convenience, but it is hoped that '
49 |         'the hdf5 files will be used directly since they are much more efficient to work with -- e.g. please see '
50 |         'http://www.pytables.org/ for easy to use Python APIs and '
51 |         'http://www.hdfgroup.org/products/java/hdf-java-html/hdfview/ for an easy to use GUI for browsing HDF5 '
52 |         'files. For more info, please see https://github.com/BradnerLab/pipeline/wiki/bamliquidator .')
53 |     parser.add_argument('-t', '--table', default=None, help='the hdf5 table to write out, e.g. "region_counts" for '
54 |         'a regions counts.h5 file, or one of the following for a uniform bins counts.h5 file: "bin_counts", '
55 |         '"normalized_counts", "sorted_summary", or "summary".
If none specified flattens every table in the h5 file, ' 56 | 'using the table name as a file prefix.') 57 | parser.add_argument('h5_file', help='the hdf5 file generated by bamliquidator_batch.py') 58 | parser.add_argument('output_directory', help='directory to store the tab files (must already exist)') 59 | args = parser.parse_args() 60 | 61 | h5_file = tables.open_file(args.h5_file, mode = "r") 62 | 63 | log = True 64 | 65 | if args.table: 66 | table = h5_file.get_node("/" + args.table) 67 | write_tab(table, h5_file.root.file_names, args.output_directory, log) 68 | else: 69 | write_tab_for_all(h5_file, args.output_directory, log) 70 | 71 | h5_file.close() 72 | 73 | if __name__ == "__main__": 74 | main() 75 | 76 | ''' 77 | The MIT License (MIT) 78 | 79 | Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 80 | 81 | Permission is hereby granted, free of charge, to any person obtaining a copy 82 | of this software and associated documentation files (the "Software"), to deal 83 | in the Software without restriction, including without limitation the rights 84 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 85 | copies of the Software, and to permit persons to whom the Software is 86 | furnished to do so, subject to the following conditions: 87 | 88 | The above copyright notice and this permission notice shall be included in 89 | all copies or substantial portions of the Software. 90 | 91 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 92 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 93 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 94 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 95 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 96 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 97 | THE SOFTWARE. 98 | ''' 99 | -------------------------------------------------------------------------------- /bamliquidatorbatch/normalize_plot_and_summarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################## 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | # this software and associated documentation files (the "Software"), to deal in 10 | # the Software without restriction, including without limitation the rights to 11 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 12 | # the Software, and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 20 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 21 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 22 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | #
25 | ##################################################################################
26 | 
27 | from __future__ import division
28 | 
29 | import sys
30 | import os
31 | import argparse
32 | import tables
33 | import scipy.stats as stats
34 | import collections
35 | import logging
36 | 
37 | try:
38 |     import bokeh.plotting as bp  # the plotting code below expects the old bokeh 0.4.x API
39 | except ImportError:
40 |     bp = None
41 | 
42 | 
43 | # note that my initial version didn't do any flush calls, which led to bogus rows being added
44 | # to the normalized_counts table (which was evident when the normalized counts <= 95 + > 95 didn't add up right).
45 | # -- I should probably look into why flush was necessary and/or file a bug with pytables
46 | 
47 | # I also found that create_index doesn't always work (this was causing where statements to not work)
48 | # -- I don't know if this was my fault or a bug in pytables, but I just always use create_csindex instead
49 | 
50 | chromosome_name_length = 64 # Includes 1 for null terminator, so really max of 63 characters.
51 |                             # Note that changing this value requires updating C++ code as well.
52 | 
53 | def delete_all_but_bin_counts_and_files_table(h5file):
54 |     for table in h5file.root:
55 |         if table.name != "bin_counts" and table.name != "files" and table.name != "file_names":
56 |             for index in table.colindexes.values():
57 |                 index.column.remove_index()
58 |             table.remove()
59 | 
60 | def create_normalized_counts_table(h5file):
61 |     class BinCount(tables.IsDescription):
62 |         bin_number = tables.UInt32Col( pos=0)
63 |         cell_type = tables.StringCol(16, pos=1)
64 |         chromosome = tables.StringCol(chromosome_name_length, pos=2)
65 |         count = tables.Float64Col( pos=3)
66 |         percentile = tables.Float64Col( pos=4)
67 |         file_key = tables.UInt32Col( pos=5)
68 | 
69 |     table = h5file.create_table("/", "normalized_counts", BinCount, "normalized bin counts")
70 | 
71 |     table.flush()
72 | 
73 |     return table
74 | 
75 | def all_cell_types(counts):
76 |     types = set()
77 | 
78 |     for row in counts:
79 |         types.add(row["cell_type"])
80 | 
81 |     return types
82 | 
83 | def all_chromosomes(counts):
84 |     chromosomes = collections.OrderedDict()
85 | 
86 |     for row in counts:
87 |         chromosomes[row["chromosome"]] = None
88 | 
89 |     return chromosomes.keys()
90 | 
91 | # todo: if this used the files table and we added the cell_type to the files table, this would be much faster,
92 | # but it is probably necessary to leave cell_type in counts table as well (for queries)
93 | file_keys_memo = {}
94 | def file_keys(counts, cell_type):
95 |     if not cell_type in file_keys_memo:
96 |         file_keys = set()
97 | 
98 |         logging.debug("Getting file keys for cell type %s", cell_type)
99 |         for row in counts.where("cell_type == '%s'" % cell_type):
100 |             file_keys.add(row["file_key"])
101 | 
102 |         file_keys_memo[cell_type] = file_keys
103 | 
104 |         logging.debug("memoizing files for %s: %s", cell_type, str(file_keys_memo[cell_type]))
105 | 
106 |     return file_keys_memo[cell_type]
107 | 
108 | def plot_summaries(output_directory, normalized_counts, chromosomes):
109 |     bp.output_file(output_directory + "/summary.html")
110 | 
111 |     for chromosome in chromosomes:
112 |         plot_summary(normalized_counts, chromosome)
113 | 
114 |     bp.save()
115 | 
116 | def plot_summary(normalized_counts, chromosome):
117 |     logging.debug(" - plotting %s summary", chromosome)
118 | 
119 |     condition = "(file_key == 0) & (chromosome == '%s')" % chromosome
120 | 
121 |     chromosome_count_by_bin = collections.defaultdict(int)
122 |     for row in normalized_counts.where(condition):
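    # normalized_counts.where(...) is a PyTables in-kernel query: the condition
    # string is evaluated with numexpr during the HDF5 scan rather than row by
    # row in Python, which is why these lookups stay fast on large tables.
123 |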
chromosome_count_by_bin[row["bin_number"]] += row["count"] 124 | 125 | num_bins = len(chromosome_count_by_bin) 126 | if num_bins < 2: 127 | logging.info("-- skipping plotting %s because not enough bins (only %d)", chromosome, num_bins) 128 | return 129 | 130 | overall = bp.scatter(chromosome_count_by_bin.keys(), chromosome_count_by_bin.values()) 131 | overall.title = chromosome + " counts per bin across all bam files" 132 | 133 | def plot(output_directory, normalized_counts, chromosome, cell_types): 134 | bp.output_file(output_directory + "/" + chromosome + ".html") 135 | 136 | plot_summary(normalized_counts, chromosome) 137 | 138 | for cell_type in cell_types: 139 | logging.debug(" - plotting %s", cell_type) 140 | 141 | bin_number = [] 142 | count = [] 143 | 144 | condition = "(file_key == 0) & (chromosome == '%s') & (cell_type == '%s')" % (chromosome, cell_type) 145 | 146 | for row in normalized_counts.where(condition): 147 | bin_number.append(row["bin_number"]) 148 | count.append(row["count"]) 149 | 150 | cell_type_plot = bp.scatter(bin_number, count) 151 | cell_type_plot.title = "%s counts per bin" % cell_type 152 | 153 | bp.save() 154 | 155 | def populate_normalized_counts(normalized_counts, counts, file_key, bin_size, files): 156 | total_count = length_for_file_key(files, file_key) 157 | 158 | ''' 159 | Excerpt from Feb 13, 2014 email from Charles Lin: 160 | 161 | We typically report read density in units of reads per million per basepair 162 | 163 | bamliquidator reports counts back in total read positions per bin. To convert that 164 | into reads per million per basepair, we first need to divide by the total million 165 | number of reads in the bam. Then we need to divide by the size of the bin 166 | 167 | So for instance if you have a 1kb bin and get 2500 counts from a bam with 30 million 168 | reads you would calculate density as 2500/1000/30 = 0.083rpm/bp 169 | ''' 170 | factor = (1 / bin_size) * (1 / (total_count / 10**6)) 171 | 172 | for count_row in counts.where("file_key == %d" % file_key): 173 | normalized_counts.row["bin_number"] = count_row["bin_number"] 174 | normalized_counts.row["cell_type"] = count_row["cell_type"] 175 | normalized_counts.row["chromosome"] = count_row["chromosome"] 176 | assert file_key == count_row["file_key"] 177 | normalized_counts.row["file_key"] = file_key 178 | normalized_counts.row["count"] = count_row["count"] * factor 179 | normalized_counts.row["percentile"] = -1 180 | normalized_counts.row.append() 181 | 182 | normalized_counts.flush() 183 | 184 | 185 | def length_for_file_key(files, file_key): 186 | file_rows = files.read_where("key == %d" % file_key) 187 | assert len(file_rows) == 1 188 | return file_rows[0]["length"] 189 | 190 | def normalize_regions(region_counts, files): 191 | logging.info("Normalizing") 192 | 193 | file_key = None 194 | 195 | for row in region_counts: 196 | if row["file_key"] != file_key: 197 | file_key = row["file_key"] 198 | total_count = length_for_file_key(files, file_key) 199 | 200 | region_size = row["stop"] - row["start"] 201 | factor = (1 / region_size) * (1 / (total_count / 10**6)) 202 | 203 | row["normalized_count"] = row["count"] * factor 204 | row.update() 205 | 206 | region_counts.flush() 207 | 208 | # leave off file_key argument to calculate percentiles for the cell_type averaged normalized counts 209 | def populate_percentiles(normalized_counts, cell_type, file_key = 0): 210 | bin_numbers = [] 211 | normalized_count_list = [] 212 | 213 | condition = "(cell_type == '%s') & (file_key == %d)" % (cell_type, 
file_key)
214 | 
215 |     for row in normalized_counts.where(condition):
216 |         bin_numbers.append(row["bin_number"])
217 |         normalized_count_list.append(row["count"])
218 | 
219 |     percentiles = (stats.rankdata(normalized_count_list) - 1) / (len(normalized_count_list)-1) * 100
220 |     # percentiles calculated in bulk as suggested at
221 |     # http://grokbase.com/t/python/python-list/092235vj27/faster-scipy-percentileofscore
222 | 
223 |     for i, row in enumerate(normalized_counts.where(condition)):
224 |         assert bin_numbers[i] == row["bin_number"]
225 |         row["percentile"] = percentiles[i]
226 |         row.update()
227 |     normalized_counts.flush()
228 | 
229 | # the cell type normalized counts are the averages of the genomes in the cell type
230 | def populate_normalized_counts_for_cell_type(normalized_counts, cell_type, file_keys):
231 |     processed_a_single_file = False
232 |     chromosome_to_summed_counts = collections.OrderedDict()
233 | 
234 |     for file_key in file_keys:
235 |         condition = "(file_key == %d) & (cell_type == '%s')" % (file_key, cell_type)
236 |         for row in normalized_counts.where(condition):
237 |             if processed_a_single_file:
238 |                 chromosome_to_summed_counts[row["chromosome"]][row["bin_number"]] += row["count"]
239 |             else:
240 |                 if not chromosome_to_summed_counts.has_key(row["chromosome"]):
241 |                     chromosome_to_summed_counts[row["chromosome"]] = []
242 |                 chromosome_to_summed_counts[row["chromosome"]].append(row["count"])
243 |         processed_a_single_file = True
244 | 
245 |     cell_type_condition = "(file_key == 0) & (cell_type == '%s')" % cell_type
246 | 
247 |     len_file_keys = len(file_keys)
248 | 
249 |     for chromosome, summed_counts in chromosome_to_summed_counts.iteritems():
250 |         for i, summed_count in enumerate(summed_counts):
251 |             normalized_counts.row["bin_number"] = i
252 |             normalized_counts.row["cell_type"] = cell_type
253 |             normalized_counts.row["chromosome"] = chromosome
254 |             normalized_counts.row["file_key"] = 0
255 |             normalized_counts.row["count"] = chromosome_to_summed_counts[chromosome][i] / len_file_keys
256 |             normalized_counts.row["percentile"] = -1
257 |             normalized_counts.row.append()
258 | 
259 |     normalized_counts.flush()
260 | 
261 | def create_summary_table(h5file):
262 |     class Summary(tables.IsDescription):
263 |         bin_number = tables.UInt32Col( pos=0)
264 |         chromosome = tables.StringCol(chromosome_name_length, pos=1)
265 |         avg_cell_type_percentile = tables.Float64Col( pos=2)
266 |         cell_types_gte_95th_percentile = tables.UInt32Col(pos=3)
267 |         cell_types_lt_95th_percentile = tables.UInt32Col( pos=4)
268 |         lines_gte_95th_percentile = tables.UInt32Col( pos=5)
269 |         lines_lt_95th_percentile = tables.UInt32Col( pos=6)
270 |         cell_types_gte_5th_percentile = tables.UInt32Col( pos=7)
271 |         cell_types_lt_5th_percentile = tables.UInt32Col( pos=8)
272 |         lines_gte_5th_percentile = tables.UInt32Col( pos=9)
273 |         lines_lt_5th_percentile = tables.UInt32Col( pos=10)
274 | 
275 |     table = h5file.create_table("/", "summary", Summary, "bin count summary")
276 | 
277 |     table.flush()
278 | 
279 |     return table
280 | 
281 | 
282 | def populate_summary(summary, normalized_counts, chromosome):
283 |     high = 95 # 95th percentile
284 |     low = 5 # 5th percentile
285 | 
286 |     summed_cell_type_percentiles_by_bin = collections.defaultdict(float)
287 |     cell_types_gte_high_percentile_by_bin = collections.defaultdict(int)
288 |     cell_types_lt_high_percentile_by_bin = collections.defaultdict(int)
289 |     lines_gte_high_percentile_by_bin = collections.defaultdict(int)
290 |     lines_lt_high_percentile_by_bin = collections.defaultdict(int)
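    # Note (inferred from populate_normalized_counts_for_cell_type above):
    # rows with file_key == 0 hold the per-cell-type averaged counts, so the
    # cell_types_* tallies below are driven by file_key 0 rows, while the
    # lines_* tallies are driven by the real per-bam rows (file_key >= 1).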
291 |     cell_types_gte_low_percentile_by_bin = collections.defaultdict(int)
292 |     cell_types_lt_low_percentile_by_bin = collections.defaultdict(int)
293 |     lines_gte_low_percentile_by_bin = collections.defaultdict(int)
294 |     lines_lt_low_percentile_by_bin = collections.defaultdict(int)
295 |     lines = set()
296 |     cell_types = set()
297 |     max_bin = 0
298 | 
299 |     # note populating the dictionaries this way is much faster than looping through
300 |     # each bin and finding the matching fraction rows
301 |     for row in normalized_counts.where("chromosome == '%s'" % chromosome):
302 |         bin_number = row["bin_number"]
303 |         max_bin = max(max_bin, bin_number)
304 |         percentile = row["percentile"]
305 | 
306 |         if row["file_key"] == 0:
307 |             cell_types.add(row["cell_type"])
308 |             summed_cell_type_percentiles_by_bin[bin_number] += percentile
309 |             if percentile >= high:
310 |                 cell_types_gte_high_percentile_by_bin[bin_number] += 1
311 |             else:
312 |                 cell_types_lt_high_percentile_by_bin[bin_number] += 1
313 | 
314 |             if percentile >= low:
315 |                 cell_types_gte_low_percentile_by_bin[bin_number] += 1
316 |             else:
317 |                 cell_types_lt_low_percentile_by_bin[bin_number] += 1
318 |         else:
319 |             lines.add(row["file_key"])
320 |             if percentile >= high:
321 |                 lines_gte_high_percentile_by_bin[bin_number] += 1
322 |             else:
323 |                 lines_lt_high_percentile_by_bin[bin_number] += 1
324 | 
325 |             if percentile >= low:
326 |                 lines_gte_low_percentile_by_bin[bin_number] += 1
327 |             else:
328 |                 lines_lt_low_percentile_by_bin[bin_number] += 1
329 | 
330 |     logging.debug(" - populating summary table with calculated summaries")
331 | 
332 |     for bin_number in xrange(max_bin+1):
333 |         summary.row["bin_number"] = bin_number
334 |         summary.row["chromosome"] = chromosome
335 |         summary.row["avg_cell_type_percentile"] = summed_cell_type_percentiles_by_bin[bin_number] / len(cell_types)
336 |         summary.row["cell_types_gte_95th_percentile"] = cell_types_gte_high_percentile_by_bin[bin_number]
337 |         summary.row["cell_types_lt_95th_percentile"] = cell_types_lt_high_percentile_by_bin[bin_number]
338 |         summary.row["lines_gte_95th_percentile"] = lines_gte_high_percentile_by_bin[bin_number]
339 |         summary.row["lines_lt_95th_percentile"] = lines_lt_high_percentile_by_bin[bin_number]
340 |         summary.row["cell_types_gte_5th_percentile"] = cell_types_gte_low_percentile_by_bin[bin_number]
341 |         summary.row["cell_types_lt_5th_percentile"] = cell_types_lt_low_percentile_by_bin[bin_number]
342 |         summary.row["lines_gte_5th_percentile"] = lines_gte_low_percentile_by_bin[bin_number]
343 |         summary.row["lines_lt_5th_percentile"] = lines_lt_low_percentile_by_bin[bin_number]
344 |         summary.row.append()
345 |     summary.flush()
346 | 
347 | def normalize_plot_and_summarize(counts_file, output_directory, bin_size, skip_plot):
348 |     delete_all_but_bin_counts_and_files_table(counts_file)
349 | 
350 |     # recreating the entirety of the remaining tables is quick and easier than updating prior records correctly
351 | 
352 |     counts = counts_file.root.bin_counts
353 |     files = counts_file.root.files
354 |     normalized_counts = create_normalized_counts_table(counts_file)
355 |     summary = create_summary_table(counts_file)
356 | 
357 |     cell_types = all_cell_types(counts)
358 |     chromosomes = all_chromosomes(counts)
359 | 
360 |     logging.info("Cell Types: %s", ", ".join(cell_types))
361 | 
362 |     for cell_type in cell_types:
363 |         logging.info("Normalizing and calculating percentiles for cell type %s", cell_type)
364 |         current_file_keys = file_keys(counts, cell_type)
365 |         for file_key in current_file_keys:
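            # Per-file pass: populate_normalized_counts scales each bin to
            # reads-per-million-per-basepair (count / bin_size / (total reads / 1e6);
            # e.g. 2500 counts in a 1kb bin from a 30 million read bam gives
            # 2500 / 1000 / 30 = ~0.083 rpm/bp), then populate_percentiles ranks
            # the file's bins. The cell type average (file_key 0) is recomputed
            # after all of the cell type's files are processed.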
366 |             populate_normalized_counts(normalized_counts, counts, file_key, bin_size, files)
367 |             populate_percentiles(normalized_counts, cell_type, file_key)
368 |         populate_normalized_counts_for_cell_type(normalized_counts, cell_type, current_file_keys)
369 |         populate_percentiles(normalized_counts, cell_type)
370 | 
371 |     logging.info("Indexing normalized counts")
372 |     normalized_counts.cols.bin_number.create_csindex()
373 |     normalized_counts.cols.percentile.create_csindex()
374 |     normalized_counts.cols.file_key.create_csindex()
375 |     normalized_counts.cols.chromosome.create_csindex()
376 | 
377 |     if not skip_plot:
378 |         if bp is None:
379 |             logging.error('Skipping plotting because plots require bokeh and it is not installed -- '
380 |                           'see https://github.com/BradnerLab/pipeline/wiki/bamliquidator#Install . '
381 |                           'Consider running the following command to install bokeh: '
382 |                           'sudo pip install bokeh==0.4.4 "openpyxl>=1.6.1,<2.0.0"')
383 |         else:
384 |             logging.info("Plotting")
385 |             for chromosome in chromosomes:
386 |                 plot(output_directory, normalized_counts, chromosome, cell_types)
387 |             plot_summaries(output_directory, normalized_counts, chromosomes)
388 | 
389 |     logging.info("Summarizing")
390 |     for chromosome in chromosomes:
391 |         populate_summary(summary, normalized_counts, chromosome)
392 |     summary.cols.avg_cell_type_percentile.create_csindex()
393 | 
394 |     # Iterating over this index in reverse order is hundreds of times slower than iterating
395 |     # in ascending order in my tests, but copying into a reverse sorted table is very fast.
396 |     # So we create a sorted summary table sorted in decreasing percentile order. If we need to
397 |     # iterate in the reverse sorted order, then this sorted_summary table should be used.
398 |     # Otherwise, we should use the summary table (including the case of ascending percentile
399 |     # order, which is fast since the table is indexed by that column). See
400 |     # https://groups.google.com/d/topic/pytables-users/EKMUxghQiPQ/discussion
401 |     sorted_summary = summary.copy(newname="sorted_summary", sortby=summary.cols.avg_cell_type_percentile,
402 |                                   step=-1, checkCSI=True,
403 |                                   title="Summary table sorted in decreasing percentile order")
404 |     sorted_summary.cols.bin_number.create_csindex()
405 | 
406 | def debugging_handler(signal, frame):
407 |     import pdb
408 |     pdb.set_trace()
409 | 
410 | def main():
411 |     parser = argparse.ArgumentParser(description='Calculate and plot normalized bin counts and percentiles. '
412 |         'Normalized counts, percentiles, and summaries are stored in hdf5 tables in the file "normalized_counts.h5". '
413 |         'Plots are stored in .html files.
The hdf5 and html files are stored by default in a new directory "output" '
414 |         '(which can be overridden by argument, see below), and the program aborts if this directory already exists.')
415 |     parser.add_argument('-o', '--output_directory', default='output',
416 |         help='directory to create and output the h5 and/or html files to (aborts if already exists)')
417 |     parser.add_argument('-b', '--bin_size', type=int, default=100000,
418 |         help="Number of base pairs in each bin -- should match the bin size in the bin_counts_h5_file")
419 |     parser.add_argument('-v', '--validate', action='store_true',
420 |         help='validates the previously generated normalization and/or summary tables, returning '
421 |              'non-zero if any problems detected')
422 |     parser.add_argument('-d', '--debug', action='store_true',
423 |         help='enables debugging hooks so ctrl-c (SIGINT) enters debugging instead of halting execution')
424 |     parser.add_argument('--skip_plot', action='store_true', help='skip generating plots (this can speed up execution)')
425 |     parser.add_argument('bin_counts_h5_file', help='the hdf5 file with "bin_counts" and "files" tables as generated by '
426 |         'bamliquidator_batch')
427 |     args = parser.parse_args()
428 | 
429 |     if args.debug:
430 |         import signal
431 |         signal.signal(signal.SIGINT, debugging_handler)
432 | 
433 |     if args.validate:
434 |         sys.exit(validate(args.bin_counts_h5_file))
435 | 
436 |     os.mkdir(args.output_directory)
437 | 
438 |     counts_file = tables.open_file(args.bin_counts_h5_file, "r+")
439 | 
440 |     normalize_plot_and_summarize(counts_file, args.output_directory, args.bin_size, args.skip_plot)
441 | 
442 |     counts_file.close()
443 | 
444 | def validate(counts_file_path):
445 |     counts_file = tables.open_file(counts_file_path, "r")
446 | 
447 |     error_count = 0
448 | 
449 |     counts = counts_file.root.bin_counts
450 |     cell_types = all_cell_types(counts)
451 |     num_cell_types = len(cell_types)
452 |     num_files = 0
453 |     for cell_type in cell_types:
454 |         num_files += len(file_keys(counts, cell_type))
455 | 
456 |     logging.info("Verifying that summary files add up to %d and cell types add up to %d", num_files, num_cell_types)
457 | 
458 |     for row in counts_file.root.summary:
459 |         if (num_cell_types != (row["cell_types_gte_95th_percentile"] + row["cell_types_lt_95th_percentile"])
460 |             or num_cell_types != (row["cell_types_gte_5th_percentile"] + row["cell_types_lt_5th_percentile"])
461 |             or num_files != (row["lines_gte_95th_percentile"] + row["lines_lt_95th_percentile"])
462 |             or num_files != (row["lines_gte_5th_percentile"] + row["lines_lt_5th_percentile"])):
463 |             error_count += 1
464 |             logging.error("Summary row doesn't add up: %s", row[:])
465 | 
466 |     counts_file.close()
467 | 
468 |     if error_count != 0:
469 |         logging.error("%d validation errors", error_count)
470 |         return 1
471 |     return 0
472 | 
473 | if __name__ == "__main__":
474 |     main()
475 | 
476 | 
--------------------------------------------------------------------------------
/blacklists/ce10-blacklist.bed:
--------------------------------------------------------------------------------
1 | chrI 933000 934500
2 | chrI 2542900 2544000
3 | chrI 3171400 3172600
4 | chrI 3664800 3666100
5 | chrI 3989700 3991000
6 | chrI 4544300 4547500
7 | chrI 5152600 5154000
8 | chrI 10130600 10133000
9 | chrI 10208000 10209100
10 | chrI 10216300 10219200
11 | chrI 10266300 10274300
12 | chrI 10946000 10953100
13 | chrI 14453000 14454600
14 | chrI 15059800 15072400
15 | chrII 0 1000
16 | chrII 500900 502100
17 | chrII 694800 696500
18 | chrII 1452500 1453600
19 |
chrII 2569900 2571400 20 | chrII 2897400 2898700 21 | chrII 3466000 3468700 22 | chrII 3796200 3797500 23 | chrII 3942000 3946700 24 | chrII 3962400 3963400 25 | chrII 3993900 3994900 26 | chrII 4284900 4285900 27 | chrII 4640900 4645000 28 | chrII 5144700 5146700 29 | chrII 6506100 6509100 30 | chrII 7444200 7448800 31 | chrII 8287400 8292900 32 | chrII 8975400 8976900 33 | chrII 9631700 9633200 34 | chrII 9809600 9824700 35 | chrII 10335700 10339300 36 | chrII 12843500 12846100 37 | chrII 13598500 13600000 38 | chrII 13939900 13941400 39 | chrII 13984900 13987000 40 | chrII 14324100 14326100 41 | chrII 14336800 14339700 42 | chrII 14992300 14994200 43 | chrII 15277000 15279300 44 | chrIII 414400 415600 45 | chrIII 930600 932400 46 | chrIII 1017900 1020100 47 | chrIII 1269500 1270500 48 | chrIII 1299400 1302900 49 | chrIII 2497000 2501100 50 | chrIII 5353900 5358500 51 | chrIII 7415800 7417800 52 | chrIII 7443900 7449200 53 | chrIII 7594600 7597200 54 | chrIII 8862600 8864100 55 | chrIII 10224200 10226100 56 | chrIII 13778200 13783700 57 | chrIV 906200 907700 58 | chrIV 2828300 2830900 59 | chrIV 3206300 3209500 60 | chrIV 4416200 4421900 61 | chrIV 6357700 6361000 62 | chrIV 6468700 6469800 63 | chrIV 6698000 6699700 64 | chrIV 6714300 6724400 65 | chrIV 7593500 7598300 66 | chrIV 8572900 8581900 67 | chrIV 9045800 9049000 68 | chrIV 10943000 10951200 69 | chrIV 11070500 11076000 70 | chrIV 11610800 11612700 71 | chrIV 11697000 11698000 72 | chrIV 12024000 12025400 73 | chrIV 12169300 12170600 74 | chrIV 12314400 12319500 75 | chrIV 12730500 12731800 76 | chrIV 13360400 13362200 77 | chrIV 13548500 13549900 78 | chrIV 16963300 16964800 79 | chrIV 17059700 17062200 80 | chrV 264300 267300 81 | chrV 1638000 1639300 82 | chrV 3098300 3099700 83 | chrV 3434600 3438800 84 | chrV 4333300 4336600 85 | chrV 5073300 5076300 86 | chrV 5283100 5286100 87 | chrV 6172100 6178000 88 | chrV 6939100 6943200 89 | chrV 7442600 7444800 90 | chrV 7919700 7925000 91 | chrV 7988600 7991500 92 | chrV 8699200 8701900 93 | chrV 9432700 9435500 94 | chrV 10606100 10612000 95 | chrV 12509600 12510900 96 | chrV 14756400 14757500 97 | chrV 14766600 14770500 98 | chrV 16707200 16709400 99 | chrV 17119700 17132600 100 | chrV 17308600 17311700 101 | chrV 17384100 17385800 102 | chrV 17391200 17394500 103 | chrV 18400100 18401700 104 | chrX 109500 114200 105 | chrX 291200 295300 106 | chrX 1752200 1755100 107 | chrX 3007000 3008300 108 | chrX 4026000 4051800 109 | chrX 5056200 5057300 110 | chrX 5914600 5915800 111 | chrX 7076900 7079100 112 | chrX 9186000 9189200 113 | chrX 9438100 9439500 114 | chrX 10361500 10367000 115 | chrX 11785700 11789800 116 | chrX 11886300 11889000 117 | chrX 12277100 12278900 118 | chrX 14388000 14389200 119 | chrX 14907900 14909700 120 | chrX 15226900 15228800 121 | chrX 15807400 15811200 122 | chrX 16758300 16760000 123 | -------------------------------------------------------------------------------- /blacklists/dm3-blacklist.bed: -------------------------------------------------------------------------------- 1 | chr2L 47600 49300 2 | chr2L 982500 984400 3 | chr2L 2885500 2887000 4 | chr2L 4920500 4922400 5 | chr2L 4937900 4941100 6 | chr2L 5171400 5177700 7 | chr2L 6426500 6427500 8 | chr2L 6992200 6996700 9 | chr2L 7345200 7350300 10 | chr2L 8102400 8103400 11 | chr2L 8729600 8731000 12 | chr2L 9899400 9902800 13 | chr2L 9976200 9979800 14 | chr2L 10422300 10423400 15 | chr2L 11992600 11999400 16 | chr2L 12558600 12563800 17 | chr2L 12792200 12794100 18 | chr2L 13522300 13523300 19 | 
chr2L 13650700 13651700 20 | chr2L 15451900 15452900 21 | chr2L 16514400 16518200 22 | chr2L 19576100 19577300 23 | chr2L 19709600 19711500 24 | chr2L 20197000 20201100 25 | chr2L 20458300 20459300 26 | chr2L 20746500 20747500 27 | chr2L 21022300 21023500 28 | chr2L 21416300 21440600 29 | chr2L 21447300 21454900 30 | chr2L 21482700 21485200 31 | chr2L 21499300 21500400 32 | chr2L 21537800 21543500 33 | chr2L 22202600 22203600 34 | chr2L 22377700 22389700 35 | chr2L 22498500 22500400 36 | chr2L 22543700 22546600 37 | chr2L 22574300 22575300 38 | chr2L 22602400 22603500 39 | chr2L 22661200 22663200 40 | chr2L 22752200 22753200 41 | chr2L 22785500 22787400 42 | chr2L 22809200 22810300 43 | chr2L 22855300 22856400 44 | chr2L 22992900 22994200 45 | chr2LHet 14300 15300 46 | chr2LHet 123500 126300 47 | chr2LHet 133300 134300 48 | chr2LHet 187400 188400 49 | chr2LHet 237700 239100 50 | chr2LHet 244500 245500 51 | chr2LHet 252100 253100 52 | chr2LHet 261200 262200 53 | chr2LHet 347600 348600 54 | chr2LHet 358400 359400 55 | chr2LHet 367600 368800 56 | chr2R 100700 101800 57 | chr2R 141300 144400 58 | chr2R 205900 207500 59 | chr2R 241900 242900 60 | chr2R 260200 261200 61 | chr2R 325500 331800 62 | chr2R 376800 387800 63 | chr2R 567900 572400 64 | chr2R 620800 621800 65 | chr2R 654000 657200 66 | chr2R 698200 700000 67 | chr2R 722000 723500 68 | chr2R 875700 876800 69 | chr2R 893300 894300 70 | chr2R 936500 943900 71 | chr2R 992700 997500 72 | chr2R 1108900 1110300 73 | chr2R 1118300 1119300 74 | chr2R 1174600 1175600 75 | chr2R 1280600 1282300 76 | chr2R 1294200 1295500 77 | chr2R 1458600 1459700 78 | chr2R 1540100 1541400 79 | chr2R 2196300 2202100 80 | chr2R 2231100 2236200 81 | chr2R 2276700 2279200 82 | chr2R 2287700 2289800 83 | chr2R 2328300 2329400 84 | chr2R 2341200 2342600 85 | chr2R 3087700 3088700 86 | chr2R 3123500 3134800 87 | chr2R 3714200 3715200 88 | chr2R 4668700 4670700 89 | chr2R 5615500 5617500 90 | chr2R 6072200 6073500 91 | chr2R 6547100 6549000 92 | chr2R 6838200 6840100 93 | chr2R 6909300 6911100 94 | chr2R 7185100 7189400 95 | chr2R 8369000 8370000 96 | chr2R 8707100 8709600 97 | chr2R 9295900 9299100 98 | chr2R 9615700 9623300 99 | chr2R 9989900 9994400 100 | chr2R 10061200 10062400 101 | chr2R 10076600 10083000 102 | chr2R 10246300 10249200 103 | chr2R 10354900 10356800 104 | chr2R 10779500 10780700 105 | chr2R 13035500 13039700 106 | chr2R 13125400 13127200 107 | chr2R 14258700 14260100 108 | chr2R 14464100 14467300 109 | chr2R 14481500 14483500 110 | chr2R 15617000 15618000 111 | chr2R 15627400 15631300 112 | chr2R 15647200 15648300 113 | chr2R 16667500 16672900 114 | chr2R 17701800 17704000 115 | chr2R 18414400 18415800 116 | chr2R 19253300 19255000 117 | chr2R 19294200 19295300 118 | chr2R 20070900 20072200 119 | chr2RHet 0 1000 120 | chr2RHet 579900 580900 121 | chr2RHet 685800 688400 122 | chr2RHet 717100 718100 123 | chr2RHet 908100 912900 124 | chr2RHet 1013300 1015200 125 | chr2RHet 1260000 1261700 126 | chr2RHet 1319600 1322400 127 | chr2RHet 1354100 1355100 128 | chr2RHet 1422600 1424400 129 | chr2RHet 1430900 1435700 130 | chr2RHet 1636500 1637800 131 | chr2RHet 2049300 2050300 132 | chr2RHet 2089000 2090100 133 | chr2RHet 2230300 2231300 134 | chr2RHet 2580100 2581500 135 | chr2RHet 2610100 2611100 136 | chr2RHet 2823800 2824900 137 | chr2RHet 2985000 2986100 138 | chr2RHet 3181000 3183700 139 | chr3L 1245300 1247200 140 | chr3L 1425400 1427300 141 | chr3L 2063900 2069700 142 | chr3L 3899200 3901900 143 | chr3L 4361900 4362900 144 | chr3L 4849900 4850900 
145 | chr3L 5047600 5048600 146 | chr3L 5104600 5105700 147 | chr3L 5456000 5457700 148 | chr3L 5995100 5997500 149 | chr3L 7242000 7243400 150 | chr3L 7372600 7373600 151 | chr3L 7676400 7684700 152 | chr3L 7788500 7789500 153 | chr3L 7913800 7914800 154 | chr3L 8014900 8017000 155 | chr3L 9076600 9077700 156 | chr3L 9385600 9386600 157 | chr3L 9569700 9574100 158 | chr3L 9923100 9930700 159 | chr3L 11322900 11324600 160 | chr3L 11500300 11501300 161 | chr3L 11606200 11612400 162 | chr3L 11961600 11965500 163 | chr3L 13572200 13573200 164 | chr3L 14719900 14721800 165 | chr3L 14818500 14819700 166 | chr3L 15290000 15291500 167 | chr3L 15416900 15419800 168 | chr3L 15548700 15551100 169 | chr3L 15818700 15819700 170 | chr3L 16044500 16046400 171 | chr3L 16592100 16600800 172 | chr3L 16678900 16681600 173 | chr3L 17911500 17914200 174 | chr3L 18522300 18523300 175 | chr3L 20470800 20476600 176 | chr3L 20815200 20817800 177 | chr3L 21367700 21369600 178 | chr3L 21478100 21479400 179 | chr3L 21746300 21747500 180 | chr3L 22092900 22095600 181 | chr3L 22811000 22812700 182 | chr3L 23036000 23037200 183 | chr3L 23134000 23135600 184 | chr3L 23416500 23417700 185 | chr3L 23433200 23434200 186 | chr3L 23490600 23491600 187 | chr3L 23662400 23668400 188 | chr3L 23784200 23785300 189 | chr3L 23819100 23821000 190 | chr3L 23961100 23964900 191 | chr3L 24084700 24095200 192 | chr3L 24162700 24165000 193 | chr3L 24187000 24188800 194 | chr3L 24214000 24215000 195 | chr3L 24364000 24365000 196 | chr3L 24434000 24435200 197 | chr3L 24461000 24463300 198 | chr3L 24496000 24498000 199 | chr3L 24537400 24539300 200 | chr3LHet 72700 74500 201 | chr3LHet 87000 89200 202 | chr3LHet 153100 154400 203 | chr3LHet 278100 279100 204 | chr3LHet 537200 538200 205 | chr3LHet 708600 710200 206 | chr3LHet 773600 776000 207 | chr3LHet 1279900 1285700 208 | chr3LHet 1346200 1347400 209 | chr3LHet 1483500 1485000 210 | chr3LHet 1517000 1518000 211 | chr3LHet 1894100 1895100 212 | chr3LHet 1986900 1988900 213 | chr3LHet 2163700 2164700 214 | chr3LHet 2186300 2187300 215 | chr3LHet 2202200 2207900 216 | chr3LHet 2245500 2252900 217 | chr3R 57400 59400 218 | chr3R 96400 102900 219 | chr3R 198900 199900 220 | chr3R 579100 586900 221 | chr3R 719600 720600 222 | chr3R 829600 832900 223 | chr3R 873600 878700 224 | chr3R 1085200 1086600 225 | chr3R 1165600 1169100 226 | chr3R 1648000 1649200 227 | chr3R 2133300 2134500 228 | chr3R 2335800 2336800 229 | chr3R 2645900 2648700 230 | chr3R 2912900 2914300 231 | chr3R 3177000 3178900 232 | chr3R 3526300 3528800 233 | chr3R 3869500 3870600 234 | chr3R 3920900 3922000 235 | chr3R 4228800 4230700 236 | chr3R 4396900 4399200 237 | chr3R 4726700 4739500 238 | chr3R 5516800 5525600 239 | chr3R 6083200 6085300 240 | chr3R 6210800 6213800 241 | chr3R 6786300 6787700 242 | chr3R 6892800 6893800 243 | chr3R 7197700 7199000 244 | chr3R 7494800 7496100 245 | chr3R 7666900 7668000 246 | chr3R 7785600 7786600 247 | chr3R 8317600 8320200 248 | chr3R 8325500 8336000 249 | chr3R 8537600 8539000 250 | chr3R 9140500 9144600 251 | chr3R 9804300 9806700 252 | chr3R 10072200 10073600 253 | chr3R 10931600 10932600 254 | chr3R 10959500 10964500 255 | chr3R 10991400 10992400 256 | chr3R 11433500 11434500 257 | chr3R 12814000 12820400 258 | chr3R 13859500 13862900 259 | chr3R 14855600 14856900 260 | chr3R 15267300 15268600 261 | chr3R 15601300 15602900 262 | chr3R 15936800 15937800 263 | chr3R 16073600 16074600 264 | chr3R 16379100 16385500 265 | chr3R 17126000 17128300 266 | chr3R 17436400 17437500 267 | 
chr3R 18124200 18125200 268 | chr3R 18276100 18277600 269 | chr3R 18314300 18315300 270 | chr3R 18709600 18710600 271 | chr3R 19359000 19360100 272 | chr3R 19664700 19669400 273 | chr3R 19683000 19684000 274 | chr3R 19903100 19904200 275 | chr3R 20032900 20037400 276 | chr3R 21152900 21154400 277 | chr3R 21224000 21226100 278 | chr3R 21435400 21436400 279 | chr3R 21942100 21943100 280 | chr3R 22922400 22923600 281 | chr3R 22969100 22971000 282 | chr3R 24079300 24081200 283 | chr3R 24584500 24585500 284 | chr3R 25479000 25480100 285 | chr3R 25604600 25605700 286 | chr3R 26064600 26065600 287 | chr3R 26227000 26228900 288 | chr3R 26901000 26904300 289 | chr3R 27157400 27159300 290 | chr3R 27241400 27243600 291 | chr3R 27718700 27720500 292 | chr3R 27806500 27809200 293 | chr3R 27895800 27899700 294 | chr3RHet 54600 55600 295 | chr3RHet 790700 791700 296 | chr3RHet 1034300 1035300 297 | chr3RHet 1275200 1277400 298 | chr3RHet 1346400 1348100 299 | chr3RHet 1358800 1360000 300 | chr3RHet 1371000 1372000 301 | chr3RHet 1416800 1417900 302 | chr3RHet 1492600 1496100 303 | chr3RHet 1518600 1524600 304 | chr3RHet 1599200 1600400 305 | chr3RHet 1721500 1723400 306 | chr3RHet 1778200 1779200 307 | chr3RHet 1828500 1833900 308 | chr3RHet 1867900 1875100 309 | chr3RHet 1950500 1958900 310 | chr3RHet 1983600 1989500 311 | chr3RHet 2109200 2110300 312 | chr3RHet 2268000 2269000 313 | chr3RHet 2450000 2451500 314 | chr4 58500 59500 315 | chr4 228400 229400 316 | chr4 434600 435600 317 | chr4 565900 566900 318 | chr4 612300 615900 319 | chr4 810100 811200 320 | chr4 860800 862000 321 | chr4 928600 929800 322 | chr4 1197100 1198100 323 | chr4 1283400 1300500 324 | chr4 1314800 1328300 325 | chr4 1339700 1341000 326 | chrU 108000 111700 327 | chrU 924100 925700 328 | chrU 984500 986000 329 | chrU 1041000 1042000 330 | chrU 1094700 1095900 331 | chrU 1499100 1503700 332 | chrU 2254700 2255700 333 | chrU 3229400 3230500 334 | chrU 3309700 3314000 335 | chrU 3430700 3432100 336 | chrU 4472900 4476400 337 | chrU 5290000 5303800 338 | chrU 5527800 5529000 339 | chrU 5583400 5584500 340 | chrU 6085800 6087100 341 | chrU 6334300 6335600 342 | chrU 6565500 6567300 343 | chrU 6938200 6939300 344 | chrU 7168400 7169600 345 | chrU 7222200 7223300 346 | chrU 7284400 7285500 347 | chrU 7735700 7736900 348 | chrU 7870600 7873000 349 | chrU 7934000 7935700 350 | chrU 8286900 8288100 351 | chrU 8558700 8559800 352 | chrU 8608600 8610400 353 | chrU 8746800 8748600 354 | chrU 9171000 9172000 355 | chrU 9285700 9286800 356 | chrU 9691100 9692500 357 | chrU 9823800 9825200 358 | chrU 10033000 10034200 359 | chrUextra 4169000 4171000 360 | chrUextra 4484500 4485700 361 | chrUextra 4686700 4687700 362 | chrUextra 5348900 5350700 363 | chrUextra 6549400 6550900 364 | chrUextra 6671700 6672800 365 | chrUextra 8371300 8373000 366 | chrUextra 8510200 8511200 367 | chrUextra 10902600 10904200 368 | chrUextra 13532500 13534000 369 | chrUextra 14386900 14388400 370 | chrUextra 14419500 14421000 371 | chrUextra 17296900 17298500 372 | chrUextra 18430600 18431700 373 | chrUextra 18567800 18569300 374 | chrUextra 19220900 19222200 375 | chrUextra 20209800 20211200 376 | chrUextra 20696200 20698000 377 | chrUextra 21539600 21541500 378 | chrUextra 21942200 21943600 379 | chrUextra 22178200 22179800 380 | chrUextra 22317900 22319400 381 | chrUextra 23209900 23211500 382 | chrUextra 24697200 24698200 383 | chrUextra 24985000 24986100 384 | chrUextra 25004300 25005900 385 | chrUextra 25110400 25111900 386 | chrUextra 25257200 25258900 387 | 
chrUextra 25407300 25408900 388 | chrUextra 25861700 25863300 389 | chrUextra 26159300 26160800 390 | chrUextra 26370300 26371600 391 | chrUextra 26851300 26852900 392 | chrUextra 27076100 27077800 393 | chrUextra 27370400 27371400 394 | chrUextra 27599800 27601000 395 | chrUextra 27638000 27639500 396 | chrUextra 27711600 27713100 397 | chrUextra 27825100 27826700 398 | chrUextra 27871800 27873400 399 | chrUextra 27890400 27891500 400 | chrUextra 27931900 27933400 401 | chrUextra 27954600 27955600 402 | chrUextra 27979700 27980900 403 | chrUextra 28089700 28094700 404 | chrUextra 28106300 28107900 405 | chrUextra 28213100 28214700 406 | chrUextra 28324000 28325100 407 | chrUextra 28366600 28367800 408 | chrUextra 28421900 28423200 409 | chrUextra 28447200 28450100 410 | chrUextra 28456000 28457500 411 | chrUextra 28539300 28542400 412 | chrUextra 28555900 28573200 413 | chrUextra 28581500 28593500 414 | chrUextra 28604700 28606200 415 | chrUextra 28612700 28621800 416 | chrUextra 28635700 28647800 417 | chrUextra 28653100 28654300 418 | chrUextra 28668200 28669500 419 | chrUextra 28702600 28704200 420 | chrUextra 28719800 28727800 421 | chrUextra 28740900 28744300 422 | chrUextra 28751900 28762000 423 | chrUextra 28769300 28771600 424 | chrUextra 28797200 28798500 425 | chrUextra 28810900 28812400 426 | chrUextra 28853400 28855100 427 | chrUextra 28896600 28902100 428 | chrUextra 28908900 28910100 429 | chrUextra 28940100 28944000 430 | chrUextra 28958300 28961600 431 | chrUextra 28977200 28989700 432 | chrX 6300 16700 433 | chrX 103200 105100 434 | chrX 323200 328600 435 | chrX 448700 450900 436 | chrX 458600 459600 437 | chrX 708200 709200 438 | chrX 900500 901700 439 | chrX 1259600 1261500 440 | chrX 1412200 1416800 441 | chrX 1827800 1829700 442 | chrX 1853600 1854800 443 | chrX 2299000 2300700 444 | chrX 2505400 2511600 445 | chrX 3309200 3315200 446 | chrX 3684400 3687600 447 | chrX 3692800 3695800 448 | chrX 3839200 3842100 449 | chrX 4627700 4630000 450 | chrX 4820300 4827000 451 | chrX 4885400 4887400 452 | chrX 6278000 6279000 453 | chrX 6918200 6920700 454 | chrX 7019400 7021300 455 | chrX 7374900 7376200 456 | chrX 8187200 8190700 457 | chrX 10289600 10290600 458 | chrX 10993900 10997100 459 | chrX 11490100 11492100 460 | chrX 11784200 11785200 461 | chrX 12826500 12831800 462 | chrX 13943100 13944300 463 | chrX 13950900 13954400 464 | chrX 14172800 14174600 465 | chrX 14445700 14450600 466 | chrX 15689300 15690500 467 | chrX 15947800 15953500 468 | chrX 17009200 17013300 469 | chrX 19472200 19473700 470 | chrX 19531100 19532300 471 | chrX 19837300 19839200 472 | chrX 20069000 20074400 473 | chrX 20085100 20086700 474 | chrX 20713000 20714400 475 | chrX 21478800 21480700 476 | chrX 21493800 21494900 477 | chrX 21594700 21596700 478 | chrX 21612700 21614600 479 | chrX 21771300 21773200 480 | chrX 21834000 21835900 481 | chrXHet 0 1100 482 | chrXHet 14900 15900 483 | chrXHet 34900 41200 484 | chrXHet 87500 88500 485 | chrXHet 163000 164000 486 | chrXHet 178200 179200 487 | chrXHet 196100 197300 488 | chrYHet 4500 5500 489 | chrYHet 36700 37700 490 | chrYHet 72100 75500 491 | chrYHet 136400 137600 492 | chrYHet 280400 284300 493 | -------------------------------------------------------------------------------- /blacklists/source-info/URLs.txt: -------------------------------------------------------------------------------- 1 | See info at: 2 | https://sites.google.com/site/anshulkundaje/projects/blacklists 3 | 4 | HUMAN (hg19/GRCh37): 
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDacMapabilityConsensusExcludable.bed.gz
5 | Official track at UCSC http://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability
6 | README on how this track was generated: http://www.broadinstitute.org/~anshul/projects/encode/rawdata/blacklists/hg19-blacklist-README.pdf
7 | MOUSE (mm9): http://www.broadinstitute.org/~anshul/projects/mouse/blacklist/mm9-blacklist.bed.gz
8 | WORM (ce10): http://www.broadinstitute.org/~anshul/projects/worm/blacklist/ce10-blacklist.bed.gz
9 | FLY (dm3): http://www.broadinstitute.org/~anshul/projects/fly/blacklist/dm3-blacklist.bed.gz
10 | 
--------------------------------------------------------------------------------
/blacklists/source-info/hg19-blacklist-README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/blacklists/source-info/hg19-blacklist-README.pdf
--------------------------------------------------------------------------------
/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed:
--------------------------------------------------------------------------------
1 | chr1 564449 570371 High_Mappability_island 1000 .
2 | chr1 724136 727043 Satellite_repeat 1000 .
3 | chr1 825006 825115 BSR/Beta 1000 .
4 | chr1 2583334 2634374 Low_mappability_island 1000 .
5 | chr1 4363064 4363242 (CATTC)n 1000 .
6 | chr1 5725866 5736651 Low_mappability_island 1000 .
7 | chr1 16839923 16841396 Low_mappability_island 1000 .
8 | chr1 38077347 38077423 Low_mappability_island 1000 .
9 | chr1 91852785 91853147 LSU-rRNA_Hsa 1000 .
10 | chr1 104163724 104163860 Low_mappability_island 1000 .
11 | chr1 108112972 108113707 LSU-rRNA_Hsa 1000 .
12 | chr1 121351474 121487059 centromeric_repeat 1000 .
13 | chr1 142535434 142543081 Satellite_repeat 1000 .
14 | chr1 142723256 142723968 Low_mappability_island 1000 .
15 | chr1 142792613 142793303 Low_mappability_island 1000 .
16 | chr1 142835822 142837333 Low_mappability_island 1000 .
17 | chr1 143274490 143284340 centromeric_repeat 1000 .
18 | chr1 145277108 145277572 LSU-rRNA_Hsa 1000 .
19 | chr1 149033183 149035829 Satellite_repeat 1000 .
20 | chr1 156186169 156186712 High_Mappability_island 1000 .
21 | chr1 224199390 224204260 Satellite_repeat 1000 .
22 | chr1 233318467 233318516 (CATTC)n 1000 .
23 | chr1 236260366 236260821 Low_mappability_island 1000 .
24 | chr1 237766308 237766764 LSU-rRNA_Hsa 1000 .
25 | chr1 238105345 238105511 Low_mappability_island 1000 .
26 | chr1 238108025 238108378 Low_mappability_island 1000 .
27 | chr1 238108645 238109697 Low_mappability_island 1000 .
28 | chr10 18841533 18862467 (CATTC)n 1000 .
29 | chr10 20035661 20037171 Low_mappability_island 1000 .
30 | chr10 36722282 36723650 Low_mappability_island 1000 .
31 | chr10 38772277 38819357 Satellite_repeat 1000 .
32 | chr10 38868892 38889025 Satellite_repeat 1000 .
33 | chr10 39076515 39155771 Satellite_repeat 1000 .
34 | chr10 42354835 42548642 centromeric_repeat 1000 .
35 | chr10 42596676 42602082 Satellite_repeat 1000 .
36 | chr10 42596700 42602110 Satellite_repeat 1000 .
37 | chr10 42661264 42667623 Satellite_repeat 1000 .
38 | chr10 42790522 42818398 Satellite_repeat 1000 .
39 | chr10 135498649 135502716 Satellite_repeat 1000 .
40 | chr11 6831669 6831838 ALR/Alpha 1000 .
41 | chr11 10529403 10531969 Low_mappability_island 1000 .
42 | chr11 48671444 48902406 centromeric_repeat 1000 .
43 | chr11 48931242 48964015 centromeric_repeat 1000 . 44 | chr11 50318471 50784078 centromeric_repeat 1000 . 45 | chr11 51090700 51374066 centromeric_repeat 1000 . 46 | chr11 51567242 51594226 centromeric_repeat 1000 . 47 | chr11 54694046 55027975 centromeric_repeat 1000 . 48 | chr11 73221660 73221946 Low_mappability_island 1000 . 49 | chr11 85194913 85195322 LSU-rRNA_Hsa 1000 . 50 | chr11 87524468 87525005 Low_mappability_island 1000 . 51 | chr11 103275584 103281729 Low_mappability_island 1000 . 52 | chr11 122874287 122874443 Low_mappability_island 1000 . 53 | chr12 20704285 20704583 SSU-rRNA_Hsa 1000 . 54 | chr12 34372315 34372825 LSU-rRNA_Hsa 1000 . 55 | chr12 34432130 34857010 centromeric_repeat 1000 . 56 | chr12 37989447 38441828 centromeric_repeat 1000 . 57 | chr12 38531376 38531930 LSU-rRNA_Hsa 1000 . 58 | chr12 41757383 41757545 Low_mappability_island 1000 . 59 | chr12 127650407 127651075 LSU-rRNA_Hsa 1000 . 60 | chr12 132061320 132062046 Low_mappability_island 1000 . 61 | chr13 56545728 56545925 Low_mappability_island 1000 . 62 | chr13 110076444 110076782 Low_mappability_island 1000 . 63 | chr14 18999935 19056900 centromeric_repeat 1000 . 64 | chr14 32953263 32954381 Low_mappability_island 1000 . 65 | chr14 84637832 84639038 Low_mappability_island 1000 . 66 | chr14 90341302 90341516 SSU-rRNA_Hsa 1000 . 67 | chr15 19999941 20044132 centromeric_repeat 1000 . 68 | chr16 32493036 32570826 ALR/Alpha 1000 . 69 | chr16 32590063 32598801 ALR/Alpha 1000 . 70 | chr16 33237130 33241330 Low_mappability_island 1000 . 71 | chr16 33864355 34023306 centromeric_repeat 1000 . 72 | chr16 34180542 34197081 Satellite_repeat 1000 . 73 | chr16 34530115 34542632 BSR/Beta 1000 . 74 | chr16 35193580 35285885 centromeric_repeat 1000 . 75 | chr16 46385718 46456668 Satellite_repeat 1000 . 76 | chr16 46497639 46500515 Satellite_repeat 1000 . 77 | chr16 47538629 47539297 LSU-rRNA_Hsa 1000 . 78 | chr17 19355538 19356096 LSU-rRNA_Hsa 1000 . 79 | chr17 19502495 19506773 Low_mappability_island 1000 . 80 | chr17 21905167 21906712 centromeric_repeat 1000 . 81 | chr17 22018524 22032049 Low_mappability_island 1000 . 82 | chr17 22221073 22263006 centromeric_repeat 1000 . 83 | chr17 25263010 25268059 Satellite_repeat 1000 . 84 | chr17 25415551 25417559 telomeric_repeat 1000 . 85 | chr17 31149365 31149981 High_Mappability_island 1000 . 86 | chr17 33478114 33478372 LSU-rRNA_Hsa 1000 . 87 | chr17 41381502 41382591 High_Mappability_island 1000 . 88 | chr17 41463538 41464075 High_Mappability_island 1000 . 89 | chr17 41464478 41465015 snRNA 1000 . 90 | chr17 41465562 41467288 High_Mappability_island 1000 . 91 | chr17 51183038 51183763 Low_mappability_island 1000 . 92 | chr17 55868618 55868752 LSU-rRNA_Hsa 1000 . 93 | chr17 75158031 75158430 LSU-rRNA_Hsa 1000 . 94 | chr18 96416 97552 Satellite_repeat 1000 . 95 | chr18 105658 112233 Satellite_repeat 1000 . 96 | chr18 2842252 2842356 Low_mappability_island 1000 . 97 | chr18 15393801 15393992 centromeric_repeat 1000 . 98 | chr18 18510894 18520356 centromeric_repeat 1000 . 99 | chr18 44126235 44126593 (CATTC)n 1000 . 100 | chr18 45379603 45379864 Low_mappability_island 1000 . 101 | chr18 50319086 50319301 Low_mappability_island 1000 . 102 | chr18 77772846 77773065 LSU-rRNA_Hsa 1000 . 103 | chr19 246006 247844 TAR1 1000 . 104 | chr19 22877614 22877696 SSU-rRNA_Hsa 1000 . 105 | chr19 23235030 23235504 BSR/Beta 1000 . 106 | chr19 24182398 24186210 LSU-rRNA_Hsa 1000 . 107 | chr19 24385474 24633168 centromeric_repeat 1000 . 108 | chr19 27730611 28262682 centromeric_repeat 1000 . 
109 | chr19 36066445 36066810 LSU-rRNA_Hsa 1000 . 110 | chr19 36756398 36800948 centromeric_repeat 1000 . 111 | chr19 37759473 37797722 centromeric_repeat 1000 . 112 | chr19 44914313 44916340 ACRO1 1000 . 113 | chr19 44960681 44962681 ACRO1 1000 . 114 | chr2 739925 740994 Low_mappability_island 1000 . 115 | chr2 49456729 49457067 Low_mappability_island 1000 . 116 | chr2 88124390 88124903 Low_mappability_island 1000 . 117 | chr2 89830421 89880514 Satellite_repeat 1000 . 118 | chr2 90371401 90394776 Satellite_repeat 1000 . 119 | chr2 90443001 90545431 Low_mappability_island 1000 . 120 | chr2 91595080 91616015 Satellite_repeat 1000 . 121 | chr2 92267428 92326280 centromeric_repeat 1000 . 122 | chr2 115695017 115695281 LSU-rRNA_Hsa 1000 . 123 | chr2 117781085 117781300 Low_mappability_island 1000 . 124 | chr2 132966248 132989300 centromeric_repeat 1000 . 125 | chr2 132994855 133007983 ALR/Alpha 1000 . 126 | chr2 133011824 133013298 SSU-rRNA_Hsa 1000 . 127 | chr2 133036250 133040042 LSU-rRNA_Hsa 1000 . 128 | chr2 133044095 133045945 ACRO1 1000 . 129 | chr2 143848503 143848792 Low_mappability_island 1000 . 130 | chr2 148022736 148022878 Low_mappability_island 1000 . 131 | chr2 149639207 149639515 Low_mappability_island 1000 . 132 | chr2 156120500 156120610 Low_mappability_island 1000 . 133 | chr2 162135000 162139241 Low_mappability_island 1000 . 134 | chr2 230045426 230045796 LSU-rRNA_Hsa 1000 . 135 | chr20 26257032 26320267 centromeric_repeat 1000 . 136 | chr20 29517710 29521147 centromeric_repeat 1000 . 137 | chr20 29803876 29833334 centromeric_repeat 1000 . 138 | chr20 55932703 55936114 chrM 1000 . 139 | chr20 62916702 62918053 telomeric_repeat 1000 . 140 | chr21 9647205 9648529 Low_mappability_island 1000 . 141 | chr21 9694896 9704962 centromeric_repeat 1000 . 142 | chr21 9825451 9827612 High_Mappability_island 1000 . 143 | chr21 9827612 9845233 Low_mappability_island 1000 . 144 | chr21 9881895 9882569 TAR1 1000 . 145 | chr21 10084922 10088004 Satellite_repeat 1000 . 146 | chr21 10492876 10493049 Low_mappability_island 1000 . 147 | chr21 10599428 10599915 TAR1 1000 . 148 | chr21 10697886 10860890 centromeric_repeat 1000 . 149 | chr21 11186054 11188131 Satellite_repeat 1000 . 150 | chr21 14338127 14369791 centromeric_repeat 1000 . 151 | chr21 18800575 18800997 (GAGTG)n 1000 . 152 | chr21 27228003 27228242 SSU-rRNA_Hsa 1000 . 153 | chr21 46796081 46796336 Low_mappability_island 1000 . 154 | chr22 16847814 16862659 Satellite_repeat 1000 . 155 | chr22 18876789 18884510 Satellite_repeat 1000 . 156 | chr3 25508897 25509131 Low_mappability_island 1000 . 157 | chr3 73159606 73161131 snRNA 1000 . 158 | chr3 75696297 75699304 BSR/Beta 1000 . 159 | chr3 75717841 75720426 Satellite_repeat 1000 . 160 | chr3 80995858 81014459 ALR/Alpha 1000 . 161 | chr3 90311686 90507410 centromeric_repeat 1000 . 162 | chr3 93504815 93519133 centromeric_repeat 1000 . 163 | chr3 96335934 96337436 Low_mappability_island 1000 . 164 | chr3 160665423 160665642 Low_mappability_island 1000 . 165 | chr3 196625514 196625860 Satellite_repeat 1000 . 166 | chr3 197825427 197834080 Low_mappability_island 1000 . 167 | chr4 9987 12694 telomeric_repeat 1000 . 168 | chr4 12276463 12292424 ALR/Alpha 1000 . 169 | chr4 12641862 12642305 Low_mappability_island 1000 . 170 | chr4 21583630 21583719 (GAATG)n 1000 . 171 | chr4 27732004 27732240 Low_mappability_island 1000 . 172 | chr4 47774268 47774416 Low_mappability_island 1000 . 173 | chr4 49085372 49342114 centromeric_repeat 1000 . 174 | chr4 49488472 49662085 centromeric_repeat 1000 . 
175 | chr4 52659961 52688986 centromeric_repeat 1000 . 176 | chr4 56194229 56194584 Low_mappability_island 1000 . 177 | chr4 65473858 65473941 Low_mappability_island 1000 . 178 | chr4 68264186 68266830 centromeric_repeat 1000 . 179 | chr4 70296565 70296841 LSU-rRNA_Hsa 1000 . 180 | chr4 76807083 76807320 LSU-rRNA_Hsa 1000 . 181 | chr4 78929660 78929920 Low_mappability_island 1000 . 182 | chr4 156374749 156377226 chrM 1000 . 183 | chr4 156384860 156387314 Low_mappability_island 1000 . 184 | chr4 163342479 163342744 Low_mappability_island 1000 . 185 | chr4 190190746 190203442 Low_mappability_island 1000 . 186 | chr4 190801869 190802909 Low_mappability_island 1000 . 187 | chr4 190943802 190943962 Satellite_repeat 1000 . 188 | chr4 190987268 190990949 Satellite_repeat 1000 . 189 | chr4 191026302 191044344 telomeric_repeat 1000 . 190 | chr5 17517177 17600940 Low_mappability_island 1000 . 191 | chr5 21477365 21497415 Low_mappability_island 1000 . 192 | chr5 34177882 34197574 Low_mappability_island 1000 . 193 | chr5 45908253 46411114 centromeric_repeat 1000 . 194 | chr5 49405493 49554574 centromeric_repeat 1000 . 195 | chr5 71146650 71146996 LSU-rRNA_Hsa 1000 . 196 | chr5 79945807 79948223 Low_mappability_island 1000 . 197 | chr5 93903068 93906726 Low_mappability_island 1000 . 198 | chr5 97746525 97746679 Low_mappability_island 1000 . 199 | chr5 99381556 99390873 Low_mappability_island 1000 . 200 | chr5 105889063 105889263 chrM 1000 . 201 | chr5 123095972 123097432 chrM 1000 . 202 | chr5 134258949 134264271 Low_mappability_island 1000 . 203 | chr5 174541634 174542177 SSU-rRNA_Hsa 1000 . 204 | chr6 58735349 58739031 centromeric_repeat 1000 . 205 | chr6 58745955 58780547 centromeric_repeat 1000 . 206 | chr6 61880095 61944008 centromeric_repeat 1000 . 207 | chr6 62189892 62206612 ALR/Alpha 1000 . 208 | chr6 62207809 62230644 ALR/Alpha 1000 . 209 | chr6 62283966 62284581 Low_mappability_island 1000 . 210 | chr6 133593944 133594201 LSU-rRNA_Hsa 1000 . 211 | chr6 137059142 137059326 SSU-rRNA_Hsa 1000 . 212 | chr6 150665074 150665281 SSU-rRNA_Hsa 1000 . 213 | chr6 157731310 157735525 Low_mappability_island 1000 . 214 | chr7 43878355 43878530 TAR1 1000 . 215 | chr7 45291517 45291740 Low_mappability_island 1000 . 216 | chr7 56437808 56442977 Low_mappability_island 1000 . 217 | chr7 57253980 57254183 Low_mappability_island 1000 . 218 | chr7 57255310 57255444 Low_mappability_island 1000 . 219 | chr7 57261829 57261998 Low_mappability_island 1000 . 220 | chr7 57544726 57556913 Satellite_repeat 1000 . 221 | chr7 57811488 57836990 centromeric_repeat 1000 . 222 | chr7 57939184 58055539 centromeric_repeat 1000 . 223 | chr7 61054285 62454680 centromeric_repeat 1000 . 224 | chr7 64059157 64066183 BSR/Beta 1000 . 225 | chr7 64951348 64956223 centromeric_repeat 1000 . 226 | chr7 68201468 68201673 Low_mappability_island 1000 . 227 | chr7 68527370 68527788 LSU-rRNA_Hsa 1000 . 228 | chr7 80962907 80963147 SSU-rRNA_Hsa 1000 . 229 | chr7 100550640 100551321 Low_mappability_island 1000 . 230 | chr7 142372972 142375638 Low_mappability_island 1000 . 231 | chr7 145694403 145694561 Low_mappability_island 1000 . 232 | chr8 155512 157639 TAR1 1000 . 233 | chr8 21455971 21456306 LSU-rRNA_Hsa 1000 . 234 | chr8 32868966 32873279 Low_mappability_island 1000 . 235 | chr8 43092737 43097573 Satellite_repeat 1000 . 236 | chr8 43399486 43843604 centromeric_repeat 1000 . 237 | chr8 46838215 47457541 centromeric_repeat 1000 . 238 | chr8 47739043 47742797 Low_mappability_island 1000 . 239 | chr8 47750844 47776101 BSR/Beta 1000 . 
240 | chr8 56754955 56755418 LSU-rRNA_Hsa 1000 . 241 | chr8 69218401 69218922 LSU-rRNA_Hsa 1000 . 242 | chr8 70602248 70602620 LSU-rRNA_Hsa 1000 . 243 | chr8 77114154 77114389 Low_mappability_island 1000 . 244 | chr8 100508010 100508287 Low_mappability_island 1000 . 245 | chr9 10435 11574 TAR1 1000 . 246 | chr9 4799734 4800000 SSU-rRNA_Hsa 1000 . 247 | chr9 33656606 33659249 Low_mappability_island 1000 . 248 | chr9 42819021 42832395 centromeric_repeat 1000 . 249 | chr9 44070617 44070871 Low_mappability_island 1000 . 250 | chr9 44873123 44902307 centromeric_repeat 1000 . 251 | chr9 45355954 45357644 telomeric_repeat 1000 . 252 | chr9 45435109 45443517 centromeric_repeat 1000 . 253 | chr9 66494170 66494805 TAR1 1000 . 254 | chr9 66767710 66864329 centromeric_repeat 1000 . 255 | chr9 66970914 67005594 centromeric_repeat 1000 . 256 | chr9 67315122 67321036 centromeric_repeat 1000 . 257 | chr9 67789868 67792893 centromeric_repeat 1000 . 258 | chr9 68410775 68435115 Low_mappability_island 1000 . 259 | chr9 69677073 69687998 centromeric_repeat 1000 . 260 | chr9 69689770 69711497 centromeric_repeat 1000 . 261 | chr9 69947961 70011196 centromeric_repeat 1000 . 262 | chr9 70076144 70076855 centromeric_repeat 1000 . 263 | chr9 70318723 70327683 centromeric_repeat 1000 . 264 | chr9 72653073 72653572 Satellite_repeat 1000 . 265 | chr9 78790077 78790255 (GAATG)n 1000 . 266 | chr9 79186574 79187026 LSU-rRNA_Hsa 1000 . 267 | chr9 141019938 141021783 TAR1 1000 . 268 | chrM 1 16571 chrM 1000 . 269 | chrX 55206111 55206740 Low_mappability_island 1000 . 270 | chrX 55207753 55208152 Low_mappability_island 1000 . 271 | chrX 55208300 55208643 Low_mappability_island 1000 . 272 | chrX 55208980 55209208 Low_mappability_island 1000 . 273 | chrX 55209655 55210006 Low_mappability_island 1000 . 274 | chrX 58330488 58330843 centromeric_repeat 1000 . 275 | chrX 58373806 58373962 centromeric_repeat 1000 . 276 | chrX 58377680 58377864 centromeric_repeat 1000 . 277 | chrX 58415350 58416387 centromeric_repeat 1000 . 278 | chrX 58432411 58432680 centromeric_repeat 1000 . 279 | chrX 58485887 58486241 centromeric_repeat 1000 . 280 | chrX 58488898 58494528 centromeric_repeat 1000 . 281 | chrX 58499466 58504235 centromeric_repeat 1000 . 282 | chrX 58506076 58528214 centromeric_repeat 1000 . 283 | chrX 58528184 58536883 centromeric_repeat 1000 . 284 | chrX 58544061 58582415 centromeric_repeat 1000 . 285 | chrX 61681834 61919683 centromeric_repeat 1000 . 286 | chrX 62003205 62041580 centromeric_repeat 1000 . 287 | chrX 83658929 83659019 Low_mappability_island 1000 . 288 | chrX 108297348 108297886 LSU-rRNA_Hsa 1000 . 289 | chrX 114959057 115006437 Low_mappability_island 1000 . 290 | chrX 125605623 125607351 Low_mappability_island 1000 . 291 | chrX 125714985 125715338 Low_mappability_island 1000 . 292 | chrX 125864844 125864980 Low_mappability_island 1000 . 293 | chrX 125865719 125865874 Low_mappability_island 1000 . 294 | chrY 313470 313613 ALR/Alpha 1000 . 295 | chrY 3004989 3005175 LSU-rRNA_Hsa 1000 . 296 | chrY 4212807 4212910 Low_mappability_island 1000 . 297 | chrY 7671817 7694928 BSR/Beta 1000 . 298 | chrY 7726064 7730229 BSR/Beta 1000 . 299 | chrY 7730734 7731598 BSR/Beta 1000 . 300 | chrY 7735811 7752887 BSR/Beta 1000 . 301 | chrY 7785067 7806311 BSR/Beta 1000 . 302 | chrY 7806856 7814704 BSR/Beta 1000 . 303 | chrY 7815230 7820478 BSR/Beta 1000 . 304 | chrY 7829937 7832032 BSR/Beta 1000 . 305 | chrY 7832744 7848695 BSR/Beta 1000 . 306 | chrY 7870343 7873582 BSR/Beta 1000 . 307 | chrY 7874115 7874584 BSR/Beta 1000 . 
308 | chrY 7875409 7885257 BSR/Beta 1000 . 309 | chrY 7886545 7894591 BSR/Beta 1000 . 310 | chrY 7898927 7916812 BSR/Beta 1000 . 311 | chrY 7918790 7921352 BSR/Beta 1000 . 312 | chrY 7926344 7936705 BSR/Beta 1000 . 313 | chrY 7941130 7947438 BSR/Beta 1000 . 314 | chrY 7948790 7964448 BSR/Beta 1000 . 315 | chrY 8179010 8181143 BSR/Beta 1000 . 316 | chrY 8181757 8213330 BSR/Beta 1000 . 317 | chrY 8214629 8215637 BSR/Beta 1000 . 318 | chrY 8220421 8230061 BSR/Beta 1000 . 319 | chrY 8230686 8231546 BSR/Beta 1000 . 320 | chrY 8240772 8265916 BSR/Beta 1000 . 321 | chrY 8291535 8292942 BSR/Beta 1000 . 322 | chrY 8294002 8295175 BSR/Beta 1000 . 323 | chrY 8296944 8321375 BSR/Beta 1000 . 324 | chrY 8325813 8325929 BSR/Beta 1000 . 325 | chrY 8326678 8333466 BSR/Beta 1000 . 326 | chrY 8334027 8342387 BSR/Beta 1000 . 327 | chrY 8356544 8369346 BSR/Beta 1000 . 328 | chrY 8909560 8909925 TAR1 1000 . 329 | chrY 8979478 8979585 Low_mappability_island 1000 . 330 | chrY 9072781 9072993 TAR1 1000 . 331 | chrY 9908430 9925608 centromeric_repeat 1000 . 332 | chrY 9981952 9982126 BSR/Beta 1000 . 333 | chrY 10034864 10036712 SSU-rRNA_Hsa 1000 . 334 | chrY 10040627 10045657 ALR/Alpha 1000 . 335 | chrY 10047773 10052533 ALR/Alpha 1000 . 336 | chrY 10053695 10057722 ALR/Alpha 1000 . 337 | chrY 10059394 10073694 ALR/Alpha 1000 . 338 | chrY 10075082 10075781 ALR/Alpha 1000 . 339 | chrY 10080736 10104539 ALR/Alpha 1000 . 340 | chrY 13104530 13144368 centromeric_repeat 1000 . 341 | chrY 13193966 13196535 Low_mappability_island 1000 . 342 | chrY 13252193 13259484 centromeric_repeat 1000 . 343 | chrY 13290177 13290667 chrM 1000 . 344 | chrY 13445957 13490591 Satellite_repeat 1000 . 345 | chrY 13642186 13749784 Satellite_repeat 1000 . 346 | chrY 13798522 13870984 Satellite_repeat 1000 . 347 | chrY 19691913 19692524 LSU-rRNA_Hsa 1000 . 348 | chrY 19764063 19776198 ALR/Alpha 1000 . 349 | chrY 19780600 19781704 ALR/Alpha 1000 . 350 | chrY 19783669 19796396 ALR/Alpha 1000 . 351 | chrY 19800068 19801419 ALR/Alpha 1000 . 352 | chrY 19808085 19817100 ALR/Alpha 1000 . 353 | chrY 19944298 19944581 TAR1 1000 . 354 | chrY 20235195 20235478 TAR1 1000 . 355 | chrY 20362679 20371694 ALR/Alpha 1000 . 356 | chrY 20378360 20379711 ALR/Alpha 1000 . 357 | chrY 20383383 20396110 ALR/Alpha 1000 . 358 | chrY 20398075 20399179 ALR/Alpha 1000 . 359 | chrY 20403581 20415713 ALR/Alpha 1000 . 360 | chrY 20487248 20487859 LSU-rRNA_Hsa 1000 . 361 | chrY 23124788 23125577 BSR/Beta 1000 . 362 | chrY 23149027 23151205 BSR/Beta 1000 . 363 | chrY 23157969 23158245 BSR/Beta 1000 . 364 | chrY 23159001 23167737 BSR/Beta 1000 . 365 | chrY 23178886 23181770 BSR/Beta 1000 . 366 | chrY 23220740 23223625 BSR/Beta 1000 . 367 | chrY 23234125 23235822 BSR/Beta 1000 . 368 | chrY 23236898 23248080 BSR/Beta 1000 . 369 | chrY 23248729 23248851 BSR/Beta 1000 . 370 | chrY 23899295 23899388 TAR1 1000 . 371 | chrY 23956449 23956628 TAR1 1000 . 372 | chrY 24247659 24247700 TAR1 1000 . 373 | chrY 24630999 24631040 TAR1 1000 . 374 | chrY 24953159 24975657 BSR/Beta 1000 . 375 | chrY 24980997 24991235 BSR/Beta 1000 . 376 | chrY 25022753 25039185 BSR/Beta 1000 . 377 | chrY 25040153 25042421 BSR/Beta 1000 . 378 | chrY 25048332 25059258 BSR/Beta 1000 . 379 | chrY 25060235 25064798 BSR/Beta 1000 . 380 | chrY 25099139 25121882 BSR/Beta 1000 . 381 | chrY 25122419 25160800 BSR/Beta 1000 . 382 | chrY 25182404 25192372 BSR/Beta 1000 . 383 | chrY 25217722 25219409 BSR/Beta 1000 . 384 | chrY 25493588 25495275 BSR/Beta 1000 . 385 | chrY 26148315 26148450 TAR1 1000 . 
386 | chrY 26586905 26609405 BSR/Beta 1000 . 387 | chrY 26614745 26624983 BSR/Beta 1000 . 388 | chrY 26656502 26672934 BSR/Beta 1000 . 389 | chrY 26673902 26676170 BSR/Beta 1000 . 390 | chrY 26682081 26693007 BSR/Beta 1000 . 391 | chrY 26693984 26698547 BSR/Beta 1000 . 392 | chrY 26732883 26755623 BSR/Beta 1000 . 393 | chrY 26756160 26794538 BSR/Beta 1000 . 394 | chrY 26816148 26826116 BSR/Beta 1000 . 395 | chrY 26851466 26853153 BSR/Beta 1000 . 396 | chrY 27109247 27110934 BSR/Beta 1000 . 397 | chrY 27136281 27146249 BSR/Beta 1000 . 398 | chrY 27167859 27206241 BSR/Beta 1000 . 399 | chrY 27206778 27229502 BSR/Beta 1000 . 400 | chrY 27263848 27268411 BSR/Beta 1000 . 401 | chrY 27269388 27280315 BSR/Beta 1000 . 402 | chrY 27286226 27288494 BSR/Beta 1000 . 403 | chrY 27289462 27305895 BSR/Beta 1000 . 404 | chrY 27337415 27347656 BSR/Beta 1000 . 405 | chrY 27352996 27375497 BSR/Beta 1000 . 406 | chrY 27813984 27814119 TAR1 1000 . 407 | chrY 28555026 28555353 TAR1 1000 . 408 | chrY 28784129 28819695 Satellite_repeat 1000 . 409 | chrY 58819367 58917648 (CATTC)n 1000 . 410 | chrY 58971913 58997782 (CATTC)n 1000 . 411 | chrY 59361267 59362785 TAR1 1000 . 412 | -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/0-enhancer-stats.txt: -------------------------------------------------------------------------------- 1 | Statistics for: demo-data/sample-mm10-CD4.bed 2 | SE Signal %: 38 3 | TE Signal %: 62 4 | SE Count: 1329 5 | TE Count: 24794 6 | SE Count %: 5.09 7 | TE Count %: 94.91 8 | Mean SE Size: 35846.22 9 | Mean TE Size: 5104.87 10 | Median SE Size: 31833 11 | Median TE Size: 892.5 12 | -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-cutoff.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-cutoff.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-size-histogram.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-size-histogram.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-te-stretch-vs-nonstretch-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-te-stretch-vs-nonstretch-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-vs-te-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-vs-te-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-vs-te-signal-pie.R.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-vs-te-signal-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/stretch-vs-nonstretch-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/stretch-vs-nonstretch-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/te-size-histogram.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/te-size-histogram.R.png -------------------------------------------------------------------------------- /dist/README.md: -------------------------------------------------------------------------------- 1 | This directory contains ~portable linux binaries for [bedtools](https://github.com/arq5x/bedtools2) and [samtools](https://github.com/samtools/samtools). 2 | 3 | Feel free to provide your own or use pre-installed versions. 4 | -------------------------------------------------------------------------------- /dist/bedtools-2.22.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/bedtools-2.22.0 -------------------------------------------------------------------------------- /dist/bedtools-2.23.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/bedtools-2.23.0 -------------------------------------------------------------------------------- /dist/samtools-0.1.19: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-0.1.19 -------------------------------------------------------------------------------- /dist/samtools-1.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-1.1 -------------------------------------------------------------------------------- /dist/samtools-1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-1.2 -------------------------------------------------------------------------------- /get-SuperEnhancers.R: -------------------------------------------------------------------------------- 1 | ##### 2 | # You can use this script to delineate super-enhancers from a .bed, *only* after it has been 3 | # pre-processed by bamliquidator. Conveniently this can be done by riesling.py, which also 4 | # allows for alternative stratifications of super-enhancers. 
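# As a concrete example using the bundled demo data (the output directory name
# below is an illustrative, hypothetical path -- any writable directory works):
#   Rscript get-SuperEnhancers.R demo-data/sample-mm10-CD4.bed demo-output/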
5 | # 6 | # 7 | # Run as: Rscript get-SuperEnhancers.R input.bed output_directory 8 | # 9 | # 10 | # **Input: A .bed from bamliquidator 11 | # (A .bed with 7 columns, where the 7th column contains the normalized score for ranking.) 12 | # 13 | # ## This input might look like: 14 | # V1 V2 V3 V4 V5 V6 V7 15 | # 1 chr1 3514643 3515351 1 30438 . 0.17688850 16 | # 2 chr1 4426753 4427110 2 4119 . 0.04747231 17 | # 18 | # 19 | # **Output: 20 | # Super-enhancers (defined by the point where the slope of the tangent reaches 1) and 21 | # Stretch enhancers (>3 kb) 22 | # Super-Stretch enhancers (> tangent cutoff AND >3kb) 23 | # 24 | # 25 | # Copyright (c) 2014-2016 Nick Semenkovich . 26 | # https://nick.semenkovich.com/ 27 | # 28 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 29 | # https://gordonlab.wustl.edu/ 30 | # 31 | # This software is released under the MIT License: 32 | # http://opensource.org/licenses/MIT 33 | # 34 | # Source: https://github.com/GordonLab/riesling-pipeline 35 | # 36 | # Includes approaches inspired by: 37 | # https://stackoverflow.com/questions/29642867/drawing-a-tangent-to-the-plot-and-finding-the-x-intercept-using-r 38 | 39 | # Force sane graphics output 40 | options(bitmapType='cairo') 41 | 42 | #### Cleanup 43 | # dev.off(dev.list()["RStudioGD"]) # Clear the graphs 44 | rm(list = ls(all = TRUE)) # Clear all objects from workspace 45 | cat("\014") # Reset the console 46 | 47 | # args[1] is the input bed file we're operating on. 48 | args <- commandArgs(TRUE) 49 | input_filename = args[1] 50 | output_dir = args[2] 51 | 52 | 53 | print(paste('Working on:', input_filename)) 54 | print(paste('Output dir:', output_dir)) 55 | 56 | bed <- read.table(input_filename, sep="\t", header=FALSE) 57 | 58 | ## Looks like: 59 | # V1 V2 V3 V4 V5 V6 V7 60 | # 1 chr1 3514643 3515351 1 30438 . 0.17688850 61 | # 2 chr1 4426753 4427110 2 4119 . 
0.04747231 62 | 63 | if (!is.na(output_dir)) { 64 | print(paste('Current directory is:', getwd())) 65 | print(paste('Setting output directory to:', output_dir)) 66 | setwd(output_dir) 67 | } 68 | 69 | 70 | ## Sort and scale axes 71 | y = sort(bed[,c(7)]*(bed[,c(3)]-bed[,c(2)])) # normalized_counts * width 72 | x = c(1:length(y)) 73 | ynorm = y*length(x)/max(y) 74 | # plot(x, ynorm) 75 | # plot(x, ynorm, log="xy") 76 | 77 | spl <- smooth.spline(x, ynorm) 78 | # pred <- predict(spl) 79 | # lines(pred, col=2) 80 | 81 | ynorm.prime <- diff(ynorm)/diff(x) 82 | # plot(ynorm.prime) 83 | pred.prime <- predict(spl, deriv=1) 84 | # lines(pred.prime$y, col=2) 85 | 86 | ## Find where the slope of the tangent (the spline's 1st derivative) first exceeds 1 87 | se_cutoff <- min(which(pred.prime$y > 1)) 88 | print(paste('Inflection at entry:', se_cutoff)) 89 | print(paste('Corresponding cutoff score:', y[[se_cutoff]])) 90 | 91 | # Use the spline models to plot tangent to that point 92 | pred0 <- predict(spl, x=se_cutoff, deriv=0) 93 | pred1 <- predict(spl, x=se_cutoff, deriv=1) 94 | 95 | # And compute intercepts for graphing 96 | yint <- pred0$y - (pred1$y*se_cutoff) 97 | xint <- -yint/pred1$y 98 | 99 | 100 | ################ Save subpopulation beds 101 | y_se_cutoff = y[[se_cutoff]] 102 | se_population = bed[bed$V7*(bed$V3-bed$V2) >= y_se_cutoff,] 103 | te_population = bed[bed$V7*(bed$V3-bed$V2) < y_se_cutoff,] 104 | 105 | write.table(se_population, file='0-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 106 | write.table(te_population, file='0-te-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 107 | 108 | # >3000 stretch 109 | stretch_cutoff = 3000 110 | stretch_population = bed[bed$V3-bed$V2 >= stretch_cutoff,] 111 | stretch_se_population = se_population[se_population$V3-se_population$V2 >= stretch_cutoff,] 112 | write.table(stretch_population, file='0-stretch-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 113 | write.table(stretch_se_population, file='0-stretch-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 114 | 115 | 116 | ## Save a diagnostic plot 117 | png(paste('se-cutoff.R.png', sep=''), width=800, height=800) 118 | plot(x, ynorm, cex=0.5) 119 | abline(h=0, col=8) # baseline zero 120 | lines(spl, col=2) # spline 121 | abline(h=ynorm[[se_cutoff]], col=8) 122 | abline(v=se_cutoff, col=8) 123 | 124 | points(pred0, col=2, pch=19) # point to predict tangent 125 | lines(x, yint + pred1$y*x, col=3) # tangent (1st deriv. 
of spline at se_cutoff) 126 | points(xint, 0, col=3, pch=19) # x intercept 127 | dev.off() 128 | 129 | ### 130 | # Export pie charts & histograms 131 | ### 132 | 133 | ## First, make some data frames to use ggplot2 134 | se_sizes = se_population$V3-se_population$V2 135 | se_signals = se_population$V7*se_sizes 136 | te_sizes = te_population$V3-te_population$V2 137 | te_signals = te_population$V7*te_sizes 138 | 139 | # size_df = data.frame(c(se_sizes, te_sizes), factor(rep(c('se','te'), c(length(se_sizes), length(te_sizes))))) 140 | # colnames(size_df) <- c('size', 'type') 141 | # signal_df = data.frame(c(se_signals, te_signals), factor(rep(c('se','te'), c(length(se_signals), length(te_signals))))) 142 | # colnames(signal_df) <- c('signal', 'type') 143 | 144 | 145 | # Pie chart of % SE vs % TE [count] 146 | png(paste('se-vs-te-count-pie.R.png', sep=''), width=800, height=800) 147 | pie(c(nrow(se_population), nrow(te_population)), 148 | labels=c(paste('Super Enhancers\n', nrow(se_population)), 149 | paste('Traditional Enhancers\n', nrow(te_population))), 150 | main="Number of Super- vs Traditional Enhancers") 151 | dev.off() 152 | 153 | # Stretch vs Non-Stretch 154 | png(paste('stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800) 155 | pie(c(nrow(stretch_population), nrow(bed)-nrow(stretch_population)), 156 | labels=c(paste('Stretch Enhancers\n', nrow(stretch_population)), 157 | paste('Non-Stretch Enhancers\n', nrow(bed)-nrow(stretch_population))), 158 | main="Number of Stretch vs Non-Stretch Enhancers") 159 | dev.off() 160 | 161 | # And SE stretch / traditional stretch, etc. 162 | png(paste('se-te-stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800) 163 | pie(c(nrow(se_population)-nrow(stretch_se_population), nrow(stretch_se_population), 164 | nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population)), 165 | nrow(stretch_population)-nrow(stretch_se_population)), 166 | labels=c(paste('Super Enhancers\n', nrow(se_population)-nrow(stretch_se_population)), 167 | paste('Super-Stretch Enhancers\n', nrow(stretch_se_population)), 168 | paste('Traditional Enhancers\n', nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population))), 169 | paste('Traditional-Stretch Enhancers\n', nrow(stretch_population)-nrow(stretch_se_population))), 170 | main="Number of Super- vs Traditional Enhancers w/ Stretch >3kb") 171 | dev.off() 172 | 173 | 174 | # Pie chart of % SE Signal vs % TE Signal 175 | total_signal = sum(as.numeric(bed$V5)) 176 | se_signal = sum(as.numeric(se_population$V5)) 177 | # te_signal = sum(as.numeric(te_population$V5)) 178 | se_fraction = round(se_signal/total_signal*100, digits = 1) 179 | te_fraction = 100-se_fraction 180 | 181 | png(paste('se-vs-te-signal-pie.R.png', sep=''), width=800, height=800) 182 | pie(c(se_fraction, te_fraction), 183 | labels=c(paste('Super Enhancers\n', se_fraction, '%', sep = ''), 184 | paste('Traditional Enhancers\n', te_fraction, '%', sep = '')), 185 | main="Signal in Super- vs Traditional Enhancers") 186 | dev.off() 187 | 188 | 189 | ###### Histograms 190 | # Histogram of SE sizes 191 | png(paste('se-size-histogram.R.png', sep=''), width=800, height=800) 192 | hist(se_sizes, breaks=10, 193 | xlab = 'Super-Enhancer Sizes', ylab="Counts", main="Super-Enhancer Size Distribution") 194 | dev.off() 195 | 196 | png(paste('te-size-histogram.R.png', sep=''), width=800, height=800) 197 | hist(te_sizes, breaks=10, 198 | xlab = 'Traditional-Enhancer Sizes', ylab="Counts", main="Traditional-Enhancer Size 
Distribution") 199 | dev.off() 200 | 201 | # Inverse hockeystick, honestly 202 | # qplot(signal, data=signal_df[signal_df$type == 'se',], geom="histogram", 203 | # xlab="Normalized ATAC Signal", ylab="Count") 204 | 205 | ######## Text Diagnostics 206 | # Print some raw statistics to a text file 207 | 208 | fh <- file("0-enhancer-stats.txt", "w") 209 | writeLines(paste("Statistics for:", input_filename), con=fh) 210 | writeLines(paste("SE Signal %:", round(se_fraction,2)), con=fh) 211 | writeLines(paste("TE Signal %:", round(te_fraction,2)), con=fh) 212 | writeLines(paste("SE Count:", nrow(se_population)), con=fh) 213 | writeLines(paste("TE Count:", nrow(te_population)), con=fh) 214 | writeLines(paste("SE Count %:", round(nrow(se_population)/nrow(bed)*100, 2)), con=fh) 215 | writeLines(paste("TE Count %:", round(nrow(te_population)/nrow(bed)*100, 2)), con=fh) 216 | writeLines(paste("Mean SE Size:", round(mean(se_sizes), 2)), con=fh) 217 | writeLines(paste("Mean TE Size:", round(mean(te_sizes), 2)), con=fh) 218 | writeLines(paste("Median SE Size:", median(se_sizes)), con=fh) 219 | writeLines(paste("Median TE Size:", median(te_sizes)), con=fh) 220 | close(fh) 221 | 222 | -------------------------------------------------------------------------------- /helper-scripts/0-merge-fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This script merges one sample across lanes. 5 | # 6 | # This script will *only* be useful if you have the same multiplexed sample loaded into multiple lanes of a flowcell. 7 | # 8 | # This concatenates the same index's pared-end files (L*_R*_* .fastq.gz) across multiple 9 | # lanes into one set of PE files per-sample. 10 | # 11 | # 12 | # Copyright (c) 2014-2016 Nick Semenkovich . 13 | # https://nick.semenkovich.com/ 14 | # 15 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 16 | # https://gordonlab.wustl.edu/ 17 | # 18 | # This software is released under the MIT License: 19 | # http://opensource.org/licenses/MIT 20 | # 21 | # Source: https://github.com/GordonLab/riesling-pipeline 22 | 23 | from __future__ import absolute_import, division, print_function, unicode_literals 24 | 25 | __author__ = 'Nick Semenkovich ' 26 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 27 | __license__ = 'MIT' 28 | __version__ = '1.0.3' 29 | 30 | from collections import OrderedDict 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import glob 35 | import os 36 | import pprint 37 | import re 38 | 39 | 40 | def fastq_map_predict(input_path, verbose=False): 41 | """ 42 | Determine a sane .fastq muti-lane merge strategy. 43 | Fail if we can't merge correctly, if there are remaining files, etc. 44 | 45 | sample file name: Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz 46 | 47 | Args: 48 | input_path: An input path containing .fastq / .fastq.gz files 49 | Returns: 50 | A dict of mappings. 51 | """ 52 | fastq_map_logger = _logshim.getLogger('fastq_map_predict') 53 | 54 | if not os.path.isdir(input_path): 55 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 56 | 57 | all_files = glob.glob(input_path + "/*_R*.fastq.gz") # Ignore index files, must have _R in title 58 | all_files.extend(glob.glob(input_path + "/*_R*.fastq")) 59 | 60 | if len(all_files) == 0: 61 | raise ValueError("Input directory is empty!") 62 | 63 | # Given paired ends, we must always have an even number of input files. 
64 | if len(all_files) % 2 != 0: 65 | raise ValueError("Input directory contains an odd number of files.") 66 | 67 | re_pattern = re.compile(r'^(.*)_L(\d+)_R(\d)_\d+(\.fastq|\.fastq\.gz)$') 68 | 69 | 70 | file_dict = OrderedDict() 71 | 72 | prefixes_seen = [] 73 | lanes_seen = [] 74 | pe_seen = [] 75 | for file in sorted(all_files): 76 | if not os.access(file, os.R_OK): 77 | raise OSError("Cannot read file: %s" % (file)) 78 | 79 | filename_only = file.rsplit('/', 1)[-1] 80 | result = re.match(re_pattern, filename_only) 81 | 82 | file_dict[file] = {'prefix': str(result.group(1)), 83 | 'L': int(result.group(2)), 84 | 'R': int(result.group(3))} 85 | 86 | prefixes_seen.append(file_dict[file]['prefix']) 87 | lanes_seen.append(file_dict[file]['L']) 88 | pe_seen.append(file_dict[file]['R']) 89 | 90 | 91 | # Sanity checking here. Missing files? Other oddities? 92 | if len(file_dict) % len(set(lanes_seen)) != 0: 93 | raise ValueError("Missing or extra file(s)? Saw %d lanes, and %d input files." % 94 | (len(file_dict), len(set(lanes_seen)))) 95 | 96 | if len(set(pe_seen)) != 2: 97 | raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen)))) 98 | 99 | if pe_seen.count(1) != pe_seen.count(2): 100 | raise ValueError("Uneven pairing of paired ends (are you missing a file)? R1 count: %d, R2 count: %d" % 101 | (pe_seen.count(1), pe_seen.count(2))) 102 | 103 | fastq_map_logger.info("Files seen: %d" % (len(all_files))) 104 | fastq_map_logger.info("Samples seen: %d" % (len(set(prefixes_seen)))) 105 | fastq_map_logger.info("Lanes seen: %d" % (len(set(lanes_seen)))) 106 | 107 | merge_strategy = {} 108 | 109 | fastq_map_logger.info("Sample IDs:") 110 | for prefix in sorted(set(prefixes_seen)): 111 | fastq_map_logger.info(" %s" % (prefix)) 112 | 113 | for file in file_dict.iterkeys(): 114 | merge_strategy.setdefault(file_dict[file]['prefix'] + ".PE" + str(file_dict[file]['R']), []).append(file) 115 | 116 | if verbose: 117 | fastq_map_logger.debug("Merge strategy is:") 118 | fastq_map_logger.debug(pprint.pformat(merge_strategy)) 119 | 120 | return merge_strategy 121 | 122 | def fastq_merge(merge_strategy, output_path, disable_parallel=False): 123 | """ 124 | Concatenate multiple fastq files (from multiple lanes) into one. 125 | 126 | :param merge_strategy: 127 | :param output_path: 128 | :return: 129 | """ 130 | merge_log = _logshim.getLogger('fastq_merge') 131 | 132 | if disable_parallel: 133 | shell_job_runner = _script_helpers.ShellJobRunner(merge_log) 134 | else: 135 | shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45) 136 | 137 | for merged_name, merge_inputs in merge_strategy.iteritems(): 138 | merge_input_files = ' '.join(merge_inputs) 139 | merge_log.info('Spawning niced process to merge: %s' % (merged_name)) 140 | for filename in merge_inputs: 141 | assert(" " not in filename) 142 | assert(";" not in filename) # Vague sanity testing for input filenames 143 | merge_log.debug(' Input: %s' % (filename)) 144 | 145 | # WARNING: Using shell has security implications! Don't work on untrusted input filenames. 146 | command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name) 147 | 148 | shell_job_runner.run(command) 149 | 150 | shell_job_runner.finish() 151 | 152 | return True 153 | 154 | 155 | def main(): 156 | # Parse & interpret command line flags. 157 | parser = argparse.ArgumentParser(description='Intelligently merge fastq/fastq.gz files from an Illumina pipeline.' 
158 | ' Merges all L*_R*_* .fastq.gz files into one per sample.', 159 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 160 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 161 | usage='%(prog)s [options]', 162 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 163 | 164 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 165 | help='Input path.', required=True) 166 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 167 | help='Output path.', required=True) 168 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 169 | help='Disable parallel job spawning.') 170 | 171 | # parser.add_argument('--skip-stats', dest="skip_stats", action='store_true', 172 | # help='Skip statistics generation.', required=False) 173 | 174 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 175 | 176 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 177 | help="Do not create a log file.") 178 | 179 | args = parser.parse_args() 180 | 181 | output_path = _script_helpers.setup_output_path(args.output_path) 182 | 183 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 184 | 185 | # Our goal is to intelligently merge .fastq/.fastq.gz output from an Illumina run 186 | # The Illumina standard pipeline splits by barcode w/ semi-predictable filenames we can use, e.g. 187 | # IsoA-M1-CD4_S1_L001_I1_001.fastq.gz # index (discard) 188 | # IsoA-M1-CD4_S1_L001_R1_001.fastq.gz # end 1, lane 1 189 | # IsoA-M1-CD4_S1_L001_R2_001.fastq.gz # end 2, lane 1 190 | # IsoA-M1-CD4_S1_L002_I1_001.fastq.gz # index (discard), lane 2 191 | # IsoA-M1-CD4_S1_L002_R1_001.fastq.gz # end 1, lane 2 192 | # ... 193 | 194 | # TODO: Move some lower glob code up so we can test these functions 195 | merge_strategy = fastq_map_predict(args.input_path, verbose=args.verbose) 196 | 197 | fastq_merge(merge_strategy, args.output_path, disable_parallel=args.no_parallel) 198 | 199 | 200 | 201 | if __name__ == '__main__': 202 | main() 203 | -------------------------------------------------------------------------------- /helper-scripts/3-merge-bam-rmdup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # In case you've sequenced the same sample multiple times, use this script to 5 | # merge those samples together and then rmdup the pooled data. 6 | 7 | # WARNING: This assumes you've used the pipeline up to this point, and have pre-sorted & fixmated .bams. 8 | # Why use this script, when you could merge & rmdup by hand? 9 | # * This just works (so you won't forget anything) 10 | # * Automatic sane file naming 11 | # 12 | # 13 | # Copyright (c) 2014-2016 Nick Semenkovich . 14 | # https://nick.semenkovich.com/ 15 | # 16 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 17 | # https://gordonlab.wustl.edu/ 18 | # 19 | # This software is released under the MIT License: 20 | # http://opensource.org/licenses/MIT 21 | # 22 | # Source: https://github.com/GordonLab/riesling-pipeline 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | __author__ = 'Nick Semenkovich ' 27 | __copyright__ = 'Gordon Lab at Washington University in St. 
Louis' 28 | __license__ = 'MIT' 29 | __version__ = '1.0.3' 30 | 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import os 35 | 36 | # Load our config files 37 | CONFIG = _script_helpers.get_config() 38 | 39 | 40 | def merge_and_rmdup(input_files, output_path, disable_parallel=False): 41 | primary_logger = _logshim.getLogger('merge_and_rmdup') 42 | 43 | # Sanity checks on the input files list 44 | assert(len(input_files) > 1) 45 | # Check files are readable 46 | for filename in input_files: 47 | if not os.access(filename, os.R_OK): 48 | primary_logger.fatal("Unable to read input files.") 49 | raise IOError 50 | 51 | output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files]) 52 | 53 | # Sanity check: maximum output filename length 54 | max_filename_length = os.statvfs(output_path).f_namemax 55 | if max_filename_length < 100: 56 | primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?") 57 | raise IOError 58 | 59 | if (len(output_file_name) + 10) > max_filename_length: # roughly truncate filename for sanity. 60 | primary_logger.critical("Very long filename! Truncating!") 61 | output_file_name = output_file_name[:-20] # Give us some extra room for downstream stuff? 62 | 63 | output_file_name += ".merged.bam" 64 | 65 | input_file_string = ' '.join(input_files) 66 | 67 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 68 | 69 | primary_logger.debug('Input file string: %s' % (input_file_string)) 70 | primary_logger.debug('Working on merge as: %s' % (output_file_name)) 71 | # This is pretty fast and has minimal memory usage. Yay! 72 | # We're probably re-rmduping some files if we're merging. That's ok since this is speedy. 73 | command = "%s merge -u - %s | %s rmdup - %s 2>%s" 74 | 75 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], 76 | input_file_string, 77 | CONFIG['binaries']['samtools_legacy'], # TODO: Update this when samtools is fixed. 78 | output_path + "/" + output_file_name, 79 | output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log')) 80 | 81 | shell_job_runner.finish() 82 | 83 | 84 | primary_logger.info('Merge and rmdup complete!') 85 | 86 | 87 | def main(): 88 | # Parse & interpret command line flags. 89 | parser = argparse.ArgumentParser(description='Pool multiple .bams together for the same sample.' 90 | ' Note: This is *only* necessary if you sequenced the same sample multiple times.', 91 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 92 | "Washington University in St. Louis: https://gordonlab.wustl.edu.", 93 | usage='%(prog)s [options]', 94 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 95 | 96 | parser.add_argument('--input-files', '-i', dest="input_files", metavar='input_dir/', type=str, 97 | help='Input files. 
(Not just a path!)', required=True, nargs='+') 98 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 99 | help='Output path.', required=True) 100 | 101 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 102 | 103 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 104 | help="Do not create a log file.") 105 | 106 | args = parser.parse_args() 107 | 108 | output_path = _script_helpers.setup_output_path(args.output_path) 109 | 110 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 111 | 112 | merge_and_rmdup(args.input_files, output_path) 113 | 114 | 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /helper-scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Optional & Helper Scripts 2 | 3 | * `0-merge-fastq.py`: Intelligently merge across lanes, for multiple-lane samples (e.g. one multiplexed sample loaded into multiple lanes). 4 | This will only be useful if you've loaded the *same, multiplexed sample* into multiple lanes of an Illumina flowcell. 5 | 6 | This script concatenates the same index's paired-end files (L*_R*_* .fastq.gz) across multiple lanes into one set of PE files per-sample. 7 | 8 | * `3-merge-bam-rmdup.py`: A helper script to blindly concatenate and deduplicate multiple sets of BAMs. 9 | 10 | * `hdf5_to_counts_table.py`: This script exists to convert .hdf5 files from bamliquidator into counts tables readable by R. 11 | -------------------------------------------------------------------------------- /helper-scripts/hdf5_to_counts_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This script exists to convert .hdf5 files into counts tables readable by R. 5 | # 6 | # It is useful for performing differential accessibility analyses (e.g. with 7 | # DESeq2) on hdf5 counts data from RIESLING, ROSE, or bamliquidator. 8 | # 9 | # 10 | # Unfortunately, we can't use the rhdf5 package, since it doesn't support the 11 | # datatypes used by our .hdf5 files. 12 | # 13 | # 14 | # Copyright (c) 2014-2016 Nick Semenkovich . 15 | # https://nick.semenkovich.com/ 16 | # 17 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 18 | # http://gordonlab.wustl.edu/ 19 | # 20 | # This software is released under the MIT License: 21 | # http://opensource.org/licenses/MIT 22 | # 23 | # Source: https://github.com/GordonLab/riesling-pipeline 24 | 25 | from __future__ import absolute_import, division, print_function, unicode_literals 26 | 27 | __author__ = 'Nick Semenkovich ' 28 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 29 | __license__ = 'MIT' 30 | __version__ = '1.0.3' 31 | 32 | import argparse 33 | import csv 34 | import fnmatch 35 | import operator 36 | import os 37 | import pybedtools 38 | import tables 39 | import _logshim 40 | import _script_helpers 41 | from collections import deque, OrderedDict 42 | 43 | 44 | CONFIG = _script_helpers.get_config() 45 | 46 | 47 | # TODO: Modularize this function. This code is repeated in a *lot* of scripts. 48 | def get_input_files(input_path): 49 | """ 50 | Generate a list of all input files. 51 | 52 | :param input_path: A directory with .h5 files. (e.g. 
/tmp/) 53 | :return: a list of all .h5 files with absolute paths. (e.g. ['/tmp/a.h5'] ) 54 | """ 55 | if not os.path.isdir(input_path): 56 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 57 | 58 | # Adapted from: 59 | # https://stackoverflow.com/questions/2186525/use-a-glob-to-find-files-recursively-in-python 60 | all_files = [] 61 | for root, _, filenames in os.walk(input_path): 62 | for filename in fnmatch.filter(filenames, '*.h5'): 63 | all_files.append(os.path.join(root, filename)) 64 | 65 | if len(all_files) == 0: 66 | raise ValueError("Input directory contains no .h5 files!") 67 | 68 | return all_files 69 | 70 | 71 | def flatten_tsv(filename): 72 | """ 73 | Flatten a TSV file -- parse and concatenate identical row names by summing their values. 74 | """ 75 | flatlog = _logshim.getLogger('flatten_tsv') 76 | 77 | flatlog.debug('Flattening input file: %s' % (filename)) 78 | 79 | data_dict = OrderedDict() 80 | 81 | with open(filename, 'r') as tsv_ro_fh: 82 | tsv_input = csv.reader(tsv_ro_fh, delimiter=str("\t")) 83 | 84 | header = next(tsv_input, None) 85 | 86 | for row in tsv_input: 87 | row_key = row[0] 88 | these_row_values_as_int = map(int, row[1:]) 89 | if row_key in data_dict: 90 | # Add the current row values to the existing values 91 | data_dict[row_key] = map(operator.add, data_dict[row_key], these_row_values_as_int) 92 | else: 93 | data_dict[row_key] = these_row_values_as_int 94 | 95 | # Write back the parsed dict 96 | with open(filename, 'wb') as tsv_rw_fh: 97 | tsv_writer = csv.writer(tsv_rw_fh, delimiter=str("\t")) 98 | tsv_writer.writerow(header) 99 | 100 | for key, val in data_dict.iteritems(): 101 | tsv_writer.writerow([key] + val) 102 | 103 | 104 | 105 | def parse_h5files(input_files, annotationBedTool, overwrite, flatten, density, normalized, sizescaled): 106 | h5logger = _logshim.getLogger('parse_h5files') 107 | 108 | assert(not (density and normalized)) 109 | total_file_count = len(input_files) 110 | h5logger.info('Parsing a total of: %d file(s)' % (total_file_count)) 111 | 112 | output_suffix_list = ['tsv'] 113 | 114 | annotating_regions = False 115 | if annotationBedTool: 116 | annotating_regions = True 117 | output_suffix_list.append('annotated') 118 | 119 | if normalized: 120 | output_suffix_list.append('normalized') 121 | elif density: 122 | output_suffix_list.append('density') 123 | elif sizescaled: 124 | output_suffix_list.append('sizescaled') 125 | 126 | output_suffix = '.'.join(reversed(output_suffix_list)) 127 | 128 | # Cache regions that we're annotating, maybe. 129 | region_annotation_cache = {} 130 | 131 | for this_file_count, file in enumerate(input_files): 132 | h5logger.info('\tParsing: %s (%d/%d)' % (file, this_file_count + 1, total_file_count)) 133 | 134 | output_filename = file + '.' + output_suffix 135 | 136 | if not overwrite and os.path.isfile(output_filename): 137 | h5logger.warn('Skipping this .h5 as output .tsv already exists: %s' % (output_filename)) 138 | continue 139 | 140 | # TODO: Modularize H5FD_CORE (the in-memory driver?) 141 | with tables.open_file(file, mode="r", driver="H5FD_CORE") as h5_object: 142 | assert(h5_object.title.startswith("bam liquidator genome read counts")) # Some sanity checking 143 | assert(h5_object.root.file_names[0] == "*") 144 | 145 | bam_filename_header = h5_object.root.file_names[1:] 146 | bam_filename_header.insert(0, 'region') 147 | 148 | # Note: len(files) = len(file_names) - 1, since file_names has a 'wildcard' first entry. 
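            # Worked example (hypothetical sizes): with three .bam files, file_names
            # holds 4 entries ('*' plus three names) while files holds 3, so a
            # region_counts table of 30,000 rows gives 30000 / 3 = 10000 regions per file.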
149 | number_of_regions = int(len(h5_object.root.region_counts) / len(h5_object.root.files)) 150 | 151 | # We expect this .h5 object's region_counts to contain: 152 | # /region_counts (Table(SIZE,)) 'region counts' 153 | # description := { 154 | # "file_key": UInt32Col(shape=(), dflt=0, pos=0), 155 | # "chromosome": StringCol(itemsize=64, shape=(), dflt='', pos=1), 156 | # "region_name": StringCol(itemsize=64, shape=(), dflt='', pos=2), 157 | # "start": UInt64Col(shape=(), dflt=0, pos=3), 158 | # "stop": UInt64Col(shape=(), dflt=0, pos=4), 159 | # "strand": StringCol(itemsize=1, shape=(), dflt='', pos=5), 160 | # "count": UInt64Col(shape=(), dflt=0, pos=6), 161 | # "normalized_count": Float64Col(shape=(), dflt=0.0, pos=7)} 162 | # byteorder := 'little' 163 | # chunkshape := (NNN,) 164 | counts = h5_object.root.region_counts 165 | 166 | with open(output_filename, 'wb') as tsv_output: 167 | tsvwriter = csv.writer(tsv_output, delimiter=str("\t")) 168 | tsvwriter.writerow(bam_filename_header) 169 | 170 | if annotating_regions: 171 | h5logger.debug('Generating .bed annotations from provided genome.') 172 | region_to_gene = {} 173 | # Perform one annotation rapidly for all regions in the .hdf5 174 | hdf5_positions_only = [] 175 | 176 | for region_number in range(0, number_of_regions): 177 | hdf5_positions_only.append(counts[region_number][1] + ' ' + str(counts[region_number][3]) + ' ' + str(counts[region_number][4])) 178 | 179 | hdf5_positions_only_hashkey = ''.join(hdf5_positions_only) 180 | 181 | if hdf5_positions_only_hashkey in region_annotation_cache: 182 | # The genome doesn't change mid run, so we cache only on hdf5_positions 183 | region_to_gene = region_annotation_cache[hdf5_positions_only_hashkey] 184 | h5logger.debug('Annotation from cache.') 185 | else: 186 | hdf5_stub_bed = pybedtools.BedTool('\n'.join(hdf5_positions_only), from_string=True) 187 | 188 | annotated_bed = hdf5_stub_bed.closest(annotationBedTool, t='first') 189 | 190 | for locus in annotated_bed: 191 | region_to_gene[locus.chrom + ':' + str(locus.start) + '-' + str(locus.end)] = locus.fields[11].split('"')[1] 192 | 193 | region_annotation_cache[hdf5_positions_only_hashkey] = region_to_gene 194 | h5logger.debug('Annotation completed.') 195 | 196 | 197 | # We're going to aggressively access the hdf5 at a bunch of fixed offsets. 198 | # rowarray = [counts[number_of_regions*0 + i], counts[number_of_regions*1 + i], counts[number_of_regions*2 + i] ...] 199 | 200 | number_of_files = len(h5_object.root.files) 201 | working_deque = deque(maxlen=number_of_files + 1) 202 | 203 | # Here, we loop over every "region"/locus (every entry in the first column of the .tsv) 204 | # And then (within this loop) jump to each individual "file" (the hdf5 can contain multiple 205 | # separate samples) to build the data for every row. 206 | for region_number in range(0, number_of_regions): 207 | # Prefix the row with chrN:bpSTART-bpEND e.g. chr4:100-2000 208 | locus_name = counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4]) 209 | 210 | # Sanity checking, in case the input is nuts 211 | feature_width = counts[region_number][4] - counts[region_number][3] 212 | assert(feature_width > 0) 213 | 214 | # DESeq2 requires each region have a unique name. 215 | # You can either append a unique value, or aggregate identical loci. 216 | # We address this later by re-opening and aggregating. 
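                    # (Illustration with a hypothetical gene name: two distinct peaks can
                    # both be nearest to 'Actb', producing two 'Actb' rows here; --flatten
                    # later sums such duplicate rows, since DESeq2/edgeR need unique names.)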
217 | if annotating_regions: 218 | working_deque.append(region_to_gene[locus_name]) 219 | else: 220 | working_deque.append(locus_name) 221 | #rowarray = [counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])] 222 | 223 | for file_number in range(0, number_of_files): 224 | if normalized: 225 | # Standard normalized (counts/mreads) 226 | # bamliquidator gives us (counts/mreads)/width so we multiply by width 227 | working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * feature_width)) 228 | elif density: 229 | # (counts/mreads)/width 230 | # We upscale the fractional normalized count values by an arbitrary amount, 231 | # because subsequent analyses like integers. 232 | working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * 10000)) 233 | elif sizescaled: 234 | # counts/width 235 | # We upscale the fractional normalized count values by an arbitrary amount, 236 | # because subsequent analyses like integers. 237 | working_deque.append(int(counts[number_of_regions * file_number + region_number][6] / feature_width * 100)) 238 | else: 239 | working_deque.append(int(counts[number_of_regions * file_number + region_number][6])) 240 | 241 | tsvwriter.writerow(working_deque) 242 | 243 | if flatten: 244 | flatten_tsv(output_filename) 245 | 246 | h5logger.info('Completed.') 247 | 248 | 249 | def main(): 250 | # Parse & interpret command line flags. 251 | parser = argparse.ArgumentParser(description='Convert hdf5 tables from bamliquidator format to CSV counts tables ' 252 | 'for use in R and elsewhere. (Necessary as rhdf5 doesn\'t support our data structure.)', 253 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 254 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 255 | usage='%(prog)s [options]', 256 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 257 | 258 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 259 | help='Input path with .h5 files.', 260 | required=True) 261 | 262 | parser.add_argument("--overwrite", dest="overwrite", default=False, action='store_true', 263 | help='Regenerate and overwrite output .tsv files, even if they already exist.') 264 | 265 | parser.add_argument('--call-genes', dest="call_genes", default=False, action='store_true', 266 | help='Instead of a .tsv (with positions as keys), make a .annotated.tsv with nearby genes.') 267 | 268 | parser.add_argument('--normalized', dest="normalized", default=False, action='store_true', 269 | help='Store the normalized counts (counts/total reads) instead of the raw read counts.') 270 | 271 | parser.add_argument('--density', dest="density", default=False, action='store_true', 272 | help='Store the width-normalized density (counts/total reads/region size) instead of the raw read counts.') 273 | 274 | parser.add_argument('--sizescaled', dest="sizescaled", default=False, action='store_true', 275 | help='Store the size scaled counts (counts/feature size) instead of the raw read counts.') 276 | 277 | # Useful for EdgeR/DESeq2, etc. where every locus/position/gene-name must be unique. 278 | parser.add_argument('--flatten', dest="flatten", default=False, action='store_true', 279 | help='Aggregate identical locus IDs and sum their values. 
' 280 | 'Think carefully before you sum non-normalized values!') 281 | 282 | 283 | genome_choices = sorted(CONFIG['gffs'].keys()) 284 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, default=None, 285 | choices=genome_choices, help='Genome to use for annotation, one of: %s' % (', '.join(genome_choices)), required=False) 286 | 287 | 288 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 289 | 290 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 291 | help="Do not create a log file.") 292 | 293 | args = parser.parse_args() 294 | 295 | if args.call_genes and not args.genome: 296 | parser.error('--genome is required when requesting --call-genes') 297 | 298 | assert((args.density + args.normalized + args.sizescaled) <= 1) 299 | 300 | annotationBedTool = None 301 | if args.call_genes: 302 | genome_gff = CONFIG['gffs'][args.genome] 303 | assert(os.access(genome_gff, os.R_OK)) 304 | annotationBedTool = pybedtools.BedTool(genome_gff) 305 | 306 | # Output path is input path. This also checks that the path is writeable. 307 | output_path = _script_helpers.setup_output_path(args.input_path) 308 | 309 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 310 | 311 | 312 | input_files = get_input_files(args.input_path) 313 | 314 | parse_h5files(input_files, 315 | annotationBedTool=annotationBedTool, 316 | overwrite=args.overwrite, 317 | flatten=args.flatten, 318 | density=args.density, 319 | normalized=args.normalized, 320 | sizescaled=args.sizescaled) 321 | 322 | 323 | 324 | if __name__ == '__main__': 325 | main() 326 | -------------------------------------------------------------------------------- /refseq/.gitignore: -------------------------------------------------------------------------------- 1 | !*.gz -------------------------------------------------------------------------------- /refseq/UPDATED-06-11-2014: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/UPDATED-06-11-2014 -------------------------------------------------------------------------------- /refseq/hg18.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg18.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/hg19.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg19.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/hg38.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg38.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/mm10.ucsc.RefSeq.refGene.tsv.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/mm10.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/mm9.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/mm9.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.22 2 | gffutils>=0.8.3 3 | MACS2>=2.1.0.20140616 4 | MACS==1.4.3 5 | matplotlib>=1.4.3 6 | metaseq>=0.5.5.4 7 | numexpr>=2.4 8 | numpy>=1.9.2 9 | pandas>=0.16.0 10 | PyYaml>=3.11 11 | pybedtools>=0.6.9 12 | seaborn==0.5.1 13 | tables>=3.1.1 14 | -------------------------------------------------------------------------------- /statistics.py: -------------------------------------------------------------------------------- 1 | ## Module statistics.py 2 | ## 3 | ## Copyright (c) 2013 Steven D'Aprano . 4 | ## 5 | ## Licensed under the Apache License, Version 2.0 (the "License"); 6 | ## you may not use this file except in compliance with the License. 7 | ## You may obtain a copy of the License at 8 | ## 9 | ## http://www.apache.org/licenses/LICENSE-2.0 10 | ## 11 | ## Unless required by applicable law or agreed to in writing, software 12 | ## distributed under the License is distributed on an "AS IS" BASIS, 13 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ## See the License for the specific language governing permissions and 15 | ## limitations under the License. 16 | 17 | 18 | """ 19 | Basic statistics module. 20 | 21 | This module provides functions for calculating statistics of data, including 22 | averages, variance, and standard deviation. 23 | 24 | Calculating averages 25 | -------------------- 26 | 27 | ================== ============================================= 28 | Function Description 29 | ================== ============================================= 30 | mean Arithmetic mean (average) of data. 31 | median Median (middle value) of data. 32 | median_low Low median of data. 33 | median_high High median of data. 34 | median_grouped Median, or 50th percentile, of grouped data. 35 | mode Mode (most common value) of data. 36 | ================== ============================================= 37 | 38 | Calculate the arithmetic mean ("the average") of data: 39 | 40 | >>> mean([-1.0, 2.5, 3.25, 5.75]) 41 | 2.625 42 | 43 | 44 | Calculate the standard median of discrete data: 45 | 46 | >>> median([2, 3, 4, 5]) 47 | 3.5 48 | 49 | 50 | Calculate the median, or 50th percentile, of data grouped into class intervals 51 | centred on the data values provided. E.g. if your data points are rounded to 52 | the nearest whole number: 53 | 54 | >>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 55 | 2.8333333333... 56 | 57 | This should be interpreted in this way: you have two data points in the class 58 | interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 59 | the class interval 3.5-4.5. The median of these data points is 2.8333... 
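
(Equivalently, using the interpolation formula from ``median_grouped`` below,
L + interval*(n/2 - cf)/f: here the median interval is 2.5-3.5, so L = 2.5,
n = 6, cf = 2 and f = 3, giving 2.5 + (3 - 2)/3 = 2.8333...)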
60 | 61 | 62 | Calculating variability or spread 63 | --------------------------------- 64 | 65 | ================== ============================================= 66 | Function Description 67 | ================== ============================================= 68 | pvariance Population variance of data. 69 | variance Sample variance of data. 70 | pstdev Population standard deviation of data. 71 | stdev Sample standard deviation of data. 72 | ================== ============================================= 73 | 74 | Calculate the standard deviation of sample data: 75 | 76 | >>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 77 | 4.38961843444... 78 | 79 | If you have previously calculated the mean, you can pass it as the optional 80 | second argument to the four "spread" functions to avoid recalculating it: 81 | 82 | >>> data = [1, 2, 2, 4, 4, 4, 5, 6] 83 | >>> mu = mean(data) 84 | >>> pvariance(data, mu) 85 | 2.5 86 | 87 | 88 | Exceptions 89 | ---------- 90 | 91 | A single exception is defined: StatisticsError is a subclass of ValueError. 92 | 93 | """ 94 | 95 | __all__ = [ 'StatisticsError', 96 | 'pstdev', 'pvariance', 'stdev', 'variance', 97 | 'median', 'median_low', 'median_high', 'median_grouped', 98 | 'mean', 'mode', 99 | ] 100 | 101 | 102 | import collections 103 | import math 104 | 105 | from fractions import Fraction 106 | from decimal import Decimal 107 | 108 | 109 | # === Exceptions === 110 | 111 | class StatisticsError(ValueError): 112 | pass 113 | 114 | 115 | # === Private utilities === 116 | 117 | def _sum(data, start=0): 118 | """_sum(data [, start]) -> value 119 | 120 | Return a high-precision sum of the given numeric data. If optional 121 | argument ``start`` is given, it is added to the total. If ``data`` is 122 | empty, ``start`` (defaulting to 0) is returned. 123 | 124 | 125 | Examples 126 | -------- 127 | 128 | >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 129 | 11.0 130 | 131 | Some sources of round-off error will be avoided: 132 | 133 | >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. 134 | 1000.0 135 | 136 | Fractions and Decimals are also supported: 137 | 138 | >>> from fractions import Fraction as F 139 | >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 140 | Fraction(63, 20) 141 | 142 | >>> from decimal import Decimal as D 143 | >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 144 | >>> _sum(data) 145 | Decimal('0.6963') 146 | 147 | Mixed types are currently treated as an error, except that int is 148 | allowed. 149 | """ 150 | # We fail as soon as we reach a value that is not an int or the type of 151 | # the first value which is not an int. E.g. _sum([int, int, float, int]) 152 | # is okay, but sum([int, int, float, Fraction]) is not. 153 | allowed_types = set([int, type(start)]) 154 | n, d = _exact_ratio(start) 155 | partials = {d: n} # map {denominator: sum of numerators} 156 | # Micro-optimizations. 157 | exact_ratio = _exact_ratio 158 | partials_get = partials.get 159 | # Add numerators for each denominator. 160 | for x in data: 161 | _check_type(type(x), allowed_types) 162 | n, d = exact_ratio(x) 163 | partials[d] = partials_get(d, 0) + n 164 | # Find the expected result type. If allowed_types has only one item, it 165 | # will be int; if it has two, use the one which isn't int. 
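    # For example, _sum([0.5, 0.25, 1]) accumulates partials == {1: 1, 2: 1, 4: 1}
    # (i.e. 1 + 1/2 + 1/4); the exact total, Fraction(7, 4), is only converted to
    # the result type (here float: 1.75) at the very end.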
166 | assert len(allowed_types) in (1, 2) 167 | if len(allowed_types) == 1: 168 | assert allowed_types.pop() is int 169 | T = int 170 | else: 171 | T = (allowed_types - set([int])).pop() 172 | if None in partials: 173 | assert issubclass(T, (float, Decimal)) 174 | assert not math.isfinite(partials[None]) 175 | return T(partials[None]) 176 | total = Fraction() 177 | for d, n in sorted(partials.items()): 178 | total += Fraction(n, d) 179 | if issubclass(T, int): 180 | assert total.denominator == 1 181 | return T(total.numerator) 182 | if issubclass(T, Decimal): 183 | return T(total.numerator)/total.denominator 184 | return T(total) 185 | 186 | 187 | def _check_type(T, allowed): 188 | if T not in allowed: 189 | if len(allowed) == 1: 190 | allowed.add(T) 191 | else: 192 | types = ', '.join([t.__name__ for t in allowed] + [T.__name__]) 193 | raise TypeError("unsupported mixed types: %s" % types) 194 | 195 | 196 | def _exact_ratio(x): 197 | """Convert Real number x exactly to (numerator, denominator) pair. 198 | 199 | >>> _exact_ratio(0.25) 200 | (1, 4) 201 | 202 | x is expected to be an int, Fraction, Decimal or float. 203 | """ 204 | try: 205 | try: 206 | # int, Fraction 207 | return (x.numerator, x.denominator) 208 | except AttributeError: 209 | # float 210 | try: 211 | return x.as_integer_ratio() 212 | except AttributeError: 213 | # Decimal 214 | try: 215 | return _decimal_to_ratio(x) 216 | except AttributeError: 217 | msg = "can't convert type '{}' to numerator/denominator" 218 | raise TypeError(msg.format(type(x).__name__)) 219 | except (OverflowError, ValueError): 220 | # INF or NAN 221 | if __debug__: 222 | # Decimal signalling NANs cannot be converted to float :-( 223 | if isinstance(x, Decimal): 224 | assert not x.is_finite() 225 | else: 226 | assert not math.isfinite(x) 227 | return (x, None) 228 | 229 | 230 | # FIXME This is faster than Fraction.from_decimal, but still too slow. 231 | def _decimal_to_ratio(d): 232 | """Convert Decimal d to exact integer ratio (numerator, denominator). 233 | 234 | >>> from decimal import Decimal 235 | >>> _decimal_to_ratio(Decimal("2.6")) 236 | (26, 10) 237 | 238 | """ 239 | sign, digits, exp = d.as_tuple() 240 | if exp in ('F', 'n', 'N'): # INF, NAN, sNAN 241 | assert not d.is_finite() 242 | raise ValueError 243 | num = 0 244 | for digit in digits: 245 | num = num*10 + digit 246 | if exp < 0: 247 | den = 10**-exp 248 | else: 249 | num *= 10**exp 250 | den = 1 251 | if sign: 252 | num = -num 253 | return (num, den) 254 | 255 | 256 | def _counts(data): 257 | # Generate a table of sorted (value, frequency) pairs. 258 | table = collections.Counter(iter(data)).most_common() 259 | if not table: 260 | return table 261 | # Extract the values with the highest frequency. 262 | maxfreq = table[0][1] 263 | for i in range(1, len(table)): 264 | if table[i][1] != maxfreq: 265 | table = table[:i] 266 | break 267 | return table 268 | 269 | 270 | # === Measures of central tendency (averages) === 271 | 272 | def mean(data): 273 | """Return the sample arithmetic mean of data. 274 | 275 | >>> mean([1, 2, 3, 4, 4]) 276 | 2.8 277 | 278 | >>> from fractions import Fraction as F 279 | >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)]) 280 | Fraction(13, 21) 281 | 282 | >>> from decimal import Decimal as D 283 | >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")]) 284 | Decimal('0.5625') 285 | 286 | If ``data`` is empty, StatisticsError will be raised. 
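
    ``data`` may also be a one-shot iterator; it is materialised into a list
    before use:

    >>> mean(iter([1, 2, 3, 4, 4]))
    2.8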
287 | """ 288 | if iter(data) is data: 289 | data = list(data) 290 | n = len(data) 291 | if n < 1: 292 | raise StatisticsError('mean requires at least one data point') 293 | return _sum(data)/n 294 | 295 | 296 | # FIXME: investigate ways to calculate medians without sorting? Quickselect? 297 | def median(data): 298 | """Return the median (middle value) of numeric data. 299 | 300 | When the number of data points is odd, return the middle data point. 301 | When the number of data points is even, the median is interpolated by 302 | taking the average of the two middle values: 303 | 304 | >>> median([1, 3, 5]) 305 | 3 306 | >>> median([1, 3, 5, 7]) 307 | 4.0 308 | 309 | """ 310 | data = sorted(data) 311 | n = len(data) 312 | if n == 0: 313 | raise StatisticsError("no median for empty data") 314 | if n%2 == 1: 315 | return data[n//2] 316 | else: 317 | i = n//2 318 | return (data[i - 1] + data[i])/2 319 | 320 | 321 | def median_low(data): 322 | """Return the low median of numeric data. 323 | 324 | When the number of data points is odd, the middle value is returned. 325 | When it is even, the smaller of the two middle values is returned. 326 | 327 | >>> median_low([1, 3, 5]) 328 | 3 329 | >>> median_low([1, 3, 5, 7]) 330 | 3 331 | 332 | """ 333 | data = sorted(data) 334 | n = len(data) 335 | if n == 0: 336 | raise StatisticsError("no median for empty data") 337 | if n%2 == 1: 338 | return data[n//2] 339 | else: 340 | return data[n//2 - 1] 341 | 342 | 343 | def median_high(data): 344 | """Return the high median of data. 345 | 346 | When the number of data points is odd, the middle value is returned. 347 | When it is even, the larger of the two middle values is returned. 348 | 349 | >>> median_high([1, 3, 5]) 350 | 3 351 | >>> median_high([1, 3, 5, 7]) 352 | 5 353 | 354 | """ 355 | data = sorted(data) 356 | n = len(data) 357 | if n == 0: 358 | raise StatisticsError("no median for empty data") 359 | return data[n//2] 360 | 361 | 362 | def median_grouped(data, interval=1): 363 | """"Return the 50th percentile (median) of grouped continuous data. 364 | 365 | >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]) 366 | 3.7 367 | >>> median_grouped([52, 52, 53, 54]) 368 | 52.5 369 | 370 | This calculates the median as the 50th percentile, and should be 371 | used when your data is continuous and grouped. In the above example, 372 | the values 1, 2, 3, etc. actually represent the midpoint of classes 373 | 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in 374 | class 3.5-4.5, and interpolation is used to estimate it. 375 | 376 | Optional argument ``interval`` represents the class interval, and 377 | defaults to 1. Changing the class interval naturally will change the 378 | interpolated 50th percentile value: 379 | 380 | >>> median_grouped([1, 3, 3, 5, 7], interval=1) 381 | 3.25 382 | >>> median_grouped([1, 3, 3, 5, 7], interval=2) 383 | 3.5 384 | 385 | This function does not check whether the data points are at least 386 | ``interval`` apart. 387 | """ 388 | data = sorted(data) 389 | n = len(data) 390 | if n == 0: 391 | raise StatisticsError("no median for empty data") 392 | elif n == 1: 393 | return data[0] 394 | # Find the value at the midpoint. Remember this corresponds to the 395 | # centre of the class interval. 396 | x = data[n//2] 397 | for obj in (x, interval): 398 | if isinstance(obj, (str, bytes)): 399 | raise TypeError('expected number but got %r' % obj) 400 | try: 401 | L = x - interval/2 # The lower limit of the median interval. 402 | except TypeError: 403 | # Mixed type. 
For now we just coerce to float. 404 | L = float(x) - float(interval)/2 405 | cf = data.index(x) # Number of values below the median interval. 406 | # FIXME The following line could be more efficient for big lists. 407 | f = data.count(x) # Number of data points in the median interval. 408 | return L + interval*(n/2 - cf)/f 409 | 410 | 411 | def mode(data): 412 | """Return the most common data point from discrete or nominal data. 413 | 414 | ``mode`` assumes discrete data, and returns a single value. This is the 415 | standard treatment of the mode as commonly taught in schools: 416 | 417 | >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) 418 | 3 419 | 420 | This also works with nominal (non-numeric) data: 421 | 422 | >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) 423 | 'red' 424 | 425 | If there is not exactly one most common value, ``mode`` will raise 426 | StatisticsError. 427 | """ 428 | # Generate a table of sorted (value, frequency) pairs. 429 | table = _counts(data) 430 | if len(table) == 1: 431 | return table[0][0] 432 | elif table: 433 | raise StatisticsError( 434 | 'no unique mode; found %d equally common values' % len(table) 435 | ) 436 | else: 437 | raise StatisticsError('no mode for empty data') 438 | 439 | 440 | # === Measures of spread === 441 | 442 | # See http://mathworld.wolfram.com/Variance.html 443 | # http://mathworld.wolfram.com/SampleVariance.html 444 | # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 445 | # 446 | # Under no circumstances use the so-called "computational formula for 447 | # variance", as that is only suitable for hand calculations with a small 448 | # amount of low-precision data. It has terrible numeric properties. 449 | # 450 | # See a comparison of three computational methods here: 451 | # http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ 452 | 453 | def _ss(data, c=None): 454 | """Return sum of square deviations of sequence data. 455 | 456 | If ``c`` is None, the mean is calculated in one pass, and the deviations 457 | from the mean are calculated in a second pass. Otherwise, deviations are 458 | calculated from ``c`` as given. Use the second case with care, as it can 459 | lead to garbage results. 460 | """ 461 | if c is None: 462 | c = mean(data) 463 | ss = _sum((x-c)**2 for x in data) 464 | # The following sum should mathematically equal zero, but due to rounding 465 | # error may not. 466 | ss -= _sum((x-c) for x in data)**2/len(data) 467 | assert not ss < 0, 'negative sum of square deviations: %f' % ss 468 | return ss 469 | 470 | 471 | def variance(data, xbar=None): 472 | """Return the sample variance of data. 473 | 474 | data should be an iterable of Real-valued numbers, with at least two 475 | values. The optional argument xbar, if given, should be the mean of 476 | the data. If it is missing or None, the mean is automatically calculated. 477 | 478 | Use this function when your data is a sample from a population. To 479 | calculate the variance from the entire population, see ``pvariance``. 480 | 481 | Examples: 482 | 483 | >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] 484 | >>> variance(data) 485 | 1.3720238095238095 486 | 487 | If you have already calculated the mean of your data, you can pass it as 488 | the optional second argument ``xbar`` to avoid recalculating it: 489 | 490 | >>> m = mean(data) 491 | >>> variance(data, m) 492 | 1.3720238095238095 493 | 494 | This function does not check that ``xbar`` is actually the mean of 495 | ``data``. 
Giving arbitrary values for ``xbar`` may lead to invalid or 496 | impossible results. 497 | 498 | Decimals and Fractions are supported: 499 | 500 | >>> from decimal import Decimal as D 501 | >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 502 | Decimal('31.01875') 503 | 504 | >>> from fractions import Fraction as F 505 | >>> variance([F(1, 6), F(1, 2), F(5, 3)]) 506 | Fraction(67, 108) 507 | 508 | """ 509 | if iter(data) is data: 510 | data = list(data) 511 | n = len(data) 512 | if n < 2: 513 | raise StatisticsError('variance requires at least two data points') 514 | ss = _ss(data, xbar) 515 | return ss/(n-1) 516 | 517 | 518 | def pvariance(data, mu=None): 519 | """Return the population variance of ``data``. 520 | 521 | data should be an iterable of Real-valued numbers, with at least one 522 | value. The optional argument mu, if given, should be the mean of 523 | the data. If it is missing or None, the mean is automatically calculated. 524 | 525 | Use this function to calculate the variance from the entire population. 526 | To estimate the variance from a sample, the ``variance`` function is 527 | usually a better choice. 528 | 529 | Examples: 530 | 531 | >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25] 532 | >>> pvariance(data) 533 | 1.25 534 | 535 | If you have already calculated the mean of the data, you can pass it as 536 | the optional second argument to avoid recalculating it: 537 | 538 | >>> mu = mean(data) 539 | >>> pvariance(data, mu) 540 | 1.25 541 | 542 | This function does not check that ``mu`` is actually the mean of ``data``. 543 | Giving arbitrary values for ``mu`` may lead to invalid or impossible 544 | results. 545 | 546 | Decimals and Fractions are supported: 547 | 548 | >>> from decimal import Decimal as D 549 | >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 550 | Decimal('24.815') 551 | 552 | >>> from fractions import Fraction as F 553 | >>> pvariance([F(1, 4), F(5, 4), F(1, 2)]) 554 | Fraction(13, 72) 555 | 556 | """ 557 | if iter(data) is data: 558 | data = list(data) 559 | n = len(data) 560 | if n < 1: 561 | raise StatisticsError('pvariance requires at least one data point') 562 | ss = _ss(data, mu) 563 | return ss/n 564 | 565 | 566 | def stdev(data, xbar=None): 567 | """Return the square root of the sample variance. 568 | 569 | See ``variance`` for arguments and other details. 570 | 571 | >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 572 | 1.0810874155219827 573 | 574 | """ 575 | var = variance(data, xbar) 576 | try: 577 | return var.sqrt() 578 | except AttributeError: 579 | return math.sqrt(var) 580 | 581 | 582 | def pstdev(data, mu=None): 583 | """Return the square root of the population variance. 584 | 585 | See ``pvariance`` for arguments and other details. 586 | 587 | >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 588 | 0.986893273527251 589 | 590 | """ 591 | var = pvariance(data, mu) 592 | try: 593 | return var.sqrt() 594 | except AttributeError: 595 | return math.sqrt(var) 596 | --------------------------------------------------------------------------------