├── .config.yaml ├── 1-map-to-genome.py ├── 2-sanitize-bam.py ├── 3-call-peaks.py ├── LICENSE ├── README.md ├── _logshim.py ├── _script_helpers.py ├── bamliquidatorbatch ├── __init__.py ├── bamliquidator_batch.py ├── flattener.py ├── normalize_plot_and_summarize.py └── test.py ├── blacklists ├── ce10-blacklist.bed ├── dm3-blacklist.bed ├── mm10-blacklist-via-mm9.bed ├── mm9-blacklist.bed ├── source-info │ ├── URLs.txt │ └── hg19-blacklist-README.pdf └── wgEncodeDacMapabilityConsensusExcludable.bed ├── demo-data ├── sample-get-SuperEnhancers-output │ ├── 0-enhancer-stats.txt │ ├── 0-se-population.R.bed │ ├── 0-stretch-population.R.bed │ ├── 0-stretch-se-population.R.bed │ ├── 0-te-population.R.bed │ ├── se-cutoff.R.png │ ├── se-size-histogram.R.png │ ├── se-te-stretch-vs-nonstretch-count-pie.R.png │ ├── se-vs-te-count-pie.R.png │ ├── se-vs-te-signal-pie.R.png │ ├── stretch-vs-nonstretch-count-pie.R.png │ └── te-size-histogram.R.png └── sample-mm10-CD4.bed ├── dist ├── README.md ├── bedtools-2.22.0 ├── bedtools-2.23.0 ├── samtools-0.1.19 ├── samtools-1.1 └── samtools-1.2 ├── get-SuperEnhancers.R ├── helper-scripts ├── 0-merge-fastq.py ├── 3-merge-bam-rmdup.py ├── README.md └── hdf5_to_counts_table.py ├── refseq ├── .gitignore ├── UPDATED-06-11-2014 ├── hg18.ucsc.RefSeq.refGene.tsv.gz ├── hg19.ucsc.RefSeq.refGene.tsv.gz ├── hg38.ucsc.RefSeq.refGene.tsv.gz ├── mm10.ucsc.RefSeq.refGene.tsv.gz └── mm9.ucsc.RefSeq.refGene.tsv.gz ├── requirements.txt ├── riesling.py └── statistics.py /.config.yaml: -------------------------------------------------------------------------------- 1 | general: 2 | custom_tmp_dir: /tmp/ 3 | 4 | # Good source for standard builds & paths is the Illumina iGenomes collection: 5 | # http://support.illumina.com/sequencing/sequencing_software/igenome.html 6 | bowtie2_genomes: 7 | mm9: Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome 8 | mm10: Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome 9 | hg18: Homo_sapiens/UCSC/hg18/Sequence/Bowtie2Index/genome 10 | hg19: Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/genome 11 | 12 | # Binaries -- some distributed with this package 13 | # We include these to try to standardize on sane versions of software. 14 | binaries: 15 | samtools: dist/samtools-1.2 16 | samtools_legacy: dist/samtools-0.1.19 17 | bedtools: dist/bedtools-2.23.0 18 | findPeaks: dist/findPeaks-4.7 # via http://homer.salk.edu/homer/ 19 | 20 | # Blacklisting paths 21 | # From the ENCODE blacklist of known overcalled/false-positive regions.
22 | blacklists: 23 | mm9: blacklists/mm9-blacklist.bed 24 | mm10: blacklists/mm10-blacklist-via-mm9.bed 25 | hg19: blacklists/wgEncodeDacMapabilityConsensusExcludable.bed 26 | ce10: blacklists/ce10-blacklist.bed 27 | dm3: blacklists/dm3-blacklist.bed 28 | 29 | # RefSeq maps, for the RIESLING code, via UCSC 30 | refseq: 31 | mm9: refseq/mm9.ucsc.RefSeq.refGene.tsv.gz 32 | mm10: refseq/mm10.ucsc.RefSeq.refGene.tsv.gz 33 | hg18: refseq/hg18.ucsc.RefSeq.refGene.tsv.gz 34 | hg19: refseq/hg19.ucsc.RefSeq.refGene.tsv.gz 35 | hg38: refseq/hg38.ucsc.RefSeq.refGene.tsv.gz 36 | 37 | # Gene info, for final annotation steps of RIESLING 38 | gffs: 39 | mm9: /dev/null 40 | mm10: genomes/mm10genes.transcript.gtf 41 | hg18: /dev/null 42 | hg19: /dev/null 43 | -------------------------------------------------------------------------------- /1-map-to-genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A simple wrapper to map paired-end .fastq/.fastq.gz files to a genome with bowtie2. 5 | # 6 | # 7 | # Copyright (c) 2014-2016 Nick Semenkovich <semenko@alum.mit.edu>. 8 | # https://nick.semenkovich.com/ 9 | # 10 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 11 | # http://gordonlab.wustl.edu/ 12 | # 13 | # This software is released under the MIT License: 14 | # http://opensource.org/licenses/MIT 15 | # 16 | # Source: https://github.com/GordonLab/riesling-pipeline 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | __author__ = 'Nick Semenkovich <semenko@alum.mit.edu>' 21 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 22 | __license__ = 'MIT' 23 | __version__ = '1.0.3' 24 | 25 | from collections import OrderedDict 26 | import _logshim 27 | import _script_helpers 28 | import argparse 29 | import glob 30 | import os 31 | import pprint 32 | import re 33 | 34 | 35 | def find_paired_ends(input_path, verbose=False): 36 | """ 37 | Given an input path, find paired-end .fastq/.fastq.gz files and group them by sample prefix. 38 | 39 | :param input_path: directory containing *.PE1.*/*.PE2.* .fastq or .fastq.gz files 40 | :return: dict mapping each sample prefix to its paired-end files 41 | """ 42 | find_pe_logger = _logshim.getLogger('find_paired_ends') 43 | 44 | # TODO: Modularize all this! 45 | 46 | if not os.path.isdir(input_path): 47 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 48 | 49 | all_files = glob.glob(input_path + "/*.PE1.fastq.gz") # Must have .PE1./.PE2. in the filename 50 | all_files.extend(glob.glob(input_path + "/*.PE2.fastq.gz")) 51 | all_files.extend(glob.glob(input_path + "/*.PE1.fastq")) 52 | all_files.extend(glob.glob(input_path + "/*.PE2.fastq")) 53 | 54 | if len(all_files) == 0: 55 | raise ValueError("No .PE1/.PE2 .fastq(.gz) files found in the input directory!") 56 | 57 | 58 | # Given paired ends, we must always have an even number of input files. 59 | if len(all_files) % 2 != 0: 60 | raise ValueError("Input directory contains an odd number of files.") 61 | 62 | re_pattern = re.compile(r'^(.*)\.PE(\d)(\.fastq|\.fastq\.gz)$') 63 | 64 | file_dict = OrderedDict() 65 | 66 | prefixes_seen = [] 67 | pe_seen = [] 68 | for file in sorted(all_files): 69 | if not os.access(file, os.R_OK): 70 | raise OSError("Cannot read file: %s" % (file)) 71 | 72 | filename_only = file.rsplit('/', 1)[-1] 73 | result = re.match(re_pattern, filename_only) 74 | 75 | file_dict[file] = {'prefix': str(result.group(1)), 76 | 'PE': int(result.group(2))} 77 | 78 | prefixes_seen.append(file_dict[file]['prefix']) 79 | pe_seen.append(file_dict[file]['PE']) 80 | 81 | if len(set(pe_seen)) != 2: 82 | raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!"
% (len(set(pe_seen)))) 83 | 84 | if pe_seen.count(1) != pe_seen.count(2): 85 | raise ValueError("Uneven pairing of paired ends (are you missing a file)? PE1 count: %d, PE2 count: %d" % 86 | (pe_seen.count(1), pe_seen.count(2))) 87 | 88 | find_pe_logger.info("Files seen: %d" % (len(all_files))) 89 | find_pe_logger.info("Samples seen: %d" % (len(set(prefixes_seen)))) 90 | 91 | merge_strategy = {} 92 | 93 | find_pe_logger.info("Sample IDs:") 94 | for prefix in sorted(set(prefixes_seen)): 95 | find_pe_logger.info(" %s" % (prefix)) 96 | 97 | for file in file_dict.iterkeys(): 98 | merge_strategy.setdefault(file_dict[file]['prefix'], []).append(file) 99 | 100 | if verbose: 101 | find_pe_logger.debug("Merge strategy is:") 102 | find_pe_logger.debug(pprint.pformat(merge_strategy)) 103 | 104 | return merge_strategy 105 | 106 | def run_bowtie2(paired_end_mapping, genome, output_path, disable_parallel=False): 107 | bowtie2_logger = _logshim.getLogger('run_bowtie2') 108 | 109 | # Import the config file to get genome locations 110 | config = _script_helpers.get_config() 111 | 112 | if disable_parallel: 113 | shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger) 114 | else: 115 | shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger, delay_seconds=60) 116 | 117 | for output_prefix, paired_ends in paired_end_mapping.iteritems(): 118 | bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' % (output_prefix)) 119 | for filename in paired_ends: 120 | assert(" " not in filename) 121 | assert(";" not in filename) # Vague sanity testing for input filenames 122 | bowtie2_logger.debug(' Input: %s' % (filename)) 123 | 124 | # bowtie2 options: 125 | # --end-to-end: this is the default, but let's explicitly specify it 126 | # --sensitive: again, the default (consider switching to --fast?) 127 | # --no-unal: Suppress unaligned reads from the output .sam 128 | # --no-discordant: These are paired-end reads. We expect them to be non-discordant. 129 | # --mm: mmap MAP_SHARED (other processes can use our genome, cool!) 130 | # --met-stderr: Write metrics to stderr 131 | # --time: output the time things took 132 | # -x: target genome 133 | command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time -x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s" 134 | 135 | shell_job_runner.run(command % (config['bowtie2_genomes'][genome], 136 | paired_ends[0], 137 | paired_ends[1], 138 | output_path + "/" + output_prefix + ".bt2.log", 139 | output_path + "/" + output_prefix + ".bt2.bam")) 140 | 141 | shell_job_runner.finish() 142 | 143 | 144 | def main(): 145 | # Parse & interpret command line flags. 146 | parser = argparse.ArgumentParser(description='Given paired-end .fastq/.fastq.gz files, map to a genome.', 147 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 148 | "Washington University in St. 
Louis: https://gordonlab.wustl.edu.", 149 | usage='%(prog)s [options]', 150 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 151 | 152 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 153 | help='Input path.', required=True) 154 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 155 | help='Output path.', required=True) 156 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 157 | choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for bowtie2', required=True) 158 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 159 | help='Disable parallel job spawning.') 160 | 161 | 162 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 163 | 164 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 165 | help="Do not create a log file.") 166 | 167 | args = parser.parse_args() 168 | 169 | output_path = _script_helpers.setup_output_path(args.output_path) 170 | 171 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 172 | 173 | 174 | paired_end_mapping = find_paired_ends(args.input_path, verbose=args.verbose) 175 | 176 | run_bowtie2(paired_end_mapping, args.genome, output_path) 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /2-sanitize-bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Do very simple cleaning of .bam files for ATAC-seq data pre-processing. 5 | # 6 | # This script: 7 | # * Drops low mapping quality reads (<10) 8 | # * Removes chrM mitochondrial reads 9 | # * Removes PCR duplicates (using samtools) 10 | # * Removes blacklisted regions (from ENCODE or custom blacklists) 11 | # 12 | # 13 | # Copyright (c) 2014-2016 Nick Semenkovich . 14 | # https://nick.semenkovich.com/ 15 | # 16 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 17 | # https://gordonlab.wustl.edu/ 18 | # 19 | # This software is released under the MIT License: 20 | # http://opensource.org/licenses/MIT 21 | # 22 | # Source: https://github.com/GordonLab/riesling-pipeline 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | __author__ = 'Nick Semenkovich ' 27 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 28 | __license__ = 'MIT' 29 | __version__ = '1.0.3' 30 | 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import glob 35 | import os 36 | import tempfile 37 | 38 | # A parameter needed by samtools to sort in-memory. 
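# NOTE: samtools sort treats -m as a *per-thread* memory limit, so the final
# sort below (-@ 8 -m 50G) can peak near 8 x 50G = 400G of RAM. If your machine
# has less, shrink this value accordingly, e.g. (illustrative setting only):
#   MAX_MEM = "6G"  # ~48G peak across 8 sort threads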
39 | MAX_MEM = "50G" 40 | 41 | # Load our config files 42 | CONFIG = _script_helpers.get_config() 43 | 44 | 45 | def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False): 46 | primary_logger = _logshim.getLogger('first_pass') 47 | 48 | output_suffix = ".tmp" 49 | 50 | if disable_parallel: # Doesn't change parallelism in last samtools sort 51 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 52 | else: 53 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60) 54 | 55 | # We do a few things here: 56 | # - View only mapping quality >= 10 57 | # - Remove chrM 58 | # - Sort by name for fixmate 59 | # We don't parallelize here (-@ #) because fixmate blocks & parallel seems to only help for compressed. 60 | # - Fixmate (needed for rmdup) 61 | # - Re-sort by position 62 | tempfiles = [] 63 | for filename in input_files: 64 | primary_logger.debug('Working on: %s' % (filename)) 65 | command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \ 66 | '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s' 67 | 68 | # A super evil user could modify TMPDIR and make this generate evil strings. That's evil. 69 | temporary_file = tempfile.mkstemp('.tmp.bam') 70 | tempfiles.append(temporary_file) 71 | 72 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], 73 | filename, 74 | CONFIG['binaries']['samtools'], 75 | CONFIG['binaries']['samtools'], 76 | MAX_MEM, 77 | temporary_file[1], 78 | CONFIG['binaries']['samtools'], 79 | CONFIG['binaries']['samtools'], 80 | MAX_MEM, 81 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix)) 82 | 83 | shell_job_runner.finish() 84 | 85 | # Clean up our temporary files. 86 | primary_logger.info('Removing temporary files ...') 87 | for fd, fname in tempfiles: 88 | os.close(fd) 89 | os.unlink(fname) 90 | 91 | primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.') 92 | 93 | 94 | def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False): 95 | primary_logger = _logshim.getLogger('rmdup_blacklist') 96 | 97 | if disable_parallel: 98 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 99 | else: 100 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20) 101 | 102 | for filename in input_files: 103 | primary_logger.debug('Working on: %s' % (filename)) 104 | # This is extremely fast and has minimal memory usage. Yay! 105 | # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions? 106 | command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s" 107 | 108 | shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'], # TODO: Update this when samtools is fixed.
109 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam', # TODO: CLEAN THIS 110 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log', 111 | CONFIG['binaries']['bedtools'], 112 | os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome], # TODO: CLEAN THIS 113 | output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam')) 114 | 115 | shell_job_runner.finish() 116 | 117 | primary_logger.info('Removing temporary files from stage 1 ...') 118 | for filename in input_files: 119 | os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam') 120 | 121 | primary_logger.info('Completed rmdup and blacklist') 122 | 123 | 124 | def main(): 125 | # Parse & interpret command line flags. 126 | parser = argparse.ArgumentParser(description='Given input .bam files, fix mate pairs, remove duplicates, blacklist bad ' 127 | 'regions, and sort the output.', 128 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 129 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 130 | usage='%(prog)s [options]', 131 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 132 | 133 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 134 | help='Input path.', required=True) 135 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 136 | help='Output path.', required=True) 137 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 138 | choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for blacklisting.', required=True) 139 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 140 | help='Disable parallel job spawning.') 141 | 142 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 143 | 144 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 145 | help="Do not create a log file.") 146 | 147 | args = parser.parse_args() 148 | 149 | output_path = _script_helpers.setup_output_path(args.output_path) 150 | 151 | log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 152 | 153 | 154 | # Samtools requires a temp directory for sorting /sometimes/. 155 | # This seems to only matter if it exceeds the in-ram limits set by the MAX_MEM parameter. 156 | # Sanity check the /tmp directory has a bit of space. 157 | temppath = tempfile.gettempdir() 158 | s = os.statvfs(temppath) 159 | if ((s.f_bavail * s.f_frsize) / (1024 * 1024)) < 10000: # ~10 G, not for any good reason though 160 | log_main.warn('Temp directory %s doesn\'t have a lot of free space!' % (temppath)) 161 | 162 | 163 | input_files = glob.glob(args.input_path + "/*.bam") # Take ALL of the .bams. 164 | 165 | large_filter_fixmate_and_sort(input_files, args.genome, output_path, disable_parallel=args.no_parallel) 166 | rmdup_and_blacklist(input_files, args.genome, output_path, disable_parallel=args.no_parallel) 167 | 168 | 169 | 170 | if __name__ == '__main__': 171 | main() 172 | -------------------------------------------------------------------------------- /3-call-peaks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Automate the execution of peak calling on .bam files.
5 | # Originally designed for ATAC-seq data, but will work with any directory of .bam files. 6 | # 7 | # By default, this runs both macs14 and macs2 for comparison. 8 | # 9 | # 10 | # Copyright (c) 2014-2016 Nick Semenkovich . 11 | # https://nick.semenkovich.com/ 12 | # 13 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 14 | # https://gordonlab.wustl.edu/ 15 | # 16 | # This software is released under the MIT License: 17 | # http://opensource.org/licenses/MIT 18 | # 19 | # Source: https://github.com/GordonLab/riesling-pipeline 20 | 21 | from __future__ import absolute_import, division, print_function, unicode_literals 22 | 23 | __author__ = 'Nick Semenkovich ' 24 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 25 | __license__ = 'MIT' 26 | __version__ = '1.0.3' 27 | 28 | import _logshim 29 | import _script_helpers 30 | import argparse 31 | import os 32 | 33 | # Load our config files 34 | CONFIG = _script_helpers.get_config() 35 | 36 | 37 | def generate_index(input_files, output_path, disable_parallel=False): 38 | """ 39 | Many peak pickers want indexed .bams. Let's build indexes! (yay!) 40 | 41 | :param input_files: 42 | :param output_path: 43 | :param disable_parallel: 44 | :return: 45 | """ 46 | primary_logger = _logshim.getLogger('index') 47 | 48 | if disable_parallel: 49 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 50 | else: 51 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=10) 52 | 53 | for filename in input_files: 54 | primary_logger.debug('Working on: %s' % (filename)) 55 | command = "%s index %s" 56 | 57 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], filename)) 58 | 59 | shell_job_runner.finish() 60 | 61 | 62 | 63 | def run_macs14(input_files, output_path, genome, disable_parallel=False): 64 | macs14_log = _logshim.getLogger('run_macs14') 65 | 66 | macs14_log.info('Spawning MACS14 jobs...') 67 | 68 | if disable_parallel: 69 | shell_job_runner = _script_helpers.ShellJobRunner(macs14_log) 70 | else: 71 | shell_job_runner = _script_helpers.ShellJobRunner(macs14_log, delay_seconds=20) 72 | 73 | for filename in input_files: 74 | macs14_log.debug('Working on: %s' % (filename)) 75 | 76 | # macs14 is old, but we try it anyway, since it's sometimes useful. 77 | # -t: input 78 | # -n: output name 79 | # -f: format 80 | # -g: genome 81 | # -p: pvalue for peak cutoff 82 | # --wig: save .wig outputs 83 | # --single-profile: make one single wiggle 84 | # --space=50: wiggle resolution (default: 10) 85 | # 86 | # Note: This CD hack is because MACS1.4 can't specify an output path :( 87 | command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s" 88 | 89 | filename_without_extension = os.path.splitext(filename)[0] + '.macs14' 90 | 91 | shell_job_runner.run(command % (output_path, # for cd hack 92 | 'macs14', # This must be pre-installed by the user. It's a big, complex package. 93 | os.getcwd() + '/' + filename, # input file # TODO: Fix this path hack. 
MACS14 cannot specify an output path :/ 94 | os.path.basename(filename_without_extension), 95 | genome, # for genome size 96 | os.path.basename(filename_without_extension) + '.macs14.log')) 97 | 98 | shell_job_runner.finish() 99 | 100 | macs14_log.info('MACS14 peak calling complete.') 101 | 102 | 103 | def run_macs2(input_files, output_path, genome, disable_parallel=False): 104 | macs2_log = _logshim.getLogger('run_macs2') 105 | 106 | macs2_log.info('Spawning MACS2 jobs...') 107 | 108 | if disable_parallel: 109 | shell_job_runner = _script_helpers.ShellJobRunner(macs2_log) 110 | else: 111 | shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1) 112 | 113 | for filename in input_files: 114 | macs2_log.debug('Working on: %s' % (filename)) 115 | 116 | # --bdg: generate .bed graph output 117 | # --nomodel: We'll be shifting manually! 118 | # --extsize 200: See long discussion at: @@@ 119 | # --shift -100: As per above. 120 | # --slocal: Look at a local window of 20kb to build peak models 121 | # --keep-dup: We already removed duplicates with samtools. 122 | # TODO: Consider allowing tweaks to these settings with flags? 123 | command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s" 124 | 125 | filename_without_extension = os.path.splitext(filename)[0] + '.macs2' 126 | 127 | shell_job_runner.run(command % ('macs2', # This must be pre-installed by the user. It's a big, complex package. 128 | filename, # input file 129 | os.path.basename(filename_without_extension), 130 | output_path, 131 | genome, # for genome size, unclear if this actually matters with nolambda/nomodel 132 | output_path + "/" + os.path.basename(filename_without_extension) + '.log')) 133 | 134 | shell_job_runner.finish() 135 | 136 | macs2_log.info('MACS2 peak calling complete.') 137 | 138 | 139 | def main(): 140 | # Parse & interpret command line flags. 141 | parser = argparse.ArgumentParser(description='Run a number of standard peak calling algorithms for ATAC-seq data. ' 142 | 'Expects de-duplicated, sorted, merged, ChrM-removed data.', 143 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 144 | "Washington University in St. Louis: https://gordonlab.wustl.edu.", 145 | usage='%(prog)s [options]', 146 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 147 | 148 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 149 | help='Input path (or a specific .bam file).', required=True) 150 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 151 | help='Output path.', required=True) 152 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, 153 | choices=['hs', 'mm', 'ce', 'dm'], help='Genome size to pass to MACS.', required=True) # TODO: Consider using mm9/mm10, etc. for uniformity?
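# (For reference, MACS interprets these shorthands as effective genome sizes:
# hs ~2.7e9, mm ~1.87e9, ce ~9.0e7, dm ~1.2e8.)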
154 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 155 | help='Disable parallel job spawning.') 156 | 157 | parser.add_argument('--skip-bam-indexing', dest="skip_bam_indexing", action='store_true', 158 | help='Skip bam indexing (You must have generated indexes independently for peak callers to work!).', required=False) 159 | 160 | parser.add_argument('--skip-macs14', dest="skip_macs14", action='store_true', 161 | help='Skip MACS v1.4 peak calling.', required=False) 162 | parser.add_argument('--skip-macs2', dest="skip_macs2", action='store_true', 163 | help='Skip MACS v2 peak calling.', required=False) 164 | 165 | 166 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 167 | 168 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 169 | help="Do not create a log file.") 170 | 171 | args = parser.parse_args() 172 | 173 | output_path = _script_helpers.setup_output_path(args.output_path) 174 | 175 | log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 176 | 177 | input_files = _script_helpers.validate_input_files(args.input_path) 178 | 179 | 180 | # Generate BAM indexes 181 | if not args.skip_bam_indexing: 182 | generate_index(input_files, output_path, disable_parallel=args.no_parallel) 183 | else: 184 | log_main.warn("Skipping bam index .bai generation as requested.") 185 | log_main.warn("You must have generated these separately, otherwise peak callers will fail.") 186 | 187 | if not args.skip_macs14: 188 | # Start with old-school MACS 1.4 189 | run_macs14(input_files, output_path, args.genome, disable_parallel=args.no_parallel) 190 | 191 | if not args.skip_macs2: 192 | # Now new MACS 2 193 | # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000 194 | run_macs2(input_files, output_path, args.genome, disable_parallel=args.no_parallel) 195 | 196 | 197 | 198 | 199 | if __name__ == '__main__': 200 | main() 201 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2016 Nick Semenkovich | semenko@alum.mit.edu 4 | https://nick.semenkovich.com 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | this software and associated documentation files (the "Software"), to deal in 8 | the Software without restriction, including without limitation the rights to 9 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software is furnished to do so, 11 | subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## The RIESLING ATAC-seq Pipeline [![Build Status](https://travis-ci.org/GordonLab/riesling-pipeline.svg?branch=master)](https://travis-ci.org/GordonLab/riesling-pipeline) 2 | 3 | The RIESLING (Rapid Identification of EnhancerS LInked to Nearby Genes) ATAC-seq pipeline is designed to be an efficient set of standalone scripts for quickly analyzing ATAC-seq data. 4 | 5 | You may find it particularly useful for identifying and stratifying super-enhancers, though it can also be leveraged for differential accessibility analysis using DESeq2. 6 | 7 | Since this was originally developed in 2014-2015, a number of other packages have been developed, notably the [Kundaje lab pipeline](https://github.com/kundajelab/atac_dnase_pipelines) and the [Ren lab single-cell ATAC pipeline](https://github.com/r3fang/scATAC). The Kundaje lab pipeline in particular includes other features (e.g. IDR analysis) which you may find more useful if you aren't interested in super-enhancers / enhancer clusters. 8 | 9 | ================ 10 | 11 | ## Getting started 12 | 1. Clone this repo: `git clone https://github.com/GordonLab/riesling-pipeline.git` 13 | 2. `cd riesling-pipeline` 14 | 3. Install the Python dependencies: `pip install --user -U -r requirements.txt` 15 | 16 | ================ 17 | 18 | ### Simple Hacks: Call super-enhancers on a .bed 19 | 20 | If you already have a .bed of putative enhancers, you can rapidly derive the super-enhancer population and statistics using `get-SuperEnhancers.R`. It will *not* filter blacklisted regions, stitch large regions together, remove TSSes, etc. -- use the full pipeline (detailed below) for that. 21 | 22 | A quick example, using the [demo-data/sample-mm10-CD4.bed](demo-data/sample-mm10-CD4.bed) file, which contains signal intensity in the 7th column: 23 | 24 | ``` 25 | $ git clone https://github.com/GordonLab/riesling-pipeline/ 26 | ... 27 | $ cd riesling-pipeline/ 28 | $ Rscript get-SuperEnhancers.R demo-data/sample-mm10-CD4.bed demo-data/sample-get-SuperEnhancers-output/ 29 | 30 | [1] "Working on: demo-data/sample-mm10-CD4.bed" 31 | [1] "Output dir: demo-data/sample-get-SuperEnhancers-output/" 32 | [1] "Current directory is: /Users/semenko/git/riesling-pipeline" 33 | [1] "Setting output directory to: demo-data/sample-get-SuperEnhancers-output/" 34 | [1] "Inflection at entry: 24795" 35 | [1] "Corresponding cutoff score: 8105.366159055" 36 | 37 | $ cat demo-data/sample-get-SuperEnhancers-output/0-enhancer-stats.txt 38 | Statistics for: demo-data/sample-mm10-CD4.bed 39 | SE Signal %: 38 40 | TE Signal %: 62 41 | SE Count: 1329 42 | TE Count: 24794 43 | SE Count %: 5.09 44 | TE Count %: 94.91 45 | Mean SE Size: 35846.22 46 | Mean TE Size: 5104.87 47 | Median SE Size: 31833 48 | Median TE Size: 892.5 49 | ``` 50 | 51 | Graphical & .bed results are now in [demo-data/sample-get-SuperEnhancers-output/](demo-data/sample-get-SuperEnhancers-output/), and will include these figures and more: 52 | 53 | *Figures: Super-enhancer Cutoff Hockeystick; Super-enhancer Size Distribution; Super vs Traditional vs Stretch Enhancers.* 64 |
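The "Inflection at entry" cutoff above is the classic rank-ordered "hockey stick" cut: enhancers are sorted by signal, and the super-enhancer population sits past the curve's inflection. A minimal Python sketch of one common way to pick that point -- scale both axes to the unit square and cut where the slope first exceeds 1 (the exact inflection method inside `get-SuperEnhancers.R` may differ, and the .bed-parsing lines are illustrative only):

```python
import numpy as np

def hockey_stick_cutoff(signals):
    """Signal value where the rank-ordered, unit-scaled curve first climbs steeper than slope 1."""
    s = np.sort(np.asarray(signals, dtype=float))
    x = np.linspace(0.0, 1.0, len(s))        # scaled rank
    y = (s - s.min()) / (s.max() - s.min())  # scaled signal
    slopes = np.diff(y) / np.diff(x)         # piecewise slopes
    return s[int(np.argmax(slopes > 1.0))]   # first point steeper than 1

# Illustrative use with a 7th-column signal, as in the demo .bed:
# scores = [float(line.split('\t')[6]) for line in open('demo-data/sample-mm10-CD4.bed')]
# print(hockey_stick_cutoff(scores))
```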
65 | 66 | Again, this may not be appropriate on non-preprocessed (blacklisted, TSS-filtered, etc.) data. You more likely want to use the full pipeline, detailed below. 67 | 68 | 69 | 70 | 75 | 76 | ## Expected Inputs & Pre-processing Data 77 | 78 | The heart of RIESLING and related code expects: 79 | 80 | * A .bam file of your aligned ATAC-seq data (e.g. from bowtie2) 81 | * A .bed of peaks (from MACS, HOMER, or any other standard peak caller) 82 | 83 | There are lots of ways to pre-process data -- please use whatever approach you prefer. A basic set of standalone 84 | scripts to help pre-process paired-end sequencing data is provided, detailed below under 'Pre-processing Tools'. 85 | 86 | 96 | 97 | ## Pre-processing Tools 98 | 99 | There are lots of valid ways to pre-process data. These standalone scripts may be of use, but please pay careful attention 100 | to the peak calling settings you use -- as the defaults may not be applicable to your experimental approach. 101 | 102 | 103 | * ./1-map-to-genome.py: Run bowtie2 across a folder of paired-end sequence data. 104 | 105 | * ./2-sanitize-bam.py: Clean up .bam files, including ATAC-specific fixes. 106 | This includes: chrM removal, quality filtering, ENCODE blacklist removal, and PCR duplicate removal 107 | 108 | * ./3-call-peaks.py: Run both macs14 and macs2 on .bam files. 109 | 110 | By default, this runs both macs14 and macs2, and operates on directories of .bam files. 111 | 112 | ================ 113 | -------------------------------------------------------------------------------- /_logshim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Clean, colorized logging for scripts. 3 | 4 | Based on https://stackoverflow.com/questions/384076/how-can-i-color-python-logging-output 5 | with many adaptations by Nick Semenkovich <semenko@alum.mit.edu> 6 | 7 | License: MIT 8 | Author: Nick Semenkovich <semenko@alum.mit.edu> 9 | """ 10 | 11 | from __future__ import absolute_import, division, print_function, unicode_literals 12 | 13 | __author__ = 'Nick Semenkovich <semenko@alum.mit.edu>' 14 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 15 | 16 | import atexit 17 | import datetime 18 | import logging 19 | import os 20 | import platform 21 | import sys 22 | import time 23 | 24 | if __name__ == '__main__': 25 | print("Do not run this as a standalone script.") 26 | exit() 27 | 28 | # Very pretty error reporting, where available 29 | try: 30 | from IPython.core import ultratb 31 | sys.excepthook = ultratb.FormattedTB(mode='Context', color_scheme='Linux') 32 | except ImportError: 33 | pass 34 | 35 | # Log our execution time, used by log_execution_time below. 36 | STARTTIME = time.time() 37 | 38 | ## Define our core colors and resets. 39 | 40 | RESET = "\x1b[0m" 41 | BOLD = "\033[1m" 42 | BLUE = "\x1b[34;01m" 43 | CYAN = "\x1b[36;01m" 44 | GREEN = "\x1b[32;01m" 45 | RED = "\x1b[31;01m" 46 | GRAY = "\x1b[37;01m" 47 | YELLOW = "\x1b[33;01m" 48 | 49 | # TODO: replace TMP with gettmp or whatever 50 | 51 | def startLogger(verbose=False, noFileLog=False, initialLoggerName='main', outPath='/tmp'): 52 | """ 53 | Set logging if called. 54 | TODO: Make this a class, extending the logging module. 55 | """ 56 | datestamp = datetime.datetime.now().strftime("%Y%m%d-%H:%M:%S") 57 | results_path = outPath + '/' + datestamp + '.'
+ platform.node() + '.log' 58 | 59 | if not noFileLog: 60 | fileHandler = logging.FileHandler(results_path, 'w') 61 | fileFormatter = logging.Formatter('%(asctime)s: %(name)-25s: %(levelno)-3s: ' 62 | '(%(filename)s:%(lineno)d, %(funcName)s) : %(message)s') 63 | fileHandler.setFormatter(fileFormatter) 64 | fileHandler.setLevel(logging.DEBUG) 65 | 66 | # Sneakily add colors directly as the log-level name. 67 | logging.addLevelName(logging.DEBUG, CYAN + 'DEBUG') 68 | logging.addLevelName(logging.INFO, GREEN + 'INFO') 69 | logging.addLevelName(logging.WARNING, YELLOW + 'WARNING') 70 | logging.addLevelName(logging.ERROR, RED + 'ERROR') 71 | logging.addLevelName(logging.CRITICAL, RED + 'CRITICAL') 72 | 73 | consoleHandler = logging.StreamHandler() 74 | consoleFormatter = logging.Formatter(BOLD + "%(name)-25s" + RESET + ": %(levelname)-17s" + RESET + 75 | ": %(message)-80s (" + BOLD + "%(filename)s" + RESET + ":%(lineno)d)") 76 | consoleHandler.setFormatter(consoleFormatter) 77 | 78 | consoleHandler.setLevel(logging.INFO) 79 | if verbose: 80 | consoleHandler.setLevel(logging.DEBUG) 81 | 82 | # Give out a logger! 83 | rootlog = logging.getLogger() 84 | rootlog.setLevel(logging.DEBUG) 85 | 86 | rootlog.addHandler(consoleHandler) 87 | if not noFileLog: 88 | rootlog.addHandler(fileHandler) 89 | rootlog.info('>> Logging to %s <<' % (results_path)) 90 | rootlog.info('Running: %s [full command saved to log file]' % (os.path.basename(sys.argv[0]))) 91 | rootlog.info(' Using python version: %s' % (sys.version.split('\n')[0])) 92 | rootlog.debug('%s' % (' '.join(sys.argv))) 93 | rootlog.info('Written by %s' % (__author__)) 94 | rootlog.info(' Developed for the %s' % (__copyright__)) 95 | 96 | return logging.getLogger(initialLoggerName) 97 | 98 | 99 | def getLogger(name): 100 | """ 101 | Returns a logger! 102 | """ 103 | return logging.getLogger(name) 104 | 105 | def log_execution_time(): 106 | """ 107 | Print the total elapsed execution time. 108 | 109 | :return: 110 | """ 111 | logging.getLogger('root').info("Execution took: %0.2f secs." % (time.time() - STARTTIME)) 112 | 113 | # Register an exit handler to print execution time. 114 | atexit.register(log_execution_time) 115 | -------------------------------------------------------------------------------- /_script_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared helper functions. 3 | 4 | License: MIT 5 | Author: Nick Semenkovich 6 | """ 7 | 8 | from __future__ import print_function 9 | import glob 10 | import os 11 | import shlex 12 | import time 13 | import tempfile 14 | import yaml 15 | from distutils import spawn 16 | from subprocess import Popen, PIPE 17 | 18 | 19 | if __name__ == '__main__': 20 | print("Do not run this as a standalone script.") 21 | exit() 22 | 23 | ################### 24 | ## Early sanity checks. 25 | ################### 26 | 27 | THISPATH = os.path.dirname(os.path.realpath(__file__)) 28 | 29 | ## TODO: Given our new dist, consider dropping some of these or modifying this. 30 | for cmd in ['samtools', 'grep', 'bedtools', 'bowtie2']: 31 | if spawn.find_executable(cmd) is None: 32 | raise OSError("Software missing, unable to find: %s" % (cmd)) 33 | 34 | 35 | def get_config(): 36 | """ 37 | Load the yaml config based on our true root path. 
38 | 39 | :return: YAML config object 40 | """ 41 | with open(THISPATH + '/.config.yaml') as yamlfile: 42 | config = yaml.load(yamlfile) 43 | 44 | # Append our true path to each binary 45 | binaries_with_paths = {} 46 | for binary_name in config['binaries']: 47 | binaries_with_paths[binary_name] = THISPATH + '/' + config['binaries'][binary_name] 48 | 49 | config['binaries'] = binaries_with_paths 50 | 51 | return config 52 | 53 | def setup_output_path(path_or_file): 54 | """ 55 | Make sure our output directory is writeable. Create it if necessary. 56 | """ 57 | if os.path.isfile(path_or_file): 58 | raise ValueError("Output path appears to be a file. Please specify a directory.") 59 | 60 | output_path = path_or_file 61 | 62 | output_path = os.path.normpath(os.path.normcase(output_path)) 63 | 64 | try: 65 | os.mkdir(output_path) 66 | except OSError: 67 | if not os.access(output_path, os.W_OK): 68 | raise OSError("Output path couldn't be created or isn't writeable: %s" % (output_path)) 69 | 70 | return output_path 71 | 72 | 73 | def validate_input_files(input_path_or_file, mask='.bam'): 74 | """ 75 | Given an input arg (either a specific file, or a path), return it as a list of files. 76 | Also check that files are readable. 77 | """ 78 | # TODO: Make this handle a list of raw files, too. (e.g. for 3-OPTIONAL...) 79 | if os.path.isfile(input_path_or_file): 80 | # We got a single file! 81 | if not input_path_or_file.endswith(mask): 82 | raise ValueError("Expected a %s input (or a directory with %s). You gave: %s" 83 | % (mask, mask, input_path_or_file)) 84 | file_list = [input_path_or_file] 85 | else: 86 | # It's not a file. Must be a directory. 87 | if not os.path.isdir(input_path_or_file): 88 | raise ValueError("Input not found (or not a file/folder): %s" % ((input_path_or_file))) 89 | file_list = glob.glob((input_path_or_file) + "/*" + mask) 90 | 91 | if len(file_list) == 0: 92 | raise ValueError("Input was empty!") 93 | 94 | for filename in file_list: 95 | if not os.access(filename, os.R_OK): 96 | raise OSError("Cannot read file: %s" % (filename)) 97 | 98 | return file_list 99 | 100 | 101 | class ShellJobRunner(): 102 | """ 103 | Run shell jobs and make sure they complete. 104 | 105 | This is dangerous to run on untrusted inputs! 106 | """ 107 | def __init__(self, logger, delay_seconds=False): 108 | self.logger = logger 109 | self.delay_seconds = delay_seconds 110 | self.process_list = [] 111 | if delay_seconds is False: 112 | self.logger.info('Created a NON-parallel job runner.') 113 | else: 114 | self.logger.info('Created a parallel job runner with %i second delay between jobs.' % (delay_seconds)) 115 | 116 | def run(self, command): 117 | """ 118 | Run a given command. May be blocking (default) or non-blocking if delay_seconds is set. 119 | """ 120 | 121 | self.logger.debug('Running: %s' % (command)) 122 | 123 | # TODO: Clean this up? We shouldn't be spawning sh to spawn bash to set pipefail ... 124 | process = Popen('nice bash -c "set -o pipefail; (%s)"' % command, shell=True) 125 | self.logger.debug('Spawned PID: %i' % (process.pid)) 126 | self.process_list.append(process) 127 | 128 | if self.delay_seconds is False: 129 | self.logger.info('* Parallelism disabled. Waiting for job to complete.') 130 | runtime_process_status = process.wait() 131 | else: 132 | self.logger.info('* Waiting %i seconds to spawn next job.' 
% (self.delay_seconds)) 133 | time.sleep(self.delay_seconds) 134 | runtime_process_status = process.poll() 135 | 136 | if runtime_process_status is None: 137 | # Not done yet, that's cool! 138 | # Since delay_seconds was set, we'll return now. The user better call finish() later! 139 | pass 140 | elif runtime_process_status == 0: 141 | # We're done already? That was suspiciously fast (or delay_seconds is too high). 142 | self.logger.warn('This task finished in less than %d seconds.' % (self.delay_seconds)) 143 | self.logger.warn('This is OK if your input files are small, otherwise, this is suspicious.') 144 | 145 | if runtime_process_status > 0: 146 | self.logger.critical('The last command failed!') 147 | self.logger.critical('Fault occurred in: %s' % (command)) 148 | raise ValueError('Process failed with exit code: %i' % (runtime_process_status)) 149 | 150 | 151 | def finish(self): 152 | """ 153 | Close out / block for processes. 154 | """ 155 | self.logger.info('Waiting for all %i processes to complete...' % (len(self.process_list))) 156 | 157 | # TODO: Consider more granular failure info here? 158 | exit_codes = [p.wait() for p in self.process_list] 159 | if sum(exit_codes) != 0: 160 | self.logger.critical('A process died! Cannot continue.') 161 | raise ValueError("One of the processes failed! Are you out of RAM (or hitting a system limit?)") 162 | 163 | self.logger.info('All processes done! Yay!') 164 | 165 | 166 | class IntelligentRunner(): 167 | """ 168 | Run the input command string (echo | grep | cut ...) via subprocess, and 169 | catch / discard known false-positive/annoying errors. 170 | """ 171 | known_ignorable_stderr = {'[samopen] SAM header is present:'} 172 | stderr_fp = tempfile.SpooledTemporaryFile() 173 | 174 | def __init__(self, command_string): 175 | self.command_string = command_string 176 | self.command_list = ','.join(shlex.split(command_string)).split(",|,") 177 | 178 | def _check_for_errors(self): 179 | self.stderr_fp.seek(0) 180 | for stdout_line in self.stderr_fp.readlines(): 181 | if stdout_line.strip() not in self.known_ignorable_stderr: 182 | if len(filter(stdout_line.strip().startswith, self.known_ignorable_stderr)) == 0: 183 | print("Was running: %s" % (self.command_string)) 184 | raise ValueError("Unignorable STDERR output: %s" % (stdout_line.strip())) 185 | print(stdout_line.strip()) 186 | 187 | 188 | def run(self): 189 | print(self.command_list) 190 | last_process = Popen(self.command_list[0].split(','), stdout=PIPE, stderr=self.stderr_fp) 191 | for command in self.command_list: 192 | print(command) 193 | last_process = Popen(command.split(','), stdin=last_process.stdout, stdout=PIPE, stderr=self.stderr_fp) 194 | 195 | # Grab the output 196 | output = last_process.communicate()[0] 197 | 198 | self._check_for_errors() 199 | 200 | return output 201 | -------------------------------------------------------------------------------- /bamliquidatorbatch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/bamliquidatorbatch/__init__.py -------------------------------------------------------------------------------- /bamliquidatorbatch/bamliquidator_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import normalize_plot_and_summarize as nps 4 | from flattener import write_tab_for_all 5 | 6 | import argparse 7 | import csv 8 | import datetime 9 | 
import errno 10 | import os 11 | import subprocess 12 | import tables 13 | import logging 14 | import sys 15 | import abc 16 | import collections 17 | import numpy 18 | 19 | from time import time 20 | from os.path import basename 21 | from os.path import dirname 22 | 23 | __version__ = '1.2.0' 24 | 25 | default_black_list = ["chrUn", "_random", "Zv9_", "_hap"] 26 | 27 | def create_files_table(h5file): 28 | class Files(tables.IsDescription): 29 | key = tables.UInt32Col( pos=0) # is there an easier way to assign keys? 30 | length = tables.UInt64Col( pos=2) 31 | # file_name would be included here, but pytables doesn't support variable length strings as table column 32 | # so it is instead in a vlarray "file_names" 33 | 34 | table = h5file.create_table("/", "files", Files, "File keys and reference sequence lengths corresponding " 35 | "to the counts table") 36 | table.flush() 37 | 38 | return table 39 | 40 | def create_file_names_array(h5file): 41 | # vlarray of strings only supports a single column, so the file_key is implicitly the array index 42 | array = h5file.create_vlarray("/", "file_names", tables.VLStringAtom(), 43 | "File names with index corresponding to Files table key") 44 | array.append("*") # index/key 0 is reserved for this 45 | array.flush() 46 | 47 | return array 48 | 49 | def all_bam_file_paths_in_directory(bam_directory): 50 | bam_file_paths = [] 51 | for dirpath, _, files in os.walk(bam_directory, followlinks=True): 52 | for file_ in files: 53 | if file_.endswith(".bam"): 54 | bam_file_paths.append(os.path.join(dirpath, file_)) 55 | return bam_file_paths 56 | 57 | def bam_file_paths_with_no_file_entries(file_names, bam_file_paths): 58 | with_no_counts = [] 59 | 60 | for bam_file_path in bam_file_paths: 61 | if basename(bam_file_path) not in file_names: 62 | with_no_counts.append(bam_file_path) 63 | 64 | return with_no_counts 65 | 66 | # BaseLiquidator is an abstract base class, with concrete classes BinLiquidator and RegionLiquidator 67 | # that implement the abstract methods. 68 | class BaseLiquidator(object): 69 | __metaclass__ = abc.ABCMeta 70 | 71 | @abc.abstractmethod 72 | def liquidate(self, bam_file_path, extension, sense = None): 73 | pass 74 | 75 | @abc.abstractmethod 76 | def normalize(self): 77 | pass 78 | 79 | @abc.abstractmethod 80 | def create_counts_table(self, h5file): 81 | pass 82 | 83 | def __init__(self, executable, counts_table_name, output_directory, bam_file_path, 84 | include_cpp_warnings_in_stderr = True, counts_file_path = None, number_of_threads = 0): 85 | # clear all memoized values from any prior runs 86 | nps.file_keys_memo = {} 87 | 88 | self.timings = collections.OrderedDict() 89 | 90 | self.output_directory = output_directory 91 | self.counts_file_path = counts_file_path 92 | self.include_cpp_warnings_in_stderr = include_cpp_warnings_in_stderr 93 | self.number_of_threads = number_of_threads 94 | self.chromosome_patterns_to_skip = [] 95 | 96 | # This script may be run by either a developer install from a git pipeline checkout, 97 | # or from a user install so that the executable is on the path. First we try to 98 | # find the executable for a developer install, and if that fails we look on the 99 | # standard path.
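# For example, a developer checkout is expected to look like (illustrative
# layout, inferred from the path checks below):
#   .../bamliquidator_internal/bamliquidatorbatch/bamliquidator_batch.py  <- this script
#   .../bamliquidator_internal/bamliquidator_bins                         <- compiled executable
# while a user install just needs the executable somewhere on $PATH.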
100 | if basename(dirname(dirname(os.path.realpath(__file__)))) == 'bamliquidator_internal': 101 | # look for developer executable location 102 | self.executable_path = os.path.join(dirname(dirname(os.path.realpath(__file__))), executable) 103 | if not os.path.isfile(self.executable_path): 104 | exit("%s is missing -- try cd'ing into the directory and running 'make'" % self.executable_path) 105 | else: 106 | # just look on standard path 107 | self.executable_path = executable 108 | 109 | mkdir_if_not_exists(output_directory) 110 | 111 | if self.counts_file_path is None: 112 | self.counts_file_path = os.path.join(output_directory, "counts.h5") 113 | 114 | counts_file = tables.open_file(self.counts_file_path, mode = "w", 115 | title = 'bam liquidator genome read counts - version %s' % __version__) 116 | else: 117 | counts_file = tables.open_file(self.counts_file_path, "r+") 118 | 119 | try: 120 | counts = counts_file.get_node("/", counts_table_name) 121 | files = counts_file.root.files 122 | file_names = counts_file.root.file_names 123 | except: 124 | counts = self.create_counts_table(counts_file) 125 | files = create_files_table(counts_file) 126 | file_names = create_file_names_array(counts_file) 127 | 128 | if os.path.isdir(bam_file_path): 129 | self.bam_file_paths = all_bam_file_paths_in_directory(bam_file_path) 130 | else: 131 | self.bam_file_paths = [bam_file_path] 132 | 133 | self.bam_file_paths = bam_file_paths_with_no_file_entries(file_names, self.bam_file_paths) 134 | 135 | self.preprocess(files, file_names) 136 | 137 | counts_file.close() # bamliquidator_bins/bamliquidator_regions will open this file and modify 138 | # it, so it is probably best that we not hold an out of sync reference 139 | 140 | 141 | # adds files being liquidated to the files table and populates the following member dictionaries: 142 | # 1) file_name -> [(chromosome, sequence length), ...] 143 | # 2) file_name -> total mapped count 144 | # 3) file_name -> file key number 145 | def preprocess(self, files, file_names): 146 | self.file_to_chromosome_length_pairs = {} 147 | self.file_to_count = {} 148 | self.file_to_key = {} 149 | 150 | chr_col = 0 151 | length_col = 1 152 | mapped_read_col = 2 153 | 154 | # bam file keys start at 1. 155 | # key 0 is special and denotes "no specific file", which 156 | # is used in normalized_counts tables to mean an average or total for all bam files 157 | # of a specific cell type. 158 | next_file_key = 0 # see += 1 below 159 | for file_record in files: 160 | next_file_key = max(next_file_key, file_record["key"]) 161 | next_file_key += 1 162 | 163 | for bam_file_path in self.bam_file_paths: 164 | args = ["samtools", "idxstats", bam_file_path] 165 | output = subprocess.check_output(args) 166 | # skip last two lines: the unmapped chromosome line and the empty line 167 | reader = csv.reader(output.split('\n')[:-2], delimiter='\t') 168 | file_name = basename(bam_file_path) 169 | file_count = 0 170 | 171 | chromosome_length_pairs = [] 172 | for row in reader: 173 | chromosome = row[chr_col] 174 | if len(chromosome) >= nps.chromosome_name_length: 175 | raise RuntimeError('Chromosome name "%s" exceeds the max supported chromosome name length (%d). ' 176 | 'This max chromosome length may be updated in the code if necessary -- please ' 177 | 'contact the bamliquidator developers for additional assistance.'
178 | % (chromosome, nps.chromosome_name_length)) 179 | 180 | file_count += int(row[mapped_read_col]) 181 | chromosome_length_pairs.append((chromosome, int(row[length_col]))) 182 | 183 | files.row["key"] = next_file_key 184 | files.row["length"] = file_count 185 | files.row.append() 186 | file_names.append(file_name) 187 | 188 | self.file_to_chromosome_length_pairs[file_name] = chromosome_length_pairs 189 | self.file_to_count[file_name] = file_count 190 | self.file_to_key[file_name] = next_file_key 191 | 192 | next_file_key += 1 193 | 194 | files.flush() 195 | file_names.flush() 196 | assert(len(file_names) - 1 == len(files)) 197 | assert(len(file_names) == next_file_key) 198 | 199 | def batch(self, extension, sense): 200 | for i, bam_file_path in enumerate(self.bam_file_paths): 201 | logging.info("Liquidating %s (file %d of %d)", bam_file_path, i+1, len(self.bam_file_paths)) 202 | 203 | return_code = self.liquidate(bam_file_path, extension, sense) 204 | if return_code != 0: 205 | raise Exception("%s failed with exit code %d" % (self.executable_path, return_code)) 206 | 207 | start = time() 208 | self.normalize() 209 | duration = time() - start 210 | logging.info("Post liquidation processing took %f seconds", duration) 211 | self.log_time('post_liquidation', duration) 212 | 213 | def flatten(self): 214 | logging.info("Flattening HDF5 tables into text files") 215 | start = time() 216 | 217 | with tables.open_file(self.counts_file_path, mode = "r") as counts_file: 218 | write_tab_for_all(counts_file, self.output_directory) 219 | 220 | duration = time() - start 221 | logging.info("Flattening took %f seconds" % duration) 222 | self.log_time('flattening', duration) 223 | 224 | def chromosome_args(self, bam_file_name, skip_non_canonical): 225 | args = [] 226 | for chromosome, length in self.file_to_chromosome_length_pairs[bam_file_name]: 227 | if skip_non_canonical: 228 | if any(pattern in chromosome for pattern in self.chromosome_patterns_to_skip): 229 | continue 230 | args.append(chromosome) 231 | args.append(str(length)) 232 | return args 233 | 234 | def logging_cpp_args(self): 235 | return [os.path.join(self.output_directory, "log.txt"), "1" if self.include_cpp_warnings_in_stderr else "0"] 236 | 237 | def log_time(self, title, seconds): 238 | self.timings[title] = seconds 239 | 240 | def write_timings_to_junit_xml(self): 241 | with open(os.path.join(self.output_directory, 'timings.xml'), 'w') as xml: 242 | xml.write('<testsuite tests="%d">\n' % len(self.timings.keys())) 243 | for title in self.timings: 244 | xml.write('\t<testcase name="%s" time="%f"/>\n' % (title, self.timings[title])) 245 | xml.write('</testsuite>\n') 246 | 247 | class BinLiquidator(BaseLiquidator): 248 | def __init__(self, bin_size, output_directory, bam_file_path, 249 | counts_file_path = None, extension = 0, sense = '.', skip_plot = False, 250 | include_cpp_warnings_in_stderr = True, number_of_threads = 0, blacklist = default_black_list): 251 | self.bin_size = bin_size 252 | self.skip_plot = skip_plot 253 | super(BinLiquidator, self).__init__("bamliquidator_bins", "bin_counts", output_directory, bam_file_path, 254 | include_cpp_warnings_in_stderr, counts_file_path, number_of_threads) 255 | self.chromosome_patterns_to_skip = blacklist 256 | self.batch(extension, sense) 257 | 258 | def liquidate(self, bam_file_path, extension, sense = None): 259 | if sense is None: sense = '.'
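# Note: the bam's parent directory name doubles as its cell type label below,
# e.g. (illustrative path) data/cd4/sample.bam -> cell type "cd4"; a bam with
# no parent directory falls back to "-".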
260 | 261 | cell_type = basename(dirname(bam_file_path)) 262 | if cell_type == '': 263 | cell_type = '-' 264 | bam_file_name = basename(bam_file_path) 265 | args = [self.executable_path, str(self.number_of_threads), cell_type, str(self.bin_size), str(extension), sense, bam_file_path, 266 | str(self.file_to_key[bam_file_name]), self.counts_file_path] 267 | args.extend(self.logging_cpp_args()) 268 | args.extend(self.chromosome_args(bam_file_name, skip_non_canonical=True)) 269 | 270 | start = time() 271 | return_code = subprocess.call(args) 272 | duration = time() - start 273 | 274 | reads = self.file_to_count[bam_file_name] 275 | rate = reads / (10**6) / duration 276 | logging.info("Liquidation completed: %f seconds, %d reads, %f millions of reads per second", duration, reads, rate) 277 | self.log_time('liquidation', duration) 278 | 279 | return return_code 280 | 281 | def normalize(self): 282 | with tables.open_file(self.counts_file_path, mode = "r+") as counts_file: 283 | nps.normalize_plot_and_summarize(counts_file, self.output_directory, self.bin_size, self.skip_plot) 284 | 285 | def create_counts_table(self, h5file): 286 | class BinCount(tables.IsDescription): 287 | bin_number = tables.UInt32Col( pos=0) 288 | cell_type = tables.StringCol(16, pos=1) 289 | chromosome = tables.StringCol(nps.chromosome_name_length, pos=2) 290 | count = tables.UInt64Col( pos=3) 291 | file_key = tables.UInt32Col( pos=4) 292 | 293 | table = h5file.create_table("/", "bin_counts", BinCount, "bin counts") 294 | table.flush() 295 | return table 296 | 297 | class RegionLiquidator(BaseLiquidator): 298 | def __init__(self, regions_file, output_directory, bam_file_path, 299 | region_format=None, counts_file_path = None, extension = 0, sense = '.', 300 | include_cpp_warnings_in_stderr = True, number_of_threads = 0): 301 | self.regions_file = regions_file 302 | self.region_format = region_format 303 | if self.region_format is None: 304 | _, self.region_format = os.path.splitext(regions_file) 305 | if len(self.region_format) > 0 and self.region_format[0] == '.': 306 | self.region_format = self.region_format[1:] 307 | if self.region_format not in ("gff", "bed"): 308 | raise RuntimeError("Only bed and gff region file formats are supported -- %s format specified" 309 | % str(self.region_format)) 310 | 311 | super(RegionLiquidator, self).__init__("bamliquidator_regions", "region_counts", output_directory, 312 | bam_file_path, include_cpp_warnings_in_stderr, counts_file_path, number_of_threads) 313 | 314 | self.batch(extension, sense) 315 | 316 | def liquidate(self, bam_file_path, extension, sense = None): 317 | bam_file_name = basename(bam_file_path) 318 | args = [self.executable_path, str(self.number_of_threads), self.regions_file, str(self.region_format), str(extension), bam_file_path, 319 | str(self.file_to_key[bam_file_name]), self.counts_file_path] 320 | args.extend(self.logging_cpp_args()) 321 | if sense is None: 322 | args.append('_') # _ means use strand specified in region file (or . 
if none specified) 323 | else: 324 | args.append(sense) 325 | args.extend(self.chromosome_args(bam_file_name, skip_non_canonical=False)) 326 | 327 | start = time() 328 | return_code = subprocess.call(args) 329 | duration = time() - start 330 | 331 | logging.info("Liquidation completed: %f seconds", duration) 332 | self.log_time('liquidation', duration) 333 | 334 | return return_code 335 | 336 | def normalize(self): 337 | with tables.open_file(self.counts_file_path, mode = "r+") as counts_file: 338 | nps.normalize_regions(counts_file.root.region_counts, counts_file.root.files) 339 | 340 | def create_counts_table(self, h5file): 341 | class Region(tables.IsDescription): 342 | file_key = tables.UInt32Col( pos=0) 343 | chromosome = tables.StringCol(nps.chromosome_name_length, pos=1) 344 | region_name = tables.StringCol(64, pos=2) 345 | start = tables.UInt64Col( pos=3) 346 | stop = tables.UInt64Col( pos=4) 347 | strand = tables.StringCol(1, pos=5) 348 | count = tables.UInt64Col( pos=6) 349 | normalized_count = tables.Float64Col( pos=7) 350 | 351 | table = h5file.create_table("/", "region_counts", Region, "region counts") 352 | table.flush() 353 | return table 354 | 355 | def write_bamToGff_matrix(output_file_path, h5_region_counts_file_path): 356 | with tables.open_file(h5_region_counts_file_path, "r") as counts_file: 357 | with open(output_file_path, "w") as output: 358 | file_keys = [] 359 | 360 | output.write("GENE_ID\tlocusLine") 361 | for file_record in counts_file.root.files: 362 | file_key = file_record["key"] 363 | file_keys.append(file_key) 364 | output.write("\tbin_1_%s" % counts_file.root.file_names[file_key]) 365 | output.write("\n") 366 | 367 | number_of_files = len(file_keys) 368 | number_of_regions = counts_file.root.region_counts.nrows / number_of_files 369 | 370 | # first loop through all but the last file index, storing those counts 371 | prior_region_counts = numpy.zeros((number_of_regions, number_of_files - 1)) 372 | for col, file_key in enumerate(file_keys[:-1]): 373 | for row, region in enumerate(counts_file.root.region_counts.where("file_key == %d" % file_key)): 374 | prior_region_counts[row, col] = region["normalized_count"] 375 | 376 | # then loop through the last index, 377 | # printing the region columns and the counts for the prior files, 378 | # along with the count for the last index 379 | for row, region in enumerate(counts_file.root.region_counts.where("file_key == %d" % file_keys[-1])): 380 | output.write("%s\t%s(%s):%d-%d" % (region["region_name"], region["chromosome"], 381 | region["strand"], region["start"], region["stop"])) 382 | for col in range(0, number_of_files-1): 383 | output.write("\t%s" % round(prior_region_counts[row, col], 4)) 384 | output.write("\t%s\n" % round(region["normalized_count"], 4)) 385 | 386 | def configure_logging(args): 387 | # Using root logger so we can just do logging.info/warn/error in this and other files. 388 | # If people start using bamliquidator_batch as an imported module, then we should probably 389 | # change this logging to not use the root logger directly. 
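    # With the file handler configured below, an entry in log.txt looks roughly
    # like this (illustrative example, not captured output; the tab comes from
    # the '%(levelname)s\t%(message)s' portion of the format):
    #
    #   2016-01-01 12:00:00 INFO	Liquidating /data/mm1s/sample.bam (file 1 of 3)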
390 |     logger = logging.getLogger()
391 |     logger.setLevel(logging.INFO)
392 | 
393 |     file_handler = logging.FileHandler(os.path.join(args.output_directory, 'log.txt'))
394 |     file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s\t%(message)s',
395 |                                                 datefmt='%Y-%m-%d %H:%M:%S'))
396 | 
397 |     logger.addHandler(file_handler)
398 |     # todo: add bamliquidator version to the starting log message
399 |     logging.info("Starting %s %s with args %s", basename(sys.argv[0]), __version__, vars(args))
400 | 
401 |     # Adding console handler after writing the startup log entry. The startup log could be useful
402 |     # in a file that is being appended to from a prior run, but would be annoying on stderr.
403 | 
404 |     console_handler = logging.StreamHandler()
405 |     if args.quiet:
406 |         console_handler.setLevel(logging.ERROR)
407 |     else:
408 |         console_handler.setLevel(logging.INFO)
409 | 
410 |     class FormatterNotFormattingInfo(logging.Formatter):
411 |         def __init__(self, fmt):
412 |             logging.Formatter.__init__(self, fmt)
413 | 
414 |         def format(self, record):
415 |             if record.levelno == logging.INFO:
416 |                 return record.getMessage()
417 |             return logging.Formatter.format(self, record)
418 | 
419 |     console_handler.setFormatter(FormatterNotFormattingInfo('%(levelname)s\t%(message)s'))
420 |     logger.addHandler(console_handler)
421 | 
422 | def mkdir_if_not_exists(directory):
423 |     try:
424 |         os.mkdir(directory)
425 |     except OSError as exception:
426 |         if exception.errno != errno.EEXIST:
427 |             raise
428 | 
429 | def main():
430 |     parser = argparse.ArgumentParser(description='Count the number of base pair reads in each bin or region '
431 |                                      'in the bam file(s) at the given directory, and then normalize, plot bins, '
432 |                                      'and summarize the counts in the output directory. For additional '
433 |                                      'help, please see https://github.com/BradnerLab/pipeline/wiki')
434 | 
435 |     mut_exclusive_group = parser.add_mutually_exclusive_group()
436 |     mut_exclusive_group.add_argument('-b', '--bin_size', type=int, default=100000,
437 |         help="Number of base pairs in each bin -- the smaller the bin size the longer the runtime and "
438 |              "the larger the data files (default is 100000)")
439 |     mut_exclusive_group.add_argument('-r', '--regions_file',
440 |         help='a region file in either .gff or .bed format')
441 | 
442 |     parser.add_argument('-o', '--output_directory', default='output',
443 |         help='Directory to output the h5, log, gff, tab, and/or html files to. Creates directory if necessary. '
444 |              'May overwrite prior run results if present. Default is "./output".')
445 |     parser.add_argument('-c', '--counts_file', default=None,
446 |         help='HDF5 counts file from a prior run to be appended to. If unspecified, defaults to '
447 |              'creating a new file "counts.h5" in the output directory.')
448 |     parser.add_argument('-f', '--flatten', action='store_true',
449 |         help='flatten all HDF5 tables into tab delimited text files in the output directory, one for each '
450 |              'chromosome (note that HDF5 files can be efficiently queried and used directly -- e.g. please '
451 |              'see http://www.pytables.org/ for easy to use Python APIs and '
452 |              'http://www.hdfgroup.org/products/java/hdf-java-html/hdfview/ for an easy to use GUI for '
453 |              'browsing HDF5 files)')
454 |     parser.add_argument('-e', '--extension', type=int, default=0,
455 |         help='Extends reads by n bp (default is 0)')
456 |     parser.add_argument('--sense', default=None, choices=['+', '-', '.'],
457 |         help="Map to '+' (forward), '-' (reverse) or '.' (both) strands.
For gff regions, default is to use " 458 | "the sense specified by the gff file; otherwise, default maps to both.") 459 | parser.add_argument('-m', '--match_bamToGFF', default=False, action='store_true', 460 | help="match bamToGFF_turbo.py matrix output format, storing the result as matrix.txt in the output folder") 461 | parser.add_argument('--region_format', default=None, choices=['gff', 'bed'], 462 | help="Interpret region file as having the given format. Default is to deduce format from file extension.") 463 | parser.add_argument('--skip_plot', action='store_true', help='Skip generating plots. (This can speed up execution.)') 464 | parser.add_argument('--black_list', nargs='+', type=str, default=default_black_list, 465 | help='One or more (space separated) chromosome patterns to skip during bin liquidation. Default is ' 466 | 'to skip any chromosomes that contain any of the following substrings: %s. ' % " ".join(default_black_list)) 467 | parser.add_argument('-q', '--quiet', action='store_true', 468 | help='Informational and warning output is suppressed so only errors are written to the console (stderr). ' 469 | 'All bamliquidator logs are still written to log.txt in the output directory. This also disables ' 470 | 'samtools error messages to stderr, but a corresponding bamliquidator message should still be logged ' 471 | 'in log.txt.') 472 | parser.add_argument('-n', '--number_of_threads', type=int, default=0, 473 | help='Number of threads to run concurrently during liquidation. Defaults to the total number of logical ' 474 | 'cpus on the system.') 475 | parser.add_argument('--xml_timings', action='store_true', 476 | help='Write performance timings to junit style timings.xml in output folder, which is useful for ' 477 | 'tracking performance over time with automatically generated Jenkins graphs') 478 | parser.add_argument('--version', action='version', version='%s %s' % (basename(sys.argv[0]), __version__)) 479 | parser.add_argument('bam_file_path', 480 | help='The directory to recursively search for .bam files for counting. Every .bam file must ' 481 | 'have a corresponding .bai file at the same location. To count just a single file, ' 482 | 'provide the .bam file path instead of a directory. The parent directory (up to 16 char) of each ' 483 | '.bam file is interpreted as the cell type (e.g. mm1s might be an appropriate directory ' 484 | 'name). Bam files in the same directory are grouped together for plotting. Plots use ' 485 | 'normalized counts, such that all .bam files in the same directory have bin ' 486 | 'counts that add up to 1 for each chromosome. If your .bam files are not in this ' 487 | 'directory format, please consider creating a directory of sym links to your actual ' 488 | '.bam and .bai files. 
If the .bam file already has 1 or more reads in the HDF5 counts file, ' 489 | 'then that .bam file is skipped from liquidation, but is still included in normalization, ' 490 | 'plotting, and summaries.') 491 | 492 | args = parser.parse_args() 493 | 494 | assert(tables.__version__ >= '3.0.0') 495 | 496 | mkdir_if_not_exists(args.output_directory) 497 | 498 | configure_logging(args) 499 | 500 | if args.regions_file is None: 501 | liquidator = BinLiquidator(args.bin_size, args.output_directory, args.bam_file_path, 502 | args.counts_file, args.extension, args.sense, args.skip_plot, 503 | not args.quiet, args.number_of_threads, args.black_list) 504 | else: 505 | if args.counts_file: 506 | raise Exception("Appending to a prior regions counts.h5 file is not supported at this time -- " 507 | "please email the developer if you need this feature") 508 | # non-exhaustive list of items that would need to be handled to get this working: 509 | ## review matrix output, specifically the assumption that each file has the exact same regions in the same order 510 | liquidator = RegionLiquidator(args.regions_file, args.output_directory, args.bam_file_path, 511 | args.region_format, args.counts_file, args.extension, args.sense, 512 | not args.quiet, args.number_of_threads) 513 | 514 | if args.flatten: 515 | liquidator.flatten() 516 | 517 | if args.match_bamToGFF: 518 | if args.regions_file is None: 519 | logging.warning("Ignoring match_bamToGFF argument (this is only supported if a regions file is provided)") 520 | else: 521 | logging.info("Writing bamToGff style matrix.txt file") 522 | start = time() 523 | write_bamToGff_matrix(os.path.join(args.output_directory, "matrix.txt"), liquidator.counts_file_path) 524 | duration = time() - start 525 | logging.info("Writing matrix.txt took %f seconds" % duration) 526 | liquidator.log_time('matrix', duration) 527 | 528 | if args.xml_timings: 529 | liquidator.write_timings_to_junit_xml() 530 | 531 | if __name__ == "__main__": 532 | main() 533 | 534 | ''' 535 | The MIT License (MIT) 536 | 537 | Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 538 | 539 | Permission is hereby granted, free of charge, to any person obtaining a copy 540 | of this software and associated documentation files (the "Software"), to deal 541 | in the Software without restriction, including without limitation the rights 542 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 543 | copies of the Software, and to permit persons to whom the Software is 544 | furnished to do so, subject to the following conditions: 545 | 546 | The above copyright notice and this permission notice shall be included in 547 | all copies or substantial portions of the Software. 548 | 549 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 550 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 551 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 552 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 553 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 554 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 555 | THE SOFTWARE. 
556 | '''
--------------------------------------------------------------------------------
/bamliquidatorbatch/flattener.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import argparse
4 | import csv
5 | import os
6 | import tables
7 | 
8 | def write_tab(table, file_names, output_directory, log=False):
9 |     chromosome_to_file_writer_pair = {}
10 | 
11 |     columns = [col for col in table.colnames if col != "chromosome"]
12 |     columns = [col if col != "file_key" else "file_name" for col in columns]
13 | 
14 |     for row in table:
15 |         chromosome = row["chromosome"]
16 |         if chromosome not in chromosome_to_file_writer_pair:
17 |             tab_file_path = os.path.join(output_directory, table.name + "_" + chromosome + ".tab")
18 |             if log:
19 |                 print "Writing", tab_file_path
20 | 
21 |             tab_file = open(tab_file_path, 'wb')
22 |             writer = csv.writer(tab_file, delimiter='\t')
23 |             writer.writerow(columns)
24 |             chromosome_to_file_writer_pair[chromosome] = (tab_file, writer)
25 |         else:
26 |             _, writer = chromosome_to_file_writer_pair[chromosome]
27 | 
28 |         # translate the file_key column into the corresponding file_name
29 |         row_list = []
30 |         for col in columns:
31 |             if col == "file_name":
32 |                 row_list.append(file_names[row["file_key"]])
33 |             else:
34 |                 row_list.append(row[col])
35 | 
36 |         writer.writerow(row_list)
37 | 
38 |     for tab_file, _ in chromosome_to_file_writer_pair.values():
39 |         tab_file.close()
40 | 
41 | def write_tab_for_all(h5_file, output_directory, log=False):
42 |     for table in h5_file.root:
43 |         if table.name not in ("files", "file_names"):
44 |             write_tab(table, h5_file.root.file_names, output_directory, log)
45 | 
46 | def main():
47 |     parser = argparse.ArgumentParser(description='Writes bamliquidator_batch.py hdf5 tables into tab delimited '
48 |         'text files, one for each chromosome. Note that this is provided as a convenience, but it is hoped that '
49 |         'the hdf5 files will be used directly since they are much more efficient to work with -- e.g. please see '
50 |         'http://www.pytables.org/ for easy to use Python APIs and '
51 |         'http://www.hdfgroup.org/products/java/hdf-java-html/hdfview/ for an easy to use GUI for browsing HDF5 '
52 |         'files. For more info, please see https://github.com/BradnerLab/pipeline/wiki/bamliquidator .')
53 |     parser.add_argument('-t', '--table', default=None, help='the hdf5 table to write out, e.g. "region_counts" for '
54 |         'a regions counts.h5 file, or one of the following for a uniform bins counts.h5 file: "bin_counts", '
55 |         '"normalized_counts", "sorted_summary", or "summary".
If none specified flattens every table in the h5 file, ' 56 | 'using the table name as a file prefix.') 57 | parser.add_argument('h5_file', help='the hdf5 file generated by bamliquidator_batch.py') 58 | parser.add_argument('output_directory', help='directory to store the tab files (must already exist)') 59 | args = parser.parse_args() 60 | 61 | h5_file = tables.open_file(args.h5_file, mode = "r") 62 | 63 | log = True 64 | 65 | if args.table: 66 | table = h5_file.get_node("/" + args.table) 67 | write_tab(table, h5_file.root.file_names, args.output_directory, log) 68 | else: 69 | write_tab_for_all(h5_file, args.output_directory, log) 70 | 71 | h5_file.close() 72 | 73 | if __name__ == "__main__": 74 | main() 75 | 76 | ''' 77 | The MIT License (MIT) 78 | 79 | Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 80 | 81 | Permission is hereby granted, free of charge, to any person obtaining a copy 82 | of this software and associated documentation files (the "Software"), to deal 83 | in the Software without restriction, including without limitation the rights 84 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 85 | copies of the Software, and to permit persons to whom the Software is 86 | furnished to do so, subject to the following conditions: 87 | 88 | The above copyright notice and this permission notice shall be included in 89 | all copies or substantial portions of the Software. 90 | 91 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 92 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 93 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 94 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 95 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 96 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 97 | THE SOFTWARE. 98 | ''' 99 | -------------------------------------------------------------------------------- /bamliquidatorbatch/normalize_plot_and_summarize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################## 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2013 John DiMatteo (jdimatteo@gmail.com) 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | # this software and associated documentation files (the "Software"), to deal in 10 | # the Software without restriction, including without limitation the rights to 11 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 12 | # the Software, and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 20 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 21 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 22 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
24 | #
25 | ##################################################################################
26 | 
27 | from __future__ import division
28 | 
29 | import sys
30 | import os
31 | import argparse
32 | import tables
33 | import scipy.stats as stats
34 | import collections
35 | import logging
36 | 
37 | try:
38 |     import bokeh.plotting as bp  # the plotting code below expects the old bokeh 0.4.x API
39 | except ImportError:
40 |     bp = None
41 | 
42 | 
43 | # note that my initial version didn't do any flush calls, which led to bogus rows being added
44 | # to the normalized_counts table (which was evident when the normalized counts <= 95 + > 95 didn't add up right).
45 | # -- I should probably look into why flush was necessary and/or file a bug with pytables
46 | 
47 | # I also found that create_index doesn't always work (this was causing where statements to not work)
48 | # -- I don't know if this was my fault or a bug in pytables, but I just always use create_csindex instead
49 | 
50 | chromosome_name_length = 64 # Includes 1 for null terminator, so really max of 63 characters.
51 |                             # Note that changing this value requires updating C++ code as well.
52 | 
53 | def delete_all_but_bin_counts_and_files_table(h5file):
54 |     for table in h5file.root:
55 |         if table.name != "bin_counts" and table.name != "files" and table.name != "file_names":
56 |             for index in table.colindexes.values():
57 |                 index.column.remove_index()
58 |             table.remove()
59 | 
60 | def create_normalized_counts_table(h5file):
61 |     class BinCount(tables.IsDescription):
62 |         bin_number = tables.UInt32Col( pos=0)
63 |         cell_type = tables.StringCol(16, pos=1)
64 |         chromosome = tables.StringCol(chromosome_name_length, pos=2)
65 |         count = tables.Float64Col( pos=3)
66 |         percentile = tables.Float64Col( pos=4)
67 |         file_key = tables.UInt32Col( pos=5)
68 | 
69 |     table = h5file.create_table("/", "normalized_counts", BinCount, "normalized bin counts")
70 | 
71 |     table.flush()
72 | 
73 |     return table
74 | 
75 | def all_cell_types(counts):
76 |     types = set()
77 | 
78 |     for row in counts:
79 |         types.add(row["cell_type"])
80 | 
81 |     return types
82 | 
83 | def all_chromosomes(counts):
84 |     chromosomes = collections.OrderedDict()
85 | 
86 |     for row in counts:
87 |         chromosomes[row["chromosome"]] = None
88 | 
89 |     return chromosomes.keys()
90 | 
91 | # todo: if this used the files table and we added the cell_type to the files table, this would be much faster,
92 | # but it is probably necessary to leave cell_type in counts table as well (for queries)
93 | file_keys_memo = {}
94 | def file_keys(counts, cell_type):
95 |     if not cell_type in file_keys_memo:
96 |         file_keys = set()
97 | 
98 |         logging.debug("Getting file keys for cell type %s", cell_type)
99 |         for row in counts.where("cell_type == '%s'" % cell_type):
100 |             file_keys.add(row["file_key"])
101 | 
102 |         file_keys_memo[cell_type] = file_keys
103 | 
104 |         logging.debug("memoizing files for %s: %s", cell_type, str(file_keys_memo[cell_type]))
105 | 
106 |     return file_keys_memo[cell_type]
107 | 
108 | def plot_summaries(output_directory, normalized_counts, chromosomes):
109 |     bp.output_file(output_directory + "/summary.html")
110 | 
111 |     for chromosome in chromosomes:
112 |         plot_summary(normalized_counts, chromosome)
113 | 
114 |     bp.save()
115 | 
116 | def plot_summary(normalized_counts, chromosome):
117 |     logging.debug(" - plotting %s summary", chromosome)
118 | 
119 |     condition = "(file_key == 0) & (chromosome == '%s')" % chromosome
120 | 
121 |     chromosome_count_by_bin = collections.defaultdict(int)
122 |     for row in normalized_counts.where(condition):
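    # normalized_counts.where(...) is a PyTables in-kernel query: the condition
    # string is evaluated with numexpr during the HDF5 scan rather than row by
    # row in Python, which is why these lookups stay fast on large tables.
123 |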
chromosome_count_by_bin[row["bin_number"]] += row["count"] 124 | 125 | num_bins = len(chromosome_count_by_bin) 126 | if num_bins < 2: 127 | logging.info("-- skipping plotting %s because not enough bins (only %d)", chromosome, num_bins) 128 | return 129 | 130 | overall = bp.scatter(chromosome_count_by_bin.keys(), chromosome_count_by_bin.values()) 131 | overall.title = chromosome + " counts per bin across all bam files" 132 | 133 | def plot(output_directory, normalized_counts, chromosome, cell_types): 134 | bp.output_file(output_directory + "/" + chromosome + ".html") 135 | 136 | plot_summary(normalized_counts, chromosome) 137 | 138 | for cell_type in cell_types: 139 | logging.debug(" - plotting %s", cell_type) 140 | 141 | bin_number = [] 142 | count = [] 143 | 144 | condition = "(file_key == 0) & (chromosome == '%s') & (cell_type == '%s')" % (chromosome, cell_type) 145 | 146 | for row in normalized_counts.where(condition): 147 | bin_number.append(row["bin_number"]) 148 | count.append(row["count"]) 149 | 150 | cell_type_plot = bp.scatter(bin_number, count) 151 | cell_type_plot.title = "%s counts per bin" % cell_type 152 | 153 | bp.save() 154 | 155 | def populate_normalized_counts(normalized_counts, counts, file_key, bin_size, files): 156 | total_count = length_for_file_key(files, file_key) 157 | 158 | ''' 159 | Excerpt from Feb 13, 2014 email from Charles Lin: 160 | 161 | We typically report read density in units of reads per million per basepair 162 | 163 | bamliquidator reports counts back in total read positions per bin. To convert that 164 | into reads per million per basepair, we first need to divide by the total million 165 | number of reads in the bam. Then we need to divide by the size of the bin 166 | 167 | So for instance if you have a 1kb bin and get 2500 counts from a bam with 30 million 168 | reads you would calculate density as 2500/1000/30 = 0.083rpm/bp 169 | ''' 170 | factor = (1 / bin_size) * (1 / (total_count / 10**6)) 171 | 172 | for count_row in counts.where("file_key == %d" % file_key): 173 | normalized_counts.row["bin_number"] = count_row["bin_number"] 174 | normalized_counts.row["cell_type"] = count_row["cell_type"] 175 | normalized_counts.row["chromosome"] = count_row["chromosome"] 176 | assert file_key == count_row["file_key"] 177 | normalized_counts.row["file_key"] = file_key 178 | normalized_counts.row["count"] = count_row["count"] * factor 179 | normalized_counts.row["percentile"] = -1 180 | normalized_counts.row.append() 181 | 182 | normalized_counts.flush() 183 | 184 | 185 | def length_for_file_key(files, file_key): 186 | file_rows = files.read_where("key == %d" % file_key) 187 | assert len(file_rows) == 1 188 | return file_rows[0]["length"] 189 | 190 | def normalize_regions(region_counts, files): 191 | logging.info("Normalizing") 192 | 193 | file_key = None 194 | 195 | for row in region_counts: 196 | if row["file_key"] != file_key: 197 | file_key = row["file_key"] 198 | total_count = length_for_file_key(files, file_key) 199 | 200 | region_size = row["stop"] - row["start"] 201 | factor = (1 / region_size) * (1 / (total_count / 10**6)) 202 | 203 | row["normalized_count"] = row["count"] * factor 204 | row.update() 205 | 206 | region_counts.flush() 207 | 208 | # leave off file_key argument to calculate percentiles for the cell_type averaged normalized counts 209 | def populate_percentiles(normalized_counts, cell_type, file_key = 0): 210 | bin_numbers = [] 211 | normalized_count_list = [] 212 | 213 | condition = "(cell_type == '%s') & (file_key == %d)" % (cell_type, 
file_key)
214 | 
215 |     for row in normalized_counts.where(condition):
216 |         bin_numbers.append(row["bin_number"])
217 |         normalized_count_list.append(row["count"])
218 | 
219 |     percentiles = (stats.rankdata(normalized_count_list) - 1) / (len(normalized_count_list)-1) * 100
220 |     # percentiles calculated in bulk as suggested at
221 |     # http://grokbase.com/t/python/python-list/092235vj27/faster-scipy-percentileofscore
222 | 
223 |     for i, row in enumerate(normalized_counts.where(condition)):
224 |         assert bin_numbers[i] == row["bin_number"]
225 |         row["percentile"] = percentiles[i]
226 |         row.update()
227 |     normalized_counts.flush()
228 | 
229 | # the cell type normalized counts are the averages of the genomes in the cell type
230 | def populate_normalized_counts_for_cell_type(normalized_counts, cell_type, file_keys):
231 |     processed_a_single_file = False
232 |     chromosome_to_summed_counts = collections.OrderedDict()
233 | 
234 |     for file_key in file_keys:
235 |         condition = "(file_key == %d) & (cell_type == '%s')" % (file_key, cell_type)
236 |         for row in normalized_counts.where(condition):
237 |             if processed_a_single_file:
238 |                 chromosome_to_summed_counts[row["chromosome"]][row["bin_number"]] += row["count"]
239 |             else:
240 |                 if not chromosome_to_summed_counts.has_key(row["chromosome"]):
241 |                     chromosome_to_summed_counts[row["chromosome"]] = []
242 |                 chromosome_to_summed_counts[row["chromosome"]].append(row["count"])
243 |         processed_a_single_file = True
244 | 
245 |     cell_type_condition = "(file_key == 0) & (cell_type == '%s')" % cell_type
246 | 
247 |     len_file_keys = len(file_keys)
248 | 
249 |     for chromosome, summed_counts in chromosome_to_summed_counts.iteritems():
250 |         for i, summed_count in enumerate(summed_counts):
251 |             normalized_counts.row["bin_number"] = i
252 |             normalized_counts.row["cell_type"] = cell_type
253 |             normalized_counts.row["chromosome"] = chromosome
254 |             normalized_counts.row["file_key"] = 0
255 |             normalized_counts.row["count"] = chromosome_to_summed_counts[chromosome][i] / len_file_keys
256 |             normalized_counts.row["percentile"] = -1
257 |             normalized_counts.row.append()
258 | 
259 |     normalized_counts.flush()
260 | 
261 | def create_summary_table(h5file):
262 |     class Summary(tables.IsDescription):
263 |         bin_number = tables.UInt32Col( pos=0)
264 |         chromosome = tables.StringCol(chromosome_name_length, pos=1)
265 |         avg_cell_type_percentile = tables.Float64Col( pos=2)
266 |         cell_types_gte_95th_percentile = tables.UInt32Col(pos=3)
267 |         cell_types_lt_95th_percentile = tables.UInt32Col( pos=4)
268 |         lines_gte_95th_percentile = tables.UInt32Col( pos=5)
269 |         lines_lt_95th_percentile = tables.UInt32Col( pos=6)
270 |         cell_types_gte_5th_percentile = tables.UInt32Col( pos=7)
271 |         cell_types_lt_5th_percentile = tables.UInt32Col( pos=8)
272 |         lines_gte_5th_percentile = tables.UInt32Col( pos=9)
273 |         lines_lt_5th_percentile = tables.UInt32Col( pos=10)
274 | 
275 |     table = h5file.create_table("/", "summary", Summary, "bin count summary")
276 | 
277 |     table.flush()
278 | 
279 |     return table
280 | 
281 | 
282 | def populate_summary(summary, normalized_counts, chromosome):
283 |     high = 95 # 95th percentile
284 |     low = 5 # 5th percentile
285 | 
286 |     summed_cell_type_percentiles_by_bin = collections.defaultdict(float)
287 |     cell_types_gte_high_percentile_by_bin = collections.defaultdict(int)
288 |     cell_types_lt_high_percentile_by_bin = collections.defaultdict(int)
289 |     lines_gte_high_percentile_by_bin = collections.defaultdict(int)
290 |     lines_lt_high_percentile_by_bin = collections.defaultdict(int)
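    # Note (inferred from populate_normalized_counts_for_cell_type above):
    # rows with file_key == 0 hold the per-cell-type averaged counts, so the
    # cell_types_* tallies below are driven by file_key 0 rows, while the
    # lines_* tallies are driven by the real per-bam rows (file_key >= 1).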
291 |     cell_types_gte_low_percentile_by_bin = collections.defaultdict(int)
292 |     cell_types_lt_low_percentile_by_bin = collections.defaultdict(int)
293 |     lines_gte_low_percentile_by_bin = collections.defaultdict(int)
294 |     lines_lt_low_percentile_by_bin = collections.defaultdict(int)
295 |     lines = set()
296 |     cell_types = set()
297 |     max_bin = 0
298 | 
299 |     # note populating the dictionaries this way is much faster than looping through
300 |     # each bin and finding the matching fraction rows
301 |     for row in normalized_counts.where("chromosome == '%s'" % chromosome):
302 |         bin_number = row["bin_number"]
303 |         max_bin = max(max_bin, bin_number)
304 |         percentile = row["percentile"]
305 | 
306 |         if row["file_key"] == 0:
307 |             cell_types.add(row["cell_type"])
308 |             summed_cell_type_percentiles_by_bin[bin_number] += percentile
309 |             if percentile >= high:
310 |                 cell_types_gte_high_percentile_by_bin[bin_number] += 1
311 |             else:
312 |                 cell_types_lt_high_percentile_by_bin[bin_number] += 1
313 | 
314 |             if percentile >= low:
315 |                 cell_types_gte_low_percentile_by_bin[bin_number] += 1
316 |             else:
317 |                 cell_types_lt_low_percentile_by_bin[bin_number] += 1
318 |         else:
319 |             lines.add(row["file_key"])
320 |             if percentile >= high:
321 |                 lines_gte_high_percentile_by_bin[bin_number] += 1
322 |             else:
323 |                 lines_lt_high_percentile_by_bin[bin_number] += 1
324 | 
325 |             if percentile >= low:
326 |                 lines_gte_low_percentile_by_bin[bin_number] += 1
327 |             else:
328 |                 lines_lt_low_percentile_by_bin[bin_number] += 1
329 | 
330 |     logging.debug(" - populating summary table with calculated summaries")
331 | 
332 |     for bin_number in xrange(max_bin+1):
333 |         summary.row["bin_number"] = bin_number
334 |         summary.row["chromosome"] = chromosome
335 |         summary.row["avg_cell_type_percentile"] = summed_cell_type_percentiles_by_bin[bin_number] / len(cell_types)
336 |         summary.row["cell_types_gte_95th_percentile"] = cell_types_gte_high_percentile_by_bin[bin_number]
337 |         summary.row["cell_types_lt_95th_percentile"] = cell_types_lt_high_percentile_by_bin[bin_number]
338 |         summary.row["lines_gte_95th_percentile"] = lines_gte_high_percentile_by_bin[bin_number]
339 |         summary.row["lines_lt_95th_percentile"] = lines_lt_high_percentile_by_bin[bin_number]
340 |         summary.row["cell_types_gte_5th_percentile"] = cell_types_gte_low_percentile_by_bin[bin_number]
341 |         summary.row["cell_types_lt_5th_percentile"] = cell_types_lt_low_percentile_by_bin[bin_number]
342 |         summary.row["lines_gte_5th_percentile"] = lines_gte_low_percentile_by_bin[bin_number]
343 |         summary.row["lines_lt_5th_percentile"] = lines_lt_low_percentile_by_bin[bin_number]
344 |         summary.row.append()
345 |     summary.flush()
346 | 
347 | def normalize_plot_and_summarize(counts_file, output_directory, bin_size, skip_plot):
348 |     delete_all_but_bin_counts_and_files_table(counts_file)
349 | 
350 |     # recreating the entirety of the remaining tables is quick and easier than updating prior records correctly
351 | 
352 |     counts = counts_file.root.bin_counts
353 |     files = counts_file.root.files
354 |     normalized_counts = create_normalized_counts_table(counts_file)
355 |     summary = create_summary_table(counts_file)
356 | 
357 |     cell_types = all_cell_types(counts)
358 |     chromosomes = all_chromosomes(counts)
359 | 
360 |     logging.info("Cell Types: %s", ", ".join(cell_types))
361 | 
362 |     for cell_type in cell_types:
363 |         logging.info("Normalizing and calculating percentiles for cell type %s", cell_type)
364 |         current_file_keys = file_keys(counts, cell_type)
365 |         for file_key in current_file_keys:
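            # Per-file pass: populate_normalized_counts scales each bin to
            # reads-per-million-per-basepair (count / bin_size / (total reads / 1e6);
            # e.g. 2500 counts in a 1kb bin from a 30 million read bam gives
            # 2500 / 1000 / 30 = ~0.083 rpm/bp), then populate_percentiles ranks
            # the file's bins. The cell type average (file_key 0) is recomputed
            # after all of the cell type's files are processed.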
366 |             populate_normalized_counts(normalized_counts, counts, file_key, bin_size, files)
367 |             populate_percentiles(normalized_counts, cell_type, file_key)
368 |         populate_normalized_counts_for_cell_type(normalized_counts, cell_type, current_file_keys)
369 |         populate_percentiles(normalized_counts, cell_type)
370 | 
371 |     logging.info("Indexing normalized counts")
372 |     normalized_counts.cols.bin_number.create_csindex()
373 |     normalized_counts.cols.percentile.create_csindex()
374 |     normalized_counts.cols.file_key.create_csindex()
375 |     normalized_counts.cols.chromosome.create_csindex()
376 | 
377 |     if not skip_plot:
378 |         if bp is None:
379 |             logging.error('Skipping plotting because plots require bokeh and it is not installed -- '
380 |                           'see https://github.com/BradnerLab/pipeline/wiki/bamliquidator#Install . '
381 |                           'Consider running the following command to install bokeh: '
382 |                           'sudo pip install bokeh==0.4.4 "openpyxl>=1.6.1,<2.0.0"')
383 |         else:
384 |             logging.info("Plotting")
385 |             for chromosome in chromosomes:
386 |                 plot(output_directory, normalized_counts, chromosome, cell_types)
387 |             plot_summaries(output_directory, normalized_counts, chromosomes)
388 | 
389 |     logging.info("Summarizing")
390 |     for chromosome in chromosomes:
391 |         populate_summary(summary, normalized_counts, chromosome)
392 |     summary.cols.avg_cell_type_percentile.create_csindex()
393 | 
394 |     # Iterating over this index in reverse order is hundreds of times slower than iterating
395 |     # in ascending order in my tests, but copying into a reverse sorted table is very fast.
396 |     # So we create a sorted summary table sorted in decreasing percentile order. If we need to
397 |     # iterate in the reverse sorted order, then this sorted_summary table should be used.
398 |     # Otherwise, we should use the summary table (including the case of ascending percentile
399 |     # order, which is fast since the table is indexed by that column). See
400 |     # https://groups.google.com/d/topic/pytables-users/EKMUxghQiPQ/discussion
401 |     sorted_summary = summary.copy(newname="sorted_summary", sortby=summary.cols.avg_cell_type_percentile,
402 |                                   step=-1, checkCSI=True,
403 |                                   title="Summary table sorted in decreasing percentile order")
404 |     sorted_summary.cols.bin_number.create_csindex()
405 | 
406 | def debugging_handler(signal, frame):
407 |     import pdb
408 |     pdb.set_trace()
409 | 
410 | def main():
411 |     parser = argparse.ArgumentParser(description='Calculate and plot normalized bin counts and percentiles. '
412 |         'Normalized counts, percentiles, and summaries are stored in hdf5 tables in the file "normalized_counts.h5". '
413 |         'Plots are stored in .html files.
The hdf5 and html files are stored by default in a new directory "output" '
414 |         '(which can be overridden by argument, see below), and the program aborts if this directory already exists.')
415 |     parser.add_argument('-o', '--output_directory', default='output',
416 |         help='directory to create and output the h5 and/or html files to (aborts if already exists)')
417 |     parser.add_argument('-b', '--bin_size', type=int, default=100000,
418 |         help="Number of base pairs in each bin -- should match the bin size in the bin_counts_h5_file")
419 |     parser.add_argument('-v', '--validate', action='store_true',
420 |         help='validates the previously generated normalization and/or summary tables, returning '
421 |              'non-zero if any problems detected')
422 |     parser.add_argument('-d', '--debug', action='store_true',
423 |         help='enables debugging hooks so ctrl-c (SIGINT) enters debugging instead of halting execution')
424 |     parser.add_argument('--skip_plot', action='store_true', help='skip generating plots (this can speed up execution)')
425 |     parser.add_argument('bin_counts_h5_file', help='the hdf5 file with "bin_counts" and "files" tables as generated by '
426 |         'bamliquidator_batch')
427 |     args = parser.parse_args()
428 | 
429 |     if args.debug:
430 |         import signal
431 |         signal.signal(signal.SIGINT, debugging_handler)
432 | 
433 |     if args.validate:
434 |         sys.exit(validate(args.bin_counts_h5_file))
435 | 
436 |     os.mkdir(args.output_directory)
437 | 
438 |     counts_file = tables.open_file(args.bin_counts_h5_file, "r+")
439 | 
440 |     normalize_plot_and_summarize(counts_file, args.output_directory, args.bin_size, args.skip_plot)
441 | 
442 |     counts_file.close()
443 | 
444 | def validate(counts_file_path):
445 |     counts_file = tables.open_file(counts_file_path, "r")
446 | 
447 |     error_count = 0
448 | 
449 |     counts = counts_file.root.bin_counts
450 |     cell_types = all_cell_types(counts)
451 |     num_cell_types = len(cell_types)
452 |     num_files = 0
453 |     for cell_type in cell_types:
454 |         num_files += len(file_keys(counts, cell_type))
455 | 
456 |     logging.info("Verifying that summary files add up to %d and cell types add up to %d", num_files, num_cell_types)
457 | 
458 |     for row in counts_file.root.summary:
459 |         if (num_cell_types != (row["cell_types_gte_95th_percentile"] + row["cell_types_lt_95th_percentile"])
460 |             or num_cell_types != (row["cell_types_gte_5th_percentile"] + row["cell_types_lt_5th_percentile"])
461 |             or num_files != (row["lines_gte_95th_percentile"] + row["lines_lt_95th_percentile"])
462 |             or num_files != (row["lines_gte_5th_percentile"] + row["lines_lt_5th_percentile"])):
463 |             error_count += 1
464 |             logging.error("Summary row doesn't add up: %s", row[:])
465 | 
466 |     counts_file.close()
467 | 
468 |     if error_count != 0:
469 |         logging.error("%d validation errors", error_count)
470 |         return 1
471 |     return 0
472 | 
473 | if __name__ == "__main__":
474 |     main()
475 | 
476 | 
--------------------------------------------------------------------------------
/blacklists/ce10-blacklist.bed:
--------------------------------------------------------------------------------
1 | chrI 933000 934500
2 | chrI 2542900 2544000
3 | chrI 3171400 3172600
4 | chrI 3664800 3666100
5 | chrI 3989700 3991000
6 | chrI 4544300 4547500
7 | chrI 5152600 5154000
8 | chrI 10130600 10133000
9 | chrI 10208000 10209100
10 | chrI 10216300 10219200
11 | chrI 10266300 10274300
12 | chrI 10946000 10953100
13 | chrI 14453000 14454600
14 | chrI 15059800 15072400
15 | chrII 0 1000
16 | chrII 500900 502100
17 | chrII 694800 696500
18 | chrII 1452500 1453600
19 |
chrII 2569900 2571400 20 | chrII 2897400 2898700 21 | chrII 3466000 3468700 22 | chrII 3796200 3797500 23 | chrII 3942000 3946700 24 | chrII 3962400 3963400 25 | chrII 3993900 3994900 26 | chrII 4284900 4285900 27 | chrII 4640900 4645000 28 | chrII 5144700 5146700 29 | chrII 6506100 6509100 30 | chrII 7444200 7448800 31 | chrII 8287400 8292900 32 | chrII 8975400 8976900 33 | chrII 9631700 9633200 34 | chrII 9809600 9824700 35 | chrII 10335700 10339300 36 | chrII 12843500 12846100 37 | chrII 13598500 13600000 38 | chrII 13939900 13941400 39 | chrII 13984900 13987000 40 | chrII 14324100 14326100 41 | chrII 14336800 14339700 42 | chrII 14992300 14994200 43 | chrII 15277000 15279300 44 | chrIII 414400 415600 45 | chrIII 930600 932400 46 | chrIII 1017900 1020100 47 | chrIII 1269500 1270500 48 | chrIII 1299400 1302900 49 | chrIII 2497000 2501100 50 | chrIII 5353900 5358500 51 | chrIII 7415800 7417800 52 | chrIII 7443900 7449200 53 | chrIII 7594600 7597200 54 | chrIII 8862600 8864100 55 | chrIII 10224200 10226100 56 | chrIII 13778200 13783700 57 | chrIV 906200 907700 58 | chrIV 2828300 2830900 59 | chrIV 3206300 3209500 60 | chrIV 4416200 4421900 61 | chrIV 6357700 6361000 62 | chrIV 6468700 6469800 63 | chrIV 6698000 6699700 64 | chrIV 6714300 6724400 65 | chrIV 7593500 7598300 66 | chrIV 8572900 8581900 67 | chrIV 9045800 9049000 68 | chrIV 10943000 10951200 69 | chrIV 11070500 11076000 70 | chrIV 11610800 11612700 71 | chrIV 11697000 11698000 72 | chrIV 12024000 12025400 73 | chrIV 12169300 12170600 74 | chrIV 12314400 12319500 75 | chrIV 12730500 12731800 76 | chrIV 13360400 13362200 77 | chrIV 13548500 13549900 78 | chrIV 16963300 16964800 79 | chrIV 17059700 17062200 80 | chrV 264300 267300 81 | chrV 1638000 1639300 82 | chrV 3098300 3099700 83 | chrV 3434600 3438800 84 | chrV 4333300 4336600 85 | chrV 5073300 5076300 86 | chrV 5283100 5286100 87 | chrV 6172100 6178000 88 | chrV 6939100 6943200 89 | chrV 7442600 7444800 90 | chrV 7919700 7925000 91 | chrV 7988600 7991500 92 | chrV 8699200 8701900 93 | chrV 9432700 9435500 94 | chrV 10606100 10612000 95 | chrV 12509600 12510900 96 | chrV 14756400 14757500 97 | chrV 14766600 14770500 98 | chrV 16707200 16709400 99 | chrV 17119700 17132600 100 | chrV 17308600 17311700 101 | chrV 17384100 17385800 102 | chrV 17391200 17394500 103 | chrV 18400100 18401700 104 | chrX 109500 114200 105 | chrX 291200 295300 106 | chrX 1752200 1755100 107 | chrX 3007000 3008300 108 | chrX 4026000 4051800 109 | chrX 5056200 5057300 110 | chrX 5914600 5915800 111 | chrX 7076900 7079100 112 | chrX 9186000 9189200 113 | chrX 9438100 9439500 114 | chrX 10361500 10367000 115 | chrX 11785700 11789800 116 | chrX 11886300 11889000 117 | chrX 12277100 12278900 118 | chrX 14388000 14389200 119 | chrX 14907900 14909700 120 | chrX 15226900 15228800 121 | chrX 15807400 15811200 122 | chrX 16758300 16760000 123 | -------------------------------------------------------------------------------- /blacklists/dm3-blacklist.bed: -------------------------------------------------------------------------------- 1 | chr2L 47600 49300 2 | chr2L 982500 984400 3 | chr2L 2885500 2887000 4 | chr2L 4920500 4922400 5 | chr2L 4937900 4941100 6 | chr2L 5171400 5177700 7 | chr2L 6426500 6427500 8 | chr2L 6992200 6996700 9 | chr2L 7345200 7350300 10 | chr2L 8102400 8103400 11 | chr2L 8729600 8731000 12 | chr2L 9899400 9902800 13 | chr2L 9976200 9979800 14 | chr2L 10422300 10423400 15 | chr2L 11992600 11999400 16 | chr2L 12558600 12563800 17 | chr2L 12792200 12794100 18 | chr2L 13522300 13523300 19 | 
chr2L 13650700 13651700 20 | chr2L 15451900 15452900 21 | chr2L 16514400 16518200 22 | chr2L 19576100 19577300 23 | chr2L 19709600 19711500 24 | chr2L 20197000 20201100 25 | chr2L 20458300 20459300 26 | chr2L 20746500 20747500 27 | chr2L 21022300 21023500 28 | chr2L 21416300 21440600 29 | chr2L 21447300 21454900 30 | chr2L 21482700 21485200 31 | chr2L 21499300 21500400 32 | chr2L 21537800 21543500 33 | chr2L 22202600 22203600 34 | chr2L 22377700 22389700 35 | chr2L 22498500 22500400 36 | chr2L 22543700 22546600 37 | chr2L 22574300 22575300 38 | chr2L 22602400 22603500 39 | chr2L 22661200 22663200 40 | chr2L 22752200 22753200 41 | chr2L 22785500 22787400 42 | chr2L 22809200 22810300 43 | chr2L 22855300 22856400 44 | chr2L 22992900 22994200 45 | chr2LHet 14300 15300 46 | chr2LHet 123500 126300 47 | chr2LHet 133300 134300 48 | chr2LHet 187400 188400 49 | chr2LHet 237700 239100 50 | chr2LHet 244500 245500 51 | chr2LHet 252100 253100 52 | chr2LHet 261200 262200 53 | chr2LHet 347600 348600 54 | chr2LHet 358400 359400 55 | chr2LHet 367600 368800 56 | chr2R 100700 101800 57 | chr2R 141300 144400 58 | chr2R 205900 207500 59 | chr2R 241900 242900 60 | chr2R 260200 261200 61 | chr2R 325500 331800 62 | chr2R 376800 387800 63 | chr2R 567900 572400 64 | chr2R 620800 621800 65 | chr2R 654000 657200 66 | chr2R 698200 700000 67 | chr2R 722000 723500 68 | chr2R 875700 876800 69 | chr2R 893300 894300 70 | chr2R 936500 943900 71 | chr2R 992700 997500 72 | chr2R 1108900 1110300 73 | chr2R 1118300 1119300 74 | chr2R 1174600 1175600 75 | chr2R 1280600 1282300 76 | chr2R 1294200 1295500 77 | chr2R 1458600 1459700 78 | chr2R 1540100 1541400 79 | chr2R 2196300 2202100 80 | chr2R 2231100 2236200 81 | chr2R 2276700 2279200 82 | chr2R 2287700 2289800 83 | chr2R 2328300 2329400 84 | chr2R 2341200 2342600 85 | chr2R 3087700 3088700 86 | chr2R 3123500 3134800 87 | chr2R 3714200 3715200 88 | chr2R 4668700 4670700 89 | chr2R 5615500 5617500 90 | chr2R 6072200 6073500 91 | chr2R 6547100 6549000 92 | chr2R 6838200 6840100 93 | chr2R 6909300 6911100 94 | chr2R 7185100 7189400 95 | chr2R 8369000 8370000 96 | chr2R 8707100 8709600 97 | chr2R 9295900 9299100 98 | chr2R 9615700 9623300 99 | chr2R 9989900 9994400 100 | chr2R 10061200 10062400 101 | chr2R 10076600 10083000 102 | chr2R 10246300 10249200 103 | chr2R 10354900 10356800 104 | chr2R 10779500 10780700 105 | chr2R 13035500 13039700 106 | chr2R 13125400 13127200 107 | chr2R 14258700 14260100 108 | chr2R 14464100 14467300 109 | chr2R 14481500 14483500 110 | chr2R 15617000 15618000 111 | chr2R 15627400 15631300 112 | chr2R 15647200 15648300 113 | chr2R 16667500 16672900 114 | chr2R 17701800 17704000 115 | chr2R 18414400 18415800 116 | chr2R 19253300 19255000 117 | chr2R 19294200 19295300 118 | chr2R 20070900 20072200 119 | chr2RHet 0 1000 120 | chr2RHet 579900 580900 121 | chr2RHet 685800 688400 122 | chr2RHet 717100 718100 123 | chr2RHet 908100 912900 124 | chr2RHet 1013300 1015200 125 | chr2RHet 1260000 1261700 126 | chr2RHet 1319600 1322400 127 | chr2RHet 1354100 1355100 128 | chr2RHet 1422600 1424400 129 | chr2RHet 1430900 1435700 130 | chr2RHet 1636500 1637800 131 | chr2RHet 2049300 2050300 132 | chr2RHet 2089000 2090100 133 | chr2RHet 2230300 2231300 134 | chr2RHet 2580100 2581500 135 | chr2RHet 2610100 2611100 136 | chr2RHet 2823800 2824900 137 | chr2RHet 2985000 2986100 138 | chr2RHet 3181000 3183700 139 | chr3L 1245300 1247200 140 | chr3L 1425400 1427300 141 | chr3L 2063900 2069700 142 | chr3L 3899200 3901900 143 | chr3L 4361900 4362900 144 | chr3L 4849900 4850900 
145 | chr3L 5047600 5048600 146 | chr3L 5104600 5105700 147 | chr3L 5456000 5457700 148 | chr3L 5995100 5997500 149 | chr3L 7242000 7243400 150 | chr3L 7372600 7373600 151 | chr3L 7676400 7684700 152 | chr3L 7788500 7789500 153 | chr3L 7913800 7914800 154 | chr3L 8014900 8017000 155 | chr3L 9076600 9077700 156 | chr3L 9385600 9386600 157 | chr3L 9569700 9574100 158 | chr3L 9923100 9930700 159 | chr3L 11322900 11324600 160 | chr3L 11500300 11501300 161 | chr3L 11606200 11612400 162 | chr3L 11961600 11965500 163 | chr3L 13572200 13573200 164 | chr3L 14719900 14721800 165 | chr3L 14818500 14819700 166 | chr3L 15290000 15291500 167 | chr3L 15416900 15419800 168 | chr3L 15548700 15551100 169 | chr3L 15818700 15819700 170 | chr3L 16044500 16046400 171 | chr3L 16592100 16600800 172 | chr3L 16678900 16681600 173 | chr3L 17911500 17914200 174 | chr3L 18522300 18523300 175 | chr3L 20470800 20476600 176 | chr3L 20815200 20817800 177 | chr3L 21367700 21369600 178 | chr3L 21478100 21479400 179 | chr3L 21746300 21747500 180 | chr3L 22092900 22095600 181 | chr3L 22811000 22812700 182 | chr3L 23036000 23037200 183 | chr3L 23134000 23135600 184 | chr3L 23416500 23417700 185 | chr3L 23433200 23434200 186 | chr3L 23490600 23491600 187 | chr3L 23662400 23668400 188 | chr3L 23784200 23785300 189 | chr3L 23819100 23821000 190 | chr3L 23961100 23964900 191 | chr3L 24084700 24095200 192 | chr3L 24162700 24165000 193 | chr3L 24187000 24188800 194 | chr3L 24214000 24215000 195 | chr3L 24364000 24365000 196 | chr3L 24434000 24435200 197 | chr3L 24461000 24463300 198 | chr3L 24496000 24498000 199 | chr3L 24537400 24539300 200 | chr3LHet 72700 74500 201 | chr3LHet 87000 89200 202 | chr3LHet 153100 154400 203 | chr3LHet 278100 279100 204 | chr3LHet 537200 538200 205 | chr3LHet 708600 710200 206 | chr3LHet 773600 776000 207 | chr3LHet 1279900 1285700 208 | chr3LHet 1346200 1347400 209 | chr3LHet 1483500 1485000 210 | chr3LHet 1517000 1518000 211 | chr3LHet 1894100 1895100 212 | chr3LHet 1986900 1988900 213 | chr3LHet 2163700 2164700 214 | chr3LHet 2186300 2187300 215 | chr3LHet 2202200 2207900 216 | chr3LHet 2245500 2252900 217 | chr3R 57400 59400 218 | chr3R 96400 102900 219 | chr3R 198900 199900 220 | chr3R 579100 586900 221 | chr3R 719600 720600 222 | chr3R 829600 832900 223 | chr3R 873600 878700 224 | chr3R 1085200 1086600 225 | chr3R 1165600 1169100 226 | chr3R 1648000 1649200 227 | chr3R 2133300 2134500 228 | chr3R 2335800 2336800 229 | chr3R 2645900 2648700 230 | chr3R 2912900 2914300 231 | chr3R 3177000 3178900 232 | chr3R 3526300 3528800 233 | chr3R 3869500 3870600 234 | chr3R 3920900 3922000 235 | chr3R 4228800 4230700 236 | chr3R 4396900 4399200 237 | chr3R 4726700 4739500 238 | chr3R 5516800 5525600 239 | chr3R 6083200 6085300 240 | chr3R 6210800 6213800 241 | chr3R 6786300 6787700 242 | chr3R 6892800 6893800 243 | chr3R 7197700 7199000 244 | chr3R 7494800 7496100 245 | chr3R 7666900 7668000 246 | chr3R 7785600 7786600 247 | chr3R 8317600 8320200 248 | chr3R 8325500 8336000 249 | chr3R 8537600 8539000 250 | chr3R 9140500 9144600 251 | chr3R 9804300 9806700 252 | chr3R 10072200 10073600 253 | chr3R 10931600 10932600 254 | chr3R 10959500 10964500 255 | chr3R 10991400 10992400 256 | chr3R 11433500 11434500 257 | chr3R 12814000 12820400 258 | chr3R 13859500 13862900 259 | chr3R 14855600 14856900 260 | chr3R 15267300 15268600 261 | chr3R 15601300 15602900 262 | chr3R 15936800 15937800 263 | chr3R 16073600 16074600 264 | chr3R 16379100 16385500 265 | chr3R 17126000 17128300 266 | chr3R 17436400 17437500 267 | 
chr3R 18124200 18125200 268 | chr3R 18276100 18277600 269 | chr3R 18314300 18315300 270 | chr3R 18709600 18710600 271 | chr3R 19359000 19360100 272 | chr3R 19664700 19669400 273 | chr3R 19683000 19684000 274 | chr3R 19903100 19904200 275 | chr3R 20032900 20037400 276 | chr3R 21152900 21154400 277 | chr3R 21224000 21226100 278 | chr3R 21435400 21436400 279 | chr3R 21942100 21943100 280 | chr3R 22922400 22923600 281 | chr3R 22969100 22971000 282 | chr3R 24079300 24081200 283 | chr3R 24584500 24585500 284 | chr3R 25479000 25480100 285 | chr3R 25604600 25605700 286 | chr3R 26064600 26065600 287 | chr3R 26227000 26228900 288 | chr3R 26901000 26904300 289 | chr3R 27157400 27159300 290 | chr3R 27241400 27243600 291 | chr3R 27718700 27720500 292 | chr3R 27806500 27809200 293 | chr3R 27895800 27899700 294 | chr3RHet 54600 55600 295 | chr3RHet 790700 791700 296 | chr3RHet 1034300 1035300 297 | chr3RHet 1275200 1277400 298 | chr3RHet 1346400 1348100 299 | chr3RHet 1358800 1360000 300 | chr3RHet 1371000 1372000 301 | chr3RHet 1416800 1417900 302 | chr3RHet 1492600 1496100 303 | chr3RHet 1518600 1524600 304 | chr3RHet 1599200 1600400 305 | chr3RHet 1721500 1723400 306 | chr3RHet 1778200 1779200 307 | chr3RHet 1828500 1833900 308 | chr3RHet 1867900 1875100 309 | chr3RHet 1950500 1958900 310 | chr3RHet 1983600 1989500 311 | chr3RHet 2109200 2110300 312 | chr3RHet 2268000 2269000 313 | chr3RHet 2450000 2451500 314 | chr4 58500 59500 315 | chr4 228400 229400 316 | chr4 434600 435600 317 | chr4 565900 566900 318 | chr4 612300 615900 319 | chr4 810100 811200 320 | chr4 860800 862000 321 | chr4 928600 929800 322 | chr4 1197100 1198100 323 | chr4 1283400 1300500 324 | chr4 1314800 1328300 325 | chr4 1339700 1341000 326 | chrU 108000 111700 327 | chrU 924100 925700 328 | chrU 984500 986000 329 | chrU 1041000 1042000 330 | chrU 1094700 1095900 331 | chrU 1499100 1503700 332 | chrU 2254700 2255700 333 | chrU 3229400 3230500 334 | chrU 3309700 3314000 335 | chrU 3430700 3432100 336 | chrU 4472900 4476400 337 | chrU 5290000 5303800 338 | chrU 5527800 5529000 339 | chrU 5583400 5584500 340 | chrU 6085800 6087100 341 | chrU 6334300 6335600 342 | chrU 6565500 6567300 343 | chrU 6938200 6939300 344 | chrU 7168400 7169600 345 | chrU 7222200 7223300 346 | chrU 7284400 7285500 347 | chrU 7735700 7736900 348 | chrU 7870600 7873000 349 | chrU 7934000 7935700 350 | chrU 8286900 8288100 351 | chrU 8558700 8559800 352 | chrU 8608600 8610400 353 | chrU 8746800 8748600 354 | chrU 9171000 9172000 355 | chrU 9285700 9286800 356 | chrU 9691100 9692500 357 | chrU 9823800 9825200 358 | chrU 10033000 10034200 359 | chrUextra 4169000 4171000 360 | chrUextra 4484500 4485700 361 | chrUextra 4686700 4687700 362 | chrUextra 5348900 5350700 363 | chrUextra 6549400 6550900 364 | chrUextra 6671700 6672800 365 | chrUextra 8371300 8373000 366 | chrUextra 8510200 8511200 367 | chrUextra 10902600 10904200 368 | chrUextra 13532500 13534000 369 | chrUextra 14386900 14388400 370 | chrUextra 14419500 14421000 371 | chrUextra 17296900 17298500 372 | chrUextra 18430600 18431700 373 | chrUextra 18567800 18569300 374 | chrUextra 19220900 19222200 375 | chrUextra 20209800 20211200 376 | chrUextra 20696200 20698000 377 | chrUextra 21539600 21541500 378 | chrUextra 21942200 21943600 379 | chrUextra 22178200 22179800 380 | chrUextra 22317900 22319400 381 | chrUextra 23209900 23211500 382 | chrUextra 24697200 24698200 383 | chrUextra 24985000 24986100 384 | chrUextra 25004300 25005900 385 | chrUextra 25110400 25111900 386 | chrUextra 25257200 25258900 387 | 
chrUextra 25407300 25408900 388 | chrUextra 25861700 25863300 389 | chrUextra 26159300 26160800 390 | chrUextra 26370300 26371600 391 | chrUextra 26851300 26852900 392 | chrUextra 27076100 27077800 393 | chrUextra 27370400 27371400 394 | chrUextra 27599800 27601000 395 | chrUextra 27638000 27639500 396 | chrUextra 27711600 27713100 397 | chrUextra 27825100 27826700 398 | chrUextra 27871800 27873400 399 | chrUextra 27890400 27891500 400 | chrUextra 27931900 27933400 401 | chrUextra 27954600 27955600 402 | chrUextra 27979700 27980900 403 | chrUextra 28089700 28094700 404 | chrUextra 28106300 28107900 405 | chrUextra 28213100 28214700 406 | chrUextra 28324000 28325100 407 | chrUextra 28366600 28367800 408 | chrUextra 28421900 28423200 409 | chrUextra 28447200 28450100 410 | chrUextra 28456000 28457500 411 | chrUextra 28539300 28542400 412 | chrUextra 28555900 28573200 413 | chrUextra 28581500 28593500 414 | chrUextra 28604700 28606200 415 | chrUextra 28612700 28621800 416 | chrUextra 28635700 28647800 417 | chrUextra 28653100 28654300 418 | chrUextra 28668200 28669500 419 | chrUextra 28702600 28704200 420 | chrUextra 28719800 28727800 421 | chrUextra 28740900 28744300 422 | chrUextra 28751900 28762000 423 | chrUextra 28769300 28771600 424 | chrUextra 28797200 28798500 425 | chrUextra 28810900 28812400 426 | chrUextra 28853400 28855100 427 | chrUextra 28896600 28902100 428 | chrUextra 28908900 28910100 429 | chrUextra 28940100 28944000 430 | chrUextra 28958300 28961600 431 | chrUextra 28977200 28989700 432 | chrX 6300 16700 433 | chrX 103200 105100 434 | chrX 323200 328600 435 | chrX 448700 450900 436 | chrX 458600 459600 437 | chrX 708200 709200 438 | chrX 900500 901700 439 | chrX 1259600 1261500 440 | chrX 1412200 1416800 441 | chrX 1827800 1829700 442 | chrX 1853600 1854800 443 | chrX 2299000 2300700 444 | chrX 2505400 2511600 445 | chrX 3309200 3315200 446 | chrX 3684400 3687600 447 | chrX 3692800 3695800 448 | chrX 3839200 3842100 449 | chrX 4627700 4630000 450 | chrX 4820300 4827000 451 | chrX 4885400 4887400 452 | chrX 6278000 6279000 453 | chrX 6918200 6920700 454 | chrX 7019400 7021300 455 | chrX 7374900 7376200 456 | chrX 8187200 8190700 457 | chrX 10289600 10290600 458 | chrX 10993900 10997100 459 | chrX 11490100 11492100 460 | chrX 11784200 11785200 461 | chrX 12826500 12831800 462 | chrX 13943100 13944300 463 | chrX 13950900 13954400 464 | chrX 14172800 14174600 465 | chrX 14445700 14450600 466 | chrX 15689300 15690500 467 | chrX 15947800 15953500 468 | chrX 17009200 17013300 469 | chrX 19472200 19473700 470 | chrX 19531100 19532300 471 | chrX 19837300 19839200 472 | chrX 20069000 20074400 473 | chrX 20085100 20086700 474 | chrX 20713000 20714400 475 | chrX 21478800 21480700 476 | chrX 21493800 21494900 477 | chrX 21594700 21596700 478 | chrX 21612700 21614600 479 | chrX 21771300 21773200 480 | chrX 21834000 21835900 481 | chrXHet 0 1100 482 | chrXHet 14900 15900 483 | chrXHet 34900 41200 484 | chrXHet 87500 88500 485 | chrXHet 163000 164000 486 | chrXHet 178200 179200 487 | chrXHet 196100 197300 488 | chrYHet 4500 5500 489 | chrYHet 36700 37700 490 | chrYHet 72100 75500 491 | chrYHet 136400 137600 492 | chrYHet 280400 284300 493 | -------------------------------------------------------------------------------- /blacklists/source-info/URLs.txt: -------------------------------------------------------------------------------- 1 | See info at: 2 | https://sites.google.com/site/anshulkundaje/projects/blacklists 3 | 4 | HUMAN (hg19/GRCh37): 
http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDacMapabilityConsensusExcludable.bed.gz
5 | Official track at UCSC http://genome.ucsc.edu/cgi-bin/hgFileUi?db=hg19&g=wgEncodeMapability
6 | README on how this track was generated: http://www.broadinstitute.org/~anshul/projects/encode/rawdata/blacklists/hg19-blacklist-README.pdf
7 | MOUSE (mm9): http://www.broadinstitute.org/~anshul/projects/mouse/blacklist/mm9-blacklist.bed.gz
8 | WORM (ce10): http://www.broadinstitute.org/~anshul/projects/worm/blacklist/ce10-blacklist.bed.gz
9 | FLY (dm3): http://www.broadinstitute.org/~anshul/projects/fly/blacklist/dm3-blacklist.bed.gz
10 | 
--------------------------------------------------------------------------------
/blacklists/source-info/hg19-blacklist-README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/blacklists/source-info/hg19-blacklist-README.pdf
--------------------------------------------------------------------------------
/blacklists/wgEncodeDacMapabilityConsensusExcludable.bed:
--------------------------------------------------------------------------------
1 | chr1 564449 570371 High_Mappability_island 1000 .
2 | chr1 724136 727043 Satellite_repeat 1000 .
3 | chr1 825006 825115 BSR/Beta 1000 .
4 | chr1 2583334 2634374 Low_mappability_island 1000 .
5 | chr1 4363064 4363242 (CATTC)n 1000 .
6 | chr1 5725866 5736651 Low_mappability_island 1000 .
7 | chr1 16839923 16841396 Low_mappability_island 1000 .
8 | chr1 38077347 38077423 Low_mappability_island 1000 .
9 | chr1 91852785 91853147 LSU-rRNA_Hsa 1000 .
10 | chr1 104163724 104163860 Low_mappability_island 1000 .
11 | chr1 108112972 108113707 LSU-rRNA_Hsa 1000 .
12 | chr1 121351474 121487059 centromeric_repeat 1000 .
13 | chr1 142535434 142543081 Satellite_repeat 1000 .
14 | chr1 142723256 142723968 Low_mappability_island 1000 .
15 | chr1 142792613 142793303 Low_mappability_island 1000 .
16 | chr1 142835822 142837333 Low_mappability_island 1000 .
17 | chr1 143274490 143284340 centromeric_repeat 1000 .
18 | chr1 145277108 145277572 LSU-rRNA_Hsa 1000 .
19 | chr1 149033183 149035829 Satellite_repeat 1000 .
20 | chr1 156186169 156186712 High_Mappability_island 1000 .
21 | chr1 224199390 224204260 Satellite_repeat 1000 .
22 | chr1 233318467 233318516 (CATTC)n 1000 .
23 | chr1 236260366 236260821 Low_mappability_island 1000 .
24 | chr1 237766308 237766764 LSU-rRNA_Hsa 1000 .
25 | chr1 238105345 238105511 Low_mappability_island 1000 .
26 | chr1 238108025 238108378 Low_mappability_island 1000 .
27 | chr1 238108645 238109697 Low_mappability_island 1000 .
28 | chr10 18841533 18862467 (CATTC)n 1000 .
29 | chr10 20035661 20037171 Low_mappability_island 1000 .
30 | chr10 36722282 36723650 Low_mappability_island 1000 .
31 | chr10 38772277 38819357 Satellite_repeat 1000 .
32 | chr10 38868892 38889025 Satellite_repeat 1000 .
33 | chr10 39076515 39155771 Satellite_repeat 1000 .
34 | chr10 42354835 42548642 centromeric_repeat 1000 .
35 | chr10 42596676 42602082 Satellite_repeat 1000 .
36 | chr10 42596700 42602110 Satellite_repeat 1000 .
37 | chr10 42661264 42667623 Satellite_repeat 1000 .
38 | chr10 42790522 42818398 Satellite_repeat 1000 .
39 | chr10 135498649 135502716 Satellite_repeat 1000 .
40 | chr11 6831669 6831838 ALR/Alpha 1000 .
41 | chr11 10529403 10531969 Low_mappability_island 1000 .
42 | chr11 48671444 48902406 centromeric_repeat 1000 .
43 | chr11 48931242 48964015 centromeric_repeat 1000 . 44 | chr11 50318471 50784078 centromeric_repeat 1000 . 45 | chr11 51090700 51374066 centromeric_repeat 1000 . 46 | chr11 51567242 51594226 centromeric_repeat 1000 . 47 | chr11 54694046 55027975 centromeric_repeat 1000 . 48 | chr11 73221660 73221946 Low_mappability_island 1000 . 49 | chr11 85194913 85195322 LSU-rRNA_Hsa 1000 . 50 | chr11 87524468 87525005 Low_mappability_island 1000 . 51 | chr11 103275584 103281729 Low_mappability_island 1000 . 52 | chr11 122874287 122874443 Low_mappability_island 1000 . 53 | chr12 20704285 20704583 SSU-rRNA_Hsa 1000 . 54 | chr12 34372315 34372825 LSU-rRNA_Hsa 1000 . 55 | chr12 34432130 34857010 centromeric_repeat 1000 . 56 | chr12 37989447 38441828 centromeric_repeat 1000 . 57 | chr12 38531376 38531930 LSU-rRNA_Hsa 1000 . 58 | chr12 41757383 41757545 Low_mappability_island 1000 . 59 | chr12 127650407 127651075 LSU-rRNA_Hsa 1000 . 60 | chr12 132061320 132062046 Low_mappability_island 1000 . 61 | chr13 56545728 56545925 Low_mappability_island 1000 . 62 | chr13 110076444 110076782 Low_mappability_island 1000 . 63 | chr14 18999935 19056900 centromeric_repeat 1000 . 64 | chr14 32953263 32954381 Low_mappability_island 1000 . 65 | chr14 84637832 84639038 Low_mappability_island 1000 . 66 | chr14 90341302 90341516 SSU-rRNA_Hsa 1000 . 67 | chr15 19999941 20044132 centromeric_repeat 1000 . 68 | chr16 32493036 32570826 ALR/Alpha 1000 . 69 | chr16 32590063 32598801 ALR/Alpha 1000 . 70 | chr16 33237130 33241330 Low_mappability_island 1000 . 71 | chr16 33864355 34023306 centromeric_repeat 1000 . 72 | chr16 34180542 34197081 Satellite_repeat 1000 . 73 | chr16 34530115 34542632 BSR/Beta 1000 . 74 | chr16 35193580 35285885 centromeric_repeat 1000 . 75 | chr16 46385718 46456668 Satellite_repeat 1000 . 76 | chr16 46497639 46500515 Satellite_repeat 1000 . 77 | chr16 47538629 47539297 LSU-rRNA_Hsa 1000 . 78 | chr17 19355538 19356096 LSU-rRNA_Hsa 1000 . 79 | chr17 19502495 19506773 Low_mappability_island 1000 . 80 | chr17 21905167 21906712 centromeric_repeat 1000 . 81 | chr17 22018524 22032049 Low_mappability_island 1000 . 82 | chr17 22221073 22263006 centromeric_repeat 1000 . 83 | chr17 25263010 25268059 Satellite_repeat 1000 . 84 | chr17 25415551 25417559 telomeric_repeat 1000 . 85 | chr17 31149365 31149981 High_Mappability_island 1000 . 86 | chr17 33478114 33478372 LSU-rRNA_Hsa 1000 . 87 | chr17 41381502 41382591 High_Mappability_island 1000 . 88 | chr17 41463538 41464075 High_Mappability_island 1000 . 89 | chr17 41464478 41465015 snRNA 1000 . 90 | chr17 41465562 41467288 High_Mappability_island 1000 . 91 | chr17 51183038 51183763 Low_mappability_island 1000 . 92 | chr17 55868618 55868752 LSU-rRNA_Hsa 1000 . 93 | chr17 75158031 75158430 LSU-rRNA_Hsa 1000 . 94 | chr18 96416 97552 Satellite_repeat 1000 . 95 | chr18 105658 112233 Satellite_repeat 1000 . 96 | chr18 2842252 2842356 Low_mappability_island 1000 . 97 | chr18 15393801 15393992 centromeric_repeat 1000 . 98 | chr18 18510894 18520356 centromeric_repeat 1000 . 99 | chr18 44126235 44126593 (CATTC)n 1000 . 100 | chr18 45379603 45379864 Low_mappability_island 1000 . 101 | chr18 50319086 50319301 Low_mappability_island 1000 . 102 | chr18 77772846 77773065 LSU-rRNA_Hsa 1000 . 103 | chr19 246006 247844 TAR1 1000 . 104 | chr19 22877614 22877696 SSU-rRNA_Hsa 1000 . 105 | chr19 23235030 23235504 BSR/Beta 1000 . 106 | chr19 24182398 24186210 LSU-rRNA_Hsa 1000 . 107 | chr19 24385474 24633168 centromeric_repeat 1000 . 108 | chr19 27730611 28262682 centromeric_repeat 1000 . 
109 | chr19 36066445 36066810 LSU-rRNA_Hsa 1000 . 110 | chr19 36756398 36800948 centromeric_repeat 1000 . 111 | chr19 37759473 37797722 centromeric_repeat 1000 . 112 | chr19 44914313 44916340 ACRO1 1000 . 113 | chr19 44960681 44962681 ACRO1 1000 . 114 | chr2 739925 740994 Low_mappability_island 1000 . 115 | chr2 49456729 49457067 Low_mappability_island 1000 . 116 | chr2 88124390 88124903 Low_mappability_island 1000 . 117 | chr2 89830421 89880514 Satellite_repeat 1000 . 118 | chr2 90371401 90394776 Satellite_repeat 1000 . 119 | chr2 90443001 90545431 Low_mappability_island 1000 . 120 | chr2 91595080 91616015 Satellite_repeat 1000 . 121 | chr2 92267428 92326280 centromeric_repeat 1000 . 122 | chr2 115695017 115695281 LSU-rRNA_Hsa 1000 . 123 | chr2 117781085 117781300 Low_mappability_island 1000 . 124 | chr2 132966248 132989300 centromeric_repeat 1000 . 125 | chr2 132994855 133007983 ALR/Alpha 1000 . 126 | chr2 133011824 133013298 SSU-rRNA_Hsa 1000 . 127 | chr2 133036250 133040042 LSU-rRNA_Hsa 1000 . 128 | chr2 133044095 133045945 ACRO1 1000 . 129 | chr2 143848503 143848792 Low_mappability_island 1000 . 130 | chr2 148022736 148022878 Low_mappability_island 1000 . 131 | chr2 149639207 149639515 Low_mappability_island 1000 . 132 | chr2 156120500 156120610 Low_mappability_island 1000 . 133 | chr2 162135000 162139241 Low_mappability_island 1000 . 134 | chr2 230045426 230045796 LSU-rRNA_Hsa 1000 . 135 | chr20 26257032 26320267 centromeric_repeat 1000 . 136 | chr20 29517710 29521147 centromeric_repeat 1000 . 137 | chr20 29803876 29833334 centromeric_repeat 1000 . 138 | chr20 55932703 55936114 chrM 1000 . 139 | chr20 62916702 62918053 telomeric_repeat 1000 . 140 | chr21 9647205 9648529 Low_mappability_island 1000 . 141 | chr21 9694896 9704962 centromeric_repeat 1000 . 142 | chr21 9825451 9827612 High_Mappability_island 1000 . 143 | chr21 9827612 9845233 Low_mappability_island 1000 . 144 | chr21 9881895 9882569 TAR1 1000 . 145 | chr21 10084922 10088004 Satellite_repeat 1000 . 146 | chr21 10492876 10493049 Low_mappability_island 1000 . 147 | chr21 10599428 10599915 TAR1 1000 . 148 | chr21 10697886 10860890 centromeric_repeat 1000 . 149 | chr21 11186054 11188131 Satellite_repeat 1000 . 150 | chr21 14338127 14369791 centromeric_repeat 1000 . 151 | chr21 18800575 18800997 (GAGTG)n 1000 . 152 | chr21 27228003 27228242 SSU-rRNA_Hsa 1000 . 153 | chr21 46796081 46796336 Low_mappability_island 1000 . 154 | chr22 16847814 16862659 Satellite_repeat 1000 . 155 | chr22 18876789 18884510 Satellite_repeat 1000 . 156 | chr3 25508897 25509131 Low_mappability_island 1000 . 157 | chr3 73159606 73161131 snRNA 1000 . 158 | chr3 75696297 75699304 BSR/Beta 1000 . 159 | chr3 75717841 75720426 Satellite_repeat 1000 . 160 | chr3 80995858 81014459 ALR/Alpha 1000 . 161 | chr3 90311686 90507410 centromeric_repeat 1000 . 162 | chr3 93504815 93519133 centromeric_repeat 1000 . 163 | chr3 96335934 96337436 Low_mappability_island 1000 . 164 | chr3 160665423 160665642 Low_mappability_island 1000 . 165 | chr3 196625514 196625860 Satellite_repeat 1000 . 166 | chr3 197825427 197834080 Low_mappability_island 1000 . 167 | chr4 9987 12694 telomeric_repeat 1000 . 168 | chr4 12276463 12292424 ALR/Alpha 1000 . 169 | chr4 12641862 12642305 Low_mappability_island 1000 . 170 | chr4 21583630 21583719 (GAATG)n 1000 . 171 | chr4 27732004 27732240 Low_mappability_island 1000 . 172 | chr4 47774268 47774416 Low_mappability_island 1000 . 173 | chr4 49085372 49342114 centromeric_repeat 1000 . 174 | chr4 49488472 49662085 centromeric_repeat 1000 . 
175 | chr4 52659961 52688986 centromeric_repeat 1000 . 176 | chr4 56194229 56194584 Low_mappability_island 1000 . 177 | chr4 65473858 65473941 Low_mappability_island 1000 . 178 | chr4 68264186 68266830 centromeric_repeat 1000 . 179 | chr4 70296565 70296841 LSU-rRNA_Hsa 1000 . 180 | chr4 76807083 76807320 LSU-rRNA_Hsa 1000 . 181 | chr4 78929660 78929920 Low_mappability_island 1000 . 182 | chr4 156374749 156377226 chrM 1000 . 183 | chr4 156384860 156387314 Low_mappability_island 1000 . 184 | chr4 163342479 163342744 Low_mappability_island 1000 . 185 | chr4 190190746 190203442 Low_mappability_island 1000 . 186 | chr4 190801869 190802909 Low_mappability_island 1000 . 187 | chr4 190943802 190943962 Satellite_repeat 1000 . 188 | chr4 190987268 190990949 Satellite_repeat 1000 . 189 | chr4 191026302 191044344 telomeric_repeat 1000 . 190 | chr5 17517177 17600940 Low_mappability_island 1000 . 191 | chr5 21477365 21497415 Low_mappability_island 1000 . 192 | chr5 34177882 34197574 Low_mappability_island 1000 . 193 | chr5 45908253 46411114 centromeric_repeat 1000 . 194 | chr5 49405493 49554574 centromeric_repeat 1000 . 195 | chr5 71146650 71146996 LSU-rRNA_Hsa 1000 . 196 | chr5 79945807 79948223 Low_mappability_island 1000 . 197 | chr5 93903068 93906726 Low_mappability_island 1000 . 198 | chr5 97746525 97746679 Low_mappability_island 1000 . 199 | chr5 99381556 99390873 Low_mappability_island 1000 . 200 | chr5 105889063 105889263 chrM 1000 . 201 | chr5 123095972 123097432 chrM 1000 . 202 | chr5 134258949 134264271 Low_mappability_island 1000 . 203 | chr5 174541634 174542177 SSU-rRNA_Hsa 1000 . 204 | chr6 58735349 58739031 centromeric_repeat 1000 . 205 | chr6 58745955 58780547 centromeric_repeat 1000 . 206 | chr6 61880095 61944008 centromeric_repeat 1000 . 207 | chr6 62189892 62206612 ALR/Alpha 1000 . 208 | chr6 62207809 62230644 ALR/Alpha 1000 . 209 | chr6 62283966 62284581 Low_mappability_island 1000 . 210 | chr6 133593944 133594201 LSU-rRNA_Hsa 1000 . 211 | chr6 137059142 137059326 SSU-rRNA_Hsa 1000 . 212 | chr6 150665074 150665281 SSU-rRNA_Hsa 1000 . 213 | chr6 157731310 157735525 Low_mappability_island 1000 . 214 | chr7 43878355 43878530 TAR1 1000 . 215 | chr7 45291517 45291740 Low_mappability_island 1000 . 216 | chr7 56437808 56442977 Low_mappability_island 1000 . 217 | chr7 57253980 57254183 Low_mappability_island 1000 . 218 | chr7 57255310 57255444 Low_mappability_island 1000 . 219 | chr7 57261829 57261998 Low_mappability_island 1000 . 220 | chr7 57544726 57556913 Satellite_repeat 1000 . 221 | chr7 57811488 57836990 centromeric_repeat 1000 . 222 | chr7 57939184 58055539 centromeric_repeat 1000 . 223 | chr7 61054285 62454680 centromeric_repeat 1000 . 224 | chr7 64059157 64066183 BSR/Beta 1000 . 225 | chr7 64951348 64956223 centromeric_repeat 1000 . 226 | chr7 68201468 68201673 Low_mappability_island 1000 . 227 | chr7 68527370 68527788 LSU-rRNA_Hsa 1000 . 228 | chr7 80962907 80963147 SSU-rRNA_Hsa 1000 . 229 | chr7 100550640 100551321 Low_mappability_island 1000 . 230 | chr7 142372972 142375638 Low_mappability_island 1000 . 231 | chr7 145694403 145694561 Low_mappability_island 1000 . 232 | chr8 155512 157639 TAR1 1000 . 233 | chr8 21455971 21456306 LSU-rRNA_Hsa 1000 . 234 | chr8 32868966 32873279 Low_mappability_island 1000 . 235 | chr8 43092737 43097573 Satellite_repeat 1000 . 236 | chr8 43399486 43843604 centromeric_repeat 1000 . 237 | chr8 46838215 47457541 centromeric_repeat 1000 . 238 | chr8 47739043 47742797 Low_mappability_island 1000 . 239 | chr8 47750844 47776101 BSR/Beta 1000 . 
240 | chr8 56754955 56755418 LSU-rRNA_Hsa 1000 . 241 | chr8 69218401 69218922 LSU-rRNA_Hsa 1000 . 242 | chr8 70602248 70602620 LSU-rRNA_Hsa 1000 . 243 | chr8 77114154 77114389 Low_mappability_island 1000 . 244 | chr8 100508010 100508287 Low_mappability_island 1000 . 245 | chr9 10435 11574 TAR1 1000 . 246 | chr9 4799734 4800000 SSU-rRNA_Hsa 1000 . 247 | chr9 33656606 33659249 Low_mappability_island 1000 . 248 | chr9 42819021 42832395 centromeric_repeat 1000 . 249 | chr9 44070617 44070871 Low_mappability_island 1000 . 250 | chr9 44873123 44902307 centromeric_repeat 1000 . 251 | chr9 45355954 45357644 telomeric_repeat 1000 . 252 | chr9 45435109 45443517 centromeric_repeat 1000 . 253 | chr9 66494170 66494805 TAR1 1000 . 254 | chr9 66767710 66864329 centromeric_repeat 1000 . 255 | chr9 66970914 67005594 centromeric_repeat 1000 . 256 | chr9 67315122 67321036 centromeric_repeat 1000 . 257 | chr9 67789868 67792893 centromeric_repeat 1000 . 258 | chr9 68410775 68435115 Low_mappability_island 1000 . 259 | chr9 69677073 69687998 centromeric_repeat 1000 . 260 | chr9 69689770 69711497 centromeric_repeat 1000 . 261 | chr9 69947961 70011196 centromeric_repeat 1000 . 262 | chr9 70076144 70076855 centromeric_repeat 1000 . 263 | chr9 70318723 70327683 centromeric_repeat 1000 . 264 | chr9 72653073 72653572 Satellite_repeat 1000 . 265 | chr9 78790077 78790255 (GAATG)n 1000 . 266 | chr9 79186574 79187026 LSU-rRNA_Hsa 1000 . 267 | chr9 141019938 141021783 TAR1 1000 . 268 | chrM 1 16571 chrM 1000 . 269 | chrX 55206111 55206740 Low_mappability_island 1000 . 270 | chrX 55207753 55208152 Low_mappability_island 1000 . 271 | chrX 55208300 55208643 Low_mappability_island 1000 . 272 | chrX 55208980 55209208 Low_mappability_island 1000 . 273 | chrX 55209655 55210006 Low_mappability_island 1000 . 274 | chrX 58330488 58330843 centromeric_repeat 1000 . 275 | chrX 58373806 58373962 centromeric_repeat 1000 . 276 | chrX 58377680 58377864 centromeric_repeat 1000 . 277 | chrX 58415350 58416387 centromeric_repeat 1000 . 278 | chrX 58432411 58432680 centromeric_repeat 1000 . 279 | chrX 58485887 58486241 centromeric_repeat 1000 . 280 | chrX 58488898 58494528 centromeric_repeat 1000 . 281 | chrX 58499466 58504235 centromeric_repeat 1000 . 282 | chrX 58506076 58528214 centromeric_repeat 1000 . 283 | chrX 58528184 58536883 centromeric_repeat 1000 . 284 | chrX 58544061 58582415 centromeric_repeat 1000 . 285 | chrX 61681834 61919683 centromeric_repeat 1000 . 286 | chrX 62003205 62041580 centromeric_repeat 1000 . 287 | chrX 83658929 83659019 Low_mappability_island 1000 . 288 | chrX 108297348 108297886 LSU-rRNA_Hsa 1000 . 289 | chrX 114959057 115006437 Low_mappability_island 1000 . 290 | chrX 125605623 125607351 Low_mappability_island 1000 . 291 | chrX 125714985 125715338 Low_mappability_island 1000 . 292 | chrX 125864844 125864980 Low_mappability_island 1000 . 293 | chrX 125865719 125865874 Low_mappability_island 1000 . 294 | chrY 313470 313613 ALR/Alpha 1000 . 295 | chrY 3004989 3005175 LSU-rRNA_Hsa 1000 . 296 | chrY 4212807 4212910 Low_mappability_island 1000 . 297 | chrY 7671817 7694928 BSR/Beta 1000 . 298 | chrY 7726064 7730229 BSR/Beta 1000 . 299 | chrY 7730734 7731598 BSR/Beta 1000 . 300 | chrY 7735811 7752887 BSR/Beta 1000 . 301 | chrY 7785067 7806311 BSR/Beta 1000 . 302 | chrY 7806856 7814704 BSR/Beta 1000 . 303 | chrY 7815230 7820478 BSR/Beta 1000 . 304 | chrY 7829937 7832032 BSR/Beta 1000 . 305 | chrY 7832744 7848695 BSR/Beta 1000 . 306 | chrY 7870343 7873582 BSR/Beta 1000 . 307 | chrY 7874115 7874584 BSR/Beta 1000 . 
308 | chrY 7875409 7885257 BSR/Beta 1000 . 309 | chrY 7886545 7894591 BSR/Beta 1000 . 310 | chrY 7898927 7916812 BSR/Beta 1000 . 311 | chrY 7918790 7921352 BSR/Beta 1000 . 312 | chrY 7926344 7936705 BSR/Beta 1000 . 313 | chrY 7941130 7947438 BSR/Beta 1000 . 314 | chrY 7948790 7964448 BSR/Beta 1000 . 315 | chrY 8179010 8181143 BSR/Beta 1000 . 316 | chrY 8181757 8213330 BSR/Beta 1000 . 317 | chrY 8214629 8215637 BSR/Beta 1000 . 318 | chrY 8220421 8230061 BSR/Beta 1000 . 319 | chrY 8230686 8231546 BSR/Beta 1000 . 320 | chrY 8240772 8265916 BSR/Beta 1000 . 321 | chrY 8291535 8292942 BSR/Beta 1000 . 322 | chrY 8294002 8295175 BSR/Beta 1000 . 323 | chrY 8296944 8321375 BSR/Beta 1000 . 324 | chrY 8325813 8325929 BSR/Beta 1000 . 325 | chrY 8326678 8333466 BSR/Beta 1000 . 326 | chrY 8334027 8342387 BSR/Beta 1000 . 327 | chrY 8356544 8369346 BSR/Beta 1000 . 328 | chrY 8909560 8909925 TAR1 1000 . 329 | chrY 8979478 8979585 Low_mappability_island 1000 . 330 | chrY 9072781 9072993 TAR1 1000 . 331 | chrY 9908430 9925608 centromeric_repeat 1000 . 332 | chrY 9981952 9982126 BSR/Beta 1000 . 333 | chrY 10034864 10036712 SSU-rRNA_Hsa 1000 . 334 | chrY 10040627 10045657 ALR/Alpha 1000 . 335 | chrY 10047773 10052533 ALR/Alpha 1000 . 336 | chrY 10053695 10057722 ALR/Alpha 1000 . 337 | chrY 10059394 10073694 ALR/Alpha 1000 . 338 | chrY 10075082 10075781 ALR/Alpha 1000 . 339 | chrY 10080736 10104539 ALR/Alpha 1000 . 340 | chrY 13104530 13144368 centromeric_repeat 1000 . 341 | chrY 13193966 13196535 Low_mappability_island 1000 . 342 | chrY 13252193 13259484 centromeric_repeat 1000 . 343 | chrY 13290177 13290667 chrM 1000 . 344 | chrY 13445957 13490591 Satellite_repeat 1000 . 345 | chrY 13642186 13749784 Satellite_repeat 1000 . 346 | chrY 13798522 13870984 Satellite_repeat 1000 . 347 | chrY 19691913 19692524 LSU-rRNA_Hsa 1000 . 348 | chrY 19764063 19776198 ALR/Alpha 1000 . 349 | chrY 19780600 19781704 ALR/Alpha 1000 . 350 | chrY 19783669 19796396 ALR/Alpha 1000 . 351 | chrY 19800068 19801419 ALR/Alpha 1000 . 352 | chrY 19808085 19817100 ALR/Alpha 1000 . 353 | chrY 19944298 19944581 TAR1 1000 . 354 | chrY 20235195 20235478 TAR1 1000 . 355 | chrY 20362679 20371694 ALR/Alpha 1000 . 356 | chrY 20378360 20379711 ALR/Alpha 1000 . 357 | chrY 20383383 20396110 ALR/Alpha 1000 . 358 | chrY 20398075 20399179 ALR/Alpha 1000 . 359 | chrY 20403581 20415713 ALR/Alpha 1000 . 360 | chrY 20487248 20487859 LSU-rRNA_Hsa 1000 . 361 | chrY 23124788 23125577 BSR/Beta 1000 . 362 | chrY 23149027 23151205 BSR/Beta 1000 . 363 | chrY 23157969 23158245 BSR/Beta 1000 . 364 | chrY 23159001 23167737 BSR/Beta 1000 . 365 | chrY 23178886 23181770 BSR/Beta 1000 . 366 | chrY 23220740 23223625 BSR/Beta 1000 . 367 | chrY 23234125 23235822 BSR/Beta 1000 . 368 | chrY 23236898 23248080 BSR/Beta 1000 . 369 | chrY 23248729 23248851 BSR/Beta 1000 . 370 | chrY 23899295 23899388 TAR1 1000 . 371 | chrY 23956449 23956628 TAR1 1000 . 372 | chrY 24247659 24247700 TAR1 1000 . 373 | chrY 24630999 24631040 TAR1 1000 . 374 | chrY 24953159 24975657 BSR/Beta 1000 . 375 | chrY 24980997 24991235 BSR/Beta 1000 . 376 | chrY 25022753 25039185 BSR/Beta 1000 . 377 | chrY 25040153 25042421 BSR/Beta 1000 . 378 | chrY 25048332 25059258 BSR/Beta 1000 . 379 | chrY 25060235 25064798 BSR/Beta 1000 . 380 | chrY 25099139 25121882 BSR/Beta 1000 . 381 | chrY 25122419 25160800 BSR/Beta 1000 . 382 | chrY 25182404 25192372 BSR/Beta 1000 . 383 | chrY 25217722 25219409 BSR/Beta 1000 . 384 | chrY 25493588 25495275 BSR/Beta 1000 . 385 | chrY 26148315 26148450 TAR1 1000 . 
386 | chrY 26586905 26609405 BSR/Beta 1000 . 387 | chrY 26614745 26624983 BSR/Beta 1000 . 388 | chrY 26656502 26672934 BSR/Beta 1000 . 389 | chrY 26673902 26676170 BSR/Beta 1000 . 390 | chrY 26682081 26693007 BSR/Beta 1000 . 391 | chrY 26693984 26698547 BSR/Beta 1000 . 392 | chrY 26732883 26755623 BSR/Beta 1000 . 393 | chrY 26756160 26794538 BSR/Beta 1000 . 394 | chrY 26816148 26826116 BSR/Beta 1000 . 395 | chrY 26851466 26853153 BSR/Beta 1000 . 396 | chrY 27109247 27110934 BSR/Beta 1000 . 397 | chrY 27136281 27146249 BSR/Beta 1000 . 398 | chrY 27167859 27206241 BSR/Beta 1000 . 399 | chrY 27206778 27229502 BSR/Beta 1000 . 400 | chrY 27263848 27268411 BSR/Beta 1000 . 401 | chrY 27269388 27280315 BSR/Beta 1000 . 402 | chrY 27286226 27288494 BSR/Beta 1000 . 403 | chrY 27289462 27305895 BSR/Beta 1000 . 404 | chrY 27337415 27347656 BSR/Beta 1000 . 405 | chrY 27352996 27375497 BSR/Beta 1000 . 406 | chrY 27813984 27814119 TAR1 1000 . 407 | chrY 28555026 28555353 TAR1 1000 . 408 | chrY 28784129 28819695 Satellite_repeat 1000 . 409 | chrY 58819367 58917648 (CATTC)n 1000 . 410 | chrY 58971913 58997782 (CATTC)n 1000 . 411 | chrY 59361267 59362785 TAR1 1000 . 412 | -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/0-enhancer-stats.txt: -------------------------------------------------------------------------------- 1 | Statistics for: demo-data/sample-mm10-CD4.bed 2 | SE Signal %: 38 3 | TE Signal %: 62 4 | SE Count: 1329 5 | TE Count: 24794 6 | SE Count %: 5.09 7 | TE Count %: 94.91 8 | Mean SE Size: 35846.22 9 | Mean TE Size: 5104.87 10 | Median SE Size: 31833 11 | Median TE Size: 892.5 12 | -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-cutoff.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-cutoff.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-size-histogram.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-size-histogram.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-te-stretch-vs-nonstretch-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-te-stretch-vs-nonstretch-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-vs-te-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-vs-te-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/se-vs-te-signal-pie.R.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/se-vs-te-signal-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/stretch-vs-nonstretch-count-pie.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/stretch-vs-nonstretch-count-pie.R.png -------------------------------------------------------------------------------- /demo-data/sample-get-SuperEnhancers-output/te-size-histogram.R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/demo-data/sample-get-SuperEnhancers-output/te-size-histogram.R.png -------------------------------------------------------------------------------- /dist/README.md: -------------------------------------------------------------------------------- 1 | This directory contains ~portable linux binaries for [bedtools](https://github.com/arq5x/bedtools2) and [samtools](https://github.com/samtools/samtools). 2 | 3 | Feel free to provide your own or use pre-installed versions. 4 | -------------------------------------------------------------------------------- /dist/bedtools-2.22.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/bedtools-2.22.0 -------------------------------------------------------------------------------- /dist/bedtools-2.23.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/bedtools-2.23.0 -------------------------------------------------------------------------------- /dist/samtools-0.1.19: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-0.1.19 -------------------------------------------------------------------------------- /dist/samtools-1.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-1.1 -------------------------------------------------------------------------------- /dist/samtools-1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/dist/samtools-1.2 -------------------------------------------------------------------------------- /get-SuperEnhancers.R: -------------------------------------------------------------------------------- 1 | ##### 2 | # You can use this script to delineate super-enhancers from a .bed, *only* after it has been 3 | # pre-processed by bamliquidator. Conveniently this can be done by riesling.py, which also 4 | # allows for alternative stratifications of super-enhancers. 
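# As a concrete example using the bundled demo data (the output directory name
# below is an illustrative, hypothetical path -- any writable directory works):
#   Rscript get-SuperEnhancers.R demo-data/sample-mm10-CD4.bed demo-output/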
5 | # 6 | # 7 | # Run as: Rscript get-SuperEnhancers.R input.bed output_directory 8 | # 9 | # 10 | # **Input: A .bed from bamliquidator 11 | # (A .bed with 7 columns, where the 7th column contains the normalized score for ranking.) 12 | # 13 | # ## This input might look like: 14 | # V1 V2 V3 V4 V5 V6 V7 15 | # 1 chr1 3514643 3515351 1 30438 . 0.17688850 16 | # 2 chr1 4426753 4427110 2 4119 . 0.04747231 17 | # 18 | # 19 | # **Output: 20 | # Super-enhancers (defined by the point where the slope of the tangent reaches 1) and 21 | # Stretch enhancers (>3 kb) 22 | # Super-Stretch enhancers (> tangent cutoff AND >3kb) 23 | # 24 | # 25 | # Copyright (c) 2014-2016 Nick Semenkovich . 26 | # https://nick.semenkovich.com/ 27 | # 28 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 29 | # https://gordonlab.wustl.edu/ 30 | # 31 | # This software is released under the MIT License: 32 | # http://opensource.org/licenses/MIT 33 | # 34 | # Source: https://github.com/GordonLab/riesling-pipeline 35 | # 36 | # Includes approaches inspired by: 37 | # https://stackoverflow.com/questions/29642867/drawing-a-tangent-to-the-plot-and-finding-the-x-intercept-using-r 38 | 39 | # Force sane graphics output 40 | options(bitmapType='cairo') 41 | 42 | #### Cleanup 43 | # dev.off(dev.list()["RStudioGD"]) # Clear the graphs 44 | rm(list = ls(all = TRUE)) # Clear all objects from workspace 45 | cat("\014") # Reset the console 46 | 47 | # args[1] is the input bed file we're operating on. 48 | args <- commandArgs(TRUE) 49 | input_filename = args[1] 50 | output_dir = args[2] 51 | 52 | 53 | print(paste('Working on:', input_filename)) 54 | print(paste('Output dir:', output_dir)) 55 | 56 | bed <- read.table(input_filename, sep="\t", header=FALSE) 57 | 58 | ## Looks like: 59 | # V1 V2 V3 V4 V5 V6 V7 60 | # 1 chr1 3514643 3515351 1 30438 . 0.17688850 61 | # 2 chr1 4426753 4427110 2 4119 . 
0.04747231 62 | 63 | if (!is.na(output_dir)) { 64 | print(paste('Current directory is:', getwd())) 65 | print(paste('Setting output directory to:', output_dir)) 66 | setwd(output_dir) 67 | } 68 | 69 | 70 | ## Sort and scale axes 71 | y = sort(bed[,c(7)]*(bed[,c(3)]-bed[,c(2)])) # normalized_counts * width 72 | x = c(1:length(y)) 73 | ynorm = y*length(x)/max(y) 74 | # plot(x, ynorm) 75 | # plot(x, ynorm, log="xy") 76 | 77 | spl <- smooth.spline(x, ynorm) 78 | # pred <- predict(spl) 79 | # lines(pred, col=2) 80 | 81 | ynorm.prime <- diff(ynorm)/diff(x) 82 | # plot(ynorm.prime) 83 | pred.prime <- predict(spl, deriv=1) 84 | # lines(pred.prime$y, col=2) 85 | 86 | ## Find where the slope of the tangent (the spline's 1st derivative) first exceeds 1 87 | se_cutoff <- min(which(pred.prime$y > 1)) 88 | print(paste('Inflection at entry:', se_cutoff)) 89 | print(paste('Corresponding cutoff score:', y[[se_cutoff]])) 90 | 91 | # Use the spline models to plot tangent to that point 92 | pred0 <- predict(spl, x=se_cutoff, deriv=0) 93 | pred1 <- predict(spl, x=se_cutoff, deriv=1) 94 | 95 | # And compute intercepts for graphing 96 | yint <- pred0$y - (pred1$y*se_cutoff) 97 | xint <- -yint/pred1$y 98 | 99 | 100 | ################ Save subpopulation beds 101 | y_se_cutoff = y[[se_cutoff]] 102 | se_population = bed[bed$V7*(bed$V3-bed$V2) >= y_se_cutoff,] 103 | te_population = bed[bed$V7*(bed$V3-bed$V2) < y_se_cutoff,] 104 | 105 | write.table(se_population, file='0-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 106 | write.table(te_population, file='0-te-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 107 | 108 | # >3000 stretch 109 | stretch_cutoff = 3000 110 | stretch_population = bed[bed$V3-bed$V2 >= stretch_cutoff,] 111 | stretch_se_population = se_population[se_population$V3-se_population$V2 >= stretch_cutoff,] 112 | write.table(stretch_population, file='0-stretch-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 113 | write.table(stretch_se_population, file='0-stretch-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE) 114 | 115 | 116 | ## Save a diagnostic plot 117 | png(paste('se-cutoff.R.png', sep=''), width=800, height=800) 118 | plot(x, ynorm, cex=0.5) 119 | abline(h=0, col=8) # baseline zero 120 | lines(spl, col=2) # spline 121 | abline(h=ynorm[[se_cutoff]], col=8) 122 | abline(v=se_cutoff, col=8) 123 | 124 | points(pred0, col=2, pch=19) # point to predict tangent 125 | lines(x, yint + pred1$y*x, col=3) # tangent (1st deriv. 
of spline at se_cutoff) 126 | points(xint, 0, col=3, pch=19) # x intercept 127 | dev.off() 128 | 129 | ### 130 | # Export pie charts & histograms 131 | ### 132 | 133 | ## First, make some data frames to use ggplot2 134 | se_sizes = se_population$V3-se_population$V2 135 | se_signals = se_population$V7*se_sizes 136 | te_sizes = te_population$V3-te_population$V2 137 | te_signals = te_population$V7*te_sizes 138 | 139 | # size_df = data.frame(c(se_sizes, te_sizes), factor(rep(c('se','te'), c(length(se_sizes), length(te_sizes))))) 140 | # colnames(size_df) <- c('size', 'type') 141 | # signal_df = data.frame(c(se_signals, te_signals), factor(rep(c('se','te'), c(length(se_signals), length(te_signals))))) 142 | # colnames(signal_df) <- c('signal', 'type') 143 | 144 | 145 | # Pie chart of % SE vs % TE [count] 146 | png(paste('se-vs-te-count-pie.R.png', sep=''), width=800, height=800) 147 | pie(c(nrow(se_population), nrow(te_population)), 148 | labels=c(paste('Super Enhancers\n', nrow(se_population)), 149 | paste('Traditional Enhancers\n', nrow(te_population))), 150 | main="Number of Super- vs Traditional Enhancers") 151 | dev.off() 152 | 153 | # Stretch vs Non-Stretch 154 | png(paste('stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800) 155 | pie(c(nrow(stretch_population), nrow(bed)-nrow(stretch_population)), 156 | labels=c(paste('Stretch Enhancers\n', nrow(stretch_population)), 157 | paste('Non-Stretch Enhancers\n', nrow(bed)-nrow(stretch_population))), 158 | main="Number of Stretch vs Non-Stretch Enhancers") 159 | dev.off() 160 | 161 | # And SE stretch / traditional stretch, etc. 162 | png(paste('se-te-stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800) 163 | pie(c(nrow(se_population)-nrow(stretch_se_population), nrow(stretch_se_population), 164 | nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population)), 165 | nrow(stretch_population)-nrow(stretch_se_population)), 166 | labels=c(paste('Super Enhancers\n', nrow(se_population)-nrow(stretch_se_population)), 167 | paste('Super-Stretch Enhancers\n', nrow(stretch_se_population)), 168 | paste('Traditional Enhancers\n', nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population))), 169 | paste('Traditional-Stretch Enhancers\n', nrow(stretch_population)-nrow(stretch_se_population))), 170 | main="Number of Super- vs Traditional Enhancers w/ Stretch >3kb") 171 | dev.off() 172 | 173 | 174 | # Pie chart of % SE Signal vs % TE Signal 175 | total_signal = sum(as.numeric(bed$V5)) 176 | se_signal = sum(as.numeric(se_population$V5)) 177 | # te_signal = sum(as.numeric(te_population$V5)) 178 | se_fraction = round(se_signal/total_signal*100, digits = 1) 179 | te_fraction = 100-se_fraction 180 | 181 | png(paste('se-vs-te-signal-pie.R.png', sep=''), width=800, height=800) 182 | pie(c(se_fraction, te_fraction), 183 | labels=c(paste('Super Enhancers\n', se_fraction, '%', sep = ''), 184 | paste('Traditional Enhancers\n', te_fraction, '%', sep = '')), 185 | main="Signal in Super- vs Traditional Enhancers") 186 | dev.off() 187 | 188 | 189 | ###### Histograms 190 | # Histogram of SE sizes 191 | png(paste('se-size-histogram.R.png', sep=''), width=800, height=800) 192 | hist(se_sizes, breaks=10, 193 | xlab = 'Super-Enhancer Sizes', ylab="Counts", main="Super-Enhancer Size Distribution") 194 | dev.off() 195 | 196 | png(paste('te-size-histogram.R.png', sep=''), width=800, height=800) 197 | hist(te_sizes, breaks=10, 198 | xlab = 'Traditional-Enhancer Sizes', ylab="Counts", main="Traditional-Enhancer Size 
Distribution") 199 | dev.off() 200 | 201 | # Inverse hockeystick, honestly 202 | # qplot(signal, data=signal_df[signal_df$type == 'se',], geom="histogram", 203 | # xlab="Normalized ATAC Signal", ylab="Count") 204 | 205 | ######## Text Diagnostics 206 | # Print some raw statistics to a text file 207 | 208 | fh <- file("0-enhancer-stats.txt", "w") 209 | writeLines(paste("Statistics for:", input_filename), con=fh) 210 | writeLines(paste("SE Signal %:", round(se_fraction,2)), con=fh) 211 | writeLines(paste("TE Signal %:", round(te_fraction,2)), con=fh) 212 | writeLines(paste("SE Count:", nrow(se_population)), con=fh) 213 | writeLines(paste("TE Count:", nrow(te_population)), con=fh) 214 | writeLines(paste("SE Count %:", round(nrow(se_population)/nrow(bed)*100, 2)), con=fh) 215 | writeLines(paste("TE Count %:", round(nrow(te_population)/nrow(bed)*100, 2)), con=fh) 216 | writeLines(paste("Mean SE Size:", round(mean(se_sizes), 2)), con=fh) 217 | writeLines(paste("Mean TE Size:", round(mean(te_sizes), 2)), con=fh) 218 | writeLines(paste("Median SE Size:", median(se_sizes)), con=fh) 219 | writeLines(paste("Median TE Size:", median(te_sizes)), con=fh) 220 | close(fh) 221 | 222 | -------------------------------------------------------------------------------- /helper-scripts/0-merge-fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This script merges one sample across lanes. 5 | # 6 | # This script will *only* be useful if you have the same multiplexed sample loaded into multiple lanes of a flowcell. 7 | # 8 | # This concatenates the same index's pared-end files (L*_R*_* .fastq.gz) across multiple 9 | # lanes into one set of PE files per-sample. 10 | # 11 | # 12 | # Copyright (c) 2014-2016 Nick Semenkovich . 13 | # https://nick.semenkovich.com/ 14 | # 15 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 16 | # https://gordonlab.wustl.edu/ 17 | # 18 | # This software is released under the MIT License: 19 | # http://opensource.org/licenses/MIT 20 | # 21 | # Source: https://github.com/GordonLab/riesling-pipeline 22 | 23 | from __future__ import absolute_import, division, print_function, unicode_literals 24 | 25 | __author__ = 'Nick Semenkovich ' 26 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 27 | __license__ = 'MIT' 28 | __version__ = '1.0.3' 29 | 30 | from collections import OrderedDict 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import glob 35 | import os 36 | import pprint 37 | import re 38 | 39 | 40 | def fastq_map_predict(input_path, verbose=False): 41 | """ 42 | Determine a sane .fastq muti-lane merge strategy. 43 | Fail if we can't merge correctly, if there are remaining files, etc. 44 | 45 | sample file name: Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz 46 | 47 | Args: 48 | input_path: An input path containing .fastq / .fastq.gz files 49 | Returns: 50 | A dict of mappings. 51 | """ 52 | fastq_map_logger = _logshim.getLogger('fastq_map_predict') 53 | 54 | if not os.path.isdir(input_path): 55 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 56 | 57 | all_files = glob.glob(input_path + "/*_R*.fastq.gz") # Ignore index files, must have _R in title 58 | all_files.extend(glob.glob(input_path + "/*_R*.fastq")) 59 | 60 | if len(all_files) == 0: 61 | raise ValueError("Input directory is empty!") 62 | 63 | # Given paired ends, we must always have an even number of input files. 
64 | if len(all_files) % 2 != 0: 65 | raise ValueError("Input directory contains an odd number of files.") 66 | 67 | re_pattern = re.compile(r'^(.*)_L(\d+)_R(\d)_\d+(\.fastq|\.fastq\.gz)$') 68 | 69 | 70 | file_dict = OrderedDict() 71 | 72 | prefixes_seen = [] 73 | lanes_seen = [] 74 | pe_seen = [] 75 | for file in sorted(all_files): 76 | if not os.access(file, os.R_OK): 77 | raise OSError("Cannot read file: %s" % (file)) 78 | 79 | filename_only = file.rsplit('/', 1)[-1] 80 | result = re.match(re_pattern, filename_only) 81 | 82 | file_dict[file] = {'prefix': str(result.group(1)), 83 | 'L': int(result.group(2)), 84 | 'R': int(result.group(3))} 85 | 86 | prefixes_seen.append(file_dict[file]['prefix']) 87 | lanes_seen.append(file_dict[file]['L']) 88 | pe_seen.append(file_dict[file]['R']) 89 | 90 | 91 | # Sanity checking here. Missing files? Other oddities? 92 | if len(file_dict) % len(set(lanes_seen)) != 0: 93 | raise ValueError("Missing or extra file(s)? Saw %d lanes, and %d input files." % 94 | (len(file_dict), len(set(lanes_seen)))) 95 | 96 | if len(set(pe_seen)) != 2: 97 | raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen)))) 98 | 99 | if pe_seen.count(1) != pe_seen.count(2): 100 | raise ValueError("Uneven pairing of paired ends (are you missing a file)? R1 count: %d, R2 count: %d" % 101 | (pe_seen.count(1), pe_seen.count(2))) 102 | 103 | fastq_map_logger.info("Files seen: %d" % (len(all_files))) 104 | fastq_map_logger.info("Samples seen: %d" % (len(set(prefixes_seen)))) 105 | fastq_map_logger.info("Lanes seen: %d" % (len(set(lanes_seen)))) 106 | 107 | merge_strategy = {} 108 | 109 | fastq_map_logger.info("Sample IDs:") 110 | for prefix in sorted(set(prefixes_seen)): 111 | fastq_map_logger.info(" %s" % (prefix)) 112 | 113 | for file in file_dict.iterkeys(): 114 | merge_strategy.setdefault(file_dict[file]['prefix'] + ".PE" + str(file_dict[file]['R']), []).append(file) 115 | 116 | if verbose: 117 | fastq_map_logger.debug("Merge strategy is:") 118 | fastq_map_logger.debug(pprint.pformat(merge_strategy)) 119 | 120 | return merge_strategy 121 | 122 | def fastq_merge(merge_strategy, output_path, disable_parallel=False): 123 | """ 124 | Concatenate multiple fastq files (from multiple lanes) into one. 125 | 126 | :param merge_strategy: 127 | :param output_path: 128 | :return: 129 | """ 130 | merge_log = _logshim.getLogger('fastq_merge') 131 | 132 | if disable_parallel: 133 | shell_job_runner = _script_helpers.ShellJobRunner(merge_log) 134 | else: 135 | shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45) 136 | 137 | for merged_name, merge_inputs in merge_strategy.iteritems(): 138 | merge_input_files = ' '.join(merge_inputs) 139 | merge_log.info('Spawning niced process to merge: %s' % (merged_name)) 140 | for filename in merge_inputs: 141 | assert(" " not in filename) 142 | assert(";" not in filename) # Vague sanity testing for input filenames 143 | merge_log.debug(' Input: %s' % (filename)) 144 | 145 | # WARNING: Using shell has security implications! Don't work on untrusted input filenames. 146 | command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name) 147 | 148 | shell_job_runner.run(command) 149 | 150 | shell_job_runner.finish() 151 | 152 | return True 153 | 154 | 155 | def main(): 156 | # Parse & interpret command line flags. 157 | parser = argparse.ArgumentParser(description='Intelligently merge fastq/fastq.gz files from an Illumina pipeline.' 
158 | ' Merges all L*_R*_* .fastq.gz files into one per sample.', 159 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 160 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 161 | usage='%(prog)s [options]', 162 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 163 | 164 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 165 | help='Input path.', required=True) 166 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 167 | help='Output path.', required=True) 168 | parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true', 169 | help='Disable parallel job spawning.') 170 | 171 | # parser.add_argument('--skip-stats', dest="skip_stats", action='store_true', 172 | # help='Skip statistics generation.', required=False) 173 | 174 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 175 | 176 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 177 | help="Do not create a log file.") 178 | 179 | args = parser.parse_args() 180 | 181 | output_path = _script_helpers.setup_output_path(args.output_path) 182 | 183 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 184 | 185 | # Our goal is to intelligently merge .fastq/.fastq.gz output from an Illumina run 186 | # The Illumina standard pipeline splits by barcode w/ semi-predictable filenames we can use, e.g. 187 | # IsoA-M1-CD4_S1_L001_I1_001.fastq.gz # index (discard) 188 | # IsoA-M1-CD4_S1_L001_R1_001.fastq.gz # end 1, lane 1 189 | # IsoA-M1-CD4_S1_L001_R2_001.fastq.gz # end 2, lane 1 190 | # IsoA-M1-CD4_S1_L002_I1_001.fastq.gz # index (discard), lane 2 191 | # IsoA-M1-CD4_S1_L002_R1_001.fastq.gz # end 1, lane 2 192 | # ... 193 | 194 | # TODO: Move some lower glob code up so we can test these functions 195 | merge_strategy = fastq_map_predict(args.input_path, verbose=args.verbose) 196 | 197 | fastq_merge(merge_strategy, args.output_path, disable_parallel=args.no_parallel) 198 | 199 | 200 | 201 | if __name__ == '__main__': 202 | main() 203 | -------------------------------------------------------------------------------- /helper-scripts/3-merge-bam-rmdup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # In case you've sequenced the same sample multiple times, use this script to 5 | # merge those samples together and then rmdup the pooled data. 6 | 7 | # WARNING: This assumes you've used the pipeline up to this point, and have pre-sorted & fixmated .bams. 8 | # Why use this script, when you could merge & rmdup by hand? 9 | # * This just works (so you won't forget anything) 10 | # * Automatic sane file naming 11 | # 12 | # 13 | # Copyright (c) 2014-2016 Nick Semenkovich . 14 | # https://nick.semenkovich.com/ 15 | # 16 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 17 | # https://gordonlab.wustl.edu/ 18 | # 19 | # This software is released under the MIT License: 20 | # http://opensource.org/licenses/MIT 21 | # 22 | # Source: https://github.com/GordonLab/riesling-pipeline 23 | 24 | from __future__ import absolute_import, division, print_function, unicode_literals 25 | 26 | __author__ = 'Nick Semenkovich ' 27 | __copyright__ = 'Gordon Lab at Washington University in St. 
Louis' 28 | __license__ = 'MIT' 29 | __version__ = '1.0.3' 30 | 31 | import _logshim 32 | import _script_helpers 33 | import argparse 34 | import os 35 | 36 | # Load our config files 37 | CONFIG = _script_helpers.get_config() 38 | 39 | 40 | def merge_and_rmdup(input_files, output_path, disable_parallel=False): 41 | primary_logger = _logshim.getLogger('merge_and_rmdup') 42 | 43 | # Sanity checks on the input files list 44 | assert(len(input_files) > 1) 45 | # Check files are readable 46 | for filename in input_files: 47 | if not os.access(filename, os.R_OK): 48 | primary_logger.fatal("Unable to read input files.") 49 | raise IOError 50 | 51 | output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files]) 52 | 53 | # Sanity check: maximum output filename length 54 | max_filename_length = os.statvfs(output_path).f_namemax 55 | if max_filename_length < 100: 56 | primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?") 57 | raise IOError 58 | 59 | if (len(output_file_name) + 10) > max_filename_length: # roughly truncate filename for sanity. 60 | primary_logger.critical("Very long filename! Truncating!") 61 | output_file_name = output_file_name[:-20] # Give us some extra room for downstream stuff? 62 | 63 | output_file_name += ".merged.bam" 64 | 65 | input_file_string = ' '.join(input_files) 66 | 67 | shell_job_runner = _script_helpers.ShellJobRunner(primary_logger) 68 | 69 | primary_logger.debug('Input file string: %s' % (input_file_string)) 70 | primary_logger.debug('Working on merge as: %s' % (output_file_name)) 71 | # This is pretty fast and has minimal memory usage. Yay! 72 | # We're probably re-rmduping some files if we're merging. That's ok since this is speedy. 73 | command = "%s merge -u - %s | %s rmdup - %s 2>%s" 74 | 75 | shell_job_runner.run(command % (CONFIG['binaries']['samtools'], 76 | input_file_string, 77 | CONFIG['binaries']['samtools_legacy'], # TODO: Update this when samtools is fixed. 78 | output_path + "/" + output_file_name, 79 | output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log')) 80 | 81 | shell_job_runner.finish() 82 | 83 | 84 | primary_logger.info('Merge and rmdup complete!') 85 | 86 | 87 | def main(): 88 | # Parse & interpret command line flags. 89 | parser = argparse.ArgumentParser(description='Pool multiple .bams together for the same sample.' 90 | ' Note: This is *only* necessary if you sequenced the same sample multiple times.', 91 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 92 | "Washington University in St. Louis: https://gordonlab.wustl.edu.", 93 | usage='%(prog)s [options]', 94 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 95 | 96 | parser.add_argument('--input-files', '-i', dest="input_files", metavar='input_dir/', type=str, 97 | help='Input files. 
(Not just a path!)', required=True, nargs='+') 98 | parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str, 99 | help='Output path.', required=True) 100 | 101 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 102 | 103 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 104 | help="Do not create a log file.") 105 | 106 | args = parser.parse_args() 107 | 108 | output_path = _script_helpers.setup_output_path(args.output_path) 109 | 110 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 111 | 112 | merge_and_rmdup(args.input_files, output_path) 113 | 114 | 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /helper-scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Optional & Helper Scripts 2 | 3 | * `0-merge-fastq.py`: Intelligently merge across lanes, for multiple-lane samples (e.g. one multiplexed sample loaded into multiple lanes). 4 | This will only be useful if you've loaded the *same, multiplexed sample* into multiple lanes of an Illumina flowcell. 5 | 6 | This script concatenates the same index's paired-end files (L*_R*_* .fastq.gz) across multiple lanes into one set of PE files per-sample. 7 | 8 | * `3-merge-bam-rmdup.py`: A helper script to blindly concatenate and deduplicate multiple sets of BAMs. 9 | 10 | * `hdf5_to_counts_table.py`: This script exists to convert .hdf5 files from bamliquidator into counts tables readable by R. 11 | -------------------------------------------------------------------------------- /helper-scripts/hdf5_to_counts_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # This script exists to convert .hdf5 files into counts tables readable by R. 5 | # 6 | # It is useful for performing differential accessibility analyses (e.g. with 7 | # DESeq2) on hdf5 counts data from RIESLING, ROSE, or bamliquidator. 8 | # 9 | # 10 | # Unfortunately, we can't use the rhdf5 package, since it doesn't support the 11 | # datatypes used by our .hdf5 files. 12 | # 13 | # 14 | # Copyright (c) 2014-2016 Nick Semenkovich . 15 | # https://nick.semenkovich.com/ 16 | # 17 | # Developed for the Gordon Lab, Washington University in St. Louis (WUSTL) 18 | # http://gordonlab.wustl.edu/ 19 | # 20 | # This software is released under the MIT License: 21 | # http://opensource.org/licenses/MIT 22 | # 23 | # Source: https://github.com/GordonLab/riesling-pipeline 24 | 25 | from __future__ import absolute_import, division, print_function, unicode_literals 26 | 27 | __author__ = 'Nick Semenkovich ' 28 | __copyright__ = 'Gordon Lab at Washington University in St. Louis' 29 | __license__ = 'MIT' 30 | __version__ = '1.0.3' 31 | 32 | import argparse 33 | import csv 34 | import fnmatch 35 | import operator 36 | import os 37 | import pybedtools 38 | import tables 39 | import _logshim 40 | import _script_helpers 41 | from collections import deque, OrderedDict 42 | 43 | 44 | CONFIG = _script_helpers.get_config() 45 | 46 | 47 | # TODO: Modularize this function. This code is repeated in a *lot* of scripts. 48 | def get_input_files(input_path): 49 | """ 50 | Generate a list of all input files. 51 | 52 | :param input_path: A directory with .h5 files. (e.g. 
/tmp/) 53 | :return: a list of all .h5 files with absolute paths. (e.g. ['/tmp/a.h5'] ) 54 | """ 55 | if not os.path.isdir(input_path): 56 | raise ValueError("Input must be a directory. You gave: %s" % (input_path)) 57 | 58 | # Adapted from: 59 | # https://stackoverflow.com/questions/2186525/use-a-glob-to-find-files-recursively-in-python 60 | all_files = [] 61 | for root, _, filenames in os.walk(input_path): 62 | for filename in fnmatch.filter(filenames, '*.h5'): 63 | all_files.append(os.path.join(root, filename)) 64 | 65 | if len(all_files) == 0: 66 | raise ValueError("Input directory contains no .h5 files!") 67 | 68 | return all_files 69 | 70 | 71 | def flatten_tsv(filename): 72 | """ 73 | Flatten a TSV file -- parse and concatenate identical row names by summing their values. 74 | """ 75 | flatlog = _logshim.getLogger('flatten_tsv') 76 | 77 | flatlog.debug('Flattening input file: %s' % (filename)) 78 | 79 | data_dict = OrderedDict() 80 | 81 | with open(filename, 'r') as tsv_ro_fh: 82 | tsv_input = csv.reader(tsv_ro_fh, delimiter=str("\t")) 83 | 84 | header = next(tsv_input, None) 85 | 86 | for row in tsv_input: 87 | row_key = row[0] 88 | these_row_values_as_int = map(int, row[1:]) 89 | if row_key in data_dict: 90 | # Add the current row values to the existing values 91 | data_dict[row_key] = map(operator.add, data_dict[row_key], these_row_values_as_int) 92 | else: 93 | data_dict[row_key] = these_row_values_as_int 94 | 95 | # Write back the parsed dict 96 | with open(filename, 'wb') as tsv_rw_fh: 97 | tsv_writer = csv.writer(tsv_rw_fh, delimiter=str("\t")) 98 | tsv_writer.writerow(header) 99 | 100 | for key, val in data_dict.iteritems(): 101 | tsv_writer.writerow([key] + val) 102 | 103 | 104 | 105 | def parse_h5files(input_files, annotationBedTool, overwrite, flatten, density, normalized, sizescaled): 106 | h5logger = _logshim.getLogger('parse_h5files') 107 | 108 | assert(not (density and normalized)) 109 | total_file_count = len(input_files) 110 | h5logger.info('Parsing a total of: %d file(s)' % (total_file_count)) 111 | 112 | output_suffix_list = ['tsv'] 113 | 114 | annotating_regions = False 115 | if annotationBedTool: 116 | annotating_regions = True 117 | output_suffix_list.append('annotated') 118 | 119 | if normalized: 120 | output_suffix_list.append('normalized') 121 | elif density: 122 | output_suffix_list.append('density') 123 | elif sizescaled: 124 | output_suffix_list.append('sizescaled') 125 | 126 | output_suffix = '.'.join(reversed(output_suffix_list)) 127 | 128 | # Cache regions that we're annotating, maybe. 129 | region_annotation_cache = {} 130 | 131 | for this_file_count, file in enumerate(input_files): 132 | h5logger.info('\tParsing: %s (%d/%d)' % (file, this_file_count + 1, total_file_count)) 133 | 134 | output_filename = file + '.' + output_suffix 135 | 136 | if not overwrite and os.path.isfile(output_filename): 137 | h5logger.warn('Skipping this .h5 as output .tsv already exists: %s' % (output_filename)) 138 | continue 139 | 140 | # TODO: Modularize H5FD_CORE (the in-memory driver?) 141 | with tables.open_file(file, mode="r", driver="H5FD_CORE") as h5_object: 142 | assert(h5_object.title.startswith("bam liquidator genome read counts")) # Some sanity checking 143 | assert(h5_object.root.file_names[0] == "*") 144 | 145 | bam_filename_header = h5_object.root.file_names[1:] 146 | bam_filename_header.insert(0, 'region') 147 | 148 | # Note: len(files) = len(file_names) - 1, since file_names has a 'wildcard' first entry. 
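            # Worked example (hypothetical sizes): with three .bam files, file_names
            # holds 4 entries ('*' plus three names) while files holds 3, so a
            # region_counts table of 30,000 rows gives 30000 / 3 = 10000 regions per file.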
149 | number_of_regions = int(len(h5_object.root.region_counts) / len(h5_object.root.files)) 150 | 151 | # We expect this .h5 object's region_counts to contain: 152 | # /region_counts (Table(SIZE,)) 'region counts' 153 | # description := { 154 | # "file_key": UInt32Col(shape=(), dflt=0, pos=0), 155 | # "chromosome": StringCol(itemsize=64, shape=(), dflt='', pos=1), 156 | # "region_name": StringCol(itemsize=64, shape=(), dflt='', pos=2), 157 | # "start": UInt64Col(shape=(), dflt=0, pos=3), 158 | # "stop": UInt64Col(shape=(), dflt=0, pos=4), 159 | # "strand": StringCol(itemsize=1, shape=(), dflt='', pos=5), 160 | # "count": UInt64Col(shape=(), dflt=0, pos=6), 161 | # "normalized_count": Float64Col(shape=(), dflt=0.0, pos=7)} 162 | # byteorder := 'little' 163 | # chunkshape := (NNN,) 164 | counts = h5_object.root.region_counts 165 | 166 | with open(output_filename, 'wb') as tsv_output: 167 | tsvwriter = csv.writer(tsv_output, delimiter=str("\t")) 168 | tsvwriter.writerow(bam_filename_header) 169 | 170 | if annotating_regions: 171 | h5logger.debug('Generating .bed annotations from provided genome.') 172 | region_to_gene = {} 173 | # Perform one annotation rapidly for all regions in the .hdf5 174 | hdf5_positions_only = [] 175 | 176 | for region_number in range(0, number_of_regions): 177 | hdf5_positions_only.append(counts[region_number][1] + ' ' + str(counts[region_number][3]) + ' ' + str(counts[region_number][4])) 178 | 179 | hdf5_positions_only_hashkey = ''.join(hdf5_positions_only) 180 | 181 | if hdf5_positions_only_hashkey in region_annotation_cache: 182 | # The genome doesn't change mid run, so we cache only on hdf5_positions 183 | region_to_gene = region_annotation_cache[hdf5_positions_only_hashkey] 184 | h5logger.debug('Annotation from cache.') 185 | else: 186 | hdf5_stub_bed = pybedtools.BedTool('\n'.join(hdf5_positions_only), from_string=True) 187 | 188 | annotated_bed = hdf5_stub_bed.closest(annotationBedTool, t='first') 189 | 190 | for locus in annotated_bed: 191 | region_to_gene[locus.chrom + ':' + str(locus.start) + '-' + str(locus.end)] = locus.fields[11].split('"')[1] 192 | 193 | region_annotation_cache[hdf5_positions_only_hashkey] = region_to_gene 194 | h5logger.debug('Annotation completed.') 195 | 196 | 197 | # We're going to aggressively access the hdf5 at a bunch of fixed offsets. 198 | # rowarray = [counts[number_of_regions*0 + i], counts[number_of_regions*1 + i], counts[number_of_regions*2 + i] ...] 199 | 200 | number_of_files = len(h5_object.root.files) 201 | working_deque = deque(maxlen=number_of_files + 1) 202 | 203 | # Here, we loop over every "region"/locus (every entry in the first column of the .tsv) 204 | # And then (within this loop) jump to each individual "file" (the hdf5 can contain multiple 205 | # separate samples) to build the data for every row. 206 | for region_number in range(0, number_of_regions): 207 | # Prefix the row with chrN:bpSTART-bpEND e.g. chr4:100-2000 208 | locus_name = counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4]) 209 | 210 | # Sanity checking, in case the input is nuts 211 | feature_width = counts[region_number][4] - counts[region_number][3] 212 | assert(feature_width > 0) 213 | 214 | # DESeq2 requires each region have a unique name. 215 | # You can either append a unique value, or aggregate identical loci. 216 | # We address this later by re-opening and aggregating. 
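                    # (Illustration with a hypothetical gene name: two distinct peaks can
                    # both be nearest to 'Actb', producing two 'Actb' rows here; --flatten
                    # later sums such duplicate rows, since DESeq2/edgeR need unique names.)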
217 | if annotating_regions: 218 | working_deque.append(region_to_gene[locus_name]) 219 | else: 220 | working_deque.append(locus_name) 221 | #rowarray = [counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])] 222 | 223 | for file_number in range(0, number_of_files): 224 | if normalized: 225 | # Standard normalized (counts/mreads) 226 | # bamliquidator gives us (counts/mreads)/width so we multiply by width 227 | working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * feature_width)) 228 | elif density: 229 | # (counts/mreads)/width 230 | # We upscale the fractional normalized count values by an arbitrary amount, 231 | # because subsequent analyses like integers. 232 | working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * 10000)) 233 | elif sizescaled: 234 | # counts/width 235 | # We upscale the fractional normalized count values by an arbitrary amount, 236 | # because subsequent analyses like integers. 237 | working_deque.append(int(counts[number_of_regions * file_number + region_number][6] / feature_width * 100)) 238 | else: 239 | working_deque.append(int(counts[number_of_regions * file_number + region_number][6])) 240 | 241 | tsvwriter.writerow(working_deque) 242 | 243 | if flatten: 244 | flatten_tsv(output_filename) 245 | 246 | h5logger.info('Completed.') 247 | 248 | 249 | def main(): 250 | # Parse & interpret command line flags. 251 | parser = argparse.ArgumentParser(description='Convert hdf5 tables from bamliquidator format to CSV counts tables ' 252 | 'for use in R and elsewhere. (Necessary as rhdf5 doesn\'t support our data structure.)', 253 | epilog="Written by Nick Semenkovich for the Gordon Lab at " 254 | "Washington University in St. Louis: http://gordonlab.wustl.edu.", 255 | usage='%(prog)s [options]', 256 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 257 | 258 | parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str, 259 | help='Input path with .h5 files.', 260 | required=True) 261 | 262 | parser.add_argument("--overwrite", dest="overwrite", default=False, action='store_true', 263 | help='Regenerate and overwrite output .tsv files, even if they already exist.') 264 | 265 | parser.add_argument('--call-genes', dest="call_genes", default=False, action='store_true', 266 | help='Instead of a .tsv (with positions as keys), make a .annotated.tsv with nearby genes.') 267 | 268 | parser.add_argument('--normalized', dest="normalized", default=False, action='store_true', 269 | help='Store the normalized counts (counts/total reads) instead of the raw read counts.') 270 | 271 | parser.add_argument('--density', dest="density", default=False, action='store_true', 272 | help='Store the width-normalized density (counts/total reads/region size) instead of the raw read counts.') 273 | 274 | parser.add_argument('--sizescaled', dest="sizescaled", default=False, action='store_true', 275 | help='Store the size scaled counts (counts/feature size) instead of the raw read counts.') 276 | 277 | # Useful for EdgeR/DESeq2, etc. where every locus/position/gene-name must be unique. 278 | parser.add_argument('--flatten', dest="flatten", default=False, action='store_true', 279 | help='Aggregate identical locus IDs and sum their values. 
' 280 | 'Think carefully before you sum non-normalized values!') 281 | 282 | 283 | genome_choices = sorted(CONFIG['gffs'].keys()) 284 | parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, default=None, 285 | choices=genome_choices, help='Genome to use for annotation, one of: %s' % (', '.join(genome_choices)), required=False) 286 | 287 | 288 | parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true') 289 | 290 | parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true', 291 | help="Do not create a log file.") 292 | 293 | args = parser.parse_args() 294 | 295 | if args.call_genes and not args.genome: 296 | parser.error('--genome is required when requesting --call-genes') 297 | 298 | assert((args.density + args.normalized + args.sizescaled) <= 1) 299 | 300 | annotationBedTool = None 301 | if args.call_genes: 302 | genome_gff = CONFIG['gffs'][args.genome] 303 | assert(os.access(genome_gff, os.R_OK)) 304 | annotationBedTool = pybedtools.BedTool(genome_gff) 305 | 306 | # Output path is input path. This also checks that the path is writeable. 307 | output_path = _script_helpers.setup_output_path(args.input_path) 308 | 309 | _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path) 310 | 311 | 312 | input_files = get_input_files(args.input_path) 313 | 314 | parse_h5files(input_files, 315 | annotationBedTool=annotationBedTool, 316 | overwrite=args.overwrite, 317 | flatten=args.flatten, 318 | density=args.density, 319 | normalized=args.normalized, 320 | sizescaled=args.sizescaled) 321 | 322 | 323 | 324 | if __name__ == '__main__': 325 | main() 326 | -------------------------------------------------------------------------------- /refseq/.gitignore: -------------------------------------------------------------------------------- 1 | !*.gz -------------------------------------------------------------------------------- /refseq/UPDATED-06-11-2014: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/UPDATED-06-11-2014 -------------------------------------------------------------------------------- /refseq/hg18.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg18.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/hg19.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg19.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/hg38.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/hg38.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/mm10.ucsc.RefSeq.refGene.tsv.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/mm10.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /refseq/mm9.ucsc.RefSeq.refGene.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GordonLab/riesling-pipeline/384f41dc964db0f59b3992f775e87c651e846f2b/refseq/mm9.ucsc.RefSeq.refGene.tsv.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.22 2 | gffutils>=0.8.3 3 | MACS2>=2.1.0.20140616 4 | MACS==1.4.3 5 | matplotlib>=1.4.3 6 | metaseq>=0.5.5.4 7 | numexpr>=2.4 8 | numpy>=1.9.2 9 | pandas>=0.16.0 10 | PyYaml>=3.11 11 | pybedtools>=0.6.9 12 | seaborn==0.5.1 13 | tables>=3.1.1 14 | -------------------------------------------------------------------------------- /statistics.py: -------------------------------------------------------------------------------- 1 | ## Module statistics.py 2 | ## 3 | ## Copyright (c) 2013 Steven D'Aprano . 4 | ## 5 | ## Licensed under the Apache License, Version 2.0 (the "License"); 6 | ## you may not use this file except in compliance with the License. 7 | ## You may obtain a copy of the License at 8 | ## 9 | ## http://www.apache.org/licenses/LICENSE-2.0 10 | ## 11 | ## Unless required by applicable law or agreed to in writing, software 12 | ## distributed under the License is distributed on an "AS IS" BASIS, 13 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ## See the License for the specific language governing permissions and 15 | ## limitations under the License. 16 | 17 | 18 | """ 19 | Basic statistics module. 20 | 21 | This module provides functions for calculating statistics of data, including 22 | averages, variance, and standard deviation. 23 | 24 | Calculating averages 25 | -------------------- 26 | 27 | ================== ============================================= 28 | Function Description 29 | ================== ============================================= 30 | mean Arithmetic mean (average) of data. 31 | median Median (middle value) of data. 32 | median_low Low median of data. 33 | median_high High median of data. 34 | median_grouped Median, or 50th percentile, of grouped data. 35 | mode Mode (most common value) of data. 36 | ================== ============================================= 37 | 38 | Calculate the arithmetic mean ("the average") of data: 39 | 40 | >>> mean([-1.0, 2.5, 3.25, 5.75]) 41 | 2.625 42 | 43 | 44 | Calculate the standard median of discrete data: 45 | 46 | >>> median([2, 3, 4, 5]) 47 | 3.5 48 | 49 | 50 | Calculate the median, or 50th percentile, of data grouped into class intervals 51 | centred on the data values provided. E.g. if your data points are rounded to 52 | the nearest whole number: 53 | 54 | >>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS 55 | 2.8333333333... 56 | 57 | This should be interpreted in this way: you have two data points in the class 58 | interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in 59 | the class interval 3.5-4.5. The median of these data points is 2.8333... 
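
(Equivalently, using the interpolation formula from ``median_grouped`` below,
L + interval*(n/2 - cf)/f: here the median interval is 2.5-3.5, so L = 2.5,
n = 6, cf = 2 and f = 3, giving 2.5 + (3 - 2)/3 = 2.8333...)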
60 | 61 | 62 | Calculating variability or spread 63 | --------------------------------- 64 | 65 | ================== ============================================= 66 | Function Description 67 | ================== ============================================= 68 | pvariance Population variance of data. 69 | variance Sample variance of data. 70 | pstdev Population standard deviation of data. 71 | stdev Sample standard deviation of data. 72 | ================== ============================================= 73 | 74 | Calculate the standard deviation of sample data: 75 | 76 | >>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS 77 | 4.38961843444... 78 | 79 | If you have previously calculated the mean, you can pass it as the optional 80 | second argument to the four "spread" functions to avoid recalculating it: 81 | 82 | >>> data = [1, 2, 2, 4, 4, 4, 5, 6] 83 | >>> mu = mean(data) 84 | >>> pvariance(data, mu) 85 | 2.5 86 | 87 | 88 | Exceptions 89 | ---------- 90 | 91 | A single exception is defined: StatisticsError is a subclass of ValueError. 92 | 93 | """ 94 | 95 | __all__ = [ 'StatisticsError', 96 | 'pstdev', 'pvariance', 'stdev', 'variance', 97 | 'median', 'median_low', 'median_high', 'median_grouped', 98 | 'mean', 'mode', 99 | ] 100 | 101 | 102 | import collections 103 | import math 104 | 105 | from fractions import Fraction 106 | from decimal import Decimal 107 | 108 | 109 | # === Exceptions === 110 | 111 | class StatisticsError(ValueError): 112 | pass 113 | 114 | 115 | # === Private utilities === 116 | 117 | def _sum(data, start=0): 118 | """_sum(data [, start]) -> value 119 | 120 | Return a high-precision sum of the given numeric data. If optional 121 | argument ``start`` is given, it is added to the total. If ``data`` is 122 | empty, ``start`` (defaulting to 0) is returned. 123 | 124 | 125 | Examples 126 | -------- 127 | 128 | >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75) 129 | 11.0 130 | 131 | Some sources of round-off error will be avoided: 132 | 133 | >>> _sum([1e50, 1, -1e50] * 1000) # Built-in sum returns zero. 134 | 1000.0 135 | 136 | Fractions and Decimals are also supported: 137 | 138 | >>> from fractions import Fraction as F 139 | >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)]) 140 | Fraction(63, 20) 141 | 142 | >>> from decimal import Decimal as D 143 | >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")] 144 | >>> _sum(data) 145 | Decimal('0.6963') 146 | 147 | Mixed types are currently treated as an error, except that int is 148 | allowed. 149 | """ 150 | # We fail as soon as we reach a value that is not an int or the type of 151 | # the first value which is not an int. E.g. _sum([int, int, float, int]) 152 | # is okay, but sum([int, int, float, Fraction]) is not. 153 | allowed_types = set([int, type(start)]) 154 | n, d = _exact_ratio(start) 155 | partials = {d: n} # map {denominator: sum of numerators} 156 | # Micro-optimizations. 157 | exact_ratio = _exact_ratio 158 | partials_get = partials.get 159 | # Add numerators for each denominator. 160 | for x in data: 161 | _check_type(type(x), allowed_types) 162 | n, d = exact_ratio(x) 163 | partials[d] = partials_get(d, 0) + n 164 | # Find the expected result type. If allowed_types has only one item, it 165 | # will be int; if it has two, use the one which isn't int. 
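    # For example, _sum([0.5, 0.25, 1]) accumulates partials == {1: 1, 2: 1, 4: 1}
    # (i.e. 1 + 1/2 + 1/4); the exact total, Fraction(7, 4), is only converted to
    # the result type (here float: 1.75) at the very end.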
166 | assert len(allowed_types) in (1, 2) 167 | if len(allowed_types) == 1: 168 | assert allowed_types.pop() is int 169 | T = int 170 | else: 171 | T = (allowed_types - set([int])).pop() 172 | if None in partials: 173 | assert issubclass(T, (float, Decimal)) 174 | assert not math.isfinite(partials[None]) 175 | return T(partials[None]) 176 | total = Fraction() 177 | for d, n in sorted(partials.items()): 178 | total += Fraction(n, d) 179 | if issubclass(T, int): 180 | assert total.denominator == 1 181 | return T(total.numerator) 182 | if issubclass(T, Decimal): 183 | return T(total.numerator)/total.denominator 184 | return T(total) 185 | 186 | 187 | def _check_type(T, allowed): 188 | if T not in allowed: 189 | if len(allowed) == 1: 190 | allowed.add(T) 191 | else: 192 | types = ', '.join([t.__name__ for t in allowed] + [T.__name__]) 193 | raise TypeError("unsupported mixed types: %s" % types) 194 | 195 | 196 | def _exact_ratio(x): 197 | """Convert Real number x exactly to (numerator, denominator) pair. 198 | 199 | >>> _exact_ratio(0.25) 200 | (1, 4) 201 | 202 | x is expected to be an int, Fraction, Decimal or float. 203 | """ 204 | try: 205 | try: 206 | # int, Fraction 207 | return (x.numerator, x.denominator) 208 | except AttributeError: 209 | # float 210 | try: 211 | return x.as_integer_ratio() 212 | except AttributeError: 213 | # Decimal 214 | try: 215 | return _decimal_to_ratio(x) 216 | except AttributeError: 217 | msg = "can't convert type '{}' to numerator/denominator" 218 | raise TypeError(msg.format(type(x).__name__)) 219 | except (OverflowError, ValueError): 220 | # INF or NAN 221 | if __debug__: 222 | # Decimal signalling NANs cannot be converted to float :-( 223 | if isinstance(x, Decimal): 224 | assert not x.is_finite() 225 | else: 226 | assert not math.isfinite(x) 227 | return (x, None) 228 | 229 | 230 | # FIXME This is faster than Fraction.from_decimal, but still too slow. 231 | def _decimal_to_ratio(d): 232 | """Convert Decimal d to exact integer ratio (numerator, denominator). 233 | 234 | >>> from decimal import Decimal 235 | >>> _decimal_to_ratio(Decimal("2.6")) 236 | (26, 10) 237 | 238 | """ 239 | sign, digits, exp = d.as_tuple() 240 | if exp in ('F', 'n', 'N'): # INF, NAN, sNAN 241 | assert not d.is_finite() 242 | raise ValueError 243 | num = 0 244 | for digit in digits: 245 | num = num*10 + digit 246 | if exp < 0: 247 | den = 10**-exp 248 | else: 249 | num *= 10**exp 250 | den = 1 251 | if sign: 252 | num = -num 253 | return (num, den) 254 | 255 | 256 | def _counts(data): 257 | # Generate a table of sorted (value, frequency) pairs. 258 | table = collections.Counter(iter(data)).most_common() 259 | if not table: 260 | return table 261 | # Extract the values with the highest frequency. 262 | maxfreq = table[0][1] 263 | for i in range(1, len(table)): 264 | if table[i][1] != maxfreq: 265 | table = table[:i] 266 | break 267 | return table 268 | 269 | 270 | # === Measures of central tendency (averages) === 271 | 272 | def mean(data): 273 | """Return the sample arithmetic mean of data. 274 | 275 | >>> mean([1, 2, 3, 4, 4]) 276 | 2.8 277 | 278 | >>> from fractions import Fraction as F 279 | >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)]) 280 | Fraction(13, 21) 281 | 282 | >>> from decimal import Decimal as D 283 | >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")]) 284 | Decimal('0.5625') 285 | 286 | If ``data`` is empty, StatisticsError will be raised. 
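
    ``data`` may also be a one-shot iterator; it is materialised into a list
    before use:

    >>> mean(iter([1, 2, 3, 4, 4]))
    2.8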
287 | """ 288 | if iter(data) is data: 289 | data = list(data) 290 | n = len(data) 291 | if n < 1: 292 | raise StatisticsError('mean requires at least one data point') 293 | return _sum(data)/n 294 | 295 | 296 | # FIXME: investigate ways to calculate medians without sorting? Quickselect? 297 | def median(data): 298 | """Return the median (middle value) of numeric data. 299 | 300 | When the number of data points is odd, return the middle data point. 301 | When the number of data points is even, the median is interpolated by 302 | taking the average of the two middle values: 303 | 304 | >>> median([1, 3, 5]) 305 | 3 306 | >>> median([1, 3, 5, 7]) 307 | 4.0 308 | 309 | """ 310 | data = sorted(data) 311 | n = len(data) 312 | if n == 0: 313 | raise StatisticsError("no median for empty data") 314 | if n%2 == 1: 315 | return data[n//2] 316 | else: 317 | i = n//2 318 | return (data[i - 1] + data[i])/2 319 | 320 | 321 | def median_low(data): 322 | """Return the low median of numeric data. 323 | 324 | When the number of data points is odd, the middle value is returned. 325 | When it is even, the smaller of the two middle values is returned. 326 | 327 | >>> median_low([1, 3, 5]) 328 | 3 329 | >>> median_low([1, 3, 5, 7]) 330 | 3 331 | 332 | """ 333 | data = sorted(data) 334 | n = len(data) 335 | if n == 0: 336 | raise StatisticsError("no median for empty data") 337 | if n%2 == 1: 338 | return data[n//2] 339 | else: 340 | return data[n//2 - 1] 341 | 342 | 343 | def median_high(data): 344 | """Return the high median of data. 345 | 346 | When the number of data points is odd, the middle value is returned. 347 | When it is even, the larger of the two middle values is returned. 348 | 349 | >>> median_high([1, 3, 5]) 350 | 3 351 | >>> median_high([1, 3, 5, 7]) 352 | 5 353 | 354 | """ 355 | data = sorted(data) 356 | n = len(data) 357 | if n == 0: 358 | raise StatisticsError("no median for empty data") 359 | return data[n//2] 360 | 361 | 362 | def median_grouped(data, interval=1): 363 | """"Return the 50th percentile (median) of grouped continuous data. 364 | 365 | >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5]) 366 | 3.7 367 | >>> median_grouped([52, 52, 53, 54]) 368 | 52.5 369 | 370 | This calculates the median as the 50th percentile, and should be 371 | used when your data is continuous and grouped. In the above example, 372 | the values 1, 2, 3, etc. actually represent the midpoint of classes 373 | 0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in 374 | class 3.5-4.5, and interpolation is used to estimate it. 375 | 376 | Optional argument ``interval`` represents the class interval, and 377 | defaults to 1. Changing the class interval naturally will change the 378 | interpolated 50th percentile value: 379 | 380 | >>> median_grouped([1, 3, 3, 5, 7], interval=1) 381 | 3.25 382 | >>> median_grouped([1, 3, 3, 5, 7], interval=2) 383 | 3.5 384 | 385 | This function does not check whether the data points are at least 386 | ``interval`` apart. 387 | """ 388 | data = sorted(data) 389 | n = len(data) 390 | if n == 0: 391 | raise StatisticsError("no median for empty data") 392 | elif n == 1: 393 | return data[0] 394 | # Find the value at the midpoint. Remember this corresponds to the 395 | # centre of the class interval. 396 | x = data[n//2] 397 | for obj in (x, interval): 398 | if isinstance(obj, (str, bytes)): 399 | raise TypeError('expected number but got %r' % obj) 400 | try: 401 | L = x - interval/2 # The lower limit of the median interval. 402 | except TypeError: 403 | # Mixed type. 
For now we just coerce to float. 404 | L = float(x) - float(interval)/2 405 | cf = data.index(x) # Number of values below the median interval. 406 | # FIXME The following line could be more efficient for big lists. 407 | f = data.count(x) # Number of data points in the median interval. 408 | return L + interval*(n/2 - cf)/f 409 | 410 | 411 | def mode(data): 412 | """Return the most common data point from discrete or nominal data. 413 | 414 | ``mode`` assumes discrete data, and returns a single value. This is the 415 | standard treatment of the mode as commonly taught in schools: 416 | 417 | >>> mode([1, 1, 2, 3, 3, 3, 3, 4]) 418 | 3 419 | 420 | This also works with nominal (non-numeric) data: 421 | 422 | >>> mode(["red", "blue", "blue", "red", "green", "red", "red"]) 423 | 'red' 424 | 425 | If there is not exactly one most common value, ``mode`` will raise 426 | StatisticsError. 427 | """ 428 | # Generate a table of sorted (value, frequency) pairs. 429 | table = _counts(data) 430 | if len(table) == 1: 431 | return table[0][0] 432 | elif table: 433 | raise StatisticsError( 434 | 'no unique mode; found %d equally common values' % len(table) 435 | ) 436 | else: 437 | raise StatisticsError('no mode for empty data') 438 | 439 | 440 | # === Measures of spread === 441 | 442 | # See http://mathworld.wolfram.com/Variance.html 443 | # http://mathworld.wolfram.com/SampleVariance.html 444 | # http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 445 | # 446 | # Under no circumstances use the so-called "computational formula for 447 | # variance", as that is only suitable for hand calculations with a small 448 | # amount of low-precision data. It has terrible numeric properties. 449 | # 450 | # See a comparison of three computational methods here: 451 | # http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ 452 | 453 | def _ss(data, c=None): 454 | """Return sum of square deviations of sequence data. 455 | 456 | If ``c`` is None, the mean is calculated in one pass, and the deviations 457 | from the mean are calculated in a second pass. Otherwise, deviations are 458 | calculated from ``c`` as given. Use the second case with care, as it can 459 | lead to garbage results. 460 | """ 461 | if c is None: 462 | c = mean(data) 463 | ss = _sum((x-c)**2 for x in data) 464 | # The following sum should mathematically equal zero, but due to rounding 465 | # error may not. 466 | ss -= _sum((x-c) for x in data)**2/len(data) 467 | assert not ss < 0, 'negative sum of square deviations: %f' % ss 468 | return ss 469 | 470 | 471 | def variance(data, xbar=None): 472 | """Return the sample variance of data. 473 | 474 | data should be an iterable of Real-valued numbers, with at least two 475 | values. The optional argument xbar, if given, should be the mean of 476 | the data. If it is missing or None, the mean is automatically calculated. 477 | 478 | Use this function when your data is a sample from a population. To 479 | calculate the variance from the entire population, see ``pvariance``. 480 | 481 | Examples: 482 | 483 | >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5] 484 | >>> variance(data) 485 | 1.3720238095238095 486 | 487 | If you have already calculated the mean of your data, you can pass it as 488 | the optional second argument ``xbar`` to avoid recalculating it: 489 | 490 | >>> m = mean(data) 491 | >>> variance(data, m) 492 | 1.3720238095238095 493 | 494 | This function does not check that ``xbar`` is actually the mean of 495 | ``data``. 
Giving arbitrary values for ``xbar`` may lead to invalid or 496 | impossible results. 497 | 498 | Decimals and Fractions are supported: 499 | 500 | >>> from decimal import Decimal as D 501 | >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 502 | Decimal('31.01875') 503 | 504 | >>> from fractions import Fraction as F 505 | >>> variance([F(1, 6), F(1, 2), F(5, 3)]) 506 | Fraction(67, 108) 507 | 508 | """ 509 | if iter(data) is data: 510 | data = list(data) 511 | n = len(data) 512 | if n < 2: 513 | raise StatisticsError('variance requires at least two data points') 514 | ss = _ss(data, xbar) 515 | return ss/(n-1) 516 | 517 | 518 | def pvariance(data, mu=None): 519 | """Return the population variance of ``data``. 520 | 521 | data should be an iterable of Real-valued numbers, with at least one 522 | value. The optional argument mu, if given, should be the mean of 523 | the data. If it is missing or None, the mean is automatically calculated. 524 | 525 | Use this function to calculate the variance from the entire population. 526 | To estimate the variance from a sample, the ``variance`` function is 527 | usually a better choice. 528 | 529 | Examples: 530 | 531 | >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25] 532 | >>> pvariance(data) 533 | 1.25 534 | 535 | If you have already calculated the mean of the data, you can pass it as 536 | the optional second argument to avoid recalculating it: 537 | 538 | >>> mu = mean(data) 539 | >>> pvariance(data, mu) 540 | 1.25 541 | 542 | This function does not check that ``mu`` is actually the mean of ``data``. 543 | Giving arbitrary values for ``mu`` may lead to invalid or impossible 544 | results. 545 | 546 | Decimals and Fractions are supported: 547 | 548 | >>> from decimal import Decimal as D 549 | >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")]) 550 | Decimal('24.815') 551 | 552 | >>> from fractions import Fraction as F 553 | >>> pvariance([F(1, 4), F(5, 4), F(1, 2)]) 554 | Fraction(13, 72) 555 | 556 | """ 557 | if iter(data) is data: 558 | data = list(data) 559 | n = len(data) 560 | if n < 1: 561 | raise StatisticsError('pvariance requires at least one data point') 562 | ss = _ss(data, mu) 563 | return ss/n 564 | 565 | 566 | def stdev(data, xbar=None): 567 | """Return the square root of the sample variance. 568 | 569 | See ``variance`` for arguments and other details. 570 | 571 | >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 572 | 1.0810874155219827 573 | 574 | """ 575 | var = variance(data, xbar) 576 | try: 577 | return var.sqrt() 578 | except AttributeError: 579 | return math.sqrt(var) 580 | 581 | 582 | def pstdev(data, mu=None): 583 | """Return the square root of the population variance. 584 | 585 | See ``pvariance`` for arguments and other details. 586 | 587 | >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75]) 588 | 0.986893273527251 589 | 590 | """ 591 | var = pvariance(data, mu) 592 | try: 593 | return var.sqrt() 594 | except AttributeError: 595 | return math.sqrt(var) 596 | --------------------------------------------------------------------------------