├── .gitignore ├── GenomeAnalysisTK ├── LICENSE.txt ├── MarkDuplicates ├── PicardMerge ├── README.markdown ├── SortSam ├── count_flagstat_wgs.py ├── example_data ├── input_data_wgs │ ├── NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz │ └── README └── output_wgs │ ├── README │ └── fastq_symlinks │ └── README ├── input_fastq.py ├── pipeline.py ├── pipeline_dev_config.py ├── pipeline_stages_config.py └── vcftools_prepare.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /GenomeAnalysisTK: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $GATK_HOME/GenomeAnalysisTK.jar "$@" 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Bernard Pope, Clare Sloggett, Gayle Philip 4 | Wakefield. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
11 | -------------------------------------------------------------------------------- /MarkDuplicates: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/MarkDuplicates.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /PicardMerge: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/MergeSamFiles.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | 2 | # Variant calling pipeline: exome and WGS 3 | 4 | ## Overview 5 | 6 | This is a basic variant-calling and annotation pipeline developed at the 7 | Victorian Life Sciences Computation Initiative (VLSCI), University of Melbourne. 8 | 9 | It is based around BWA, GATK and ENSEMBL and was originally designed for human (or similar) data. The master branch is configured for WGS data; there is an exome branch configured for variant calling in exome data. 10 | 11 | To run the pipeline you will need Rubra: [https://github.com/bjpop/rubra](https://github.com/bjpop/rubra). Rubra uses the python Ruffus library: [http://www.ruffus.org.uk/](http://www.ruffus.org.uk/). 12 | 13 | Usage: 14 | 15 | rubra pipeline.py --config --style {print,run,touchfiles,flowchart} 16 | 17 | More command-line options are described in the Rubra documentation. 18 | 19 | Specifically, to use the provided config files, you might call 20 | 21 | rubra pipeline.py --config pipeline_dev_config.py pipeline_stages_config.py --style print 22 | 23 | If you use the provided config files, you should make sure you understand the analysis steps and that they are appropriate for your project. 24 | 25 | If you use this code or Rubra itself in your research, please cite the poster at http://figshare.com/articles/Rubra_flexible_distributed_pipelines/895626 like so: 26 | 27 | Sloggett, Clare; Wakefield, Matthew; Philip, Gayle; Pope, Bernard (2014): 28 | Rubra - flexible distributed pipelines. figshare. 29 | http://dx.doi.org/10.6084/m9.figshare.895626 30 | 31 | ## Running on VLSCI's clusters (e.g. merri) 32 | 33 | On merri we have a version of Rubra installed into Python 2.7.5, which you can load with 34 | 35 | module load python-gcc/2.7.5 36 | 37 | To use the flowchart option you will need graphviz, which you can load with 38 | 39 | module load graphviz 40 | -------------------------------------------------------------------------------- /SortSam: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/SortSam.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /count_flagstat_wgs.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Read all output flagstat files and store values in a text table. 4 | Creates two tables in the output directory: readcounts.txt and readcounts_fractions.txt, which contains useful human-readable percentage statistics. 5 | Each row will be a sample. 
6 | Values to store (columns) are: 7 | Total reads from SAMPLE.bam.flagstat 8 | Mapped reads from SAMPLE.bam.flagstat 9 | Mapped reads from SAMPLE.dedup.bam.flagstat 10 | There is an assumption that the bams were mapped with something like bwa, which includes unmapped reads. 11 | 12 | Usage: python count_flagstat_wgs.py flagstat_directory output_directory 13 | """ 14 | 15 | import sys 16 | import os 17 | import re 18 | import optparse 19 | from collections import defaultdict 20 | 21 | class FlagstatParseException (Exception): 22 | pass 23 | 24 | def read_flagstat(filename): 25 | """ 26 | Given a filename, parse the flagstat values and return as a hash. 27 | Relying on flagstat contents being in usual order. 28 | """ 29 | numbers = re.compile(r'^(\d+)\s+\+\s+(\d+)\s+') 30 | values = {} 31 | f = open(filename) 32 | for field in ['total', 33 | 'duplicates', 34 | 'mapped', 35 | 'paired', 36 | 'read1', 37 | 'read2', 38 | 'properly_paired', 39 | 'both_mapped', 40 | 'singletons', 41 | 'mate_distant', 42 | 'mate_distant_goodqual']: 43 | line = f.readline().strip() 44 | match = numbers.match(line) 45 | if not match: 46 | raise FlagstatParseException 47 | values[field] = int(match.group(1)) 48 | values[field+'_QCfailed'] = int(match.group(2)) 49 | f.close() 50 | return values 51 | 52 | # ----- 53 | 54 | # Get arguments and input filenames 55 | parser = optparse.OptionParser(usage=__doc__) 56 | #parser.add_option() 57 | (options, args) = parser.parse_args() 58 | if len(args) != 2: 59 | parser.error("Wrong number of arguments - see usage info") 60 | in_dir = args[0] 61 | out_dir = args[1] 62 | if not (os.path.exists(out_dir) and os.path.isdir(out_dir)): 63 | sys.exit("There does not seem to be a directory %s , exiting" % out_dir) 64 | 65 | filenames = os.listdir(in_dir) 66 | 67 | #print ', '.join(filenames) 68 | 69 | alignedname = re.compile('^([^_\.]+).bam.flagstat') 70 | dedupname = re.compile('^([^_\.]+).dedup.bam.flagstat') 71 | 72 | samples = defaultdict(dict) 73 | for filename in filenames: 74 | if dedupname.match(filename): 75 | match = dedupname.match(filename) 76 | name = match.group(1) 77 | values = read_flagstat( os.path.join(in_dir, filename) ) 78 | samples[name]['deduped'] = values['mapped'] 79 | elif alignedname.match(filename): 80 | match = alignedname.match(filename) 81 | name = match.group(1) 82 | values = read_flagstat( os.path.join(in_dir, filename) ) 83 | samples[name]['mapped'] = values['mapped'] 84 | samples[name]['total'] = values['total'] 85 | 86 | #print ', '.join(samples.keys()) 87 | 88 | tablefile = os.path.join(out_dir, "readcounts.txt") 89 | tablefile_plus = os.path.join(out_dir, "readcounts_fractions.txt") 90 | 91 | OUT_TABLE = open(tablefile, 'w') 92 | OUT_TABLEPLUS = open(tablefile_plus, 'w') 93 | 94 | OUT_TABLE.write("Sample\tTotal\tMapped\tDeduped\n") 95 | OUT_TABLEPLUS.write("Sample\tTotal\tMapped\t%\tDeduped\t%\n") 96 | 97 | for sample in sorted(samples.keys()): 98 | values = samples[sample] 99 | fraction_mapped = float(values['mapped'])/values['total'] 100 | fraction_deduped = float(values['deduped'])/values['mapped'] 101 | 102 | OUT_TABLE.write( "%s\t%d\t%d\t%d\n" % (sample, values['total'], values['mapped'], values['deduped']) ) 103 | 104 | OUT_TABLEPLUS.write( "%s\t%d\t%d\t%.3f\t%d\t%.3f\n" % (sample, values['total'], values['mapped'], fraction_mapped, values['deduped'], fraction_deduped) ) 105 | 106 | OUT_TABLE.close() 107 | OUT_TABLEPLUS.close() 108 | -------------------------------------------------------------------------------- 
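For reference, count_flagstat_wgs.py expects its input directory to contain flagstat files named SAMPLE.bam.flagstat and SAMPLE.dedup.bam.flagstat, which the pipeline's countMergedBam and countDedupedBam stages produce. A minimal sketch of running it by hand, assuming samtools is on the PATH and that NA12878wgs.bam and NA12878wgs.dedup.bam are existing merged and de-duplicated bams (illustrative names only):

    mkdir -p flagstats results
    samtools flagstat NA12878wgs.bam > flagstats/NA12878wgs.bam.flagstat
    samtools flagstat NA12878wgs.dedup.bam > flagstats/NA12878wgs.dedup.bam.flagstat
    python count_flagstat_wgs.py flagstats results
    # results/readcounts.txt and results/readcounts_fractions.txt now contain one row for NA12878wgs

Note that the sample name is taken from everything before the first "." or "_" in the flagstat filename, so sample names containing dots or underscores will not be matched by the script's regexes.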
/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/README: -------------------------------------------------------------------------------- 1 | This directory contains 
test data which can be used to run the WGS(master) version of the pipeline. 2 | It is fastq data from the 1000 genomes project, individual NA12878, roughly restricted to a small genomic region (chromosome 17, 39700000-39800000). 3 | -------------------------------------------------------------------------------- /example_data/output_wgs/README: -------------------------------------------------------------------------------- 1 | This directory should contain the output created by running the WGS version of the pipeline on the WGS test data. 2 | The output directory needs to exist before the pipeline is invoked. 3 | This file mainly exists to make sure git will recognise this directory as version-controlled. 4 | -------------------------------------------------------------------------------- /example_data/output_wgs/fastq_symlinks/README: -------------------------------------------------------------------------------- 1 | This directory should contain the symlinks to the raw input fastq files, created by invoking the WGS version of the pipeline on the WGS test data. 2 | This directory currently needs to exist before the pipeline is invoked. 3 | This file mainly exists to make sure git will recognise this directory as version-controlled. 4 | -------------------------------------------------------------------------------- /input_fastq.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ 4 | Functions to parse directories of input fastq files, create symlinks with expected filename structures, and return metadata. 5 | 6 | Clare Sloggett, VLSCI 7 | """ 8 | 9 | import sys 10 | import re 11 | import os.path 12 | from collections import defaultdict 13 | from rubra.utils import (mkLink) 14 | 15 | def parse_and_link(file, symlink_dir, metadata_dict): 16 | """ 17 | Parse metadata out of input filename and construct symlink. 18 | Takes a fastq filename, destination directory, and a metadata dict, which should be of type defaultdict(dict). 19 | Parse the filename to get information on the sample name, run, read #, etc. 20 | Medadata is added to the provided metadata_dict. 21 | Some metadata is used to build symlinks, to guarantee filename uniqueness and a regular naming structure.\ 22 | Currently parsing by assuming AGRF naming structure and paired-end reads 23 | Currently will ONLY handle gzipped files, to avoid multiple links to the same data. 24 | """ 25 | match_old = re.match(r".*?/([^_/]+)_([a-zA-Z0-9-.]+)_s_([0-9]+)_(1|2)_sequence.txt.gz",file) 26 | match_new = re.match(r".*?/([a-zA-Z0-9-.]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2).fastq.gz",file) 27 | if match_old: 28 | run_id = match_old.group(1) 29 | sample = match_old.group(2) 30 | lane = int(match_old.group(3)) 31 | pair = match_old.group(4) 32 | encoding = 'I' 33 | elif match_new: 34 | run_id = match_new.group(2) 35 | sample = match_new.group(1) 36 | lane = int(match_new.group(3)) 37 | pair = match_new.group(4) 38 | encoding = 'S' 39 | else: 40 | print "Unable to parse name of fastq file %s ." 
% file 41 | sys.exit(1) 42 | newfile = os.path.join(symlink_dir, "%s_%s_L%d_%s.fastq.gz" % 43 | (sample, run_id, lane, pair)) 44 | metadata_dict[os.path.basename(newfile)]['sample'] = sample 45 | metadata_dict[os.path.basename(newfile)]['run_id'] = run_id 46 | metadata_dict[os.path.basename(newfile)]['lane'] = lane 47 | metadata_dict[os.path.basename(newfile)]['pair'] = pair 48 | metadata_dict[os.path.basename(newfile)]['encoding'] = encoding 49 | relative_sourcefile = os.path.relpath(file, symlink_dir) 50 | mkLink(relative_sourcefile, newfile) 51 | return newfile 52 | 53 | -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ 4 | GATK-based variant-calling pipeline, WGS version. 5 | 6 | Authors: Bernie Pope, Clare Sloggett, Gayle Philip. 7 | Thanks to Dmitri Mouradov and Maria Doyle for input on the initial 8 | analysis design. 9 | Thanks to Matt Wakefield for contributions to Rubra 10 | (https://github.com/bjpop/rubra) during pipeline development. 11 | 12 | Description: 13 | 14 | This program implements a workflow pipeline for next generation 15 | sequencing variant detection using the Broad Institute's GATK for 16 | variant calling and using ENSEMBL for basic annotation. 17 | 18 | It uses Rubra (https://github.com/bjpop/rubra) based on the 19 | Ruffus library. 20 | 21 | It supports parallel evaluation of independent pipeline stages, 22 | and can run stages on a cluster environment. 23 | 24 | The pipeline is configured by an options file in a python file, 25 | including the actual commands which are run at each stage. 26 | """ 27 | 28 | 29 | import sys 30 | import re 31 | import os.path 32 | import os 33 | from collections import defaultdict 34 | from glob import * 35 | import shutil 36 | from ruffus import * 37 | from rubra.utils import pipeline_options 38 | from rubra.utils import (runStageCheck, mkLogFile, mkDir, mkForceLink) 39 | from input_fastq import parse_and_link 40 | 41 | def make_metadata_string(metadata): 42 | return r'-R"@RG\tID:%s\tSM:%s\tPL:%s"' % (metadata['ID'], metadata['SM'], metadata['PL']) 43 | 44 | # Shorthand access to options 45 | ref_files = pipeline_options.ref_files 46 | working_files = pipeline_options.working_files 47 | logDir = pipeline_options.pipeline['logDir'] 48 | 49 | # Data setup process and input organisation and metadata functions 50 | 51 | #Metadata holding structures 52 | fastq_metadata = defaultdict(dict) 53 | 54 | original_fastq_files = [] 55 | for fastq_dir in working_files['fastq_dirs']: 56 | original_fastq_files += glob(os.path.join(fastq_dir, '*.fastq.gz')) 57 | 58 | if len(original_fastq_files)==0: 59 | print "No input files found. Do the filenames follow the naming convention?" 
60 | print "Directories searched:" 61 | print "\n".join(working_files['fastq_dirs']) 62 | sys.exit(1) 63 | 64 | # Parse metadata out of input file names and construct symlinks 65 | # Metadata is put into a dict (for the rest of ruffus) and some of it also into symlinks (for filename uniqueness) 66 | # currently parsing by assuming AGRF naming structure and paired-end reads 67 | mkDir(working_files['fastq_symlink_dir']) 68 | all_fastq_files = [] 69 | for file in original_fastq_files: 70 | symlink = parse_and_link(file, working_files['fastq_symlink_dir'], fastq_metadata) 71 | all_fastq_files.append(symlink) 72 | 73 | # Make a list of files we will actually use 74 | if pipeline_options.pipeline['restrict_samples']: 75 | allowed_samples = set(pipeline_options.pipeline['allowed_samples']) 76 | fastq_files = [file for file in sorted(all_fastq_files) 77 | if (fastq_metadata[os.path.basename(file)]['sample'] in allowed_samples)] 78 | else: 79 | fastq_files = sorted(all_fastq_files) 80 | 81 | print "Symlinked files that will be used:" 82 | for file in fastq_files: 83 | print file 84 | print 85 | print "Output dir is %s" % working_files['output_dir'] 86 | print "Log dir is %s" % logDir 87 | print 88 | 89 | # Create output subdirectories 90 | 91 | output_dir = working_files['output_dir'] 92 | 93 | fastqc_dir = os.path.join(output_dir, "FastQC") 94 | mkDir(fastqc_dir) 95 | 96 | sambam_dir = os.path.join(output_dir, "alignments") 97 | mkDir(sambam_dir) 98 | 99 | variant_dir = os.path.join(output_dir, "variant_calls") 100 | mkDir(variant_dir) 101 | 102 | coverage_dir = os.path.join(output_dir, "coverage") 103 | mkDir(coverage_dir) 104 | 105 | ensembl_dir = os.path.join(output_dir, "ensembl") 106 | mkDir(ensembl_dir) 107 | 108 | # directory for final summary tables 109 | results_dir = os.path.join(output_dir, "results") 110 | mkDir(results_dir) 111 | 112 | # Pipeline declarations 113 | 114 | # Alignment and correction steps 115 | 116 | @transform(fastq_files, regex('(.+\/)?(.+?)\.fastq\.gz'), 117 | [r'%s/\2_fastqc' % fastqc_dir, r'%s/\2.fastqc.Success' % fastqc_dir]) 118 | def fastqc(inputs, outputs): 119 | """ 120 | Run FastQC on each fastq file. 121 | """ 122 | sequence = inputs 123 | fastqc_dest, flagFile = outputs 124 | runStageCheck('fastqc', flagFile, fastqc_dir, sequence) 125 | 126 | @collate(fastq_files, regex(r".*?([^/]+)(_1|_2)\.fastq.gz"), 127 | [r"%s/\1.sam" % sambam_dir, r"%s/\1.bwaPE.Success" % sambam_dir]) 128 | def bwaPE(inputs, outputs): 129 | """ 130 | Aligns two paired-end fastq files to a reference genome to produce a sam file. 131 | """ 132 | seq1, seq2 = sorted(inputs) 133 | output, flag_file = outputs 134 | fastq_name = os.path.basename(seq1) 135 | sample = fastq_metadata[fastq_name]['sample'] 136 | runID = fastq_metadata[fastq_name]['run_id'] 137 | lane = fastq_metadata[fastq_name]['lane'] 138 | readgroup_metadata = { 'PL': 'ILLUMINA', 139 | 'SM': sample, 140 | 'ID': "%s_%s_Lane%d" % (sample, runID, lane) } 141 | metadata_str = make_metadata_string(readgroup_metadata) 142 | print "bwa-mem on %s and %s" % (os.path.basename(seq1), os.path.basename(seq2)) 143 | runStageCheck('bwaMemPE', flag_file, metadata_str, ref_files['bwa_reference'], seq1, seq2, output) 144 | 145 | @transform(bwaPE, suffix(".sam"), 146 | [".bam", ".samToBam.Success"]) 147 | def samToBam(inputs, outputs): 148 | """ 149 | Convert sam to bam and sort, using Picard. 
150 | """ 151 | output, flag_file = outputs 152 | sam, _success = inputs 153 | print "converting to sorted bam: %s" % os.path.basename(sam) 154 | runStageCheck('samToSortedBam', flag_file, sam, output) 155 | 156 | @collate(samToBam, regex(r'(.*?)([^/_]+)_([^/_]+_[^/_]+)\.bam'), 157 | [r"\1\2.bam", r'\1\2.mergeBams.Success']) 158 | def mergeBams(inputs, outputs): 159 | """ 160 | Merge the sorted bams together for each sample. 161 | Picard should cope correctly if there is only one input. 162 | """ 163 | bams = [bam for [bam, _success] in inputs] 164 | output, flag_file = outputs 165 | baminputs = ' '.join(["INPUT=%s" % bam for bam in bams]) 166 | print "merging %s into %s" % (",".join([os.path.basename(bam) for bam in bams]), os.path.basename(output)) 167 | runStageCheck('mergeBams', flag_file, baminputs, output) 168 | 169 | @follows('indexMergedBams') 170 | @transform(mergeBams, suffix('.bam'), 171 | ['.dedup.bam', '.bam.dedup.Success']) 172 | def dedup(inputs, outputs): 173 | """ 174 | Remove apparent duplicates from merged bams using Picard MarkDuplicates. 175 | """ 176 | input_bam, _success = inputs 177 | output_bam, flag_file = outputs 178 | logFile = mkLogFile(logDir, input_bam, '.dedup.log') 179 | print "de-duping %s" % os.path.basename(input_bam) 180 | runStageCheck('dedup', flag_file, input_bam, logFile, output_bam) 181 | 182 | @follows('indexDedupedBams') 183 | @transform(dedup, suffix('.bam'), 184 | ['.realigner.intervals', '.bam.realignIntervals.Success']) 185 | def realignIntervals(inputs, outputs): 186 | """ 187 | Run GATK RealignTargetCreator to find suspect intervals for realignment. 188 | """ 189 | bam, _success = inputs 190 | output_intervals, flag_file = outputs 191 | logFile = mkLogFile(logDir, bam, '.realignIntervals.log') 192 | print "calculating realignment intervals for %s" % os.path.basename(bam) 193 | runStageCheck('realignIntervals', flag_file, ref_files['fasta_reference'], bam, ref_files['indels_realign_goldstandard'], ref_files['indels_realign_1000G'], logFile, output_intervals) 194 | 195 | def remove_GATK_bai(bamfile): 196 | """ 197 | A bug in some versions of GATK cause it to create an x.bai file, and this gets in the way of using the properly named x.bam.bai file. If the given file exists, delete it. 198 | """ 199 | bad_bai = os.path.splitext(bamfile)[0] + ".bai" 200 | try: 201 | os.remove(bad_bai) 202 | except OSError, e: 203 | # Ignore error only if it is OSError #2, ie File Not Found 204 | if e.errno != 2: 205 | raise e 206 | 207 | @transform(realignIntervals, regex(r"(.*?)([^/]+)\.realigner\.intervals"), 208 | add_inputs([r'\1\2.bam']), 209 | [r'\1\2.realigned.bam', r'\1\2.bam.realign.Success']) 210 | def realign(inputs, outputs): 211 | """ 212 | Run GATK IndelRealigner for local realignment, using intervals found by realignIntervals. 213 | """ 214 | [intervals, _success], [input_bam] = inputs 215 | output_bam, flag_file = outputs 216 | logFile = mkLogFile(logDir, input_bam, '.realign.log') 217 | print "realigning %s" % os.path.basename(input_bam) 218 | runStageCheck('realign', flag_file, ref_files['fasta_reference'], input_bam, intervals, logFile, output_bam) 219 | remove_GATK_bai(output_bam) 220 | 221 | @follows('indexRealignedBams') 222 | @transform(realign, suffix('.bam'), 223 | ['.recal_data.csv', '.baseQualRecalCount.Success']) 224 | def baseQualRecalCount(inputs, outputs): 225 | """ 226 | GATK CountCovariates, first step of base quality score recalibration. 
227 | """ 228 | bam, _success = inputs 229 | output_csv, flag_file = outputs 230 | logFile = mkLogFile(logDir, bam, '.baseQualRecalCount.log') 231 | print "count covariates using GATK for base quality score recalibration: %s" % os.path.basename(bam) 232 | runStageCheck('baseQualRecalCount', flag_file, bam, ref_files['fasta_reference'], ref_files['dbsnp'], logFile, output_csv) 233 | 234 | @transform(baseQualRecalCount, regex(r'(.*?)([^/]+)\.recal_data\.csv'), 235 | add_inputs([r'\1\2.bam']), 236 | [r'\1\2.recal.bam', r'\1\2.baseQualRecalTabulate.Success']) 237 | def baseQualRecalTabulate(inputs, outputs): 238 | """ 239 | GATK TableRecalibration: recalibrate base quality scores using the output of CountCovariates. 240 | """ 241 | [input_csv, _success], [input_bam] = inputs 242 | output_bam, flag_file = outputs 243 | logFile = mkLogFile(logDir, input_bam, '.baseQualRecalTabulate.log') 244 | print "recalibrate base quality scores using GATK on %s" % os.path.basename(input_bam) 245 | runStageCheck('baseQualRecalTabulate', flag_file, input_bam, ref_files['fasta_reference'], input_csv, logFile, output_bam) 246 | remove_GATK_bai(output_bam) 247 | 248 | # Temporarily putting this indexing step here to work around bug 249 | @transform(baseQualRecalTabulate, suffix('.bam'), 250 | ['.bam.bai', '.bam.indexRecalibratedBams.Success']) 251 | def indexRecalibratedBams(inputs, outputs): 252 | """ 253 | Index the recalibrated bams using samtools. 254 | """ 255 | bam, _success = inputs 256 | output, flag_file = outputs 257 | print "samtools index on %s" % os.path.basename(bam) 258 | runStageCheck('indexBam', flag_file, bam) 259 | 260 | # Variant calling steps 261 | 262 | @follows(indexRecalibratedBams) 263 | @transform(baseQualRecalTabulate, 264 | regex(r'(.*?)([^/]+)\.recal\.bam'), 265 | [r'%s/\2.SNP.vcf' % variant_dir, 266 | r'%s/\2.SNP.vcf.idx' % variant_dir, 267 | r'%s/\2.callSNPs.Success' % variant_dir]) 268 | def callSNPs(inputs, outputs): 269 | """ 270 | Use GATK UnifiedGenotyper to call SNPs from recalibrated bams. 271 | """ 272 | bam, _success = inputs 273 | output_vcf, _idx, flag_file = outputs 274 | logFile = mkLogFile(logDir, bam, '.callSNPs.log') 275 | print "calling SNPs from %s" % bam 276 | runStageCheck('callSNPs', flag_file, ref_files['fasta_reference'], bam, ref_files['dbsnp'], logFile, output_vcf) 277 | 278 | @follows(indexRecalibratedBams) 279 | @transform(baseQualRecalTabulate, 280 | regex(r'(.*?)([^/]+)\.recal\.bam'), 281 | [r'%s/\2.INDEL.vcf' % variant_dir, 282 | r'%s/\2.INDEL.vcf.idx' % variant_dir, 283 | r'%s/\2.callIndels.Success' % variant_dir]) 284 | def callIndels(inputs, outputs): 285 | """ 286 | Use GATK UnifiedGenotyper to call indels from recalibrated bams. 287 | """ 288 | bam, _success = inputs 289 | output_vcf, _idx, flag_file = outputs 290 | logFile = mkLogFile(logDir, bam, '.callIndels.log') 291 | print "calling Indels from %s" % bam 292 | runStageCheck('callIndels', flag_file, ref_files['fasta_reference'], bam, ref_files['dbsnp'], logFile, output_vcf) 293 | 294 | @transform(callSNPs, suffix('.SNP.vcf'), 295 | ['.SNP.filtered.vcf', '.SNP.filtered.vcf.idx', '.filterSNPs.Success']) 296 | def filterSNPs(inputs, outputs): 297 | """ 298 | Use GATK VariantFiltration to filter raw SNP calls. 
299 | """ 300 | input_vcf, _idx, _success = inputs 301 | output_vcf, _idxout, flag_file = outputs 302 | logFile = mkLogFile(logDir, input_vcf, '.filterSNPs.log') 303 | print "filtering SNPs from %s" % input_vcf 304 | runStageCheck('filterSNPs', flag_file, ref_files['fasta_reference'], input_vcf, logFile, output_vcf) 305 | 306 | @transform(callIndels, suffix('.INDEL.vcf'), 307 | ['.INDEL.filtered.vcf', '.INDEL.filtered.vcf.idx', '.filterIndels.Success']) 308 | def filterIndels(inputs, outputs): 309 | """ 310 | Use GATK VariantFiltration to filter raw INDEL calls. 311 | """ 312 | input_vcf, _idx, _success = inputs 313 | output_vcf, _idxout, flag_file = outputs 314 | logFile = mkLogFile(logDir, input_vcf, '.filterIndels.log') 315 | print "filtering indels from %s" % input_vcf 316 | runStageCheck('filterIndels', flag_file, ref_files['fasta_reference'], input_vcf, logFile, output_vcf) 317 | 318 | 319 | @transform([filterSNPs, filterIndels], regex(r'.*?([^/]+)\.vcf'), 320 | [r'%s/\1.ensembl.vcf' % ensembl_dir,r'%s/\1.getEnsemblAnnotations.Success' % ensembl_dir]) 321 | def getEnsemblAnnotations(inputs, outputs): 322 | """ 323 | Annotate vcf using ENSEMBL variant effect predictor. 324 | """ 325 | vcf, _idx, _success = inputs 326 | output, flag_file = outputs 327 | logFile = mkLogFile(logDir, vcf, '.EnsemblAnnotation.log') 328 | print "Annotating %s with ENSEMBL variant effect predictor" % os.path.basename(vcf) 329 | runStageCheck('annotateEnsembl', flag_file, vcf, output, logFile) 330 | 331 | 332 | # Indexing steps 333 | 334 | @transform(mergeBams, suffix('.bam'), 335 | ['.bam.bai', '.bam.indexMergedBams.Success']) 336 | def indexMergedBams(inputs, outputs): 337 | """ 338 | Index the merged bams using samtools. 339 | """ 340 | bam, _success = inputs 341 | output, flag_file = outputs 342 | print "samtools index on %s" % os.path.basename(bam) 343 | runStageCheck('indexBam', flag_file, bam) 344 | 345 | @transform(dedup, suffix('.bam'), 346 | ['.bam.bai', '.bam.indexDedupedBams.Success']) 347 | def indexDedupedBams(inputs, outputs): 348 | """ 349 | Index the de-duplicated bams using samtools. Note that this actually goes from the fixMate-ed bams. 350 | """ 351 | bam, _success = inputs 352 | output, flag_file = outputs 353 | print "samtools index on %s" % os.path.basename(bam) 354 | runStageCheck('indexBam', flag_file, bam) 355 | 356 | @transform(realign, suffix('.bam'), 357 | ['.bam.bai', '.bam.indexRealignedBams.Success']) 358 | def indexRealignedBams(inputs, outputs): 359 | """ 360 | Index the locally realigned bams using samtools. 361 | """ 362 | bam, _success = inputs 363 | output, flag_file = outputs 364 | print "samtools index on %s" % os.path.basename(bam) 365 | runStageCheck('indexBam', flag_file, bam) 366 | 367 | 368 | 369 | @transform(mergeBams, suffix('.bam'), 370 | ['.bam.tdf', '.bam.igvcountMergedBams.Success']) 371 | def igvcountMergedBams(inputs, outputs): 372 | """ 373 | Use igvtools count to create a .tdf file for the merged bam files, to improve viewing of the bam coverage in igv. 374 | """ 375 | bam, _success = inputs 376 | outfile, flag_file = outputs 377 | print "igvtools count on %s" % os.path.basename(bam) 378 | runStageCheck('igvcount', flag_file, bam, outfile) 379 | 380 | @transform(realign, suffix('.bam'), 381 | ['.bam.tdf', '.bam.igvcountRealignedBams.Success']) 382 | def igvcountRealignedBams(inputs, outputs): 383 | """ 384 | Use igvtools count to create a .tdf file for the merged bam files, to improve viewing of the bam coverage in igv. 
385 | """ 386 | bam, _success = inputs 387 | outfile, flag_file = outputs 388 | print "igvtools count on %s" % os.path.basename(bam) 389 | runStageCheck('igvcount', flag_file, bam, outfile) 390 | 391 | @transform(dedup, suffix('.bam'), 392 | ['.bam.tdf', '.bam.igvcountDedupedBams.Success']) 393 | def igvcountDedupedBams(inputs, outputs): 394 | """ 395 | Use igvtools count to create a .tdf file for the deduped bam files, to improve viewing of the bam coverage in igv. Note that this actually goes from the fixMate-ed bams. 396 | """ 397 | bam, _success = inputs 398 | outfile, flag_file = outputs 399 | print "igvtools count on %s" % os.path.basename(bam) 400 | runStageCheck('igvcount', flag_file, bam, outfile) 401 | 402 | @transform(baseQualRecalTabulate, suffix('.bam'), 403 | ['.bam.tdf', '.bam.igvcountRecalibratedBams.Success']) 404 | def igvcountRecalibratedBams(inputs, outputs): 405 | """ 406 | Use igvtools count to create a .tdf file for the recalibrated bam files, to improve viewing of the bam coverage in igv. 407 | """ 408 | bam, _success = inputs 409 | outfile, flag_file = outputs 410 | print "igvtools count on %s" % os.path.basename(bam) 411 | runStageCheck('igvcount', flag_file, bam, outfile) 412 | 413 | @transform(filterSNPs, suffix('.vcf'), 414 | ['.vcf.gz', '.vcf.gz.tbi', '.vcfindexSNPs.Success']) 415 | def vcfIndexSNPs(inputs, outputs): 416 | """ 417 | Use bgzip and tabix to prepare raw SNPs vcf for vcftools handling. 418 | """ 419 | vcf, _idx, _success = inputs 420 | zipfile, tabix_index, flag_file = outputs 421 | print "bgzip and tabix (for vcftools) on %s" % vcf 422 | runStageCheck('indexVCF', flag_file, vcf) 423 | 424 | @transform(filterIndels, suffix('.vcf'), 425 | ['.vcf.gz', '.vcf.gz.tbi', '.vcfindexIndels.Success']) 426 | def vcfIndexIndels(inputs, outputs): 427 | """ 428 | Use bgzip and tabix to prepare raw indels vcf for vcftools handling. 429 | """ 430 | vcf, _idx, _success = inputs 431 | zipfile, tabix_index, flag_file = outputs 432 | print "bgzip and tabix (for vcftools) on %s" % vcf 433 | runStageCheck('indexVCF', flag_file, vcf) 434 | 435 | 436 | # Coverage steps 437 | 438 | @follows(indexMergedBams) 439 | @transform(mergeBams, 440 | regex(r'(.*?)([^/]+)\.bam'), 441 | [r'%s/\2.early.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 442 | r'%s/\2.early.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 443 | r'%s/\2.early.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 444 | r'%s/\2.early.DepthOfCoverage.sample_interval_summary' % coverage_dir, 445 | r'%s/\2.early.DepthOfCoverage.sample_statistics' % coverage_dir, 446 | r'%s/\2.early.DepthOfCoverage.sample_summary' % coverage_dir, 447 | r'%s/\2.earlyDepthOfCoverage.Success' % coverage_dir]) 448 | def earlyDepthOfCoverage(inputs, outputs): 449 | """ 450 | Use GATK DepthOfCoverage to get a first pass at coverage statistics, after merging bams. 
451 | """ 452 | bam, _success = inputs 453 | flag_file = outputs[-1] 454 | output_example = outputs[0] 455 | output_base = os.path.splitext(output_example)[0] 456 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 457 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 458 | 459 | @follows(indexDedupedBams) 460 | @transform(dedup, 461 | regex(r'(.*?)([^/]+)\.dedup\.bam'), 462 | [r'%s/\2.deduped.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 463 | r'%s/\2.deduped.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 464 | r'%s/\2.deduped.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 465 | r'%s/\2.deduped.DepthOfCoverage.sample_interval_summary' % coverage_dir, 466 | r'%s/\2.deduped.DepthOfCoverage.sample_statistics' % coverage_dir, 467 | r'%s/\2.deduped.DepthOfCoverage.sample_summary' % coverage_dir, 468 | r'%s/\2.dedupedDepthOfCoverage.Success' % coverage_dir]) 469 | def dedupedDepthOfCoverage(inputs, outputs): 470 | """ 471 | Use GATK DepthOfCoverage to get a coverage statistics as soon as duplicates are removed. 472 | """ 473 | bam, _success = inputs 474 | flag_file = outputs[-1] 475 | output_example = outputs[0] 476 | output_base = os.path.splitext(output_example)[0] 477 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 478 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 479 | 480 | @follows(indexRecalibratedBams) 481 | @transform(baseQualRecalTabulate, 482 | regex(r'(.*?)([^/]+)\.recal\.bam'), 483 | [r'%s/\2.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 484 | r'%s/\2.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 485 | r'%s/\2.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 486 | r'%s/\2.DepthOfCoverage.sample_interval_summary' % coverage_dir, 487 | r'%s/\2.DepthOfCoverage.sample_statistics' % coverage_dir, 488 | r'%s/\2.DepthOfCoverage.sample_summary' % coverage_dir, 489 | r'%s/\2.depthOfCoverage.Success' % coverage_dir]) 490 | def finalDepthOfCoverage(inputs, outputs): 491 | """ 492 | Use GATK DepthOfCoverage to get coverage statistics. 493 | """ 494 | bam, _success = inputs 495 | flag_file = outputs[-1] 496 | output_example = outputs[0] 497 | output_base = os.path.splitext(output_example)[0] 498 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 499 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 500 | 501 | 502 | # Read-counting steps 503 | 504 | @transform(samToBam, suffix('.bam'), 505 | ['.bam.flagstat', '.bam.countRunBam.Success']) 506 | def countRunBam(inputs, outputs): 507 | """ 508 | Run samtools flagstat on the initial per-lane, per-run bam file. 509 | """ 510 | bam, _success = inputs 511 | output, flag_file = outputs 512 | print "Running samtools flagstat on %s" % bam 513 | runStageCheck('flagstat', flag_file, bam, output) 514 | 515 | @transform(mergeBams, suffix('.bam'), 516 | ['.bam.flagstat', '.bam.countRunBam.Success']) 517 | def countMergedBam(inputs, outputs): 518 | """ 519 | Run samtools flagstat on the merged bam file. 
520 | """ 521 | bam, _success = inputs 522 | output, flag_file = outputs 523 | print "Running samtools flagstat on %s" % bam 524 | runStageCheck('flagstat', flag_file, bam, output) 525 | 526 | @transform(realign, suffix('.bam'), 527 | ['.bam.flagstat', '.bam.countRealignedBam.Success']) 528 | def countRealignedBam(inputs, outputs): 529 | """ 530 | Run samtools flagstat on the realigned bam file. 531 | """ 532 | bam, _success = inputs 533 | output, flag_file = outputs 534 | print "Running samtools flagstat on %s" % bam 535 | runStageCheck('flagstat', flag_file, bam, output) 536 | 537 | @transform(dedup, suffix('.bam'), 538 | ['.bam.flagstat', '.bam.countDedupedBam.Success']) 539 | def countDedupedBam(inputs, outputs): 540 | """ 541 | Run samtools flagstat on the deduped bam file. 542 | """ 543 | bam, _success = inputs 544 | output, flag_file = outputs 545 | print "Running samtools flagstat on %s" % bam 546 | runStageCheck('flagstat', flag_file, bam, output) 547 | 548 | 549 | # Data collation and plotting steps 550 | 551 | @merge([countDedupedBam, countMergedBam], 552 | ["%s/readcounts.txt" % results_dir, "%s/readcounts_fractions.txt" % results_dir, "%s/collateReadcounts.Success" % results_dir]) 553 | def collateReadCounts(inputs, outputs): 554 | """ 555 | Collate read counts from samtools flagstat output into a table. 556 | """ 557 | # Note expected input and output directories are effectively hard-coded 558 | in_dir = sambam_dir 559 | out_dir = results_dir 560 | flag_file = outputs[-1] 561 | print "Collating read counts" 562 | runStageCheck('collateReadcounts', flag_file, in_dir, out_dir) 563 | 564 | 565 | -------------------------------------------------------------------------------- /pipeline_dev_config.py: -------------------------------------------------------------------------------- 1 | 2 | # This section is used by the variant calling pipeline.py to specify input data and 3 | # working directories. 4 | # 5 | # Note that if you have downloaded the pipeline the directory names below are examples 6 | # only and you will need to edit them to suit your needs. 7 | # 8 | # Required variables: 9 | # - fastq_dirs: a list of directories where the raw input data is found. Currently this 10 | # data is expected to be paired-end gzipped fastq and to follow a specific naming 11 | # convention (see below). 12 | # - fastq_symlink_dir: symlinks to all raw fastq files will be written to this directory 13 | # and used by the rest of the pipeline. These symlinks have standardised names and 14 | # are a useful flattened summary of all known input data. 15 | # - output_dir: the directory used by the pipeline for output and intermediate files. 16 | # A directory structure will be created under this directory by pipeline.py. 17 | # 18 | # Input data naming convention: 19 | # The input fastq files must follow a naming convention so that the pipeline can determine 20 | # the metadata fields. This convention in the default script is to use the regex 21 | # ([a-zA-Z0-9-.]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2).fastq.gz 22 | # This corresponds to metadata fields 23 | # SAMPLE_RUN_TAG_LANE_READPAIR.fastq.gz 24 | # where 25 | # SAMPLE is a unique identifier for the sample sequenced 26 | # RUN is a unique identifier for the experiment (e.g.
run or flowcell ID) 27 | # TAG is the barcode sequence used for multiplexing (NA if none) 28 | # LANE is the flowcell lane identifier, written like L001 29 | # READPAIR identifies whether the file contains forward or reverse reads, R1 or R2 30 | # 31 | # For example: Sample395_C0WK7ACXX_ACTTGA_L007_R1.fastq.gz 32 | # 33 | # This file naming convention follows that returned by many sequencing centres for 34 | # Illumina data. 35 | # 36 | working_files = { 37 | 'fastq_dirs': [ 38 | './example_data/input_data_wgs' 39 | ], 40 | 'fastq_symlink_dir': './example_data/output_wgs/fastq_symlinks', 41 | 'output_dir': './example_data/output_wgs' 42 | } 43 | 44 | # This section is used by the variant calling pipeline.py to specify reference data files. 45 | # 46 | # Note that if you have downloaded the pipeline the filenames below are examples only and 47 | # you will need to get the relevant reference files for your data. Exactly which files 48 | # you need depend on your data. At time of writing reference data can be obtained from: 49 | # - Reference genome: many sources depending on data. For our human data we used the 50 | # 1000 genomes version of the b37 (hg19) genome build, found at 51 | # ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz 52 | # Note that your genome must use the same chromosome naming convention as any other 53 | # reference files (such as dbSNP); if you use hg19 (chr1,chr2) instead of b37 (1,2) 54 | # you may need to convert the files suggested below. 55 | # - dbSNP variants: dbSNP is at http://www.ncbi.nlm.nih.gov/projects/SNP/ 56 | # A useful release summary is at http://www.ncbi.nlm.nih.gov/projects/SNP/snp_summary.cgi 57 | # We used human variants which were obtained in VCF format from 58 | # ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz 59 | # - known indels for local realignment: these follow the Broad recommendations for the 60 | # GATK tool suite and come from the GATK resource bundle. See 61 | # http://gatkforums.broadinstitute.org/discussion/1213/what-s-in-the-resource-bundle-and-how-can-i-get-it 62 | # 63 | # Expected variables (if you use the relevant pipeline steps): 64 | # - fasta_reference: the reference genome fasta. Should be in the same location as the 65 | # .fai files produced by samtools faidx. 66 | # TODO: do this indexing as part of the pipeline and check for index files. 67 | # - bwa_reference: the reference genome fasta. Should be in the same location as the 68 | # index files produced by bwa index. 69 | # TODO: do this indexing as part of the pipeline and check for index files. 70 | # - dbsnp: the dbSNP variants file in VCF format, for annotating variants and for 71 | # GATK base quality recalibration. 72 | # - indels_realign_goldstandard and 73 | # - indels_realign_1000G: files of known indels for use in GATK local realignment. 74 | # Currently the Broad Institute recommends using these two files (see above). 
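# As a rough sketch only (these commands are not run by the pipeline, and the
# fasta name is just the example path used below), the index files expected to
# sit alongside the references can be generated with:
#   samtools faidx human_g1k_v37.fasta     # .fai index for fasta_reference
#   bwa index human_g1k_v37.fasta          # BWT index files for bwa_reference
# The GATK steps also expect a sequence dictionary (human_g1k_v37.dict) next to
# the fasta, which can be created with Picard's CreateSequenceDictionary.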
75 | ref_files = { 76 | 'fasta_reference': '/vlsci/VR0002/shared/Reference_Files/Indexed_Ref_Genomes/bwa_0.7.5_Indexed/human_g1k_v37.fasta', 77 | 'bwa_reference': '/vlsci/VR0002/shared/Reference_Files/Indexed_Ref_Genomes/bwa_0.7.5_Indexed/human_g1k_v37.fasta', 78 | 79 | 'dbsnp': '/vlsci/VR0002/shared/Reference_Files/SNP_db/dbSNP137.vcf', 80 | 81 | 'indels_realign_goldstandard': '/vlsci/VR0002/shared/Reference_Files/Indels_for_realignment/Mills_and_1000G_gold_standard.indels.b37.vcf', 82 | 'indels_realign_1000G': '/vlsci/VR0002/shared/Reference_Files/Indels_for_realignment/1000G_phase1.indels.b37.vcf' 83 | } 84 | 85 | # pipeline should hold configuration options for Rubra and for the pipeline. 86 | # This section is required for every Rubra pipeline, 87 | # but restrict_samples and allowed_samples are specific to the variant-calling pipeline. 88 | # 89 | # Rubra variables: 90 | # - logDir: the directory where batch queue scripts, stdout and stderr dumps are stored. 91 | # - logFile: the file used to log all jobs that are run. 92 | # - style: the default style, one of 'flowchart', 'print', 'run', 'touchfiles'. Can be 93 | # overridden by specifying --style on the command line. 94 | # - procs: the number of python processes to run simultaneously. This determines the 95 | # maximum parallelism of the pipeline. For distributed jobs it also constrains the 96 | # maximum total jobs submitted to the queue at any one time. 97 | # - verbose: one of 0 (quiet), 1 (normal), 2 (chatty). Can be overridden by specifying 98 | # --verbose on the command line. 99 | # - end: the desired tasks to be run. Rubra will also run all tasks which are dependencies 100 | # of these tasks. Can be overridden by specifying --end on the command line. 101 | # - force: tasks which will be forced to run, regardless of timestamps. Can be overridden 102 | # by supplying --force on the command line. 103 | # - rebuild: one of 'fromstart','fromend'. Whether to calculate which dependencies will 104 | # be rerun by working back from an end task to the latest up-to-date task, or forward 105 | # from the earliest out-of-date task. 'fromstart' is the most conservative and 106 | # commonly used as it brings all intermediate tasks up to date. 107 | # 108 | # Variant-calling pipeline variables: (TODO: move to a separate section) 109 | # - restrict_samples: whether to restrict input files to those specified by allowed_samples 110 | # - allowed_samples: sample names that will be run if restrict_samples is True 111 | pipeline = { 112 | 'logDir': 'log_example_wgs', 113 | 'logFile': 'pipeline.log', 114 | 'style': 'print', 115 | 'procs': 30, 116 | 'verbose': 1, 117 | 'end': ['earlyDepthOfCoverage', 'finalDepthOfCoverage', 118 | 'fastqc', 119 | 'igvcountMergedBams', 'countDedupedBam', 'countRunBam', 'countMergedBam', 120 | 'getEnsemblAnnotations', 121 | 'collateReadCounts', 122 | 'vcfIndexSNPs', 'vcfIndexIndels' 123 | ], 124 | 'force': [], 125 | 'rebuild' : "fromstart", 126 | 127 | 'restrict_samples': False, 128 | 'allowed_samples': [] 129 | } 130 | -------------------------------------------------------------------------------- /pipeline_stages_config.py: -------------------------------------------------------------------------------- 1 | 2 | # stageDefaults contains the default options which are applied to each stage (command). 3 | # This section is required for every Rubra pipeline. 4 | # These can be overridden by options defined for individual stages, below.
5 | # Stage options which Rubra will recognise are: 6 | # - distributed: a boolean determining whether the task should be submitted to a cluster 7 | # job scheduling system (True) or run on the system local to Rubra (False). 8 | # - walltime: for a distributed PBS job, gives the walltime requested from the job 9 | # queue system; the maximum allowed runtime. For local jobs has no effect. 10 | # - memInGB: for a distributed PBS job, gives the memory in Gigabytes requested from the 11 | # job queue system. For local jobs has no effect. 12 | # - queue: for a distributed PBS job, this is the name of the queue to submit the 13 | # job to. For local jobs has no effect. This is currently a mandatory field for 14 | # distributed jobs, but can be set to None. 15 | # - modules: the modules to be loaded before running the task. This is intended for 16 | # systems with environment modules installed. Rubra will call module load on each 17 | # required module before running the task. Note that defining modules for individual 18 | # stages will override (not add to) any modules listed here. This currently only 19 | # works for distributed jobs. 20 | stageDefaults = { 21 | 'distributed': True, 22 | 'queue': None, 23 | 'walltime': "01:00:00", 24 | 'memInGB': 8, 25 | 'modules': [ 26 | "bwa-intel/0.7.5a", 27 | "samtools-intel/0.1.19", 28 | "picard/1.53", 29 | "python-gcc/2.7.5", 30 | "R-gcc/3.0.2", 31 | "gatk/1.6-7" 32 | ] 33 | } 34 | 35 | # stages should hold the details of each stage which can be called by runStageCheck. 36 | # This section is required for every Rubra pipeline. 37 | # Calling a stage in this way carries out checkpointing and, if desired, batch job 38 | # submission. 39 | # Each stage must contain a 'command' definition. See stageDefaults above for other 40 | # allowable options. 
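# As an illustration of how these command templates are used (a sketch inferred
# from the calls in pipeline.py, not from Rubra documentation): for the
# 'flagstat' stage below, pipeline.py calls
#   runStageCheck('flagstat', flag_file, bam, output)
# and the arguments after the flag file are substituted, in order, for the
# %bam and %out placeholders in "samtools flagstat %bam > %out".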
41 | stages = { 42 | "fastqc": { 43 | "command": "fastqc --quiet -o %outdir %seq", 44 | 'modules': [ "fastqc/0.10.1" ] 45 | }, 46 | 'bwaMemSE': { 47 | 'command': "bwa mem -t 8 %meta %ref %seq > %out", 48 | 'walltime': "3:00:00", 49 | 'queue': 'smp', 50 | 'memInGB': 23 51 | }, 52 | 'bwaMemPE': { 53 | 'command': "bwa mem -t 8 %meta %ref %seq1 %seq2 > %out", 54 | 'walltime': "3:00:00", 55 | 'queue': 'smp', 56 | 'memInGB': 23 57 | }, 58 | 'samToSortedBam': { 59 | 'command': "./SortSam 6 VALIDATION_STRINGENCY=LENIENT INPUT=%seq OUTPUT=%out SORT_ORDER=coordinate", 60 | 'walltime': "5:00:00", 61 | }, 62 | 'mergeBams': { 63 | 'command': "./PicardMerge 6 %baminputs USE_THREADING=true VALIDATION_STRINGENCY=LENIENT AS=true OUTPUT=%out", 64 | 'walltime': "5:00:00" 65 | }, 66 | 'indexBam': { 67 | 'command': "samtools index %bam" 68 | }, 69 | 'flagstat': { 70 | 'command': "samtools flagstat %bam > %out", 71 | 'walltime': "00:10:00" 72 | }, 73 | 'igvcount': { 74 | 'command': "igvtools count %bam %out hg19", 75 | 'modules': [ "igvtools/1.5.15" ] 76 | }, 77 | 'indexVCF': { 78 | 'command': "./vcftools_prepare.sh %vcf", 79 | 'modules': [ "tabix/0.2.5" ] 80 | }, 81 | 'realignIntervals': { 82 | # Hard-coded to take 2 known indels files right now 83 | 'command': "./GenomeAnalysisTK 1 -T RealignerTargetCreator -R %ref -I %bam --known %indels_goldstandard --known %indels_1000G -log %log -o %out", 84 | 'memInGB': 23, 85 | 'walltime': "5:00:00" 86 | }, 87 | 'realign': { 88 | 'command': "./GenomeAnalysisTK 22 -T IndelRealigner -R %ref -I %bam -targetIntervals %intervals -log %log -o %out", 89 | 'memInGB': 23, 90 | 'walltime': "5:00:00" 91 | }, 92 | 'dedup': { 93 | 'command': "./MarkDuplicates 6 INPUT=%bam REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT AS=true METRICS_FILE=%log OUTPUT=%out", 94 | 'walltime': '5:00:00' 95 | }, 96 | 'baseQualRecalCount': { 97 | 'command': "./GenomeAnalysisTK 12 -T CountCovariates -I %bam -R %ref --knownSites %dbsnp -nt 8 -l INFO -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -log %log -recalFile %out", 98 | 'queue': 'smp', 99 | 'memInGB': 23, 100 | 'walltime': "5:00:00" 101 | }, 102 | 'baseQualRecalTabulate': { 103 | 'command': "./GenomeAnalysisTK 4 -T TableRecalibration -I %bam -R %ref -recalFile %csvfile -l INFO -log %log -o %out", 104 | 'walltime': "5:00:00" 105 | }, 106 | 'callSNPs': { 107 | 'command': "./GenomeAnalysisTK 12 -T UnifiedGenotyper -nt 8 -R %ref -I %bam --dbsnp %dbsnp -stand_call_conf 50.0 -stand_emit_conf 10.0 -dcov 1600 -l INFO -A AlleleBalance -A DepthOfCoverage -A FisherStrand -glm SNP -log %log -o %out", 108 | 'queue': 'smp', 109 | 'memInGB': 23, 110 | 'walltime': "3:00:00" 111 | }, 112 | 'callIndels': { 113 | 'command': "./GenomeAnalysisTK 12 -T UnifiedGenotyper -nt 8 -R %ref -I %bam --dbsnp %dbsnp -stand_call_conf 50.0 -stand_emit_conf 10.0 -dcov 1600 -l INFO -A AlleleBalance -A DepthOfCoverage -A FisherStrand -glm INDEL -log %log -o %out", 114 | 'queue': 'smp', 115 | 'memInGB': 23, 116 | 'walltime': "3:00:00" 117 | }, 118 | 'filterSNPs': { 119 | # Very minimal hard filters based on GATK recommendations. VQSR is preferable if possible. 120 | 'command': "./GenomeAnalysisTK 4 -T VariantFiltration -R %ref --variant %vcf --filterExpression 'QD < 2.0 || MQ < 40.0 || FS > 60.0 || HaplotypeScore > 13.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0' --filterName 'GATK_MINIMAL_FILTER' -log %log -o %out", 121 | }, 122 | 'filterIndels': { 123 | # Very minimal hard filters based on GATK recommendations. 
VQSR is preferable if possible. 124 | # If you have 10 or more samples GATK also recommends the filter InbreedingCoeff < -0.8 125 | 'command': "./GenomeAnalysisTK 4 -T VariantFiltration -R %ref --variant %vcf --filterExpression 'QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0' --filterName 'GATK_MINIMAL_FILTER' -log %log -o %out", 126 | }, 127 | 'annotateEnsembl': { 128 | # This command as written assumes that VEP and its cache have been 129 | # downloaded in respective locations 130 | # ./variant_effect_predictor_2.5 131 | # ./variant_effect_predictor_2.5/vep_cache 132 | 'command': "perl variant_effect_predictor_2.5/variant_effect_predictor.pl --cache --dir variant_effect_predictor_2.5/vep_cache -i %vcf --vcf -o %out -species human --canonical --gene --protein --sift=b --polyphen=b > %log", 133 | 'modules': [ "perl/5.10.1", "ensembl/67" ] 134 | }, 135 | 'depthOfCoverage': { 136 | 'command': "./GenomeAnalysisTK 4 -T DepthOfCoverage -R %ref -I %bam -omitBaseOutput -ct 1 -ct 10 -ct 20 -ct 30 -o %out", 137 | }, 138 | 'collateReadcounts': { 139 | 'command': 'python count_flagstat_wgs.py %dir %outdir', 140 | 'walltime': "00:10:00" 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /vcftools_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | bgzip -c "$1" > "$1.gz" 3 | tabix -p vcf "$1.gz" 4 | --------------------------------------------------------------------------------
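The indexVCF stage runs this script as ./vcftools_prepare.sh %vcf, leaving the original VCF in place and writing a bgzipped copy plus a tabix index alongside it. A usage sketch with an illustrative filename (any filtered VCF produced by the pipeline works the same way):

    ./vcftools_prepare.sh NA12878wgs.SNP.filtered.vcf
    # creates NA12878wgs.SNP.filtered.vcf.gz and NA12878wgs.SNP.filtered.vcf.gz.tbi
    tabix NA12878wgs.SNP.filtered.vcf.gz 17:39700000-39800000
    vcftools --gzvcf NA12878wgs.SNP.filtered.vcf.gz --freq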