├── .gitignore ├── GenomeAnalysisTK ├── LICENSE.txt ├── MarkDuplicates ├── PicardMerge ├── README.markdown ├── SortSam ├── count_flagstat_wgs.py ├── example_data ├── input_data_wgs │ ├── NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz │ ├── NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz │ ├── NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz │ └── README └── output_wgs │ ├── README │ └── fastq_symlinks │ └── README ├── input_fastq.py ├── pipeline.py ├── pipeline_dev_config.py ├── pipeline_stages_config.py └── vcftools_prepare.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /GenomeAnalysisTK: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $GATK_HOME/GenomeAnalysisTK.jar "$@" 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Bernard Pope, Clare Sloggett, Gayle Philip 4 | Wakefield. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
11 | -------------------------------------------------------------------------------- /MarkDuplicates: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/MarkDuplicates.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /PicardMerge: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/MergeSamFiles.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | 2 | # Variant calling pipeline: exome and WGS 3 | 4 | ## Overview 5 | 6 | This is a basic variant-calling and annotation pipeline developed at the 7 | Victorian Life Sciences Computation Initiative (VLSCI), University of Melbourne. 8 | 9 | It is based around BWA, GATK and ENSEMBL and was originally designed for human (or similar) data. The master branch is configured for WGS data; there is an exome branch configured for variant calling in exome data. 10 | 11 | To run the pipeline you will need Rubra: [https://github.com/bjpop/rubra](https://github.com/bjpop/rubra). Rubra uses the python Ruffus library: [http://www.ruffus.org.uk/](http://www.ruffus.org.uk/). 12 | 13 | Usage: 14 | 15 | rubra pipeline.py --config --style {print,run,touchfiles,flowchart} 16 | 17 | More command-line options are described in the Rubra documentation. 18 | 19 | Specifically, to use the provided config files, you might call 20 | 21 | rubra pipeline.py --config pipeline_dev_config.py pipeline_stages_config.py --style print 22 | 23 | If you use the provided config files, you should make sure you understand the analysis steps and that they are appropriate for your project. 24 | 25 | If you use this code or Rubra itself in your research, please cite the poster at http://figshare.com/articles/Rubra_flexible_distributed_pipelines/895626 like so: 26 | 27 | Sloggett, Clare; Wakefield, Matthew; Philip, Gayle; Pope, Bernard (2014): 28 | Rubra - flexible distributed pipelines. figshare. 29 | http://dx.doi.org/10.6084/m9.figshare.895626 30 | 31 | ## Running on VLSCI's clusters (e.g. merri) 32 | 33 | On merri we have a version of Rubra installed into Python 2.7.5, which you can load with 34 | 35 | module load python-gcc/2.7.5 36 | 37 | To use the flowchart option you will need graphviz, which you can load with 38 | 39 | module load graphviz 40 | -------------------------------------------------------------------------------- /SortSam: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mem=$1 4 | shift 5 | 6 | java -Xmx${mem}g -Djava.io.tmpdir=$TMPDIR -jar $PICARD_HOME/lib/SortSam.jar TMP_DIR=$TMPDIR $* 7 | -------------------------------------------------------------------------------- /count_flagstat_wgs.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Read all output flagstat files and store values in a text table. 4 | Creates two tables in the output directory: readcounts.txt and readcounts_fractions.txt, which contains useful human-readable percentage statistics. 5 | Each row will be a sample. 
6 | Values to store (columns) are: 7 | Total reads from SAMPLE.bam.flagstat 8 | Mapped reads from SAMPLE.bam.flagstat 9 | Mapped reads from SAMPLE.dedup.bam.flagstat 10 | There is an assumption that the bams were mapped with something like bwa, which includes unmapped reads. 11 | 12 | Usage: python count_flagstat_wgs.py flagstat_directory output_directory 13 | """ 14 | 15 | import sys 16 | import os 17 | import re 18 | import optparse 19 | from collections import defaultdict 20 | 21 | class FlagstatParseException (Exception): 22 | pass 23 | 24 | def read_flagstat(filename): 25 | """ 26 | Given a filename, parse the flagstat values and return as a hash. 27 | Relying on flagstat contents being in usual order. 28 | """ 29 | numbers = re.compile(r'^(\d+)\s+\+\s+(\d+)\s+') 30 | values = {} 31 | f = open(filename) 32 | for field in ['total', 33 | 'duplicates', 34 | 'mapped', 35 | 'paired', 36 | 'read1', 37 | 'read2', 38 | 'properly_paired', 39 | 'both_mapped', 40 | 'singletons', 41 | 'mate_distant', 42 | 'mate_distant_goodqual']: 43 | line = f.readline().strip() 44 | match = numbers.match(line) 45 | if not match: 46 | raise FlagstatParseException 47 | values[field] = int(match.group(1)) 48 | values[field+'_QCfailed'] = int(match.group(2)) 49 | f.close() 50 | return values 51 | 52 | # ----- 53 | 54 | # Get arguments and input filenames 55 | parser = optparse.OptionParser(usage=__doc__) 56 | #parser.add_option() 57 | (options, args) = parser.parse_args() 58 | if len(args) != 2: 59 | parser.error("Wrong number of arguments - see usage info") 60 | in_dir = args[0] 61 | out_dir = args[1] 62 | if not (os.path.exists(out_dir) and os.path.isdir(out_dir)): 63 | sys.exit("There does not seem to be a directory %s , exiting" % out_dir) 64 | 65 | filenames = os.listdir(in_dir) 66 | 67 | #print ', '.join(filenames) 68 | 69 | alignedname = re.compile('^([^_\.]+).bam.flagstat') 70 | dedupname = re.compile('^([^_\.]+).dedup.bam.flagstat') 71 | 72 | samples = defaultdict(dict) 73 | for filename in filenames: 74 | if dedupname.match(filename): 75 | match = dedupname.match(filename) 76 | name = match.group(1) 77 | values = read_flagstat( os.path.join(in_dir, filename) ) 78 | samples[name]['deduped'] = values['mapped'] 79 | elif alignedname.match(filename): 80 | match = alignedname.match(filename) 81 | name = match.group(1) 82 | values = read_flagstat( os.path.join(in_dir, filename) ) 83 | samples[name]['mapped'] = values['mapped'] 84 | samples[name]['total'] = values['total'] 85 | 86 | #print ', '.join(samples.keys()) 87 | 88 | tablefile = os.path.join(out_dir, "readcounts.txt") 89 | tablefile_plus = os.path.join(out_dir, "readcounts_fractions.txt") 90 | 91 | OUT_TABLE = open(tablefile, 'w') 92 | OUT_TABLEPLUS = open(tablefile_plus, 'w') 93 | 94 | OUT_TABLE.write("Sample\tTotal\tMapped\tDeduped\n") 95 | OUT_TABLEPLUS.write("Sample\tTotal\tMapped\t%\tDeduped\t%\n") 96 | 97 | for sample in sorted(samples.keys()): 98 | values = samples[sample] 99 | fraction_mapped = float(values['mapped'])/values['total'] 100 | fraction_deduped = float(values['deduped'])/values['mapped'] 101 | 102 | OUT_TABLE.write( "%s\t%d\t%d\t%d\n" % (sample, values['total'], values['mapped'], values['deduped']) ) 103 | 104 | OUT_TABLEPLUS.write( "%s\t%d\t%d\t%.3f\t%d\t%.3f\n" % (sample, values['total'], values['mapped'], fraction_mapped, values['deduped'], fraction_deduped) ) 105 | 106 | OUT_TABLE.close() 107 | OUT_TABLEPLUS.close() 108 | -------------------------------------------------------------------------------- 
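For reference, count_flagstat_wgs.py expects its input directory to contain flagstat files named SAMPLE.bam.flagstat and SAMPLE.dedup.bam.flagstat, which the pipeline's countMergedBam and countDedupedBam stages produce. A minimal sketch of running it by hand, assuming samtools is on the PATH and that NA12878wgs.bam and NA12878wgs.dedup.bam are existing merged and de-duplicated bams (illustrative names only):

    mkdir -p flagstats results
    samtools flagstat NA12878wgs.bam > flagstats/NA12878wgs.bam.flagstat
    samtools flagstat NA12878wgs.dedup.bam > flagstats/NA12878wgs.dedup.bam.flagstat
    python count_flagstat_wgs.py flagstats results
    # results/readcounts.txt and results/readcounts_fractions.txt now contain one row for NA12878wgs

Note that the sample name is taken from everything before the first "." or "_" in the flagstat filename, so sample names containing dots or underscores will not be matched by the script's regexes.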
/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L006_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20FUKAAXX_NA_L007_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L006_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R1.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melbournebioinformatics/variant_calling_pipeline/e1d6b8998e129af973f7f1946ebbfbed0ca62a1b/example_data/input_data_wgs/NA12878wgs_20GAVAAXX_NA_L007_R2.fastq.gz -------------------------------------------------------------------------------- /example_data/input_data_wgs/README: -------------------------------------------------------------------------------- 1 | This directory contains 
test data which can be used to run the WGS(master) version of the pipeline. 2 | It is fastq data from the 1000 genomes project, individual NA12878, roughly restricted to a small genomic region (chromosome 17, 39700000-39800000). 3 | -------------------------------------------------------------------------------- /example_data/output_wgs/README: -------------------------------------------------------------------------------- 1 | This directory should contain the output created by running the WGS version of the pipeline on the WGS test data. 2 | The output directory needs to exist before the pipeline is invoked. 3 | This file mainly exists to make sure git will recognise this directory as version-controlled. 4 | -------------------------------------------------------------------------------- /example_data/output_wgs/fastq_symlinks/README: -------------------------------------------------------------------------------- 1 | This directory should contain the symlinks to the raw input fastq files, created by invoking the WGS version of the pipeline on the WGS test data. 2 | This directory currently needs to exist before the pipeline is invoked. 3 | This file mainly exists to make sure git will recognise this directory as version-controlled. 4 | -------------------------------------------------------------------------------- /input_fastq.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ 4 | Functions to parse directories of input fastq files, create symlinks with expected filename structures, and return metadata. 5 | 6 | Clare Sloggett, VLSCI 7 | """ 8 | 9 | import sys 10 | import re 11 | import os.path 12 | from collections import defaultdict 13 | from rubra.utils import (mkLink) 14 | 15 | def parse_and_link(file, symlink_dir, metadata_dict): 16 | """ 17 | Parse metadata out of input filename and construct symlink. 18 | Takes a fastq filename, destination directory, and a metadata dict, which should be of type defaultdict(dict). 19 | Parse the filename to get information on the sample name, run, read #, etc. 20 | Medadata is added to the provided metadata_dict. 21 | Some metadata is used to build symlinks, to guarantee filename uniqueness and a regular naming structure.\ 22 | Currently parsing by assuming AGRF naming structure and paired-end reads 23 | Currently will ONLY handle gzipped files, to avoid multiple links to the same data. 24 | """ 25 | match_old = re.match(r".*?/([^_/]+)_([a-zA-Z0-9-.]+)_s_([0-9]+)_(1|2)_sequence.txt.gz",file) 26 | match_new = re.match(r".*?/([a-zA-Z0-9-.]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2).fastq.gz",file) 27 | if match_old: 28 | run_id = match_old.group(1) 29 | sample = match_old.group(2) 30 | lane = int(match_old.group(3)) 31 | pair = match_old.group(4) 32 | encoding = 'I' 33 | elif match_new: 34 | run_id = match_new.group(2) 35 | sample = match_new.group(1) 36 | lane = int(match_new.group(3)) 37 | pair = match_new.group(4) 38 | encoding = 'S' 39 | else: 40 | print "Unable to parse name of fastq file %s ." 
% file 41 | sys.exit(1) 42 | newfile = os.path.join(symlink_dir, "%s_%s_L%d_%s.fastq.gz" % 43 | (sample, run_id, lane, pair)) 44 | metadata_dict[os.path.basename(newfile)]['sample'] = sample 45 | metadata_dict[os.path.basename(newfile)]['run_id'] = run_id 46 | metadata_dict[os.path.basename(newfile)]['lane'] = lane 47 | metadata_dict[os.path.basename(newfile)]['pair'] = pair 48 | metadata_dict[os.path.basename(newfile)]['encoding'] = encoding 49 | relative_sourcefile = os.path.relpath(file, symlink_dir) 50 | mkLink(relative_sourcefile, newfile) 51 | return newfile 52 | 53 | -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ 4 | GATK-based variant-calling pipeline, WGS version. 5 | 6 | Authors: Bernie Pope, Clare Sloggett, Gayle Philip. 7 | Thanks to Dmitri Mouradov and Maria Doyle for input on the initial 8 | analysis design. 9 | Thanks to Matt Wakefield for contributions to Rubra 10 | (https://github.com/bjpop/rubra) during pipeline development. 11 | 12 | Description: 13 | 14 | This program implements a workflow pipeline for next generation 15 | sequencing variant detection using the Broad Institute's GATK for 16 | variant calling and using ENSEMBL for basic annotation. 17 | 18 | It uses Rubra (https://github.com/bjpop/rubra) based on the 19 | Ruffus library. 20 | 21 | It supports parallel evaluation of independent pipeline stages, 22 | and can run stages on a cluster environment. 23 | 24 | The pipeline is configured by an options file in a python file, 25 | including the actual commands which are run at each stage. 26 | """ 27 | 28 | 29 | import sys 30 | import re 31 | import os.path 32 | import os 33 | from collections import defaultdict 34 | from glob import * 35 | import shutil 36 | from ruffus import * 37 | from rubra.utils import pipeline_options 38 | from rubra.utils import (runStageCheck, mkLogFile, mkDir, mkForceLink) 39 | from input_fastq import parse_and_link 40 | 41 | def make_metadata_string(metadata): 42 | return r'-R"@RG\tID:%s\tSM:%s\tPL:%s"' % (metadata['ID'], metadata['SM'], metadata['PL']) 43 | 44 | # Shorthand access to options 45 | ref_files = pipeline_options.ref_files 46 | working_files = pipeline_options.working_files 47 | logDir = pipeline_options.pipeline['logDir'] 48 | 49 | # Data setup process and input organisation and metadata functions 50 | 51 | #Metadata holding structures 52 | fastq_metadata = defaultdict(dict) 53 | 54 | original_fastq_files = [] 55 | for fastq_dir in working_files['fastq_dirs']: 56 | original_fastq_files += glob(os.path.join(fastq_dir, '*.fastq.gz')) 57 | 58 | if len(original_fastq_files)==0: 59 | print "No input files found. Do the filenames follow the naming convention?" 
60 | print "Directories searched:" 61 | print "\n".join(working_files['fastq_dirs']) 62 | sys.exit(1) 63 | 64 | # Parse metadata out of input file names and construct symlinks 65 | # Metadata is put into a dict (for the rest of ruffus) and some of it also into symlinks (for filename uniqueness) 66 | # currently parsing by assuming AGRF naming structure and paired-end reads 67 | mkDir(working_files['fastq_symlink_dir']) 68 | all_fastq_files = [] 69 | for file in original_fastq_files: 70 | symlink = parse_and_link(file, working_files['fastq_symlink_dir'], fastq_metadata) 71 | all_fastq_files.append(symlink) 72 | 73 | # Make a list of files we will actually use 74 | if pipeline_options.pipeline['restrict_samples']: 75 | allowed_samples = set(pipeline_options.pipeline['allowed_samples']) 76 | fastq_files = [file for file in sorted(all_fastq_files) 77 | if (fastq_metadata[os.path.basename(file)]['sample'] in allowed_samples)] 78 | else: 79 | fastq_files = sorted(all_fastq_files) 80 | 81 | print "Symlinked files that will be used:" 82 | for file in fastq_files: 83 | print file 84 | print 85 | print "Output dir is %s" % working_files['output_dir'] 86 | print "Log dir is %s" % logDir 87 | print 88 | 89 | # Create output subdirectories 90 | 91 | output_dir = working_files['output_dir'] 92 | 93 | fastqc_dir = os.path.join(output_dir, "FastQC") 94 | mkDir(fastqc_dir) 95 | 96 | sambam_dir = os.path.join(output_dir, "alignments") 97 | mkDir(sambam_dir) 98 | 99 | variant_dir = os.path.join(output_dir, "variant_calls") 100 | mkDir(variant_dir) 101 | 102 | coverage_dir = os.path.join(output_dir, "coverage") 103 | mkDir(coverage_dir) 104 | 105 | ensembl_dir = os.path.join(output_dir, "ensembl") 106 | mkDir(ensembl_dir) 107 | 108 | # directory for final summary tables 109 | results_dir = os.path.join(output_dir, "results") 110 | mkDir(results_dir) 111 | 112 | # Pipeline declarations 113 | 114 | # Alignment and correction steps 115 | 116 | @transform(fastq_files, regex('(.+\/)?(.+?)\.fastq\.gz'), 117 | [r'%s/\2_fastqc' % fastqc_dir, r'%s/\2.fastqc.Success' % fastqc_dir]) 118 | def fastqc(inputs, outputs): 119 | """ 120 | Run FastQC on each fastq file. 121 | """ 122 | sequence = inputs 123 | fastqc_dest, flagFile = outputs 124 | runStageCheck('fastqc', flagFile, fastqc_dir, sequence) 125 | 126 | @collate(fastq_files, regex(r".*?([^/]+)(_1|_2)\.fastq.gz"), 127 | [r"%s/\1.sam" % sambam_dir, r"%s/\1.bwaPE.Success" % sambam_dir]) 128 | def bwaPE(inputs, outputs): 129 | """ 130 | Aligns two paired-end fastq files to a reference genome to produce a sam file. 131 | """ 132 | seq1, seq2 = sorted(inputs) 133 | output, flag_file = outputs 134 | fastq_name = os.path.basename(seq1) 135 | sample = fastq_metadata[fastq_name]['sample'] 136 | runID = fastq_metadata[fastq_name]['run_id'] 137 | lane = fastq_metadata[fastq_name]['lane'] 138 | readgroup_metadata = { 'PL': 'ILLUMINA', 139 | 'SM': sample, 140 | 'ID': "%s_%s_Lane%d" % (sample, runID, lane) } 141 | metadata_str = make_metadata_string(readgroup_metadata) 142 | print "bwa-mem on %s and %s" % (os.path.basename(seq1), os.path.basename(seq2)) 143 | runStageCheck('bwaMemPE', flag_file, metadata_str, ref_files['bwa_reference'], seq1, seq2, output) 144 | 145 | @transform(bwaPE, suffix(".sam"), 146 | [".bam", ".samToBam.Success"]) 147 | def samToBam(inputs, outputs): 148 | """ 149 | Convert sam to bam and sort, using Picard. 
150 | """ 151 | output, flag_file = outputs 152 | sam, _success = inputs 153 | print "converting to sorted bam: %s" % os.path.basename(sam) 154 | runStageCheck('samToSortedBam', flag_file, sam, output) 155 | 156 | @collate(samToBam, regex(r'(.*?)([^/_]+)_([^/_]+_[^/_]+)\.bam'), 157 | [r"\1\2.bam", r'\1\2.mergeBams.Success']) 158 | def mergeBams(inputs, outputs): 159 | """ 160 | Merge the sorted bams together for each sample. 161 | Picard should cope correctly if there is only one input. 162 | """ 163 | bams = [bam for [bam, _success] in inputs] 164 | output, flag_file = outputs 165 | baminputs = ' '.join(["INPUT=%s" % bam for bam in bams]) 166 | print "merging %s into %s" % (",".join([os.path.basename(bam) for bam in bams]), os.path.basename(output)) 167 | runStageCheck('mergeBams', flag_file, baminputs, output) 168 | 169 | @follows('indexMergedBams') 170 | @transform(mergeBams, suffix('.bam'), 171 | ['.dedup.bam', '.bam.dedup.Success']) 172 | def dedup(inputs, outputs): 173 | """ 174 | Remove apparent duplicates from merged bams using Picard MarkDuplicates. 175 | """ 176 | input_bam, _success = inputs 177 | output_bam, flag_file = outputs 178 | logFile = mkLogFile(logDir, input_bam, '.dedup.log') 179 | print "de-duping %s" % os.path.basename(input_bam) 180 | runStageCheck('dedup', flag_file, input_bam, logFile, output_bam) 181 | 182 | @follows('indexDedupedBams') 183 | @transform(dedup, suffix('.bam'), 184 | ['.realigner.intervals', '.bam.realignIntervals.Success']) 185 | def realignIntervals(inputs, outputs): 186 | """ 187 | Run GATK RealignTargetCreator to find suspect intervals for realignment. 188 | """ 189 | bam, _success = inputs 190 | output_intervals, flag_file = outputs 191 | logFile = mkLogFile(logDir, bam, '.realignIntervals.log') 192 | print "calculating realignment intervals for %s" % os.path.basename(bam) 193 | runStageCheck('realignIntervals', flag_file, ref_files['fasta_reference'], bam, ref_files['indels_realign_goldstandard'], ref_files['indels_realign_1000G'], logFile, output_intervals) 194 | 195 | def remove_GATK_bai(bamfile): 196 | """ 197 | A bug in some versions of GATK cause it to create an x.bai file, and this gets in the way of using the properly named x.bam.bai file. If the given file exists, delete it. 198 | """ 199 | bad_bai = os.path.splitext(bamfile)[0] + ".bai" 200 | try: 201 | os.remove(bad_bai) 202 | except OSError, e: 203 | # Ignore error only if it is OSError #2, ie File Not Found 204 | if e.errno != 2: 205 | raise e 206 | 207 | @transform(realignIntervals, regex(r"(.*?)([^/]+)\.realigner\.intervals"), 208 | add_inputs([r'\1\2.bam']), 209 | [r'\1\2.realigned.bam', r'\1\2.bam.realign.Success']) 210 | def realign(inputs, outputs): 211 | """ 212 | Run GATK IndelRealigner for local realignment, using intervals found by realignIntervals. 213 | """ 214 | [intervals, _success], [input_bam] = inputs 215 | output_bam, flag_file = outputs 216 | logFile = mkLogFile(logDir, input_bam, '.realign.log') 217 | print "realigning %s" % os.path.basename(input_bam) 218 | runStageCheck('realign', flag_file, ref_files['fasta_reference'], input_bam, intervals, logFile, output_bam) 219 | remove_GATK_bai(output_bam) 220 | 221 | @follows('indexRealignedBams') 222 | @transform(realign, suffix('.bam'), 223 | ['.recal_data.csv', '.baseQualRecalCount.Success']) 224 | def baseQualRecalCount(inputs, outputs): 225 | """ 226 | GATK CountCovariates, first step of base quality score recalibration. 
227 | """ 228 | bam, _success = inputs 229 | output_csv, flag_file = outputs 230 | logFile = mkLogFile(logDir, bam, '.baseQualRecalCount.log') 231 | print "count covariates using GATK for base quality score recalibration: %s" % os.path.basename(bam) 232 | runStageCheck('baseQualRecalCount', flag_file, bam, ref_files['fasta_reference'], ref_files['dbsnp'], logFile, output_csv) 233 | 234 | @transform(baseQualRecalCount, regex(r'(.*?)([^/]+)\.recal_data\.csv'), 235 | add_inputs([r'\1\2.bam']), 236 | [r'\1\2.recal.bam', r'\1\2.baseQualRecalTabulate.Success']) 237 | def baseQualRecalTabulate(inputs, outputs): 238 | """ 239 | GATK TableRecalibration: recalibrate base quality scores using the output of CountCovariates. 240 | """ 241 | [input_csv, _success], [input_bam] = inputs 242 | output_bam, flag_file = outputs 243 | logFile = mkLogFile(logDir, input_bam, '.baseQualRecalTabulate.log') 244 | print "recalibrate base quality scores using GATK on %s" % os.path.basename(input_bam) 245 | runStageCheck('baseQualRecalTabulate', flag_file, input_bam, ref_files['fasta_reference'], input_csv, logFile, output_bam) 246 | remove_GATK_bai(output_bam) 247 | 248 | # Temporarily putting this indexing step here to work around bug 249 | @transform(baseQualRecalTabulate, suffix('.bam'), 250 | ['.bam.bai', '.bam.indexRecalibratedBams.Success']) 251 | def indexRecalibratedBams(inputs, outputs): 252 | """ 253 | Index the recalibrated bams using samtools. 254 | """ 255 | bam, _success = inputs 256 | output, flag_file = outputs 257 | print "samtools index on %s" % os.path.basename(bam) 258 | runStageCheck('indexBam', flag_file, bam) 259 | 260 | # Variant calling steps 261 | 262 | @follows(indexRecalibratedBams) 263 | @transform(baseQualRecalTabulate, 264 | regex(r'(.*?)([^/]+)\.recal\.bam'), 265 | [r'%s/\2.SNP.vcf' % variant_dir, 266 | r'%s/\2.SNP.vcf.idx' % variant_dir, 267 | r'%s/\2.callSNPs.Success' % variant_dir]) 268 | def callSNPs(inputs, outputs): 269 | """ 270 | Use GATK UnifiedGenotyper to call SNPs from recalibrated bams. 271 | """ 272 | bam, _success = inputs 273 | output_vcf, _idx, flag_file = outputs 274 | logFile = mkLogFile(logDir, bam, '.callSNPs.log') 275 | print "calling SNPs from %s" % bam 276 | runStageCheck('callSNPs', flag_file, ref_files['fasta_reference'], bam, ref_files['dbsnp'], logFile, output_vcf) 277 | 278 | @follows(indexRecalibratedBams) 279 | @transform(baseQualRecalTabulate, 280 | regex(r'(.*?)([^/]+)\.recal\.bam'), 281 | [r'%s/\2.INDEL.vcf' % variant_dir, 282 | r'%s/\2.INDEL.vcf.idx' % variant_dir, 283 | r'%s/\2.callIndels.Success' % variant_dir]) 284 | def callIndels(inputs, outputs): 285 | """ 286 | Use GATK UnifiedGenotyper to call indels from recalibrated bams. 287 | """ 288 | bam, _success = inputs 289 | output_vcf, _idx, flag_file = outputs 290 | logFile = mkLogFile(logDir, bam, '.callIndels.log') 291 | print "calling Indels from %s" % bam 292 | runStageCheck('callIndels', flag_file, ref_files['fasta_reference'], bam, ref_files['dbsnp'], logFile, output_vcf) 293 | 294 | @transform(callSNPs, suffix('.SNP.vcf'), 295 | ['.SNP.filtered.vcf', '.SNP.filtered.vcf.idx', '.filterSNPs.Success']) 296 | def filterSNPs(inputs, outputs): 297 | """ 298 | Use GATK VariantFiltration to filter raw SNP calls. 
299 | """ 300 | input_vcf, _idx, _success = inputs 301 | output_vcf, _idxout, flag_file = outputs 302 | logFile = mkLogFile(logDir, input_vcf, '.filterSNPs.log') 303 | print "filtering SNPs from %s" % input_vcf 304 | runStageCheck('filterSNPs', flag_file, ref_files['fasta_reference'], input_vcf, logFile, output_vcf) 305 | 306 | @transform(callIndels, suffix('.INDEL.vcf'), 307 | ['.INDEL.filtered.vcf', '.INDEL.filtered.vcf.idx', '.filterIndels.Success']) 308 | def filterIndels(inputs, outputs): 309 | """ 310 | Use GATK VariantFiltration to filter raw INDEL calls. 311 | """ 312 | input_vcf, _idx, _success = inputs 313 | output_vcf, _idxout, flag_file = outputs 314 | logFile = mkLogFile(logDir, input_vcf, '.filterIndels.log') 315 | print "filtering indels from %s" % input_vcf 316 | runStageCheck('filterIndels', flag_file, ref_files['fasta_reference'], input_vcf, logFile, output_vcf) 317 | 318 | 319 | @transform([filterSNPs, filterIndels], regex(r'.*?([^/]+)\.vcf'), 320 | [r'%s/\1.ensembl.vcf' % ensembl_dir,r'%s/\1.getEnsemblAnnotations.Success' % ensembl_dir]) 321 | def getEnsemblAnnotations(inputs, outputs): 322 | """ 323 | Annotate vcf using ENSEMBL variant effect predictor. 324 | """ 325 | vcf, _idx, _success = inputs 326 | output, flag_file = outputs 327 | logFile = mkLogFile(logDir, vcf, '.EnsemblAnnotation.log') 328 | print "Annotating %s with ENSEMBL variant effect predictor" % os.path.basename(vcf) 329 | runStageCheck('annotateEnsembl', flag_file, vcf, output, logFile) 330 | 331 | 332 | # Indexing steps 333 | 334 | @transform(mergeBams, suffix('.bam'), 335 | ['.bam.bai', '.bam.indexMergedBams.Success']) 336 | def indexMergedBams(inputs, outputs): 337 | """ 338 | Index the merged bams using samtools. 339 | """ 340 | bam, _success = inputs 341 | output, flag_file = outputs 342 | print "samtools index on %s" % os.path.basename(bam) 343 | runStageCheck('indexBam', flag_file, bam) 344 | 345 | @transform(dedup, suffix('.bam'), 346 | ['.bam.bai', '.bam.indexDedupedBams.Success']) 347 | def indexDedupedBams(inputs, outputs): 348 | """ 349 | Index the de-duplicated bams using samtools. Note that this actually goes from the fixMate-ed bams. 350 | """ 351 | bam, _success = inputs 352 | output, flag_file = outputs 353 | print "samtools index on %s" % os.path.basename(bam) 354 | runStageCheck('indexBam', flag_file, bam) 355 | 356 | @transform(realign, suffix('.bam'), 357 | ['.bam.bai', '.bam.indexRealignedBams.Success']) 358 | def indexRealignedBams(inputs, outputs): 359 | """ 360 | Index the locally realigned bams using samtools. 361 | """ 362 | bam, _success = inputs 363 | output, flag_file = outputs 364 | print "samtools index on %s" % os.path.basename(bam) 365 | runStageCheck('indexBam', flag_file, bam) 366 | 367 | 368 | 369 | @transform(mergeBams, suffix('.bam'), 370 | ['.bam.tdf', '.bam.igvcountMergedBams.Success']) 371 | def igvcountMergedBams(inputs, outputs): 372 | """ 373 | Use igvtools count to create a .tdf file for the merged bam files, to improve viewing of the bam coverage in igv. 374 | """ 375 | bam, _success = inputs 376 | outfile, flag_file = outputs 377 | print "igvtools count on %s" % os.path.basename(bam) 378 | runStageCheck('igvcount', flag_file, bam, outfile) 379 | 380 | @transform(realign, suffix('.bam'), 381 | ['.bam.tdf', '.bam.igvcountRealignedBams.Success']) 382 | def igvcountRealignedBams(inputs, outputs): 383 | """ 384 | Use igvtools count to create a .tdf file for the merged bam files, to improve viewing of the bam coverage in igv. 
385 | """ 386 | bam, _success = inputs 387 | outfile, flag_file = outputs 388 | print "igvtools count on %s" % os.path.basename(bam) 389 | runStageCheck('igvcount', flag_file, bam, outfile) 390 | 391 | @transform(dedup, suffix('.bam'), 392 | ['.bam.tdf', '.bam.igvcountDedupedBams.Success']) 393 | def igvcountDedupedBams(inputs, outputs): 394 | """ 395 | Use igvtools count to create a .tdf file for the deduped bam files, to improve viewing of the bam coverage in igv. Note that this actually goes from the fixMate-ed bams. 396 | """ 397 | bam, _success = inputs 398 | outfile, flag_file = outputs 399 | print "igvtools count on %s" % os.path.basename(bam) 400 | runStageCheck('igvcount', flag_file, bam, outfile) 401 | 402 | @transform(baseQualRecalTabulate, suffix('.bam'), 403 | ['.bam.tdf', '.bam.igvcountRecalibratedBams.Success']) 404 | def igvcountRecalibratedBams(inputs, outputs): 405 | """ 406 | Use igvtools count to create a .tdf file for the recalibrated bam files, to improve viewing of the bam coverage in igv. 407 | """ 408 | bam, _success = inputs 409 | outfile, flag_file = outputs 410 | print "igvtools count on %s" % os.path.basename(bam) 411 | runStageCheck('igvcount', flag_file, bam, outfile) 412 | 413 | @transform(filterSNPs, suffix('.vcf'), 414 | ['.vcf.gz', '.vcf.gz.tbi', '.vcfindexSNPs.Success']) 415 | def vcfIndexSNPs(inputs, outputs): 416 | """ 417 | Use bgzip and tabix to prepare raw SNPs vcf for vcftools handling. 418 | """ 419 | vcf, _idx, _success = inputs 420 | zipfile, tabix_index, flag_file = outputs 421 | print "bgzip and tabix (for vcftools) on %s" % vcf 422 | runStageCheck('indexVCF', flag_file, vcf) 423 | 424 | @transform(filterIndels, suffix('.vcf'), 425 | ['.vcf.gz', '.vcf.gz.tbi', '.vcfindexIndels.Success']) 426 | def vcfIndexIndels(inputs, outputs): 427 | """ 428 | Use bgzip and tabix to prepare raw indels vcf for vcftools handling. 429 | """ 430 | vcf, _idx, _success = inputs 431 | zipfile, tabix_index, flag_file = outputs 432 | print "bgzip and tabix (for vcftools) on %s" % vcf 433 | runStageCheck('indexVCF', flag_file, vcf) 434 | 435 | 436 | # Coverage steps 437 | 438 | @follows(indexMergedBams) 439 | @transform(mergeBams, 440 | regex(r'(.*?)([^/]+)\.bam'), 441 | [r'%s/\2.early.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 442 | r'%s/\2.early.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 443 | r'%s/\2.early.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 444 | r'%s/\2.early.DepthOfCoverage.sample_interval_summary' % coverage_dir, 445 | r'%s/\2.early.DepthOfCoverage.sample_statistics' % coverage_dir, 446 | r'%s/\2.early.DepthOfCoverage.sample_summary' % coverage_dir, 447 | r'%s/\2.earlyDepthOfCoverage.Success' % coverage_dir]) 448 | def earlyDepthOfCoverage(inputs, outputs): 449 | """ 450 | Use GATK DepthOfCoverage to get a first pass at coverage statistics, after merging bams. 
451 | """ 452 | bam, _success = inputs 453 | flag_file = outputs[-1] 454 | output_example = outputs[0] 455 | output_base = os.path.splitext(output_example)[0] 456 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 457 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 458 | 459 | @follows(indexDedupedBams) 460 | @transform(dedup, 461 | regex(r'(.*?)([^/]+)\.dedup\.bam'), 462 | [r'%s/\2.deduped.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 463 | r'%s/\2.deduped.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 464 | r'%s/\2.deduped.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 465 | r'%s/\2.deduped.DepthOfCoverage.sample_interval_summary' % coverage_dir, 466 | r'%s/\2.deduped.DepthOfCoverage.sample_statistics' % coverage_dir, 467 | r'%s/\2.deduped.DepthOfCoverage.sample_summary' % coverage_dir, 468 | r'%s/\2.dedupedDepthOfCoverage.Success' % coverage_dir]) 469 | def dedupedDepthOfCoverage(inputs, outputs): 470 | """ 471 | Use GATK DepthOfCoverage to get a coverage statistics as soon as duplicates are removed. 472 | """ 473 | bam, _success = inputs 474 | flag_file = outputs[-1] 475 | output_example = outputs[0] 476 | output_base = os.path.splitext(output_example)[0] 477 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 478 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 479 | 480 | @follows(indexRecalibratedBams) 481 | @transform(baseQualRecalTabulate, 482 | regex(r'(.*?)([^/]+)\.recal\.bam'), 483 | [r'%s/\2.DepthOfCoverage.sample_cumulative_coverage_counts' % coverage_dir, 484 | r'%s/\2.DepthOfCoverage.sample_cumulative_coverage_proportions' % coverage_dir, 485 | r'%s/\2.DepthOfCoverage.sample_interval_statistics' % coverage_dir, 486 | r'%s/\2.DepthOfCoverage.sample_interval_summary' % coverage_dir, 487 | r'%s/\2.DepthOfCoverage.sample_statistics' % coverage_dir, 488 | r'%s/\2.DepthOfCoverage.sample_summary' % coverage_dir, 489 | r'%s/\2.depthOfCoverage.Success' % coverage_dir]) 490 | def finalDepthOfCoverage(inputs, outputs): 491 | """ 492 | Use GATK DepthOfCoverage to get coverage statistics. 493 | """ 494 | bam, _success = inputs 495 | flag_file = outputs[-1] 496 | output_example = outputs[0] 497 | output_base = os.path.splitext(output_example)[0] 498 | print "calculating coverage statistics using GATK DepthOfCoverage on %s" % bam 499 | runStageCheck('depthOfCoverage', flag_file, ref_files['fasta_reference'], bam, output_base) 500 | 501 | 502 | # Read-counting steps 503 | 504 | @transform(samToBam, suffix('.bam'), 505 | ['.bam.flagstat', '.bam.countRunBam.Success']) 506 | def countRunBam(inputs, outputs): 507 | """ 508 | Run samtools flagstat on the initial per-lane, per-run bam file. 509 | """ 510 | bam, _success = inputs 511 | output, flag_file = outputs 512 | print "Running samtools flagstat on %s" % bam 513 | runStageCheck('flagstat', flag_file, bam, output) 514 | 515 | @transform(mergeBams, suffix('.bam'), 516 | ['.bam.flagstat', '.bam.countRunBam.Success']) 517 | def countMergedBam(inputs, outputs): 518 | """ 519 | Run samtools flagstat on the merged bam file. 
520 | """ 521 | bam, _success = inputs 522 | output, flag_file = outputs 523 | print "Running samtools flagstat on %s" % bam 524 | runStageCheck('flagstat', flag_file, bam, output) 525 | 526 | @transform(realign, suffix('.bam'), 527 | ['.bam.flagstat', '.bam.countRealignedBam.Success']) 528 | def countRealignedBam(inputs, outputs): 529 | """ 530 | Run samtools flagstat on the realigned bam file. 531 | """ 532 | bam, _success = inputs 533 | output, flag_file = outputs 534 | print "Running samtools flagstat on %s" % bam 535 | runStageCheck('flagstat', flag_file, bam, output) 536 | 537 | @transform(dedup, suffix('.bam'), 538 | ['.bam.flagstat', '.bam.countDedupedBam.Success']) 539 | def countDedupedBam(inputs, outputs): 540 | """ 541 | Run samtools flagstat on the deduped bam file. 542 | """ 543 | bam, _success = inputs 544 | output, flag_file = outputs 545 | print "Running samtools flagstat on %s" % bam 546 | runStageCheck('flagstat', flag_file, bam, output) 547 | 548 | 549 | # Data collation and plotting steps 550 | 551 | @merge([countDedupedBam, countMergedBam], 552 | ["%s/readcounts.txt" % results_dir, "%s/readcounts_fractions.txt" % results_dir, "%s/collateReadcounts.Success" % results_dir]) 553 | def collateReadCounts(inputs, outputs): 554 | """ 555 | Collate read counts from samtools flagstat output into a table. 556 | """ 557 | # Note expected input and output directories are effectively hard-coded 558 | in_dir = sambam_dir 559 | out_dir = results_dir 560 | flag_file = outputs[-1] 561 | print "Collating read counts" 562 | runStageCheck('collateReadcounts', flag_file, in_dir, out_dir) 563 | 564 | 565 | -------------------------------------------------------------------------------- /pipeline_dev_config.py: -------------------------------------------------------------------------------- 1 | 2 | # This section is used by the variant calling pipeline.py to specify input data and 3 | # working directories. 4 | # 5 | # Note that if you have downloaded the pipeline the directory names below are examples 6 | # only and you will need to edit them to suit your needs. 7 | # 8 | # Required variables: 9 | # - fastq_dirs: a list of directories where the raw input data is found. Currently this 10 | # data is expected to be paired-end gzipped fastq and to follow a specific naming 11 | # convention (see below). 12 | # - fastq_symlink_dir: symlinks to all raw fastq files will be written to this directory 13 | # and used by the rest of the pipeline. These symlinks have standardised names and 14 | # are a useful flattened summary of all known input data. 15 | # - output_dir: the directory used by the pipeline for output and intermediate files. 16 | # A directory structure will be created under this directory by pipeline.py. 17 | # 18 | # Input data naming convention: 19 | # The input fastq files must follow a naming convention so that the pipeline can determine 20 | # the metadata fields. This convention in the default script is to use the regex 21 | # ([a-zA-Z0-9-.]+)_([^_/]+)_[CAGTN]+_L([0-9]+)_R(1|2).fastq.gz 22 | # This corresponds to metadata fields 23 | # SAMPLE_RUN_TAG_LANE_READPAIR.fastq.gz 24 | # where 25 | # SAMPLE is a unique identifier for the sample sequenced 26 | # RUN is a unique identifier for the experiment (e.g.
run or flowcell ID) 27 | # TAG is the barcode sequence used for multiplexing (NA if none) 28 | # LANE is the flowcell lane identifier, written like L001 29 | # READPAIR identifies whether the file contains forward or reverse reads, R1 or R2 30 | # 31 | # For example: Sample395_C0WK7ACXX_ACTTGA_L007_R1.fastq.gz 32 | # 33 | # This file naming convention follows that returned by many sequencing centres for 34 | # Illumina data. 35 | # 36 | working_files = { 37 | 'fastq_dirs': [ 38 | './example_data/input_data_wgs' 39 | ], 40 | 'fastq_symlink_dir': './example_data/output_wgs/fastq_symlinks', 41 | 'output_dir': './example_data/output_wgs' 42 | } 43 | 44 | # This section is used by the variant calling pipeline.py to specify reference data files. 45 | # 46 | # Note that if you have downloaded the pipeline the filenames below are examples only and 47 | # you will need to get the relevant reference files for your data. Exactly which files 48 | # you need depend on your data. At time of writing reference data can be obtained from: 49 | # - Reference genome: many sources depending on data. For our human data we used the 50 | # 1000 genomes version of the b37 (hg19) genome build, found at 51 | # ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz 52 | # Note that your genome must use the same chromosome naming convention as any other 53 | # reference files (such as dbSNP); if you use hg19 (chr1,chr2) instead of b37 (1,2) 54 | # you may need to convert the files suggested below. 55 | # - dbSNP variants: dbSNP is at http://www.ncbi.nlm.nih.gov/projects/SNP/ 56 | # A useful release summary is at http://www.ncbi.nlm.nih.gov/projects/SNP/snp_summary.cgi 57 | # We used human variants which were obtained in VCF format from 58 | # ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz 59 | # - known indels for local realignment: these follow the Broad recommendations for the 60 | # GATK tool suite and come from the GATK resource bundle. See 61 | # http://gatkforums.broadinstitute.org/discussion/1213/what-s-in-the-resource-bundle-and-how-can-i-get-it 62 | # 63 | # Expected variables (if you use the relevant pipeline steps): 64 | # - fasta_reference: the reference genome fasta. Should be in the same location as the 65 | # .fai files produced by samtools faidx. 66 | # TODO: do this indexing as part of the pipeline and check for index files. 67 | # - bwa_reference: the reference genome fasta. Should be in the same location as the 68 | # index files produced by bwa index. 69 | # TODO: do this indexing as part of the pipeline and check for index files. 70 | # - dbsnp: the dbSNP variants file in VCF format, for annotating variants and for 71 | # GATK base quality recalibration. 72 | # - indels_realign_goldstandard and 73 | # - indels_realign_1000G: files of known indels for use in GATK local realignment. 74 | # Currently the Broad Institute recommends using these two files (see above). 
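# As a rough sketch only (these commands are not run by the pipeline, and the
# fasta name is just the example path used below), the index files expected to
# sit alongside the references can be generated with:
#   samtools faidx human_g1k_v37.fasta     # .fai index for fasta_reference
#   bwa index human_g1k_v37.fasta          # BWT index files for bwa_reference
# The GATK steps also expect a sequence dictionary (human_g1k_v37.dict) next to
# the fasta, which can be created with Picard's CreateSequenceDictionary.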
75 | ref_files = { 76 | 'fasta_reference': '/vlsci/VR0002/shared/Reference_Files/Indexed_Ref_Genomes/bwa_0.7.5_Indexed/human_g1k_v37.fasta', 77 | 'bwa_reference': '/vlsci/VR0002/shared/Reference_Files/Indexed_Ref_Genomes/bwa_0.7.5_Indexed/human_g1k_v37.fasta', 78 | 79 | 'dbsnp': '/vlsci/VR0002/shared/Reference_Files/SNP_db/dbSNP137.vcf', 80 | 81 | 'indels_realign_goldstandard': '/vlsci/VR0002/shared/Reference_Files/Indels_for_realignment/Mills_and_1000G_gold_standard.indels.b37.vcf', 82 | 'indels_realign_1000G': '/vlsci/VR0002/shared/Reference_Files/Indels_for_realignment/1000G_phase1.indels.b37.vcf' 83 | } 84 | 85 | # pipeline should hold configuration options for Rubra and for the pipeline. 86 | # This section is required for every Rubra pipeline, 87 | # but restrict_samples and allowed_samples are specific to the variant-calling pipeline. 88 | # 89 | # Rubra variables: 90 | # - logDir: the directory where batch queue scripts, stdout and stderr dumps are stored. 91 | # - logFile: the file used to log all jobs that are run. 92 | # - style: the default style, one of 'flowchart', 'print', 'run', 'touchfiles'. Can be 93 | # overridden by specifying --style on the command line. 94 | # - procs: the number of python processes to run simultaneously. This determines the 95 | # maximum parallelism of the pipeline. For distributed jobs it also constrains the 96 | # maximum total jobs submitted to the queue at any one time. 97 | # - verbose: one of 0 (quiet), 1 (normal), 2 (chatty). Can be overridden by specifying 98 | # --verbose on the command line. 99 | # - end: the desired tasks to be run. Rubra will also run all tasks which are dependencies 100 | # of these tasks. Can be overridden by specifying --end on the command line. 101 | # - force: tasks which will be forced to run, regardless of timestamps. Can be overridden 102 | # by supplying --force on the command line. 103 | # - rebuild: one of 'fromstart','fromend'. Whether to calculate which dependencies will 104 | # be rerun by working back from an end task to the latest up-to-date task, or forward 105 | # from the earliest out-of-date task. 'fromstart' is the most conservative and 106 | # commonly used as it brings all intermediate tasks up to date. 107 | # 108 | # Variant-calling pipeline variables: (TODO: move to a separate section) 109 | # - restrict_samples: whether to restrict input files to those specified by allowed_samples 110 | # - allowed_samples: sample names that will be run if restrict_samples is True 111 | pipeline = { 112 | 'logDir': 'log_example_wgs', 113 | 'logFile': 'pipeline.log', 114 | 'style': 'print', 115 | 'procs': 30, 116 | 'verbose': 1, 117 | 'end': ['earlyDepthOfCoverage', 'finalDepthOfCoverage', 118 | 'fastqc', 119 | 'igvcountMergedBams', 'countDedupedBam', 'countRunBam', 'countMergedBam', 120 | 'getEnsemblAnnotations', 121 | 'collateReadCounts', 122 | 'vcfIndexSNPs', 'vcfIndexIndels' 123 | ], 124 | 'force': [], 125 | 'rebuild' : "fromstart", 126 | 127 | 'restrict_samples': False, 128 | 'allowed_samples': [] 129 | } 130 | -------------------------------------------------------------------------------- /pipeline_stages_config.py: -------------------------------------------------------------------------------- 1 | 2 | # stageDefaults contains the default options which are applied to each stage (command). 3 | # This section is required for every Rubra pipeline. 4 | # These can be overridden by options defined for individual stages, below.
5 | # Stage options which Rubra will recognise are: 6 | # - distributed: a boolean determining whether the task should be submitted to a cluster 7 | # job scheduling system (True) or run on the system local to Rubra (False). 8 | # - walltime: for a distributed PBS job, gives the walltime requested from the job 9 | # queue system; the maximum allowed runtime. For local jobs has no effect. 10 | # - memInGB: for a distributed PBS job, gives the memory in Gigabytes requested from the 11 | # job queue system. For local jobs has no effect. 12 | # - queue: for a distributed PBS job, this is the name of the queue to submit the 13 | # job to. For local jobs has no effect. This is currently a mandatory field for 14 | # distributed jobs, but can be set to None. 15 | # - modules: the modules to be loaded before running the task. This is intended for 16 | # systems with environment modules installed. Rubra will call module load on each 17 | # required module before running the task. Note that defining modules for individual 18 | # stages will override (not add to) any modules listed here. This currently only 19 | # works for distributed jobs. 20 | stageDefaults = { 21 | 'distributed': True, 22 | 'queue': None, 23 | 'walltime': "01:00:00", 24 | 'memInGB': 8, 25 | 'modules': [ 26 | "bwa-intel/0.7.5a", 27 | "samtools-intel/0.1.19", 28 | "picard/1.53", 29 | "python-gcc/2.7.5", 30 | "R-gcc/3.0.2", 31 | "gatk/1.6-7" 32 | ] 33 | } 34 | 35 | # stages should hold the details of each stage which can be called by runStageCheck. 36 | # This section is required for every Rubra pipeline. 37 | # Calling a stage in this way carries out checkpointing and, if desired, batch job 38 | # submission. 39 | # Each stage must contain a 'command' definition. See stageDefaults above for other 40 | # allowable options. 
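# As an illustration of how these command templates are used (a sketch inferred
# from the calls in pipeline.py, not from Rubra documentation): for the
# 'flagstat' stage below, pipeline.py calls
#   runStageCheck('flagstat', flag_file, bam, output)
# and the arguments after the flag file are substituted, in order, for the
# %bam and %out placeholders in "samtools flagstat %bam > %out".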
41 | stages = { 42 | "fastqc": { 43 | "command": "fastqc --quiet -o %outdir %seq", 44 | 'modules': [ "fastqc/0.10.1" ] 45 | }, 46 | 'bwaMemSE': { 47 | 'command': "bwa mem -t 8 %meta %ref %seq > %out", 48 | 'walltime': "3:00:00", 49 | 'queue': 'smp', 50 | 'memInGB': 23 51 | }, 52 | 'bwaMemPE': { 53 | 'command': "bwa mem -t 8 %meta %ref %seq1 %seq2 > %out", 54 | 'walltime': "3:00:00", 55 | 'queue': 'smp', 56 | 'memInGB': 23 57 | }, 58 | 'samToSortedBam': { 59 | 'command': "./SortSam 6 VALIDATION_STRINGENCY=LENIENT INPUT=%seq OUTPUT=%out SORT_ORDER=coordinate", 60 | 'walltime': "5:00:00", 61 | }, 62 | 'mergeBams': { 63 | 'command': "./PicardMerge 6 %baminputs USE_THREADING=true VALIDATION_STRINGENCY=LENIENT AS=true OUTPUT=%out", 64 | 'walltime': "5:00:00" 65 | }, 66 | 'indexBam': { 67 | 'command': "samtools index %bam" 68 | }, 69 | 'flagstat': { 70 | 'command': "samtools flagstat %bam > %out", 71 | 'walltime': "00:10:00" 72 | }, 73 | 'igvcount': { 74 | 'command': "igvtools count %bam %out hg19", 75 | 'modules': [ "igvtools/1.5.15" ] 76 | }, 77 | 'indexVCF': { 78 | 'command': "./vcftools_prepare.sh %vcf", 79 | 'modules': [ "tabix/0.2.5" ] 80 | }, 81 | 'realignIntervals': { 82 | # Hard-coded to take 2 known indels files right now 83 | 'command': "./GenomeAnalysisTK 1 -T RealignerTargetCreator -R %ref -I %bam --known %indels_goldstandard --known %indels_1000G -log %log -o %out", 84 | 'memInGB': 23, 85 | 'walltime': "5:00:00" 86 | }, 87 | 'realign': { 88 | 'command': "./GenomeAnalysisTK 22 -T IndelRealigner -R %ref -I %bam -targetIntervals %intervals -log %log -o %out", 89 | 'memInGB': 23, 90 | 'walltime': "5:00:00" 91 | }, 92 | 'dedup': { 93 | 'command': "./MarkDuplicates 6 INPUT=%bam REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT AS=true METRICS_FILE=%log OUTPUT=%out", 94 | 'walltime': '5:00:00' 95 | }, 96 | 'baseQualRecalCount': { 97 | 'command': "./GenomeAnalysisTK 12 -T CountCovariates -I %bam -R %ref --knownSites %dbsnp -nt 8 -l INFO -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -log %log -recalFile %out", 98 | 'queue': 'smp', 99 | 'memInGB': 23, 100 | 'walltime': "5:00:00" 101 | }, 102 | 'baseQualRecalTabulate': { 103 | 'command': "./GenomeAnalysisTK 4 -T TableRecalibration -I %bam -R %ref -recalFile %csvfile -l INFO -log %log -o %out", 104 | 'walltime': "5:00:00" 105 | }, 106 | 'callSNPs': { 107 | 'command': "./GenomeAnalysisTK 12 -T UnifiedGenotyper -nt 8 -R %ref -I %bam --dbsnp %dbsnp -stand_call_conf 50.0 -stand_emit_conf 10.0 -dcov 1600 -l INFO -A AlleleBalance -A DepthOfCoverage -A FisherStrand -glm SNP -log %log -o %out", 108 | 'queue': 'smp', 109 | 'memInGB': 23, 110 | 'walltime': "3:00:00" 111 | }, 112 | 'callIndels': { 113 | 'command': "./GenomeAnalysisTK 12 -T UnifiedGenotyper -nt 8 -R %ref -I %bam --dbsnp %dbsnp -stand_call_conf 50.0 -stand_emit_conf 10.0 -dcov 1600 -l INFO -A AlleleBalance -A DepthOfCoverage -A FisherStrand -glm INDEL -log %log -o %out", 114 | 'queue': 'smp', 115 | 'memInGB': 23, 116 | 'walltime': "3:00:00" 117 | }, 118 | 'filterSNPs': { 119 | # Very minimal hard filters based on GATK recommendations. VQSR is preferable if possible. 120 | 'command': "./GenomeAnalysisTK 4 -T VariantFiltration -R %ref --variant %vcf --filterExpression 'QD < 2.0 || MQ < 40.0 || FS > 60.0 || HaplotypeScore > 13.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0' --filterName 'GATK_MINIMAL_FILTER' -log %log -o %out", 121 | }, 122 | 'filterIndels': { 123 | # Very minimal hard filters based on GATK recommendations. 
VQSR is preferable if possible. 124 | # If you have 10 or more samples GATK also recommends the filter InbreedingCoeff < -0.8 125 | 'command': "./GenomeAnalysisTK 4 -T VariantFiltration -R %ref --variant %vcf --filterExpression 'QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0' --filterName 'GATK_MINIMAL_FILTER' -log %log -o %out", 126 | }, 127 | 'annotateEnsembl': { 128 | # This command as written assumes that VEP and its cache have been 129 | # downloaded in respective locations 130 | # ./variant_effect_predictor_2.5 131 | # ./variant_effect_predictor_2.5/vep_cache 132 | 'command': "perl variant_effect_predictor_2.5/variant_effect_predictor.pl --cache --dir variant_effect_predictor_2.5/vep_cache -i %vcf --vcf -o %out -species human --canonical --gene --protein --sift=b --polyphen=b > %log", 133 | 'modules': [ "perl/5.10.1", "ensembl/67" ] 134 | }, 135 | 'depthOfCoverage': { 136 | 'command': "./GenomeAnalysisTK 4 -T DepthOfCoverage -R %ref -I %bam -omitBaseOutput -ct 1 -ct 10 -ct 20 -ct 30 -o %out", 137 | }, 138 | 'collateReadcounts': { 139 | 'command': 'python count_flagstat_wgs.py %dir %outdir', 140 | 'walltime': "00:10:00" 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /vcftools_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | bgzip -c "$1" > "$1.gz" 3 | tabix -p vcf "$1.gz" 4 | --------------------------------------------------------------------------------
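The indexVCF stage runs this script as ./vcftools_prepare.sh %vcf, leaving the original VCF in place and writing a bgzipped copy plus a tabix index alongside it. A usage sketch with an illustrative filename (any filtered VCF produced by the pipeline works the same way):

    ./vcftools_prepare.sh NA12878wgs.SNP.filtered.vcf
    # creates NA12878wgs.SNP.filtered.vcf.gz and NA12878wgs.SNP.filtered.vcf.gz.tbi
    tabix NA12878wgs.SNP.filtered.vcf.gz 17:39700000-39800000
    vcftools --gzvcf NA12878wgs.SNP.filtered.vcf.gz --freq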