├── bin
    ├── nop.sh
    ├── .DS_Store
    ├── hugeseq_mod.sh
    ├── remove_bam.sh
    ├── prep.sh
    ├── clean_nodup.sh
    ├── breakseq.sh
    ├── fix_bai_name.py
    ├── samtools_index.sh
    ├── picard_nodup.sh
    ├── picard_sort.sh
    ├── bwa_bam.sh
    ├── write_refcalls.sh
    ├── bwa_fq.sh
    ├── bin_bam.sh
    ├── merge_gff.sh
    ├── combine_vcf.sh
    ├── util.py
    ├── breakdancer.sh
    ├── cnvnator.sh
    ├── sjm.py
    ├── gatk_realn.sh
    ├── pindel.sh
    ├── gatk_recal.sh
    ├── vqsr_indel.sh
    ├── vqsr_snp.sh
    ├── annotate.py
    ├── gatk_vc.sh
    └── hugeseq
├── .DS_Store
├── LICENSE
├── RELEASENOTES
├── modulefiles
    └── hugeseq
    │   └── 2.0
└── README


/bin/nop.sh:
--------------------------------------------------------------------------------
#!/bin/bash -eu
# Placeholder step: prints a notice that there is no work and does nothing else.
printf '%s\n' "nothing to be done!"
3 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StanfordBioinformatics/HugeSeq/HEAD/.DS_Store


--------------------------------------------------------------------------------
/bin/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StanfordBioinformatics/HugeSeq/HEAD/bin/.DS_Store


--------------------------------------------------------------------------------
/bin/hugeseq_mod.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | 
 3 | module load hugeseq/2.0
 4 | export TMP=$1
 5 | echo $TMP
 6 | shift
 7 | export LOGFILE=$1
 8 | echo $LOGFILE
 9 | shift
10 | 
11 | $*
12 | 


--------------------------------------------------------------------------------
/bin/remove_bam.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | echo "*** SAM/BAM Removel ***"
 4 | 
 5 | for i in $*
 6 | do
 7 | 	if [ -z "${i/*.bam/}" -o -z "${i/*.sam/}" ]
 8 | 	then
 9 | 		echo ">> Removing SAM/BAM file: $i"
10 | 		rm -f $i $i.bai
11 | 	fi
12 | done
13 | 
14 | echo "*** Finished SAM/BAM Removal ***"
15 | 


--------------------------------------------------------------------------------
/bin/prep.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | if [ $# -lt 2 ]
 4 | then
 5 | 	echo "Usage: $0 <input fasta/q> <link to input>"
 6 | 	exit 1
 7 | fi
 8 | 
 9 | i=`cd \`dirname $1\`; pwd`/`basename $1`
10 | l=`cd \`dirname $2\`; pwd`/`basename $2`
11 | 
12 | echo ">> Creating link to input sequence file"
13 | echo "-- Input: $i"
14 | echo "-- Link : $l"
15 | 
16 | if [ ! -e $l ]
17 | then
18 | 	ln -sf $i $l
19 | fi
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Product Name: 
 2 | HugeSeq v2.0
 3 | 
 4 | Description: 
 5 | An integrated pipeline for detecting and annotating genetic variations using high-throughput genome sequencing.
 6 | 
 7 | Copyright:
 8 | Stanford Center for Genomics and Personalized Medicine (SCGPM), Stanford School of Medicine, Stanford, California.
 9 | 
10 | Download: 
11 | https://github.com/StanfordBioinformatics/HugeSeq 
12 | 
13 | License:
14 | This work is licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/3.0/.


--------------------------------------------------------------------------------
/bin/clean_nodup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Removing duplicates ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then 
 7 | 	echo "Usage: $0 <bam> <out> [remove, default: true]"
 8 | 	exit 1
 9 | fi
10 | 
11 | rmdup="true"
12 | if [ $# -gt 2 ]
13 | then
14 | 	rmdup=$3
15 | fi
16 | 
17 | f=`cd \`dirname $1\`; pwd`/`basename $1`
18 | o=`cd \`dirname $2\`; pwd`/`basename $2`
19 | 
20 | echo ">>> Marking duplicates"
21 | java -Xms5g -Xmx5g -jar $PICARD/MarkDuplicates.jar \
22 | 	TMP_DIR=$TMP \
23 | 	I=${f} \
24 | 	O=${o} \
25 | 	M=${o/.bam/.metrics} \
26 | 	VALIDATION_STRINGENCY=SILENT \
27 | 	ASSUME_SORTED=true \
28 | 	REMOVE_DUPLICATES=$rmdup
29 | 
30 | echo "*** Finished removing duplicates ***"
31 | 


--------------------------------------------------------------------------------
/bin/breakseq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Calling SV using BreakSeq: $BREAKSEQ ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then
 7 | 	echo "Usage: $0 <output> <bam>..."
 8 | 	exit 1
 9 | fi
10 | 
11 | output=`cd \`dirname $1\`; pwd`/`basename $1`
12 | shift
13 | 
14 | bams=''
15 | for i in $*
16 | do
17 |         bams="$bams `cd \`dirname $i\`; pwd`/`basename $i`"
18 | done
19 | 
20 | echo ">> Invoking the BreakSeq (Lite) program (Library: $BPLIB)"
21 | $BREAKSEQ/bin/breakseq $output $bams
22 | 
23 | if [ -e "${output}.2" ]
24 | then
25 | 	rm ${output}.2
26 | fi
27 | sort -k1,1 -k4n -k5n < $output >> ${output}.2
28 | mv ${output}.2  $output
29 | 
30 | echo "*** Finished Calling SV using BreakSeq ***"
31 | 


--------------------------------------------------------------------------------
/bin/fix_bai_name.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python
 2 | 
 3 | ## Fixes BAM index file name to match what GATK expects.
 4 | 
 5 | import os
 6 | import sys
 7 | 
 8 | 
 9 | orig_filename = sys.argv[1]
10 | 
11 | if not os.path.exists(orig_filename):
12 |     print "Cannot find file", sys.argv[1]
13 |     raise SystemExit(1)
14 | if not orig_filename.endswith('.bam.bai'):
15 |     print "Filename doesn't end with .bam.bai"
16 |     raise SystemExit(0)
17 | 
18 | new_filename = orig_filename[:-8] + '.bai'
19 | 
20 | if os.path.exists(new_filename):
21 |     print "Existing file %s renamed to %s.old" % (new_filename, new_filename)
22 |     os.rename(new_filename, new_filename + '.old')
23 | 
24 | os.link(orig_filename, new_filename)
25 | 


--------------------------------------------------------------------------------
/bin/samtools_index.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Indexing2 BAM ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Indexing BAM ***" >> $LOGFILE
 6 | 
 7 | if [ $# -lt 1 ]
 8 | then 
 9 | 	echo "Usage: $0 <bam>"
10 | 	exit 1
11 | fi
12 | 
13 | f=`cd \`dirname $1\`; pwd`/`basename $1`
14 | 
15 | if [ ! -e $f.bai ]
16 | then
17 | 	
18 | 	command="samtools index $f"
19 | 	echo ">>> BAM file $f is being indexed"
20 | 	echo ">>> BAM file $f is being indexed" >> $LOGFILE
21 | 	echo ">>> $command"
22 | 	$command
23 | 	
24 | 	echo ">>> Fixing BAI name"	
25 | 	echo ">>> Fixing BAI name" >> $LOGFILE	
26 | 	command="python $HUGESEQ_HOME/bin/fix_bai_name.py $f.bai"
27 | 	echo ">>> $command"
28 | 	echo ">>> $command" >> $LOGFILE
29 | 	$command
30 | fi
31 | 
32 | echo "*** Finished indexing BAM ***"
33 | 


--------------------------------------------------------------------------------
/bin/picard_nodup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Marking duplicates ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then 
 7 | 	echo "Usage: $0 <bam> <out> [remove, default: false]"
 8 | 	exit 1
 9 | fi
10 | 
11 | input=`cd \`dirname $1\`; pwd`/`basename $1`
12 | output=`cd \`dirname $2\`; pwd`/`basename $2`
13 | 
14 | cp $input $output
15 | 
16 | command="java -Xms5g -Xmx5g -jar $PICARD/MarkDuplicates.jar \
17 | 	TMP_DIR=${TMP} \
18 | 	I=${input} \
19 | 	O=${output} \
20 | 	M=${output/.bam/.metrics} \
21 | 	VALIDATION_STRINGENCY=SILENT \
22 | 	ASSUME_SORTED=true \
23 | 	REMOVE_DUPLICATES=false"
24 | 
25 | echo ">>> Marking duplicates"
26 | echo ">>> Marking duplicates" >> $LOGFILE
27 | echo ">>> $command"
28 | echo ">>> $command" >> $LOGFILE
29 | $command
30 | 
31 | echo "*** Finished marking duplicates ***"
32 | 


--------------------------------------------------------------------------------
/bin/picard_sort.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Sorting BAM by position ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then 
 7 | 	echo "Usage: $0 <bam> <sorted> [memory in GB]"
 8 | 	exit 1
 9 | fi
10 | 
11 | f=`cd \`dirname $1\`; pwd`/`basename $1`
12 | o=`cd \`dirname $2\`; pwd`/`basename $2`
13 | 
14 | if [ "$2" = '-' ]
15 | then
16 | 	o=$f.sorted
17 | fi
18 | 
19 | gmem=5
20 | if [ $# -gt 2 ]
21 | then
22 | 	gmem=$3
23 | fi
24 | 
25 | command="java -Xms${gmem}g -Xmx${gmem}g -jar $PICARD/SortSam.jar \
26 | 	TMP_DIR=$TMP \
27 | 	INPUT=$f \
28 | 	OUTPUT=$o \
29 | 	MAX_RECORDS_IN_RAM=$(($gmem*250000)) \
30 | 	VALIDATION_STRINGENCY=SILENT \
31 | 	SORT_ORDER=coordinate"
32 | 
33 | echo ">>> Sorting on BAM $f"
34 | echo ">>> Sorting on BAM $f" >> $LOGFILE
35 | echo ">>> $command"
36 | echo ">>> $command" >> $LOGFILE
37 | $command
38 | 
39 | if [ "$2" = '-' ]
40 | then
41 | 	mv $o $f
42 | 	o=$f
43 | fi
44 | 
45 | echo "*** Finished sorting BAM by position ***"
46 | 


--------------------------------------------------------------------------------
/bin/bwa_bam.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Aligning reads using BWA MEM algorithm ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Aligning reads using BWA MEM algorithm ***" >> $LOGFILE
 6 | 
 7 | if [ $# -lt 1 ]
 8 | then 
 9 | 	echo "Usage: $0 <fastq1> <fastq2> [num of threads] [RG tag]  or $0 <bam> [num of threads] [RG tag]"
10 | 	exit 1
11 | fi
12 | 
13 | 
14 | bam=`cd \`dirname $1\`; pwd`/`basename $1`
15 | optRG=""
16 | lastArg=${BASH_ARGV[0]}
17 | if [[ $lastArg =~ "@RG" ]]
18 | then
19 |         optRG="-R $lastArg"
20 | fi
21 | 
22 | optT=""
23 | seclastArg=${@: -2:1}
24 | optT="-t $seclastArg"
25 | 
26 | echo ">> BAM input"
27 | echo ">> BAM input" >> $LOGFILE
28 | command="samtools bam2fq $bam | bwa mem -CMp $optT $optRG $REF - | samtools view -Sbt $REF.fai -o ${bam/.bam/}.bwa.bam -"
29 | echo ">>> $command"
30 | samtools bam2fq $bam | bwa mem -CMp $optT $optRG $REF - | samtools view -Sbt $REF.fai -o ${bam/.bam/}.bwa.bam -
31 | echo ">>> $command" >> $LOGFILE
32 | echo "*** Finished aligning reads ***"
33 | 


--------------------------------------------------------------------------------
/bin/write_refcalls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo ">>> Writing reference calls"
 4 | echo "   " >> $LOGFILE
 5 | echo ">>> Writing reference calls" >> $LOGFILE
 6 | 
 7 | set -e
 8 | 
 9 | if [ $# -lt 1 ]
10 | then
11 |         echo "Usage: $0 <vcf>"
12 |         exit 1
13 | fi
14 | 
15 | START_VCF=`cd \`dirname $1\`; pwd`/`basename $1`
16 | PREFIX=`dirname $START_VCF`
17 | SUFFIX=`basename $START_VCF`
18 | SAMPLE=${SUFFIX/.gatk.vcf/}
19 | 
20 | REFCALL_VCF=$PREFIX/$SAMPLE.refcalls.vcf
21 | 
22 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
23 |    -T SelectVariants \
24 |    -R $REF \
25 |    -V $START_VCF \
26 |    -o $REFCALL_VCF \
27 |    -selectType NO_VARIATION"
28 | 
29 | echo ">>> Select reference calls"
30 | echo ">>> Select reference calls" >> $LOGFILE
31 | echo ">>> $command &> $PREFIX/$SAMPLE.select.refcalls.log"
32 | echo ">>> $command &> $PREFIX/$SAMPLE.select.refcalls.log" >> $LOGFILE
33 | $command &> $PREFIX/$SAMPLE.select.refcalls.log 
34 | 
35 | echo ">>> Finished writine reference calls"
36 | 


--------------------------------------------------------------------------------
/bin/bwa_fq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Aligning reads using BWA MEM algorithm ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Aligning reads using BWA MEM algorithm ***" > $LOGFILE
 6 | 
 7 | if [ $# -lt 1 ]
 8 | then 
 9 | 	echo "Usage: $0 <fastq1> <fastq2> [num of threads] [RG tag]  or $0 <bam> [num of threads] [RG tag]"
10 | 	exit 1
11 | fi
12 | 
13 | fq1=`cd \`dirname $1\`; pwd`/`basename $1`
14 | fq2=`cd \`dirname $2\`; pwd`/`basename $2`
15 | 
16 | optRG=""
17 | lastArg=${BASH_ARGV[0]}
18 | if [[ $lastArg =~ "@RG" ]]
19 | then
20 |         optRG="-R $lastArg"
21 | fi
22 | optRG="-R @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE"
23 | 
24 | optT=""
25 | seclastArg=${@: -2:1}
26 | optT="-t $seclastArg"
27 | 
28 | if [[ ${fq1: -6} == ".fastq" ]]
29 | then
30 |         fq=$(echo $fq1 | sed -e "s/.fastq//g")
31 |         output="${fq}bam"
32 | elif [[ ${fq1: -9} == ".fastq.gz" ]]
33 | then
34 |         fq=$(echo $fq1 | sed -e "s/.fastq.gz//g")
35 | fi
36 | 
37 | command="bwa mem $REF $fq1 $fq2 $optT $optRG | samtools view -Sbt $REF.fai -o $fq.bwa.bam -"
38 | echo ">>> $command"
39 | echo ">>> $command" >> $LOGFILE
40 | bwa mem $REF $fq1 $fq2 $optT $optRG | samtools view -Sbt $REF.fai -o $fq.bwa.bam -
41 | 
42 | echo "*** Finished aligning reads ***"
43 | 
44 | 


--------------------------------------------------------------------------------
/bin/bin_bam.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Splitting BAM by chromosome ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Splitting BAM by chromosome ***" >> $LOGFILE
 6 | 
 7 | if [ $# -lt 3 ]
 8 | then
 9 |         echo "Usage: $0 <chr> <output> <bam>..."
10 |         exit 1
11 | fi
12 | 
13 | chr=$1
14 | out=`cd \`dirname $2\`; pwd`/`basename $2`
15 | shift 2
16 | 
17 | bams=''
18 | 
19 | for f in $*
20 | do
21 | 	f=`cd \`dirname $f\`; pwd`/`basename $f`
22 | 
23 | 	echo ">>> Extracting $chr from BAM: $f"
24 | 	echo ">>> Extracting $chr from BAM: $f" >> $LOGFILE
25 | 	o=${f/.bam/}.$chr.bam
26 | 
27 | 	if [ $chr = 'UNK' -o $chr = 'chrU' -o $chr = 'U' ]
28 | 	then
29 | 		command="samtools view $f -f 12 -bo $o"
30 | 		samtools view $f -f 12 -bo $o
31 | 	else
32 | 		command="samtools view $f $chr -bo $o"
33 | 		samtools view $f $chr -bo $o
34 | 	fi
35 | 	echo ">>> $command"
36 | 	echo ">>> $command" >> $LOGFILE
37 | 	bams="$bams $o"
38 | done
39 | 
40 | echo ">>> Merging $chr BAMs into $out"
41 | echo ">>> Merging $chr BAMs into $out" >> $LOGFILE
42 | if [ $# -gt 1 ]
43 | then
44 | 	command="samtools merge $out $bams"
45 | 	samtools merge $out $bams
46 | 	echo ">>> $command"
47 | 	echo ">>> $command" >> $LOGFILE
48 | 	rm $bams
49 | else
50 | 	mv $bams $out
51 | fi
52 | 
53 | echo "*** Finished splitting BAM by chromosome ***"
54 | 


--------------------------------------------------------------------------------
/bin/merge_gff.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Performing GFF Merging ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then
 7 | 	echo "Usage: $0 <output> <GFF file to merge>..."
 8 | 	exit 1
 9 | fi
10 | 
11 | output=`cd \`dirname $1\`; pwd`/`basename $1`
12 | input=${output/.gff/}.raw.gff
13 | 
14 | shift
15 | 
16 | inputs=''
17 | for i in $*
18 | do
19 | 	inputs="$inputs `cd \`dirname $i\`; pwd`/`basename $i`"
20 | done
21 | 
22 | cat $inputs > $input
23 | 
24 | features="`cut -f 3 $input | sort -u`"
25 | sources="`cut -f 2 $input | sort -u`"
26 | 
27 | for feat in $features
28 | do
29 | 	f=$input.${feat}
30 | 	grep "	$feat	" $input > $f
31 | 	for src in $sources
32 | 	do
33 | 		grep "	$src	" $f | mergeBed -i stdin > $f.$src
34 | 	done
35 | 	cat $f.* > $f
36 | 	> $f.dup
37 | 	> $f.uni
38 | 	intersectBed -a $f -b $f -f 0.5 -r -c | awk '{print $0 > ($NF>1? "'$f.dup'": "'$f.uni'")}'
39 | 	if [ -s $f.uni ]; 
40 | 	then
41 | 		mergeBed -c -i $f.uni > $f.uni.merged
42 | 	fi
43 | 	if [ -s $f.dup ]; 
44 | 	then
45 | 		mergeBed -c -i $f.dup > $f.dup.merged
46 | 	fi
47 | 	for i in $f.*.merged
48 | 	do
49 | 		dup="0"
50 | 		if [ -n "${i/*.uni.merged/}" ]; then dup="1"; fi
51 | 		awk -F '\t' '{qual="LowQual"; if ('$dup' && $4>=2) qual="PASS"; print $1"\tHugeSeq\t'$feat'\t"$2"\t"$3"\t"qual"\t.\t.\tEVENTS "$4}' $i
52 | 	done
53 | 	rm $f $f.*
54 | done > $output
55 | 


--------------------------------------------------------------------------------
/bin/combine_vcf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | if [ $# -lt 2 ]
 4 | then 
 5 | 	echo "Usage: $0 <bam> <out>"
 6 | 	exit 1
 7 | fi
 8 | 
 9 | out=`cd \`dirname $1\`; pwd`/`basename $1`
10 | shift
11 | 
12 | noop=$1
13 | shift
14 | 
15 | zip=$1
16 | shift
17 | 
18 | inputs=`cd \`dirname $1\`; pwd`/`basename $1`
19 | shift
20 | for i in $*
21 | do
22 |        	inputs="$inputs --variant `cd \`dirname $i\`; pwd`/`basename $i`"
23 | done
24 | 
25 | if [[ "$noop" == "False" ]]
26 | then
27 | 	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
28 | 	   -R $REF \
29 | 	   -T CombineVariants \
30 | 	   --variant $inputs \
31 | 	   -o $out"
32 | 
33 | 	echo ">>> Combining VCFs"
34 | 	echo ">>> Combining VCFs" >> $LOGFILE
35 | 	echo ">>> $command"
36 | 	echo ">>> $command" >> $LOGFILE
37 | 	$command
38 | 
39 | 	if [[ "$zip" == "True" ]]
40 | 	then
41 | 		command="bgzip -fc $out"
42 | 	        echo ">>> Zipping VCF"
43 |         	echo ">>> $command"
44 | 	        echo ">>> $command" >> $LOGFILE
45 |         	$command &> $out.gz
46 | 
47 | 	        command="tabix -p vcf $out.gz"
48 |         	echo ">>> Indexing VCF using tabix"
49 | 	        #echo ">>> Indexing VCF using tabix" >>> $LOGFILE
50 |         	echo ">>> $command"
51 | 	        echo ">>> $command" >> $LOGFILE
52 |         	$command
53 | 	fi
54 | fi
55 | echo "*** Finished combining VCFs ***"
56 | 


--------------------------------------------------------------------------------
/bin/util.py:
--------------------------------------------------------------------------------
 1 | import os, re
 2 | 
 3 | class File:
 4 | 
 5 | 	def __init__(self, path, name=None):
 6 | 		fullpath=str(path) if name is None else os.path.join(str(path), str(name))
 7 | 		self.path=os.path.abspath(fullpath)
 8 | 		self.dir=os.path.dirname(self.path)
 9 | 		self.name=os.path.basename(self.path)
10 | 		nmatch=re.match(r"(.+)\.([^.]+)$", self.name)
11 | 		self.prefix=self.name if nmatch is None else nmatch.group(1)
12 | 		self.ext='' if nmatch is None else nmatch.group(2)
13 | 		self.absprefix=os.path.join(self.dir, self.prefix)
14 | 
15 | 	def __str__(self):
16 | 		return self.path
17 | 
18 | 	def chdir(self, dir):
19 | 		return File(dir, self.name)
20 | 
21 | 	def chext(self, ext):
22 | 		return File(self.absprefix+"."+ext)
23 | 	
24 | 	def appext(self, ext):
25 | 		return File(self.dir + "/" + self.name + "." + ext)
26 | 
27 | 	def exists(self):
28 | 		return os.path.exists(self.path)
29 | 
30 | 	def desc():
31 | 		s=""
32 | 		members = [attr for attr in dir(self) if not callable(attr) and not attr.startswith("__")]
33 | 		for member in members:
34 | 			s+="%s:\t%s\n"%(member,getattr(self, member))
35 | 		return s
36 | 
class Dir(File):
	"""A File that stands for a directory and can create itself."""

	def __init__(self, path, name=None):
		File.__init__(self, path, name)

	def mkdirs(self):
		"""Create the directory and any missing parents.

		Returns False when the path already exists; otherwise the
		directory is created and None is returned.
		"""
		if not self.exists():
			os.makedirs(self.path)
			return None
		return False
47 | 


--------------------------------------------------------------------------------
/bin/breakdancer.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Calling SV using Read-Pair Mapping: $BREAKDANCER ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then
 7 | 	echo "Usage: $0 <output> <bam>..."
 8 | 	exit 1
 9 | fi
10 | 
11 | o=`cd \`dirname $1\`; pwd`/`basename $1`
12 | p=${o/.gff/}
13 | shift
14 | 
15 | bams=''
16 | for i in $*
17 | do
18 | 	bams="$bams `cd \`dirname $i\`; pwd`/`basename $i`"
19 | done
20 | 
21 | optO=''
22 | if [ $# -eq 1 ]
23 | then
24 |         if [[ "$1" =~ ".*chr[^.]*\..*" ]]
25 |         then
26 |                 chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
27 |         fi
28 | fi
29 | 
30 | echo ">> Generating configuration file $bams"
31 | perl $BREAKDANCER/perl/bam2cfg.pl $bams > $p.cfg
32 | 
33 | echo ">> Performing read-pair mapping"
34 | $BREAKDANCER/cpp/breakdancer_max $optO $p.cfg > $p.txt
35 | 
36 | echo ">> Converting output to GFF"
37 | minsize=50
38 | awkopt='{size=$8; if (size<0 && $7=="INS") size=-size; feat=$7; if ($7=="DEL") feat="Deletion"; else if ($7=="INS") feat="Insertion"; else if ($7=="INV") feat="Inversion"; if (feat!="ITX" && $1==$4 && size>='$minsize') print $1"\tBreakDancer\t"feat"\t"($2<=$5?$2:$5)"\t"($5>=$2?$5:$2)"\t"$9"\t.\t.\tSize "size"; nr.reads: "$10};'
39 | 
40 | if [ -e "$o" ]
41 | then
42 |         rm $o
43 | fi
44 | 
45 | grep -v '\#' $p.txt | awk "$awkopt" | sort -k1,1 -k4n -k5n -u >> $o
46 | 
47 | echo "*** Finished Calling SV using Read-Pair Mapping ***"
48 | 


--------------------------------------------------------------------------------
/bin/cnvnator.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Calling CNV using Read-Depth Analysis: $CNVNATOR ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then
 7 | 	echo "Usage: $0 <output> <root/bam...>"
 8 | 	exit 1
 9 | fi
10 | 
11 | CNVNATOR=$CNVNATOR/cnvnator
12 | 
13 | binsize=100
14 | 
15 | o=`cd \`dirname $1\`; pwd`/`basename $1`
16 | shift
17 | 
18 | bams=''
19 | for i in $*
20 | do
21 | 	bams="$bams `cd \`dirname $i\`; pwd`/`basename $i`"
22 | done
23 | 
24 | 
25 | if [ $# -eq 1 ]
26 | then
27 |         if [[ "$1" =~ ".*chr[^.]*\..*" ]]
28 |         then
29 |                 chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
30 | 		CNVNATOR="$CNVNATOR -chrom $chr"
31 |         fi
32 | fi
33 | 
34 | if [ -z "${1/*.root/}" ]
35 | then
36 | 	echo ">> Processing root file: $1"
37 | 	p=${1/.root/}
38 | else
39 | 	echo ">> Extracting read mapping from bam files: $CNVNATOR -root ${o/.gff/}.root -tree $bams"
40 | 	p=${o/.gff/}
41 | 	$CNVNATOR -root $p.root -tree $bams
42 | fi
43 | 
44 | (
45 | echo ">> Generating histogram"
46 | $CNVNATOR -root $p.root -his $binsize -d `dirname $REF`
47 | echo ">> Calculating statistics"
48 | $CNVNATOR -root $p.root -stat $binsize
49 | echo ">> RD signal partitioning"
50 | $CNVNATOR -root $p.root -partition $binsize
51 | echo ">> CNV calling"
52 | $CNVNATOR -root $p.root -call $binsize | grep -v "\(WARN\)\|\(==\)" > $p.txt
53 | ) 2>&1 | grep -v bound | grep -v Zero | grep -v corrected
54 | 
55 | echo ">> Converting output to GFF"
56 | 
57 | minsize=50
58 | awkopt='{feat=$4; if ($4=="deletion") feat="Deletion"; else if ($4=="duplication") feat="Duplication"; if ($5>='$minsize') print $1"\tCNVnator\t"feat"\t"$2"\t"$3"\t"$7" "$8" "$9" "$10" "$11"\t.\t.\tSize "$5";RD "$6};'
59 | 
60 | if [ -e "$o" ]
61 | then
62 | 	rm $o
63 | fi 
64 | cut -f 1,2,3,4,5,6 $p.txt | sed 's/\(.*\)	\(.*\):\(.*\)-\(.*\)	\(.*\)	\(.*\)	\(.*\)	\(.*\)/\2	\3	\4	\1	\5	\6	p1: \7 | p2: \8/' | awk "$awkopt" | sort -k1,1 -k4n -k5n -u >> $o
65 | 
66 | echo "*** Finished CNV Calling using Read-Depth Analysis"
67 | 


--------------------------------------------------------------------------------
/RELEASENOTES:
--------------------------------------------------------------------------------
 1 | ##################################
 2 | # Release Notes for HugeSeq v2.0 #
 3 | ##################################
 4 | 
 5 | ++++ 7/14/2015 ++++
 6 | 
 7 | NOTE:  This is a beta release.  Several components have been modified in the package to accommodate the following changes.
 8 | 
 9 | ## Tool "required" upgrades ##
10 | 
11 | The following upgrades are "required" for HugeSeq-2.0 to operate properly:
12 | 
13 | * GATK -> 3.2.2
14 | * root -> 5.34.30
15 | * pindel -> 0.2.4t
16 | * vcftools -> 0.1.12
17 | 
18 | ## Major changes since 1.2 ##
19 | 
20 | - The user can now choose to use HaplotypeCaller as well as UnifiedGenotyper for SNPs and/or Indels.
21 | - The user can set GATK to output reference-calls. The standard format for these calls is gVCF. The file carrying the reference calls is generated in addition to the standard VCF that carries only the variants.
22 | - The user can now choose to switch off VQSR for either or both of SNP and Indels.
23 | - The user can now choose targeted genotyping by means of using exome target captures as input.
24 | - The failed jobs at the VQSR stage can now be repeated without running the genotyper again by means of a new input parameter "--donegenotyping".
25 | - The pipeline is now optimized for targeted genotyping using GATK's features for exome data processing (in --targeted mode).
26 | - The pipeline is now capable of running VQSR on the whole data set OR on chromosomes individually. This is independent of the capability to split the data into chromosomes for faster genotyping (i.e. the user can choose to do the genotyping for individual chromosomes but run VQSR on all the chromosomes collectively.)
27 | - HugeSeq now uses Stanovar-0.1 for annotation of variants. 
28 | 
29 | ## Open issues ##
30 | * Package has never been tested on external systems.  Unforeseen issues may arise.
31 | 
32 | ## Additional documents
33 | 
34 | * Refer to README file for information on Installation and Execution of the pipeline including background information about directory structure and modules used in the pipeline.
35 | 


--------------------------------------------------------------------------------
/bin/sjm.py:
--------------------------------------------------------------------------------
 1 | class Job:
 2 | 
 3 | 	time=None
 4 | 	memory=None
 5 | 	queue=None
 6 | 	project=None
 7 | 	status=None
 8 | 	log_dir=None
 9 | 	cmd_prefix=None
10 | 	cmd_separator='&&'
11 | 	name_prefix=None
12 | 	sge_options=None
13 | 
14 | 	def __init__(self, name=None):
15 | 		self.name=name
16 | 		if self.name_prefix is not None and self.name is not None:
17 | 			self.name=self.name_prefix+self.name
18 | 		self.status=None
19 | 		self.cmds=[]
20 | 		self.dependents=[]
21 | 
22 | 	def __str__(self):
23 | 		s='job_begin\n'
24 | 		if self.name is not None:
25 | 			s+='\tname %s\n'%self.name
26 | 		if self.time is not None:
27 | 			s+='\ttime %s\n'%self.time
28 | 		if self.memory is not None:
29 | 			s+='\tmemory %s\n'%self.memory
30 | 		if self.queue is not None:
31 | 			s+='\tqueue %s\n'%self.queue
32 | 		if self.project is not None:
33 | 			s+='\tproject %s\n'%self.project
34 | 		if self.status is not None:
35 | 			s+='\tstatus %s\n'%self.status
36 | 		if self.sge_options is not None:
37 | 			s+='\tsge_options %s\n'%self.sge_options
38 | 		if len(self.cmds)>0:
39 | 			s+='\tcmd_begin\n'
40 | 			s+=(' %s\n'%('' if self.cmd_separator is None else self.cmd_separator)).join(['\t\t%s %s'%(('' if self.cmd_prefix is None else self.cmd_prefix), cmd) for cmd in self.cmds])+"\n"
41 | 			s+='\tcmd_end\n'
42 | 		s+='job_end\n'
43 | 		return s
44 | 
45 | 	def done(self):
46 | 		self.status='done'
47 | 
48 | 	def append(self, cmd):
49 | 		self.cmds.append(cmd)
50 | 		return self
51 | 
52 | 	def depend(self, *jobs):
53 | 		if jobs is not None:
54 | 			for job in jobs:
55 | 				if job is not None:
56 | 					self.dependents.append(job)
57 | 		return self
58 | 
59 | 	def order(self, history=[]):
60 | 		s=''
61 | 		for dependent in self.dependents:
62 | 			s+=dependent.order(history)
63 | 			order=(dependent.name, self.name)
64 | 			if self.name is not None and order not in history:
65 | 				s+= "order %s before %s\n" % order
66 | 				history.append(order)
67 | 		return s
68 | 
69 | 	def traverse(self, history=[]):
70 | 		s=''
71 |                 for dependent in self.dependents:
72 |                         s+=dependent.traverse(history)
73 | 		if self.name is not None and self not in history:
74 | 			s+=str(self)
75 | 			history.append(self)
76 |                 return s
77 | 
78 | 	def desc(self):
79 | 		s=self.traverse()
80 | 		s+=self.order()
81 | 		if self.log_dir is not None:
82 | 			s+='log_dir %s\n'%self.log_dir
83 | 		return s
84 | 


--------------------------------------------------------------------------------
/bin/gatk_realn.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Realigning targeted regions ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Realigning targeted regions ***" >> $LOGFILE
 6 | 
 7 | if [ $# -lt 2 ]
 8 | then 
 9 | 	echo "Usage: $0 <bam> <out>"
10 | 	exit 1
11 | fi
12 | 
13 | f=`cd \`dirname $1\`; pwd`/`basename $1`
14 | o=`cd \`dirname $2\`; pwd`/`basename $2`
15 | relax_realign=$3
16 | 
17 | optL=''
18 | if [[ "$1" =~ .*chr[^\.]*\..* ]]
19 | then
20 | 	chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
21 | 	optL="-L $chr"
22 | fi
23 | 
24 | RELAX=""
25 | if [[ "$relax_realign" == "True" ]]
26 | then
27 |         RELAX="--defaultBaseQualities 0 --filter_bases_not_stored --filter_mismatching_base_and_quals --filter_reads_with_N_cigar"
28 | fi
29 | 
30 | command="java -Xms8g -Xmx8g -jar $GATK/GenomeAnalysisTK.jar \
31 | 	-T RealignerTargetCreator \
32 | 	-I $f \
33 | 	-R $REF \
34 | 	-o ${o/.bam/.intervals} $optL \
35 |         -known $MILLS_1K_GOLD_INDELS \
36 | 	-known $GOLD_1K_INDELS \
37 | 	-et NO_ET \
38 |         -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key"
39 |  
40 | echo ">>> Determining (small) suspicious intervals which are likely in need of realignment"
41 | echo ">>> Determining (small) suspicious intervals which are likely in need of realignment" >> $LOGFILE
42 | echo ">>> $command"
43 | echo ">>> $command" >> $LOGFILE
44 | $command
45 | 
46 | command="java -Xms8g -Xmx8g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \
47 | 	-T IndelRealigner \
48 | 	-I $f \
49 | 	-R $REF \
50 | 	-o $o $optL \
51 | 	-targetIntervals ${o/.bam/.intervals} \
52 |         -known $MILLS_1K_GOLD_INDELS \
53 |         -known $GOLD_1K_INDELS \
54 | 	-et NO_ET $RELAX\
55 | 	-K /srv/gs1/software/gatk/GATKkey/stanford.edu.key"
56 |  
57 | echo ">>> Running the realigner over the targeted intervals"
58 | echo ">>> Running the realigner over the targeted intervals" >> $LOGFILE
59 | echo ">>> $command"
60 | echo ">>> $command" >> $LOGFILE
61 | $command
62 | 
63 | command="java -Xms8g -Xmx8g -jar $PICARD/FixMateInformation.jar \
64 | 	TMP_DIR=$TMP \
65 | 	INPUT=$o \
66 | 	VALIDATION_STRINGENCY=SILENT \
67 | 	SORT_ORDER=coordinate"
68 | 
69 | echo ">>> Fixing the mate pairs and order of the realigned reads"
70 | echo ">>> Fixing the mate pairs and order of the realigned reads" >> $LOGFILE
71 | echo ">>> $command"
72 | echo ">>> $command" >> $LOGFILE
73 | $command
74 | 
75 | echo "*** Finished realigning targeted regions ***"
76 | 


--------------------------------------------------------------------------------
/bin/pindel.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | 
 3 | echo "*** Calling SV using Split-Read Analysis: $PINDEL ***"
 4 | 
 5 | if [ $# -lt 2 ]
 6 | then
 7 | 	echo "Usage: $0 <output> <bam>..."
 8 | 	exit 1
 9 | fi
10 | 
11 | 
12 | o=`cd \`dirname $1\`; pwd`/`basename $1`
13 | odir=`dirname $1`
14 | shift
15 | p=${o/.gff/}
16 | 
17 | rpm_output=${p/.pindel/}.breakdancer.txt
18 | rpm_cfg=${p/.pindel/}.breakdancer.cfg
19 | 
20 | chr="ALL"
21 | if [ $# -eq 1 ]
22 | then
23 | 	if [[ "$1" =~ ".*chr[^.]*\..*" ]]
24 | 	then
25 | 		chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
26 | 	fi
27 | fi
28 | 
29 | bOpt=''
30 | if [ -e $rpm_output ]
31 | then
32 | 	bOpt="-b $rpm_output"
33 | fi
34 | 
35 | cd $odir
36 | pin_cfg=$p.cfg
37 | for bam in $*
38 | do
39 | 	isize=''
40 | 	if [ -e $rpm_cfg ]
41 | 	then
42 | 		cfg=`grep $bam $rpm_cfg | head -n 1`
43 | 		isize=`echo "$cfg" | sed 's/.*mean:\([0-9]*\).*/\1/'`
44 | 	else
45 | 		isize=`samtools view -H $bam 2> /dev/null | grep "@RG" | grep "PI" | head -n 1 | sed 's/.*PI:\([0-9]*\).*/\1/'` 
46 | 	fi
47 | 	if [ -z "$isize" ]
48 | 	then
49 | 		isize=300
50 | 	fi
51 | 
52 | 	sample=`samtools view -H $bam 2> /dev/null | grep "@RG" | grep "SM" | head -n 1| sed 's/.*SM:\([^\t]*\).*/\1/'`
53 | 	if [ -z "$sample" ]
54 | 	then
55 | 		sample="SAMPLE"
56 | 	fi
57 | 
58 | 	echo -e "$bam\t$isize\t$sample"
59 | done > $pin_cfg
60 | pindel="$PINDEL/pindel -f $REF -i $pin_cfg -o $p -c $chr $bOpt"
61 | echo ">> Running Pindel on $chr"
62 | $pindel
63 | 
64 | echo ">> Converting output to GFF"
65 | 
66 | minsize=50
67 | 
68 | AWKOPT='{feat="Unknown"; if ($2=="D" || $2=="DI") feat="Deletion"; else if ($2=="I" || $2=="LI") feat="Insertion"; else if ($2=="INV") feat="Inversion"; else if ($2=="TD") feat="TandemDup"; start=$7; end=$8; if (feat!="Insertion") {start++; end--;} if ($3>='$minsize') print $5"\tPindel\t"feat"\t"start"\t"end"\t"$24"\t.\t.\tSize "$3"; nr.unique reads: (+"$17",-"$20"); ComScore: "int(sqrt(($17+1)*($20+1)*$24))}'
69 | 
70 | #echo -e "#Chr\tProgram\tSV-type\t\tstart\tend\tscore\tstrand\tframe\tattributes" > $o
71 | if [ -e "$o" ]
72 | then
73 |         rm $o
74 | fi
75 | 
76 | for po in ${p}_[^LBT]*
77 | do
78 | 	grep "ChrID" $po | sed 's/D \([0-9]*\)	I \([0-9]*\)/DI \1:\2/' | sed 's/SUM_MS \([0-9]*\)	.*/SUM_MS \1/' | sed 's/NT[^	]*//' | awk "$AWKOPT"
79 | done | sort -k1,1 -k4n -k5n -u >> $o
80 | 
81 | #echo ">> Archiving raw output"
82 | #tar --remove-files -zcvf $p.tgz ${p}_*
83 | 
84 | echo "*** Finished calling SV using Split-Read Analysis: $PINDEL ***"
85 | 


--------------------------------------------------------------------------------
/bin/gatk_recal.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -eu
 2 | 
 3 | echo "*** Recalibrating base quality ***"
 4 | echo "   " >> $LOGFILE
 5 | echo "*** Recalibrating base quality ***" >> $LOGFILE
 6 | 
 7 | if [ $# -lt 2 ]
 8 | then 
 9 | 	echo "Usage: $0 <bam> <out>"
10 | 	exit 1
11 | fi
12 | 
13 | f=`cd \`dirname $1\`; pwd`/`basename $1`
14 | o=`cd \`dirname $2\`; pwd`/`basename $2`
15 | 
16 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \
17 | 	-T BaseRecalibrator \
18 | 	-I $f \
19 |    	-R $REF \
20 | 	-o ${o/.bam/.grp} \
21 |    	-knownSites $MILLS_1K_GOLD_INDELS \
22 |    	-knownSites $GOLD_1K_INDELS \
23 |    	-knownSites $DBSNP \
24 |         -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key"
25 | 
26 | #	-cov ReadGroupCovariate \
27 | #	-cov QualityScoreCovariate \
28 | #	-cov CycleCovariate \
29 | echo ">>> Counting covariates"
30 | echo ">>> Counting covariates" >> $LOGFILE
31 | echo ">>> $command"
32 | echo ">>> $command" >> $LOGFILE
33 | $command
34 | 
35 | # This is added to create BQSR plots: 
36 | # http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr
37 | 
38 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \
39 | 	-T BaseRecalibrator \
40 | 	-I $f \
41 |    	-R $REF \
42 | 	-o ${o/.bam/.grp.post} \
43 |    	-knownSites $MILLS_1K_GOLD_INDELS \
44 |    	-knownSites $GOLD_1K_INDELS \
45 |    	-knownSites $DBSNP \
46 | 	-BQSR ${o/.bam/.grp} \
47 |         -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key"
48 | 
49 | #	-cov ReadGroupCovariate \
50 | #	-cov QualityScoreCovariate \
51 | #	-cov CycleCovariate \
52 | echo ">>> Recounting covariates"
53 | echo ">>> Recounting covariates" >> $LOGFILE
54 | echo ">>> $command"
55 | echo ">>> $command" >> $LOGFILE
56 | $command
57 | 
58 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \
59 |     -T AnalyzeCovariates \
60 |     -R $REF \
61 |     -before ${o/.bam/.grp} \
62 |     -after ${o/.bam/.grp.post} \
63 |     -plots ${o/.bam/recalibration_plots.pdf}"
64 | 
65 | echo ">>> Creating BQSR plots"	
66 | echo ">>> Creating BQSR plots" >> $LOGFILE	
67 | echo ">>> $command"
68 | echo ">>> $command" >> $LOGFILE
69 | #$command
70 | 
71 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \
72 | 	-R $REF \
73 | 	-I $f \
74 | 	-o $o \
75 | 	-T PrintReads \
76 | 	-BQSR ${o/.bam/.grp} \
77 | 	-K /srv/gs1/software/gatk/GATKkey/stanford.edu.key"
78 | 
79 | echo ">>> Printing reads"	
80 | echo ">>> Printing reads" >> $LOGFILE	
81 | echo ">>> $command"
82 | echo ">>> $command" >> $LOGFILE
83 | $command
84 | 
85 | echo "*** Finished recalibrating base quality ***"
86 | 


--------------------------------------------------------------------------------
/modulefiles/hugeseq/2.0:
--------------------------------------------------------------------------------
 1 | #%Module1.0
 2 | # HugeSeq Module File
 3 | #
 4 | ## HugeSeq modulefile
 5 | ##
 6 | ## Initializes HugeSeq
 7 | ##
 8 | 
 9 | proc ModulesHelp { } {
10 |         global hugeseqversion ;# file-level Tcl variable assigned with "set" in this modulefile
11 |         puts stderr "\tInitializes your environment to use the HugeSeq variant detection pipeline\n"
12 |         puts stderr "\tVersion $hugeseqversion\n"
13 | }
14 | 
15 | module-whatis "Initializes the HugeSeq variant detection pipeline"
16 | 
17 | # HugeSeq version, held in a Tcl variable of this modulefile
18 | # (assigned with "set", not "setenv", so it is not put into the environment)
19 | set hugeseqversion 2.0
20 | 
21 | # Per the original note, the modsappsdir module sets the MODULESAPPSDIR env var
22 | module add modsappsdir
23 | # HugeSeq install location; its bin directory is prepended to PATH
24 | setenv HUGESEQ_HOME ~/HugeSeq
25 | setenv HUGESEQROOT ~/HugeSeq
26 | prepend-path PATH ~/HugeSeq/bin
27 | 
28 | # Data resource directory; the database paths set further down are built from it
29 | set dat_dir ~/Resources/GATK
30 | 
31 | # Setting paths for programming tools
32 | module load python
33 | 
34 | # Setting paths and env for alignment tools
35 | module load bwa
36 | 
37 | # Setting paths and env for variant detection and annotation tools
38 | module load gatk
39 | module load cnvnator
40 | module load breakseqlite
41 | module load annovar
42 | module load samtools
43 | module load r
44 | 
45 | # Setting perl libraries (for BreakDancer): each bundled package's blib/lib is prepended to PERL5LIB
46 | setenv PERLLIB /srv/gs1/software/hugeseq/hugeseq-2.0/perllib
47 | prepend-path PERL5LIB $::env(PERLLIB)/File-Path-2.08/blib/lib
48 | prepend-path PERL5LIB $::env(PERLLIB)/Statistics-Descriptive-2.6/blib/lib
49 | prepend-path PERL5LIB $::env(PERLLIB)/GD-2.45/blib/lib
50 | prepend-path PERL5LIB $::env(PERLLIB)/GDGraph-1.44/blib/lib
51 | prepend-path PERL5LIB $::env(PERLLIB)/GDGraph-histogram-1.1/blib/lib
52 | prepend-path PERL5LIB $::env(PERLLIB)/GDTextUtil-0.86/blib/lib
53 | prepend-path PERL5LIB $::env(PERLLIB)/Math-CDF-0.1/blib/lib
54 | 
55 | # Setting paths and env for utilities
56 | module load breakdancer
57 | module load pindel
58 | module load bedtools
59 | module load tabix
60 | module load vcftools
61 | module load root
62 | 
63 | # Setting env for supporting databases (files under $dat_dir; the pipeline scripts read these variables)
64 | setenv BPLIB $dat_dir/bplib/bplib.alt.fa
65 | setenv REF $dat_dir/hg19-3.0/ucsc.hg19.fasta
66 | setenv DIC $dat_dir/hg19-3.0/ucsc.hg19.dict
67 | setenv DBSNP $dat_dir/hg19-3.0/dbsnp_138.hg19.vcf
68 | setenv HAPMAP $dat_dir/hg19-3.0/hapmap_3.3.hg19.vcf
69 | setenv OMNI_1K $dat_dir/hg19-3.0/1000G_omni2.5.hg19.vcf
70 | setenv GOLD_1K_SNPS $dat_dir/hg19-3.0/1000G_phase1.snps.high_confidence.hg19.vcf
71 | setenv MILLS_1K_GOLD_INDELS $dat_dir/hg19-3.0/Mills_and_1000G_gold_standard.indels.hg19.vcf
72 | setenv GOLD_1K_INDELS $dat_dir/hg19-3.0/1000G_phase1.indels.hg19.vcf
73 | # NOTE(review): presumably this module provides the PICARD variable used by the scripts - confirm
74 | module load picard-tools
75 | 
76 | # Setting paths and env for the Simple Job Manager (SJM)
77 | module load sjm
78 | 


--------------------------------------------------------------------------------
/bin/vqsr_indel.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Performing VQSR on indels" 
 4 | echo "   " >> $LOGFILE
 5 | echo "Performing VQSR on indels" >> $LOGFILE
 6 | 
 7 | set -e
 8 | 
 9 | if [ $# -lt 1 ]
10 | then
11 |         echo "Usage: $0 <vcf> vqsr targeted"
12 |         exit 1
13 | fi
14 | 
15 | START_VCF=`cd \`dirname $1\`; pwd`/`basename $1`
16 | PREFIX=`dirname $START_VCF`
17 | SUFFIX=`basename $START_VCF`
18 | SAMPLE=${SUFFIX/.gatk.vcf/}
19 | 
20 | INDEL_VCF=$PREFIX/$SAMPLE.indel.vcf
21 | INDEL_RECAL_VCF=$PREFIX/$SAMPLE.vqsr.indel.vcf
22 | INDEL_RECAL=$PREFIX/$SAMPLE.tmp.indel.vcf
23 | INDEL_TRANCHES=$PREFIX/$SAMPLE.tranches.gatk.indel.recal.csv
24 | INDEL_RSCRIPT=$PREFIX/$SAMPLE.gatk.recal.indel.R
25 | 
26 | touch $INDEL_RECAL_VCF
27 | doVQSR=$2
28 | targeted=$3
29 |     
30 | DP="-an DP"
31 | if [ "$targeted" == "True" ]
32 | then
33 |     DP="--maxGaussians 4"
34 | fi
35 | 
36 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
37 |    -T SelectVariants \
38 |    -R $REF \
39 |    -V $START_VCF \
40 |    -o $INDEL_VCF \
41 |    -selectType INDEL"
42 | 
43 | echo ">>> Select variants for indel recalibration"
44 | echo ">>> Select variants for indel recalibration" >> $LOGFILE
45 | echo ">>> $command &> $PREFIX/$SAMPLE.select.indel.log"
46 | echo ">>> $command > $PREFIX/$SAMPLE.select.indel.log" >> $LOGFILE
47 | $command &> $PREFIX/$SAMPLE.select.indel.log
48 | 
49 | if [ "$doVQSR" != "True" ]
50 | then
51 | 	command="cp $INDEL_VCF $INDEL_RECAL_VCF"
52 | 	echo ">>> Do not perform indel VQSR"
53 | 	echo ">>> Do not perform indel VQSR" >> $LOGFILE
54 | 	echo "$command > $PREFIX/$SAMPLE.vqsr.indel.log" >> $LOGFILE
55 | 	$command &> $PREFIX/$SAMPLE.vqsr.indel.log
56 | 	exit $?
57 | fi
58 | 
59 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
60 |     -T VariantRecalibrator \
61 |     -R $REF \
62 |     -input $INDEL_VCF \
63 |     -resource:mills,known=true,training=true,truth=true,prior=12.0 $MILLS_1K_GOLD_INDELS \
64 |     -an FS $DP \
65 |     -an MQRankSum \
66 |     -an ReadPosRankSum \
67 |     -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
68 |     -mode INDEL \
69 |     -recalFile $INDEL_RECAL \
70 |     -tranchesFile $INDEL_TRANCHES \
71 |     -rscriptFile $INDEL_RSCRIPT"
72 | 
73 | echo ">>> Train recalibration for indels"
74 | echo ">>> Train recalibration for indels" >> $LOGFILE
75 | echo ">>> $command &> $PREFIX/$SAMPLE.recalibrate.indel.log"
76 | echo ">>> $command > $PREFIX/$SAMPLE.recalibrate.indel.log" >> $LOGFILE
77 | $command &> $PREFIX/$SAMPLE.recalibrate.indel.log
78 | 
79 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
80 |    -T ApplyRecalibration \
81 |    -R $REF \
82 |    -input $INDEL_VCF \
83 |    --ts_filter_level 99.0 \
84 |    -tranchesFile $INDEL_TRANCHES \
85 |    -recalFile $INDEL_RECAL \
86 |    -o $INDEL_RECAL_VCF \
87 |    --mode INDEL"
88 | 
89 | echo ">>> Apply recalibration for indels"
90 | echo ">>> Apply recalibration for indels" >> $LOGFILE
91 | echo ">>> $command &> $PREFIX/$SAMPLE.apply.indel.log"
92 | echo ">>> $command > $PREFIX/$SAMPLE.apply.indel.log" >> $LOGFILE
93 | $command &> $PREFIX/$SAMPLE.apply.indel.log
94 | 
95 | echo "Finished performing VQSR on indels" 
96 | #rm $SNP_VCF $SNP_RECAL $SNP_TRANCHES $PREFIX/*.log $INDEL_VCF $INDEL_RECAL $INDEL_TRANCHES
97 | 


--------------------------------------------------------------------------------
/bin/vqsr_snp.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | echo ">>> Performing VQSR on SNPs"
  4 | echo "   " >> $LOGFILE
  5 | echo ">>> Performing VQSR on SNPs" >> $LOGFILE
  6 | 
  7 | set -e
  8 | 
  9 | if [ $# -lt 1 ]
 10 | then
 11 |         echo "Usage: $0 <vcf> vqsr"
 12 |         exit 1
 13 | fi
 14 | 
 15 | START_VCF=`cd \`dirname $1\`; pwd`/`basename $1`
 16 | PREFIX=`dirname $START_VCF`
 17 | SUFFIX=`basename $START_VCF`
 18 | SAMPLE=${SUFFIX/.gatk.vcf/}
 19 | 
 20 | SNP_VCF=$PREFIX/$SAMPLE.snp.vcf
 21 | SNP_RECAL_VCF=$PREFIX/$SAMPLE.vqsr.snp.vcf
 22 | SNP_RECAL=$PREFIX/$SAMPLE.tmp.snp.vcf
 23 | SNP_TRANCHES=$PREFIX/$SAMPLE.tranches.gatk.snp.recal.csv
 24 | SNP_RSCRIPT=$PREFIX/$SAMPLE.gatk.recal.snp.R
 25 | 
 26 | touch $SNP_RECAL_VCF
 27 | doVQSR=$2
 28 | targeted=$3
 29 | useHaplotypeScore=$4
 30 | noop=$5
 31 | 
 32 | if [ "$noop" == "True" ]
 33 | then
 34 | 	echo "bye bye"
 35 |         exit $?
 36 | fi
 37 | 
 38 | HAPLOTYPSCORE=""
 39 | if [ "$useHaplotypeScore" == "True" ]
 40 | then
 41 |         HAPLOTYPSCORE="-an HaplotypeScore"
 42 | fi
 43 | 
 44 | DP="-an DP"
 45 | if [ "$targeted" == "True" ]
 46 | then
 47 |         DP="--maxGaussians 4"
 48 | fi
 49 | 
 50 | if [ "$doVQSR" != "True" ]
 51 | then
 52 |         command="cp $SNP_VCF $SNP_RECAL_VCF"
 53 |         echo ">>> Do not perform SNP VQSR"
 54 |         echo ">>> Do not perform SNP VQSR" >> $LOGFILE
 55 |         $command &> $PREFIX/$SAMPLE.vqsr.snp.log
 56 |         echo "$command &> $PREFIX/$SAMPLE.vqsr.snp.log" >> $LOGFILE
 57 |         exit $?
 58 | fi
 59 | 
 60 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
 61 |    -T SelectVariants \
 62 |    -R $REF \
 63 |    -V $START_VCF \
 64 |    -o $SNP_VCF \
 65 |    -selectType SNP"
 66 | 
 67 | echo ">>> Select variants for SNP recalibration"
 68 | echo ">>> Select variants for SNP recalibration" >> $LOGFILE
 69 | echo ">>> $command &> $PREFIX/$SAMPLE.select.snp.log"
 70 | echo ">>> $command &> $PREFIX/$SAMPLE.select.snp.log" >> $LOGFILE
 71 | $command &> $PREFIX/$SAMPLE.select.snp.log 
 72 | 
 73 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
 74 |    -T VariantRecalibrator \
 75 |    -R $REF \
 76 |    -input $SNP_VCF \
 77 |    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 $HAPMAP \
 78 |    -resource:omni,known=false,training=true,truth=true,prior=12.0 $OMNI_1K \
 79 |    -resource:1000G,known=false,training=true,truth=false,prior=10.0 $GOLD_1K_SNPS \
 80 |    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $DBSNP \
 81 |    -an QD \
 82 |    -an FS $DP \
 83 |    -an MQRankSum \
 84 |    -an ReadPosRankSum $HAPLOTYPESCORE \
 85 |    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
 86 |    -mode SNP \
 87 |    -recalFile $SNP_RECAL \
 88 |    -tranchesFile $SNP_TRANCHES \
 89 |    -nt 4 \
 90 |    --minNumBadVariants 5000 \
 91 |    -rscriptFile $SNP_RSCRIPT"
 92 | 
 93 | echo ">>> Train recalibrator for SNPs"
 94 | echo ">>> Train recalibrator for SNPs" >> $LOGFILE
 95 | echo ">>> $command &> $PREFIX/$SAMPLE.recalibrate.snp.log"
 96 | echo ">>> $command > $PREFIX/$SAMPLE.recalibrate.snp.log" >> $LOGFILE
 97 | $command &> $PREFIX/$SAMPLE.recalibrate.snp.log
 98 | 
 99 | command="java -Xmx3g -Xms3g -jar $GATK/GenomeAnalysisTK.jar \
100 |    -T ApplyRecalibration \
101 |    -R $REF \
102 |    -input $SNP_VCF \
103 |    --ts_filter_level 99.0 \
104 |    -tranchesFile $SNP_TRANCHES \
105 |    -recalFile $SNP_RECAL \
106 |    -o $SNP_RECAL_VCF \
107 |    --mode SNP"
108 | 
109 | echo ">>> Apply recalibration to SNPs"
110 | echo ">>> Apply recalibration to SNPs" >> $LOGFILE
111 | echo ">>> $command &> $PREFIX/$SAMPLE.apply.snp.log"
112 | echo ">>> $command > $PREFIX/$SAMPLE.apply.snp.log" >> $LOGFILE
113 | $command &> $PREFIX/$SAMPLE.apply.snp.log
114 | 
115 | echo ">>> Finished performing VQSR on SNPs"
116 | 


--------------------------------------------------------------------------------
/bin/annotate.py:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | 
  3 | import sys
  4 | import os
  5 | import re
  6 | 
  7 | if len(sys.argv) <= 2:
  8 | 	print 'usage: <output file> <vcf or gff file>'
  9 | 	exit(1)
 10 | 	
 11 | path = os.environ['ANNOVAR']
 12 | output = sys.argv[1]
 13 | input = sys.argv[2]
 14 | 
 15 | isVCF=False
 16 | if input.endswith('.vcf') or input.endswith('.vcf.gz'):
 17 | 	isVCF=True
 18 | 	vcf = input
 19 | 	avinput = vcf + '.avinput'
 20 | 	avoutput = vcf + '.avoutput'
 21 | 	os.system('less %s | %s/convert2annovar.pl -format vcf4 - > %s' % (vcf, path, avinput))
 22 | elif input.endswith('.gff'):
 23 | 	gff = input
 24 | 	avinput = gff + '.avinput'
 25 | 	avoutput = gff + '.avoutput'
 26 | 	gff_file = open(gff, 'read')
 27 | 	out_file = open(avinput, 'w')
 28 | 	for line in gff_file:
 29 | 		gffCols = line.split('\t')
 30 | 		out_file.write(gffCols[0]+'\t'+gffCols[3]+'\t'+gffCols[4]+'\t0\t0\n')
 31 | 	out_file.flush()
 32 | 	out_file.close()
 33 | else:
 34 | 	print >> sys.stderr, "Unknown input format: "+input
 35 | 	exit(1)
 36 | 
 37 | # use user-define output name for now
 38 | avoutput=output
 39 | 
 40 | print 'Annotating variants with hg19 UCSC knownGene...\n'
 41 | temp=('%s/annotate_variation.pl --geneanno --buildver hg19 -dbtype knownGene --separate %s %s/humandb/' %(path, avinput, path))
 42 | print temp
 43 | os.system('%s/annotate_variation.pl --geneanno --buildver hg19 -dbtype knownGene --separate %s %s/humandb/' %(path, avinput, path))
 44 | print 'Annotating variants with hg19 RMSK...\n'
 45 | 
 46 | temp=('%s/annotate_variation.pl -regionanno --buildver hg19 -dbtype gff3 -gff3dbfile hg19_rmsk.gff %s %s/humandb/' %(path, avinput, path))
 47 | print temp
 48 | os.system('%s/annotate_variation.pl -regionanno --buildver hg19 -dbtype gff3 -gff3dbfile hg19_rmsk.gff %s %s/humandb/' %(path, avinput, path))
 49 | if isVCF:
 50 | 	print 'Annotating variants with sift scores using hg19...\n'
 51 | 	os.system('%s/annotate_variation.pl --filter --sift_threshold 0 --buildver hg19 --separate -dbtype avsift %s %s/humandb/' %(path, avinput, path))
 52 | 	print 'Annotating variants with dbSNP137...\n'
 53 | 	os.system('%s/annotate_variation.pl --filter --buildver hg19 --dbtype snp137 %s %s/humandb/' %(path, avinput, path))
 54 | 
 55 | 
 56 | exonic_file = avinput + '.exonic_variant_function'
 57 | function_file = avinput + '.variant_function'
 58 | sift_file = avinput + '.hg19_avsift_dropped'
 59 | dbsnp_file = avinput + '.hg19_snp137_dropped'
 60 | rmsk_file = avinput + ".hg19_gff3"
 61 | 
 62 | def makeDict(filename, chrCol, startCol, endCol, valueCols, isGFF=False):
 63 | 	file = open(filename, 'r')
 64 | 	dic = {}
 65 | 	for line in file:
 66 | 		cols = line.split('\t')
 67 | 		key=(cols[chrCol], cols[startCol], cols[endCol])
 68 | 		if key not in dic:
 69 | 			dic[key] = []
 70 | 			for valueCol in valueCols:
 71 | 				dic[key].append([])
 72 | 		for i in range(len(valueCols)):
 73 | 			value=cols[valueCols[i]]
 74 | 			if isGFF:
 75 | 				value=value.split(";")[-1].split("=")[-1]
 76 | 			dic[key][i].append(value)
 77 | 	return dic
 78 | 
 79 | function = makeDict(function_file, 2, 3, 4, [1, 0])
 80 | rmsk = makeDict(rmsk_file, 2, 3, 4, [1], True)
 81 | if isVCF:
 82 | 	exonic = makeDict(exonic_file, 3, 4, 5, [1,2])
 83 | 	dbsnp = makeDict(dbsnp_file, 2, 3, 4, [1])
 84 | 	sift = makeDict(sift_file, 2, 3, 4, [1])
 85 | 
 86 | AVINPUT = open(avinput, 'r')
 87 | AVOUTPUT = open(avoutput, 'w')
 88 | AVOUTPUT.write("#chr\tstart\tend\tgene_name\ttype\trmsk");
 89 | if isVCF:
 90 | 	AVOUTPUT.write("\tSIFT\tconsequence\tmutation_info\tdbsnp137")
 91 | 
 92 | AVOUTPUT.write("\n")
 93 | 
 94 | def write(dic, key, nvalueCols=1):
 95 | 	if key in dic: 
 96 | 		for value in dic[key]:
 97 | 			AVOUTPUT.write('\t'+";".join(value))
 98 | 	else:		
 99 | 		AVOUTPUT.write('\t.'*nvalueCols)
100 | 
101 | for line in AVINPUT:
102 | 	if re.match('([0-9A-Za-z]+)\s+(\d+)', line):
103 | 		splitline = line.split('\t')
104 | 		key=(splitline[0], splitline[1],splitline[2])
105 | 		AVOUTPUT.write(splitline[0]+'\t'+splitline[1]+'\t'+splitline[2])
106 | 		write(function, key, 2)
107 | 		write(rmsk, key)
108 | 		if isVCF:
109 | 			write(sift, key)
110 | 			write(exonic, key, 2)
111 | 			write(dbsnp, key)
112 | 		AVOUTPUT.write('\n')
113 | 
114 | AVINPUT.flush();
115 | AVOUTPUT.flush();
116 | 
117 | AVINPUT.close();
118 | AVOUTPUT.close();
119 | 
120 | 


--------------------------------------------------------------------------------
/bin/gatk_vc.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash -eu
  2 | 
  3 | echo "*** Performing SNV discovery and genotyipng using GATK ***"
  4 | echo "   " >> $LOGFILE
  5 | echo "*** Performing SNV discovery and genotyipng using GATK ***" >> $LOGFILE
  6 | 
  7 | if [ $# -lt 4 ]
  8 | then 
  9 | 	echo "Usage: $0 output reference_calls snp_hapcaller indel_hapcaller bam ..."
 10 | 	exit 1
 11 | fi
 12 | 
 13 | o=`cd \`dirname $1\`; pwd`/`basename $1`
 14 | shift
 15 | output_hc=$o.hc.vcf
 16 | output_gc=$o.gc.vcf
 17 | 
 18 | snp_vcf=$o.snp
 19 | indel_vcf=$o.indel
 20 | 
 21 | capture=$1
 22 | shift
 23 | echo $capture
 24 | 
 25 | reference_calls=$1
 26 | shift
 27 | 
 28 | snp_hap=$1
 29 | shift
 30 | indel_hap=$1
 31 | shift
 32 | 
 33 | optL=''
 34 | if [ $# -eq 1 ]
 35 | then
 36 | 	echo $1
 37 |         if [[ "$1" =~ .*chr[^\.]*\..* ]]
 38 |         then
 39 |                 chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
 40 |                 optL="-L $chr"
 41 |         fi
 42 | fi
 43 | 
 44 | f=''
 45 | for i in $*
 46 | do
 47 |         f="$f -I `cd \`dirname $i\`; pwd`/`basename $i`"
 48 | done
 49 | 
 50 | log=${o/gatk.vcf/}vc.log
 51 | 
 52 | run_hc="False"
 53 | run_gc="False"
 54 | if [[ "$snp_hap" == "True" ]] || [[ "$indel_hap" == "True" ]] 
 55 | then
 56 | 	run_hc="True"
 57 | fi
 58 | 
 59 | if [[ "$snp_hap" != "True" ]] || [[ "$indel_hap" != "True" ]] 
 60 | then
 61 | 	run_gc="True"
 62 | fi
 63 | 
 64 | NO_VARIATION=""
 65 | OUTPUT_MODE_UG=""
 66 | OUTPUT_MODE_HC=""
 67 | if [[ "$reference_calls" == "True" ]]
 68 | then
 69 | 	OUTPUT_MODE_UG="--output_mode emit_all_sites"
 70 | 	OUTPUT_MODE_HC="-ERC BP_RESOLUTION"
 71 | 	NO_VARIATION="-selecttype no_variation"
 72 | fi
 73 | 
 74 | CAPTURE=""
 75 | if [[ "$capture" != "False" ]]
 76 | then
 77 | 	#CAPTURE="-L $capture"
 78 | 	captures=$(echo $capture | tr "[" "\n")
 79 | 	captures=$(echo $captures | tr "]" "\n")
 80 | 	captures=$(echo $captures | tr "," "\n")
 81 | 	#echo "here" $captures
 82 | 	
 83 | 	for capt in $captures
 84 | 	do
 85 | 		#echo "HELLO" $capt
 86 | 		if [[ "$CAPTURE" != "" ]]
 87 | 		then 
 88 | 			CAPTURE="$CAPTURE -L $capt"
 89 | 		else
 90 | 			CAPTURE="-L $capt"
 91 | 		fi
 92 | 	done
 93 | fi
 94 | 
 95 | gc_command="java -Xmx8g -Xms8g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \
 96 |    -T UnifiedGenotyper \
 97 |    -R $REF \
 98 |    $f \
 99 |    --dbsnp $DBSNP \
100 |    -o $output_gc $optL $CAPTURE \
101 |    -stand_call_conf 20.0 \
102 |    -stand_emit_conf 10.0 \
103 |    -gt_mode DISCOVERY $OUTPUT_MODE_UG \
104 |    --genotype_likelihoods_model BOTH \
105 |    -nct 4"
106 | 
107 | hc_command="java -Xmx12g -Xms12g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \
108 |      -T HaplotypeCaller \
109 |      -R $REF \
110 |      $f \
111 |      --dbsnp $DBSNP \
112 |      -o $output_hc $optL $CAPTURE \
113 |      -stand_call_conf 20.0 \
114 |      -stand_emit_conf 10.0 \
115 |      --genotyping_mode DISCOVERY $OUTPUT_MODE_HC \
116 |      -nct 4"
117 | 
118 | # https://gatkforums.broadinstitute.org/discussion/3115/emit-all-sites-in-haplotypecaller
119 | # http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_haplotypecaller_HaplotypeCaller.html
120 | 
121 | if [[ "$run_gc" == "True" ]]
122 | then
123 | 	echo ">>> Running the GATK's UnifiedGenotyper"
124 | 	echo ">>> Running the GATK's UnifiedGenotyper" >> $LOGFILE
125 | 	echo ">>> $gc_command"
126 | 	echo ">>> $gc_command" >> $LOGFILE
127 | 	$gc_command &>> $log 
128 | fi
129 | 
130 | if [[ "$run_hc" == "True" ]]
131 | then
132 | 	echo ">>> Running the GATK's Haplotyper"
133 | 	echo ">>> Running the GATK's Haplotyper" >> $LOGFILE
134 | 	echo ">>> $hc_command"
135 | 	echo ">>> $hc_command" >> $LOGFILE
136 | 	$hc_command &>> $log
137 | fi
138 | 
139 | use_gc="False"
140 | use_hc="False"
141 | 
142 | if [[ "$snp_hap" == "True" ]] && [[ "$indel_hap" == "True" ]]
143 | then
144 | 	# select both snps and indels from HC
145 | 	select_indel_from=$output_hc
146 | 	select_snp_from=$output_hc
147 | 	use_hc="True"
148 | fi
149 | 
150 | if [[ "$snp_hap" == "True" ]] && [[ "$indel_hap" != "True" ]]
151 | then
152 | 	# select snps from HC and indels from GC
153 | 	select_indel_from=$output_gc
154 | 	select_snp_from=$output_hc
155 | fi
156 | 
157 | if [[ "$snp_hap" != "True" ]] && [[ "$indel_hap" == "True" ]]
158 | then
159 | 	# select snps from GC and indels from HC
160 | 	select_indel_from=$output_hc
161 | 	select_snp_from=$output_gc
162 | fi
163 | 
164 | if [[ "$snp_hap" != "True" ]] && [[ "$indel_hap" != "True" ]]
165 | then
166 | 	# select snps and indels from GC
167 | 	select_indel_from=$output_gc
168 | 	select_snp_from=$output_gc
169 | 	use_gc="True"
170 | fi
171 | 
172 | if [[ "$use_gc" == "True" ]]
173 | then
174 | 	command="mv $output_gc $o"
175 | 	echo ">>> Choosing both SNP and INDEL from UG output: $select_snp_from"
176 | 	echo ">>> Choosing both SNP and INDEL from UG output: $select_snp_from" >> $LOGFILE
177 | 	echo ">>> $command"
178 | 	echo ">>> $command" >> $LOGFILE
179 | 	$command &>> $log
180 | 
181 | elif [[ "$use_hc" == "True" ]]
182 | then
183 | 	command="mv $output_hc $o"
184 | 	echo ">>> Choosing both SNP and INDEL from HC output: $select_snp_from"
185 | 	echo ">>> Choosing both SNP and INDEL from HC output: $select_snp_from" >> $LOGFILE
186 | 	echo ">>> $command"
187 | 	echo ">>> $command" >> $LOGFILE
188 | 	$command &>> $log
189 | else
190 | 
191 | 	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
192 | 		-T SelectVariants \
193 | 		-R $REF \
194 | 		-V $select_snp_from \
195 | 		-o $snp_vcf \
196 | 		-selectType MNP \
197 | 		-selectType SNP $NO_VARIANTION"
198 | 	
199 | 		echo ">>> Selecting SNPs from $select_snp_from"
200 | 		echo ">>> Selecting SNPs from $select_snp_from" >> $LOGFILE
201 | 		echo ">>> $command"
202 | 		echo ">>> $command" >> $LOGFILE
203 | 		$command &>> $log
204 | 
205 | 	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
206 | 		-T SelectVariants \
207 | 		-R $REF \
208 | 		-V $select_indel_from \
209 | 		-o $indel_vcf \
210 | 		-selectType INDEL"
211 | 
212 | 		echo ">>> Selecting Indels from $select_indel_from"
213 | 		echo ">>> Selecting Indels from $select_indel_from" >> $LOGFILE
214 | 		echo ">>> $command"
215 | 		echo ">>> $command" >> $LOGFILE
216 | 		$command &>> $log
217 | 
218 | 	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
219 | 		-R $REF \
220 | 		-T CombineVariants \
221 | 		--variant $snp_vcf \
222 | 		--variant $indel_vcf \
223 | 		-o $o"
224 | 
225 | 		echo ">>> Combining SNP and Indel VCFs"
226 | 		echo ">>> Combining SNP and Indel VCFs" >> $LOGFILE
227 | 		echo ">>> $command"
228 | 		echo ">>> $command" >> $LOGFILE
229 | 		$command &>> $log
230 | fi
231 | 	
232 | echo "*** Finished SNV discovery and genotyipng using GATK ***"
233 | 
234 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | #####################################
  2 | #                                   #
  3 | # HugeSeq                           #
  4 | # The Variant Detection Pipeline    #
  5 | #                                   #
  6 | #####################################
  7 | 
  8 | -- DEPENDENCIES
  9 | 
 10 | + STANOVAR version 0.1
 11 | 
 12 | + BEDtools version 2.17.0
 13 | 
 14 | + BreakDancer version 1.1.2
 15 | 
 16 | + BreakSeq Lite version 1.0
 17 | 
 18 | + BWA version 0.7.4
 19 | 
 20 | + CNVnator version 0.2.7
 21 | 
 22 | + GATK version 3.2.2
 23 | 
 24 | + JDK version 1.7.0_03
 25 | 
 26 | + Modules Release 3.2.8
 27 | 
 28 | + Perl
 29 | 
 30 | + Picard Tools version 1.32
 31 | 
 32 | + Pindel version 0.2.4t
 33 | 
 34 | + Python version 2.7
 35 | 
 36 | + Simple Job Manager version 1.0
 37 | 
 38 | + Tabix version 0.2.6
 39 | 
 40 | + vcftools version 0.1.12
 41 | 
 42 | + zlib version 1.2.7
 43 | 
 44 | + root version 5.34.30
 45 | 
 46 | + r version 3.2.0
 47 | 
 48 | 
 49 | -- INSTALLATION
 50 | 
 51 | HugeSeq is a modular, computational pipeline that runs in a Unix environment in a highly parallel fashion. It was tested on Red Hat Enterprise Linux (RHEL) server v5.6 but it should work in most Linux servers. The batch system it currently supports out-of-the-box is Sun Grid Engine.
 52 | 
 53 | 
 54 | Batch System
 55 | 
 56 | Many clusters already have Sun Grid Engine (SGE) installed. To install SGE, please refer to the vendor's manual.
 57 | 
 58 | Running the analysis pipeline requires submitting many interdependent jobs to the batch scheduling system (e.g. Sun Grid Engine). Therefore, we developed a software program called SJM (Simple Job Manager) to simplify this process, including properly specifying the dependencies, tracking progress of the group of jobs, and responding properly if a job fails.
 59 | 
 60 | Batch systems other than SGE require an adaptor to be developed in SJM. Please write to us for more details.
 61 | 
 62 | Modules Environment
 63 | 
 64 | To manage different versions of software and parameters in the modules, HugeSeq uses a Unix software package called Environment Modules, which provides for the dynamic modification of a user's environment via modulefiles.
 65 | 
 66 | To initiate Modules, modify your login profile such as .bash_profile to add the following:
 67 | 
 68 | . /path-to-Modules/default/init/sh
 69 | 
 70 | Supporting Tools
 71 | 
 72 | Install the required software, such as the aligners, variant callers and manipulation tools, listed in the DEPENDENCIES section. For details, please refer to the individual software websites. We recommend installing each tool separately under a single parent directory, such as ~/apps/BreakSeq and ~/apps/CNVnator.
 73 | 
 74 | Data Sets
 75 | 
 76 | HugeSeq depends on several public data sets for alignment, variant calling, and annotations. They are:
 77 | The reference genome (e.g. HG19 in FASTA format: hg19.fa)
 78 | The BWA index of the reference genome (e.g. hg19.fa.bwt, hg19.fa.ann, etc)
 79 | A .dict dictionary of the contig names and sizes (e.g. hg19.fa.dict)
 80 | A .fai fasta index file (e.g. hg19.fa.fai)
 81 | For creating .dict and .fai, please see here. All the indexes and dictionary should reside in the reference genome directory which contains the whole genome FASTA (e.g. hg19.fa)
 82 | The breakpoint junctions (i.e. BreakSeq library in FASTA format: bplib.fa)
 83 | The SNP annotation
 84 | UCSC Known Genes (knownGene)
 85 | dbSNP
 86 | SIFT (avsift)
 87 | RepeatMasker (buildver_rmsk.gff)
 88 | The STANOVAR application should be installed and the corresponding module needs to be defined.
 89 | 
 90 | 
 91 | Download HugeSeq to your server.
 92 | 
 93 | Extract the programs from the compressed archive to a directory, such as ~/app. A directory like ~/app/HugeSeq will then be created, which contains the core program and its configuration. As described above, HugeSeq uses the Environment Modules package for configuration. Its modulefile is in the directory /path-to-HugeSeq/modulefiles/hugeseq and is named after its version, such as 2.0. To enable Modules to look up the modulefile for the correct settings, modify the login profile as above and add the following:
 94 | 
 95 | export MODULEPATH=/path-to-HugeSeq/modulefiles:$MODULEPATH
 96 | 
 97 | In addition, modify the module file, such as /path-to-HugeSeq/modulefiles/hugeseq/2.0, and change all the programs' paths to the locations where you installed the required programs and the data paths to where you stored the datasets.
 98 | 
 99 | Logout and login again to your shell to activate the login profile with the latest configuration. You should now be able to run HugeSeq by loading its module:
100 | 
101 | > module load hugeseq/2.0
102 | 
103 | After loading the module, you can run HugeSeq simply by typing:
104 | 
105 | > hugeseq
106 | 
107 | For the usage of HugeSeq, please refer to the Usage section.
108 | 
109 | -- USAGE
110 | 
111 | usage: hugeseq [-h] --reads1 FILE [FILE ...] [--reads2 FILE [FILE ...]]
112 |                --output DIR [--account STR] [--tmp DIR] [--readgroup STR]
113 |                [--samplename STR] [--bam] [--variants TYPE [TYPE ...]]
114 |                [--targeted] [--capture FILE [FILE ...]] [--relax_realignment]
115 |                [--reference_calls] [--snp_hapcaller] [--indel_hapcaller]
116 |                [--nosnpvqsr] [--noindelvqsr] [--vqsrchrom] [--nobinning]
117 |                [--nocleanup] [--novariant] [--alignmentonly] [--cleanuponly]
118 |                [--variantonly] [--donealign] [--donebinning] [--donecleanup]
119 |                [--donegenotyping] [--donesnpvqsr] [--memory SIZE]
120 |                [--queue NAME] [--email NAME] [--threads COUNT]
121 |                [--jobfile FILE] [--submit]
122 | 
123 | Generating the job file for the HugeSeq variant detection pipeline
124 | 
125 | optional arguments:
126 |   -h, --help            show this help message and exit
127 |   --reads1 FILE [FILE ...]
128 |                         The FASTQ file(s) for reads 1
129 |   --reads2 FILE [FILE ...]
130 |                         The FASTQ file(s) for reads 2, if paired-end
131 |   --output DIR          The output directory
132 |   --account STR         Accounting string for the purpose of cluster
133 |                         accounting.
134 |   --tmp DIR             The TMP directory for storing intermediate files
135 |                         (default=output directory)
136 |   --readgroup STR       The read group annotation (Default:
137 |                         @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE)
138 |   --samplename STR      The SM tag in the read group annotation (Default:
139 |                         "SAMPLE" in
140 |                         @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE)
141 |   --bam                 Support for aligned BAMs as input. By default input
142 |                         (--reads1) is aligned again. Use --variantonly otherwise.
143 |   --variants TYPE [TYPE ...]
144 |                         gatk breakdancer cnvnator pindel breakseq (default to
145 |                         all)
146 |   --targeted            Use GATK in targeted sequencing mode (default: whole-
147 |                         genome mode)
148 |   --capture FILE [FILE ...]
149 |                         Capture BED file(s) used for targeted genotyping
150 |                         (default: void, separate multiple files with commas:
151 |                         capture1.bed,capture2.bed,...)
152 |   --relax_realignment   Relaxes GATKs realignment when dealing with badly
153 |                         scored reads (default: false)
154 |   --reference_calls     Store all reference calls from GATK (default: false)
155 |                         in gVCF format in addition to a standard VCF file
156 |                         containing only the variants (valid only for SNV
157 |                         calling)
158 |   --snp_hapcaller       Use GATK HaplotypeCaller to discover SNPs (default:
159 |                         UnifiedGenotyper)
160 |   --indel_hapcaller     Use GATK HaplotypeCaller to discover Indels (default:
161 |                         UnifiedGenotyper)
162 |   --nosnpvqsr           Do not perform VQSR on SNPs (variant quality score
163 |                         recalibration)
164 |   --noindelvqsr         Do not perform VQSR on Indels (variant quality score
165 |                         recalibration)
166 |   --vqsrchrom           Perform VQSR on individual chromosomes (valid when
167 |                         binning performed; default: VQSR on whole genome VCF)
168 |   --nobinning           Do not bin the alignments by chromosomes
169 |   --nocleanup           Do not clean up the alignments
170 |   --novariant           Do not call variants
171 |   --alignmentonly       Only align input FASTQ or BAM files (--reads1)
172 |   --cleanuponly         Only clean up input BAM files (--reads1)
173 |   --variantonly         Only call variants in input BAM files (--reads1)
174 |   --donealign           Sequences already aligned using the pipeline
175 |   --donebinning         Alignments already binned by chromosomes using the
176 |                         pipeline
177 |   --donecleanup         Alignments already cleaned using the pipeline
178 |   --donegenotyping      Variants already called using the pipeline but VQSR is
179 |                         not
180 |   --donesnpvqsr         Processing is started after SNP VQSR (from Indel VQSR)
181 |   --memory SIZE         Memory size (GB) per job (default: 12)
182 |   --queue NAME          Queue for jobs (default: extended)
183 |   --email NAME          Email address to receive emails for ending or aborting
184 |                         last jobs in the queue
185 |   --threads COUNT       Number of threads for alignment, only works for SGE
186 |                         (default: 4)
187 |   --jobfile FILE        The jobfile name (default: stdout)
188 |   --submit              Submit the jobs
189 | 


--------------------------------------------------------------------------------
/bin/hugeseq:
--------------------------------------------------------------------------------
  1 | #!/bin/env python
  2 | 
  3 | import sys, os, re, argparse, subprocess, os.path
  4 | import dircache
  5 | from sjm import *
  6 | from util import *
  7 | from os import listdir
  8 | from os.path import isfile, join, splitext
  9 | 
 10 | try:
 11 |         home=os.environ['HUGESEQ_HOME']
 12 |         refi=os.environ['REF']+".fai"
 13 | except KeyError:
 14 |         print >> sys.stderr, "Error in initializing HugeSeq. Module HugeSeq probably is not loaded."
 15 |         exit(1)
 16 | 
 17 | parser = argparse.ArgumentParser(description='Generating the job file for the HugeSeq variant detection pipeline')
 18 | parser.add_argument('--reads1', metavar='FILE', nargs="+", required=True, help='The FASTQ file(s) for reads 1')
 19 | parser.add_argument('--reads2', metavar='FILE', nargs="+", help='The FASTQ file(s) for reads 2, if paired-end')
 20 | parser.add_argument('--output', metavar='DIR', required=True, help='The output directory')
 21 | parser.add_argument('--account', metavar='STR', help='Accounting string for the purpose of cluster accounting.')
 22 | parser.add_argument('--tmp', metavar='DIR', help='The TMP directory for storing intermediate files (default=output directory')
 23 | parser.add_argument('--readgroup', metavar='STR', default="@RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE", help='The read group annotation (Default: @RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE)')
 24 | parser.add_argument('--samplename', metavar='STR', help='The SM tag in the read group annotation (Default: "SAMPLE" in @RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE)')
 25 | parser.add_argument('--bam', action='store_true', help='Support for aligned BAMs as input. By default input (-r) is aligned again. Use --variantonly otherwise.')
 26 | parser.add_argument('--variants', metavar='TYPE', nargs="+", help='gatk breakdancer cnvnator pindel breakseq (default to all)')
 27 | parser.add_argument('--targeted', action='store_true', help='Use GATK in targeted sequencing mode (default: whole-genome mode)')
 28 | parser.add_argument('--capture', metavar='FILE', nargs="+", help='Capture BED file(s) used for targeted genotyping (default: void, separate multipe files with commas: capture1.bed,capture2.bed,...)')
 29 | parser.add_argument('--relax_realignment', action='store_true', help='Relaxes GATKs realignment when dealing with badly scored reads (default: false)')
 30 | parser.add_argument('--reference_calls', action='store_true', help='Store all reference calls from GATK (default: false) in gVCF format in addition to a standard VCF file containing only the variants (valid only for SNV calling)')
 31 | parser.add_argument('--snp_hapcaller', action='store_true', help='Use GATK HaplotypeCaller to discover SNPs (default: UnifiredGenotyper)')
 32 | parser.add_argument('--indel_hapcaller', action='store_true', help='Use GATK HaplotypeCaller to discover Indels (default: UnifiredGenotyper)')
 33 | parser.add_argument('--nosnpvqsr', action='store_true', help='Do not perform VQSR SNPs (variant quality score recalibration)')
 34 | parser.add_argument('--noindelvqsr', action='store_true', help='Do not perform VQSR on Indels (variant quality score recalibration)')
 35 | parser.add_argument('--vqsrchrom', action='store_true', help='Perform VQSR on individual chromosomes (valid when binning performed; default: VQSR on whole genome VCF)')
 36 | parser.add_argument('--nobinning', action='store_true', help='Do not bin the alignments by chromosomes')
 37 | parser.add_argument('--nocleanup', action='store_true', help='Do not clean up the alignments')
 38 | parser.add_argument('--novariant', action='store_true', help='Do not call variants')
 39 | parser.add_argument('--alignmentonly', action='store_true', help='Only align input FASTQ or BAM files (-r)')
 40 | parser.add_argument('--cleanuponly', action='store_true', help='Only clean up input BAM files (-r)')
 41 | parser.add_argument('--variantonly', action='store_true', help='Only call variants in input BAM files (-r)')
 42 | parser.add_argument('--donealign', action='store_true', help='Sequences already aligned using the pipeline')
 43 | parser.add_argument('--donebinning', action='store_true', help='Alignments already binned by chromosomes using the pipeline')
 44 | parser.add_argument('--donecleanup', action='store_true', help='Alignments already cleaned using the pipeline')
 45 | parser.add_argument('--donegenotyping', action='store_true', help='Variants already called using the pipeline but VQSR is not')
 46 | parser.add_argument('--donesnpvqsr', action='store_true', help='Processing is started after SNP VQSR (from Indel VQSR)')
 47 | parser.add_argument('--memory', metavar='SIZE', type=int, default=12, help='Memory size (GB) per job (default: 12)')
 48 | parser.add_argument('--queue', metavar='NAME', default="extended", help='Queue for jobs (default: extended)')
 49 | parser.add_argument('--email', metavar='NAME', default="aminzia@stanford.edu", help='Email address to receive emails for ending or aborting last jobs in the queque')
 50 | parser.add_argument('--threads', metavar='COUNT', type=int, default=4, help='Number of threads for alignment, only works for SGE (default: 4)')
 51 | parser.add_argument('--jobfile', metavar='FILE', help='The jobfile name (default: stdout)')
 52 | parser.add_argument('--submit', action='store_true', help='Submit the jobs')
 53 | args = parser.parse_args()
 54 | 
 55 | outdir=Dir(args.output)
 56 | logdir=Dir(outdir, 'log')
 57 | 
 58 | outdir.mkdirs()
 59 | logdir.mkdirs()
 60 | 
 61 | tmpdir=outdir
 62 | if (args.tmp is not None):
 63 | 	tmpdir=Dir(args.tmp)
 64 | tmpdir.mkdirs()
 65 | 
 66 | capture="True"
 67 | if (args.capture is None):
 68 | 	capture="False"
 69 | else:
 70 | 	capture=args.capture
 71 | 
 72 | id=re.match(r'(?:.+\\t)?ID:([^\\]+)', args.readgroup)
 73 | id=id.group(1)
 74 | lb=re.match(r'(?:.+\\t)?LB:([^\\]+)', args.readgroup)
 75 | lb=lb.group(1)
 76 | pl=re.match(r'(?:.+\\t)?PL:([^\\]+)', args.readgroup)
 77 | pl=pl.group(1)
 78 | sample=re.match(r'(?:.+\\t)?SM:([^\\]+)', args.readgroup)
 79 | sample=sample.group(1)
 80 | 
 81 | if args.samplename is not None:
 82 | 	sample = args.samplename	
 83 | readgroup="@RG\\tID:"+id+"\\tLB:"+lb+"\\tPL:"+pl+"\\tSM:"+sample
 84 | 
 85 | Job.name_prefix=sample+"."
 86 | Job.memory="%sG"%args.memory
 87 | Job.queue=args.queue
 88 | Job.cmd_prefix=os.path.join(home,'bin','hugeseq_mod.sh')
 89 | 
 90 | if args.jobfile is None and not args.submit:
 91 |         jobfile=None
 92 | else:
 93 |         if args.jobfile is None:
 94 |                 jobfile=File(outdir, "job")
 95 |         else:
 96 |                 jobfile=File(args.jobfile)
 97 | 
 98 | logfile = jobfile.appext("commands.log")
 99 | open(logfile.path, "w")
100 | 
101 | tmpdir = getattr(__builtins__, 'str')(tmpdir)
102 | logfile = getattr(__builtins__, 'str')(logfile)
103 | 
104 | Job.cmd_prefix = Job.cmd_prefix + ' ' + tmpdir + ' ' + logfile
105 | Job.log_dir=logdir.path
106 | 
107 | def prep(readfiles, ext):
108 |         jobs=[]
109 |         if readfiles is not None:
110 |                 sys.stderr.write(">>>  Pre-processing <<<\n")
111 |                 for f in readfiles:
112 |                         input = File(f)
113 | 			in_index = File(f+".bai")
114 |                         if (ext==".recal.bam"):
115 |                                 outfile = File(outdir, input.prefix+ext)
116 |                                 out_index = File(outdir, input.prefix+ext+".bai")
117 |                         	job = Job('prep_reads_bam-%s'%input.prefix)
118 |                         else:
119 |                                 outfile = File(outdir, input.name)
120 |                         	job = Job('prep_reads-%s'%input.prefix)
121 |                         job.append('echo "Input preparation performed locally"')
122 |                         p = subprocess.Popen('prep.sh %s %s'%(input, outfile), shell=True, stdout=subprocess.PIPE)
123 |                         rc = p.wait()
124 |                         if rc > 0:
125 |                                 raise Exception, "Error in preparing input. Return code: %s"%rc
126 |                         for l in p.stdout:
127 |                                 sys.stderr.write(l)
128 | 			if (ext==".recal.bam"):
129 | 	                        p = subprocess.Popen('prep.sh %s %s'%(in_index, out_index), shell=True, stdout=subprocess.PIPE)
130 |         	                rc = p.wait()
131 |                 	        if rc > 0:
132 |                         	        raise Exception, "Error in preparing input. Return code: %s"%rc
133 | 	                        for l in p.stdout:
134 |         	                        sys.stderr.write(l)
135 |                         job.output = outfile
136 |                         job.memory = "100K"
137 |                         job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
138 |                         job.status = "done"
139 |                         jobs.append(job)
140 |         return jobs
141 | 
142 | def align(readjobs1, readjobs2, ext):
143 |         jobs=[]
144 |         for i in range(0, len(readjobs1)):
145 | 		paired = False
146 |                 if (readjobs2 is not None and i<len(readjobs2) and not args.bam):
147 | 			paired = True	
148 | 
149 |                 readfile1=readjobs1[i].output
150 |                 readfile2=readjobs2[i].output if paired else None
151 | 
152 | 		if ((not paired) or args.bam):
153 | 			job1 = __align(readjobs1[i], None)
154 | 		else:
155 | 			job1 = __align(readjobs1[i], readjobs2[i])
156 |                
157 |  
158 |                 if (ext==".recal.bam"):
159 |                 	bam=(File(outdir, readfile1.prefix) if readfile1.ext=="gz" else readfile1.chdir(outdir)).chext("bam")
160 | 			sorted=bam.chext("bam")
161 | 		else: 
162 |                 	bam=(File(outdir, readfile1.prefix) if readfile1.ext=="gz" else readfile1.chdir(outdir)).chext("bwa.bam")
163 | 			sorted=bam.chext("sorted.bam")
164 | 
165 |                 job4 = Job('picard_sort-%s'%readfile1.prefix)
166 |                 job4.memory = "16G"
167 |                 job4.sge_options="-l h_rt=120:00:00 -A %s"%args.account
168 |                 job4.append('picard_sort.sh %s %s %s'%(bam, sorted, 8))
169 |                 job4.append('samtools_index.sh %s'%sorted)
170 |                 job4.depend(job1)
171 |                 job4.output=sorted
172 |                 jobs.append(job4)
173 | 
174 |         return jobs
175 | 
176 | def __align(readjob1,readjob2):
177 |         job = None
178 | 	if (args.bam or (readjob2 is None)):
179 |                 readfile=File(readjob1.output)
180 |                 job = Job('bwa-%s' % readfile.prefix)
181 |                 job.memory="%sG"%(args.memory/args.threads)
182 |                 job.append('bwa_bam.sh %s %s \"%s\"'%(readfile,args.threads,readgroup))
183 |                 job.depend(readjob1)
184 |                 if args.threads > 1:
185 |                         job.sge_options="-pe shm %s -l h_stack=100M -l h_rt=120:00:00 -A %s"%(args.threads, args.account)
186 |         elif (readjob1 is not None and readjob2 is not None):
187 |                 readfile1=File(readjob1.output)
188 |                 readfile2=File(readjob2.output)
189 |                 job = Job('bwa-%s' % readfile1.prefix)
190 |                 job.memory="%sG"%(args.memory/args.threads)
191 |                 job.append('bwa_fq.sh %s %s %s \"%s\"'%(readfile1,readfile2,args.threads,readgroup))
192 |                 job.depend(readjob1).depend(readjob2)
193 |                 if args.threads > 1:
194 |                         job.sge_options="-pe shm %s -l h_stack=100M -l h_rt=120:00:00 -A %s"%(args.threads, args.account)
195 | 
196 |         return job
197 | 
198 | def cleanup(pjobs, ext):
199 |         jobs=[]
200 |         for pjob in pjobs:
201 |                 bam=pjob.output
202 |                 if (ext!=".recal.bam"):
203 | 			job1=__cleanup('picard_nodup-%s'%bam.prefix, 'picard_nodup.sh', bam, bam.chext("nodup.bam"), False)
204 |                 	job2=__cleanup('gatk_realn-%s'%bam.prefix, 'gatk_realn.sh', job1.output, bam.chext("realn.bam"), args.relax_realignment)
205 |                 	job3=__cleanup('gatk_recal-%s'%bam.prefix, 'gatk_recal.sh', job2.output, bam.chext("recal.bam"), False)
206 | 		else:
207 |                 	job1=__cleanup('picard_nodup-%s'%bam.prefix, 'picard_nodup.sh', bam, bam, False)
208 | 			job2=__cleanup('gatk_realn-%s'%bam.prefix, 'gatk_realn.sh', job1.output, bam, args.relax_realignment)
209 |                 	job3=__cleanup('gatk_recal-%s'%bam.prefix, 'gatk_recal.sh', job2.output, bam, False)
210 |                 job1.depend(pjob)
211 |                 job2.depend(job1)
212 |                 job3.depend(job2)
213 |                 jobs.append(job3)
214 |         return jobs
215 | 
216 | def __cleanup(jname, cmd, input, output, remove):
217 |         job=Job(jname)
218 |         job.memory = "24G"
219 |         job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
220 |         job.append('%s %s %s %s'%(cmd, input, output, remove))
221 |         job.append('samtools_index.sh %s' % output)
222 |         job.output=output
223 |         return job
224 | 
225 | def binning(pjobs, fai):
226 |         jobs=[]
227 |         chrs=[]
228 |         for l in open(fai):
229 | 		m=re.match(r"(chr..|chr.)\t", l)
230 | 		if m:
231 |                 	chrs.append(m.group(1))
232 | 
233 |         for chr in chrs:
234 |                 chrBam=File(outdir, chr+".bam")
235 |                 job = Job('bin_aln-%s'%chr)
236 |                 job.memory = "3G"
237 |         	job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
238 |                 job.output = chrBam
239 |                 job.append('bin_bam.sh %s %s %s'%(chr, chrBam, " ".join([pjob.output.path for pjob in pjobs])))
240 |                 job.append('samtools_index.sh %s'%chrBam)
241 |                 job.depend(*pjobs)
242 |                 jobs.append(job)
243 | 	return jobs
244 | 
245 | def callvars(pjobs, combine, variants):
246 |         jobs=([],[])
247 |         if len(pjobs)>0:
248 |                 if not combine:
249 |                         for pjob in pjobs:
250 |                                 __callvars(jobs, pjob.output.prefix, pjob.output.absprefix, [pjob.output.path], [pjob], variants)
251 |                 else:
252 |                         __callvars(jobs, sample, File(outdir.path, sample).path, [pjob.output.path for pjob in pjobs], pjobs, variants)
253 |         return jobs
254 | 
255 | def __callvars(jobs, idprefix, output, inputs, pjobs, variants):
256 |         input=" ".join(inputs)
257 |         output="".join(output.split(".recal"))
258 |         jobs1=jobs[0]
259 | 
260 | 	if (variants is None or "gatk" in variants):		
261 |                 job0=Job('gatk_vc-%s'%idprefix)
262 |                 job0.memory = "16G"
263 |                 job0.sge_options="-l h_rt=120:00:00 -A %s"%args.account
264 |                 job0.output=File(output+".gatk.vcf")
265 |                 job0.append('gatk_vc.sh %s %s %s %s %s %s'%(job0.output, capture, args.reference_calls, args.snp_hapcaller, args.indel_hapcaller, input))
266 |                 job0.depend(*pjobs)
267 | 		if args.donegenotyping:
268 | 			job0.status="done"
269 | 		
270 |                 if (not args.vqsrchrom):
271 |                         jobs1.append(job0)
272 |                 else:
273 | 			job1=Job('vqsr_snp-%s'%idprefix)
274 | 	      		job1.memory = "16G"
275 |        			job1.sge_options="-l h_rt=120:00:00 -A %s"%args.account
276 | 	        	job1.output=File(job0.output)
277 |        		        job1.append('vqsr_snp.sh %s %s %s %s %s'%(job1.output, not args.nosnpvqsr, args.targeted, not args.snp_hapcaller, args.donesnpvqsr))
278 |         		job1.depend(job0)
279 | 			
280 | 			job2=Job('vqsr_indel-%s'%idprefix)
281 |         	  	job2.memory = "16G"
282 |        			job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account
283 | 		        job2.output=File(job0.output)
284 |       			job2.append('vqsr_indel.sh %s %s %s'%(job2.output, not args.noindelvqsr, args.targeted))
285 |                		job2.depend(job1)
286 | 	
287 | 			job3=Job('combine_vqsr-%s'%idprefix)
288 | 		        job3.memory = "12G"
289 | 			job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account
290 | 	       		job3.output=File(output+".snv.vcf")
291 |      		
292 | 		       	if args.reference_calls:
293 | 				job3.append('combine_vcf.sh %s %s %s %s %s'%(output+".snv.vcf", False, False, output+".vqsr.snp.vcf", output+".vqsr.indel.vcf"))
294 | 				job3.append('write_refcalls.sh %s'%(job1.output))
295 | 				job3.append('combine_vcf.sh %s %s %s %s %s %s'%(output+".snv.refcalls.vcf", False, True, output+"refcalls.vcf", output+".vqsr.snp.vcf", output+".vqsr.indel.vcf"))
296 |      		        else:
297 | 				job3.append('combine_vcf.sh %s %s %s %s %s'%(output+".snv.vcf", False, False, output+".vqsr.snp.vcf", output+".vqsr.indel.vcf"))
298 |        	        	job3.depend(job2)
299 | 		
300 | 			jobs1.append(job3)
301 | 
302 | 	jobs2=jobs[1]
303 | 	job=None
304 |         if (variants is None or "breakdancer" in variants):
305 |                 job=Job('breakdancer-%s'%idprefix)
306 |                 job.memory = "24G"
307 |         	job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
308 |                 job.output=File(output+".breakdancer.gff")
309 |                 jobs2.append(job.append('breakdancer.sh %s %s'%(job.output,input)).depend(*pjobs))
310 |         if (variants is None or "pindel" in variants):
311 |                 rpmJob=job
312 |                 job=Job('pindel-%s'%idprefix)
313 |                 job.memory = "24G"
314 |         	job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
315 |                 job.output=File(output+".pindel.gff")
316 |                 jobs2.append(job.append('pindel.sh %s %s'%(job.output,input)).depend(*pjobs if rpmJob is None else [rpmJob]))
317 |         if (variants is None or "cnvnator" in variants):
318 |                 job=Job('cnvnator-%s'%idprefix)
319 |                 job.memory = "24G"
320 |         	job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
321 |                 job.output=File(output+".cnvnator.gff")
322 |                 jobs2.append(job.append('cnvnator.sh %s %s'%(job.output,input)).depend(*pjobs))
323 |         if (variants is None or "breakseq" in variants):
324 |                 job=Job('breakseq-%s'%idprefix)
325 |                 job.memory = "24G"
326 |         	job.sge_options="-l h_rt=120:00:00 -A %s"%args.account
327 |                 job.output=File(output+".breakseq.gff")
328 |                 jobs2.append(job.append('breakseq.sh %s %s'%(job.output,input)).depend(*pjobs))
329 | 
330 | def group_output_by_suffix(suffixes, jobs):
331 |         groups={}
332 | 	groups[suffixes]=[]
333 |         for i in jobs:
334 |                 if i.output.path.endswith(suffixes):
335 |                         groups[suffixes].append(i.output.path)
336 |         return groups
337 | 
338 | def group_output_bams_by_suffix(suffixes, bams, jobs):
339 |         groups={}
340 |         groups[suffixes]=[]
341 |         for i in jobs:
342 |                 if i.output.path.endswith(suffixes):
343 |                         out = str(i.output.path)
344 |                         out=out.replace(suffixes, bams)
345 |                         groups[suffixes].append(out)
346 |         return groups
347 | 
348 | 
349 | def merge_annotate(siJobs, svJobs, variants):
350 |         jobs=[]
351 | 
352 | 	keys=".gatk.vcf"
353 |         if (args.vqsrchrom):
354 | 		keys=".snv.vcf"
355 |         siCombinedVCFs=group_output_by_suffix(keys, siJobs)
356 | 		
357 |         if variants is None or "gatk" in variants:
358 |                 job1=Job('concat-vcf-%s'%sample)
359 |                 for i in siCombinedVCFs.keys():
360 | 
361 |                         if (not args.vqsrchrom):
362 | 				if args.nobinning:
363 |                                 	job1.append('combine_vcf.sh %s %s %s %s'%(File(outdir.path, sample+".gatk.vcf"), True, False, " ".join(siCombinedVCFs[i])))
364 |                                 else:
365 | 					job1.append('combine_vcf.sh %s %s %s %s'%(File(outdir.path, sample+".gatk.vcf"), False, False, " ".join(siCombinedVCFs[i])))
366 |                         else:
367 |                                 job1.append('combine_vcf.sh %s %s %s %s'%(File(outdir.path, "genome.recal.vcf"), False, False, " ".join(siCombinedVCFs[i])))
368 | 
369 |         	if (not args.vqsrchrom):
370 | 	                job1.memory = "16G"
371 |         	        job1.sge_options="-l h_rt=120:00:00 -A %s"%args.account
372 |                 	job1.output=File(outdir.path, sample+".gatk.vcf")
373 | 	                job1.depend(*siJobs)
374 | 	                if args.donegenotyping:
375 | 				job1.status="done"
376 | 
377 | 			job2=Job('vqsr_snp-%s'%sample)
378 |       			job2.memory = "16G"
379 |        			job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account
380 | 	        	job2.output=File(job1.output)
381 |        		        job2.append('vqsr_snp.sh %s %s %s %s %s'%(job2.output, not args.nosnpvqsr, args.targeted, not args.snp_hapcaller, args.donesnpvqsr))
382 |         		job2.depend(job1)
383 | 		
384 | 			job3=Job('vqsr_indel-%s'%sample)
385 |         	  	job3.memory = "16G"
386 |        			job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account
387 | 		        job3.output=File(job1.output)
388 |       			job3.append('vqsr_indel.sh %s %s %s'%(job3.output, not args.noindelvqsr, args.targeted))
389 |                		job3.depend(job2)
390 | 
391 | 			job4=Job('combine_vqsr-%s'%sample)
392 | 		        job4.memory = "12G"
393 |                 	job4.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account)
394 | 	       		job4.output=File(outdir.path, sample+".vcf")
395 | 
396 | 			if args.reference_calls:
397 |      		        	job4.append('combine_vcf.sh %s %s %s %s %s'%(File(outdir.path, sample+".vcf"), False, False, File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf")))
398 |                                 job4.append('write_refcalls.sh %s'%(job1.output))
399 |                                 job4.append('combine_vcf.sh %s %s %s %s %s %s'%(File(outdir.path, sample+".snv.refcalls.vcf"), False, True, File(outdir.path, sample+".refcalls.vcf"), File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf")))
400 |        	        	else:	
401 |      		        	job4.append('combine_vcf.sh %s %s %s %s %s'%(File(outdir.path, sample+".vcf"), False, False, File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf")))
402 | 
403 | 			job4.depend(job3)
404 | 			jobs.append(job4)
405 | 		else:
406 | 
407 | 	                job1=Job('anno_vcf-%s'%sample)
408 |         	        job1.memory = "16G"
409 |                 	job1.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account)
410 | 	                job1.output=File(outdir.path, sample+".vcf.tsv")
411 |         	        job1.append('annotate.py %s %s'%(job1.output, job0.output)).depend(job0)
412 |                 
413 | 			jobs.append(job1)
414 |         
415 | 	if variants is None or "breakdancer" in variants or "cnvnator" in variants or "pindel" in variants or "breakseq" in variants:
416 |                 inputs=" ".join([j.output.path for j in svJobs])
417 |                 job2=Job('merge_gff-%s'%sample)
418 |                 job2.memory = "5G"
419 |                 job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account
420 |                 job2.output=File(outdir.path, sample+".gff")
421 |                 job2.append('merge_gff.sh %s %s'%(job2.output, inputs)).depend(*svJobs)
422 | 
423 |                 job3=Job('anno_gff-%s'%sample)
424 |                 job3.memory = "6G"
425 |                 #job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account
426 |                 job3.output=File(outdir.path, sample+".gff.tsv")
427 |                 job3.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account)
428 |                 job3.append('annotate.py %s %s'%(job3.output, job2.output)).depend(job2)
429 |                 jobs.append(job3)
430 | 
431 |         return jobs
432 | 
433 | def markdone(jobs, mark=True):
434 |         if mark:
435 |                 for job in jobs:
436 |                         if len(job.dependents)>0:
437 |                                 markdone(job.dependents, mark)
438 |                         job.status='done'
439 | 
440 | extension=None
441 | if args.bam:
442 | 	if args.cleanuponly:
443 | 		extension=".bam"
444 | 	elif args.variantonly:
445 | 		extension=".recal.bam"
446 | 	else:
447 | 		extension=None
448 | 	jobs1=prep(args.reads1, extension)
449 | 	jobs2=()
450 | else:
451 | 	jobs1=prep(args.reads1, extension)
452 | 	jobs2=prep(args.reads2, extension)
453 | 
454 | jobs=[]
455 | jobs=align(jobs1, jobs2,extension)
456 | markdone(jobs, args.donealign or args.cleanuponly or args.variantonly)
457 | 
458 | if args.cleanuponly or args.variantonly or args.alignmentonly:
459 | 	args.nobinning = True
460 | 
461 | if not args.nobinning:
462 | 	jobs=binning(jobs, refi)
463 |         markdone(jobs, args.donebinning or args.variantonly)
464 | 
465 | if not args.nocleanup:
466 |         jobs=cleanup(jobs, extension)
467 |         markdone(jobs, args.donecleanup or args.variantonly)
468 | 
469 | if not args.alignmentonly and not args.cleanuponly and not args.novariant:
470 | 	siJobs, svJobs=callvars(jobs, args.nobinning, args.variants)
471 |         jobs=merge_annotate(siJobs, svJobs, args.variants)
472 | 
473 | descout = sys.stdout if jobfile is None else open(jobfile.path, "w")
474 | descout.write(Job().depend(*jobs).desc())
475 | descout.flush()
476 | 
477 | if args.submit:
478 |         print >> sys.stderr, "Submitting jobs (%s) through SJM"%jobfile
479 |         os.system("sjm %s &" %jobfile)
480 | 


--------------------------------------------------------------------------------