├── bin ├── nop.sh ├── .DS_Store ├── hugeseq_mod.sh ├── remove_bam.sh ├── prep.sh ├── clean_nodup.sh ├── breakseq.sh ├── fix_bai_name.py ├── samtools_index.sh ├── picard_nodup.sh ├── picard_sort.sh ├── bwa_bam.sh ├── write_refcalls.sh ├── bwa_fq.sh ├── bin_bam.sh ├── merge_gff.sh ├── combine_vcf.sh ├── util.py ├── breakdancer.sh ├── cnvnator.sh ├── sjm.py ├── gatk_realn.sh ├── pindel.sh ├── gatk_recal.sh ├── vqsr_indel.sh ├── vqsr_snp.sh ├── annotate.py ├── gatk_vc.sh └── hugeseq ├── .DS_Store ├── LICENSE ├── RELEASENOTES ├── modulefiles └── hugeseq │ └── 2.0 └── README /bin/nop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | echo "nothing to be done!" 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StanfordBioinformatics/HugeSeq/HEAD/.DS_Store -------------------------------------------------------------------------------- /bin/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StanfordBioinformatics/HugeSeq/HEAD/bin/.DS_Store -------------------------------------------------------------------------------- /bin/hugeseq_mod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | module load hugeseq/2.0 4 | export TMP=$1 5 | echo $TMP 6 | shift 7 | export LOGFILE=$1 8 | echo $LOGFILE 9 | shift 10 | 11 | $* 12 | -------------------------------------------------------------------------------- /bin/remove_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "*** SAM/BAM Removel ***" 4 | 5 | for i in $* 6 | do 7 | if [ -z "${i/*.bam/}" -o -z "${i/*.sam/}" ] 8 | then 9 | echo ">> Removing SAM/BAM file: $i" 10 | rm -f $i $i.bai 11 | fi 12 | done 13 | 14 | echo 
"*** Finished SAM/BAM Removal ***" 15 | -------------------------------------------------------------------------------- /bin/prep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# -lt 2 ] 4 | then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | i=`cd \`dirname $1\`; pwd`/`basename $1` 10 | l=`cd \`dirname $2\`; pwd`/`basename $2` 11 | 12 | echo ">> Creating link to input sequence file" 13 | echo "-- Input: $i" 14 | echo "-- Link : $l" 15 | 16 | if [ ! -e $l ] 17 | then 18 | ln -sf $i $l 19 | fi 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Product Name: 2 | HugeSeq v2.0 3 | 4 | Description: 5 | An integrated pipeline for detecting and annotating genetic variations using high-throughput genome sequencing. 6 | 7 | Copyright: 8 | Stanford Center for Genomics and Personalized Medicine (SCGPM), Stanford School of Medicine, Stanford, California. 9 | 10 | Download: 11 | https://github.com/StanfordBioinformatics/HugeSeq 12 | 13 | License: 14 | This work is licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/3.0/. 
-------------------------------------------------------------------------------- /bin/clean_nodup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Removing duplicates ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 [remove, default: true]" 8 | exit 1 9 | fi 10 | 11 | rmdup="true" 12 | if [ $# -gt 2 ] 13 | then 14 | rmdup=$3 15 | fi 16 | 17 | f=`cd \`dirname $1\`; pwd`/`basename $1` 18 | o=`cd \`dirname $2\`; pwd`/`basename $2` 19 | 20 | echo ">>> Marking duplicates" 21 | java -Xms5g -Xmx5g -jar $PICARD/MarkDuplicates.jar \ 22 | TMP_DIR=$TMP \ 23 | I=${f} \ 24 | O=${o} \ 25 | M=${o/.bam/.metrics} \ 26 | VALIDATION_STRINGENCY=SILENT \ 27 | ASSUME_SORTED=true \ 28 | REMOVE_DUPLICATES=$rmdup 29 | 30 | echo "*** Finished removing duplicates ***" 31 | -------------------------------------------------------------------------------- /bin/breakseq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Calling SV using BreakSeq: $BREAKSEQ ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 ..." 8 | exit 1 9 | fi 10 | 11 | output=`cd \`dirname $1\`; pwd`/`basename $1` 12 | shift 13 | 14 | bams='' 15 | for i in $* 16 | do 17 | bams="$bams `cd \`dirname $i\`; pwd`/`basename $i`" 18 | done 19 | 20 | echo ">> Invoking the BreakSeq (Lite) program (Library: $BPLIB)" 21 | $BREAKSEQ/bin/breakseq $output $bams 22 | 23 | if [ -e "${output}.2" ] 24 | then 25 | rm ${output}.2 26 | fi 27 | sort -k1,1 -k4n -k5n < $output >> ${output}.2 28 | mv ${output}.2 $output 29 | 30 | echo "*** Finished Calling SV using BreakSeq ***" 31 | -------------------------------------------------------------------------------- /bin/fix_bai_name.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | ## Fixes BAM index file name to match what GATK expects. 
4 | 5 | import os 6 | import sys 7 | 8 | 9 | orig_filename = sys.argv[1] 10 | 11 | if not os.path.exists(orig_filename): 12 | print "Cannot find file", sys.argv[1] 13 | raise SystemExit(1) 14 | if not orig_filename.endswith('.bam.bai'): 15 | print "Filename doesn't end with .bam.bai" 16 | raise SystemExit(0) 17 | 18 | new_filename = orig_filename[:-8] + '.bai' 19 | 20 | if os.path.exists(new_filename): 21 | print "Existing file %s renamed to %s.old" % (new_filename, new_filename) 22 | os.rename(new_filename, new_filename + '.old') 23 | 24 | os.link(orig_filename, new_filename) 25 | -------------------------------------------------------------------------------- /bin/samtools_index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Indexing2 BAM ***" 4 | echo " " >> $LOGFILE 5 | echo "*** Indexing BAM ***" >> $LOGFILE 6 | 7 | if [ $# -lt 1 ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | f=`cd \`dirname $1\`; pwd`/`basename $1` 14 | 15 | if [ ! 
-e $f.bai ] 16 | then 17 | 18 | command="samtools index $f" 19 | echo ">>> BAM file $f is being indexed" 20 | echo ">>> BAM file $f is being indexed" >> $LOGFILE 21 | echo ">>> $command" 22 | $command 23 | 24 | echo ">>> Fixing BAI name" 25 | echo ">>> Fixing BAI name" >> $LOGFILE 26 | command="python $HUGESEQ_HOME/bin/fix_bai_name.py $f.bai" 27 | echo ">>> $command" 28 | echo ">>> $command" >> $LOGFILE 29 | $command 30 | fi 31 | 32 | echo "*** Finished indexing BAM ***" 33 | -------------------------------------------------------------------------------- /bin/picard_nodup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Marking duplicates ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 [remove, default: false]" 8 | exit 1 9 | fi 10 | 11 | input=`cd \`dirname $1\`; pwd`/`basename $1` 12 | output=`cd \`dirname $2\`; pwd`/`basename $2` 13 | 14 | cp $input $output 15 | 16 | command="java -Xms5g -Xmx5g -jar $PICARD/MarkDuplicates.jar \ 17 | TMP_DIR=${TMP} \ 18 | I=${input} \ 19 | O=${output} \ 20 | M=${output/.bam/.metrics} \ 21 | VALIDATION_STRINGENCY=SILENT \ 22 | ASSUME_SORTED=true \ 23 | REMOVE_DUPLICATES=false" 24 | 25 | echo ">>> Marking duplicates" 26 | echo ">>> Marking duplicates" >> $LOGFILE 27 | echo ">>> $command" 28 | echo ">>> $command" >> $LOGFILE 29 | $command 30 | 31 | echo "*** Finished marking duplicates ***" 32 | -------------------------------------------------------------------------------- /bin/picard_sort.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Sorting BAM by position ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 [memory in GB]" 8 | exit 1 9 | fi 10 | 11 | f=`cd \`dirname $1\`; pwd`/`basename $1` 12 | o=`cd \`dirname $2\`; pwd`/`basename $2` 13 | 14 | if [ "$2" = '-' ] 15 | then 16 | o=$f.sorted 17 | fi 18 | 19 | gmem=5 20 | if [ $# -gt 2 ] 21 | then 22 | gmem=$3 23 | 
fi 24 | 25 | command="java -Xms${gmem}g -Xmx${gmem}g -jar $PICARD/SortSam.jar \ 26 | TMP_DIR=$TMP \ 27 | INPUT=$f \ 28 | OUTPUT=$o \ 29 | MAX_RECORDS_IN_RAM=$(($gmem*250000)) \ 30 | VALIDATION_STRINGENCY=SILENT \ 31 | SORT_ORDER=coordinate" 32 | 33 | echo ">>> Sorting on BAM $f" 34 | echo ">>> Sorting on BAM $f" >> $LOGFILE 35 | echo ">>> $command" 36 | echo ">>> $command" >> $LOGFILE 37 | $command 38 | 39 | if [ "$2" = '-' ] 40 | then 41 | mv $o $f 42 | o=$f 43 | fi 44 | 45 | echo "*** Finished sorting BAM by position ***" 46 | -------------------------------------------------------------------------------- /bin/bwa_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Aligning reads using BWA MEM algorithm ***" 4 | echo " " >> $LOGFILE 5 | echo "*** Aligning reads using BWA MEM algorithm ***" >> $LOGFILE 6 | 7 | if [ $# -lt 1 ] 8 | then 9 | echo "Usage: $0 [num of threads] [RG tag] or $0 [num of threads] [RG tag]" 10 | exit 1 11 | fi 12 | 13 | 14 | bam=`cd \`dirname $1\`; pwd`/`basename $1` 15 | optRG="" 16 | lastArg=${BASH_ARGV[0]} 17 | if [[ $lastArg =~ "@RG" ]] 18 | then 19 | optRG="-R $lastArg" 20 | fi 21 | 22 | optT="" 23 | seclastArg=${@: -2:1} 24 | optT="-t $seclastArg" 25 | 26 | echo ">> BAM input" 27 | echo ">> BAM input" >> $LOGFILE 28 | command="samtools bam2fq $bam | bwa mem -CMp $optT $optRG $REF - | samtools view -Sbt $REF.fai -o ${bam/.bam/}.bwa.bam -" 29 | echo ">>> $command" 30 | samtools bam2fq $bam | bwa mem -CMp $optT $optRG $REF - | samtools view -Sbt $REF.fai -o ${bam/.bam/}.bwa.bam - 31 | echo ">>> $command" >> $LOGFILE 32 | echo "*** Finished aligning reads ***" 33 | -------------------------------------------------------------------------------- /bin/write_refcalls.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo ">>> Writing reference calls" 4 | echo " " >> $LOGFILE 5 | echo ">>> Writing reference 
calls" >> $LOGFILE 6 | 7 | set -e 8 | 9 | if [ $# -lt 1 ] 10 | then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | START_VCF=`cd \`dirname $1\`; pwd`/`basename $1` 16 | PREFIX=`dirname $START_VCF` 17 | SUFFIX=`basename $START_VCF` 18 | SAMPLE=${SUFFIX/.gatk.vcf/} 19 | 20 | REFCALL_VCF=$PREFIX/$SAMPLE.refcalls.vcf 21 | 22 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \ 23 | -T SelectVariants \ 24 | -R $REF \ 25 | -V $START_VCF \ 26 | -o $REFCALL_VCF \ 27 | -selectType NO_VARIATION" 28 | 29 | echo ">>> Select reference calls" 30 | echo ">>> Select reference calls" >> $LOGFILE 31 | echo ">>> $command &> $PREFIX/$SAMPLE.select.refcalls.log" 32 | echo ">>> $command &> $PREFIX/$SAMPLE.select.refcalls.log" >> $LOGFILE 33 | $command &> $PREFIX/$SAMPLE.select.refcalls.log 34 | 35 | echo ">>> Finished writine reference calls" 36 | -------------------------------------------------------------------------------- /bin/bwa_fq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Aligning reads using BWA MEM algorithm ***" 4 | echo " " >> $LOGFILE 5 | echo "*** Aligning reads using BWA MEM algorithm ***" > $LOGFILE 6 | 7 | if [ $# -lt 1 ] 8 | then 9 | echo "Usage: $0 [num of threads] [RG tag] or $0 [num of threads] [RG tag]" 10 | exit 1 11 | fi 12 | 13 | fq1=`cd \`dirname $1\`; pwd`/`basename $1` 14 | fq2=`cd \`dirname $2\`; pwd`/`basename $2` 15 | 16 | optRG="" 17 | lastArg=${BASH_ARGV[0]} 18 | if [[ $lastArg =~ "@RG" ]] 19 | then 20 | optRG="-R $lastArg" 21 | fi 22 | optRG="-R @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE" 23 | 24 | optT="" 25 | seclastArg=${@: -2:1} 26 | optT="-t $seclastArg" 27 | 28 | if [[ ${fq1: -6} == ".fastq" ]] 29 | then 30 | fq=$(echo $fq1 | sed -e "s/.fastq//g") 31 | output="${fq}bam" 32 | elif [[ ${fq1: -9} == ".fastq.gz" ]] 33 | then 34 | fq=$(echo $fq1 | sed -e "s/.fastq.gz//g") 35 | fi 36 | 37 | command="bwa mem $REF $fq1 $fq2 $optT $optRG | samtools 
view -Sbt $REF.fai -o $fq.bwa.bam -" 38 | echo ">>> $command" 39 | echo ">>> $command" >> $LOGFILE 40 | bwa mem $REF $fq1 $fq2 $optT $optRG | samtools view -Sbt $REF.fai -o $fq.bwa.bam - 41 | 42 | echo "*** Finished aligning reads ***" 43 | 44 | -------------------------------------------------------------------------------- /bin/bin_bam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Splitting BAM by chromosome ***" 4 | echo " " >> $LOGFILE 5 | echo "*** Splitting BAM by chromosome ***" >> $LOGFILE 6 | 7 | if [ $# -lt 3 ] 8 | then 9 | echo "Usage: $0 ..." 10 | exit 1 11 | fi 12 | 13 | chr=$1 14 | out=`cd \`dirname $2\`; pwd`/`basename $2` 15 | shift 2 16 | 17 | bams='' 18 | 19 | for f in $* 20 | do 21 | f=`cd \`dirname $f\`; pwd`/`basename $f` 22 | 23 | echo ">>> Extracting $chr from BAM: $f" 24 | echo ">>> Extracting $chr from BAM: $f" >> $LOGFILE 25 | o=${f/.bam/}.$chr.bam 26 | 27 | if [ $chr = 'UNK' -o $chr = 'chrU' -o $chr = 'U' ] 28 | then 29 | command="samtools view $f -f 12 -bo $o" 30 | samtools view $f -f 12 -bo $o 31 | else 32 | command="samtools view $f $chr -bo $o" 33 | samtools view $f $chr -bo $o 34 | fi 35 | echo ">>> $command" 36 | echo ">>> $command" >> $LOGFILE 37 | bams="$bams $o" 38 | done 39 | 40 | echo ">>> Merging $chr BAMs into $out" 41 | echo ">>> Merging $chr BAMs into $out" >> $LOGFILE 42 | if [ $# -gt 1 ] 43 | then 44 | command="samtools merge $out $bams" 45 | samtools merge $out $bams 46 | echo ">>> $command" 47 | echo ">>> $command" >> $LOGFILE 48 | rm $bams 49 | else 50 | mv $bams $out 51 | fi 52 | 53 | echo "*** Finished splitting BAM by chromosome ***" 54 | -------------------------------------------------------------------------------- /bin/merge_gff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Performing GFF Merging ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 ..." 
8 | exit 1 9 | fi 10 | 11 | output=`cd \`dirname $1\`; pwd`/`basename $1` 12 | input=${output/.gff/}.raw.gff 13 | 14 | shift 15 | 16 | inputs='' 17 | for i in $* 18 | do 19 | inputs="$inputs `cd \`dirname $i\`; pwd`/`basename $i`" 20 | done 21 | 22 | cat $inputs > $input 23 | 24 | features="`cut -f 3 $input | sort -u`" 25 | sources="`cut -f 2 $input | sort -u`" 26 | 27 | for feat in $features 28 | do 29 | f=$input.${feat} 30 | grep " $feat " $input > $f 31 | for src in $sources 32 | do 33 | grep " $src " $f | mergeBed -i stdin > $f.$src 34 | done 35 | cat $f.* > $f 36 | > $f.dup 37 | > $f.uni 38 | intersectBed -a $f -b $f -f 0.5 -r -c | awk '{print $0 > ($NF>1? "'$f.dup'": "'$f.uni'")}' 39 | if [ -s $f.uni ]; 40 | then 41 | mergeBed -c -i $f.uni > $f.uni.merged 42 | fi 43 | if [ -s $f.dup ]; 44 | then 45 | mergeBed -c -i $f.dup > $f.dup.merged 46 | fi 47 | for i in $f.*.merged 48 | do 49 | dup="0" 50 | if [ -n "${i/*.uni.merged/}" ]; then dup="1"; fi 51 | awk -F '\t' '{qual="LowQual"; if ('$dup' && $4>=2) qual="PASS"; print $1"\tHugeSeq\t'$feat'\t"$2"\t"$3"\t"qual"\t.\t.\tEVENTS "$4}' $i 52 | done 53 | rm $f $f.* 54 | done > $output 55 | -------------------------------------------------------------------------------- /bin/combine_vcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | if [ $# -lt 2 ] 4 | then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | out=`cd \`dirname $1\`; pwd`/`basename $1` 10 | shift 11 | 12 | noop=$1 13 | shift 14 | 15 | zip=$1 16 | shift 17 | 18 | inputs=`cd \`dirname $1\`; pwd`/`basename $1` 19 | shift 20 | for i in $* 21 | do 22 | inputs="$inputs --variant `cd \`dirname $i\`; pwd`/`basename $i`" 23 | done 24 | 25 | if [[ "$noop" == "False" ]] 26 | then 27 | command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \ 28 | -R $REF \ 29 | -T CombineVariants \ 30 | --variant $inputs \ 31 | -o $out" 32 | 33 | echo ">>> Combining VCFs" 34 | echo ">>> Combining VCFs" >> $LOGFILE 
import os
import re


class File:
    """A filesystem path split into its commonly needed components.

    The constructor only manipulates the path string; nothing is touched on
    disk until ``exists()`` (or ``Dir.mkdirs()``) is called.

    Attributes set by ``__init__``:
        path      -- absolute path (``os.path.abspath`` of the input)
        dir       -- directory part of ``path``
        name      -- final path component
        prefix    -- ``name`` without its last ``.ext`` (whole name if none)
        ext       -- text after the last dot in ``name`` ('' if none)
        absprefix -- ``dir`` joined with ``prefix``
    """

    def __init__(self, path, name=None):
        """Build from ``path`` alone, or from a directory ``path`` plus ``name``.

        Both arguments are passed through ``str()`` so other File/Dir objects
        can be given directly.
        """
        if name is None:
            fullpath = str(path)
        else:
            fullpath = os.path.join(str(path), str(name))
        self.path = os.path.abspath(fullpath)
        self.dir = os.path.dirname(self.path)
        self.name = os.path.basename(self.path)
        # Split on the LAST dot only; at least one character must precede it,
        # so a name such as ".bashrc" is treated as having no extension.
        nmatch = re.match(r"(.+)\.([^.]+)$", self.name)
        self.prefix = self.name if nmatch is None else nmatch.group(1)
        self.ext = '' if nmatch is None else nmatch.group(2)
        self.absprefix = os.path.join(self.dir, self.prefix)

    def __str__(self):
        """Return the absolute path, so instances can be used in string formatting."""
        return self.path

    def chdir(self, dir):
        """Return a new File with the same name located in directory ``dir``."""
        return File(dir, self.name)

    def chext(self, ext):
        """Return a new File whose last extension is replaced by ``ext``."""
        return File(self.absprefix + "." + ext)

    def appext(self, ext):
        """Return a new File with ``.ext`` appended after the existing name."""
        return File(self.dir + "/" + self.name + "." + ext)

    def exists(self):
        """Return True if the path currently exists on disk."""
        return os.path.exists(self.path)

    def desc(self):
        """Return a ``name:<TAB>value`` line for every public data attribute.

        Attributes are listed in ``dir()`` order; methods and dunder names are
        skipped.  (The earlier version lacked ``self`` and tested
        ``callable()`` on the attribute *name*, so it could neither be called
        on an instance nor filter out methods.)
        """
        members = [attr for attr in dir(self)
                   if not attr.startswith("__")
                   and not callable(getattr(self, attr))]
        return "".join("%s:\t%s\n" % (member, getattr(self, member))
                       for member in members)


class Dir(File):
    """A File that denotes a directory and can create itself."""

    def __init__(self, path, name=None):
        File.__init__(self, path, name)

    def mkdirs(self):
        """Create the directory and any missing parents.

        Returns:
            False if the path already existed (nothing is done), True if the
            directory tree was created by this call.
        """
        if self.exists():
            return False
        os.makedirs(self.path)
        return True
Finished Calling SV using Read-Pair Mapping ***" 48 | -------------------------------------------------------------------------------- /bin/cnvnator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Calling CNV using Read-Depth Analysis: $CNVNATOR ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | CNVNATOR=$CNVNATOR/cnvnator 12 | 13 | binsize=100 14 | 15 | o=`cd \`dirname $1\`; pwd`/`basename $1` 16 | shift 17 | 18 | bams='' 19 | for i in $* 20 | do 21 | bams="$bams `cd \`dirname $i\`; pwd`/`basename $i`" 22 | done 23 | 24 | 25 | if [ $# -eq 1 ] 26 | then 27 | if [[ "$1" =~ ".*chr[^.]*\..*" ]] 28 | then 29 | chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'` 30 | CNVNATOR="$CNVNATOR -chrom $chr" 31 | fi 32 | fi 33 | 34 | if [ -z "${1/*.root/}" ] 35 | then 36 | echo ">> Processing root file: $1" 37 | p=${1/.root/} 38 | else 39 | echo ">> Extracting read mapping from bam files: $CNVNATOR -root ${o/.gff/}.root -tree $bams" 40 | p=${o/.gff/} 41 | $CNVNATOR -root $p.root -tree $bams 42 | fi 43 | 44 | ( 45 | echo ">> Generating histogram" 46 | $CNVNATOR -root $p.root -his $binsize -d `dirname $REF` 47 | echo ">> Calculating statistics" 48 | $CNVNATOR -root $p.root -stat $binsize 49 | echo ">> RD signal partitioning" 50 | $CNVNATOR -root $p.root -partition $binsize 51 | echo ">> CNV calling" 52 | $CNVNATOR -root $p.root -call $binsize | grep -v "\(WARN\)\|\(==\)" > $p.txt 53 | ) 2>&1 | grep -v bound | grep -v Zero | grep -v corrected 54 | 55 | echo ">> Converting output to GFF" 56 | 57 | minsize=50 58 | awkopt='{feat=$4; if ($4=="deletion") feat="Deletion"; else if ($4=="duplication") feat="Duplication"; if ($5>='$minsize') print $1"\tCNVnator\t"feat"\t"$2"\t"$3"\t"$7" "$8" "$9" "$10" "$11"\t.\t.\tSize "$5";RD "$6};' 59 | 60 | if [ -e "$o" ] 61 | then 62 | rm $o 63 | fi 64 | cut -f 1,2,3,4,5,6 $p.txt | sed 's/\(.*\) \(.*\):\(.*\)-\(.*\) \(.*\) \(.*\) \(.*\) 
\(.*\)/\2 \3 \4 \1 \5 \6 p1: \7 | p2: \8/' | awk "$awkopt" | sort -k1,1 -k4n -k5n -u >> $o 65 | 66 | echo "*** Finished CNV Calling using Read-Depth Analysis" 67 | -------------------------------------------------------------------------------- /RELEASENOTES: -------------------------------------------------------------------------------- 1 | ################################## 2 | # Release Notes for HugeSeq v2.0 # 3 | ################################## 4 | 5 | ++++ 7/14/2015 ++++ 6 | 7 | NOTE: This is a beta release. Several components have been modified in the package to accommodate the following changes. 8 | 9 | ## Tool "required" upgrades ## 10 | 11 | The following upgrades are "required" for HugeSeq-2.0 to operate properly: 12 | 13 | * GATK -> 3.2.2 14 | * root -> 5.34.30 15 | * pindel -> 0.2.4t 16 | * vcftools -> 0.1.12 17 | 18 | ## Major changes since 1.2 ## 19 | 20 | - The user can now choose to use HaplotypeCaller as well as UnifiedGenotyper for SNPs and/or Indels. 21 | - The user can set GATK to output reference-calls. The standard format for these calls is gVCF. The file carrying the reference calls is generated in addition to the standard VCF that carries only the variants. 22 | - The user can now choose to switch off VQSR for either or both of SNP and Indels. 23 | - The user can now choose targeted genotyping by means of using exome target captures as input. 24 | - The failed jobs at the VQSR stage can now be repeated without running the genotyper again by means of a new input parameter "--donegenotyping". 25 | - The pipeline now is optimized for targeted genotyping using GATK's features for exome data processing (in --targeted mode). 26 | - The pipeline is now capable of running VQSR on whole data set OR on chromosomes individually. This is independent of the capability to split the data into chromosomes for faster genotyping (i.e. the user can choose to do the genotyping for individual chromosomes but run VQSR on all the chromosomes collectively.) 
class Job:
    """One job in a generated job-description file.

    ``str(job)`` renders a single ``job_begin`` ... ``job_end`` stanza;
    ``job.desc()`` renders this job together with everything it depends on,
    followed by the ``order A before B`` lines and an optional ``log_dir``
    line.  (Presumably the output is fed to the Simple Job Manager, given the
    module name -- confirm against the caller.)

    The attributes below are class-level defaults: assigning e.g.
    ``Job.queue = "..."`` affects every job that does not override the value
    on the instance.  A value of None means "omit this field".
    """

    time = None
    memory = None
    queue = None
    project = None
    status = None
    log_dir = None
    cmd_prefix = None      # text placed before every command, if set
    cmd_separator = '&&'   # text placed after every command except the last
    name_prefix = None     # prepended to every non-None job name
    sge_options = None

    def __init__(self, name=None):
        """Create a job; a job whose ``name`` is None is never rendered itself."""
        self.name = name
        if self.name_prefix is not None and self.name is not None:
            self.name = self.name_prefix + self.name
        self.status = None
        self.cmds = []
        # Despite the attribute name, these are the jobs that must run BEFORE
        # this one (see order(): "order <dependent> before <self>").
        self.dependents = []

    def __str__(self):
        """Render this job alone as a job_begin/job_end stanza."""
        s = 'job_begin\n'
        for key in ('name', 'time', 'memory', 'queue', 'project', 'status',
                    'sge_options'):
            value = getattr(self, key)
            if value is not None:
                s += '\t%s %s\n' % (key, value)
        if self.cmds:
            separator = '' if self.cmd_separator is None else self.cmd_separator
            prefix = '' if self.cmd_prefix is None else self.cmd_prefix
            lines = ['\t\t%s %s' % (prefix, cmd) for cmd in self.cmds]
            s += '\tcmd_begin\n'
            s += (' %s\n' % separator).join(lines) + '\n'
            s += '\tcmd_end\n'
        s += 'job_end\n'
        return s

    def done(self):
        """Mark the job as already done (rendered as ``status done``)."""
        self.status = 'done'

    def append(self, cmd):
        """Add a command line to the job; returns self for chaining."""
        self.cmds.append(cmd)
        return self

    def depend(self, *jobs):
        """Declare that this job runs after each of ``jobs``.

        None entries are ignored so callers can pass optional jobs directly.
        Returns self for chaining.
        """
        for job in jobs:
            if job is not None:
                self.dependents.append(job)
        return self

    def order(self, history=None):
        """Return the ``order A before B`` lines for the whole dependency graph.

        Args:
            history: list of (before, after) name pairs already emitted; it is
                appended to in place.  When omitted, a fresh list is used for
                this call.  (A shared ``[]`` default used to persist between
                calls, so a second call returned an empty string.)
        """
        if history is None:
            history = []
        s = ''
        for dependent in self.dependents:
            s += dependent.order(history)
            pair = (dependent.name, self.name)
            if self.name is not None and pair not in history:
                s += "order %s before %s\n" % pair
                history.append(pair)
        return s

    def traverse(self, history=None):
        """Return the stanzas of all prerequisite jobs followed by this job's own.

        Each named job is rendered once, after the jobs it depends on.

        Args:
            history: list of jobs already rendered; appended to in place.
                When omitted, a fresh list is used for this call.
        """
        if history is None:
            history = []
        s = ''
        for dependent in self.dependents:
            s += dependent.traverse(history)
        if self.name is not None and self not in history:
            s += str(self)
            history.append(self)
        return s

    def desc(self):
        """Return the full description: job stanzas, ordering lines, then log_dir."""
        s = self.traverse()
        s += self.order()
        if self.log_dir is not None:
            s += 'log_dir %s\n' % self.log_dir
        return s
/srv/gs1/software/gatk/GATKkey/stanford.edu.key" 39 | 40 | echo ">>> Determining (small) suspicious intervals which are likely in need of realignment" 41 | echo ">>> Determining (small) suspicious intervals which are likely in need of realignment" >> $LOGFILE 42 | echo ">>> $command" 43 | echo ">>> $command" >> $LOGFILE 44 | $command 45 | 46 | command="java -Xms8g -Xmx8g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \ 47 | -T IndelRealigner \ 48 | -I $f \ 49 | -R $REF \ 50 | -o $o $optL \ 51 | -targetIntervals ${o/.bam/.intervals} \ 52 | -known $MILLS_1K_GOLD_INDELS \ 53 | -known $GOLD_1K_INDELS \ 54 | -et NO_ET $RELAX\ 55 | -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key" 56 | 57 | echo ">>> Running the realigner over the targeted intervals" 58 | echo ">>> Running the realigner over the targeted intervals" >> $LOGFILE 59 | echo ">>> $command" 60 | echo ">>> $command" >> $LOGFILE 61 | $command 62 | 63 | command="java -Xms8g -Xmx8g -jar $PICARD/FixMateInformation.jar \ 64 | TMP_DIR=$TMP \ 65 | INPUT=$o \ 66 | VALIDATION_STRINGENCY=SILENT \ 67 | SORT_ORDER=coordinate" 68 | 69 | echo ">>> Fixing the mate pairs and order of the realigned reads" 70 | echo ">>> Fixing the mate pairs and order of the realigned reads" >> $LOGFILE 71 | echo ">>> $command" 72 | echo ">>> $command" >> $LOGFILE 73 | $command 74 | 75 | echo "*** Finished realigning targeted regions ***" 76 | -------------------------------------------------------------------------------- /bin/pindel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | echo "*** Calling SV using Split-Read Analysis: $PINDEL ***" 4 | 5 | if [ $# -lt 2 ] 6 | then 7 | echo "Usage: $0 ..." 
8 | exit 1 9 | fi 10 | 11 | 12 | o=`cd \`dirname $1\`; pwd`/`basename $1` 13 | odir=`dirname $1` 14 | shift 15 | p=${o/.gff/} 16 | 17 | rpm_output=${p/.pindel/}.breakdancer.txt 18 | rpm_cfg=${p/.pindel/}.breakdancer.cfg 19 | 20 | chr="ALL" 21 | if [ $# -eq 1 ] 22 | then 23 | if [[ "$1" =~ ".*chr[^.]*\..*" ]] 24 | then 25 | chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'` 26 | fi 27 | fi 28 | 29 | bOpt='' 30 | if [ -e $rpm_output ] 31 | then 32 | bOpt="-b $rpm_output" 33 | fi 34 | 35 | cd $odir 36 | pin_cfg=$p.cfg 37 | for bam in $* 38 | do 39 | isize='' 40 | if [ -e $rpm_cfg ] 41 | then 42 | cfg=`grep $bam $rpm_cfg | head -n 1` 43 | isize=`echo "$cfg" | sed 's/.*mean:\([0-9]*\).*/\1/'` 44 | else 45 | isize=`samtools view -H $bam 2> /dev/null | grep "@RG" | grep "PI" | head -n 1 | sed 's/.*PI:\([0-9]*\).*/\1/'` 46 | fi 47 | if [ -z "$isize" ] 48 | then 49 | isize=300 50 | fi 51 | 52 | sample=`samtools view -H $bam 2> /dev/null | grep "@RG" | grep "SM" | head -n 1| sed 's/.*SM:\([^\t]*\).*/\1/'` 53 | if [ -z "$sample" ] 54 | then 55 | sample="SAMPLE" 56 | fi 57 | 58 | echo -e "$bam\t$isize\t$sample" 59 | done > $pin_cfg 60 | pindel="$PINDEL/pindel -f $REF -i $pin_cfg -o $p -c $chr $bOpt" 61 | echo ">> Running Pindel on $chr" 62 | $pindel 63 | 64 | echo ">> Converting output to GFF" 65 | 66 | minsize=50 67 | 68 | AWKOPT='{feat="Unknown"; if ($2=="D" || $2=="DI") feat="Deletion"; else if ($2=="I" || $2=="LI") feat="Insertion"; else if ($2=="INV") feat="Inversion"; else if ($2=="TD") feat="TandemDup"; start=$7; end=$8; if (feat!="Insertion") {start++; end--;} if ($3>='$minsize') print $5"\tPindel\t"feat"\t"start"\t"end"\t"$24"\t.\t.\tSize "$3"; nr.unique reads: (+"$17",-"$20"); ComScore: "int(sqrt(($17+1)*($20+1)*$24))}' 69 | 70 | #echo -e "#Chr\tProgram\tSV-type\t\tstart\tend\tscore\tstrand\tframe\tattributes" > $o 71 | if [ -e "$o" ] 72 | then 73 | rm $o 74 | fi 75 | 76 | for po in ${p}_[^LBT]* 77 | do 78 | grep "ChrID" $po | sed 's/D \([0-9]*\) I \([0-9]*\)/DI 
\1:\2/' | sed 's/SUM_MS \([0-9]*\) .*/SUM_MS \1/' | sed 's/NT[^ ]*//' | awk "$AWKOPT" 79 | done | sort -k1,1 -k4n -k5n -u >> $o 80 | 81 | #echo ">> Archiving raw output" 82 | #tar --remove-files -zcvf $p.tgz ${p}_* 83 | 84 | echo "*** Finished calling SV using Split-Read Analysis: $PINDEL ***" 85 | -------------------------------------------------------------------------------- /bin/gatk_recal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eu 2 | 3 | echo "*** Recalibrating base quality ***" 4 | echo " " >> $LOGFILE 5 | echo "*** Recalibrating base quality ***" >> $LOGFILE 6 | 7 | if [ $# -lt 2 ] 8 | then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | f=`cd \`dirname $1\`; pwd`/`basename $1` 14 | o=`cd \`dirname $2\`; pwd`/`basename $2` 15 | 16 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \ 17 | -T BaseRecalibrator \ 18 | -I $f \ 19 | -R $REF \ 20 | -o ${o/.bam/.grp} \ 21 | -knownSites $MILLS_1K_GOLD_INDELS \ 22 | -knownSites $GOLD_1K_INDELS \ 23 | -knownSites $DBSNP \ 24 | -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key" 25 | 26 | # -cov ReadGroupCovariate \ 27 | # -cov QualityScoreCovariate \ 28 | # -cov CycleCovariate \ 29 | echo ">>> Counting covariates" 30 | echo ">>> Counting covariates" >> $LOGFILE 31 | echo ">>> $command" 32 | echo ">>> $command" >> $LOGFILE 33 | $command 34 | 35 | # This is added to create BQSR plots: 36 | # http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr 37 | 38 | command="java -Xms5g -Xmx5g -jar $GATK/GenomeAnalysisTK.jar \ 39 | -T BaseRecalibrator \ 40 | -I $f \ 41 | -R $REF \ 42 | -o ${o/.bam/.grp.post} \ 43 | -knownSites $MILLS_1K_GOLD_INDELS \ 44 | -knownSites $GOLD_1K_INDELS \ 45 | -knownSites $DBSNP \ 46 | -BQSR ${o/.bam/.grp} \ 47 | -K /srv/gs1/software/gatk/GATKkey/stanford.edu.key" 48 | 49 | # -cov ReadGroupCovariate \ 50 | # -cov QualityScoreCovariate \ 51 | # -cov CycleCovariate \ 52 | echo ">>> 
#%Module1.0
# HugeSeq Module File
#
## HugeSeq modulefile
##
## Initializes HugeSeq
##

# Pipeline version reported by `module help`; for Tcl script use only.
set hugeseqversion 2.0

proc ModulesHelp { } {
	# The version is set at modulefile level, so it has to be imported into
	# the proc's scope; reading an undeclared name here is a Tcl error.
	global hugeseqversion
	puts stderr "\tInitializes your environment to use the HugeSeq variant detection pipeline\n"
	puts stderr "\tVersion $hugeseqversion\n"
}

module-whatis "Initializes the HugeSeq variant detection pipeline"

# sets the MODULESAPPSDIR env var
module add modsappsdir

# Setting paths and env for HugeSeq
setenv HUGESEQ_HOME ~/HugeSeq
setenv HUGESEQROOT ~/HugeSeq
prepend-path PATH ~/HugeSeq/bin

# Setting data resource directory
set dat_dir ~/Resources/GATK

# Setting paths for programming tools
module load python

# Setting paths and env for alignment tools
module load bwa

# Setting paths and env for variant detection annotation tools
module load gatk
module load cnvnator
module load breakseqlite
module load annovar
module load samtools
module load r

# Setting perl libraries (for BreakDancer)
setenv PERLLIB /srv/gs1/software/hugeseq/hugeseq-2.0/perllib
prepend-path PERL5LIB $::env(PERLLIB)/File-Path-2.08/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/Statistics-Descriptive-2.6/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/GD-2.45/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/GDGraph-1.44/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/GDGraph-histogram-1.1/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/GDTextUtil-0.86/blib/lib
prepend-path PERL5LIB $::env(PERLLIB)/Math-CDF-0.1/blib/lib

# Setting paths and env for utilities
module load breakdancer
module load pindel
module load bedtools
module load tabix
module load vcftools
module load root

# Setting env for supporting databases
setenv BPLIB $dat_dir/bplib/bplib.alt.fa
setenv REF $dat_dir/hg19-3.0/ucsc.hg19.fasta
setenv DIC $dat_dir/hg19-3.0/ucsc.hg19.dict
setenv DBSNP $dat_dir/hg19-3.0/dbsnp_138.hg19.vcf
setenv HAPMAP $dat_dir/hg19-3.0/hapmap_3.3.hg19.vcf
setenv OMNI_1K $dat_dir/hg19-3.0/1000G_omni2.5.hg19.vcf
setenv GOLD_1K_SNPS $dat_dir/hg19-3.0/1000G_phase1.snps.high_confidence.hg19.vcf
setenv MILLS_1K_GOLD_INDELS $dat_dir/hg19-3.0/Mills_and_1000G_gold_standard.indels.hg19.vcf
setenv GOLD_1K_INDELS $dat_dir/hg19-3.0/1000G_phase1.indels.hg19.vcf

module load picard-tools

# Setting paths and env for the Simple Job Manager (SJM)
module load sjm
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Performing VQSR on indels" 4 | echo " " >> $LOGFILE 5 | echo "Performing VQSR on indels" >> $LOGFILE 6 | 7 | set -e 8 | 9 | if [ $# -lt 1 ] 10 | then 11 | echo "Usage: $0 vqsr targeted" 12 | exit 1 13 | fi 14 | 15 | START_VCF=`cd \`dirname $1\`; pwd`/`basename $1` 16 | PREFIX=`dirname $START_VCF` 17 | SUFFIX=`basename $START_VCF` 18 | SAMPLE=${SUFFIX/.gatk.vcf/} 19 | 20 | INDEL_VCF=$PREFIX/$SAMPLE.indel.vcf 21 | INDEL_RECAL_VCF=$PREFIX/$SAMPLE.vqsr.indel.vcf 22 | INDEL_RECAL=$PREFIX/$SAMPLE.tmp.indel.vcf 23 | INDEL_TRANCHES=$PREFIX/$SAMPLE.tranches.gatk.indel.recal.csv 24 | INDEL_RSCRIPT=$PREFIX/$SAMPLE.gatk.recal.indel.R 25 | 26 | touch $INDEL_RECAL_VCF 27 | doVQSR=$2 28 | targeted=$3 29 | 30 | DP="-an DP" 31 | if [ "$targeted" == "True" ] 32 | then 33 | DP="--maxGaussians 4" 34 | fi 35 | 36 | command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \ 37 | -T SelectVariants \ 38 | -R $REF \ 39 | -V $START_VCF \ 40 | -o $INDEL_VCF \ 41 | -selectType INDEL" 42 | 43 | echo ">>> Select variants for indel recalibration" 44 | echo ">>> Select variants for indel recalibration" >> $LOGFILE 45 | echo ">>> $command &> $PREFIX/$SAMPLE.select.indel.log" 46 | echo ">>> $command > $PREFIX/$SAMPLE.select.indel.log" >> $LOGFILE 47 | $command &> $PREFIX/$SAMPLE.select.indel.log 48 | 49 | if [ "$doVQSR" != "True" ] 50 | then 51 | command="cp $INDEL_VCF $INDEL_RECAL_VCF" 52 | echo ">>> Do not perform indel VQSR" 53 | echo ">>> Do not perform indel VQSR" >> $LOGFILE 54 | echo "$command > $PREFIX/$SAMPLE.vqsr.indel.log" >> $LOGFILE 55 | $command &> $PREFIX/$SAMPLE.vqsr.indel.log 56 | exit $? 
#!/bin/bash
#
# vqsr_snp.sh - variant quality score recalibration (VQSR) of SNPs with GATK.
#
# Usage: vqsr_snp.sh <input.gatk.vcf> [doVQSR] [targeted] [useHaplotypeScore] [noop]
#   doVQSR             "True" trains and applies the recalibration; any other
#                      value copies the selected SNPs straight to the output
#   targeted           "True" replaces the DP annotation with --maxGaussians 4
#   useHaplotypeScore  "True" adds the HaplotypeScore annotation
#   noop               "True" exits right after creating an empty output file
#
# Output: <sample>.vqsr.snp.vcf next to the input VCF.
# Reads LOGFILE, GATK, REF, HAPMAP, OMNI_1K, GOLD_1K_SNPS and DBSNP from the
# environment.

echo ">>> Performing VQSR on SNPs"
echo " " >> $LOGFILE
echo ">>> Performing VQSR on SNPs" >> $LOGFILE

set -e

if [ $# -lt 1 ]
then
	echo "Usage: $0 <input.gatk.vcf> [doVQSR] [targeted] [useHaplotypeScore] [noop]"
	exit 1
fi

# Announce a step, record the command line in $LOGFILE, then run it with
# stdout and stderr captured in the given per-step log file.
run_step() {
	local title=$1
	local steplog=$2
	shift 2
	echo ">>> $title"
	echo ">>> $title" >> $LOGFILE
	echo ">>> $* &> $steplog"
	echo ">>> $* &> $steplog" >> $LOGFILE
	"$@" &> $steplog
}

START_VCF=`cd \`dirname $1\`; pwd`/`basename $1`
PREFIX=`dirname $START_VCF`
SUFFIX=`basename $START_VCF`
SAMPLE=${SUFFIX/.gatk.vcf/}

SNP_VCF=$PREFIX/$SAMPLE.snp.vcf
SNP_RECAL_VCF=$PREFIX/$SAMPLE.vqsr.snp.vcf
SNP_RECAL=$PREFIX/$SAMPLE.tmp.snp.vcf
SNP_TRANCHES=$PREFIX/$SAMPLE.tranches.gatk.snp.recal.csv
SNP_RSCRIPT=$PREFIX/$SAMPLE.gatk.recal.snp.R

touch $SNP_RECAL_VCF
doVQSR=$2
targeted=$3
useHaplotypeScore=$4
noop=$5

if [ "$noop" == "True" ]
then
	echo "bye bye"
	exit 0
fi

# The variable must carry the same name where it is set and where it is
# expanded, otherwise the annotation is silently dropped.
HAPLOTYPESCORE=""
if [ "$useHaplotypeScore" == "True" ]
then
	HAPLOTYPESCORE="-an HaplotypeScore"
fi

DP="-an DP"
if [ "$targeted" == "True" ]
then
	DP="--maxGaussians 4"
fi

# Select the SNPs first: both the VQSR path and the plain-copy path below
# read $SNP_VCF (same ordering as vqsr_indel.sh).
# $command is expanded unquoted on purpose so it splits into arguments.
command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
	-T SelectVariants \
	-R $REF \
	-V $START_VCF \
	-o $SNP_VCF \
	-selectType SNP"
run_step "Select variants for SNP recalibration" $PREFIX/$SAMPLE.select.snp.log $command

if [ "$doVQSR" != "True" ]
then
	run_step "Do not perform SNP VQSR" $PREFIX/$SAMPLE.vqsr.snp.log cp $SNP_VCF $SNP_RECAL_VCF
	exit 0
fi

command="java -Xmx6g -Xms6g -jar $GATK/GenomeAnalysisTK.jar \
	-T VariantRecalibrator \
	-R $REF \
	-input $SNP_VCF \
	-resource:hapmap,known=false,training=true,truth=true,prior=15.0 $HAPMAP \
	-resource:omni,known=false,training=true,truth=true,prior=12.0 $OMNI_1K \
	-resource:1000G,known=false,training=true,truth=false,prior=10.0 $GOLD_1K_SNPS \
	-resource:dbsnp,known=true,training=false,truth=false,prior=2.0 $DBSNP \
	-an QD \
	-an FS $DP \
	-an MQRankSum \
	-an ReadPosRankSum $HAPLOTYPESCORE \
	-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
	-mode SNP \
	-recalFile $SNP_RECAL \
	-tranchesFile $SNP_TRANCHES \
	-nt 4 \
	--minNumBadVariants 5000 \
	-rscriptFile $SNP_RSCRIPT"
run_step "Train recalibrator for SNPs" $PREFIX/$SAMPLE.recalibrate.snp.log $command

command="java -Xmx3g -Xms3g -jar $GATK/GenomeAnalysisTK.jar \
	-T ApplyRecalibration \
	-R $REF \
	-input $SNP_VCF \
	--ts_filter_level 99.0 \
	-tranchesFile $SNP_TRANCHES \
	-recalFile $SNP_RECAL \
	-o $SNP_RECAL_VCF \
	--mode SNP"
run_step "Apply recalibration to SNPs" $PREFIX/$SAMPLE.apply.snp.log $command

echo ">>> Finished performing VQSR on SNPs"
def makeDict(filename, chrCol, startCol, endCol, valueCols, isGFF=False):
    """Index a tab-separated file by its (chromosome, start, end) columns.

    filename  -- path of the tab-separated file to read
    chrCol, startCol, endCol -- 0-based column indexes that form the key
    valueCols -- 0-based column indexes whose values are collected
    isGFF     -- when true, keep only the text after the last '=' of the
                 last ';'-separated attribute of each collected value

    Returns a dict mapping each (chr, start, end) tuple of strings to a list
    with one inner list per entry of valueCols; every inner list holds the
    values of that column, in file order, for all rows sharing the key.
    Lines are split on tabs without stripping, so a value taken from the
    last column keeps its trailing newline.
    """
    dic = {}
    # The context manager closes the handle even if a short row raises.
    with open(filename, 'r') as handle:
        for line in handle:
            cols = line.split('\t')
            key = (cols[chrCol], cols[startCol], cols[endCol])
            if key not in dic:
                dic[key] = [[] for _ in valueCols]
            for bucket, valueCol in zip(dic[key], valueCols):
                value = cols[valueCol]
                if isGFF:
                    value = value.split(";")[-1].split("=")[-1]
                bucket.append(value)
    return dic
#!/bin/bash -eu
#
# gatk_vc.sh - SNV/indel discovery and genotyping with GATK.
#
# Usage: gatk_vc.sh output capture reference_calls snp_hapcaller indel_hapcaller bam ...
#   output           final VCF; intermediate files are written next to it
#   capture          "False", or a bracketed/comma-separated list of interval
#                    files, each passed to GATK as its own -L option
#   reference_calls  "True" to also emit non-variant sites
#   snp_hapcaller    "True" takes SNPs from HaplotypeCaller, otherwise from
#                    UnifiedGenotyper
#   indel_hapcaller  "True" takes indels from HaplotypeCaller, otherwise from
#                    UnifiedGenotyper
#
# Reads LOGFILE, TMP, GATK, REF and DBSNP from the environment.

echo "*** Performing SNV discovery and genotyping using GATK ***"
echo " " >> $LOGFILE
echo "*** Performing SNV discovery and genotyping using GATK ***" >> $LOGFILE

# Five positional settings plus at least one BAM are consumed below.
if [ $# -lt 6 ]
then
	echo "Usage: $0 output capture reference_calls snp_hapcaller indel_hapcaller bam ..."
	exit 1
fi

# Announce a step on stdout and in $LOGFILE, echo the command line to both,
# then run it with stdout and stderr appended to $log.
run_step() {
	local title=$1
	shift
	echo ">>> $title"
	echo ">>> $title" >> $LOGFILE
	echo ">>> $*"
	echo ">>> $*" >> $LOGFILE
	"$@" &>> $log
}

o=`cd \`dirname $1\`; pwd`/`basename $1`
shift
output_hc=$o.hc.vcf
output_gc=$o.gc.vcf

snp_vcf=$o.snp
indel_vcf=$o.indel

capture=$1
shift
echo "$capture"

reference_calls=$1
shift

snp_hap=$1
shift
indel_hap=$1
shift

# A single BAM whose name contains "chr<name>." restricts calling to that
# chromosome.
optL=''
if [ $# -eq 1 ]
then
	echo $1
	if [[ "$1" =~ .*chr[^\.]*\..* ]]
	then
		chr=`echo "$1" | sed 's/.*\(chr[^\.]*\)\..*/\1/'`
		optL="-L $chr"
	fi
fi

f=''
for i in $*
do
	f="$f -I `cd \`dirname $i\`; pwd`/`basename $i`"
done

log=${o/gatk.vcf/}vc.log

# HaplotypeCaller runs when either variant class is taken from it;
# UnifiedGenotyper runs when either variant class is NOT taken from HC.
run_hc="False"
run_gc="False"
if [[ "$snp_hap" == "True" ]] || [[ "$indel_hap" == "True" ]]
then
	run_hc="True"
fi

if [[ "$snp_hap" != "True" ]] || [[ "$indel_hap" != "True" ]]
then
	run_gc="True"
fi

NO_VARIATION=""
OUTPUT_MODE_UG=""
OUTPUT_MODE_HC=""
if [[ "$reference_calls" == "True" ]]
then
	OUTPUT_MODE_UG="--output_mode emit_all_sites"
	OUTPUT_MODE_HC="-ERC BP_RESOLUTION"
	# Spelled like the other -selectType options of the SNP selection below.
	NO_VARIATION="-selectType NO_VARIATION"
fi

CAPTURE=""
if [[ "$capture" != "False" ]]
then
	# Turn the list separators "[", "]" and "," into blanks so every interval
	# file becomes its own -L option. $capture is quoted because its brackets
	# are glob characters.
	for capt in $(echo "$capture" | tr '[' ' ' | tr ']' ' ' | tr ',' ' ')
	do
		CAPTURE="$CAPTURE -L $capt"
	done
fi

# The command strings are expanded unquoted on purpose so they split into
# arguments.
gc_command="java -Xmx8g -Xms8g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R $REF \
	$f \
	--dbsnp $DBSNP \
	-o $output_gc $optL $CAPTURE \
	-stand_call_conf 20.0 \
	-stand_emit_conf 10.0 \
	-gt_mode DISCOVERY $OUTPUT_MODE_UG \
	--genotype_likelihoods_model BOTH \
	-nct 4"

hc_command="java -Xmx12g -Xms12g -Djava.io.tmpdir=$TMP -jar $GATK/GenomeAnalysisTK.jar \
	-T HaplotypeCaller \
	-R $REF \
	$f \
	--dbsnp $DBSNP \
	-o $output_hc $optL $CAPTURE \
	-stand_call_conf 20.0 \
	-stand_emit_conf 10.0 \
	--genotyping_mode DISCOVERY $OUTPUT_MODE_HC \
	-nct 4"

# https://gatkforums.broadinstitute.org/discussion/3115/emit-all-sites-in-haplotypecaller
# http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_haplotypecaller_HaplotypeCaller.html

if [[ "$run_gc" == "True" ]]
then
	run_step "Running the GATK's UnifiedGenotyper" $gc_command
fi

if [[ "$run_hc" == "True" ]]
then
	run_step "Running the GATK's Haplotyper" $hc_command
fi

# Pick the caller output each variant class is read from.
if [[ "$snp_hap" == "True" ]]
then
	select_snp_from=$output_hc
else
	select_snp_from=$output_gc
fi

if [[ "$indel_hap" == "True" ]]
then
	select_indel_from=$output_hc
else
	select_indel_from=$output_gc
fi

if [[ "$run_hc" != "True" ]]
then
	# Nothing comes from HaplotypeCaller: the UG output is the final VCF.
	run_step "Choosing both SNP and INDEL from UG output: $select_snp_from" mv $output_gc $o
elif [[ "$run_gc" != "True" ]]
then
	# Everything comes from HaplotypeCaller: the HC output is the final VCF.
	run_step "Choosing both SNP and INDEL from HC output: $select_snp_from" mv $output_hc $o
else
	# Mixed mode: take each variant class from its caller, then merge.
	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
		-T SelectVariants \
		-R $REF \
		-V $select_snp_from \
		-o $snp_vcf \
		-selectType MNP \
		-selectType SNP $NO_VARIATION"
	run_step "Selecting SNPs from $select_snp_from" $command

	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
		-T SelectVariants \
		-R $REF \
		-V $select_indel_from \
		-o $indel_vcf \
		-selectType INDEL"
	run_step "Selecting Indels from $select_indel_from" $command

	command="java -Xmx8g -Xms8g -jar $GATK/GenomeAnalysisTK.jar \
		-R $REF \
		-T CombineVariants \
		--variant $snp_vcf \
		--variant $indel_vcf \
		-o $o"
	run_step "Combining SNP and Indel VCFs" $command
fi

echo "*** Finished SNV discovery and genotyping using GATK ***"
59 | 60 | For batch systems other than SGE, it requires developing an adaptor in SJM. Please write to us for more details. 61 | 62 | Modules Environment 63 | 64 | To manage different versions of softwares and parameters in the modules, HugeSeq uses a Unix software package called Environment Modules, which provides for the dynamic modification of a user's environment via modulefiles. 65 | 66 | To initiate Modules, modify your login profile such as .bash_profile to add the following: 67 | 68 | . /path-to-Modules/default/init/sh 69 | 70 | Supporting Tools 71 | 72 | Install the required softwares, such as the aligners, variant callers and manipulation tools, defined in the software requirements section. For details, please refer to the individual software websites. The softwares are recommended to be installed separately under a single parent directory, such as ~/apps/BreakSeq and ~/apps/CNVnator. 73 | 74 | Data Sets 75 | 76 | HugeSeq depends on several public data sets for alignment, variant calling, and annotations. They are: 77 | The reference genome (e.g. HG19 in FASTA format: hg19.fa) 78 | The BWA index of the reference genome (e.g. hg19.fa.bwt, hg19.fa.ann, etc) 79 | A .dict dictionary of the contig names and sizes (e.g. hg19.fa.dict) 80 | A .fai fasta index file (e.g. hg19.fa.fai) 81 | For creating .dict and .fai, please see here. All the indexes and dictionary should reside in the reference genome directory which contains the whole genome FASTA (e.g. hg19.fa) 82 | The breakpoint junctions (i.e. BreakSeq library in FASTA format: bplib.fa) 83 | The SNP annotation 84 | UCSC Known Genes (knownGene) 85 | dbSNP 86 | SIFT (avsift) 87 | RepeatMasker (buildver_rmsk.gff) 88 | The STANOVAR application should be installed and corresponding module need to be defined. 89 | 90 | 91 | Download HugeSeq to your server. 92 | 93 | Extract the programs from the compressed archive to a directory, such as ~/app. 
A directory like ~/app/HugeSeq will then be created, which contains the core program and its configuration. As described above, HugeSeq uses the Environment Modules package for configuration. Its modulefile is in the directory /path-to-HugeSeq/modulefiles/hugeseq named with its version, such as 2.0. To enable Modules to look up the modulefile for correct setting, modify the login profile as above and add the following: 94 | 95 | export MODULEPATH=/path-to-HugeSeq/modulefiles:$MODULEPATH 96 | 97 | In addition, modify the module file, such as /path-to-HugeSeq/modulefiles/hugeseq/2.0, and change all the programs' paths to the locations where you installed the required programs and the data paths to where you stored the datasets. 98 | 99 | Log out and log in again to your shell to activate the login profile with the latest configuration. You should now be able to run HugeSeq by loading its module: 100 | 101 | > module load hugeseq/2.0 102 | 103 | After loading the module, you can run HugeSeq simply by typing: 104 | 105 | > hugeseq 106 | 107 | For the usage of HugeSeq, please refer to the Usage section. 108 | 109 | -- USAGE 110 | 111 | usage: hugeseq [-h] --reads1 FILE [FILE ...] 
[--reads2 FILE [FILE ...]] 112 | --output DIR [--account STR] [--tmp DIR] [--readgroup STR] 113 | [--samplename STR] [--bam] [--variants TYPE [TYPE ...]] 114 | [--targeted] [--capture FILE [FILE ...]] [--relax_realignment] 115 | [--reference_calls] [--snp_hapcaller] [--indel_hapcaller] 116 | [--nosnpvqsr] [--noindelvqsr] [--vqsrchrom] [--nobinning] 117 | [--nocleanup] [--novariant] [--alignmentonly] [--cleanuponly] 118 | [--variantonly] [--donealign] [--donebinning] [--donecleanup] 119 | [--donegenotyping] [--donesnpvqsr] [--memory SIZE] 120 | [--queue NAME] [--email NAME] [--threads COUNT] 121 | [--jobfile FILE] [--submit] 122 | 123 | Generating the job file for the HugeSeq variant detection pipeline 124 | 125 | optional arguments: 126 | -h, --help show this help message and exit 127 | --reads1 FILE [FILE ...] 128 | The FASTQ file(s) for reads 1 129 | --reads2 FILE [FILE ...] 130 | The FASTQ file(s) for reads 2, if paired-end 131 | --output DIR The output directory 132 | --account STR Accounting string for the purpose of cluster 133 | accounting. 134 | --tmp DIR The TMP directory for storing intermediate files 135 | (default=output directory) 136 | --readgroup STR The read group annotation (Default: 137 | @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE) 138 | --samplename STR The SM tag in the read group annotation (Default: 139 | "SAMPLE" in 140 | @RG\tID:Default\tLB:Library\tPL:Illumina\tSM:SAMPLE) 141 | --bam Support for aligned BAMs as input. By default input 142 | (-r) is aligned again. Use --variantonly otherwise. 143 | --variants TYPE [TYPE ...] 144 | gatk breakdancer cnvnator pindel breakseq (default to 145 | all) 146 | --targeted Use GATK in targeted sequencing mode (default: whole- 147 | genome mode) 148 | --capture FILE [FILE ...] 149 | Capture BED file(s) used for targeted genotyping 150 | (default: void, separate multiple files with commas: 151 | capture1.bed,capture2.bed,...) 
152 | --relax_realignment Relaxes GATK's realignment when dealing with badly 153 | scored reads (default: false) 154 | --reference_calls Store all reference calls from GATK (default: false) 155 | in gVCF format in addition to a standard VCF file 156 | containing only the variants (valid only for SNV 157 | calling) 158 | --snp_hapcaller Use GATK HaplotypeCaller to discover SNPs (default: 159 | UnifiedGenotyper) 160 | --indel_hapcaller Use GATK HaplotypeCaller to discover Indels (default: 161 | UnifiedGenotyper) 162 | --nosnpvqsr Do not perform VQSR on SNPs (variant quality score 163 | recalibration) 164 | --noindelvqsr Do not perform VQSR on Indels (variant quality score 165 | recalibration) 166 | --vqsrchrom Perform VQSR on individual chromosomes (valid when 167 | binning performed; default: VQSR on whole genome VCF) 168 | --nobinning Do not bin the alignments by chromosomes 169 | --nocleanup Do not clean up the alignments 170 | --novariant Do not call variants 171 | --alignmentonly Only align input FASTQ or BAM files (-r) 172 | --cleanuponly Only clean up input BAM files (-r) 173 | --variantonly Only call variants in input BAM files (-r) 174 | --donealign Sequences already aligned using the pipeline 175 | --donebinning Alignments already binned by chromosomes using the 176 | pipeline 177 | --donecleanup Alignments already cleaned using the pipeline 178 | --donegenotyping Variants already called using the pipeline but VQSR is 179 | not 180 | --donesnpvqsr Processing is started after SNP VQSR (from Indel VQSR) 181 | --memory SIZE Memory size (GB) per job (default: 12) 182 | --queue NAME Queue for jobs (default: extended) 183 | --email NAME Email address to receive emails for ending or aborting 184 | last jobs in the queue 185 | --threads COUNT Number of threads for alignment, only works for SGE 186 | (default: 4) 187 | --jobfile FILE The jobfile name (default: stdout) 188 | --submit Submit the jobs 189 | 
#!/bin/env python
#
# hugeseq - command-line front end of the HugeSeq variant detection pipeline.
# This part of the script checks that the HugeSeq module environment is
# loaded, parses the command line, and prepares the output/log/tmp
# directories and the capture setting used further down.

import sys, os, re, argparse, subprocess, os.path
import dircache
from sjm import *
from util import *
from os import listdir
from os.path import isfile, join, splitext

# Both variables are exported by the hugeseq modulefile; a missing one means
# the module has not been loaded.
try:
    home = os.environ['HUGESEQ_HOME']
    refi = os.environ['REF'] + ".fai"
except KeyError:
    sys.stderr.write("Error in initializing HugeSeq. Module HugeSeq probably is not loaded.\n")
    sys.exit(1)

parser = argparse.ArgumentParser(description='Generating the job file for the HugeSeq variant detection pipeline')
parser.add_argument('--reads1', metavar='FILE', nargs="+", required=True, help='The FASTQ file(s) for reads 1')
parser.add_argument('--reads2', metavar='FILE', nargs="+", help='The FASTQ file(s) for reads 2, if paired-end')
parser.add_argument('--output', metavar='DIR', required=True, help='The output directory')
parser.add_argument('--account', metavar='STR', help='Accounting string for the purpose of cluster accounting.')
parser.add_argument('--tmp', metavar='DIR', help='The TMP directory for storing intermediate files (default=output directory)')
parser.add_argument('--readgroup', metavar='STR', default="@RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE", help='The read group annotation (Default: @RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE)')
parser.add_argument('--samplename', metavar='STR', help='The SM tag in the read group annotation (Default: "SAMPLE" in @RG\\tID:Default\\tLB:Library\\tPL:Illumina\\tSM:SAMPLE)')
parser.add_argument('--bam', action='store_true', help='Support for aligned BAMs as input. By default input (-r) is aligned again. Use --variantonly otherwise.')
parser.add_argument('--variants', metavar='TYPE', nargs="+", help='gatk breakdancer cnvnator pindel breakseq (default to all)')
parser.add_argument('--targeted', action='store_true', help='Use GATK in targeted sequencing mode (default: whole-genome mode)')
parser.add_argument('--capture', metavar='FILE', nargs="+", help='Capture BED file(s) used for targeted genotyping (default: void, separate multiple files with commas: capture1.bed,capture2.bed,...)')
parser.add_argument('--relax_realignment', action='store_true', help='Relaxes GATKs realignment when dealing with badly scored reads (default: false)')
parser.add_argument('--reference_calls', action='store_true', help='Store all reference calls from GATK (default: false) in gVCF format in addition to a standard VCF file containing only the variants (valid only for SNV calling)')
parser.add_argument('--snp_hapcaller', action='store_true', help='Use GATK HaplotypeCaller to discover SNPs (default: UnifiedGenotyper)')
parser.add_argument('--indel_hapcaller', action='store_true', help='Use GATK HaplotypeCaller to discover Indels (default: UnifiedGenotyper)')
parser.add_argument('--nosnpvqsr', action='store_true', help='Do not perform VQSR on SNPs (variant quality score recalibration)')
parser.add_argument('--noindelvqsr', action='store_true', help='Do not perform VQSR on Indels (variant quality score recalibration)')
parser.add_argument('--vqsrchrom', action='store_true', help='Perform VQSR on individual chromosomes (valid when binning performed; default: VQSR on whole genome VCF)')
parser.add_argument('--nobinning', action='store_true', help='Do not bin the alignments by chromosomes')
parser.add_argument('--nocleanup', action='store_true', help='Do not clean up the alignments')
parser.add_argument('--novariant', action='store_true', help='Do not call variants')
parser.add_argument('--alignmentonly', action='store_true', help='Only align input FASTQ or BAM files (-r)')
parser.add_argument('--cleanuponly', action='store_true', help='Only clean up input BAM files (-r)')
parser.add_argument('--variantonly', action='store_true', help='Only call variants in input BAM files (-r)')
parser.add_argument('--donealign', action='store_true', help='Sequences already aligned using the pipeline')
parser.add_argument('--donebinning', action='store_true', help='Alignments already binned by chromosomes using the pipeline')
parser.add_argument('--donecleanup', action='store_true', help='Alignments already cleaned using the pipeline')
parser.add_argument('--donegenotyping', action='store_true', help='Variants already called using the pipeline but VQSR is not')
parser.add_argument('--donesnpvqsr', action='store_true', help='Processing is started after SNP VQSR (from Indel VQSR)')
parser.add_argument('--memory', metavar='SIZE', type=int, default=12, help='Memory size (GB) per job (default: 12)')
parser.add_argument('--queue', metavar='NAME', default="extended", help='Queue for jobs (default: extended)')
# NOTE(review): the default recipient is one person's address, so every run
# without --email notifies them; kept as-is to preserve the CLI defaults.
parser.add_argument('--email', metavar='NAME', default="aminzia@stanford.edu", help='Email address to receive emails for ending or aborting last jobs in the queue')
parser.add_argument('--threads', metavar='COUNT', type=int, default=4, help='Number of threads for alignment, only works for SGE (default: 4)')
parser.add_argument('--jobfile', metavar='FILE', help='The jobfile name (default: stdout)')
parser.add_argument('--submit', action='store_true', help='Submit the jobs')
args = parser.parse_args()

outdir = Dir(args.output)
logdir = Dir(outdir, 'log')

outdir.mkdirs()
logdir.mkdirs()

# Intermediate files go to the output directory unless --tmp names another.
tmpdir = outdir
if args.tmp is not None:
    tmpdir = Dir(args.tmp)
    tmpdir.mkdirs()

# The string "False" marks "no capture files"; otherwise the parsed list of
# capture files is kept as given.
capture = "False" if args.capture is None else args.capture

id=re.match(r'(?:.+\\t)?ID:([^\\]+)', args.readgroup) 73 | id=id.group(1) 74 | lb=re.match(r'(?:.+\\t)?LB:([^\\]+)', args.readgroup) 75 | lb=lb.group(1) 76 | pl=re.match(r'(?:.+\\t)?PL:([^\\]+)', args.readgroup) 77 | pl=pl.group(1) 78 | sample=re.match(r'(?:.+\\t)?SM:([^\\]+)', args.readgroup) 79 | sample=sample.group(1) 80 | 81 | if args.samplename is not None: 82 | sample = args.samplename 83 | readgroup="@RG\\tID:"+id+"\\tLB:"+lb+"\\tPL:"+pl+"\\tSM:"+sample 84 | 85 | Job.name_prefix=sample+"." 86 | Job.memory="%sG"%args.memory 87 | Job.queue=args.queue 88 | Job.cmd_prefix=os.path.join(home,'bin','hugeseq_mod.sh') 89 | 90 | if args.jobfile is None and not args.submit: 91 | jobfile=None 92 | else: 93 | if args.jobfile is None: 94 | jobfile=File(outdir, "job") 95 | else: 96 | jobfile=File(args.jobfile) 97 | 98 | logfile = jobfile.appext("commands.log") 99 | open(logfile.path, "w") 100 | 101 | tmpdir = getattr(__builtins__, 'str')(tmpdir) 102 | logfile = getattr(__builtins__, 'str')(logfile) 103 | 104 | Job.cmd_prefix = Job.cmd_prefix + ' ' + tmpdir + ' ' + logfile 105 | Job.log_dir=logdir.path 106 | 107 | def prep(readfiles, ext): 108 | jobs=[] 109 | if readfiles is not None: 110 | sys.stderr.write(">>> Pre-processing <<<\n") 111 | for f in readfiles: 112 | input = File(f) 113 | in_index = File(f+".bai") 114 | if (ext==".recal.bam"): 115 | outfile = File(outdir, input.prefix+ext) 116 | out_index = File(outdir, input.prefix+ext+".bai") 117 | job = Job('prep_reads_bam-%s'%input.prefix) 118 | else: 119 | outfile = File(outdir, input.name) 120 | job = Job('prep_reads-%s'%input.prefix) 121 | job.append('echo "Input preparation performed locally"') 122 | p = subprocess.Popen('prep.sh %s %s'%(input, outfile), shell=True, stdout=subprocess.PIPE) 123 | rc = p.wait() 124 | if rc > 0: 125 | raise Exception, "Error in preparing input. 
Return code: %s"%rc 126 | for l in p.stdout: 127 | sys.stderr.write(l) 128 | if (ext==".recal.bam"): 129 | p = subprocess.Popen('prep.sh %s %s'%(in_index, out_index), shell=True, stdout=subprocess.PIPE) 130 | rc = p.wait() 131 | if rc > 0: 132 | raise Exception, "Error in preparing input. Return code: %s"%rc 133 | for l in p.stdout: 134 | sys.stderr.write(l) 135 | job.output = outfile 136 | job.memory = "100K" 137 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 138 | job.status = "done" 139 | jobs.append(job) 140 | return jobs 141 | 142 | def align(readjobs1, readjobs2, ext): 143 | jobs=[] 144 | for i in range(0, len(readjobs1)): 145 | paired = False 146 | if (readjobs2 is not None and i 1: 185 | job.sge_options="-pe shm %s -l h_stack=100M -l h_rt=120:00:00 -A %s"%(args.threads, args.account) 186 | elif (readjob1 is not None and readjob2 is not None): 187 | readfile1=File(readjob1.output) 188 | readfile2=File(readjob2.output) 189 | job = Job('bwa-%s' % readfile1.prefix) 190 | job.memory="%sG"%(args.memory/args.threads) 191 | job.append('bwa_fq.sh %s %s %s \"%s\"'%(readfile1,readfile2,args.threads,readgroup)) 192 | job.depend(readjob1).depend(readjob2) 193 | if args.threads > 1: 194 | job.sge_options="-pe shm %s -l h_stack=100M -l h_rt=120:00:00 -A %s"%(args.threads, args.account) 195 | 196 | return job 197 | 198 | def cleanup(pjobs, ext): 199 | jobs=[] 200 | for pjob in pjobs: 201 | bam=pjob.output 202 | if (ext!=".recal.bam"): 203 | job1=__cleanup('picard_nodup-%s'%bam.prefix, 'picard_nodup.sh', bam, bam.chext("nodup.bam"), False) 204 | job2=__cleanup('gatk_realn-%s'%bam.prefix, 'gatk_realn.sh', job1.output, bam.chext("realn.bam"), args.relax_realignment) 205 | job3=__cleanup('gatk_recal-%s'%bam.prefix, 'gatk_recal.sh', job2.output, bam.chext("recal.bam"), False) 206 | else: 207 | job1=__cleanup('picard_nodup-%s'%bam.prefix, 'picard_nodup.sh', bam, bam, False) 208 | job2=__cleanup('gatk_realn-%s'%bam.prefix, 'gatk_realn.sh', job1.output, bam, 
args.relax_realignment) 209 | job3=__cleanup('gatk_recal-%s'%bam.prefix, 'gatk_recal.sh', job2.output, bam, False) 210 | job1.depend(pjob) 211 | job2.depend(job1) 212 | job3.depend(job2) 213 | jobs.append(job3) 214 | return jobs 215 | 216 | def __cleanup(jname, cmd, input, output, remove): 217 | job=Job(jname) 218 | job.memory = "24G" 219 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 220 | job.append('%s %s %s %s'%(cmd, input, output, remove)) 221 | job.append('samtools_index.sh %s' % output) 222 | job.output=output 223 | return job 224 | 225 | def binning(pjobs, fai): 226 | jobs=[] 227 | chrs=[] 228 | for l in open(fai): 229 | m=re.match(r"(chr..|chr.)\t", l) 230 | if m: 231 | chrs.append(m.group(1)) 232 | 233 | for chr in chrs: 234 | chrBam=File(outdir, chr+".bam") 235 | job = Job('bin_aln-%s'%chr) 236 | job.memory = "3G" 237 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 238 | job.output = chrBam 239 | job.append('bin_bam.sh %s %s %s'%(chr, chrBam, " ".join([pjob.output.path for pjob in pjobs]))) 240 | job.append('samtools_index.sh %s'%chrBam) 241 | job.depend(*pjobs) 242 | jobs.append(job) 243 | return jobs 244 | 245 | def callvars(pjobs, combine, variants): 246 | jobs=([],[]) 247 | if len(pjobs)>0: 248 | if not combine: 249 | for pjob in pjobs: 250 | __callvars(jobs, pjob.output.prefix, pjob.output.absprefix, [pjob.output.path], [pjob], variants) 251 | else: 252 | __callvars(jobs, sample, File(outdir.path, sample).path, [pjob.output.path for pjob in pjobs], pjobs, variants) 253 | return jobs 254 | 255 | def __callvars(jobs, idprefix, output, inputs, pjobs, variants): 256 | input=" ".join(inputs) 257 | output="".join(output.split(".recal")) 258 | jobs1=jobs[0] 259 | 260 | if (variants is None or "gatk" in variants): 261 | job0=Job('gatk_vc-%s'%idprefix) 262 | job0.memory = "16G" 263 | job0.sge_options="-l h_rt=120:00:00 -A %s"%args.account 264 | job0.output=File(output+".gatk.vcf") 265 | job0.append('gatk_vc.sh %s %s %s %s %s 
%s'%(job0.output, capture, args.reference_calls, args.snp_hapcaller, args.indel_hapcaller, input)) 266 | job0.depend(*pjobs) 267 | if args.donegenotyping: 268 | job0.status="done" 269 | 270 | if (not args.vqsrchrom): 271 | jobs1.append(job0) 272 | else: 273 | job1=Job('vqsr_snp-%s'%idprefix) 274 | job1.memory = "16G" 275 | job1.sge_options="-l h_rt=120:00:00 -A %s"%args.account 276 | job1.output=File(job0.output) 277 | job1.append('vqsr_snp.sh %s %s %s %s %s'%(job1.output, not args.nosnpvqsr, args.targeted, not args.snp_hapcaller, args.donesnpvqsr)) 278 | job1.depend(job0) 279 | 280 | job2=Job('vqsr_indel-%s'%idprefix) 281 | job2.memory = "16G" 282 | job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account 283 | job2.output=File(job0.output) 284 | job2.append('vqsr_indel.sh %s %s %s'%(job2.output, not args.noindelvqsr, args.targeted)) 285 | job2.depend(job1) 286 | 287 | job3=Job('combine_vqsr-%s'%idprefix) 288 | job3.memory = "12G" 289 | job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account 290 | job3.output=File(output+".snv.vcf") 291 | 292 | if args.reference_calls: 293 | job3.append('combine_vcf.sh %s %s %s %s %s'%(output+".snv.vcf", False, False, output+".vqsr.snp.vcf", output+".vqsr.indel.vcf")) 294 | job3.append('write_refcalls.sh %s'%(job1.output)) 295 | job3.append('combine_vcf.sh %s %s %s %s %s %s'%(output+".snv.refcalls.vcf", False, True, output+"refcalls.vcf", output+".vqsr.snp.vcf", output+".vqsr.indel.vcf")) 296 | else: 297 | job3.append('combine_vcf.sh %s %s %s %s %s'%(output+".snv.vcf", False, False, output+".vqsr.snp.vcf", output+".vqsr.indel.vcf")) 298 | job3.depend(job2) 299 | 300 | jobs1.append(job3) 301 | 302 | jobs2=jobs[1] 303 | job=None 304 | if (variants is None or "breakdancer" in variants): 305 | job=Job('breakdancer-%s'%idprefix) 306 | job.memory = "24G" 307 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 308 | job.output=File(output+".breakdancer.gff") 309 | jobs2.append(job.append('breakdancer.sh %s 
%s'%(job.output,input)).depend(*pjobs)) 310 | if (variants is None or "pindel" in variants): 311 | rpmJob=job 312 | job=Job('pindel-%s'%idprefix) 313 | job.memory = "24G" 314 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 315 | job.output=File(output+".pindel.gff") 316 | jobs2.append(job.append('pindel.sh %s %s'%(job.output,input)).depend(*pjobs if rpmJob is None else [rpmJob])) 317 | if (variants is None or "cnvnator" in variants): 318 | job=Job('cnvnator-%s'%idprefix) 319 | job.memory = "24G" 320 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 321 | job.output=File(output+".cnvnator.gff") 322 | jobs2.append(job.append('cnvnator.sh %s %s'%(job.output,input)).depend(*pjobs)) 323 | if (variants is None or "breakseq" in variants): 324 | job=Job('breakseq-%s'%idprefix) 325 | job.memory = "24G" 326 | job.sge_options="-l h_rt=120:00:00 -A %s"%args.account 327 | job.output=File(output+".breakseq.gff") 328 | jobs2.append(job.append('breakseq.sh %s %s'%(job.output,input)).depend(*pjobs)) 329 | 330 | def group_output_by_suffix(suffixes, jobs): 331 | groups={} 332 | groups[suffixes]=[] 333 | for i in jobs: 334 | if i.output.path.endswith(suffixes): 335 | groups[suffixes].append(i.output.path) 336 | return groups 337 | 338 | def group_output_bams_by_suffix(suffixes, bams, jobs): 339 | groups={} 340 | groups[suffixes]=[] 341 | for i in jobs: 342 | if i.output.path.endswith(suffixes): 343 | out = str(i.output.path) 344 | out=out.replace(suffixes, bams) 345 | groups[suffixes].append(out) 346 | return groups 347 | 348 | 349 | def merge_annotate(siJobs, svJobs, variants): 350 | jobs=[] 351 | 352 | keys=".gatk.vcf" 353 | if (args.vqsrchrom): 354 | keys=".snv.vcf" 355 | siCombinedVCFs=group_output_by_suffix(keys, siJobs) 356 | 357 | if variants is None or "gatk" in variants: 358 | job1=Job('concat-vcf-%s'%sample) 359 | for i in siCombinedVCFs.keys(): 360 | 361 | if (not args.vqsrchrom): 362 | if args.nobinning: 363 | job1.append('combine_vcf.sh %s %s %s 
%s'%(File(outdir.path, sample+".gatk.vcf"), True, False, " ".join(siCombinedVCFs[i]))) 364 | else: 365 | job1.append('combine_vcf.sh %s %s %s %s'%(File(outdir.path, sample+".gatk.vcf"), False, False, " ".join(siCombinedVCFs[i]))) 366 | else: 367 | job1.append('combine_vcf.sh %s %s %s %s'%(File(outdir.path, "genome.recal.vcf"), False, False, " ".join(siCombinedVCFs[i]))) 368 | 369 | if (not args.vqsrchrom): 370 | job1.memory = "16G" 371 | job1.sge_options="-l h_rt=120:00:00 -A %s"%args.account 372 | job1.output=File(outdir.path, sample+".gatk.vcf") 373 | job1.depend(*siJobs) 374 | if args.donegenotyping: 375 | job1.status="done" 376 | 377 | job2=Job('vqsr_snp-%s'%sample) 378 | job2.memory = "16G" 379 | job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account 380 | job2.output=File(job1.output) 381 | job2.append('vqsr_snp.sh %s %s %s %s %s'%(job2.output, not args.nosnpvqsr, args.targeted, not args.snp_hapcaller, args.donesnpvqsr)) 382 | job2.depend(job1) 383 | 384 | job3=Job('vqsr_indel-%s'%sample) 385 | job3.memory = "16G" 386 | job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account 387 | job3.output=File(job1.output) 388 | job3.append('vqsr_indel.sh %s %s %s'%(job3.output, not args.noindelvqsr, args.targeted)) 389 | job3.depend(job2) 390 | 391 | job4=Job('combine_vqsr-%s'%sample) 392 | job4.memory = "12G" 393 | job4.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account) 394 | job4.output=File(outdir.path, sample+".vcf") 395 | 396 | if args.reference_calls: 397 | job4.append('combine_vcf.sh %s %s %s %s %s'%(File(outdir.path, sample+".vcf"), False, False, File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf"))) 398 | job4.append('write_refcalls.sh %s'%(job1.output)) 399 | job4.append('combine_vcf.sh %s %s %s %s %s %s'%(File(outdir.path, sample+".snv.refcalls.vcf"), False, True, File(outdir.path, sample+".refcalls.vcf"), File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf"))) 400 | 
else: 401 | job4.append('combine_vcf.sh %s %s %s %s %s'%(File(outdir.path, sample+".vcf"), False, False, File(outdir.path, sample+".vqsr.snp.vcf"), File(outdir.path, sample+".vqsr.indel.vcf"))) 402 | 403 | job4.depend(job3) 404 | jobs.append(job4) 405 | else: 406 | 407 | job1=Job('anno_vcf-%s'%sample) 408 | job1.memory = "16G" 409 | job1.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account) 410 | job1.output=File(outdir.path, sample+".vcf.tsv") 411 | job1.append('annotate.py %s %s'%(job1.output, job0.output)).depend(job0) 412 | 413 | jobs.append(job1) 414 | 415 | if variants is None or "breakdancer" in variants or "cnvnator" in variants or "pindel" in variants or "breakseq" in variants: 416 | inputs=" ".join([j.output.path for j in svJobs]) 417 | job2=Job('merge_gff-%s'%sample) 418 | job2.memory = "5G" 419 | job2.sge_options="-l h_rt=120:00:00 -A %s"%args.account 420 | job2.output=File(outdir.path, sample+".gff") 421 | job2.append('merge_gff.sh %s %s'%(job2.output, inputs)).depend(*svJobs) 422 | 423 | job3=Job('anno_gff-%s'%sample) 424 | job3.memory = "6G" 425 | #job3.sge_options="-l h_rt=120:00:00 -A %s"%args.account 426 | job3.output=File(outdir.path, sample+".gff.tsv") 427 | job3.sge_options="-l h_rt=120:00:00 -M %s -m ea -A %s"%(args.email, args.account) 428 | job3.append('annotate.py %s %s'%(job3.output, job2.output)).depend(job2) 429 | jobs.append(job3) 430 | 431 | return jobs 432 | 433 | def markdone(jobs, mark=True): 434 | if mark: 435 | for job in jobs: 436 | if len(job.dependents)>0: 437 | markdone(job.dependents, mark) 438 | job.status='done' 439 | 440 | extension=None 441 | if args.bam: 442 | if args.cleanuponly: 443 | extension=".bam" 444 | elif args.variantonly: 445 | extension=".recal.bam" 446 | else: 447 | extension=None 448 | jobs1=prep(args.reads1, extension) 449 | jobs2=() 450 | else: 451 | jobs1=prep(args.reads1, extension) 452 | jobs2=prep(args.reads2, extension) 453 | 454 | jobs=[] 455 | jobs=align(jobs1, 
jobs2,extension) 456 | markdone(jobs, args.donealign or args.cleanuponly or args.variantonly) 457 | 458 | if args.cleanuponly or args.variantonly or args.alignmentonly: 459 | args.nobinning = True 460 | 461 | if not args.nobinning: 462 | jobs=binning(jobs, refi) 463 | markdone(jobs, args.donebinning or args.variantonly) 464 | 465 | if not args.nocleanup: 466 | jobs=cleanup(jobs, extension) 467 | markdone(jobs, args.donecleanup or args.variantonly) 468 | 469 | if not args.alignmentonly and not args.cleanuponly and not args.novariant: 470 | siJobs, svJobs=callvars(jobs, args.nobinning, args.variants) 471 | jobs=merge_annotate(siJobs, svJobs, args.variants) 472 | 473 | descout = sys.stdout if jobfile is None else open(jobfile.path, "w") 474 | descout.write(Job().depend(*jobs).desc()) 475 | descout.flush() 476 | 477 | if args.submit: 478 | print >> sys.stderr, "Submitting jobs (%s) through SJM"%jobfile 479 | os.system("sjm %s &" %jobfile) 480 | --------------------------------------------------------------------------------