├── README.md ├── alignment_quality.sh ├── bsub_qc.sh ├── germline_somatic_loh_3.pl ├── pindel2vcf ├── qc_pipeline.sh └── read_count2.pl /README.md: -------------------------------------------------------------------------------- 1 | # VariantQC 2 | Variant quality checking scripts for complex indel variant discovery and filtering from Pindel-C outputs. Referenced in "Systematic discovery of complex insertions and deletions in human cancers" (doi:10.1038/nm.4002). 3 | 4 | # How to run QC 5 | The main QC script is run using bsub_qc.sh, which launches qc_pipeline.sh for each sample. The input to bsub_qc.sh is described in that file. 6 | 7 | # Steps 8 | 1. Extract complex insertions and deletions from pindel output. 9 | 2. Identify somatic, germline, and loss of heterozygosity (LOH) events. 10 | 3. Filter out low coverage sites (minimum of 20 reads). 11 | 4. Make unfiltered VCFs for germline, somatic, and LOH events. 12 | 5. Run readcount tool on tumor sample. Performing readcount analysis will determine if somatic and LOH events are appropriately classified (Note: Not run for germline). 13 | 6. Run readcount tool on normal sample. Performing readcount analysis will determine if somatic and LOH events are appropriately classified (Note: Not run for germline). 14 | 7. Reclassify germline, somatic, and LOH events based on the read count data for somatic and LOH events. 15 | 8. Make VCFs from the filtered pindel output for VEP input, and annotate the final filtered VCFs using VEP. 16 | 17 | 18 | Reyka Jayasinghe (rjayasin@genome.wustl.edu) and Steven Foltz (sfoltz@genome.wustl.edu). 19 | -------------------------------------------------------------------------------- /alignment_quality.sh: -------------------------------------------------------------------------------- 1 | #SMF 7 September 2015 2 | #Script does initial QC on bams, identifying which bams have <80% perfectly aligned reads. 
3 | 4 | #rm -f bad_bam.txt good_bam.txt 5 | while read sample_type_bam; do 6 | sample_name=$(echo $sample_type_bam | cut -f1 -d' ') 7 | type=$(echo $sample_type_bam | cut -f2 -d' ') 8 | bam=$(echo $sample_type_bam | cut -f3 -d' ') 9 | if [ -f "$bam" ]; then #quoted so a missing third column is not mistaken for an existing file 10 | samtools view $bam 20 | cut -f6 > $sample_name.temp #view bam file for all of chromosome 20 11 | total=$(wc -l $sample_name.temp | cut -f1 -d' ') #total number of reads 12 | good=$(grep -c 100M $sample_name.temp) #number of perfectly matching 100M reads 13 | prop=0; [ "$total" -gt 0 ] && prop=$(echo "100*$good/$total" | bc -l) #percentage of perfectly matching 100M reads; stays 0 when there are no reads (avoids dividing by zero) 14 | whole=${prop%%.*}; if [ "${whole:-0}" -lt 80 ]; then #digits to left of decimal point; bc prints values below 1 as ".33", which leaves no digits, so default to 0 15 | echo $sample_name $type $bam $prop | tr ' ' '\t' >> bad_bam.txt #if prop < 80, bad 16 | else 17 | echo $sample_name $type $bam $prop | tr ' ' '\t' >> good_bam.txt #if prop >= 80, good 18 | fi 19 | rm -f $sample_name.temp 20 | else 21 | echo "$sample_name $type $bam (no bam)" >> bad_bam.txt 22 | fi 23 | done < bams 24 | #input looks like 25 | #SAMPLE_NAME CANCER_TYPE /path/to/bamfile.bam -------------------------------------------------------------------------------- /bsub_qc.sh: -------------------------------------------------------------------------------- 1 | #!/gsc/bin/bash 2 | 3 | #Steven Foltz (sfoltz@genome.wustl.edu), Reyka Jayasinghe (rjayasin@genome.wustl.edu) 4 | #4 September 2015 5 | 6 | #Script runs complex indel QC pipeline for individuals specified in the input file. Each individual is run in parallel using bsub. 7 | 8 | #Input file has six columns: SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME 9 | #Each row corresponds to a unique individual. 10 | 11 | #How the pipeline script is called: 12 | #bash /path/to/qc_pipeline.sh SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME 13 | 14 | if [ $# -ne 1 ]; then 15 | echo "Wrong number of arguments supplied. 
Usage: bash bsub_qc.sh six_column_input_file" 16 | exit 1 17 | fi 18 | 19 | coverage_min=20 #complex events called with coverage below this threshold are removed 20 | steps="1,2,3,4,5,6,7,8,9" #user can select which parts of the QC pipeline to run 21 | #1 Extracting complex indels 22 | #2 Separating germline, somatic, and LOH 23 | #3 Filtering out low coverage 24 | #4 Making VCF file for germline/somatic/loh 25 | #5 Running read counts for tumor germline/somatic/loh 26 | #6 Running read counts for normal germline/somatic/loh 27 | #7 Reclassifying complex indels based on read counts 28 | #8 Making VCF files for VEP input for germline/somatic/loh 29 | #9 Annotating for germline/somatic/loh 30 | 31 | file_containing_each_sample_info=$1 32 | #command that runs the actual QC pipeline (use an absolute path to qc_pipeline.sh if it is not in the working directory) 33 | pipeline="bash qc_pipeline.sh" 34 | 35 | #screen error and output from each bsub job go in this folder 36 | mkdir -p log 37 | 38 | while read line; do 39 | id=$(echo $line | cut -f1 -d' ') #Sample ID 40 | type=$(echo $line | cut -f2 -d' ') #Cancer type 41 | pindel_output=$(echo $line | cut -f3 -d' ') #Path to pindel output folder 42 | tumor_bam=$(echo $line | cut -f4 -d' ') #Path to tumor bam 43 | normal_bam=$(echo $line | cut -f5 -d' ') #Path to normal bam 44 | project=$(echo $line | cut -f6 -d' ') #Project name 45 | #Using bsub: 46 | bsub -e log/$id.$type.err -o log/$id.$type.out $pipeline $id $type $pindel_output $tumor_bam $normal_bam $project $coverage_min $steps 47 | echo $id >> Samples_$type #Adds sample ID to list of others with same cancer type 48 | done < "$file_containing_each_sample_info" 49 | -------------------------------------------------------------------------------- /germline_somatic_loh_3.pl: -------------------------------------------------------------------------------- 1 | #Reyka Jayasinghe, Steven Foltz 2 | #August 2015 3 | 4 | #Initial script classifies complex events into germline, somatic, or loh based on having read support in the tumor and/or normal bam. 
5 | #Germline has support in both tumor and normal. 6 | #Somatic has support in tumor only. 7 | #LOH has support in normal only. 8 | 9 | use strict; 10 | my $usage =<<USAGE; #NOTE(review): the placeholder names in this usage text were reconstructed after tag stripping; confirm against upstream 11 | Usage: $0 <complex_file> <project> <germline_out> <somatic_out> <loh_out> 12 | Where <complex_file> is the complex indel file for this sample 13 | Where <project> is a project identifier, useful for parsing sample IDs into tumor or normal 14 | Where <germline_out> <somatic_out> and <loh_out> are the output files for germline, somatic, and LOH mutations. 15 | USER MUST DEFINE RULES FOR PARSING SAMPLE IDS INTO TUMOR OR NORMAL (see line 37-38) 16 | USAGE 17 | 18 | my $file=$ARGV[0]; 19 | my $project=$ARGV[1]; 20 | my $GERMLINE=$ARGV[2]; 21 | my $SOMATIC=$ARGV[3]; 22 | my $LOH=$ARGV[4]; 23 | 24 | open(COMPLEX,'<',$file) or die "Couldn't open complex $file."; 25 | open(SOMATIC,'>',$SOMATIC) or die "Couldn't open file for writing $SOMATIC."; 26 | open(LOH,'>',$LOH) or die "Couldn't open file for writing $LOH."; 27 | open(GERMLINE,'>',$GERMLINE) or die "Couldn't open file for writing $GERMLINE."; 28 | while(my $line=<COMPLEX>){ 29 | chomp ($line); 30 | my @pindel=split(/\s/,$line); 31 | my $size=@pindel; 32 | my $supsamples=$pindel[29]; 33 | #Identify samples and store their sample type (tumor/primary/normal/germline) 34 | my @sample1=$pindel[31]; 35 | my @sample2=$pindel[38]; 36 | #USER MUST WRITE OWN RULES FOR DETERMINING THE TYPE OF EACH BAM (tumor or normal) 37 | my $type1=""; #must be 'tumor' or 'normal', opposite of type2 38 | my $type2=""; #must be 'tumor' or 'normal', opposite of type1 39 | #CHECK TO SEE IF NO ENTRIES WERE PROVIDED FOR BAM INFORMATION - REFER TO ABOVE TWO LINES 40 | if ($type1=~/^$/){ 41 | print STDERR "ERROR: User needs to define rules for parsing sample IDS into tumor and normal!! See germline_somatic_loh_3.pl Lines 37 and 38\n"; 42 | die; 43 | } 44 | if ($type2=~/^$/){ 45 | print STDERR "ERROR: User needs to define rules for parsing sample IDS into tumor and normal!! 
See germline_somatic_loh_3.pl Lines 37 and 38\n"; 46 | die; 47 | } 48 | #Set column 34 as the anchor column for later comparisons 49 | #Script works only if two samples are provided in pindel output file (size=45), so any other line is skipped 50 | if ($size!=45){ warn "Skipping line $. of $file: expected 45 fields, found $size\n"; next; } my $column=34; #declared unconditionally: "my ... if COND" has undefined behaviour when COND is false (perlsyn) 51 | 52 | #If the number of supporting samples is 1, run check_sup1() 53 | if (($supsamples==1)){ 54 | check_sup1($column,$type1,$type2,\@pindel,$line); 55 | } 56 | #If the number of supporting samples is 2, run check_sup2() 57 | if (($supsamples==2)){ 58 | check_sup2($column,$type1,$type2,\@pindel,$line); 59 | } 60 | } 61 | 62 | sub check_sup1{ #One supporting sample: somatic if that sample is the tumor, LOH if it is the normal 63 | #Get passed arguments: anchor column, type of sample 1, type of sample 2, reference to the split fields, original line 64 | my ($column,$type1,$type2,$pindel1,$line)=@_; 65 | my @pindel=@{$pindel1}; 66 | #First sample: any of the four columns starting at the anchor is positive 67 | if ($pindel[$column]>0||$pindel[$column+1]>0||$pindel[$column+2]>0||$pindel[$column+3]>0){ 68 | if ($type1 eq "tumor" && $type2 eq "normal"){ 69 | print SOMATIC "$line\n"; 70 | } 71 | if ($type1 eq "normal" && $type2 eq "tumor"){ 72 | print LOH "$line\n"; 73 | } 74 | } 75 | #Otherwise second sample: the same four columns, 7 further along 76 | elsif ($pindel[$column+7]>0||$pindel[$column+8]>0||$pindel[$column+9]>0||$pindel[$column+10]>0){ 77 | if ($type1 eq "tumor" && $type2 eq "normal"){ 78 | print LOH "$line\n"; 79 | } 80 | if ($type1 eq "normal" && $type2 eq "tumor"){ 81 | print SOMATIC "$line\n"; 82 | } 83 | } 84 | } 85 | 86 | sub check_sup2{ #Two supporting samples: germline when both samples show support 87 | #Get passed arguments: anchor column, type of sample 1, type of sample 2, reference to the split fields, original line 88 | my ($column,$type1,$type2,$pindel1,$line)=@_; 89 | my @pindel=@{$pindel1}; 90 | #Both samples must have a positive value in at least one of their four columns 91 | if (($pindel[$column]>0||$pindel[$column+1]>0||$pindel[$column+2]>0||$pindel[$column+3]>0)&&($pindel[$column+7]>0||$pindel[$column+8]>0||$pindel[$column+9]>0||$pindel[$column+10]>0)){ 92 | if ($type1 eq "tumor" && $type2 eq "normal"){ 93 | print GERMLINE "$line\n"; 94 | } 95 | if ($type1 eq "normal" && $type2 eq "tumor"){ 96 | print GERMLINE "$line\n"; 97 | } 98 | } 99 | } 100 | 101 | #QC steps: append a completion marker to each output file (qc_pipeline.sh greps for these lines) 102 | print GERMLINE "Successfully completed germline.\n"; 103 | print SOMATIC "Successfully completed somatic.\n"; 104 | 
print LOH "Successfully completed loh.\n"; 105 | 106 | close GERMLINE; 107 | close SOMATIC; 108 | close LOH; 109 | -------------------------------------------------------------------------------- /pindel2vcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ding-lab/VariantQC/5b22e2b5db725519e9789277e979996ec197f28a/pindel2vcf -------------------------------------------------------------------------------- /qc_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Reyka Jayasinghe (rjayasin@genome.wustl.edu), Steven Foltz (sfoltz@genome.wustl.edu) 4 | #4 September 2015 5 | 6 | #QC Pipeline is a series of commands calling several scripts to extract, classify, and annotate complex events reported by Pindel-C. 7 | 8 | #Annotated QC output is stored here: origdata/variants/ID.type.*.anno.vcf where *={germline, somatic, loh} 9 | 10 | #The input to the pipeline consists of: 11 | #1. sample ID 12 | #2. cancer type 13 | #3. absolute path to the folder of Pindel-C outputs for that sample (to the folder containing sampleID_D file(s), not the actual sampleID_D file(s)) 14 | #4. absolute path to the sample's tumor bam 15 | #5. absolute path to the sample's normal bam 16 | #6. project name (if needed for your application to parse sample ID, etc.) 17 | #7. coverage minimum (default: 20, can be changed in bsub_qc.sh) 18 | #8. 
steps to be completed (default: all steps, can be changed in bsub_qc.sh) 19 | 20 | #How the script is called: 21 | #bash /path/to/qc_pipeline.sh SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME 22 | 23 | #In actually running, each individual will be sent to bsub as their own job (see bsub_qc.sh) 24 | 25 | ###THESE VARIABLES MUST BE SET BY USER BEFORE RUNNING 26 | gsl3="perl germline_somatic_loh_3.pl" #path to the perl script germline_somatic_loh_3.pl 27 | p2v="pindel2vcf" #path to the executable pindel2vcf 28 | ref="reference_sequence_used_by_pindel.fa" #path to the reference fasta used by Pindel-C 29 | rc="read_count2.pl" #path to the executable perl script read_count2.pl 30 | vep_dir="/your/directory/vep80" #path to your VEP directory (ex: /your/directory/vep80) 31 | vep="$vep_dir/vep/variant_effect_predictor.pl" #path to your version of variant_effect_predictor.pl 32 | data_dir="$vep_dir/.vep" #path to your hidden VEP directory 33 | fasta="$data_dir/homo_sapiens/80_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa" #path to your VEP fasta 34 | #Also, user must edit germline_somatic_loh_3.pl to set rules for identifying tumor and normal 35 | ### 36 | 37 | if [ $# -ne 8 ]; then 38 | echo "Wrong number of arguments supplied. Usage: bash /path/to/pipeline.sh sample_id cancer_type /path/to/pindel/output /path/to/tumor/bam.bam path/to/normal/bam.bam project_name coverage_minimum steps" 39 | exit 1 40 | fi 41 | 42 | #Inputs 43 | id=$1 #the sample id 44 | type=$2 #their cancer type 45 | echo "###" >> log/$id.$type.log 46 | echo "New command at "$(date)": $1 $2 $3 $4 $5 $6 $7 $8" >> log/$id.$type.log 47 | echo "###" >> log/$id.$type.log 48 | echo "Timestamp 0: "$(date)" Sorting inputs..." 
>> log/$id.$type.log 49 | pindel_path=$3 #absolute path to their pindel outputs 50 | if [ -d $pindel_path ]; then 51 | echo "#Pindel output directory exists: "$pindel_path >> log/$id.$type.log 52 | else 53 | echo "#Fail: Pindel output directory does not exist: "$pindel_path >> log/$id.$type.log 54 | exit 1 55 | fi 56 | tumor_bam=$4 #absolute path to their tumor bam 57 | if [ -f $tumor_bam ]; then 58 | if [ -s $tumor_bam ]; then 59 | echo "#Tumor bam file exists and is not empty: "$tumor_bam >> log/$id.$type.log 60 | else 61 | echo "#Tumor bam file exists but is EMPTY: "$tumor_bam >> log/$id.$type.log 62 | fi 63 | else 64 | echo "#Fail: Tumor bam file does not exist: "$tumor_bam >> log/$id.$type.log 65 | exit 1 66 | fi 67 | normal_bam=$5 #absolute path to their normal bam 68 | if [ -f $normal_bam ]; then 69 | if [ -s $normal_bam ]; then 70 | echo "#Normal bam file exists and is not empty: "$normal_bam >> log/$id.$type.log 71 | else 72 | echo "#Normal bam file exists but is EMPTY: "$normal_bam >> log/$id.$type.log 73 | fi 74 | else 75 | echo "#Fail: Normal bam file does not exist: "$normal_bam >> log/$id.$type.log 76 | exit 1 77 | fi 78 | project=$(echo $6 | tr '[a-z]' '[A-Z]') #project name converted to upppercase 79 | #if your pipeline requires a specific project name: 80 | #if [ $project == "X" ] || [ $project == "Y" ]; then 81 | # echo "#Project is "$project >> log/$id.$type.log 82 | #else 83 | # echo "#Fail: Project must be X or Y: "$project >> log/$id.$type.log 84 | # exit 1 85 | #fi 86 | coverage_min=$7 87 | steps=$8 88 | 89 | #Extract complex indels 90 | if [[ $steps == *"1"* ]]; then 91 | echo "Timestamp 1: "$(date)" Extracting complex indels..." 
>> log/$id.$type.log 92 | mkdir -p origdata 93 | #picks up complex events (insertion length is not zero, deletion and insertion lengths are not equal) 94 | grep ChrID $pindel_path/*_D | awk '{if($5) print}' | awk '{if ($3!=$5) print}' > origdata/$id.$type.complex 95 | if [ -f origdata/$id.$type.complex ]; then 96 | if [ -s origdata/$id.$type.complex ]; then 97 | echo "#Complex indel file exists and is not empty: "origdata/$id.$type.complex >> log/$id.$type.log 98 | else 99 | echo "#Complex indel file exists but is EMPTY: "origdata/$id.$type.complex >> log/$id.$type.log 100 | fi 101 | else 102 | echo "#Fail: Complex indel file does not exist: "origdata/$id.$type.complex >> log/$id.$type.log 103 | exit 1 104 | fi 105 | fi 106 | 107 | #Separates germline, somatic, and LOH events 108 | #Germline is in both tumor and normal; Somatic is only in tumor; LOH is only in normal 109 | WD="origdata/$id.$type.complex" 110 | GERMLINE="origdata/$id.$type.germline" 111 | SOMATIC="origdata/$id.$type.somatic" 112 | LOH="origdata/$id.$type.loh" 113 | if [[ $steps == *"2"* ]]; then 114 | echo "Timestamp 2: "$(date)" Separating germline, somatic, and LOH..." >> log/$id.$type.log 115 | $gsl3 $WD $project $GERMLINE $SOMATIC $LOH 116 | test1=$(grep "Successfully completed germline." $GERMLINE) 117 | test2=$(grep "Successfully completed somatic." $SOMATIC) 118 | test3=$(grep "Successfully completed loh." $LOH) 119 | if [ "$test1" == "Successfully completed germline." ]; then 120 | echo "#Germline successfully completed: "$GERMLINE >> log/$id.$type.log 121 | else 122 | echo "#Fail: Germline failed: "$GERMLINE >> log/$id.$type.log 123 | exit 1 124 | fi 125 | if [ "$test2" == "Successfully completed somatic." ]; then 126 | echo "#Somatic successfully completed: "$SOMATIC >> log/$id.$type.log 127 | else 128 | echo "#Fail: Somatic failed: "$SOMATIC >> log/$id.$type.log 129 | exit 1 130 | fi 131 | if [ "$test3" == "Successfully completed loh." 
]; then 132 | echo "#LOH successfully completed: "$LOH >> log/$id.$type.log 133 | else 134 | echo "#Fail: LOH failed: "$LOH >> log/$id.$type.log 135 | exit 1 136 | fi 137 | fi 138 | 139 | #Make sure there is sufficient coverage, filter out events with low coverage 140 | GERMLINEc="origdata/$id.$type.germline_coverage" 141 | SOMATICc="origdata/$id.$type.somatic_coverage" 142 | LOHc="origdata/$id.$type.loh_coverage" 143 | if [[ $steps == *"3"* ]]; then 144 | echo "Timestamp 3: "$(date)" Filtering out variants with coverage less than $coverage_min reads..." >> log/$id.$type.log 145 | grep -v "Successfully completed germline." $GERMLINE | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $GERMLINEc 146 | grep -v "Successfully completed somatic." $SOMATIC | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $SOMATICc 147 | grep -v "Successfully completed loh." $LOH | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $LOHc 148 | if [ -f $GERMLINEc ]; then 149 | if [ -s $GERMLINEc ]; then 150 | echo "#Germline file exists and is not empty: "$GERMLINEc >> log/$id.$type.log 151 | else 152 | echo "#Germline file exists but is EMPTY: "$GERMLINEc >> log/$id.$type.log 153 | fi 154 | else 155 | echo "#Fail: Germline file does not exist: "$GERMLINEc >> log/$id.$type.log 156 | exit 1 157 | fi 158 | if [ -f $SOMATICc ]; then 159 | if [ -s $SOMATICc ]; then 160 | echo "#Somatic file exists and is not empty: "$SOMATICc >> log/$id.$type.log 161 | else 162 | echo "#Somatic file exists but is EMPTY: "$SOMATICc >> log/$id.$type.log 163 | fi 164 | else 165 | echo "#Fail: Somatic file does not exist: "$SOMATICc >> log/$id.$type.log 166 | exit 1 167 | fi 168 | if [ -f $LOHc ]; then 169 | if [ -s $LOHc ]; then 170 | echo "#LOH file exists and is not empty: "$LOHc >> log/$id.$type.log 171 | else 172 | echo "#LOH file exists but is EMPTY: "$LOHc >> 
log/$id.$type.log 173 | fi 174 | else 175 | echo "#Fail: LOH file does not exist: "$LOHc >> log/$id.$type.log 176 | exit 1 177 | fi 178 | fi 179 | 180 | #Makes VCF files for germline, somatic, and LOH pindel complex indel outputs 181 | mkdir -p VCFs 182 | if [[ $steps == *"4"* ]]; then 183 | for gsl in germline somatic loh; do 184 | echo "Timestamp 4: "$(date)" Making VCF file for $gsl..." >> log/$id.$type.log 185 | $p2v -p origdata/$id.$type.${gsl}_coverage -r $ref -R $ref -d 2015 -v VCFs/$id.$type.$gsl.vcf 186 | if [ -f VCFs/$id.$type.$gsl.vcf ]; then 187 | if [ -s VCFs/$id.$type.$gsl.vcf ]; then 188 | echo "#VCF "$gsl" file exists and is not empty: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log 189 | else 190 | echo "#VCF "$gsl" file exists but is EMPTY: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log 191 | fi 192 | else 193 | echo "#Fail: VCF "$gsl" does not exist: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log 194 | exit 1 195 | fi 196 | done 197 | fi 198 | 199 | #Runs read counts perl script for each germline, somatic, and LOH 200 | mkdir -p read_counts 201 | #for gsl in germline somatic loh; do 202 | for gsl in somatic loh; do 203 | vcf="VCFs/$id.$type.$gsl.vcf" 204 | outtumor="read_counts/$id.$type.$gsl.tumor.rc" 205 | outnormal="read_counts/$id.$type.$gsl.normal.rc" 206 | if [[ $steps == *"5"* ]]; then 207 | echo "Timestamp 5: "$(date)" Running read counts for tumor $gsl..." 
>> log/$id.$type.log 208 | perl $rc -id $id $ref $vcf $tumor_bam 20 $outtumor 209 | if [ -f $outtumor ]; then 210 | if [ -s $outtumor ]; then 211 | echo "#Read counts "$gsl" tumor file exists and is not empty: "$outtumor >> log/$id.$type.log 212 | else 213 | echo "#Read counts "$gsl" tumor file exists but is EMPTY: "$outtumor >> log/$id.$type.log 214 | fi 215 | else 216 | echo "#Fail: Read counts "$gsl" tumor does not exist: "$outtumor >> log/$id.$type.log 217 | exit 1 218 | fi 219 | fi 220 | if [[ $steps == *"6"* ]]; then 221 | echo "Timestamp 6: "$(date)" Running read counts for normal $gsl..." >> log/$id.$type.log 222 | perl $rc -id $id $ref $vcf $normal_bam 20 $outnormal #20 refers to mapping quality minimum 223 | if [ -f $outtumor ]; then 224 | if [ -s $outtumor ]; then 225 | echo "#Read counts "$gsl" normal file exists and is not empty: "$outnormal >> log/$id.$type.log 226 | else 227 | echo "#Read counts "$gsl" normal file exists but is EMPTY: "$outnormal >> log/$id.$type.log 228 | fi 229 | else 230 | echo "#Fail: Read counts "$gsl" normal does not exist: "$outnormal >> log/$id.$type.log 231 | exit 1 232 | fi 233 | fi 234 | done 235 | 236 | #Reclassify germline, somatic, and LOH based on read counts data 237 | if [[ $steps == *"7"* ]]; then 238 | echo "Timestamp 7: "$(date)" Reclassifying complex indels based on read counts..." 
>> log/$id.$type.log 239 | #Support for "somatic" event in normal bam 240 | awk '{if ($16>0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.somatic.normal.rc | sort | uniq > $id.TEMP 241 | #No support for "somatic" event in tumor bam 242 | awk '{if ($16==0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.somatic.tumor.rc | sort | uniq > $id.TEMP_NO_SOMATIC 243 | #Support for "LOH" event in tumor bam 244 | awk '{if ($16>0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.loh.tumor.rc | sort | uniq > $id.TEMP_2 245 | #No support for "LOH" event in normal bam 246 | awk '{if ($16==0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.loh.normal.rc | sort | uniq > $id.TEMP_2_NO_LOH 247 | #Grep misclassified somatic events from original "somatic" file 248 | grep -wf $id.TEMP $SOMATICc | sort > $id.TEMP_somatic 249 | #Grep misclassified LOH events from original "LOH" file 250 | grep -wf $id.TEMP_2 $LOHc | sort > $id.TEMP_loh 251 | #Sort "somatic" and "LOH" events for use in comm 252 | sort $SOMATICc > $id.sorted_og_somatic 253 | sort $LOHc > $id.sorted_og_loh 254 | #Save reclassified and filtered germline, somatic, and loh files here 255 | GERMLINEf="origdata/$id.$type.germline_filtered" 256 | SOMATICf="origdata/$id.$type.somatic_filtered" 257 | LOHf="origdata/$id.$type.loh_filtered" 258 | #Cat events originally classified or reclassified as germline 259 | cat $GERMLINEc $id.TEMP_somatic $id.TEMP_loh | sort | uniq > $GERMLINEf 260 | #Print lines from "somatic" file not reclassified as germline (comm -13 prints lines unique to second file) 261 | #Then remove "somatic" events without support in tumor bam 262 | comm -13 $id.TEMP_somatic $id.sorted_og_somatic | sort | uniq | grep -vf $id.TEMP_NO_SOMATIC > $SOMATICf 263 | #Print lines from "LOH" file not reclassified as germline (comm -13 prints lines unique to second file) 264 | #Then remove "LOH" events without support in normal bam 265 | 
comm -13 $id.TEMP_loh $id.sorted_og_loh | sort | uniq | grep -vf $id.TEMP_2_NO_LOH > $LOHf 266 | #Remove temporary files 267 | rm -f $id.TEMP $id.TEMP_NO_SOMATIC $id.TEMP_2 $id.TEMP_2_NO_LOH $id.TEMP_somatic $id.TEMP_loh $id.sorted_og_somatic $id.sorted_og_loh 268 | if [ -f $GERMLINEf ]; then 269 | if [ -s $GERMLINEf ]; then 270 | echo "#Germline file exists and is not empty: "$GERMLINEf >> log/$id.$type.log 271 | else 272 | echo "#Germline file exists but is EMPTY: "$GERMLINEf >> log/$id.$type.log 273 | fi 274 | else 275 | echo "#Fail: Germline file does not exist: "$GERMLINEf >> log/$id.$type.log 276 | exit 1 277 | fi 278 | if [ -f $SOMATICf ]; then 279 | if [ -s $SOMATICf ]; then 280 | echo "#Somatic file exists and is not empty: "$SOMATICf >> log/$id.$type.log 281 | else 282 | echo "#Somatic file exists but is EMPTY: "$SOMATICf >> log/$id.$type.log 283 | fi 284 | else 285 | echo "#Fail: Somatic file does not exist: "$SOMATICf >> log/$id.$type.log 286 | exit 1 287 | fi 288 | if [ -f $LOHf ]; then 289 | if [ -s $LOHf ]; then 290 | echo "#LOH file exists and is not empty: "$LOHf >> log/$id.$type.log 291 | else 292 | echo "#LOH file exists but is EMPTY: "$LOHf >> log/$id.$type.log 293 | fi 294 | else 295 | echo "#Fail: LOH file does not exist: "$LOHf >> log/$id.$type.log 296 | exit 1 297 | fi 298 | fi 299 | 300 | #Final sections: Make filtered VCFs, run VEP to annotate variants 301 | anno_input="origdata/annotate" 302 | anno_output="origdata/variants" 303 | mkdir -p $anno_input $anno_output 304 | for gsl in germline somatic loh; do 305 | #Makes VCF files for germline, somatic, and LOH pindel complex indel outputs 306 | if [[ $steps == *"8"* ]]; then 307 | echo "Timestamp 8: "$(date)" Making VCF file for VEP input for $gsl..." 
>> log/$id.$type.log 308 | $p2v -p origdata/$id.$type.${gsl}_filtered -r $ref -R $ref -d 2015 -v VCFs/$id.$type.$gsl.filtered.vcf 309 | if [ -f VCFs/$id.$type.$gsl.filtered.vcf ]; then 310 | if [ -s VCFs/$id.$type.$gsl.filtered.vcf ]; then 311 | echo "#VCF filtered "$gsl" file exists and is not empty: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log 312 | else 313 | echo "#VCF filtered "$gsl" file exists but is EMPTY: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log 314 | fi 315 | else 316 | echo "#Fail: VCF filtered "$gsl" does not exist: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log 317 | exit 1 318 | fi 319 | fi 320 | #Runs VEP to annotate the variants 321 | if [[ $steps == *"9"* ]]; then 322 | echo "Timestamp 9: "$(date)" Annotating for $gsl..." 323 | perl $vep --everything -i VCFs/$id.$type.$gsl.filtered.vcf --format vcf --vcf -out $anno_output/$id.$type.$gsl.anno.vcf --dir $data_dir --assembly GRCh37 --cache --offline --fork 4 324 | if [ -f $anno_output/$id.$type.$gsl.anno.vcf ]; then 325 | if [ -s $anno_output/$id.$type.$gsl.anno.vcf ]; then 326 | echo "#Annotation output file exists and is not empty: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log 327 | else 328 | echo "#Annotation output file exists but is EMPTY: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log 329 | fi 330 | else 331 | echo "#Fail: Annotation output file does not exist: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log 332 | exit 1 333 | fi 334 | fi 335 | done 336 | 337 | echo "End of pipeline! Ran steps: "$steps". Finished "$(date) >> log/$id.$type.log 338 | -------------------------------------------------------------------------------- /read_count2.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl -w 2 | use strict; 3 | use Getopt::Long; 4 | 5 | 6 | 7 | my($help,$bam_files,$bam_batch,$id); 8 | my %hash; 9 | &GetOptions('bam_file=s'=>\$bam_files,'bam_bat=s'=>\$bam_batch,'help|h'=>\$help,'id=s'=>\$id); 10 | 11 | 12 | if($#ARGV<4 or defined $help) 13 | { 14 | print "USAGE: $0 [option] fasta_file vcf_file bam_file mapping_quality_cutoff output_file.\n"; 15 | print " -id for multiple process.\n"; 16 | exit; 17 | } 18 | 19 | my $ref_file=shift; 20 | my $vcf_file=shift; 21 | my $bam_file=shift; 22 | my $MQ_cutoff=shift; 23 | my $out_file=shift; 24 | if( -e $out_file) 25 | { 26 | unlink $out_file; 27 | } 28 | 29 | my $key; 30 | my %str; 31 | open SEQ,"<$ref_file" or die "can not open ref sequence file:$!"; 32 | while(<SEQ>) 33 | { 34 | chomp; 35 | if(/>/) 36 | { 37 | s/>//; 38 | s/\s+.*//; 39 | $key=$_; 40 | } 41 | else 42 | { 43 | $str{$key}.=uc $_; 44 | } 45 | } 46 | close SEQ; 47 | 48 | my $ref; 49 | my $mod_ref; 50 | my $num=0; 51 | open VCF,"<$vcf_file" or die "can not open vcf file:$!"; 52 | while(<VCF>) 53 | { 54 | next if /^#/; 55 | if(/^\s*$/){next;} 56 | chomp; 57 | my @arr=split(/\s+/); 58 | $num++; 59 | my $tem1; 60 | if($arr[1]>=10001){$tem1=substr($str{$arr[0]},$arr[1]-10001,10000);} 61 | else{$tem1=substr($str{$arr[0]},0,$arr[1]-1);} #bases before POS only; the REF allele is appended separately below, so taking $arr[1] bases duplicated its first base 62 | my $tem2=substr($str{$arr[0]},$arr[1]+length($arr[3])-1,10000); 63 | # print substr($str{$arr[0]},$arr[1]-2001,4000+length($arr[3])),"\n"; 64 | $ref=$tem1.$arr[3].$tem2; 65 | my $key; 66 | if(defined $id) 67 | { 68 | $key=$id.'_'.$arr[0].'_'.$num; 69 | } 70 | else{$key=$arr[0].'_'.$num;} 71 | my $fafile=$key.'.fa'; 72 | if( -e $fafile) 73 | { 74 | unlink $fafile; 75 | } 76 | print_out($key,'ref',$ref); 77 | $mod_ref=$tem1.$arr[4].$tem2; 78 | print_out($key,'alt',$mod_ref); 79 | my $start=$arr[1]-2000; $start=1 if $start<1; #keep the samtools region start inside the chromosome for variants near its beginning 80 | my $end=$arr[1]+2000+length($arr[3]); 81 | print $start, "\t", $end, "\n"; 82 | 83 | # system(" samtools view $bam_file $arr[0]:$start-$end| awk '{print \"@\"\$1;print \$10;print \"+\";print \$11;}' > 
${key}_reads.fastq "); 84 | system(" samtools view $bam_file $arr[0]:$start-$end -b > ${key}_reads.bam"); 85 | `java -jar /gsc/scripts/pkg/bio/picard/picard-tools-1.92/SamToFastq.jar VALIDATION_STRINGENCY=LENIENT I=${key}_reads.bam F=${key}_reads_1.fastq F2=${key}_reads_2.fastq`; 86 | 87 | 88 | print "***start $key bwa **********\n"; 89 | my $bwa_index="bwa index $key.fa"; 90 | system ("$bwa_index"); 91 | my $samtools_faidx="samtools faidx $key.fa"; 92 | system ("$samtools_faidx"); 93 | my $bwa_cmd1 = "bwa aln -t4 -q 5 $key.fa ${key}_reads_1.fastq > ${key}_1.sai"; 94 | system ("$bwa_cmd1"); 95 | my $bwa_cmd2 = "bwa aln -t4 -q 5 $key.fa ${key}_reads_2.fastq > ${key}_2.sai"; 96 | system ("$bwa_cmd2"); 97 | my $bwa_cmd = "bwa sampe -a 600 $key.fa ${key}_1.sai ${key}_2.sai ${key}_reads_1.fastq ${key}_reads_2.fastq > $key.sam"; 98 | system ("$bwa_cmd"); 99 | 100 | #my $samtools_view = "samtools view -bS $key.sam > $key.bam"; 101 | #system ("$samtools_view"); 102 | #my $samtools_sort = "samtools sort $key.bam $key.sort"; 103 | #system ("$samtools_sort"); 104 | #my $samtools_index = "samtools index $key.sort.bam"; 105 | #system ("$samtools_index"); 106 | 107 | my $samfile=$key.'.sam'; 108 | my $head_len=length($tem1); 109 | &analysis($samfile,$head_len,\@arr, $MQ_cutoff, $out_file, $vcf_file); 110 | 111 | my $rm_bam="rm -rf ${key}_reads.bam"; 112 | system ("$rm_bam"); 113 | my $rm_fa="rm -rf $key.fa*"; 114 | system ("$rm_fa"); 115 | my $rm_sai="rm -rf ${key}_*.sai"; 116 | system ("$rm_sai"); 117 | my $rm_sam="rm -rf $key.sam"; 118 | system ("$rm_sam"); 119 | my $rm_fastq="rm -rf ${key}_reads_*.fastq"; 120 | system ("$rm_fastq"); 121 | } 122 | 123 | sub analysis 124 | { 125 | my $file=shift; 126 | my $head_len=shift; 127 | my $arrref=shift; 128 | my $MQ=shift; 129 | my $out_file=shift; 130 | my $vcf_name=shift; 131 | open SAM,"<$file" or die "can not open sam file:$!"; 132 | open STAT,">>$out_file" or die "can not open stats file:$!"; 133 | my $ref_num=0; 134 | my 
$alt_num=0; 135 | while(<SAM>) 136 | { 137 | my @arr=split; 138 | if($arr[2] eq 'alt') 139 | { 140 | if($arr[3] + length($arr[9]) <= $head_len + length(${$arrref}[4]) or $arr[3]>$head_len+1){next;} 141 | if(substr($arr[9],$head_len-$arr[3]+1,length(${$arrref}[4])) eq ${$arrref}[4] and $arr[4] >= $MQ) {$alt_num++;} 142 | } 143 | if($arr[2] eq 'ref') 144 | { 145 | if($arr[3]+length($arr[9]) <=$head_len+length(${$arrref}[3]) or $arr[3]>$head_len+1){next;} 146 | if(substr($arr[9],$head_len-$arr[3]+1,length(${$arrref}[3])) eq ${$arrref}[3] and $arr[4] >= $MQ) {$ref_num++;} 147 | } 148 | } 149 | 150 | close SAM; 151 | $"="\t"; 152 | print STAT $vcf_name, "\t@{$arrref}\t"; 153 | print STAT "\tref\t",$ref_num,"\t"; 154 | print STAT "\talt\t",$alt_num,"\n"; 155 | close STAT; 156 | } 157 | 158 | sub print_out #appends one FASTA record (header $head, sequence wrapped at 60 columns) to $k.fa 159 | { 160 | my $k=shift; 161 | my $head=shift; 162 | my $seq=shift; 163 | open REFOUT,">>$k.fa" or die "can not open $k.fa file:$!"; 164 | print REFOUT ">",$head,"\n"; 165 | for(my $n=0;$n<length($seq);$n+=60) #strict "<": with "<=" a sequence whose length is a multiple of 60 got a trailing blank line in the FASTA 166 | { 167 | print REFOUT substr($seq,$n,60),"\n"; 168 | } 169 | close REFOUT; 170 | } 171 | 172 | exit; 173 | --------------------------------------------------------------------------------