├── README.md
├── alignment_quality.sh
├── bsub_qc.sh
├── germline_somatic_loh_3.pl
├── pindel2vcf
├── qc_pipeline.sh
└── read_count2.pl


/README.md:
--------------------------------------------------------------------------------
 1 | # VariantQC
 2 | Variant quality checking scripts for complex indel variant discovery and filtering from Pindel-C outputs. Referenced in "Systematic discovery of complex insertions and deletions in human cancers" (doi:10.1038/nm.4002).
 3 | 
 4 | # How to run QC
 5 | Main QC script is run using bsub_qc.sh, which initiates the main qc_pipeline.sh. The input to bsub_qc.sh is described in the file.
 6 | 
 7 | # Steps
 8 | 1. Extract complex insertions and deletions from pindel output. 	
 9 | 2. Identify somatic, germline, and loss of heterozygosity (LOH) events.
10 | 3. Filter out low coverage sites (20 read min).
11 | 4. Make unfiltered VCF for germline, somatic and loh events.
12 | 5. Run readcount tool on tumor sample. Performing readcount analysis will determine if somatic and loh events are appropriately classified (Note: Not run for germline). 
13 | 6. Run readcount tool on normal sample. Performing readcount analysis will determine if somatic and loh events are appropriately classified (Note: Not run for germline).
14 | 7. Reclassify germline, somatic, and loh based on read count data of somatic events.  
15 | 8. Make VCFs from the filtered Pindel output for VEP input and annotate the final filtered VCFs using VEP.
16 | 
17 | 
18 | Reyka Jayasinghe (rjayasin@genome.wustl.edu) and Steven Foltz (sfoltz@genome.wustl.edu).
19 | 


--------------------------------------------------------------------------------
/alignment_quality.sh:
--------------------------------------------------------------------------------
 1 | #SMF 7 September 2015
 2 | #Script does initial QC on bams, identifying which bams have <80% perfectly aligned reads. 
 3 | 
 4 | #rm -f bad_bam.txt good_bam.txt
 5 | while read sample_type_bam; do #each input line: SAMPLE_NAME CANCER_TYPE /path/to/bamfile.bam
 6 |     sample_name=$(echo $sample_type_bam | cut -f1 -d' ')
 7 |     type=$(echo $sample_type_bam | cut -f2 -d' ')
 8 |     bam=$(echo $sample_type_bam | cut -f3 -d' ')
 9 |     if [ -f $bam ]; then
10 | 	samtools view $bam 20 | cut -f6 > $sample_name.temp #CIGAR strings (column 6) of all reads on chromosome 20
11 | 	total=$(wc -l $sample_name.temp | cut -f1 -d' ') #total number of reads
12 | 	good=$(grep -cx 100M $sample_name.temp) #reads whose entire CIGAR is 100M (-x: do not count CIGARs that merely contain 100M)
13 | 	prop=0; if [ "$total" -gt 0 ]; then prop=$(echo "100*$good/$total" | bc -l); fi #percentage of 100M reads; stays 0 when there are no reads (no divide by zero)
14 | 	if [ "$(echo "$prop < 80" | bc -l)" -eq 1 ]; then #compare inside bc: values below 1 print as ".xx", so truncating at the decimal point gave an empty string
15 | 	    echo $sample_name $type $bam $prop | tr ' ' '\t' >> bad_bam.txt #if prop < 80, bad
16 | 	else
17 | 	    echo $sample_name $type $bam $prop | tr ' ' '\t' >> good_bam.txt #if prop >= 80, good
18 | 	fi
19 | 	rm -f $sample_name.temp
20 |     else
21 | 	echo "$sample_name $type $bam (no bam)" >> bad_bam.txt
22 |     fi
23 | done < bams
24 | #input looks like
25 | #SAMPLE_NAME CANCER_TYPE /path/to/bamfile.bam


--------------------------------------------------------------------------------
/bsub_qc.sh:
--------------------------------------------------------------------------------
 1 | #!/gsc/bin/bash
 2 | 
 3 | #Steven Foltz (sfoltz@genome.wustl.edu), Reyka Jayasinghe (rjayasin@genome.wustl.edu)
 4 | #4 September 2015
 5 | 
 6 | #Script runs complex indel QC pipeline for individuals specified in the input file. Each individual is run in parallel using bsub. 
 7 | 
 8 | #Input file has six columns: SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME
 9 | #Each row corresponds to a unique individual. 
10 | 
11 | #How the pipeline script is called:
12 | #bash /path/to/qc_pipeline.sh SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME COVERAGE_MIN STEPS
13 | 
14 | if [ $# -ne 1 ]; then
15 |         echo "Wrong number of arguments supplied. Usage: bash bsub_qc.sh six_column_input_file"
16 |         exit 1
17 | fi
18 | 
19 | coverage_min=20 #complex events called with coverage below this threshold are removed
20 | steps="1,2,3,4,5,6,7,8,9" #user can select which parts of the QC pipeline to run
21 | #1 Extracting complex indels
22 | #2 Separating germline, somatic, and LOH
23 | #3 Filtering out low coverage
24 | #4 Making VCF file for germline/somatic/loh
25 | #5 Running read counts for tumor germline/somatic/loh
26 | #6 Running read counts for normal germline/somatic/loh
27 | #7 Reclassifying complex indels based on read counts
28 | #8 Making VCF files for VEP input for germline/somatic/loh
29 | #9 Annotating for germline/somatic/loh
30 | 
31 | file_containing_each_sample_info=$1
32 | #command that launches the actual QC pipeline (use the absolute path to qc_pipeline.sh if it is not in the working directory)
33 | pipeline="bash qc_pipeline.sh"
34 | 
35 | #screen error and output goes in these folders
36 | mkdir -p log
37 | 
38 | while read -r line || [ -n "$line" ]; do #the second test keeps a final line that has no trailing newline
39 |     id=$(echo $line | cut -f1 -d' ') #Sample ID
40 |     type=$(echo $line | cut -f2 -d' ') #Cancer type
41 |     pindel_output=$(echo $line | cut -f3 -d' ') #Path to pindel output folder
42 |     tumor_bam=$(echo $line | cut -f4 -d' ') #Path to tumor bam
43 |     normal_bam=$(echo $line | cut -f5 -d' ') #Path to normal bam
44 |     project=$(echo $line | cut -f6 -d' ') #Project name
45 |     [ -z "$id" ] && continue #skip blank lines instead of submitting a job with no sample
46 |     bsub -e log/$id.$type.err -o log/$id.$type.out $pipeline $id $type $pindel_output $tumor_bam $normal_bam $project $coverage_min $steps
47 |     echo $id >> Samples_$type #Adds sample ID to list of others with same cancer type
48 | done < $file_containing_each_sample_info
49 | 


--------------------------------------------------------------------------------
/germline_somatic_loh_3.pl:
--------------------------------------------------------------------------------
  1 | #Reyka Jayasinghe, Steven Foltz
  2 | #August 2015
  3 | 
  4 | #Initial script classifies complex events into germline, somatic, or loh based on having read support in the tumor and/or normal bam.
  5 | #Germline has support in both tumor and normal.
  6 | #Somatic has support in tumor only.
  7 | #LOH has support in normal only. 
  8 | 
  9 | use strict;
 10 | my $usage =<<USAGE;
 11 |  Usage: $0 <complex> <project> <germline> <somatic> <loh>
 12 |     Where <complex> is the complex indel file for this sample
 13 |     Where <project> is a project identifier, useful for parsing sample IDs into tumor or normal
 14 |     Where <germline> <somatic> and <loh> are the output files for germline, somatic, and LOH mutations.
 15 |     USER MUST DEFINE RULES FOR PARSING SAMPLE IDS INTO TUMOR OR NORMAL (see line 37-38)
 16 | USAGE
 17 | die $usage unless @ARGV==5; #refuse to run unless all five arguments are given
 18 | my $file=$ARGV[0];
 19 | my $project=$ARGV[1];
 20 | my $GERMLINE=$ARGV[2];
 21 | my $SOMATIC=$ARGV[3];
 22 | my $LOH=$ARGV[4];
 23 | 
 24 | open(COMPLEX,'<',$file) or die "Couldn't open complex $file.";
 25 | open(SOMATIC,'>',$SOMATIC) or die "Couldn't open file for writing $SOMATIC.";
 26 | open(LOH,'>',$LOH) or die "Couldn't open file for writing $LOH.";
 27 | open(GERMLINE,'>',$GERMLINE) or die "Couldn't open file for writing $GERMLINE.";
 28 | while(my $line=<COMPLEX>){
 29 | 	chomp ($line);
 30 | 	my @pindel=split(/\s/,$line);
 31 | 	my $size=@pindel; #number of fields in this record
 32 | 	my $supsamples=$pindel[29]; #number of samples supporting the event
 33 | 	#Identify samples and store their sample type (tumor/primary/normal/germline)
 34 | 	my @sample1=$pindel[31];
 35 | 	my @sample2=$pindel[38];
 36 | 	#USER MUST WRITE OWN RULES FOR DETERMINING THE TYPE OF EACH BAM (tumor or normal)
 37 | 	my $type1=""; #must be 'tumor' or 'normal', opposite of type2
 38 | 	my $type2=""; #must be 'tumor' or 'normal', opposite of type1
 39 |     #CHECK TO SEE IF NO ENTRIES WERE PROVIDED FOR BAM INFORMATION - REFER TO ABOVE TWO LINES
 40 |     if ($type1=~/^$/){
 41 |         print STDERR "ERROR: User needs to define rules for parsing sample IDS into tumor and normal!! See germline_somatic_loh_3.pl Lines 37 and 38\n";
 42 |         die;
 43 |     }
 44 |     if ($type2=~/^$/){
 45 |         print STDERR "ERROR: User needs to define rules for parsing sample IDS into tumor and normal!! See germline_somatic_loh_3.pl Lines 37 and 38\n";
 46 |         die;
 47 |     }
 48 | 	#Set column 34 as the anchor column for later comparisons
 49 | 	#Script works only if two samples are provided in pindel output file (size=45); other records are skipped with a warning
 50 | 	my $column=34; if ($size!=45){ warn "Skipping record with $size fields (expected 45): $line\n"; next; } #replaces "my ... if", whose behaviour is undefined when the condition is false
 51 | 
 52 | 	#If the number of supporting samples is 1, run check_sup1()
 53 | 	if (($supsamples==1)){
 54 | 		check_sup1($column,$type1,$type2,\@pindel,$line);
 55 | 	}
 56 | 	#If the number of supporting samples is 2, run check_sup2()
 57 | 	if (($supsamples==2)){
 58 | 		check_sup2($column,$type1,$type2,\@pindel,$line);
 59 | 	}
 60 | }
 61 | 
 62 | sub check_sup1{ #Write an event reported with one supporting sample to the SOMATIC or LOH output
 63 |     #Arguments: anchor column index, type of sample 1, type of sample 2, reference to the split fields, the original line
 64 |     my ($column,$type1,$type2,$pindel1,$line)=@_;
 65 |     my @pindel=@{$pindel1};
 66 |     #First sample: any of the four fields starting at the anchor column is greater than zero
 67 |     if ($pindel[$column]>0||$pindel[$column+1]>0||$pindel[$column+2]>0||$pindel[$column+3]>0){	
 68 | 	if ($type1 eq "tumor" && $type2 eq "normal"){
 69 | 	    print SOMATIC "$line\n"; #sample 1 is the tumor: somatic
 70 | 	}
 71 | 	if ($type1 eq "normal" && $type2 eq "tumor"){
 72 | 	    print LOH "$line\n"; #sample 1 is the normal: LOH
 73 | 	}
 74 |     }
 75 |     #Otherwise second sample: any of the four fields starting at anchor+7 is greater than zero
 76 |     elsif ($pindel[$column+7]>0||$pindel[$column+8]>0||$pindel[$column+9]>0||$pindel[$column+10]>0){
 77 | 	if ($type1 eq "tumor" && $type2 eq "normal"){
 78 | 	    print LOH "$line\n"; #sample 2 is the normal: LOH
 79 | 	}
 80 | 	if ($type1 eq "normal" && $type2 eq "tumor"){
 81 | 		print SOMATIC "$line\n"; #sample 2 is the tumor: somatic
 82 | 	}
 83 |     }
 84 | }
 85 | 
 86 | sub check_sup2{ #Write an event reported with two supporting samples to the GERMLINE output
 87 |     #Arguments: anchor column index, type of sample 1, type of sample 2, reference to the split fields, the original line
 88 |     my ($column,$type1,$type2,$pindel1,$line)=@_;
 89 |     my @pindel=@{$pindel1};
 90 |     #Both samples must have a value greater than zero in their four fields (anchor..anchor+3 and anchor+7..anchor+10)
 91 |     if (($pindel[$column]>0||$pindel[$column+1]>0||$pindel[$column+2]>0||$pindel[$column+3]>0)&&($pindel[$column+7]>0||$pindel[$column+8]>0||$pindel[$column+9]>0||$pindel[$column+10]>0)){
 92 | 	if ($type1 eq "tumor" && $type2 eq "normal"){
 93 | 	    print GERMLINE "$line\n";
 94 | 	}
 95 | 	if ($type1 eq "normal" && $type2 eq "tumor"){
 96 | 	    print GERMLINE "$line\n"; #same output for either ordering; nothing is written unless the types are one tumor and one normal
 97 | 	}
 98 |     }
 99 | }
100 | 
101 | #QC steps: append a completion marker as the last line of each output file (qc_pipeline.sh greps for these exact strings)
102 | print GERMLINE "Successfully completed germline.\n";
103 | print SOMATIC "Successfully completed somatic.\n";
104 | print LOH "Successfully completed loh.\n";
105 | 
106 | close GERMLINE; #return values of close are not checked
107 | close SOMATIC;
108 | close LOH;
109 | 


--------------------------------------------------------------------------------
/pindel2vcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ding-lab/VariantQC/5b22e2b5db725519e9789277e979996ec197f28a/pindel2vcf


--------------------------------------------------------------------------------
/qc_pipeline.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #Reyka Jayasinghe (rjayasin@genome.wustl.edu), Steven Foltz (sfoltz@genome.wustl.edu)
  4 | #4 September 2015
  5 | 
  6 | #QC Pipeline is a series of commands calling several scripts to extract, classify, and annotate complex events reported by Pindel-C.
  7 | 
  8 | #Annotated QC output is stored here: origdata/variants/ID.type.*.anno.vcf where *={germline, somatic, loh}
  9 | 
 10 | #The input to the pipeline consists of:
 11 | #1. sample ID
 12 | #2. cancer type
 13 | #3. absolute path to the folder of Pindel-C outputs for that sample (to the folder containing sampleID_D file(s), not the actual sampleID_D file(s))
 14 | #4. absolute path to the sample's tumor bam
 15 | #5. absolute path to the sample's normal bam
 16 | #6. project name (if needed for your application to parse sample ID, etc.)
 17 | #7. coverage minimum (default: 20, can be changed in bsub_qc.sh)
 18 | #8. steps to be completed (default: all steps, can be changed in bsub_qc.sh)
 19 | 
 20 | #How the script is called:
 21 | #bash /path/to/qc_pipeline.sh SAMPLE_ID CANCER_TYPE /PATH/TO/PINDEL/OUTPUT/ /PATH/TO/TUMOR/BAM.bam /PATH/TO/NORMAL/BAM.bam PROJECT_NAME COVERAGE_MIN STEPS
 22 | 
 23 | #In actually running, each individual will be sent to bsub as their own job (see bsub_qc.sh) 
 24 | 
 25 | ###THESE VARIABLES MUST BE SET BY USER BEFORE RUNNING
 26 | gsl3="perl germline_somatic_loh_3.pl" #path to the perl script germline_somatic_loh_3.pl
 27 | p2v="pindel2vcf" #path to the executable pindel2vcf
 28 | ref="reference_sequence_used_by_pindel.fa" #path to the reference fasta used by Pindel-C
 29 | rc="read_count2.pl" #path to the executable perl script read_count2.pl
 30 | vep_dir="/your/directory/vep80" #path to your VEP directory (ex: /your/directory/vep80)
 31 | vep="$vep_dir/vep/variant_effect_predictor.pl" #path to your version of variant_effect_predictor.pl
 32 | data_dir="$vep_dir/.vep" #path to your hidden VEP directory
 33 | fasta="$data_dir/homo_sapiens/80_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa" #path to your VEP fasta
 34 | #Also, user must edit germline_somatic_loh_3.pl to set rules for identifying tumor and normal
 35 | ###
 36 | 
 37 | if [ $# -ne 8 ]; then
 38 |         echo "Wrong number of arguments supplied. Usage: bash /path/to/pipeline.sh sample_id cancer_type /path/to/pindel/output /path/to/tumor/bam.bam path/to/normal/bam.bam project_name coverage_minimum steps"
 39 |         exit 1
 40 | fi
 41 | 
 42 | #Inputs
 43 | id=$1 #the sample id
 44 | type=$2 #their cancer type
 45 | mkdir -p log; echo "###" >> log/$id.$type.log #create log/ first so the script can also be run directly, without bsub_qc.sh
 46 | echo "New command at "$(date)": $1 $2 $3 $4 $5 $6 $7 $8" >> log/$id.$type.log
 47 | echo "###" >> log/$id.$type.log
 48 | echo "Timestamp 0: "$(date)" Sorting inputs..." >> log/$id.$type.log
 49 | pindel_path=$3 #absolute path to their pindel outputs
 50 | if [ -d $pindel_path ]; then
 51 |     echo "#Pindel output directory exists: "$pindel_path >> log/$id.$type.log
 52 | else
 53 |     echo "#Fail: Pindel output directory does not exist: "$pindel_path >> log/$id.$type.log
 54 |     exit 1
 55 | fi
 56 | tumor_bam=$4 #absolute path to their tumor bam
 57 | if [ -f $tumor_bam ]; then
 58 |     if [ -s $tumor_bam ]; then
 59 | 	echo "#Tumor bam file exists and is not empty: "$tumor_bam >> log/$id.$type.log
 60 |     else
 61 | 	echo "#Tumor bam file exists but is EMPTY: "$tumor_bam >> log/$id.$type.log
 62 |     fi
 63 | else
 64 |     echo "#Fail: Tumor bam file does not exist: "$tumor_bam >> log/$id.$type.log
 65 |     exit 1
 66 | fi
 67 | normal_bam=$5 #absolute path to their normal bam
 68 | if [ -f $normal_bam ]; then
 69 |     if [ -s $normal_bam ]; then
 70 | 	echo "#Normal bam file exists and is not empty: "$normal_bam >> log/$id.$type.log
 71 |     else
 72 | 	echo "#Normal bam file exists but is EMPTY: "$normal_bam >> log/$id.$type.log
 73 |     fi
 74 | else
 75 |     echo "#Fail: Normal bam file does not exist: "$normal_bam >> log/$id.$type.log
 76 |     exit 1
 77 | fi
 78 | project=$(echo $6 | tr '[a-z]' '[A-Z]') #project name converted to upppercase
 79 | #if your pipeline requires a specific project name:
 80 | #if [ $project == "X" ] || [ $project == "Y" ]; then
 81 | #    echo "#Project is "$project >> log/$id.$type.log
 82 | #else
 83 | #    echo "#Fail: Project must be X or Y: "$project >> log/$id.$type.log
 84 | #    exit 1
 85 | #fi
 86 | coverage_min=$7
 87 | steps=$8
 88 | 
 89 | #Extract complex indels
 90 | if [[ $steps == *"1"* ]]; then
 91 |     echo "Timestamp 1: "$(date)" Extracting complex indels..." >> log/$id.$type.log
 92 |     mkdir -p origdata
 93 |     #picks up complex events (insertion length is not zero, deletion and insertion lengths are not equal)
 94 |     grep ChrID $pindel_path/*_D | awk '{if($5) print}' | awk '{if ($3!=$5) print}' > origdata/$id.$type.complex
 95 |     if [ -f origdata/$id.$type.complex ]; then
 96 | 	if [ -s origdata/$id.$type.complex ]; then
 97 | 	    echo "#Complex indel file exists and is not empty: "origdata/$id.$type.complex >> log/$id.$type.log
 98 | 	else
 99 | 	    echo "#Complex indel file exists but is EMPTY: "origdata/$id.$type.complex >> log/$id.$type.log
100 | 	fi
101 |     else
102 | 	echo "#Fail: Complex indel file does not exist: "origdata/$id.$type.complex >> log/$id.$type.log
103 | 	exit 1
104 |     fi
105 | fi
106 | 
107 | #Separates germline, somatic, and LOH events
108 | #Germline is in both tumor and normal; Somatic is only in tumor; LOH is only in normal
109 | WD="origdata/$id.$type.complex"
110 | GERMLINE="origdata/$id.$type.germline"
111 | SOMATIC="origdata/$id.$type.somatic"
112 | LOH="origdata/$id.$type.loh"
113 | if [[ $steps == *"2"* ]]; then
114 |     echo "Timestamp 2: "$(date)" Separating germline, somatic, and LOH..." >> log/$id.$type.log
115 |     $gsl3 $WD $project $GERMLINE $SOMATIC $LOH
116 |     test1=$(grep "Successfully completed germline." $GERMLINE)
117 |     test2=$(grep "Successfully completed somatic." $SOMATIC)
118 |     test3=$(grep "Successfully completed loh." $LOH)
119 |     if [ "$test1" == "Successfully completed germline." ]; then
120 | 	echo "#Germline successfully completed: "$GERMLINE >> log/$id.$type.log
121 |     else
122 | 	echo "#Fail: Germline failed: "$GERMLINE >> log/$id.$type.log
123 | 	exit 1
124 |     fi
125 |     if [ "$test2" == "Successfully completed somatic." ]; then
126 | 	echo "#Somatic successfully completed: "$SOMATIC >> log/$id.$type.log
127 |     else
128 | 	echo "#Fail: Somatic failed: "$SOMATIC >> log/$id.$type.log
129 | 	exit 1
130 |     fi
131 |     if [ "$test3" == "Successfully completed loh." ]; then
132 | 	echo "#LOH successfully completed: "$LOH >> log/$id.$type.log
133 |     else
134 | 	echo "#Fail: LOH failed: "$LOH >> log/$id.$type.log
135 | 	exit 1
136 |     fi
137 | fi
138 | 
139 | #Make sure there is sufficient coverage, filter out events with low coverage
140 | GERMLINEc="origdata/$id.$type.germline_coverage"
141 | SOMATICc="origdata/$id.$type.somatic_coverage"
142 | LOHc="origdata/$id.$type.loh_coverage"
143 | if [[ $steps == *"3"* ]]; then
144 |     echo "Timestamp 3: "$(date)" Filtering out variants with coverage less than $coverage_min reads..." >> log/$id.$type.log
145 |     grep -v "Successfully completed germline." $GERMLINE | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $GERMLINEc
146 |     grep -v "Successfully completed somatic." $SOMATIC | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $SOMATICc
147 |     grep -v "Successfully completed loh." $LOH | awk -F' ' '{a=$33+$35+$37;b=$40+$42+$44;if(a>='$coverage_min'&&b>='$coverage_min'){print}}' > $LOHc
148 |     if [ -f $GERMLINEc ]; then
149 | 	if [ -s $GERMLINEc ]; then
150 | 	    echo "#Germline file exists and is not empty: "$GERMLINEc >> log/$id.$type.log
151 | 	else
152 | 	    echo "#Germline file exists but is EMPTY: "$GERMLINEc >> log/$id.$type.log
153 | 	fi
154 |     else
155 | 	echo "#Fail: Germline file does not exist: "$GERMLINEc >> log/$id.$type.log
156 | 	exit 1
157 |     fi
158 |     if [ -f $SOMATICc ]; then
159 | 	if [ -s $SOMATICc ]; then
160 | 	    echo "#Somatic file exists and is not empty: "$SOMATICc >> log/$id.$type.log
161 | 	else
162 | 	    echo "#Somatic file exists but is EMPTY: "$SOMATICc >> log/$id.$type.log
163 | 	fi
164 |     else
165 | 	echo "#Fail: Somatic file does not exist: "$SOMATICc >> log/$id.$type.log
166 | 	exit 1
167 |     fi
168 |     if [ -f $LOHc ]; then
169 | 	if [ -s $LOHc ]; then
170 | 	    echo "#LOH file exists and is not empty: "$LOHc >> log/$id.$type.log
171 | 	else
172 | 	    echo "#LOH file exists but is EMPTY: "$LOHc >> log/$id.$type.log
173 | 	fi
174 |     else
175 | 	echo "#Fail: LOH file does not exist: "$LOHc >> log/$id.$type.log
176 | 	exit 1
177 |     fi
178 | fi
179 | 
180 | #Makes VCF files for germline, somatic, and LOH pindel complex indel outputs
181 | mkdir -p VCFs
182 | if [[ $steps == *"4"* ]]; then
183 |     for gsl in germline somatic loh; do
184 | 	echo "Timestamp 4: "$(date)" Making VCF file for $gsl..." >> log/$id.$type.log
185 | 	$p2v -p origdata/$id.$type.${gsl}_coverage -r $ref -R $ref -d 2015 -v VCFs/$id.$type.$gsl.vcf
186 | 	if [ -f VCFs/$id.$type.$gsl.vcf ]; then
187 | 	    if [ -s VCFs/$id.$type.$gsl.vcf ]; then
188 | 		echo "#VCF "$gsl" file exists and is not empty: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log
189 | 	    else
190 | 		echo "#VCF "$gsl" file exists but is EMPTY: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log
191 | 	    fi
192 | 	else
193 | 	    echo "#Fail: VCF "$gsl" does not exist: "VCFs/$id.$type.$gsl.vcf >> log/$id.$type.log
194 | 	    exit 1
195 | 	fi
196 |     done
197 | fi
198 | 
199 | #Runs read counts perl script for each germline, somatic, and LOH 
200 | mkdir -p read_counts
201 | #for gsl in germline somatic loh; do
202 | for gsl in somatic loh; do #germline is intentionally not read-counted
203 |     vcf="VCFs/$id.$type.$gsl.vcf"
204 |     outtumor="read_counts/$id.$type.$gsl.tumor.rc"
205 |     outnormal="read_counts/$id.$type.$gsl.normal.rc"
206 |     if [[ $steps == *"5"* ]]; then
207 | 	echo "Timestamp 5: "$(date)" Running read counts for tumor $gsl..." >> log/$id.$type.log
208 | 	perl $rc -id $id $ref $vcf $tumor_bam 20 $outtumor #20 refers to mapping quality minimum
209 | 	if [ -f $outtumor ]; then
210 | 	    if [ -s $outtumor ]; then
211 | 		echo "#Read counts "$gsl" tumor file exists and is not empty: "$outtumor >> log/$id.$type.log
212 | 	    else
213 | 		echo "#Read counts "$gsl" tumor file exists but is EMPTY: "$outtumor >> log/$id.$type.log
214 | 	    fi
215 | 	else
216 | 	    echo "#Fail: Read counts "$gsl" tumor does not exist: "$outtumor >> log/$id.$type.log
217 | 	    exit 1
218 | 	fi
219 |     fi
220 |     if [[ $steps == *"6"* ]]; then
221 | 	echo "Timestamp 6: "$(date)" Running read counts for normal $gsl..." >> log/$id.$type.log
222 | 	perl $rc -id $id $ref $vcf $normal_bam 20 $outnormal #20 refers to mapping quality minimum
223 | 	if [ -f $outnormal ]; then #check the normal output (this previously tested the tumor file)
224 | 	    if [ -s $outnormal ]; then
225 | 		echo "#Read counts "$gsl" normal file exists and is not empty: "$outnormal >> log/$id.$type.log
226 | 	    else
227 | 		echo "#Read counts "$gsl" normal file exists but is EMPTY: "$outnormal >> log/$id.$type.log
228 | 	    fi
229 | 	else
230 | 	    echo "#Fail: Read counts "$gsl" normal does not exist: "$outnormal >> log/$id.$type.log
231 | 	    exit 1
232 | 	fi
233 |     fi
234 | done
235 | 
236 | #Reclassify germline, somatic, and LOH based on read counts data
237 | if [[ $steps == *"7"* ]]; then
238 |     echo "Timestamp 7: "$(date)" Reclassifying complex indels based on read counts..." >> log/$id.$type.log
239 |     #Support for "somatic" event in normal bam
240 |     awk '{if ($16>0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.somatic.normal.rc | sort | uniq > $id.TEMP
241 |     #No support for "somatic" event in tumor bam
242 |     awk '{if ($16==0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.somatic.tumor.rc | sort | uniq > $id.TEMP_NO_SOMATIC
243 |     #Support for "LOH" event in tumor bam 
244 |     awk '{if ($16>0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.loh.tumor.rc | sort | uniq > $id.TEMP_2
245 |     #No support for "LOH" event in normal bam
246 |     awk '{if ($16==0) print "\""substr($6,2)"\"\tChrID "$2"\tBP "$3}' read_counts/$id.$type.loh.normal.rc | sort | uniq > $id.TEMP_2_NO_LOH
247 |     #Grep misclassified somatic events from original "somatic" file
248 |     grep -wf $id.TEMP $SOMATICc | sort > $id.TEMP_somatic
249 |     #Grep misclassified LOH events from original "LOH" file
250 |     grep -wf $id.TEMP_2 $LOHc | sort > $id.TEMP_loh
251 |     #Sort "somatic" and "LOH" events for use in comm
252 |     sort $SOMATICc > $id.sorted_og_somatic
253 |     sort $LOHc > $id.sorted_og_loh
254 |     #Save reclassified and filtered germline, somatic, and loh files here
255 |     GERMLINEf="origdata/$id.$type.germline_filtered"
256 |     SOMATICf="origdata/$id.$type.somatic_filtered"
257 |     LOHf="origdata/$id.$type.loh_filtered"
258 |     #Cat events originally classified or reclassified as germline
259 |     cat $GERMLINEc $id.TEMP_somatic $id.TEMP_loh | sort | uniq > $GERMLINEf
260 |     #Print lines from "somatic" file not reclassified as germline (comm -13 prints lines unique to second file)
261 |     #Then remove "somatic" events without support in tumor bam
262 |     comm -13 $id.TEMP_somatic $id.sorted_og_somatic | sort | uniq | grep -vf $id.TEMP_NO_SOMATIC > $SOMATICf
263 |     #Print lines from "LOH" file not reclassified as germline (comm -13 prints lines unique to second file)
264 |     #Then remove "LOH" events without support in normal bam
265 |     comm -13 $id.TEMP_loh $id.sorted_og_loh | sort | uniq | grep -vf $id.TEMP_2_NO_LOH > $LOHf
266 |     #Remove temporary files
267 |     rm -f $id.TEMP $id.TEMP_NO_SOMATIC $id.TEMP_2 $id.TEMP_2_NO_LOH $id.TEMP_somatic $id.TEMP_loh $id.sorted_og_somatic $id.sorted_og_loh
268 |     if [ -f $GERMLINEf ]; then
269 | 	if [ -s $GERMLINEf ]; then
270 | 	    echo "#Germline file exists and is not empty: "$GERMLINEf >> log/$id.$type.log
271 | 	else
272 | 	    echo "#Germline file exists but is EMPTY: "$GERMLINEf >> log/$id.$type.log
273 | 	fi
274 |     else
275 | 	echo "#Fail: Germline file does not exist: "$GERMLINEf >> log/$id.$type.log
276 | 	exit 1
277 |     fi
278 |     if [ -f $SOMATICf ]; then
279 | 	if [ -s $SOMATICf ]; then
280 | 	    echo "#Somatic file exists and is not empty: "$SOMATICf >> log/$id.$type.log
281 | 	else
282 | 	    echo "#Somatic file exists but is EMPTY: "$SOMATICf >> log/$id.$type.log
283 | 	fi
284 |     else
285 | 	echo "#Fail: Somatic file does not exist: "$SOMATICf >> log/$id.$type.log
286 | 	exit 1
287 |     fi
288 |     if [ -f $LOHf ]; then
289 | 	if [ -s $LOHf ]; then
290 | 	    echo "#LOH file exists and is not empty: "$LOHf >> log/$id.$type.log
291 | 	else
292 |             echo "#LOH file exists but is EMPTY: "$LOHf >> log/$id.$type.log
293 | 	fi
294 |     else
295 | 	echo "#Fail: LOH file does not exist: "$LOHf >> log/$id.$type.log
296 | 	exit 1
297 |     fi
298 | fi
299 | 
300 | #Final sections: Make filtered VCFs, run VEP to annotate variants
301 | anno_input="origdata/annotate"
302 | anno_output="origdata/variants"
303 | mkdir -p $anno_input $anno_output 
304 | for gsl in germline somatic loh; do
305 |     #Makes VCF files for germline, somatic, and LOH pindel complex indel outputs
306 |     if [[ $steps == *"8"* ]]; then
307 | 	echo "Timestamp 8: "$(date)" Making VCF file for VEP input for $gsl..." >> log/$id.$type.log
308 | 	$p2v -p origdata/$id.$type.${gsl}_filtered -r $ref -R $ref -d 2015 -v VCFs/$id.$type.$gsl.filtered.vcf
309 | 	if [ -f VCFs/$id.$type.$gsl.filtered.vcf ]; then
310 | 	    if [ -s VCFs/$id.$type.$gsl.filtered.vcf ]; then
311 | 		echo "#VCF filtered "$gsl" file exists and is not empty: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log
312 | 	    else
313 | 		echo "#VCF filtered "$gsl" file exists but is EMPTY: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log
314 | 	    fi
315 | 	else
316 | 	    echo "#Fail: VCF filtered "$gsl" does not exist: "VCFs/$id.$type.$gsl.filtered.vcf >> log/$id.$type.log
317 | 	    exit 1
318 | 	fi
319 |     fi
320 |     #Runs VEP to annotate the variants
321 |     if [[ $steps == *"9"* ]]; then
322 | 	echo "Timestamp 9: "$(date)" Annotating for $gsl..." >> log/$id.$type.log #now logged like every other timestamp instead of going to stdout
323 | 	perl $vep --everything -i VCFs/$id.$type.$gsl.filtered.vcf --format vcf --vcf -out $anno_output/$id.$type.$gsl.anno.vcf --dir $data_dir --assembly GRCh37 --cache --offline --fork 4
324 | 	if [ -f $anno_output/$id.$type.$gsl.anno.vcf ]; then
325 | 	    if [ -s $anno_output/$id.$type.$gsl.anno.vcf ]; then
326 | 		echo "#Annotation output file exists and is not empty: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log
327 | 	    else
328 | 		echo "#Annotation output file exists but is EMPTY: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log
329 | 	    fi
330 | 	else
331 | 	    echo "#Fail: Annotation output file does not exist: "$anno_output/$id.$type.$gsl.anno.vcf >> log/$id.$type.log
332 | 	    exit 1
333 | 	fi
334 |     fi
335 | done
336 | 
337 | echo "End of pipeline! Ran steps: "$steps". Finished "$(date) >> log/$id.$type.log
338 | 


--------------------------------------------------------------------------------
/read_count2.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | use strict;
  3 | use Getopt::Long;
  4 | 
  5 | 
  6 | 
  7 | my($help,$bam_files,$bam_batch,$id);
  8 | my %hash;
  9 | &GetOptions('bam_file=s'=>\$bam_files,'bam_bat=s'=>\$bam_batch,'help|h'=>\$help,'id=s'=>\$id);
 10 | 
 11 | 
 12 | if($#ARGV<4 or defined $help)
 13 | {
 14 |   print "USAGE: $0 [option] fasta_file vcf_file bam_file mapping_quality_cutoff output_file.\n";
 15 |   print "       -id for multiple process.\n";
 16 |   exit;
 17 | }
 18 | 
 19 | my $ref_file=shift;
 20 | my $vcf_file=shift;
 21 | my $bam_file=shift;
 22 | my $MQ_cutoff=shift;
 23 | my $out_file=shift;
 24 | if( -e $out_file)
 25 | {
 26 |   unlink  $out_file;
 27 | }
 28 | 
 29 | my $key;
 30 | my %str;
 31 | open SEQ,"<$ref_file" or die "can not open ref sequence file:$!";
 32 | while(<SEQ>)
 33 | {
 34 |   chomp;
 35 |   if(/>/)
 36 |   {
 37 |     s/>//;
 38 |     s/\s+.*//;
 39 |     $key=$_;
 40 |   }
 41 |   else
 42 |   {
 43 |     $str{$key}.=uc $_;
 44 |   }
 45 | }
 46 | close SEQ;
 47 | 
 48 | my $ref;
 49 | my $mod_ref;
 50 | my $num=0;
 51 | open VCF,'<',$vcf_file or die "can not open vcf file:$!";
 52 | while(<VCF>)
 53 | {
 54 |   next if /^#/;
 55 |   if(/^\s*$/){next;}
 56 |   chomp;
 57 |   my @arr=split(/\s+/); #VCF fields: [0] chromosome, [1] position, [3] REF, [4] ALT
 58 |   $num++;
 59 |   my $tem1; #left flank: up to 10000 reference bases immediately before the variant position
 60 |   if($arr[1]>=10001){$tem1=substr($str{$arr[0]},$arr[1]-10001,10000);}
 61 |   else{$tem1=substr($str{$arr[0]},0,$arr[1]-1);} #position is 1-based, so the flank is bases 1..POS-1; taking POS bases duplicated the first REF base
 62 |   my $tem2=substr($str{$arr[0]},$arr[1]+length($arr[3])-1,10000); #right flank: up to 10000 bases after the REF allele
 63 | #  print substr($str{$arr[0]},$arr[1]-2001,4000+length($arr[3])),"\n";
 64 |   $ref=$tem1.$arr[3].$tem2;
 65 |   my $key; #per-variant prefix for all temporary files
 66 |   if(defined $id)
 67 |   {
 68 |     $key=$id.'_'.$arr[0].'_'.$num;
 69 |   }
 70 |   else{$key=$arr[0].'_'.$num;}
 71 |   my $fafile=$key.'.fa';
 72 |   if( -e $fafile)
 73 |   {
 74 |     unlink  $fafile;
 75 |   }
 76 |   print_out($key,'ref',$ref);
 77 |   $mod_ref=$tem1.$arr[4].$tem2;
 78 |   print_out($key,'alt',$mod_ref);
 79 |   my $start=$arr[1]-2000; $start=1 if $start<1; #clamp so variants near the chromosome start do not produce a zero or negative region start
 80 |   my $end=$arr[1]+2000+length($arr[3]);
 81 |   print $start, "\t", $end, "\n";
 82 |   
 83 | #  system(" samtools view $bam_file $arr[0]:$start-$end| awk '{print \"@\"\$1;print \$10;print \"+\";print \$11;}' > ${key}_reads.fastq ");
 84 |   system(" samtools view $bam_file $arr[0]:$start-$end -b > ${key}_reads.bam");
 85 |   `java -jar /gsc/scripts/pkg/bio/picard/picard-tools-1.92/SamToFastq.jar VALIDATION_STRINGENCY=LENIENT I=${key}_reads.bam F=${key}_reads_1.fastq F2=${key}_reads_2.fastq`;
 86 |   #NOTE: the SamToFastq.jar path above is hard-coded to one site's installation; edit it for your system
 87 | 
 88 | print "***start $key bwa **********\n";
 89 |   my $bwa_index="bwa index $key.fa";
 90 |   system ("$bwa_index");
 91 |   my $samtools_faidx="samtools faidx $key.fa";
 92 |   system ("$samtools_faidx");
 93 |   my $bwa_cmd1 = "bwa aln -t4 -q 5 $key.fa ${key}_reads_1.fastq > ${key}_1.sai";
 94 |   system ("$bwa_cmd1");
 95 |   my $bwa_cmd2 = "bwa aln -t4 -q 5 $key.fa ${key}_reads_2.fastq > ${key}_2.sai";
 96 |   system ("$bwa_cmd2");
 97 |   my $bwa_cmd = "bwa sampe -a 600 $key.fa ${key}_1.sai ${key}_2.sai ${key}_reads_1.fastq ${key}_reads_2.fastq > $key.sam";
 98 |   system ("$bwa_cmd");
 99 | 
100 |   #my $samtools_view = "samtools view -bS $key.sam > $key.bam"; 
101 |   #system ("$samtools_view");
102 |   #my $samtools_sort = "samtools sort $key.bam $key.sort"; 
103 |   #system ("$samtools_sort");
104 |   #my $samtools_index = "samtools index $key.sort.bam"; 
105 |   #system ("$samtools_index");
106 | 
107 |   my $samfile=$key.'.sam';
108 |   my $head_len=length($tem1); #length of the left flank, used by analysis() to locate the allele
109 |   &analysis($samfile,$head_len,\@arr, $MQ_cutoff, $out_file, $vcf_file);
110 |   
111 |   my $rm_bam="rm -rf ${key}_reads.bam";
112 |   system ("$rm_bam");
113 |   my $rm_fa="rm -rf $key.fa*";
114 |   system ("$rm_fa");
115 |   my $rm_sai="rm -rf ${key}_*.sai";
116 |   system ("$rm_sai");
117 |   my $rm_sam="rm -rf $key.sam";
118 |   system ("$rm_sam");
119 |   my $rm_fastq="rm -rf ${key}_reads_*.fastq";
120 |   system ("$rm_fastq");
121 | }
122 | 
123 | sub analysis #Count reads in a SAM file that carry the ref or alt allele and append one line of counts to the stats file
124 | {
125 |   my $file=shift; #SAM file to read
126 |   my $head_len=shift; #number of bases placed before the allele in the 'ref' and 'alt' sequences
127 |   my $arrref=shift; #reference to the VCF fields; element [3] is compared for 'ref' reads and [4] for 'alt' reads
128 |   my $MQ=shift; #minimum value of SAM column 5 for a read to be counted
129 |   my $out_file=shift; #stats file, opened in append mode
130 |   my $vcf_name=shift; #written as the first column of the output line
131 |   open SAM,"<$file" or die "can not open sam file:$!";
132 |   open STAT,">>$out_file" or die "can not open stats file:$!";
133 |   my $ref_num=0;
134 |   my $alt_num=0;
135 |   while(<SAM>)
136 |   {
137 |     my @arr=split; #SAM columns: [2] reference name, [3] position, [4] mapping quality, [9] read sequence
138 |     if($arr[2] eq 'alt') #read aligned to the 'alt' sequence
139 |     {
140 |       if($arr[3] + length($arr[9]) <= $head_len + length(${$arrref}[4]) or $arr[3]>$head_len+1){next;} #skip reads whose start and length (CIGAR is not consulted) do not cover the whole allele
141 |       if(substr($arr[9],$head_len-$arr[3]+1,length(${$arrref}[4])) eq ${$arrref}[4] and $arr[4] >= $MQ) {$alt_num++;} #read bases at the allele offset equal the alt allele
142 |     }
143 |     if($arr[2] eq 'ref') #read aligned to the 'ref' sequence
144 |     {
145 |       if($arr[3]+length($arr[9]) <=$head_len+length(${$arrref}[3]) or $arr[3]>$head_len+1){next;}
146 |       if(substr($arr[9],$head_len-$arr[3]+1,length(${$arrref}[3])) eq ${$arrref}[3] and $arr[4] >= $MQ) {$ref_num++;} #read bases at the allele offset equal the ref allele
147 |     }
148 |   } 
149 | 
150 |   close SAM;
151 |   $"="\t"; #join interpolated array elements with tabs (set globally and not restored)
152 |   print STAT $vcf_name, "\t@{$arrref}\t";
153 |   print STAT "\tref\t",$ref_num,"\t"; #the leading tab follows the trailing tab above, leaving an empty field
154 |   print STAT "\talt\t",$alt_num,"\n";
155 |   close STAT;
156 | }
157 | 
158 | sub print_out #Append one FASTA record to "$k.fa": header line ">$head", then $seq wrapped at 60 characters per line
159 | {
160 |   my $k=shift; #output file name without the ".fa" extension
161 |   my $head=shift; #record name written after ">"
162 |   my $seq=shift; #sequence to write
163 |   open REFOUT,'>>',"$k.fa" or die "can not open $k.fa file:$!";
164 |   print REFOUT ">",$head,"\n";
165 |   for(my $n=0;$n<length($seq);$n+=60) #strictly less than: "<=" wrote an extra blank line when the length was a multiple of 60
166 |   {
167 |     print REFOUT substr($seq,$n,60),"\n";
168 |   }
169 |   close REFOUT;
170 | }
171 | 
172 | exit;
173 | 


--------------------------------------------------------------------------------