├── directory_structure.001.jpeg
├── SNP_calling
│   └── scripts
│       ├── 5_merge_sample_bams.sh
│       ├── 8_haplotype_caller.sh
│       ├── 4_dedup.sh
│       ├── 7_recal.sh
│       ├── 6_dedup.sh
│       ├── 9_consolidate_genotypes.sh
│       ├── 2_bwa_align.sh
│       ├── 3_add_readgroups.sh
│       ├── 1_qc_clean.sh
│       └── 10_refine_filter.sh
└── README.md

/directory_structure.001.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josieparis/gatk-snp-calling/HEAD/directory_structure.001.jpeg
--------------------------------------------------------------------------------
/SNP_calling/scripts/5_merge_sample_bams.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
#SBATCH -p pq
#SBATCH --time=02:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A 
#SBATCH --job-name=master_merge_bams
#SBATCH --error=master_merge_bams.err.txt
#SBATCH --output=master_merge_bams.out.txt
#SBATCH --export=All
#SBATCH --array=1-40%10


### Script to merge bams from multiple fastq reads

## Load your system modules
# Required modules are: picard tools
module purge
module load picard/2.6.0-Java-1.8.0_131

## Set your master path
MASTER=

## Fill in directories if different from the workspace setup
bam_path=$MASTER/SNP_calling/bams/interim_bams

## Fill in path for population specific metadata
metadata=$MASTER/SNP_calling/metadata.tsv

## Path to picard:
EBROOTPICARD=

### The sample IDs are extracted from the metadata
i=$(cut -f 1 $metadata | tail -n+2 | uniq | sed -n "${SLURM_ARRAY_TASK_ID}p")

# Find all bams for this individual and merge
mfiles=`ls -1 ${bam_path}/${i}_*sorted.dups.bam`
cmd=""
for file in $mfiles
do
cmd+="I=$file "
done
jcmd="java -Xmx10g -jar $EBROOTPICARD/picard.jar MergeSamFiles $cmd O=$bam_path/${i}.merged.bam TMP_DIR=/gpfs/ts0/scratch/mv323/tmp"
$jcmd

--------------------------------------------------------------------------------
/SNP_calling/scripts/8_haplotype_caller.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
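## Optional: HaplotypeCaller runs below in GVCF mode (-ERC GVCF), one task per
## sample; outputs are consolidated later by 9_consolidate_genotypes.sh. For a
## quick smoke test before launching the full array, you could restrict the
## caller to one region with -L (interval and file names here are placeholders):
# gatk HaplotypeCaller -R reference.fa.gz -I sample.recal.bam -L chr1:1-100000 -O test_g.vcf.gz -ERC GVCF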
#SBATCH -p pq
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A 
#SBATCH --job-name=haplotype_caller
#SBATCH --error=haplotype_caller.err.txt
#SBATCH --output=haplotype_caller.out.txt
#SBATCH --export=All
#SBATCH --array=1-20%5

### Script to run haplotype caller

# Required modules are: GATK4, samtools
module load GATK/4.0.5.1-foss-2018a-Python-3.6.4 SAMtools/1.9-foss-2018b

## Set your master path
MASTER=

## Fill in path for population specific metadata
metadata=$MASTER/SNP_calling/metadata.tsv

## Fill in directories if different from the workspace setup
## Also add path to reference
bam_in=$MASTER/SNP_calling/bams/clean_bams
gvcfs=$MASTER/SNP_calling/bams/gvcfs
reference=$MASTER/reference.fa.gz

## In array ##
insampleID_array=( `cat $metadata | cut -f 1` )
insampleID=$bam_in/${insampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Out array
outsampleID_array=( `cat $metadata | cut -f 1` )
outsampleID=$gvcfs/${outsampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Run haplotype caller
gatk --java-options "-Xmx10g" HaplotypeCaller \
-R $reference \
-I ${insampleID}.recal.bam \
-O ${outsampleID}_g.vcf.gz -ERC GVCF
--------------------------------------------------------------------------------
/SNP_calling/scripts/4_dedup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
#SBATCH -p pq
#SBATCH --time=02:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A 
#SBATCH --job-name=index_dedup_master
#SBATCH --error=index_dedup_master.err.txt
#SBATCH --output=index_dedup_master.out.txt
#SBATCH --export=All
#SBATCH --array=1-125%20

### Script to mark and remove duplicates

## Load your system modules
# Required modules are: picard tools
module purge
module load picard/2.6.0-Java-1.8.0_131

## Set your master path
MASTER=

## Fill in directories if different from the workspace setup
bam_in=$MASTER/SNP_calling/bams/interim_bams
bam_out=$MASTER/SNP_calling/bams/interim_bams

## Picard path:
EBROOTPICARD=

## Fill in path for population specific metadata
metadata=$MASTER/SNP_calling/metadata.tsv

## In array ##
insampleID_array=( `cat $metadata | cut -f 2` )
insampleID=$bam_in/${insampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Out array
outsampleID_array=( `cat $metadata | cut -f 2` )
outsampleID=$bam_out/${outsampleID_array[(($SLURM_ARRAY_TASK_ID))]}


## Run picard tools MarkDuplicates
java -Xmx10g -jar $EBROOTPICARD/picard.jar MarkDuplicates \
I=$insampleID.sorted.rg.bam \
O=$outsampleID.sorted.dups.bam \
METRICS_FILE=$outsampleID.metrics.txt \
REMOVE_DUPLICATES=true \
VALIDATION_STRINGENCY=LENIENT AS=true
## optional: TMP_DIR= can be added as a final argument above

## Index the deduped bam files
java -Xmx10g -jar $EBROOTPICARD/picard.jar BuildBamIndex \
I=$outsampleID.sorted.dups.bam VALIDATION_STRINGENCY=LENIENT

--------------------------------------------------------------------------------
/SNP_calling/scripts/7_recal.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
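## Optional one-off preparation: the GATK steps below expect a .fai index and a
## sequence dictionary next to the reference, and an indexed known-sites VCF.
## A minimal sketch, using placeholder file names:
# samtools faidx reference.fasta
# java -jar $EBROOTPICARD/picard.jar CreateSequenceDictionary R=reference.fasta
# tabix -p vcf known_sites.vcf.gz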
#SBATCH -p pq
#SBATCH --time=02:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A 
#SBATCH --job-name=master_base_recal
#SBATCH --error=master_base_recal.err.txt
#SBATCH --output=master_base_recal.out.txt
#SBATCH --export=All
#SBATCH --array=1-25%5

### Script to perform base quality score recalibration on the bam files

# Required modules are: GATK4, picard tools
module load GATK/4.0.5.1-foss-2018a-Python-3.6.4
module load picard/2.6.0-Java-1.8.0_131

## Set your master path
MASTER=

## Fill in path for population specific metadata
metadata=$MASTER/SNP_calling/metadata.tsv

## Fill in directories if different from the workspace setup
## Also add path to indexed reference and recalibration (known-sites) file
reference=
recal=
bam_in=$MASTER/SNP_calling/bams/interim_bams
bam_out=$MASTER/SNP_calling/bams/clean_bams

## This just catches the array in case it's running for a value with no individual (this screws with the outputs)
IND_N=$(cut -f1 $metadata | tail -n+2 | uniq | awk 'NF > 0' | wc -l)

if [ $SLURM_ARRAY_TASK_ID -le $IND_N ]
then

## In array ##
insampleID_array=( `cat $metadata | cut -f 1` )
insampleID=$bam_in/${insampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Out array
outsampleID_array=( `cat $metadata | cut -f 1` )
outsampleID=$bam_out/${outsampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Apply base quality score recalibration

## BaseRecal
gatk --java-options "-Xmx8g" BaseRecalibrator \
-I $insampleID.merged.sorted.dups.bam \
-R $reference --known-sites $recal \
-O $outsampleID.table

## ApplyBQSR
gatk --java-options "-Xmx8g" ApplyBQSR \
-I $insampleID.merged.sorted.dups.bam \
-R $reference --bqsr-recal-file $outsampleID.table \
-O $outsampleID.recal.bam

## Index the recalibrated bam files
java -Xmx10g -jar $EBROOTPICARD/picard.jar BuildBamIndex \
I=$outsampleID.recal.bam VALIDATION_STRINGENCY=LENIENT

fi
--------------------------------------------------------------------------------
/SNP_calling/scripts/6_dedup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
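## Optional QC: MarkDuplicates writes a metrics file per sample; the
## PERCENT_DUPLICATION column is worth checking after this step. A minimal
## sketch, assuming the output naming used below:
# grep -A2 "^## METRICS CLASS" sample.merged.sorted.dups.metrics.txt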
#SBATCH -p pq
#SBATCH --time=02:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A Research_Project-T110748
#SBATCH --job-name=master_dedup_merged
#SBATCH --error=master_dedup_merged.err.txt
#SBATCH --output=master_dedup_merged.out.txt
#SBATCH --export=All
#SBATCH --array=1-50%10


### Script to dedup merged bams

## Load your system modules
# Required modules are: picard tools
module load picard/2.6.0-Java-1.8.0_131

## Set your master path
MASTER=/gpfs/ts0/home/jrp228/NERC/people/josie/github_test/gatk-snp-calling

## Fill in directories if different from the workspace setup
bam_path=$MASTER/SNP_calling/bams/interim_bams

## Fill in path for population specific metadata
metadata=$MASTER/SNP_calling/metadata.tsv

## In array ##
insampleID_array=( `cat $metadata | cut -f 1` )
insampleID=$bam_path/${insampleID_array[(($SLURM_ARRAY_TASK_ID))]}

## Out array ##
outsampleID_array=( `cat $metadata | cut -f 1` )
outsampleID=$bam_path/${outsampleID_array[(($SLURM_ARRAY_TASK_ID))]}


## This just catches the array in case it's running for a value with no individual (this screws with the outputs)
IND_N=$(cut -f1 $metadata | tail -n+2 | uniq | awk 'NF > 0' | wc -l)

if [ $SLURM_ARRAY_TASK_ID -le $IND_N ]
then

## Mark duplicates in the merged bam files ##
java -Xmx10g -jar $EBROOTPICARD/picard.jar MarkDuplicates \
I=$insampleID.merged.bam \
O=$outsampleID.merged.sorted.dups.bam \
METRICS_FILE=$outsampleID.merged.sorted.dups.metrics.txt \
REMOVE_DUPLICATES=true \
VALIDATION_STRINGENCY=LENIENT AS=true \
TMP_DIR=/gpfs/ts0/scratch/mv323/tmp

## Index the deduped merged bam files
java -Xmx10g -jar $EBROOTPICARD/picard.jar BuildBamIndex \
I=$outsampleID.merged.sorted.dups.bam VALIDATION_STRINGENCY=LENIENT


fi
--------------------------------------------------------------------------------
/SNP_calling/scripts/9_consolidate_genotypes.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
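## Note: this job runs one array task per interval, where an interval is one
## sequence name taken from the reference .fai (see the awk/sed line below).
## The --array range should therefore match the number of sequences, e.g.:
# wc -l < reference.fasta.fai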
#SBATCH -p pq
#SBATCH --time=24:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH -A Research_Project-T110748
#SBATCH --job-name=FIBR_STAR_batch_newQual
#SBATCH --error=FIBR_STAR_batch_newQual.err.txt
#SBATCH --output=FIBR_STAR_batch_newQual.out.txt
#SBATCH --export=All
#SBATCH --array=1-267%20

### Script to combine gVCFs and then genotype across the cohort

# Required modules are: GATK4, bcftools
module load GATK/4.0.5.1-foss-2018a-Python-3.6.4
module load BCFtools/1.9-foss-2018b

## Set your master path
MASTER=/gpfs/ts0/home/jrp228/NERC/people/josie/github_test/gatk-snp-calling

## Fill in directories if different from the workspace setup
## Also add path to reference
gvcfs=$MASTER/SNP_calling/bams/gvcfs
output_vcfs=$MASTER/SNP_calling/vcfs/intervals
reference=/gpfs/ts0/home/jrp228/startup/STAR/STAR.chromosomes.release.fasta

## Name your dataset
DATASET=FIBR_STAR

# Make the command for variants
infiles=(`ls -1 ${gvcfs}/*_g.vcf.gz`)
let len=${#infiles[@]}-1
cmd=""
for i in `seq 0 $len`
do
cmd+="--variant ${infiles[$i]} "
done

# Fetch the interval of interest
interval=$(awk '{print $1}' ${reference}.fai | sed "${SLURM_ARRAY_TASK_ID}q;d")
echo $interval
# Remove if done already
rm -rf $gvcfs/INTERVAL_${SLURM_ARRAY_TASK_ID}_db

# Make database
gatk --java-options "-Xmx16g -Xms4g" GenomicsDBImport \
$cmd \
--genomicsdb-workspace-path $gvcfs/INTERVAL_${SLURM_ARRAY_TASK_ID}_db \
--reader-threads 2 \
--intervals $interval

# Genotype
cd $output_vcfs

gatk --java-options "-Xmx16g -Xms4g" GenotypeGVCFs \
-R $reference \
-V gendb://$gvcfs/INTERVAL_${SLURM_ARRAY_TASK_ID}_db \
-O ${interval}.vcf.gz

# Tidy
rm -rf $gvcfs/INTERVAL_${SLURM_ARRAY_TASK_ID}_db

### Run this when the above is all good
ls $output_vcfs/*vcf.gz >> batch_filter.txt
grep -Fxf batch_filter.txt batch_inputs.txt > batch_inputs_2.txt
bcftools concat -o $output_vcfs/${DATASET}_cohort_batch_genotyped.g.vcf -f $output_vcfs/batch_inputs_2.txt

--------------------------------------------------------------------------------
/SNP_calling/scripts/2_bwa_align.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
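## Optional one-off preparation: bwa mem needs the reference indexed with bwa
## before this array is launched. A minimal sketch, using a placeholder name:
# bwa index reference.fasta
# samtools faidx reference.fasta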
3 | #SBATCH -p pq 4 | #SBATCH --time=1:00:00 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-node=4 7 | #SBATCH -A 8 | #SBATCH --job-name=bwa_align 9 | #SBATCH --error=logs/bwa_align.err.txt 10 | #SBATCH --output=logs/bwa_align.out.txt 11 | #SBATCH --export=All 12 | #SBATCH --array=1-10 13 | ###SBATCH --array=1-20%5 14 | 15 | ### Script to run a bwa mem, bam and sort bams 16 | 17 | # Load modules 18 | # Required modules are: bwa, samtools 19 | module load BWA/0.7.17-foss-2018a 20 | module load SAMtools/1.3.1-foss-2018a 21 | 22 | ## Set your master path 23 | MASTER= 24 | 25 | ## Fill in directories if different from the workspace setup 26 | ## Also add path to indexed reference 27 | reference=$MASTER/reference.fasta 28 | input_reads=$MASTER/SNP_calling/reads/clean_reads 29 | bam_dir=$MASTER/SNP_calling/bams/raw_bams 30 | 31 | ## Fill in path for population specific metadata 32 | metadata=$MASTER/SNP_calling/metadata.tsv 33 | 34 | ## Read 1 array ## 35 | read1_array=(`cat $metadata | cut -f 3`) 36 | read1=$input_reads/${read1_array[(($SLURM_ARRAY_TASK_ID))]} 37 | 38 | ## Read 2 array ## 39 | read2_array=( `cat $metadata | cut -f 4` ) 40 | read2=$input_reads/${read2_array[(($SLURM_ARRAY_TASK_ID))]} 41 | 42 | ## Output array ## 43 | out_array=( `cat $metadata | cut -f 2` ) 44 | bam_out=$bam_dir/${out_array[(($SLURM_ARRAY_TASK_ID))]} 45 | 46 | echo "reference" $reference 47 | echo "read1" $read1 48 | echo "read2" $read2 49 | echo "alignment" ${bam_out}.unsorted.raw.sam 50 | 51 | ## Align with bwa mem using 4 cores. Again, make sure the read name prefixes match in the metadata 52 | bwa mem -t 4 $reference $read1 $read2 > $bam_out.unsorted.raw.sam 53 | 54 | ## Convert bam to sam, sort bam, index, flagstat 55 | samtools view -bS $bam_out.unsorted.raw.sam > $bam_out.unsorted.raw.bam 56 | samtools sort $bam_out.unsorted.raw.bam -o $bam_out.sorted.raw.bam 57 | samtools index $bam_out.sorted.raw.bam 58 | samtools flagstat $bam_out.sorted.raw.bam > $bam_out.mappingstats.txt 59 | 60 | # ## Remove the sam and unsorted bam files 61 | rm $bam_dir/*.sam 62 | rm $bam_dir/*.unsorted.raw.bam 63 | 64 | ## To check that the bams are not corrupted, run (in the directory where the bams are): 65 | 66 | # samtools quickcheck -v *.sorted.raw.bam > bad_bams.fofn && echo 'all ok' || echo 'some files failed check, see bad_bams.fofn' 67 | 68 | -------------------------------------------------------------------------------- /SNP_calling/scripts/3_add_readgroups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -D . 
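## Optional check: after this step each bam should carry an @RG line built from
## the metadata columns. A minimal sketch, assuming the output naming used below:
# samtools view -H sample.sorted.rg.bam | grep "^@RG"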
3 | #SBATCH -p pq 4 | #SBATCH --time=00:60:00 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-node=2 7 | #SBATCH -A 8 | #SBATCH --job-name=add_readgroups 9 | #SBATCH --error=add_readgroups.err.txt 10 | #SBATCH --output=add_readgroups.out.txt 11 | #SBATCH --export=All 12 | #SBATCH --array=1-3 13 | 14 | ### Script to add readgroups to bams 15 | 16 | ## Load your system modules 17 | # Required modules are picard tools 18 | module load picard/2.6.0-Java-1.8.0_131 19 | 20 | ## Set your master path 21 | MASTER= 22 | 23 | ## Fill in path for population specific metadata ## 24 | metadata=$MASTER/SNP_calling/metadata.tsv 25 | bam_in=$MASTER/SNP_calling/bams/raw_bams 26 | bam_out=$MASTER/SNP_calling/bams/interim_bams 27 | 28 | # Metadata file for each population 29 | # 1 simple_ID 30 | # 2 sample_ID 31 | # 3 read1 32 | # 4 read2 33 | # 5 instrument 34 | # 6 flowcell 35 | # 7 lane 36 | # 8 barcode 37 | # 9 sex 38 | # 10 run_num 39 | # 11 seq_num 40 | 41 | simpleID_array=( `cat $metadata | cut -f 1` ) 42 | simpleID=${simpleID_array[(($SLURM_ARRAY_TASK_ID))]} 43 | 44 | instrument_array=( `cat $metadata | cut -f 5` ) 45 | instrument=${instrument_array[(($SLURM_ARRAY_TASK_ID))]} 46 | 47 | seqnum_array=( `cat $metadata | cut -f 11` ) 48 | seqnum=${seqnum_array[(($SLURM_ARRAY_TASK_ID))]} 49 | 50 | flowcell_array=( `cat $metadata | cut -f 6` ) 51 | flowcell=${flowcell_array[(($SLURM_ARRAY_TASK_ID))]} 52 | 53 | lane_array=( `cat $metadata | cut -f 7` ) 54 | lane=${lane_array[(($SLURM_ARRAY_TASK_ID))]} 55 | 56 | barcode_array=( `cat $metadata | cut -f 8` ) 57 | barcode=${barcode_array[(($SLURM_ARRAY_TASK_ID))]} 58 | 59 | ## In array 60 | insampleID_array=( `cat $metadata | cut -f 2` ) 61 | insampleID=$bam_in/${insampleID_array[(($SLURM_ARRAY_TASK_ID))]} 62 | 63 | ## Out array 64 | outsampleID_array=( `cat $metadata | cut -f 2` ) 65 | outsampleID=$bam_out/${outsampleID_array[(($SLURM_ARRAY_TASK_ID))]} 66 | 67 | ## NB $EBROOTPICARD is the path to your install of picard 68 | ## Run picard tools AddreplaceRGs 69 | java -Xmx10g -jar $EBROOTPICARD/picard.jar AddOrReplaceReadGroups \ 70 | I=${insampleID}.sorted.raw.bam \ 71 | O=${outsampleID}.sorted.rg.bam \ 72 | RGSM=${simpleID} \ 73 | RGLB=${simpleID}.${seqnum} \ 74 | RGID=${flowcell}.${lane} \ 75 | RGPU=${flowcell}${lane}.${barcode} \ 76 | RGPL=${instrument} 77 | 78 | ## Index the readgroup bam files 79 | java -Xmx10g -jar $EBROOTPICARD/picard.jar BuildBamIndex \ 80 | I=${outsampleID}.sorted.rg.bam VALIDATION_STRINGENCY=LENIENT 81 | -------------------------------------------------------------------------------- /SNP_calling/scripts/1_qc_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -D . 
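## Note: the --array range below should cover the number of rows in your
## metadata (excluding the header). A quick way to count them, assuming the
## default workspace layout:
# tail -n+2 SNP_calling/metadata.tsv | wc -l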
#SBATCH -p sq
#SBATCH --time=1:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH -A 
#SBATCH --job-name=qc_clean
#SBATCH --error=logs/qc_clean.err.txt
#SBATCH --output=logs/qc_clean.out.txt
#SBATCH --export=All
#SBATCH --array=1-10

## Load your system modules
# Required modules are: FastQC, Python2, Trim_Galore
module purge
module load FastQC/0.11.7-Java-1.8.0_162 Python/2.7.9-intel-2016b Trim_Galore/0.4.5-foss-2016b

## Set your master path
MASTER=

## Fill in path for population specific metadata ##
metadata=$MASTER/SNP_calling/metadata.tsv

## Fill in directories if different from the workspace setup
raw_reads=$MASTER/SNP_calling/reads/raw_reads
clean_reads=$MASTER/SNP_calling/reads/clean_reads
fastqc_raw=$MASTER/SNP_calling/reads/raw_reads/fastqc
fastqc_clean=$MASTER/SNP_calling/reads/clean_reads/fastqc

## Create arrays holding the read names and sample IDs listed in the metadata
read1_array=( `cat $metadata | cut -f 3` )
read1=$raw_reads/${read1_array[(($SLURM_ARRAY_TASK_ID))]}

read2_array=( `cat $metadata | cut -f 4` )
read2=$raw_reads/${read2_array[(($SLURM_ARRAY_TASK_ID))]}

out_array=( `cat $metadata | cut -f 2` )
out=$clean_reads/${out_array[(($SLURM_ARRAY_TASK_ID))]}


## Testing that all variables are working correctly
echo "read1" $read1
echo "read2" $read2
echo "output directory" $clean_reads

## Run fastqc on raw reads
fastqc ${read1} ${read2} -o $fastqc_raw

## Run trim_galore with default settings (but adjust adaptors if needed)
trim_galore -q 20 --path_to_cutadapt cutadapt -o ${clean_reads} --paired ${read1} ${read2}

## You will probably need to change the suffix of these file names, depending on your read suffix.
## Alternatively, run fastqc and trim_galore, add the clean reads to your metadata and then create another array with the clean reads to be used here
## Run fastqc on clean reads
fastqc ${out}_val_1.fq.gz ${out}_val_2.fq.gz -o ${fastqc_clean}


--------------------------------------------------------------------------------
/SNP_calling/scripts/10_refine_filter.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -D .
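## Optional QC: it is worth counting how many records survive each filtering
## stage below; a simple sketch using the first intermediate file:
# grep -vc "^#" ${DATASET}_SNP_filter.vcf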
3 | #SBATCH -p pq 4 | #SBATCH --time=72:00:00 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-node=16 7 | #SBATCH -A Research_Project-T110748 8 | #SBATCH --job-name=five_aside_STAR_refine_filter 9 | #SBATCH --error=FIBR_STAR_refine_filter.err.txt 10 | #SBATCH --output=FIBR_STAR_refine_filter.out.txt 11 | #SBATCH --export=All 12 | #SBATCH --mail-type=END 13 | #SBATCH --mail-user=mv323@exeter.ac.uk 14 | 15 | ### Script to filter and refine vcfs ### 16 | 17 | module load GATK/4.0.5.1-foss-2018a-Python-3.6.4 18 | module load VCFtools/0.1.16-foss-2018b-Perl-5.28.0 19 | 20 | reference=/gpfs/ts0/home/mv323/lustre/start_up_data/STAR/STAR.chromosomes.release.fasta 21 | raw_vcf=/gpfs/ts0/home/mv323/lustre/start_up_data/FIBR/STAR/data/FIBR_gvcfs/FIBR_STAR_cohort_batch_genotyped.g.vcf 22 | DATASET=FIBR_STAR 23 | 24 | WORKING_DIR=/gpfs/ts0/home/mv323/lustre/start_up_data/FIBR/STAR/data/FIBR_gvcfs 25 | 26 | SNP_filtered=$WORKING_DIR/${DATASET}_SNP_filter.vcf 27 | gatk_filter_flag=$WORKING_DIR/${DATASET}_SNP_gatk_flagged.vcf 28 | gatk_filtered=$WORKING_DIR/${DATASET}_SNP_gatk_filtered 29 | allele_filtered=$WORKING_DIR/${DATASET}_SNP.minmax2.mindp5maxdp200.filtered 30 | maxmiss_filtered=$WORKING_DIR/${DATASET}_SNP.maxmiss50.filtered 31 | sex_maxmiss_filtered=$WORKING_DIR/${DATASET}_SNP.maxmiss10.filtered 32 | final_filtered=$WORKING_DIR/${DATASET}_pop_SNP.gatk.bi.miss.maf.final.filtered.depth4 33 | sex_final_filtered=$WORKING_DIR/${DATASET}_SEXY_pop_SNP.gatk.bi.miss.maf.final.filtered 34 | 35 | ## Popmaps ## - These have been filtered for low coverage individuals, make sure all are in one directory 36 | POPMAP_DIR=/gpfs/ts0/home/mv323/lustre/popmap/FIBR 37 | 38 | ## Processing ... 39 | 40 | ## Select only snps with the "snp_filter" 41 | #gatk --java-options "-Xmx20g" SelectVariants -R $reference -V $raw_vcf --select-type-to-include SNP -O $SNP_filtered 42 | 43 | ### This gatk step does not actually perform any filtering, it just applies the "snp_filter" tag to SNPs that would pass the filtering 44 | gatk --java-options "-Xmx20g" VariantFiltration -R $reference -V $SNP_filtered -O $gatk_filter_flag --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5" --filter-name "snp_filter" 45 | 46 | ### This stage actually filters out anything that doesn't have the "snp_filter" tag 47 | vcftools --vcf $gatk_filter_flag --recode --remove-filtered-all --out $gatk_filtered 48 | 49 | ### Use vcftools to filter remaining SNPS ## 50 | 51 | ## Retain only biallelic SNPs with a min depth of 5 and max depth of 200 52 | 53 | vcftools --vcf $gatk_filtered.recode.vcf --min-alleles 2 --max-alleles 2 --minDP 4 --maxDP 200 --recode --remove-filtered-all --out $allele_filtered 54 | 55 | pop_array=(GH GL C T LL UL) 56 | #pop_array=(APHP APLP) 57 | for pop in "${pop_array[@]}" 58 | do 59 | 60 | ## Split vcf file by population and filter by max missing 50 (for pop gen analyses) and max missing 10% for sex analysis 61 | vcftools --vcf $allele_filtered.recode.vcf --keep $POPMAP_DIR/${pop}.popmap --recode --remove-filtered-all --out ${allele_filtered}.${pop} 62 | vcftools --max-missing 0.5 --vcf $allele_filtered.${pop}.recode.vcf --recode --remove-filtered-all --out ${maxmiss_filtered}.${pop} 63 | #vcftools --max-missing 0.1 --vcf $allele_filtered.${pop}.recode.vcf --recode --remove-filtered-all --out ${sex_maxmiss_filtered}.${pop} 64 | 65 | ## Do this for every population ^^^ ## 66 | 67 | done 68 | 69 | ## Combine vcfs across poplns using gatk (one for population genetics and one 
for sex)

module unload GATK/4.0.5.1-foss-2018a-Python-3.6.4
module load GATK/3.8-0-Java-1.8.0_144

## NB Change --minimumN depending on how many populations you are combining

## Pop gen merging ##
java -Xmx20g -jar $EBROOTGATK/GenomeAnalysisTK.jar -l INFO -T CombineVariants -R $reference \
--variant $maxmiss_filtered.GL.recode.vcf \
--variant $maxmiss_filtered.GH.recode.vcf \
--variant $maxmiss_filtered.C.recode.vcf \
--variant $maxmiss_filtered.T.recode.vcf \
--variant $maxmiss_filtered.LL.recode.vcf \
--variant $maxmiss_filtered.UL.recode.vcf \
--minimumN 10 -o $maxmiss_filtered.merged.vcf

## Filter for minor allele frequency
vcftools --vcf $maxmiss_filtered.merged.vcf --maf 0.01 --recode --remove-filtered-all --out $final_filtered

## Sex merging ##
#java -Xmx20g -jar $EBROOTGATK/GenomeAnalysisTK.jar -l INFO -T CombineVariants -R $reference \
#--variant $sex_maxmiss_filtered.GL.recode.vcf \
#--variant $sex_maxmiss_filtered.GH.recode.vcf \
#--variant $sex_maxmiss_filtered.UQ.recode.vcf \
#--variant $sex_maxmiss_filtered.LO.recode.vcf \
#--variant $sex_maxmiss_filtered.LM.recode.vcf \
#--variant $sex_maxmiss_filtered.UM.recode.vcf \
#--minimumN 6 -o $sex_maxmiss_filtered.merged.vcf

# Filter for minor allele frequency
#vcftools --vcf $sex_maxmiss_filtered.merged.vcf --maf 0.05 --recode --remove-filtered-all --out $sex_final_filtered
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# gatk-snp-calling
Full GATK SNP calling pipeline

This set of scripts takes raw Illumina whole-genome sequencing reads as input and produces a filtered VCF file.

Disclaimer: These scripts work for us on our system, but there may be unforeseen idiosyncratic errors!

These scripts were written for a Slurm batch cluster system.

#### These scripts have been used to create VCF files in the following publications:

Fraser BA, Whiting JR, Paris JR, Weadick CJ, Parsons PJ, Charlesworth D, Bergero R, Bemm F, Hoffmann M, Kottler VA, Liu C, Dreyer C, Weigel D (2020). Improved reference genome uncovers novel sex-linked regions in the guppy (_Poecilia reticulata_). Genome Biology and Evolution, evaa187. https://doi.org/10.1093/gbe/evaa187

Whiting JR, Paris JR, van der Zee MJ, Parsons PJ, Weigel D, Fraser BA (2021). Drainage-structuring of ancestral variation and a common functional pathway shape limited genomic convergence in natural high- and low-predation guppies. PLoS Genetics. https://doi.org/10.1371/journal.pgen.1009566

van der Zee MJ, Whiting JR, Paris JR, Bassar RD, Travis J, Weigel D, Reznick DN, Fraser BA (2022). Rapid genomic convergent evolution in experimental populations of Trinidadian guppies (_Poecilia reticulata_).
Evolution Letters. https://doi.org/10.1002/evl3.272

Josephine R Paris, James R Whiting, Mitchel J Daniel, Joan Ferrer Obiol, Paul J Parsons, Mijke J van der Zee, Christopher W Wheat, Kimberly A Hughes, Bonnie A Fraser



### For these scripts to work, you need to set up a neat waterfall workspace

![directory_structure 001](https://user-images.githubusercontent.com/38511308/105726962-7f0cbc80-5f22-11eb-85b3-f7854e1c27b9.jpeg)


This directory structure is provided for you on the git clone, or else you can make it quickly yourself:

`mkdir SNP_calling && cd SNP_calling && mkdir scripts reads bams gvcfs vcfs && cd scripts && mkdir logs && cd .. && cd reads && mkdir raw_reads clean_reads && cd raw_reads && mkdir fastqc && cd ../clean_reads && mkdir fastqc && cd ../../ && cd bams && mkdir raw_bams interim_bams clean_bams && cd ../vcfs/ && mkdir interim_vcfs && mkdir intervals && cd ..`

#### Here's a list of the scripts and a brief description of what they do:

## 1_qc_clean.sh
Takes raw Illumina reads and runs fastqc, cleans the reads using trim_galore, then performs fastqc on the clean reads

## 2_bwa_align.sh
Aligns clean reads to a reference genome to form a sam, converts it to bam, then sorts, indexes and runs flagstat (for mapping stats). Includes a quick sanity check at the end to make sure the sorted.raw.bam files are in good shape

## 3_add_readgroups.sh
Adds readgroup information from a metadata file, where columns specify which read group info should be added

## 4_dedup.sh
##### NB Technically this first run of marking duplicates is not necessary because we will run it again per-sample, and that per-sample marking would be enough to achieve the desired result. We only do this round of marking duplicates for QC purposes
Marks duplicates in the bam files

## 5_merge_sample_bams.sh
Merges bams for samples whose data are split across multiple fastq files
##### NB This merging only needs to happen if you have multiple fastq files for one sample, i.e. one individual sample which has been run across multiple lanes, e.g. sample_1A.fastq sample_1B.fastq. If you have one set of reads per sample you can skip this script (and the next one too)

## 6_dedup.sh
Marks duplicates in bams from multiple lanes of sequencing

## 7_recal.sh
Recalibrates the bam files against a "truth-set" of SNPs

##### Truth-set vcfs are variants for which we have high confidence, and they tend to be generated from PCR-free high coverage libraries. If you don't have one of these available, skip this step. In such cases I recommend calling variants with GATK, and then also calling variants with another program (e.g. Freebayes). When the VCFs of each caller are complete you can intersect them using `bedtools intersect` and keep the SNPs which were called by both programs. If a variant has been called by both programs, this offers you some confidence (a sketch of this intersection is given below, after the script list).

## 8_haplotype_caller.sh
This script runs GATK's haplotype caller on your bams, and produces per-sample gvcf files that are consolidated in the next script
### NB This script takes the longest time to run

## 9_consolidate_genotypes.sh
Runs GATK4 GenomicsDBImport and GenotypeGVCFs

## 10_refine_filter.sh
Uses GATK4 SelectVariants, vcftools for various filters (user can choose!) and finally GATK3 CombineVariants to merge samples generated from multiple populations
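As mentioned in the `7_recal.sh` note above, if you have no truth-set you can gain some confidence by keeping only the SNPs called by two independent callers. A minimal sketch of that intersection, assuming hypothetical files `gatk_calls.vcf.gz` and `freebayes_calls.vcf.gz`:

```
# keep GATK records that overlap a Freebayes call, retaining the GATK header
bedtools intersect -u -header -a gatk_calls.vcf.gz -b freebayes_calls.vcf.gz > consensus_calls.vcf
```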
### Important info before running the scripts:
- Check which batch submission system your cluster is running, i.e. SGE, PBS, SLURM
- For each script, I've set the array number arbitrarily up to 10:
e.g. `#SBATCH --array=1-10`
- You need to change the array range depending on how many samples you are processing at each stage. Because the bash arrays in the scripts include the metadata header at index 0, the SLURM task IDs line up with the data rows, so `--array=1-10` covers the first 10 sample rows
- Any information which requires editing by you is included in chevrons, i.e. `#SBATCH -A ` and `MASTER=` - remove the chevrons and add your information here
- Any other specific comments for each script are included within the scripts themselves as comments

### Important Metadata
These scripts rely heavily on a metadata file, which you will have to create prior to running the pipeline.
The metadata file can have any information you require in it, e.g. sampling location, sex, sample ID etc. In fact, it's a good habit to have a metadata file such as this associated with any sequencing project. Below I provide an example of a metadata file structure. It's a tsv file, with columns and rows, and can easily be made in Excel and saved as a .tsv ;)


| simple_ID | sample_ID | read1 | read2 | instrument | flowcell | lane | barcode | sex | run_num | seq_num |
| ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- |
| APLP_F1 | APLP_F1_A_L001 | APLP_F1_A_L001_r1.fq.gz | APLP_F1_A_L001_r2.fq.gz | ILLUMINA | A | 1 | ATGCA | F | 44 | 1 |
| APLP_F1 | APLP_F1_A_L002 | APLP_F1_A_L002_r1.fq.gz | APLP_F1_A_L002_r2.fq.gz | ILLUMINA | A | 2 | ATGCA | F | 44 | 1 |
| APLP_F1 | APLP_F1_A_L003 | APLP_F1_A_L003_r1.fq.gz | APLP_F1_A_L003_r2.fq.gz | ILLUMINA | A | 3 | ATGCA | F | 41 | 1 |
| APLP_F2 | APLP_F2_A_L001 | APLP_F2_A_L001_r1.fq.gz | APLP_F2_A_L001_r2.fq.gz | ILLUMINA | A | 1 | GTCTA | F | 44 | 1 |
| APLP_F2 | APLP_F2_A_L002 | APLP_F2_A_L002_r1.fq.gz | APLP_F2_A_L002_r2.fq.gz | ILLUMINA | A | 2 | GTCTA | F | 44 | 1 |
| APLP_F2 | APLP_F2_A_L003 | APLP_F2_A_L003_r1.fq.gz | APLP_F2_A_L003_r2.fq.gz | ILLUMINA | A | 3 | CTAGA | F | 41 | 1 |
| APLP_M1 | APLP_M1_A_L001 | APLP_M1_A_L001_r1.fq.gz | APLP_M1_A_L001_r2.fq.gz | ILLUMINA | A | 1 | CAAGC | M | 44 | 1 |
| APLP_M1 | APLP_M1_B_L001 | APLP_M1_B_L001_r1.fq.gz | APLP_M1_B_L001_r2.fq.gz | ILLUMINA | B | 1 | CAAGC | M | 44 | 1 |
| APLP_M1 | APLP_M1_C_L001 | APLP_M1_C_L001_r1.fq.gz | APLP_M1_C_L001_r2.fq.gz | ILLUMINA | C | 1 | CAAGC | M | 44 | 1 |


#### Info on this metadata
simple_ID = name of individual

sample_ID = name of read pertaining to that individual

instrument = One of ILLUMINA, SOLID, LS454, HELICOS and PACBIO (must be in caps!)

flow_cell = flowcell ID that the sample was run on

lane = lane the sample was run on

run_num = run number of the fastq reads

seq_num = library prep number. Sometimes you have an index for this. We only did one library per sample, hence all are 1.
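Before launching the pipeline it is worth checking that the metadata file is well formed. A minimal sketch, assuming the 11-column layout above and the default location `SNP_calling/metadata.tsv`:

```
# every row should have the same number of tab-separated fields as the header
awk -F'\t' 'NF != 11 {print "check line " NR ": " $0}' SNP_calling/metadata.tsv
```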

In this example metadata, we have several fastq files for each sample.
For example, sample `APLP_F1` comprises three sets of fastq reads: `APLP_F1_A_L001`, `APLP_F1_A_L002`, `APLP_F1_A_L003`, and we can see in the metadata that they come from three different lanes of sequencing (1,2,3) on flow_cell A.
On the other hand, the reads for `APLP_F2` are also derived from the same flow_cell (A), across three lanes (1,2,3), but one of its read sets has a different barcode.
Finally, `APLP_M1` is sequenced on lane 1, but across three different flow cells (A, B, C).
This example is just for illustrative purposes so you can see the differences between the metadata columns.

#### Where do I get the metadata?
Much of this metadata can be collected from your fastq read headers:
@(instrument id):(run number):(flowcell ID):(lane):(tile):(x_pos):(y_pos) (read):(is filtered):(control number):(index sequence)

FLOWCELL_BARCODE = @(instrument id):(run number):(flowcell ID)

#### Depending on how you edit and put together your metadata, you will have to check each script to make sure it pulls out the correct column of data.
For example,

In `1_qc_clean.sh`, we take the third and fourth columns, which hold the names of the fastq reads

```
read1_array=( `cat $metadata | cut -f 3` )
read1=$raw_reads/${read1_array[(($SLURM_ARRAY_TASK_ID))]}

read2_array=( `cat $metadata | cut -f 4` )
read2=$raw_reads/${read2_array[(($SLURM_ARRAY_TASK_ID))]}
```

```
trim_galore -q 20 --path_to_cutadapt cutadapt -o $clean_reads --phred33 --paired ${read1} ${read2}
```

In `3_add_readgroups.sh` we use a lot of this information too ...

### Read Groups

#### RGSM
(sample name) = simple_ID
#### RGLB
(DNA preparation library identifier) = simple_ID.seq_num (or index identified from your library prep)
##### NB This is important to identify PCR duplicates in the MarkDuplicates step. You can ignore this readgroup for PCR-free libraries
#### RGID
(Read group identifier) = flow_cell.lane
#### RGPU
(Platform Unit) = flow_cell.lane.barcode
#### RGPL
(Platform) = instrument
##### NB takes one of ILLUMINA, SOLID, LS454, HELICOS and PACBIO - must be in caps!

You can get a lot of this read group information from your fastq files.

@(instrument id):(run_num):(flow_cell):(lane):(tile):(x_pos):(y_pos) (read):(filtered):(control_num):(index sequence)

#### NB This should be used as a guide only. Read group assignment changes depending on your library preparation set-up and type of sequencing data


## At the end of the process, we highly recommend running MultiQC on your directories to collect data on quality control:
https://multiqc.info/

##### With thanks to Bonnie Fraser, Mijke van der Zee and Jim Whiting



--------------------------------------------------------------------------------