├── images └── PATE.png ├── template.sh ├── helperScripts ├── getPloidy.pl ├── estimatePloidy.pl ├── getAB.pl └── PATE_formatInput.pl ├── LICENSE ├── ploidy.txt ├── PATE.ctl ├── README.md └── PATE.pl /images/PATE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gtiley/Phasing/HEAD/images/PATE.png -------------------------------------------------------------------------------- /template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=__RUNID__ 3 | #SBATCH --output=__LOGFILE__ 4 | #SBATCH --mail-user=__YOUR_EMAIL__ 5 | #SBATCH --mail-type=FAIL 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem-per-cpu=16000M 8 | #SBATCH --nodes=1 9 | #SBATCH --ntasks=1 10 | #SBATCH --cpus-per-task=1 11 | #SBATCH --qos=__YOUR_QUEUE__ 12 | #SBATCH --account=__YOUR_ACCOUNT__ 13 | 14 | module load picard/2.9.2 15 | module load bwa/0.7.17 16 | module load gatk/4.1.4.0 17 | module load samtools/1.10 18 | module load bamtools/2.1.1 19 | modele load R 20 | -------------------------------------------------------------------------------- /helperScripts/getPloidy.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $phasingRoot = $ARGV[0]; 5 | my $ploidyFile = $ARGV[1]; 6 | my $estimatedPloidyPath = $ARGV[2]; 7 | 8 | my %ploidies = (); 9 | 10 | open FH1,'<',"$phasingRoot/$ploidyFile"; 11 | while() 12 | { 13 | if (/^(\S+)\s+(\d+)\s+.*/) 14 | { 15 | my $tax = $1; 16 | my $startPloidy = $2; 17 | if ($startPloidy != 2) 18 | { 19 | print "Warning - starting ploidy for estimation was not 2 ($tax = $startPloidy) and results may be flawed\n"; 20 | } 21 | $ploidies{$tax} = $startPloidy; 22 | open FH2,'<',"$estimatedPloidyPath/$tax/$tax.modelSelection.txt"; 23 | while() 24 | { 25 | if (/Ploidy\:\s+(\d+)/) 26 | { 27 | my $estimatedPloidy = $1; 28 | $ploidies{$tax} = $estimatedPloidy; 29 | } 30 | } 31 | close FH2; 32 | } 33 | } 34 | close FH1; 35 | 36 | open OUT1,'>',"$phasingRoot/$ploidyFile.estimated"; 37 | foreach my $tax (keys %ploidies) 38 | { 39 | print OUT1 "$tax\t$ploidies{$tax}\n" 40 | } 41 | close OUT1; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 George P. Tiley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /helperScripts/estimatePloidy.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | $scheduler = $ARGV[0]; 4 | $template = $ARGV[1]; 5 | $ploidypath = $ARGV[2]; 6 | $helperpath = $ARGV[3]; 7 | $tax = $ARGV[4]; 8 | 9 | $scheduler =~ s/scheduler\_//; 10 | 11 | open OUT1,'>',"$ploidypath/$tax/$tax.mixturemodels.sh"; 12 | open FH1,'<',"$template"; 13 | while() 14 | { 15 | my $line = $_; 16 | chomp $line; 17 | $line =~ s/__RUNID__/$tax.mixturemodels.id/; 18 | $line =~ s/__LOGFILE__/$tax.mixturemodels.log/; 19 | print OUT1 "$line\n"; 20 | } 21 | close FH1; 22 | print OUT1 "$ploidypath/$tax\n"; 23 | open OUT2,'>',"$ploidypath/$tax/getPloidy.R"; 24 | print OUT2 "setwd(\"$ploidypath/$tax\");\n"; 25 | print OUT2 "source(\"$helperpath/Ks_plots/ploidy.test.R\")\;\nsource(\"$helperpath/Ks_plots/fitMixEM.R\")\;\nsource(\"$helperpath/Ks_plots/plotComponentExpectations.R\")\;\n"; 26 | print OUT2 "dat <- read.table(\"$ploidypath/$tax/$tax.ab\",header=TRUE,sep=\"\\t\")\;\n"; 27 | print OUT2 "ploidy.test(dat\$AB,maxPloidy=6,model=4,nstarts=100,outPrefix=\"$tax\")\;\n"; 28 | print OUT2 "quit();\n"; 29 | close OUT2; 30 | print OUT1 "R CMD BATCH $ploidypath/$tax/getPloidy.R\n"; 31 | close OUT1; 32 | system "$scheduler $ploidypath/$tax/$tax.mixturemodels.sh"; -------------------------------------------------------------------------------- /ploidy.txt: -------------------------------------------------------------------------------- 1 | UFG_393202_P004_WG01 4 Dryopteris celsa 2 | UFG_393202_P004_WH01 4 Dryopteris celsa 3 | UFG_393202_P077_WE12 4 Dryopteris celsa 4 | UFG_393202_P004_WB01 2 Dryopteris goldiana 5 | UFG_393202_P010_WG12 2 Dryopteris goldiana 6 | UFG_393202_P010_WH12 2 Dryopteris ludoviciana 7 | UFG_393202_P077_WA12 2 Dryopteris ludoviciana 8 | UFG_393202_P004_WE01 2 Dryopteris intermedia 9 | UFG_393202_P004_WF01 2 Dryopteris intermedia 10 | UFG_393202_P077_WE11 2 Dryopteris intermedia 11 | UFG_393202_P010_WD12 2 Dryopteris expansa 12 | UFG_393202_P077_WB12 2 Dryopteris expansa 13 | UFG_393202_P082_WD06 2 Dryopteris expansa 14 | UFG_393202_P004_WA01 4 Dryopteris campyloptera 15 | UFG_393202_P004_WD01 4 Dryopteris campyloptera 16 | UFG_393202_P082_WH06 4 Dryopteris campyloptera 17 | UFG_393202_P004_WC01 4 Dryopteris carthusiana 18 | UFG_393202_P010_WA12 4 Dryopteris carthusiana 19 | UFG_393202_P077_WB11 4 Dryopteris carthusiana 20 | UFG_393202_P010_WB12 6 Dryopteris clintoniana 21 | UFG_393202_P010_WC12 6 Dryopteris clintoniana 22 | UFG_393202_P047_WB12 6 Dryopteris clintoniana 23 | UFG_393202_P010_WE12 4 Dryopteris cristata 24 | UFG_393202_P010_WF12 4 Dryopteris cristata 25 | UFG_393202_P077_WC12 4 Dryopteris cristata 26 | UFG_393202_P050_WA01 2 Polystichum munitum 27 | UFG_393202_P050_WG01 2 Polystichum speciossisimum 28 | -------------------------------------------------------------------------------- /PATE.ctl: -------------------------------------------------------------------------------- 1 | #input and output paths 2 | PHASING_ROOT = __YOUR_PATH__/Phasing 3 | REF = __YOUR_PATH__/referenceFasta 4 | GENOTYPE_OUT = __YOUR_PATH__/genotypeOutput 5 | PHASE_OUT = __YOUR_PATH__/phasedOutput 6 | FASTA_OUT = __YOUR_PATH__/fastaOutput 7 | IUPAC_OUT = __YOUR_PATH__/iupacOutput 8 | SUMMARYSTATS_OUT = __YOUR_PATH__/summaryStatsOutput 9 | ESTPLOIDY_OUT = __YOUR_PATH__/estimatedPloidy 10 | FQ = __YOUR_PATH__/fastqFiles 11 | PLOIDY = __YOUR_PATH__/ploidy.txt 12 | 13 | 14 | #paths to software or just the binaries if already in your path 15 | BWA = bwa 16 | PICARD = picard 17 | GATK = gatk 18 | SAMTOOLS = samtools 19 | BAMTOOLS = bamtools 20 | BGZIP = bgzip 21 | TABIX = tabix 22 | HPOPG = YOUR_PATH/H-PoPG_2/H-PoPG/H-PoPGv0.2.0.jar 23 | SCHEDULER = sbatch 24 | 25 | #Path to the root directory of the helperScripts folder in the PATE repo 26 | HELPERSCRIPTS = __YOUR_PATH__/Phasing/helperScripts 27 | 28 | #gatk filter options 29 | GATK_FILTER_EXPRESSION = "QD < 2.0" "QD_lt2" 30 | GATK_FILTER_EXPRESSION = "FS > 60.0" "FS_gt60" 31 | GATK_FILTER_EXPRESSION = "MQ < 40.0" "MQ_lt40" 32 | GATK_FILTER_EXPRESSION = "ReadPosRankSum < -8.0" "ReadPosRankSum_ltm8" 33 | GATK_FILTER_EXPRESSION = "AF < 0.025" "AF_lt025" 34 | GATK_FILTER_EXPRESSION = "AF > 0.975" "AF_gt975" 35 | GATK_FILTER_EXPRESSION = "DP < 10" "DP_lt10" 36 | 37 | #Name of reference individual that matches the header in the reference fasta file 38 | REFERENCEIND = KJM225 39 | 40 | #Should only unique sequences be printed to the fasta files of all phased haplotype sequences (0 = no | 1 = yes) 41 | UNIQUE_ONLY = 1 42 | REMOVE_READ_DUPLICATES = 0 43 | OUTPUT_EXPECTED_DOSAGE = 0 44 | -------------------------------------------------------------------------------- /helperScripts/getAB.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $vcfFile = $ARGV[0]; 5 | my $outFileRoot = $ARGV[1]; 6 | my %ABdata = (); 7 | my %nvariants = (); 8 | my %chromosomes = (); 9 | my %positions = (); 10 | my @taxa = (); 11 | my $ntax = 0; 12 | open FH1,'<',"$vcfFile"; 13 | my $skippingHeader = 1; 14 | while () 15 | { 16 | my $line =$_; 17 | chomp $line; 18 | if ($line =~ m/^#CHROM/) 19 | { 20 | my @temp = (); 21 | @temp = split(/\s+/,$line); 22 | for my $i (9..(scalar(@temp)-1)) 23 | { 24 | my @refPath = split(/\//,$temp[$i]); 25 | $taxa[$i] = $refPath[(scalar(@refPath)-2)]; 26 | $nvariants{$taxa[$i]} = 0; 27 | $ntax++; 28 | } 29 | $skippingHeader = 0; 30 | } 31 | if ($skippingHeader == 0) 32 | { 33 | my @temp = (); 34 | @temp = split(/\s+/,$line); 35 | if ($temp[6] eq "PASS") 36 | { 37 | for my $i (9..(scalar(@temp)-1)) 38 | { 39 | my $filterValues = $temp[$i]; 40 | my @filterValuesVector = (); 41 | @filterValuesVector = split(/\:/,$filterValues); 42 | #print "@filterValuesVector\n"; 43 | if ($filterValuesVector[1] =~ m/(\d+)\,(\d+)/) 44 | { 45 | my $refAllele = $1; 46 | my $altAllele = $2; 47 | if ((($refAllele + $altAllele) >= 10) && ($refAllele > 1) && ($altAllele > 1)) 48 | { 49 | my $abValue = $altAllele/($refAllele + $altAllele); 50 | push @{$ABdata{$taxa[$i]}}, $abValue; 51 | push @{$chromosomes{$taxa[$i]}}, $temp[0]; 52 | push @{$positions{$taxa[$i]}}, $temp[1]; 53 | $nvariants{$taxa[$i]} = $nvariants{$taxa[$i]} + 1; 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } 60 | close FH1; 61 | 62 | for my $i (9..($ntax + 8)) 63 | { 64 | open OUT1,'>',"$outFileRoot/$taxa[$i]/$taxa[$i].ab"; 65 | print OUT1 "Chromosome\tPosition\tAB\n"; 66 | for my $j (0..($nvariants{$taxa[$i]} - 1)) 67 | { 68 | print OUT1 "$chromosomes{$taxa[$i]}[$j]\t$positions{$taxa[$i]}[$j]\t$ABdata{$taxa[$i]}[$j]\n"; 69 | } 70 | close OUT1; 71 | } 72 | exit; 73 | -------------------------------------------------------------------------------- /helperScripts/PATE_formatInput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | #----------------------------------------------------------------------------------------# 5 | #George P. Tiley and Andrew A. Crowl 6 | #25 May 2021 7 | #contact: george.tiley@duke.edu 8 | #contact: andrew.crowl@duke.edu 9 | #prepare reference sequences and reads for PATÉ 10 | #----------------------------------------------------------------------------------------# 11 | 12 | # Accepted AND necessary commands 13 | my @checkArgs = ("filetype","inputFolder","outputFolder","ploidyFile"); 14 | my %passedArgs = (); 15 | if (scalar(@ARGV) == 0) 16 | { 17 | die "/*--------INPUT PARAMETERS--------*/\n 18 | --filetype STRING 19 | --inputFolder STRING 20 | --outputFolder STRING 21 | --ploidyFile STRING 22 | 23 | \n/*--------EXAMPLE COMMAND--------*/\n 24 | perl PATE_formatInput.pl --filetype fasta --inputFolder supercontigs --outputFolder referenceSequences --ploidyFile ploidy.txt\n 25 | perl PATE_formatInput.pl --filetype fastq --inputFolder readsWithLongNames --outputFolder rawReads --ploidyFile ploidy.txt\n 26 | 27 | \n/*--------FLAG OPTIONS--------*/\n 28 | --filetype 29 | fasta = Supercontig output from hybpiper is expected as input 30 | fastq = uses the ploidy.txt file to get individual names and decompress and rename fastq files as IND.R1.fq an IND.R2.fq 31 | 32 | \n/*--------NOTES--------*/\n 33 | Meant to be a convient tool for users to go straight from hybpiper into PATÉ. Other pre-processing tools can be added upon request, but consult the example input for formatting if not using hybpiper supercontigs as reference sequences.\n"; 34 | } 35 | elsif (scalar(@ARGV) > 0) 36 | { 37 | for my $i (0..(scalar(@ARGV) - 1)) 38 | { 39 | if ($ARGV[$i] eq "--filetype") 40 | { 41 | $passedArgs{filetype} = $ARGV[$i+1]; 42 | } 43 | if ($ARGV[$i] eq "--inputFolder") 44 | { 45 | $passedArgs{inputFolder} = $ARGV[$i+1]; 46 | } 47 | if ($ARGV[$i] eq "--outputFolder") 48 | { 49 | $passedArgs{outputFolder} = $ARGV[$i+1]; 50 | system "mkdir $passedArgs{outputFolder}"; 51 | } 52 | if ($ARGV[$i] eq "--ploidyFile") 53 | { 54 | $passedArgs{ploidyFile} = $ARGV[$i+1]; 55 | } 56 | } 57 | foreach my $arg (@checkArgs) 58 | { 59 | if (! exists $passedArgs{$arg}) 60 | { 61 | die "/*--------MISSING PARAMETER--------*/\nMissing command line argument: $arg\n\n"; 62 | } 63 | } 64 | } 65 | 66 | if ($passedArgs{filetype} eq "fasta") 67 | { 68 | my %locusList = (); 69 | my %seqs = (); 70 | my %taxList = (); 71 | open FH1,'<',"$passedArgs{ploidyFile}"; 72 | while () 73 | { 74 | if (/^(\S+)\s+.+/) 75 | { 76 | my $tax = $1; 77 | if (! exists $taxList{$tax}) 78 | { 79 | $taxList{$tax} = 1; 80 | } 81 | } 82 | } 83 | close FH1; 84 | 85 | my @ff1 = glob("$passedArgs{inputFolder}/*.fasta"); 86 | foreach my $ff (@ff1) 87 | { 88 | if ($ff =~ m/$passedArgs{inputFolder}\/(\S+)\.fasta/) 89 | { 90 | my $locus = $1; 91 | my $tax = ""; 92 | if (! exists $locusList{$locus}) 93 | { 94 | $locusList{$locus} = 1; 95 | } 96 | open FH1,'<',"$ff"; 97 | while () 98 | { 99 | if (/^>(\S+)/) 100 | { 101 | $tax = $1; 102 | foreach my $ind (sort keys %taxList) 103 | { 104 | if (index($tax,$ind) >= 0) 105 | { 106 | $tax = $ind; 107 | } 108 | } 109 | $seqs{$locus}{$tax} = ""; 110 | } 111 | elsif (/(\S+)/) 112 | { 113 | my $seq = $1; 114 | $seqs{$locus}{$tax} = $seqs{$locus}{$tax} . $seq; 115 | } 116 | } 117 | close FH1; 118 | } 119 | } 120 | 121 | foreach my $locus (sort keys %locusList) 122 | { 123 | open OUT1,'>',"$passedArgs{outputFolder}/$locus.fasta"; 124 | foreach my $tax (sort keys %taxList) 125 | { 126 | if (exists $seqs{$locus}{$tax}) 127 | { 128 | print OUT1 ">$tax\n$seqs{$locus}{$tax}\n"; 129 | } 130 | } 131 | close OUT1; 132 | } 133 | } 134 | 135 | 136 | if ($passedArgs{filetype} eq "fastq") 137 | { 138 | my %taxList = (); 139 | open FH1,'<',"$passedArgs{ploidyFile}"; 140 | while () 141 | { 142 | if (/^(\S+)\s+.+/) 143 | { 144 | my $tax = $1; 145 | if (! exists $taxList{$tax}) 146 | { 147 | $taxList{$tax} = 1; 148 | } 149 | } 150 | } 151 | close FH1; 152 | 153 | foreach my $tax (sort keys %taxList) 154 | { 155 | my @fqFiles = glob("$passedArgs{inputFolder}/*$tax*.*"); 156 | foreach my $ff (@fqFiles) 157 | { 158 | if ($ff =~ m/$passedArgs{inputFolder}\/\S*R1\S*\.(\S+)/) 159 | { 160 | my $extension = $1; 161 | if ($extension =~ m/gz/) 162 | { 163 | system "cp $ff $passedArgs{outputFolder}/$tax.R1.fq.gz"; 164 | } 165 | elsif ($extension =~ m/fq/ || $extension =~ m/fastq/) 166 | { 167 | system "cp $ff $passedArgs{outputFolder}/$tax.R1.fq"; 168 | } 169 | } 170 | elsif ($ff =~ m/$passedArgs{inputFolder}\/\S*R2\S*\.(\S+)/) 171 | { 172 | my $extension = $1; 173 | if ($extension =~ m/gz/) 174 | { 175 | system "cp $ff $passedArgs{outputFolder}/$tax.R2.fq.gz"; 176 | } 177 | elsif ($extension =~ m/fq/ || $extension =~ m/fastq/) 178 | { 179 | system "cp $ff $passedArgs{outputFolder}/$tax.R2.fq"; 180 | } 181 | } 182 | } 183 | } 184 | } 185 | exit; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is Phased Alleles from Target Enrichment data (PATÉ)? 2 | 3 | ![Generalized PATÉ Workflow](./images/PATE.png) 4 | 5 | PATÉ is a pipeline for recovering phased haplotype sequences from short-read data. The phased sequences depend on user-provided hapltype consensus or reference sequences, which could come from many applications but some recommendations are provided in our publications (Tiley et al. 2021; Crowl et al. 2022; Tiley et al. 2023). PATÉ has two primary pipelines: **species** and **populations**. The details are provided further down, but here are general differnces. 6 | * Species - Each individual is genotyped against its own haplotype consensus sequences. This is appropriate when analyzing multiple individuals of moderate to high sequence divergence where using a single reference would potentially lead to short-read alignment errors (e.g. species). This could be appropriate for most phylogenetic systematic studies. 7 | * Populations - Each individual is genotyped against the same reference. This is appropriate when their are many individuals equidistant to the reference where there is low sequence divergence so as not to confound joint genotyping. This could be appropriate for the analysis of multiple populations of a single species or cases of incipient speciation. Within the **populations** pipeline is a way to estimate the ploidy level for each individual, but the authors remain skeptical about such tests (see details). 8 | 9 | 10 | # Using the script 11 | ## Try this to see options: 12 | ```perl PATE.pl``` 13 | 14 | ## There are four input options and all are required 15 | ```perl PATE.pl --controlFile PATE.ctl --runmode species --template template.sh --genotype consensus``` 16 | 1. The control file (PATE.ctl) is used to configure the paths to other software, your input data, and where you want your output. 17 | 2. The runmode flag is used to determine if genotyping will be done jointly or not. There may be different conditions where one is desirable over the other. See below for more details. 18 | 3. The template file (template.sh) has the basic directives you will use for running on a cluster. Or if running on your local computer, you can set environment variables at run time. 19 | 4. The genotype flag determines how phasing ambiguity is handeled. See below for details. 20 | 21 | ## Important Note - you will run the script twice 22 | 1. First in --runmode ```species``` or ```population1``` 23 | + ```species``` will run GATK *HaplotypeCaller* on each individual, that is realigning short reads to the consensus assemblies to get genotypes based on ploidy levels specified in the *ploidy file*. 24 | + ```population1``` will run GATK on each individual in gVCF mode and set up the script for joint genotyping based on the specified ploidy levels. You will have to submit the joint genotyping job separately after the gVCF files are generated. The reference individual is specified in the *control file*. 25 | 2. Second in --runmode ```alleles``` or ```population2``` 26 | + ```alleles``` generates the per-locus fasta output and summary statistics based on the individual genotype data. This step is fast and happens on a single processor. Here the genotype < ```consensus``` || ```iupac``` > option comes into effect. 27 | +```population2``` is similar to *alleles* but will split the multisample VCF by individual to pass to *H-PoPG* to handle the phasing. The same genotype flags apply. 28 | + The genotype option affects how variants with ambiguous phases are handeled. When multiple haplotype blocks are recovered for a locus, we retain the phasing of the block with the most variants only. ```consensus``` causes the others to be replaced with "N" while ```iupac``` causes these unphased variants to be replaced with there IUPAC codes. There may be analyses where one option is more favorable than the other, so we make both possibilities available here. 29 | 30 | ## Important Note - you will to install a few software on your computer or cluster 31 | Please cite and credit the authors of all of the important bits that are glued together here. 32 | * BWA (Li et al. 2009a) 33 | * The samtools/htslib library (Li et al. 2009b) 34 | * GATK (McKenna at al. 2012) 35 | * HPoPG (Xie et al. 2016) 36 | 37 | ## Explanation of the control file options 38 | ### There are several input and output folders and files to keep track of 39 | * PHASING_ROOT = the root directory that you clone or download from github 40 | * REF = input folder reference fasta files 41 | * GENOTYPE\_OUT = output folder for genotyping files 42 | * PHASE\_OUT = output folder for phased sequences, but split by individual 43 | * FASTA\_OUT = output folder for fasta files you want to use 44 | + PHASED - these are the fasta files with all phased alleles per-locus 45 | + GENOTYPE - these fasta files have the unphased genotype sequences 46 | + PICKONE - these are the fasta files with only one haplotypic sequence available per individual per locus. These data may be preferred for phylogenetic network analyses. 47 | * IUPAC\_OUT = output folder for phased sequences, but split by individual 48 | * SUMMARYSTATS\_OUT = output folder with all of the phasing summary statistics 49 | * ESTPLOIDY\_OUT = output folder with all genotyping and mixture model results along with a new ploidy file 50 | * FQ = input folder of fastq files 51 | * PLOIDY = input ploidy file (see below) 52 | 53 | ### The software dependencies 54 | * BWA = path to bwa 55 | * PICARD = path to picard 56 | * GATK = path to gatk 57 | * SAMTOOLS = path to samtools 58 | * BAMTOOLS = path to bamtools 59 | * BGZIP = path to bgzip from htslib 60 | * TABIX = path to tabix from htslib 61 | * HPOPG = path to H-PoPG jar file 62 | * SCHEDULER = The submission command for your scheduler (SLURM uses sbatch) 63 | 64 | ### We make the VCF filtering options for GATK an option that the user can change. Each line is a separate filter and has the specific formatting of "" "" 65 | ### The following are some reasonable defaults, but will note always be optimal. Consult VCF filtering annotations before altering. 66 | * GATK\_FILTER\_EXPRESSION = "QD < 2.0" "QD\_lt2" 67 | * GATK\_FILTER\_EXPRESSION = "FS > 60.0" "FS\_gt60" 68 | * GATK\_FILTER\_EXPRESSION = "MQ < 40.0" "MQ\_lt40" 69 | * GATK\_FILTER\_EXPRESSION = "ReadPosRankSum < -8.0" "ReadPosRankSum\_ltm8" 70 | * GATK\_FILTER\_EXPRESSION = "AF < 0.05" "AF\_lt05" 71 | * GATK\_FILTER\_EXPRESSION = "AF > 0.95" "AF\_gt95" 72 | * GATK\_FILTER\_EXPRESSION = "DP < 10" "DP\_lt10" 73 | 74 | ### Some additional options 75 | * UNIQUE\_ONLY = <0 || 1> determines if only unique haplotypes are output following phasing by H-PoPG (0=no or 1=yes). It is very possible to have multiple identical haplotypes per locus. 76 | * REMOVE\_READ\_DUPLICATES = <0 || 1> = should Picard be used to remove pcr duplicates. Leave at 0=no for all target enrichment studies but this can be turned on for non-enriched libraries. 77 | * OUTPUT\_EXPECTED\_DOSAGE = <0 || 1> = should the expected number of sequences per individual be output even in the absence of variants. This might be needed when calculating allele frequencies. 78 | 79 | To generate the output comparable to Tiley et al. (2021) or Crowl et al. (2022), use the following options: 80 | * UNIQUE\_ONLY = 1 81 | * REMOVE\_READ\_DUPLICATES = 0 82 | * OUTPUT\_EXPECTED\_DOSAGE = 0 83 | 84 | # Some notes on configuring data and folders 85 | ## Naming of Fastq Files 86 | Fastq files follow the following naming rules: 87 | * Only paired-end data allowed 88 | * Reads should be named as 89 | + <Individual ID>.R1.<Fastq File Extension> 90 | + <Individual ID>.R2.<Fastq File Extention> 91 | + where <Individual ID> = The individual name specified in the ploidy file 92 | + and <Fastq File Extension> = Whatever you want; It does not matter if named .fq, .fastq, .fq.gz, etc 93 | * Fastq files are assumed to be pre-processed for quality and adapter removal. We do not integrate such tools here as some would argue that the soft-clipping in BWA is a better approach and GATK deals with quality explicitly. 94 | * A helper script is available to format the fastq names for you, please see ```helperScripts/PATE_formatInput.pl``` 95 | 96 | ## The Ploidy File 97 | This is where Individual IDs and their ploidy levels are specified. It has the following rules: 98 | * At least two columns are present 99 | * The first column is Individual ID 100 | * The second column is the ploidy level represented by an integer 101 | * More columns are allowed with any metadata you would like for your own purposes 102 | * Columns are seperated by whitespaces, do not format as comma-seperated 103 | 104 | ## The Reference Fastas 105 | Reference fasta files have the following rules: 106 | * There is a sinlge fasta file per locus that contains reference sequences for all individuals 107 | * The reference fasta is named <Locus Name>.fasta 108 | + No spaces allowed in locus name 109 | + The locus name here will be the locus name of the output fasta with phased sequences 110 | * The fasta files are not interleaved and assume no line breaks in the sequence data 111 | * The fasta headers are assumed to match the Individual ID (i.e. ><Individual ID>) 112 | * Other information can follow the Individual ID in the fasta header, but will be ignored 113 | * A helper script is available to format the fasta files for you, please see ```helperScripts/PATE_formatInput.pl``` 114 | 115 | ## The Template File 116 | There is a file called template.sh to help distribute jobs on a cluster 117 | * Make the necessary changes to the scheuduler directives for your account 118 | * I was on a cluster with SLURM when making this - you will need to edit for PBS or SGE accordingly 119 | * If you run the script in runmode ```cluster```, all of the commands are pasted below what you have in the template.sh file - all you need to do is configure the directives and paths to software here if they are not specified in the control file 120 | * The template file must always be provided as an argument, even if using runmode ```serial``` or ```alleles``` 121 | * The template file can be renamed if you like 122 | 123 | ## Explanation of Summary Statistics 124 | An output after runmode=2 is ```averagePhasingStats.txt```, which contains a few numbers aggregated over all loci. Here is a description of each column. 125 | 1. INDIVIDUAL - individual ID 126 | 2. NLOCI - number of loci assembled by hybpiper 127 | 3. NVARLOCI - number of loci with at least 1 variant 128 | 4. NINVLOCI - number of loci with no variants 129 | 5. NPHASELOCI - number of loci phased (it is possible to have unphased loci with variants, in the case when there is 1 or more blocks with only 1 variant) 130 | 6. AVG_LENGTH - average locus length 131 | 7. AVG_NVAR - average number of variants (includes invariable loci - averaged across the total number of loci) 132 | 8. AVG_HET - average heterozygosity (per-base heterozygosity averaged over all loci - including the invariable ones) 133 | 9. AVG_NBLOCKS - average number of blocks per phased locus (thus, excluding loci without phasing information. Many of the Dryopteris individuals were close to 1 or even 1 in some cases. I think this is incredibly useful for evaluating how good the phasing is working.) 134 | 10. AVG_LONGESTBL - Average number of variants in the longest phasing block. (thus not including loci without phasing information) 135 | + 1 - number of loci that were completely homozygous 136 | + 2 - number of loci with two phased alleles 137 | + 3 . 138 | + 4 . 139 | + 5 . 140 | + 6 - number of loci with six phased alleles (six was the max in Dryopteris. If you have higher ploidies, this will automatically go higher because it is based off of the ploidy file. If you have octoploids for example, this will then go to 8) 141 | 142 | # Some notes on ploidy estimation 143 | It is possible to estimate ploidy directly for the distributions of ratios with reads that carry the reference or alternate allele (Weiß et al. 2018; Viruel 2019). This is achieved by the --runmode ```estPloidy``` option and specifying an appropriate reference in the control file. All individuals are genotyped as a diploid and mixture models (Tiley et al. 2018) are used to determine the ploidy. I am skeptical that enough information is available in target enrichment data to achieve this task and an outgroup that can successfully polarize the alternate alleles are needed. Nevertheless, the option exists and may have value for some cases. 144 | 145 | I have used the ploidy estimation feature for Tiley et al. (2023). The results did not seem unreasonable and mostly aligned with a prior analysis of the study system, but I still have reservations about model overfitting. I have recently generated ome RADseq data for the study system with some technical replicates of individuals with known ploidy. It might take time to evalutate results and fine-tune existing models, but I am happy to collaborate with others that have similar empirical test cases on the topic. 146 | 147 | ## Using estPloidy 148 | Ploidy is dependent on some additional code for fitting the mixture models. Rather than repackage it here, it is available through another repo. Although cloning repos within repos is typically not recommened, it is the fastest way here. Make sure you are in the `helperScripts` directory first: 149 | ``` 150 | cd Phasing/helperScripts 151 | git clone https://github.com/gtiley/Ks_plots 152 | ``` 153 | # Opinions 154 | There are several technical issues compounded in the existing pipeline and I view this as a starting point for enabling some interesting analyses of polyploid complexes. 155 | First, the genotyping problem in polyploids has a lot of uncertainty and I recommend reading Gerard et al. (2018) to better appreciate the problems. Comparisons of genotypers are needed in the future, but we do our best to filter out errors from the final set of variants. 156 | Second, there has apparently been a flurry of phasing algorithms developed for polyploids in the recent years after my colleagues and I began working on this pipeline and our ideas. I will try to investigate and compare some of them (e.g. Moeinzadeh et al. 2020) in the future. 157 | 158 | # What happens when an entire locus is not phased? 159 | It is possible that two variants can be called in a locus but there is not sufficient read information, either due to quality or depth, to phase those variants with respect to the reference sequence. It can also depend on the library preparation. For example, if probes are designed across multiple exons of locus but the reads to not span the entire intron on some individuals. Who knows - it could also mean that the reference sequence is not appropriate, perhaps a chimera of homeologos sequences? In such cases regarless of the cause, we opt to retain the phased variants from the longest block while variants on the shorter block(s) are replaced by missing data (N). This is our least-worst strategy for keeping information while avoiding artificial recombination or generating loci of multiple ploidy levels (i.e. I do not think using the ambiguity codes here is a good idea). The number of loci where this happens from the target enrichment data I have evaluated is very low, but paying attention to the AVG_NBLOCKS summary statistic can help identify individuals with phasing difficulties. 160 | 161 | # Active development - some features that are planned in the near future 162 | 1. Imputation 163 | 2. SFS estimation 164 | 3. Parent assignment 165 | 4. Determining allo vs. autopolyploidy 166 | 167 | # References 168 | * Crowl AA, Fritsch PW, Tiley GP, Lynch NP, Ranney TG, Ashrafi H, Manos PS. 2022. A First Complete Phylogenomic Hypothesis for Diploid Blueberries (Vaccinium section Cyanococcus). American Journal of Botany. In press. 169 | * Li H, Durbin R. 2009a. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics 25:1754-1760. 170 | * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, 1000 Genome Project Data Processing Subgroup. 2009b. The sequence alignment/map format and SAMtools. Bioinformatics 25:2078-2079. 171 | * McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, et al. 2010. The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res 20:1297-1303. 172 | * Xie M, Wu Q, Wang J, Jiang T. 2016. H-PoP and H-PoPG: heuristic partitioning algorithms for single individual haplotyping of polyploids. Bioinformatics 32:3735-3744. 173 | * Gerard D, Ferrão LFV, Garcia AAF, Stephens M. 2018. Genotyping polyploids from messy sequencing data. Genetics 210:789-807. 174 | * Moeinzadeh M-H, Yang J, Muzychenko E, Gallone G, Heller D, Reinert K, Haas S, Vingron M. 2020. Ranbow: A fast and accurate method for polyploid haplotype reconstruction. PLoS Comp Biol. 16:e1007843. 175 | * Schrinner SD, Mari RS, Ebler J, Rautiainen M, Seillier L, Reimer JJ, Usadel B, Marschall T. 2020. Haplotype threading: an accurate polyploid phasing from long reads. Genome Biol. 21:252. 176 | * Tiley GP, Barker MS, Burleigh JG. 2018. Assessing the Performance of *Ks* Plots for Detecting Ancient Whole Genome Duplications. Genome Biology and Evolution 10:2882-2898. 177 | * Tiley GP, Crowl AA, Manos PS, Sessa EB, Solís-Lemus C, Yoder AD, Burleigh JG. 2021. Phasing alleles improves network inference with allopolyploids. bioRxiv doi: https://doi.org/10.1101/2021.05.04.442457 178 | * Tiley GP, Crowl AA, Almary TOM, Luke WRQ, Solofondranohatra CL, Besnard G, Lehmann CER, Yoder AD, Vorontsova MS. 2023. Genetic variation in *Loudetia simplex* supports the presence of ancient grasslands in Madagascar. bioRxiv doi: https://doi.org/10.1101/2023.04.07.536094 179 | * Viruel J, Conejero M, Hidalgo O, Pokorny L, Powell RF, Forest F, Kantar MB, Soto Gomez M, Graham SW, Gravendeel B, Wilkin P, Leitch IJ. 2019. A Target Capture-Based Method to Estimate Ploidy from Herbarium Specimens. Front. Plant Sci. 10:937. 180 | * Weiß CL, Pais M, Cano LM, Kamoun S, Burbano HA. 2018. nQuire: a statistical framework for ploidy estimation using next generation sequencing. BMC Bioinformatics 19:122. 181 | -------------------------------------------------------------------------------- /PATE.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | #----------------------------------------------------------------------------------------# 5 | #30 August 2022 6 | #contact regarding code details: George P. Tiley g.tiley@kew.org 7 | #contact regarding empirical performance: Andy Crowl andrew.crowl@duke.edu 8 | #Genotype target-enrichment loci of known ploidy 9 | #Estimate ploidy from allele balance if unknown 10 | #Phase haplotype sequences for each locus 11 | #----------------------------------------------------------------------------------------# 12 | 13 | # Accepted AND necessary commands 14 | my @checkArgs = ("controlFile","runmode","template","genotype"); 15 | my %passedArgs = (); 16 | if (scalar(@ARGV) == 0) 17 | { 18 | die "/*--------INPUT PARAMETERS--------*/\n 19 | --controlFile STRING 20 | --runmode STRING 21 | --template STRING