├── LICENSE.md ├── README.md ├── alignment_subsetter.py ├── alleles2taghap.pl ├── autoFetcher.py ├── averageFastStructure.pl ├── batchBUCKY.pl ├── bootstrapGeneTrees.sh ├── check_cds.py ├── collapseHaps.pl ├── collapse_baits.py ├── compare2seqs.pl ├── compare_seqs_fasta.py ├── concatFasta.py ├── concatenateNexus.py ├── condenseAlleles.pl ├── count_residues.pl ├── expandSeq.py ├── fast2distruct.pl ├── fasta2gphocs.py ├── fasta2length.pl ├── fasta2nexus.pl ├── fasta2phylip.py ├── fastaFormatter.py ├── fill_quartets.py ├── filterFastaMedianLength.sh ├── filterLoci.py ├── filter_loci.pl ├── findBreaksVCF.py ├── fixedSNP.pl ├── genesFromGFF.pl ├── ipyrad2polyrad.py ├── liftoverCoords.py ├── liftoverFromPafscaff.py ├── makeHyde.py ├── makePopArt.py ├── makeSAMOVA.pl ├── newhybs2distruct.py ├── newhybs2props.py ├── nremover.pl ├── parallelMB.pl ├── parsePhaseCons.py ├── phylip2bgc.pl ├── phylip2biNumNex.py ├── phylip2ecoevolity.pl ├── phylip2introgress.pl ├── phylip2newhybrids.pl ├── phylip2nexus.pl ├── phylip2structure.pl ├── phylipFilterPops.pl ├── phylobarcode.py ├── process_ecoevolity.sh ├── pseudoHaploidize.py ├── pyrad2fasta.pl ├── python_template.py ├── revTransAll.py ├── seq2structure.pl ├── short2fullPopmap.pl ├── slidingWindowGC.pl ├── snps2phy.sh ├── splitFASTA.pl ├── splitFastaPops.py ├── splitStackedFasta.pl ├── splitTableCF.py ├── stacks2fasta.pl ├── structure2newhyb.pl ├── subsetPhy.py ├── subsetSnps.py ├── sumls.sh ├── summaryGFF.pl ├── terminalGapRemover.py ├── test_files ├── gtrees.tre ├── revTransAll_code.txt ├── revTransAll_in.fas ├── revTransAll_out.fas ├── terminal_gaps.fasta ├── terminal_gaps.gapfix.fasta ├── variable_length.fas └── variable_length.fas.filter ├── traitsList2LagrangePhylip.py ├── treeAlignment_subsetter.py ├── treeExpansion.py ├── trimFastq.pl ├── utm2latlong.py └── vcf2phylip.py /README.md: -------------------------------------------------------------------------------- 1 | # scripts 2 | Collection of scripts- mostly for 
manipulating, filtering, and format-conversion of DNA sequence files. Feel free to use. 3 | 4 | ### How to use 5 | Most scripts are written to accept the <-h> argument to display a help menu which should describe the function of the scripts as well as any optional or mandatory inputs. 6 | 7 | Example: 8 | The Perl program "alleles2taghap.pl" takes the ".alleles" output from the RADseq assembly program pyRAD and creates the ".taghap" format for the program fineRADstructure. To display the help menu, call the program like so: 9 | 10 | ./alleles2taghap.pl -h 11 | 12 | Which will display: 13 | 14 | tkchafin@acamel-linux1:~/scripts$ ./alleles2taghap.pl -h 15 | 16 | alleles2taghap.pl by Tyler Chafin 17 | 18 | This script converts from the .alleles file output by pyRAD to create the input for fineRADstructure 19 | 20 | NOTE: 21 | - All samples are assumed to be diploid. 22 | - Sample names CANNOT contain underscores. 23 | - Columns containing Ns or gaps will be deleted from final output 24 | - Popmap file should be tab-delimited, like so: SampleName [tab] PopID 25 | - If populations to include/exclude are not given, all samples in popmap are used. 26 | - You can specify multiple popIDs as: ID1+ID2+ID3, as long as these match IDs in popmap 27 | - For the -s filter, singletons are evaluated within the selected subset of individuals 28 | 29 | Options: 30 | -a : Path to input file (.alleles) 31 | -p : Path to popmap file (tab-delimited) 32 | -o : Output file prefix. [Default = out, i.e. out.taghap] 33 | -c : Min number of samples for which data must be present per locus [Default = 1] 34 | -n : Minumum proportion of loci an individual must be present at to be retained [def = 0.2] 35 | -i : PopIDs to include in output file (e.g. -i pop1+pop4) 36 | -x : PopIDs to exclude (e.g. -x catenatus or -x sistrTX+sistrIN) 37 | -m : Maximum number of SNPs per locus. 
Loci exceeding are deleted [default:10] 38 | -s : Skip SNPs that are singletons [Boolean; Default = false] 39 | -h : Displays this help message 40 | 41 | Program killed: Help menu called. 42 | 43 | ### Contents 44 | Here is a (probably) complete list of the scripts contained here, and generally what they do. All scripts written in Python require Python3. 45 | ``` 46 | alleles2taghap.pl : Converts from pyRAD .alleles format to input for fineRadStructure 47 | averageFastStructure.pl : Combines multiple replicate runs of FastStructure 48 | batchBUCKY.pl : Pipeline for running BUCKy. Old and probably broken. 49 | collapse_baits.py : For filtering baits by SNP count from BaitsTools output 50 | collapseHaps.pl : Collapse sequences to redundant consensus sequences 51 | compare2seqs.pl : This was a learning exercise. Just compares sequences. 52 | concatFasta.py : Script to concatenate fastas (No help menu- use argv) 53 | concatenateNexus.py : Concatenate Nexus alignments and calculate partitions block 54 | condenseAlleles.pl : Creates a consensus of alleles (input as FASTA) per individual 55 | count_residues.pl : Counts residues in an amino acid alignment 56 | fast2distruct.pl : Tries to parse FastStructure outputs to create DISTRUCT input 57 | fasta2length.pl : Calculate non-gap character length of sequences 58 | fasta2nexus.pl : Converts FASTA to NEXUS format 59 | fasta2phylip.py : Converts from FASTA to PHYLIP and PHYLIP to FASTA, nothing fancy 60 | findBreaksVCF.py : Breaks contigs in VCF to chunks of X parsimony-informative SNPs, for running MDL 61 | fill_quartets.py : Sorts through TICR output to find missed quartets (for debugging only) (no help menu - use argv) 62 | filter_loci.pl : Parses a directory of FASTA alignments, and blacklists those with too low alignment coverage 63 | filterLoci.py : Filters a pyRAD .loci file on individual coverage and number of parsimony-informative sites 64 | fixedSNP.pl : Parses PHYLIP file to find differentially fixed SNPs between
two given populations 65 | genesFromGFF.pl : Extracts elements from a FASTA file, given a GFF file of annotations 66 | liftoverCoords.py : Converts coordinates between assemblies (e.g. CanFam2 to CanFam3) and makes MareyMap inputs 67 | makeHyde.py : Makes inputs for HyDe- Hybrid Detection program 68 | makePopArt.py : Python program to make inputs for PopArt (haplotype network program) from FASTA 69 | makeSAMOVA.pl : Makes inputs for SAMOVA given FASTA and coordinates, with automatic clustering by distance 70 | newhybs2distruct.py : Takes posterior probs (PofZ.txt) from NewHybrids and makes inputs to run DISTRUCT 71 | newhybs2props.py : Calculates genealogical assignment proportions from NewHybrids, outputs table and files to spoof DISTRUCT 72 | nremover.pl : My version of Steve Mussmann's nremover script, for filtering DNA alignments 73 | parallelMB.pl : For running batches of MrBayes on a cluster, in parallel per locus 74 | phylip2bgc.pl : Converts PHYLIP alignment to inputs for BGC (inference of Bayesian Genomic CLines) 75 | phylip2biNumNex.py : Converts PHYLIP to bi-allelic numerically coded NEXUS for PhyloNet's MLE_BiMarkers 76 | phylip2ecoevolity.pl : Converts PHYLIP to the NEXUS format needed for ecoevolity.
77 | phylip2introgress.pl : Converts PHYLIP to inputs for R package INTROGRESS (introgression analyses) 78 | phylip2newhybrids.pl : Creates inputs for NewHybrids, with missing data filters built in 79 | phylip2nexus.pl : Converts PHYLIP to NEXUS 80 | phylip2structure.pl : Converts PHYLIP alignment of SNPs to inputs for STRUCTURE 81 | phylipFilterPops.pl : Filters SNPs for creating PoMo-IQTREE inputs 82 | process_ecoevolity.sh : Runs the post-processing for ecoevolity outputs 83 | pseudoHaploidize.py : Script to haploidize FASTA-formatted sequences by randomly sampling alleles at heterozygous sites 84 | pyrad2fasta.pl : Extracts genewise alignments from pyRAD .loci format, and writes FASTA for each 85 | seq2structure.pl : I assume somehow different than phylip2structure, I don't remember honestly 86 | short2fullPopmap.pl : Does a very specific thing to my tab-delimited popmap files 87 | slidingWindowGC.pl : Calculates GC content along a sliding window down a sequence 88 | snps2phy.sh : Shell script to convert pyRAD .snp output to PHYLIP format 89 | splitFASTA.pl : Breaks a FASTA file into a user-defined number of chunks. For helping parse a large genome 90 | splitFastaPops.py : Pulls subsets from FASTA file to new FASTA file, given tab-delimited table of population IDs 91 | splitStackedFasta.pl : Splits FASTA of specifically-formatted collapsed read clusters 92 | stacks2fasta.pl : Formats output of STACKS to a new FASTA for variable loci, by querying cstacks catalog 93 | structure2newhyb.pl : Converts STRUCTURE file to input for NewHybrids 94 | subsetPhy.py : Quickly written and shitty script to subset taxa from a PHYLIP alignment 95 | subsetSnps.py : Given a list of desired columns, subsets SNPs from a STRUCTURE file 96 | sumls.sh : A bash alias for doing something with ls 97 | summaryGFF.pl : Something old and incomplete.
#!/usr/bin/python

import sys
import os
import getopt
#import toytree as tt
import random


def main():
    """Write random sample subsets of a PHYLIP alignment.

    Reads the alignment given with -p and writes ``reps`` files named
    ``<out>_<replicate>.phylip``. Each file holds ``samples`` randomly
    chosen sequences; when -s is not given, the count is
    ``int(freq * number_of_sequences)``.
    """
    params = parseArgs()

    seqs = dict(read_phylip(params.phylip))

    if not params.samples:
        params.samples = int(params.freq * len(seqs))

    # random.sample() would raise an opaque ValueError for these cases.
    if not 0 <= params.samples <= len(seqs):
        print("Cannot sample %d of %d sequences." % (params.samples, len(seqs)))
        sys.exit(1)

    print("Generating", params.reps, "random subsets of", params.samples, "samples each")

    # random.sample() requires a real sequence; dict views raise TypeError
    # on Python >= 3.11, so sample from a list of names instead.
    names = list(seqs)
    for r in range(params.reps):
        print("starting replicate", str(r))
        prefix = params.out + "_" + str(r)
        print("subsetting alignment")
        keeps = {name: seqs[name] for name in random.sample(names, params.samples)}
        print("writing subset file")
        write_phylip(prefix + ".phylip", keeps)


#Print dict to phylip file
def write_phylip(p, aln):
    """Write a {sample: sequence} mapping to path *p* in PHYLIP format.

    The first line is the header from getPhylipHeader(); each following
    line is ``name<TAB>sequence``. Prints a message and exits with
    status 1 if the file cannot be written.
    """
    try:
        # open() is inside the try so an unwritable path is reported too.
        with open(p, 'w') as fh:
            fh.write(getPhylipHeader(aln) + "\n")
            for sample, seq in aln.items():
                fh.write(str(sample) + "\t" + "".join(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s" % (p, e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error writing file %s: %s" % (p, e))
        sys.exit(1)


#Returns header for Phylip file from a dictionary of samples w/ data
def getPhylipHeader(d):
    """Return the PHYLIP header ``"<num samples> <num sites>"`` for *d*.

    The site count is the length of the first sequence. Warnings are
    printed (not raised) for unequal lengths, zero sites, or no samples.
    """
    numSamp = len(d)
    numLoci = None
    for sample in d:
        length = len(d[sample])
        if numLoci is None:
            # "is None" so a zero-length first sequence still counts as set.
            numLoci = length
        elif numLoci != length:
            print("getPhylipHeader: Warning: Sequences of unequal length.")
    if numLoci is None:
        numLoci = 0  # empty dictionary: report 0 sites rather than "None"
    header = str(numSamp) + " " + str(numLoci)
    if numLoci == 0:
        print("getPhylipHeader: Warning: No loci in dictionary.")
    if numSamp == 0:
        print("getPhylipHeader: Warning: No samples in dictionary.")
    return header


#Read samples as PHYLIP. Generator function
def read_phylip(phy):
    """Yield ``(name, sequence)`` tuples from the PHYLIP file *phy*.

    Blank lines are ignored and the first non-blank line (the header) is
    skipped. Raises FileNotFoundError if *phy* does not exist, and
    ValueError for a record line without both a name and a sequence.
    Being a generator, nothing is checked until iteration starts.
    """
    if not os.path.exists(phy):
        raise FileNotFoundError("File %s not found!" % phy)
    try:
        with open(phy, 'r') as fh:
            num = 0
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                num += 1
                if num == 1:
                    continue  # header line: "<nsamples> <nsites>"
                arr = line.split()
                if len(arr) < 2:
                    raise ValueError("Malformed PHYLIP record in %s: %r" % (phy, line))
                yield (arr[0], arr[1])
    except IOError:
        print("Could not read file ", phy)
        sys.exit(1)


#Object to parse command-line arguments
class parseArgs():
    """Parse sys.argv and expose the options as attributes.

    Attributes: reps (int), freq (float), samples (int or None),
    phylip (str or None), method (str), out (str, output prefix).
    Exits through display_help() on -h, on a bad option or value, or
    when no alignment is given.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(
                sys.argv[1:], 'hs:f:r:p:o:m:',
                ["help", "reps=", "phylip=", "out=", "method=", "samples=", "freq="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        #self.tree=None
        self.reps = 10
        self.freq = 0.1
        self.samples = None
        self.phylip = None
        self.method = "random"
        self.out = "subset"

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ", "")
            arg = arg.strip()
            opt = opt.replace("-", "")
            try:
                if opt == "h" or opt == "help":
                    continue
                elif opt == "phylip" or opt == "p":
                    self.phylip = arg
                elif opt == "method" or opt == "m":
                    self.method = arg
                elif opt == "reps" or opt == "r":
                    self.reps = int(arg)
                elif opt == "freq" or opt == "f":
                    self.freq = float(arg)
                elif opt == "samples" or opt == "s":
                    self.samples = int(arg)
                elif opt == "out" or opt == "o":
                    self.out = arg
                else:
                    self.display_help("Unhandled option %r" % opt)
            except ValueError:
                # Non-numeric value for -r, -f or -s.
                self.display_help("Invalid value %r for option %r" % (arg_raw, opt))

        #Check mandatory options are set. Tree input is disabled, so only
        #the alignment is required (self.tree no longer exists).
        if not self.phylip:
            self.display_help("Must provide an input alignment (phylip) file.")

    def display_help(self, message=None):
        """Print *message* (if any) and the usage text, then exit."""
        if message is not None:
            print()
            print(message)
        print("\nalignment_subsetter.py\n")
        print("Description: Generate random subsets of an input phylip (alignment)")
        print("""
		-p,--phylip	: Path to input phylip file
		-s,--samples	: Number of samples to keep
		-f,--freq	: Sampling frequency (must set either -f or -s)
		-r,--reps	: Number of replicates to generate
		-o,--out	: Output file prefix (default=subset)
""")
        print()
        sys.exit()


#Call main function
if __name__ == '__main__':
    main()
glob("@meanQ"); 20 | @log = glob("@log"); 21 | my $count = 0; 22 | my @data; 23 | my $fnum = 0; 24 | foreach my $file(@meanQ){ 25 | my @line; 26 | if ($force == 0){ 27 | $file !~ /.*meanQ/ and die "Error: File $file is missing .meanQ extension. Are you sure it is the correct file type? To skip this check, add the -f flag to your command-line call\n"; 28 | } 29 | open (my $fh, $file) || die "Can't open $file\n"; 30 | $count++; 31 | my $lnum = 0; 32 | while (<$fh>){ 33 | $lnum++; 34 | chomp; 35 | @line = split /\s+/; 36 | s/\s+//g; 37 | next unless length; 38 | if ($count==1){ 39 | if (!defined $k){ 40 | $k = @line unless $k; 41 | print "K value was not supplied; inferring clusters from file $file: $k\n" 42 | } 43 | } 44 | @line != $k and die "Error: Line $lnum of file $file doesn't have the correct number of clusters ($k)\n"; 45 | } 46 | #$fnum++; 47 | close $fh; 48 | } 49 | 50 | #Get likelihoods from .log files 51 | my @lognames; 52 | my @likelihoods; 53 | foreach my $file (@log){ 54 | my ($fpath, $dpath) = fileparse($file); 55 | push @lognames, $dpath . $fpath; 56 | open (my $fh, $file) || die "Can't open $file\n"; 57 | while (<$fh>){ 58 | chomp; 59 | if (m/Marginal Likelihood =/){ 60 | s/Marginal Likelihood =//; 61 | push @likelihoods, $_; 62 | } 63 | } 64 | } 65 | 66 | #Default use all reps if no subset number provided 67 | if (!defined $reps){ 68 | print "Warning: Number of replicates to subset not provided; using all by default\n"; 69 | $reps = scalar(@lognames); 70 | #print $reps . "\n"; 71 | } 72 | 73 | my $R = Statistics::R->new(); 74 | $R->start; 75 | $out = $out . $k . 
".meanQ"; 76 | $R->set('lognames', \@lognames); 77 | $R->set('likelihoods', \@likelihoods); 78 | $R->set('reps', $reps); 79 | $R->set('out', $out); 80 | $R->send(q`options(scipen=999)`); 81 | $R->run(q`likes <- data.frame(lognames, likelihoods)`); 82 | $R->run(q`likes[,1] = sub(".log","",likes[,1])`); 83 | #Set up R functions 84 | $R->send(q` 85 | ################################ 86 | JSD.pair <- function(x, y){ 87 | ###Function to compute Shannon-Jensen Divergence 88 | ###x and y are the frequencies for the same p categories 89 | u <- x/sum(x) 90 | v <- y/sum(y) 91 | m <- (u+v)/2 92 | if (all(u*v>0)){ 93 | d <- (u*log(u/m)+v*log(v/m))/2 94 | } else { 95 | P1 <- u*log(u/m) 96 | P2 <- v*log(v/m) 97 | P1[is.nan(P1)] <- 0 98 | P2[is.nan(P2)] <- 0 99 | d <- (P1+P2)/2 100 | } 101 | return(sum(d)) 102 | } 103 | ############################## 104 | matchPops=function(ga, gb, niter=3000) { 105 | ### function to match population identifiers between fastStructure runs 106 | ### based on permutations of column names and Shannon-Jensen divergences 107 | minsum=1000 108 | for (i in 1:niter) { 109 | names(gb)=sample(names(gb)) 110 | sumjsd=0 111 | for (n in names(ga)) { 112 | sumjsd=sumjsd+JSD.pair(ga[,n],gb[,n]) 113 | } 114 | if (sumjsdrun(q`means=averageBest(likelihoods=likes, top=reps)`); 139 | $R->run(q`write.table(means, file=out, sep=" ", quote=FALSE, na="NA", append=FALSE, row.names=FALSE,col.names=FALSE)`); 140 | 141 | 142 | #open (my $ofstream, ">$out") || die "Can't open $out\n"; 143 | # for (my $i=0; $i<=$#data; $i++){ 144 | # for (my $k=0; $k<=$#{$data[$i]}; $k++){ 145 | # print $data[$i][$k]/$count . " "; 146 | # } 147 | # print "\n"; 148 | # } 149 | 150 | 151 | exit; 152 | 153 | ######################################################################################### 154 | 155 | sub parseArgs{ 156 | 157 | my $message = 158 | "\n\nAverages multiple fastStructure runs for the same k value. 159 | 160 | If you have problems running the script let me know. 
#!/usr/bin/perl

# batchBUCKY.pl - run a BUCKy Bayesian Concordance Analysis on all the
# mbsum ".in" files found in the directory of the path given with -i.

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use File::Spec;

our $cmd="bucky";
our $input;
our $ngen=100000;
our $nrun=2;
our $alpha=1;
our $help=0;
our $nchain=1;
our $rate="";
our $alpham="";
our @other = ();
our $cutoff="0.05";
our $ind=0;
our $spacesaver=0;
our $out="bca";

#Parse command line arguments to define above variables

parseArgs();

#Format some variables
# Each optional switch becomes either its command-line fragment or "".
# Separate variables are used so a flag is never compared numerically
# after it has been turned into a string (which used to clear it again).

my $other      = "@other";
my $rate_opt   = $rate   =~ /\S/ ? "-r $rate"   : "";
my $alpham_opt = $alpham =~ /\S/ ? "-m $alpham" : "";
my $ind_opt    = $ind        ? "--use-independence-prior" : "";
my $space_opt  = $spacesaver ? "--opt-space"              : "";

# Use every *.in file in the directory of the given input path. The
# directory is made absolute because we chdir to the output directory
# before calling bucky.
my (undef, $dirpath) = fileparse($input);
my $inglob = File::Spec->catfile(File::Spec->rel2abs($dirpath), "*.in");
my ($outname, $outpath) = fileparse($out);

#BUCKy system call
print $outpath, "\n";
chdir $outpath or die "\nCannot change to output directory $outpath: $!\n";

# A single-string system() is intentional: the shell expands the *.in glob.
# -n passes the requested number of MCMC generations (previously dropped).
my $call = "$cmd -a $alpha -n $ngen -k $nrun -c $nchain $rate_opt $alpham_opt -o $outname -cf $cutoff $ind_opt $other $space_opt $inglob";
system($call) == 0 or die "\nBUCKy call failed (status $?): $call\n";

exit;

#####################################SUBROUTINES##############################################

# Parse command-line options into the package variables declared above.
# Dies with the usage text on -h, on an unparsable option, or when no
# input path is given.
sub parseArgs{

	my $usage="\nUsage: $0 -i /path/to/*.in [-option value] or [--option=value]

batchBUCKY.pl takes as input the summed .t files from mrbayes (summarized per locus via mbsum) and performs a Bayesian Concordance Analysis to assess what proportion of the genome supports different phylogenetic topologies.

--------------------------------------Mandatory Input-------------------------------
	-i, --input	- Path to .in files created by mbsum (automatically generated by runMRBAYES.pl)


---------------------------------------General Options-------------------------------
	--cmd		- Command to call bucky, if different than default [default=bucky]
	-o, --out	- Output file root name [Default=bca] Can also include path to output directory [e.g. -o /path/to/bca]

----------------------------------------BUCKy Options--------------------------------
	-a, --alpha	- Use this option to set the a priori level of discordance among loci [default=1]
	-n, --ngen	- Number of generations for MCMC. Burnin will automatically be 10% of the desired number of post-burnin updates. [default=100,000]
	-k, --nrun	- Number of independent analyses to run [default=2]
	-f, --cutoff	- Provide a cutoff Concordance Factor value, above which all splits will be retained [default=0.05]
	--other		- Use this option to set any other bucky parameters or functions [example: --other -s1 1234 --calculate-pairs --create-single-file]
	--ind		- Use independent priors. Assumes a priori that loci have independent histories. [Usage: --ind ]
	--opt-space	- This option accomodates large data sets with space optimization. [Usage: --opt-space ]

----------------------------------------MCMCMC Options--------------------------------
	-c, --nchain	- This option toggles on Metropolic coupled MCMC. Any chains more than one will be \"hot\" chains which will occasionally swap states with the cold chain to improve mixing [Default=1; i.e. no heated chains]
	-r, --rate	- If MCMCMC is used, this controls the rate at which chains swap [default=100]
	-m, --alpham	- Heated chains in MCMCMC use higher alpha values than the cold chain. This parameter sets the multiplier for the heated alpha value [default=10]\n\n";

	my $result = GetOptions
	(
	'input|i=s'	=> \$input,
	'cmd=s'		=> \$cmd,
	'alpha|a=s'	=> \$alpha,
	'ngen|n=i'	=> \$ngen,
	'nrun|k=i'	=> \$nrun,
	'cutoff|f=s'	=> \$cutoff,
	'other=s{1,}'	=> \@other,
	'nchain|c=i'	=> \$nchain,
	'rate|r=i'	=> \$rate,
	'alpham|m=i'	=> \$alpham,
	'help|h!'	=> \$help,
	'out|o=s'	=> \$out,
	'ind!'		=> \$ind,
	'opt-space!'	=> \$spacesaver,
	);

	$result or die "\nCould not parse command-line options!\n$usage\n";
	$help == 1 and die "\n$usage\n";
	$input or die "\nInput not specified!\n$usage\n";

}
#!/usr/bin/perl

#Tyler K. Chafin; 14-Dec-15
#tkchafin@uark.edu

# Collapses the sequence lines of an aligned FASTA file into unique
# haplotypes (case-insensitive) and writes them to sorted.fasta, most
# frequent first, named H1, H2, ...
# NOTE: every non-header line is treated as one complete sequence, so
# the input must have each sequence on a single line.

use strict;
use warnings;

my $usage = "
This script functions to collapse aligned sequences in FASTA format into haplotypes, and sort haplotypes by frequency.

Author: Tyler K. Chafin - tkchafin\@uark.edu
Last Modified: 14-Dec-15

Usage: $0 <input.fasta>

";
my $file;
if (defined $ARGV[0]){
	$file = $ARGV[0];
	print "Input: $file\n";
}else{
	die $usage;
}

# Upper-cased sequence => number of times it was seen.
my %contents;

open (my $in, '<', $file) || die "Cannot open $file: $!\n\n";
while (my $line = <$in>){
	chomp $line;
	next if $line =~ /^\s*$/;	# skip blank lines
	next if $line =~ /^>/;		# skip FASTA headers
	$contents{uc $line}++;
}
close $in;

open (my $ofh, '>', "sorted.fasta") || die "Could not open output file: $!\n";
print "Output: sorted.fasta\n";
my $count=1;
# Most frequent first; ties broken by sequence so output is reproducible.
foreach my $key (sort { $contents{$b} <=> $contents{$a} or $a cmp $b } keys %contents){
	print $ofh ">H$count\n";
	print $ofh "$key\n";
	$count++;
}
close $ofh;

exit;
#!/usr/bin/python

import sys
import os
import getopt
from textwrap import wrap


def main():
    """Annotate variable columns of a FASTA alignment.

    Reads the input alignment, appends a record named "annotation" whose
    sequence has "-" for columns where every sequence agrees
    (case-insensitive) and "*" elsewhere, then writes all records to the
    output file.
    """
    params = parseArgs()

    seqs = dict()
    for name, seq in read_fasta(params.infile):
        seqs[name] = seq

    # Column-wise comparison only makes sense for an alignment; ragged
    # input previously gave an IndexError or a silently truncated result.
    if len({len(seq) for seq in seqs.values()}) > 1:
        print("Sequences in %s are not all the same length." % params.infile)
        sys.exit(1)

    # add conditions here if you want to change output e.g. if there are gaps
    seqs["annotation"] = "".join(
        "-" if len({base.lower() for base in column}) == 1 else "*"
        for column in zip(*list(seqs.values()))
    )

    write_fasta(params.out, seqs)


#Function to write fasta-formatted sequences
def write_fasta(f, aln, width=None):
    """Write a {name: sequence} mapping to path *f* as FASTA.

    When *width* is given, sequences are wrapped to that many characters
    per line. Prints a message and exits with status 1 on failure.
    """
    try:
        with open(f, 'w') as fh:
            for samp, seq in aln.items():
                ol = ">" + str(samp) + "\n"
                if width:
                    chunks = wrap(seq, width=width, break_on_hyphens=False, drop_whitespace=False)
                    ol += "".join(str(chunk) + "\n" for chunk in chunks)
                else:
                    ol += str(seq) + "\n"
                fh.write(ol)
    except IOError as e:
        print("Could not write file %s: %s" % (f, e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error writing file %s: %s" % (f, e))
        sys.exit(1)


#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Yield ``[name, sequence]`` lists from the FASTA file *fas*.

    The name is the first whitespace-delimited token of the header with
    ">" removed; sequence lines are concatenated. A final record is only
    yielded if it has both a name and a sequence. Raises
    FileNotFoundError if *fas* does not exist (on first iteration).
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!" % fas)
    try:
        with open(fas, 'r') as fh:
            contig = ""
            seq = ""
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">":  #Found a header line
                    #If we already loaded a contig, yield that contig and
                    #start loading a new one
                    if contig:
                        yield [contig, seq]
                        seq = ""
                    contig = line.split()[0].replace(">", "")
                else:
                    seq += line
            #yield last sequence, if it has both a header and sequence
            if contig and seq:
                yield [contig, seq]
    except IOError:
        print("Could not read file ", fas)
        sys.exit(1)


#Object to parse command-line arguments
class parseArgs():
    """Parse sys.argv and expose ``infile`` and ``out`` as attributes.

    Exits through display_help() on -h, on a bad option, or when no
    input file is given.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hi:o:',
                                               ["help", "infile=", "out="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.infile = None
        self.out = "out.fas"

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ", "")
            arg = arg.strip()
            opt = opt.replace("-", "")
            if opt == "h" or opt == "help":
                continue
            # getopt expands the documented "--in" to the registered long
            # name "--infile", so "infile" must be accepted here.
            elif opt in ("i", "in", "infile"):
                self.infile = arg
            elif opt == "out" or opt == "o":
                self.out = arg
            else:
                self.display_help("Unhandled option %r" % opt)

        #Check mandatory options are set
        if not self.infile:
            self.display_help("No files provided.")

    def display_help(self, message=None):
        """Print *message* (if any) and the usage text, then exit."""
        if message is not None:
            print()
            print(message)
        print("Description: Annotate differences in a fasta alignment")
        print("""
		-i,--in	: Input fasta file
		-o,--out	: Output file name (default=out.fas)
""")
        print()
        sys.exit()


#Call main function
if __name__ == '__main__':
    main()
file in sorted(files): 20 | for s in read_fasta(file): 21 | samps[s[0]] = "" 22 | 23 | for file in sorted(files): 24 | #print(file) 25 | pre=file.split("_")[0] 26 | #get seqlen 27 | seqlen = None 28 | #seen 29 | seen = dict() 30 | for s in read_fasta(file): 31 | seen[s[0]] = 0 32 | seqlen = len(s[1]) 33 | samps[s[0]] = samps[s[0]] + s[1] 34 | 35 | for key in samps.keys(): 36 | if key not in seen: 37 | samps[key] = samps[key] + Nrepeats("N", seqlen) 38 | 39 | print("Using prefix from files to write output:",pre) 40 | oname = pre + ".fasta" 41 | write_fasta(oname, samps) 42 | 43 | def Nrepeats(pattern, N): 44 | ret = "" 45 | for i in range(int(N)): 46 | ret = ret + str(pattern) 47 | return(ret) 48 | 49 | #write fasta from dict 50 | def write_fasta(name, d): 51 | with open(name, 'w') as fh: 52 | try: 53 | for sample in d.keys(): 54 | to_write = ">" + str(sample) + "\n" + d[sample] + "\n" 55 | fh.write(to_write) 56 | except IOError as e: 57 | print("Could not read file:",e) 58 | sys.exit(1) 59 | except Exception as e: 60 | print("Unexpected error:",e) 61 | sys.exit(1) 62 | finally: 63 | fh.close() 64 | 65 | #Read samples as FASTA. 
#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Yield ``[name, sequence]`` for each record of the FASTA file ``fas``.

    The name is the first whitespace-delimited token of the header line with
    every ">" removed. Sequence lines are concatenated, so wrapped
    (multi-line) records are supported, and blank lines are skipped.
    A final record that has a header but no sequence is not yielded.

    Raises:
        FileNotFoundError: if ``fas`` does not exist (raised when iteration
            starts, because this is a generator).

    A read error during iteration prints a message and exits with status 1.
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    #The with-block closes the file; no explicit close is needed
    with open(fas, 'r') as fh:
        try:
            contig = ""
            chunks = []     #sequence lines of the current record
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">": #Found a header line
                    #If we already loaded a contig, yield it before starting the next
                    if contig:
                        yield [contig, "".join(chunks)]
                    #Always start clean, so text that precedes the first
                    #header is not glued onto the first record
                    chunks = []
                    contig = line.split()[0].replace(">", "")
                else:
                    chunks.append(line)
            #Yield last sequence, if it has both a header and sequence
            if contig and chunks:
                yield [contig, "".join(chunks)]
        except IOError:
            print("Could not read file ",fas)
            sys.exit(1)
#This subroutine takes a hash of arrays, where each array stores the allele
#sequences of one sample (the hash key), and rewrites $file with one sequence
#per sample: the single allele as-is, or an IUPAC consensus of the first two.
#
#Arguments: $_[0] = reference to the hash of arrays, $_[1] = path to rewrite
#
#WARNING: OVERWRITES THE ORIGINAL FILE!!
#
#Only the first two alleles of a sample are used (no polyploids/paralogs), and
#alleles are assumed to be pre-aligned and of the same length.
#
#Every alignment column produces exactly one output character, so consensus
#sequences keep the length of the first allele: identical characters are
#copied through (upper-cased, including N and gaps), A/C/G/T mismatches get
#their IUPAC code, and any other mismatch is written as N.

sub consense{

	my $datref = $_[0];
	my $file = $_[1];

	#IUPAC codes for heterozygous sites; keys are the two bases in sorted order
	my %iupac = (
		"AG" => "R", "CT" => "Y", "CG" => "S",
		"AT" => "W", "GT" => "K", "AC" => "M",
	);

	#Empty current file to start rewriting (all data is held in %$datref now)
	open ( my $out, ">", $file ) || die "Derp: Oh noes! I'm in the subroutine and cannot open $file!";

	#Numeric sample names keep their numeric order; all other names follow in
	#text order (a numeric sort on text names only produces warnings)
	my $isnum = qr/^-?\d+(?:\.\d+)?$/;
	my @numeric = sort { $a <=> $b } grep { $_ =~ $isnum } keys %$datref;
	my @textual = sort { $a cmp $b } grep { $_ !~ $isnum } keys %$datref;

	foreach my $key ( @numeric, @textual ) {

		print {$out} ">$key\n"; #Print FASTA header

		my $alleles = $$datref{$key};

		if ( scalar @$alleles > 1 ) { #Check if sequence has multiple alleles
			#Load sequences into arrays for comparison
			my @allele1 = split //, uc $alleles->[0];
			my @allele2 = split //, uc $alleles->[1];

			#0 .. $#allele1 visits each column once and never indexes past the end
			for my $i ( 0 .. $#allele1 ){
				my $base1 = $allele1[$i];
				#A shorter second allele is treated as missing data
				my $base2 = defined $allele2[$i] ? $allele2[$i] : "N";

				my $code;
				if ( $base1 eq $base2 ) {
					$code = $base1;
				}else{
					my $pair = join "", sort { $a cmp $b } ( $base1, $base2 );
					$code = exists $iupac{$pair} ? $iupac{$pair} : "N";
				}
				print {$out} $code;
			}
			print {$out} "\n";
		}else{
			print {$out} $alleles->[0], "\n";
		}
	}
	close $out;
}
def main():
    """Expand ambiguity codes for a FASTA file (-f) or a single sequence (-s).

    With a FASTA input, every expansion of every record is written to
    "out.fasta" in the working directory under the header ">NAME.K", where
    K counts from 1 within each record. With a sequence string, the
    expansions are printed to stdout, one per line.
    """
    params = parseArgs()

    if params.fasta:
        #The with-block closes (and flushes) the output even if a record
        #fails part-way through, e.g. on a character missing from the IUPAC table
        with open("out.fasta", "w") as file_object:
            for seq in read_fasta(params.fasta):
                for count, variant in enumerate(expandAmbiquousDNA(seq[1]), start=1):
                    file_object.write(">" + str(seq[0]) + "." + str(count) + "\n")
                    file_object.write(str(variant) + "\n")
    elif params.seq:
        for variant in expandAmbiquousDNA(params.seq):
            print(variant)
    else:
        sys.exit("No input provided.")
#IUPAC nucleotide codes and the bases each one stands for. Built once at
#import time; tuples keep the shared table immutable.
_IUPAC_EXPANSIONS = {
    "A" : ("A",),
    "G" : ("G",),
    "C" : ("C",),
    "T" : ("T",),
    "N" : ("A", "C", "G", "T"),
    "-" : ("-",),
    "R" : ("A", "G"),
    "Y" : ("C", "T"),
    "S" : ("G", "C"),
    "W" : ("A", "T"),
    "K" : ("G", "T"),
    "M" : ("A", "C"),
    "B" : ("C", "G", "T"),
    "D" : ("A", "G", "T"),
    "H" : ("A", "C", "T"),
    "V" : ("A", "C", "G"),
}

#Function to split character to IUPAC codes, assuming diploidy
def get_iupac_caseless(char):
    """Return the list of bases represented by the IUPAC code ``char``.

    The lookup ignores case and the result keeps the case of the input:
    "R" gives ["A", "G"] and "r" gives ["a", "g"]. A gap ("-") maps to
    itself. A new list is returned on every call.

    Raises:
        KeyError: if ``char`` is not one of the codes in the table.
    """
    lower = char.islower()
    try:
        bases = _IUPAC_EXPANSIONS[char.upper()]
    except KeyError:
        raise KeyError("Unrecognized IUPAC nucleotide code: %r" % char) from None
    if lower:
        return [base.lower() for base in bases]
    return list(bases)
def read_fasta(fas):
    """Yield ``[header, sequence]`` for each record of the FASTA file ``fas``.

    The header is the whole header line with spaces and every ">" removed.
    Sequence lines are concatenated with their spaces removed, so it does
    not matter whether records are wrapped over several lines. Blank lines
    are skipped, and a final record with a header but no sequence is not
    yielded.

    Raises:
        FileNotFoundError: if ``fas`` is not an existing regular file
            (raised when iteration starts, because this is a generator).
    """
    #Same test as the file's fileCheck helper: must be an existing regular file
    if not os.path.isfile(fas):
        raise FileNotFoundError("Fatal exception, file %s not found."%fas)

    contig = ""
    pieces = []     #sequence lines of the record being read
    #The with-block is the only thing responsible for closing the file
    with open(fas) as file_object:
        for line in file_object:
            line = line.strip().replace(" ", "")
            if not line:
                continue
            if line[0] == ">": #Found a header line
                #If we already loaded a contig, yield it before starting the next
                if contig:
                    yield [contig, "".join(pieces)]
                #Always start clean, so text that precedes the first header
                #is not glued onto the first record
                pieces = []
                contig = line.replace(">", "")
            else:
                pieces.append(line)
    #Yield last sequence, if it has both a header and sequence
    if contig and pieces:
        yield [contig, "".join(pieces)]
".indivq"; 19 | 20 | #Capture individual order from structure file 21 | open (my $str, $struct) || die "\nDerp: Couldn't open $struct!\n\n"; 22 | 23 | my @indOrder; 24 | my @row; 25 | my $index = 0; 26 | my $skipLine = 0; 27 | while (<$str>){ 28 | #If 2 line structure format, skip every other line 29 | if ($inline == 0){ 30 | if ($skipLine == 1){ 31 | $skipLine = 0; 32 | next; 33 | } 34 | } 35 | chomp; 36 | @row = split /\t/, $_; 37 | s/\s+//g; 38 | next unless length; 39 | $row[0] =~ s/\s+//g; 40 | $indOrder[$index] = $row[0]; 41 | $index++; 42 | $skipLine = 1; 43 | } 44 | close $str; 45 | 46 | #Capture population identifiers 47 | open (my $pops, $popmap) || die "\nDerp: Couldn't open $popmap!\n\n"; 48 | 49 | my %popHash; 50 | my %substrHash; 51 | my %seen; #Hash lookup table 52 | my $popcount = 1; 53 | my $sub; 54 | my $subCount = 1; 55 | while (<$pops>){ 56 | chomp; 57 | @row = split /\t/, $_; 58 | $row[0] =~ s/\s+//g; 59 | $row[1] =~ s/\s+//g; 60 | s/\s+//g; 61 | next unless length; 62 | $row[0] = uc($row[0]); 63 | $row[0] =~ /(\d+[A-Za-z]+)/; 64 | $sub = $1; 65 | if ($seen{$row[1]}){ 66 | $popHash{$row[0]} = $seen{$row[1]}; 67 | if ($substrGuess == 0){ 68 | if ($substrHash{$sub}){ 69 | $substrHash{$sub} != $seen{$row[1]} and print "Warning:Found more than one population identifier for the same population substring " . $sub ."!\n"; 70 | }else{ 71 | #print "2Setting " . $sub ." to " . $seen{$row[1]} . "\n"; 72 | $substrHash{$sub} = $seen{$row[1]}; 73 | } 74 | } 75 | }else{ 76 | $seen{$row[1]} = $popcount; 77 | $popHash{$row[0]} = $popcount; 78 | $substrGuess == 0 and $substrHash{$sub} = $popcount; 79 | #$substrGuess == 0 and print "1Setting " . $sub ." to " . $popcount . 
"\n"; 80 | $popcount++; 81 | } 82 | 83 | } 84 | close $pops; 85 | 86 | #Parse meanQ file and write indivq file and make calculations for popq 87 | open (my $results, $meanQ) || die "\nDerp: Couldn't open $meanQ!\n\n"; 88 | open (my $iq, ">$indivq") || die "\nDerp: Couldn't open $indivq!\n\n"; 89 | 90 | my @asn; 91 | $index = 0; 92 | my $popID; 93 | my %popq; 94 | 95 | while (<$results>){ 96 | chomp; 97 | @row = split /\s+/, $_; 98 | s/\s+//g; 99 | next unless length; 100 | $asn[$index] = [@row]; 101 | 102 | if ($popHash{uc($indOrder[$index])}){ 103 | $popID = $popHash{uc($indOrder[$index])}; 104 | }elsif ($substrGuess == 0){ 105 | print "\nWarning: Individual ". $indOrder[$index]; 106 | print " not found in popmap! Trying to guess correct population identifier... \n"; 107 | $indOrder[$index] =~ /(\d+[A-Za-z]+)/; 108 | $sub = uc($1); 109 | if ($substrHash{$sub}){ 110 | $popID = $substrHash{$sub}; 111 | print "Assigning " . $sub ." to population " . $substrHash{$sub} . "\n"; 112 | }else{ 113 | print "Population substring " . $sub . " not found. Setting popID to ".$popcount .".\n"; 114 | $popID = $popcount; 115 | $popHash{$indOrder[$index]} = $popcount; 116 | $substrHash{$sub} = $popcount; 117 | $popcount++; 118 | } 119 | }elsif ($substrGuess == 1){ 120 | 121 | print "\nWarning: Individual " . $indOrder[$index]; 122 | print " not found in popmap! Setting popID to ". $popcount . ".\n"; 123 | $popID = $popcount; 124 | $popHash{$indOrder[$index]} = $popcount; 125 | $popcount++; 126 | } 127 | print $iq " " . $indOrder[$index] . " " . $index . " (0) "; 128 | print $iq $popID. 
" : "; 129 | printf $iq "%.4f ", $_ for @row; 130 | print $iq "\n"; 131 | 132 | #If popID already in popq table, then add to it 133 | if ($popq{$popID}){ 134 | for (my $i =0; $i <= $#row; $i++){ 135 | $popq{$popID}[0][$i] += $row[$i]; 136 | } 137 | $popq{$popID}[1]++; 138 | }else{ 139 | $popq{$popID}[0] = [@row]; 140 | $popq{$popID}[1] = 1; 141 | } 142 | 143 | $index++; 144 | } 145 | close $meanQ; 146 | close $iq; 147 | 148 | #Process popq table and print popq file 149 | open (my $pq, ">$popq") || die "Derp: Couldn't open $popq!\n"; 150 | my $total; 151 | foreach my $key ( sort {$a<=>$b} keys %popq){ 152 | print $pq $key . ": "; 153 | $total = $popq{$key}[1]; 154 | for (my $i=0; $i<@{$popq{$key}[0]}; $i++){ 155 | printf $pq("%.4f",($popq{$key}[0][$i]/$total)); 156 | print $pq " "; 157 | } 158 | print $pq $total . "\n"; 159 | } 160 | 161 | close $pq; 162 | exit; 163 | 164 | ######################################################################################### 165 | 166 | sub parseArgs{ 167 | 168 | my $message = 169 | "This script converts from the output of fastStructure to the input required for standard Distruct. It requires the structure file output by pyRAD (which was used for the analyses) and a population map in the style SMM required for Astral pipeline. It will use that pop map to determine a priori groupings, for building the popq files. I might add the ability to just pull these from the structure file later, but the pyRAD str doesn't have this so that's why I didn't do that yet. 170 | 171 | If you have problems running the script let me know. It hasn't really been tested fully, and I threw it together quickly. 172 | 173 | Options 174 | 175 | -i - Input fastStructure meanQ file 176 | -s - Structure file from pyRAD which was given to fastStructure 177 | -p - Population map, tab-delimted (e.g. 8HBC001 \t cypha or 8HBC001 \t 1) 178 | -o - Output prefix (e.g. ./k3) 179 | -e - Bool, toggle to turn off population estimation based on prefix 180 | e.g. 
def main():
    """Combine the FASTA alignments in a directory into one G-PhoCS sequence file.

    Every file in ``params.fasdir`` ending in .fas, .fasta or .fsa is read as
    one locus. A locus is skipped when any of its sequences is shorter than
    ``params.minlen`` or when the file holds no records. Gaps ("-") are
    written as "N". The output starts with the number of loci written,
    followed by one block per locus: a "locusK ntaxa length" line, one
    "name sequence" line per sample (sorted by name) and a blank line.
    """
    params = parseArgs()
    print("Minimum allowable alignment length:",params.minlen)

    loci = []       #one formatted text block per locus that passed filtering
    skipped = 0
    extensions = (".fas", ".fasta", ".fsa")

    #os.listdir() returns names in arbitrary order; sorting makes the locus
    #numbering reproducible between runs
    for file in sorted(os.listdir(params.fasdir)):
        if not file.endswith(extensions):
            continue
        aln = dict()
        aln_len = 0
        skip = False
        for line in read_fasta(os.path.join(params.fasdir, file)):
            aln[line[0]] = line[1].replace("-", "N")
            aln_len = len(line[1])
            if aln_len < params.minlen:
                skip = True
        #An alignment without records has nothing to write
        if skip or not aln:
            skipped += 1
            continue
        #len(aln) is the number of rows actually written below, so the taxon
        #count stays correct even if a sample name occurs twice in the file
        rows = ["locus%d %d %d" % (len(loci) + 1, len(aln), aln_len)]
        rows.extend("%s %s" % (samp, aln[samp]) for samp in sorted(aln))
        loci.append("\n".join(rows) + "\n\n")

    print("Skipped alignments smaller than minimum length:",skipped)
    print("Total alignments passing filtering:",len(loci))

    #The header is the number of loci written (the running locus counter
    #used before was always one higher than that)
    with open(params.out, "w") as ofh:
        ofh.write(str(len(loci)) + "\n")
        ofh.writelines(loci)
#Object to parse command-line arguments
class parseArgs():
    """Command-line options for fasta2gphocs.py, read from ``sys.argv``.

    Attributes set by the constructor:
        fasdir: directory holding the FASTA alignments (-f, required)
        out: output file name (-o, default "gphocs_input.txt")
        minlen: minimum alignment length as an int (-m, default 500)

    Asking for help exits with no explicit status; a usage error (unknown
    option, non-integer -m, missing -f) prints the help text and exits with
    status 1.
    """
    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hf:o:m:', \
            ["help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.", 1)
        #Default values for params
        #Input params
        self.fasdir=None
        self.out="gphocs_input.txt"
        self.minlen=500

        #First pass to see if help menu was called
        if any(o in ("-h", "--help") for o, a in options):
            self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            #Only trim the ends: spaces inside a path are part of the path
            arg = arg_raw.strip()
            opt = opt.lstrip("-")
            if opt == "h" or opt == "help":
                continue
            elif opt == "f":
                self.fasdir=arg
            elif opt=="o":
                self.out=arg
            elif opt=="m":
                try:
                    self.minlen=int(arg)
                except ValueError:
                    self.display_help("Minimum alignment length (-m) must be an integer, got %r"%arg, 1)
            else:
                self.display_help("Unhandled option %r"%opt, 1)

        #Check mandatory options are set
        if not self.fasdir:
            self.display_help("No files provided.", 1)

    def display_help(self, message=None, status=None):
        """Print ``message`` (if given) and the help text, then exit.

        ``status`` is handed to ``sys.exit``; the default of None keeps the
        plain ``sys.exit()`` behaviour for callers that pass no status.
        """
        if message is not None:
            print()
            print (message)
        print ("\nfasta2gphocs.py\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description: Converts a set of separate FASTA-formatted gene alignments to g-phocs sequence file format")
        print("""
Arguments:
-f : Directory containing FASTA files
-o : Output file name
-m : Minimum alignment length (default=500)
""")
        print()
        sys.exit(status)
#!/usr/bin/perl

#fasta2nexus.pl: writes a NEXUS DATA block (<name>.nex) for each input FASTA file

use strict;
use warnings;
use Getopt::Long;
use File::Path;
use File::Basename;

# Declare variables

my @input;
#our $infiletype=1;

parseArgs();

#All output files are written to the directory part of the first input path
my ( $filepath, $dirpath ) = fileparse($input[0]);

#Iterate through files

@input = glob "@input";

foreach my $file ( @input ){

	my %data;       #sequence for each taxon name
	my @order;      #taxon names in the order they appear in the file
	my $name = "";
	my $seq = "";

	#Stores the record collected so far; text without a taxon name is ignored
	my $store = sub {
		return if $name eq "";
		push @order, $name unless exists $data{$name};
		$data{$name} = $seq;
	};

	open ( my $in, "<", $file ) || die "Error\nCan't open $file: $!\n";

	while ( my $line = <$in> ){
		chomp $line;
		if( $line =~ /^\>(\S*)/ ){
			my $next = $1;
			#A new header ends the previous record
			$store->();
			$name = $next;
			$seq = "";
		}elsif( $line =~ /^\s*$/ ){
			next;
		}elsif( $line =~ /^\s*#/ ){
			next;
		}else{
			$seq .= $line; #append sequence to line; accounts for multi line fasta
		}
	}
	close $in;

	#The last record has no header after it, so it is stored here
	$store->();

	#NTAX is the number of rows written; NCHAR is the length of the first
	#sequence, with a warning for any sequence that differs from it
	my $taxa = scalar @order;
	my $nchar = $taxa ? length( $data{$order[0]} ) : 0;
	foreach my $key ( @order ){
		length( $data{$key} ) == $nchar
			or warn "Warning: $key in $file has length " . length( $data{$key} ) . ", expected $nchar\n";
	}

	#Capture file name to use as identifier
	my $base = fileparse("$file");
	my $ID = ( $base =~ /(\w+)\.\w/ ) ? $1 : $base;

	open( my $out, '>', "$dirpath$ID.nex" ) || die "Error\nCan't write to $ID.nex\n";
	print {$out} "#NEXUS\n\n";
	print {$out} "BEGIN DATA;\n";
	print {$out} "\tDIMENSIONS NTAX=$taxa NCHAR=$nchar;\n";
	print {$out} "\tFORMAT DATATYPE=DNA MISSING=? GAP=- ;\n\n";
	print {$out} "MATRIX\n";

	foreach my $key ( @order ){
		print {$out} "$key\t$data{$key}\n";
	}
	print {$out} ";\n";

	print {$out} "END;\n\n";

	close $out;
}


exit;
###########################SUBROUTINES###################################

#Reads the command line into @input and dies with the usage text if no
#input was given
sub parseArgs{
	#Message to print if mandatory variables not declared
	my $usage ="\nUsage: $0 --i /path/to/input/directory/*.fasta
Mandatory
	-i, --input - path to the input files in fasta format
\n";

	my $options = GetOptions
		(
		'input|i=s{1,}' => \@input,
		);

	@input or die "\n\nError: Input not specified!\n\n$usage\n";
}

#########################################################################
#Returns header for Phylip file from a dictionary of samples w/ data
def getPhylipHeader(d):
    """Return the PHYLIP header line "<samples> <length>" for alignment ``d``.

    ``d`` maps sample names to sequences (anything ``len()`` accepts). The
    length reported is that of the first sample, or 0 for an empty
    dictionary. Warnings are printed to stdout, not raised, when sequences
    differ in length, when the length is 0, and when there are no samples.
    """
    numSamp = len(d)
    lengths = [len(seq) for seq in d.values()]
    #The first sample is the reference length. A real length of 0 counts as
    #a length here (not as "unset"), so a later, longer sequence is reported
    #as a mismatch
    numLoci = lengths[0] if lengths else 0
    if any(length != numLoci for length in lengths):
        print("getPhylipHeader: Warning: Sequences of unequal length.")
    header = str(numSamp) + " " + str(numLoci)
    if numLoci == 0:
        print("getPhylipHeader: Warning: No loci in dictionary.")
    if numSamp == 0:
        print("getPhylipHeader: Warning: No samples in dictionary.")
    return(header)
#Read samples as PHYLIP. Generator function
def read_phylip(phy):
    """Yield ``(name, sequence)`` for each sample line of the PHYLIP file ``phy``.

    The first non-blank line is taken to be the "ntax nchar" header and is
    skipped. On every later non-blank line the first whitespace-delimited
    token is the sample name and all remaining tokens are joined into the
    sequence, so sequences written in space-separated blocks are read whole.
    Each line is treated as one complete sample; interleaved files are not
    reassembled.

    Raises:
        FileNotFoundError: if ``phy`` does not exist (raised when iteration
            starts, because this is a generator).
        ValueError: if a sample line has a name but no sequence.

    A read error during iteration prints a message and exits with status 1.
    """
    if not os.path.exists(phy):
        raise FileNotFoundError("File %s not found!"%phy)

    #The with-block closes the file; no explicit close is needed
    with open(phy, 'r') as fh:
        try:
            header_seen = False
            for line in fh:
                fields = line.split()
                if not fields:
                    continue
                if not header_seen:
                    header_seen = True
                    continue
                if len(fields) < 2:
                    raise ValueError("No sequence found for sample %s in file %s"%(fields[0], phy))
                yield(fields[0], "".join(fields[1:]))
        except IOError:
            print("Could not read file ",phy)
            sys.exit(1)
options: 161 | if o in ("-h", "-help", "--help"): 162 | self.display_help("Exiting because help menu was called.") 163 | 164 | #Second pass to set all args. 165 | for opt, arg_raw in options: 166 | arg = arg_raw.replace(" ","") 167 | arg = arg.strip() 168 | opt = opt.replace("-","") 169 | #print(opt,arg) 170 | if opt =="f" or opt=="fasta": 171 | self.fasta = arg 172 | elif opt =="p" or opt=="phy": 173 | self.phylip = arg 174 | elif opt =="h" or opt == "help": 175 | pass 176 | else: 177 | assert False, "Unhandled option %r"%opt 178 | 179 | #Check manditory options are set 180 | if not self.fasta and not self.phylip: 181 | self.display_help("Must provide either a FASTA or PHYLIP file.") 182 | 183 | if self.fasta and self.phylip: 184 | self.display_help("Must provide either a FASTA or PHYLIP file.") 185 | 186 | #get output prefix if not set by user 187 | if self.fasta: 188 | self.out = os.path.splitext(self.fasta)[0] + '.phylip' 189 | elif self.phylip: 190 | self.out = os.path.splitext(self.phylip)[0] + '.fasta' 191 | 192 | def display_help(self, message=None): 193 | if message is not None: 194 | print() 195 | print (message) 196 | print ("\nfasta2phylip.py\n") 197 | print ("Contact:Tyler K. 
#Function to write fasta-formatted sequences
def write_fasta(f, aln, width=None):
    """Write the sequences in ``aln`` to the file ``f`` in FASTA format.

    Args:
        f: path of the file to create or overwrite.
        aln: mapping of sample name to sequence; records are written in the
            mapping's iteration order.
        width: characters per sequence line. ``None`` or 0 writes each
            sequence on a single line.

    A failure while writing (including a negative ``width``) prints a
    message and exits with status 1.
    """
    #The with-block closes the file; no explicit close is needed
    with open(f, 'w') as fh:
        try:
            if width and width < 0:
                raise ValueError("invalid width %r (must be > 0)"%width)
            for samp in aln:
                seq = str(aln[samp])
                if width:
                    #Fixed-width slices: every line except the last is
                    #exactly `width` characters, whatever the sequence contains
                    lines = [seq[i:i + width] for i in range(0, len(seq), width)]
                else:
                    lines = [seq]
                ol = ">" + str(samp) + "\n" + "".join(chunk + "\n" for chunk in lines)
                fh.write(ol)
        except IOError as e:
            print("Could not write file %s: %s"%(f,e))
            sys.exit(1)
        except Exception as e:
            print("Unexpected error writing file %s: %s"%(f,e))
            sys.exit(1)
#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Lazily yield ``[name, sequence]`` pairs from a FASTA file.

    The name is the first whitespace-delimited token of the header with
    every ``>`` removed. Multi-line sequences are concatenated and blank
    lines ignored. Sequence text that appears before the first header, or
    under a header with an empty name, is discarded rather than being
    prepended to the next record.

    As before, a named record followed by another header is yielded even
    when its sequence is empty, while a final record is yielded only if it
    has sequence.

    Args:
        fas: Path to the FASTA file.

    Yields:
        list: ``[name, sequence]``.

    Raises:
        FileNotFoundError: If ``fas`` does not exist (raised on first
            iteration, since this is a generator).
        SystemExit: With status 1 if the file cannot be read.
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    try:
        with open(fas, 'r') as fh:
            contig = ""
            chunks = []   # pieces of the current sequence, joined on yield
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">":  # Found a header line
                    if contig:
                        yield [contig, "".join(chunks)]
                    # Always start the new record clean so orphaned
                    # sequence lines cannot leak into it.
                    chunks = []
                    contig = line.split()[0].replace(">", "")
                else:
                    chunks.append(line)
            # Yield last sequence, if it has both a header and sequence
            if contig and chunks:
                yield [contig, "".join(chunks)]
    except IOError:
        print("Could not read file ", fas)
        sys.exit(1)
#Object to parse command-line arguments
class parseArgs():
    """Parse and validate the fastaFormatter.py command line.

    Attributes:
        one2many: Path of a FASTA to rewrite in multi-line form, or None.
        many2one: Path of a FASTA to rewrite in one-line form, or None.
        width: Characters per line for multi-line output (default 60).
        out: Output file name (default "out.fas").

    Any parsing or validation problem prints the help menu and exits.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(
                sys.argv[1:], 'h1:M:w:o:',
                ["help", "one2many=", "many2one=", "width=", "out="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        self.one2many = None
        self.many2one = None
        self.width = 60
        self.out = "out.fas"

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            # Spaces are stripped from every value, as in the other scripts.
            arg = arg_raw.replace(" ", "").strip()
            opt = opt.replace("-", "")
            if opt in ("h", "help"):
                continue
            elif opt in ("one2many", "1"):
                self.one2many = arg
            elif opt in ("many2one", "M"):
                self.many2one = arg
            elif opt in ("width", "w"):
                # Validate here so a bad value gives the help menu, not a
                # traceback now or a failure later when writing.
                try:
                    self.width = int(arg)
                except ValueError:
                    self.display_help("Width (-w) must be an integer, got %r" % arg)
                if self.width < 1:
                    self.display_help("Width (-w) must be at least 1, got %r" % arg)
            elif opt in ("out", "o"):
                self.out = arg
            else:
                self.display_help("Unhandled option %r" % opt)

        #Check mandatory options are set
        if not self.one2many and not self.many2one:
            self.display_help("No files provided.")

    def display_help(self, message=None):
        """Print an optional message followed by the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nfastaFormatter.py\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description:Right now just converts b/n multi-line and one-line fasta formats, might add later")
        print("""
        -1,--one2many : Path to fasta file to multi-line format
        -M,--many2one : Path to fasta file to convert to one-line format
        -w,--width    : Characters per line for multi-line (default: 60)
        -o,--out      : Output file name (default=out.fas)
""")
        print()
        sys.exit()
SPOOF = True  #hard coded option: print missing quartets as equal-CF rows

#Concordance factors written for quartets that were never sampled
EQUAL_CFS = "0.333333333333334,0.333333333333333,0.333333333333333"


def _read_sampled_quartets(path):
    """Return the sampled quartets of a CF table as a set of frozensets.

    The first four comma-separated fields of each non-blank line are taken
    as taxon names; their order is ignored.
    """
    sampled = set()
    with open(path, 'r') as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            sampled.add(frozenset(line.split(",")[0:4]))
    return sampled


def _read_taxa(path):
    """Return the non-blank, stripped lines of the taxon list file."""
    with open(path, 'r') as fh:
        return [line.strip() for line in fh if line.strip()]


def main(argv=None):
    """Print every taxon quartet that is absent from a CF table.

    Args:
        argv: Argument vector ``[prog, CF_table, taxa_list]``; defaults to
            ``sys.argv``.

    Each missing quartet is printed with its taxa sorted, followed by
    three equal concordance factors when ``SPOOF`` is set.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print("Usage: fill_quartets.py <CF_table.csv> <taxa_list.txt>")
        sys.exit(1)

    cf_path, taxa_path = argv[1], argv[2]

    try:
        sampled = _read_sampled_quartets(cf_path)
    except IOError:
        print("Could not read file ", cf_path)
        sys.exit(1)

    try:
        all_tax = _read_taxa(taxa_path)
    except IOError:
        print("Could not read file ", taxa_path)
        sys.exit(1)

    #print("Writing all missing quartets to stdout...")
    # Combinations are streamed and tested by hash lookup: the number of
    # quartets grows as C(n, 4), so neither the full list nor a scan of
    # every sampled quartet per candidate is affordable.
    for comb in itertools.combinations(all_tax, 4):
        if frozenset(comb) in sampled:
            continue
        quartet = sorted(comb)
        if SPOOF:
            print(",".join(str(tax) for tax in quartet) + "," + EQUAL_CFS)
        else:
            print(quartet)


if __name__ == '__main__':
    main()
#!/bin/bash

#Tyler K. Chafin
#July 23 2021
#Keeps only the sequences of a FASTA file that are at least as long as the
#median sequence length, and writes them to <input>.filter
#Assumes every sequence sits on a single line directly below its header
#Email: tylerkchafin@gmail.com with issues

if [ -n "$1" ];
then
	fasta="$1"
else
	printf "\nUsage: %s <input.fasta>\n\n" "$0"
	exit 1
fi

#calculate median sequence length
#An even number of sequences can give a median ending in .5; it is rounded up
#so that it is a valid integer bound for the grep interval below
median=$(grep -v ">" "$fasta" | awk '{print length($0)}' | sort -n | awk '
	{a[NR]=$0}
	END{
		if (NR == 0) { print 0; exit }
		m = (NR % 2 == 1) ? a[(NR + 1) / 2] : (a[NR / 2] + a[NR / 2 + 1]) / 2
		if (m != int(m)) m = int(m) + 1
		print m
	}')

#select out sequences equal to or above median length
grep -B1 "^[A-Za-z]\{$median,\}" "$fasta" | sed "/^--$/d" > "$fasta.filter"
#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long;
use File::Path;
use File::Copy;
use File::Basename;

my @DIR;
my $cutoff;
my $blacklist=0;

parseArgs();

#Use File::Basename to capture the directory that holds the input files
my ($filepath, $dirpath) = fileparse( $DIR[0] );

#If user toggled "blacklist loci" on, (re)create an empty blacklist directory.
#The name is declared unconditionally: "my $x" behind a condition has
#undefined behaviour in Perl.
my $bname = "blacklist";
if ( $blacklist ){
	rmtree "$dirpath/$bname";
	mkdir "$dirpath/$bname" or die "\nDerp: Can't create $dirpath/$bname: $!\n";
}

#Iterate through files (expanding any glob patterns that were passed in)
@DIR = glob "@DIR";
foreach my $file (@DIR){

	#Skip directories, including the blacklist directory itself
	next unless -f $file;

	#Count the samples at this locus: one FASTA header line per sample
	my $N = 0;
	open ( my $fh, "<", $file ) or die "\nDerp: Can't open file $file: $!\n";
	while ( my $line = <$fh> ){
		$N++ if $line =~ /^>/;
	}
	#Close before the file is moved or deleted
	close $fh;

	next if $N >= $cutoff;

	if ( $blacklist ){
		move( $file, "$dirpath/$bname/" ) or warn "Derp: Can't move $file: $!\n";
	}else{
		unlink $file or warn "Derp: Can't delete $file: $!\n";
	}
}


##############################################SUBROUTINES###########################################

#Parse command-line options into @DIR, $cutoff and $blacklist; die with the
#usage text when a mandatory option is missing
sub parseArgs{

	my $usage="\nfilter_loci.pl takes a directory full of fasta files, each representing a single locus, and deletes any loci with insufficient coverage across samples, using a user-specified cut off value.

Usage: $0 --i=/path/to/*.fasta --x=# [--b]

Mandatory Variables
	-i, --input	-  Path to fasta files
	-x, --cutoff	-  Integer indicating the minimum number of samples to retain a locus
Options
	-b, --blacklist	-  Retain dropped loci in a blacklisted_loci directory
";

	my $results = GetOptions
	(
	'input|i=s{1,}'	=> \@DIR,
	'cutoff|x=i'	=> \$cutoff,
	'blacklist|b!'	=> \$blacklist,
	);

	@DIR or die "\nDerp: Input directory not defined!\n\n$usage";
	$cutoff or die "\nDerp: Minimum coverage required to retain a locus not defined!\n\n$usage";

}
def main():
    """Split each contig of a VCF into regions holding a fixed number of
    parsimony-informative SNPs and write them to ``out.regions``.

    A region is closed each time ``params.force`` parsimony-informative
    sites have been seen; the remainder of every contig becomes a final
    region ending at the contig length declared in the VCF header.
    """
    params = parseArgs()

    regions = list()
    with open(params.vcf, 'r') as handle:
        vfh = vcf.Reader(handle)

        #grab contig sizes
        contigs = {s.id: s.length for s in vfh.contigs.values()}

        this_chrom = None
        start = 1
        count = 0
        for rec in vfh:
            #If we entered new chromosome, submit old break
            if this_chrom != rec.CHROM:
                if this_chrom is not None:
                    _close_contig(regions, this_chrom, start, contigs)
                this_chrom = rec.CHROM
                start = 1
                count = 0

            #if this SNP is parsimony-informative
            if rec.is_snp and not rec.is_monomorphic and is_PIS(rec):
                count += 1
                #if this is the final PIS, submit region to list
                if count == params.force:
                    regions.append((this_chrom, start, rec.POS))
                    start = rec.POS + 1
                    count = 0

        # An empty VCF leaves this_chrom unset; there is nothing to close.
        if this_chrom is not None:
            _close_contig(regions, this_chrom, start, contigs)

    print("Writing regions to out.regions...")
    write_regions("out.regions", regions)


def _close_contig(regions, chrom, start, contigs):
    """Append the region from ``start`` to the end of ``chrom``, if any remains."""
    if chrom not in contigs:
        print("Contig %s has no ##contig length in the VCF header" % chrom)
        sys.exit(1)
    end = contigs[chrom]
    # A break on the last base leaves start == end + 1; emitting that would
    # produce an inverted interval.
    if start <= end:
        regions.append((chrom, start, end))


#Function to write list of regions tuples, in GATK format
def write_regions(f, r):
    """Write ``(contig, start, stop)`` tuples to ``f`` as ``contig:start-stop``.

    Args:
        f: Output path (overwritten if it exists).
        r: Iterable of 3-item sequences.

    Raises:
        SystemExit: With status 1 if the file cannot be opened or written.
    """
    try:
        with open(f, 'w') as fh:
            for reg in r:
                fh.write(str(reg[0]) + ":" + str(reg[1]) + "-" + str(reg[2]) + "\n")
    except OSError as e:
        print("Could not write file %s: %s" % (f, e))
        sys.exit(1)
#Function to check pyVCF record for if parsimony informative or not
def is_PIS(r):
    """Return True if a PyVCF record is parsimony-informative.

    A site qualifies when at least two called samples carry the reference
    allele and at least two carry an alternate allele. PyVCF encodes
    ``gt_type`` as 0 = hom-ref, 1 = het, 2 = hom-alt, None = uncalled, so a
    het sample counts toward both alleles. The test must be ``is None``
    rather than truthiness, because hom-ref is 0.

    Args:
        r: Record exposing ``samples``, each with a ``gt_type`` attribute.

    Returns:
        bool
    """
    ref = 0
    alt = 0
    for call in r.samples:
        gt = call.gt_type
        if gt is None:
            continue
        if gt in (0, 1):
            ref += 1
        if gt in (1, 2):
            alt += 1
        if ref >= 2 and alt >= 2:
            return True
    return False


#Object to parse command-line arguments
class parseArgs():
    """Parse and validate the findBreaksVCF.py command line.

    Attributes:
        vcf: Path to the input VCF (required).
        force: Number of parsimony-informative sites per region
            (default 100000).
    """

    def __init__(self):
        #Define options
        try:
            # The comma after "vcf=" matters: without it the two adjacent
            # literals fuse into the single long option "vcf=help".
            options, remainder = getopt.getopt(
                sys.argv[1:], 'v:f:h',
                ["vcf=", "help", "force="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        self.vcf = None
        self.force = 100000

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ", "").strip()
            opt = opt.replace("-", "")
            if opt in ('v', 'vcf'):
                self.vcf = arg
            elif opt in ('f', 'force'):
                try:
                    self.force = int(arg)
                except ValueError:
                    self.display_help("Force (-f) must be an integer, got %r" % arg)
            elif opt in ('h', 'help'):
                pass
            else:
                self.display_help("Unhandled option %r" % opt)

        #Check mandatory options are set
        if not self.vcf:
            self.display_help("Must provide VCF file <-v,--vcf>")

    def display_help(self, message=None):
        """Print an optional message followed by the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nfindBreaksVCF.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("\nUsage: ", sys.argv[0], "-v <input.vcf> -f <100000>\n")
        print ("Description: Breaks chromosomes into chunks of X parsimony-informative sites, for running MDL")

        print("""
    Arguments:
        -v,--vcf    : VCF file for parsing
        -f,--force  : Number of PIS to force a break
        -h,--help   : Displays help menu

""")
        print()
        sys.exit()
#!/usr/bin/perl


use strict;
use warnings;
use Getopt::Long;


our $gff="";
our $genome="";

parseArgs(); #Call subroutine to parse arguments...

#Genome sequence, read once and shared by every seqsFromGFF call
my $dna = readGenome( $genome );

#Call subroutine- finds each element of the given type in the gff and prints
#the sequence(s) annotated for it

seqsFromGFF( "CDS" );
seqsFromGFF( "rRNA" );
seqsFromGFF( "tRNA" );


exit;

###############################SUBROUTINES######################################

#Subroutine to parse command line arguments
sub parseArgs{

	my $usage = "\nUsage: $0 --genome=whole_genome.fasta --gff=annotations.gff

mandatory
	--genome	- FASTA file containing sequences to parse
	--gff		- GFF file containing gene annotations \n\n";


	my $result = GetOptions
	(
	'genome=s'	=> \$genome,
	'gff=s'		=> \$gff,

	);

	$genome ne "" || die $usage; #Die if mandatory variables undefined
	$gff ne "" || die $usage;

}


#Subroutine to load a FASTA file as one string: every line that does not
#contain ">" is appended, so multiple records are concatenated
sub readGenome{

	my $path = $_[0];
	my $seq = "";

	open ( my $fh, "<", $path ) || die "Derp: Can't open file $path!";

	while ( my $row = <$fh> ){
		if ( $row !~ />/ ){
			chomp $row;
			$seq .= $row;
		}
	}

	close $fh;

	return $seq;
}


#Subroutine to parse gff and genome for particular type of element

sub seqsFromGFF{

	my $type = $_[0];
	my %genes;

	open ( my $gfh, "<", $gff ) || die "Derp: Can't open file $gff!";

	while ( my $record = <$gfh> ){
		chomp $record;

		#Comment lines and truncated records have no usable columns
		next if $record =~ /^#/;
		my @line = split /\t/, $record;
		next if @line < 9;
		next unless uc( $line[2] ) eq uc( $type );

		#NOTE(review): column 6 ($line[5]) is used as the feature length.
		#In standard GFF that column is the score and the length would be
		#$line[4]-$line[3]+1 -- confirm against the GFF dialect in use.
		my $subseq = substr ( $dna, $line[3]-1, $line[5] );

		#Second whitespace-delimited token of the attribute column is the name
		my @info = split /\s/, $line[8];
		my $name = defined $info[1] ? $info[1] : "";

		#Reverse complement if on opposite strand
		$line[6] =~ /-/ and $subseq = revcom( $subseq );

		#Attributes of the form "<key> <name> exon <n>" give the exon number
		my $exon = 1;
		if ( defined $info[2] and uc( $info[2] ) eq "EXON" and defined $info[3] ){
			$exon = $info[3];
		}

		#Each element maps to an array of exon sequences, indexed by exon number
		$genes{$name} = [ "" ] unless exists $genes{$name};
		$genes{$name}->[$exon-1] = $subseq;
	}

	close $gfh;

	print "\nSequences for element type \"$type\": \n\n";

	foreach my $key ( keys %genes ){
		print "\>$key\n";
		#Exons are printed in order, separated by single spaces
		print join( " ", map { defined $_ ? $_ : "" } @{$genes{$key}} )."\n";

	}
}

###################################################################################################

#Subroutine returning the reverse complement of a DNA string
sub revcom {

	my $DNA = reverse ( $_[0] );

	$DNA =~ tr/ACGTacgt/TGCAtgca/;

	return $DNA;

}
def main():
    """Rewrite an ipyrad VCF so its per-sample CATG depths become AD depths.

    Header lines are copied through, with an ``AD`` FORMAT definition
    inserted just before the ``#CHROM`` line. For each record the FORMAT
    column ``GT:DP:CATG`` becomes ``GT:DP:AD`` and every sample's four CATG
    counts are reduced to the counts of the REF and ALT alleles, in that
    order.
    """
    params = parseArgs()

    # A complete meta-information line; a bare "##FORMAT=" is not valid VCF.
    ad_header = ('##FORMAT=<ID=AD,Number=R,Type=Integer,Description='
                 '"Allelic depths for the ref and alt alleles in the order listed">\n')

    with open(params.vcf, "r") as vcf, open(params.out, 'w') as f:
        for line in vcf:
            line = line.strip()
            if not line:
                continue

            #directly transfer header lines
            if line.startswith("#"):
                if not line.startswith("##"):
                    f.write(ad_header)
                f.write(line)
                f.write("\n")
                continue

            fields = line.split("\t")
            ref = get_index(fields[3].split(","))
            alt = get_index(fields[4].split(","))
            #if biallelic filter on and site has >2 alleles, skip
            if params.biallelic and len(ref) + len(alt) > 2:
                continue
            if fields[8] == "GT:DP:CATG":
                fields[8] = "GT:DP:AD"
            else:
                print("Something wrong with VCF. Field 8 should be GT:DP:CATG")
                sys.exit(1)
            # NOTE(review): every sample field is written with a trailing
            # ":", as in the original; confirm the downstream reader
            # expects this.
            for idx, sample in enumerate(fields[9:]):
                fields[idx + 9] = str(fix_sample(sample, ref, alt)) + ":"
            f.write("\t".join(fields))
            f.write("\n")


def fix_sample(sample, ref, alt):
    """Replace a sample's CATG depths with the depths of its REF/ALT alleles.

    Args:
        sample: Sample field of the form ``GT:DP:c,a,t,g``.
        ref: CATG indices of the reference allele(s).
        alt: CATG indices of the alternate allele(s).

    Returns:
        str: The sample field with its third sub-field reduced to the
        selected depths, reference first.
    """
    fields = sample.split(":")
    catg = fields[2].split(",")
    fields[2] = ",".join(catg[i] for i in list(ref) + list(alt))
    return ":".join(fields)


def get_index(char):
    """Map nucleotide characters to their position in ipyrad's CATG order.

    Args:
        char: Iterable of single-letter alleles (case-insensitive).

    Returns:
        list[int]: 0 for C, 1 for A, 2 for T, 3 for G, in input order.

    Raises:
        SystemExit: With status 1 on any other character.
    """
    order = {"c": 0, "a": 1, "t": 2, "g": 3}
    ret = list()
    for c in char:
        key = c.lower()
        if key not in order:
            print("Unrecognized character", c)
            sys.exit(1)
        ret.append(order[key])
    return ret
#Object to parse command-line arguments
class parseArgs():
    """Command-line options for ipyrad2polyrad.py.

    Attributes:
        vcf: Path to the ipyrad VCF (required).
        out: Output file name (default "polyrad.vcf").
        biallelic: True when non-biallelic sites should be skipped.
    """

    def __init__(self):
        short_opts = 'hv:o:b'
        long_opts = ["help", "vcf=", "out=", "biallelic"]
        try:
            options, remainder = getopt.getopt(sys.argv[1:], short_opts, long_opts)
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        # Defaults
        self.vcf = None
        self.out = "polyrad.vcf"
        self.biallelic = False

        # A help request wins over everything else
        if any(flag in ("-h", "-help", "--help") for flag, _ in options):
            self.display_help("Exiting because help menu was called.")

        for flag, raw in options:
            # All spaces are removed from option values
            value = raw.replace(" ", "").strip()
            key = flag.replace("-", "")
            if key in ("h", "help"):
                continue
            if key in ("vcf", "v"):
                self.vcf = value
            elif key in ("out", "o"):
                self.out = value
            elif key in ("biallelic", "b"):
                self.biallelic = True
            else:
                assert False, "Unhandled option %r"%key

        # The input VCF is mandatory
        if not self.vcf:
            self.display_help("Need an ipyrad VCF file")

    def display_help(self, message=None):
        """Show an optional message and the usage text, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nipyrad2polyrad.py\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description:Converts the ipyrad VCF to a format usable for polyRAD")
        print("""
        -v,--vcf       : VCF input with ipyrad "CATG" field
        -b,--biallelic : [Boolean] Toggle to skip non-biallelic sites
        -o,--out       : Output file name (default=polyrad.vcf)
""")
        print()
        sys.exit()
def main():
    """Lift the positions of a tab-delimited table over to another assembly.

    Adds the lifted coordinate as column ``params.ocol``, writes the table
    to ``params.oname`` and, when requested, writes a Marey Map input file
    next to it. Loci that cannot be lifted get a missing value and are
    left out of the Marey Map file.
    """
    params = parseArgs()
    if not params.liftover:
        params.display_help("Error: No liftover file provided")
    if not params.table:
        params.display_help("Error: No table provided")

    lo = pyliftover.LiftOver(params.liftover)
    tab = pd.read_csv(params.table, sep="\t")
    print("Read table:")
    print(tab)

    lifted = [_lift_position(lo, chrom, pos)
              for chrom, pos in zip(tab[params.chrom], tab[params.bp])]
    unmapped = sum(1 for value in lifted if value is None)
    if unmapped:
        print("WARNING: %d position(s) could not be lifted over" % unmapped)

    # Nullable integer dtype keeps mapped positions as integers even when
    # some rows are missing.
    tab[params.ocol] = pd.array(lifted, dtype="Int64")
    print("Writing the output table:")
    print(tab)
    tab.to_csv(params.oname, sep="\t", index=False)

    if params.marey:
        marey = make_marey(tab.dropna(subset=[params.ocol]), params.chrom, params.ocol)
        print("Created the following Marey Map input:")
        print(marey)
        mout = params.oname + "_mmap.txt"
        marey.to_csv(mout, sep=" ", quoting=csv.QUOTE_NONNUMERIC, index=False)


def _lift_position(lo, chrom, pos):
    """Return the lifted coordinate of one locus, or None if it has no match.

    ``chrom`` is converted to text first, because pandas reads purely
    numeric chromosome names as integers.
    """
    # NOTE(review): pyliftover documents its coordinates as 0-based; the
    # table position is passed through unchanged, as before -- confirm.
    hits = lo.convert_coordinate("chr" + str(chrom), int(pos))
    # None means the chromosome is unknown; an empty list means the locus
    # has no counterpart in the target assembly.
    if not hits:
        return None
    return int(hits[0][1])


#function writes a spoof marey map file from a table of :
#chr \t bp \t cM \t liftover.bp
def make_marey(table, chrom, bp):
    """Build a Marey Map input table from a lifted-over coordinate table.

    Args:
        table: DataFrame holding the chromosome column, the physical
            position column and a ``cM`` column.
        chrom: Name of the chromosome column.
        bp: Name of the physical position column.

    Returns:
        DataFrame with columns ``map`` ("chr" + chromosome), ``set``
        ("fakeset"), ``mkr`` ("fakemarker"), ``phys`` (int) and ``gen``
        (float), sharing the index of ``table``.
    """
    ret = pd.DataFrame()
    ret["map"] = "chr" + table[chrom].astype(str)
    ret["set"] = "fakeset"
    ret["mkr"] = "fakemarker"
    ret["phys"] = table[bp].astype(int)
    ret["gen"] = table["cM"].astype(float)
    return ret
#Object to parse command-line arguments
class parseArgs():
    """Parse and validate the liftoverCoords.py command line.

    Attributes:
        table: Path to the tab-delimited coordinate table (required).
        liftover: Path to the .over.chain.gz file (required).
        chrom: Column holding chromosome names (default "chr").
        bp: Column holding physical positions (default "bp").
        ocol: Name of the output column (default "liftover.bp").
        oname: Output file name (default: table path + ".liftover").
        marey: True when a Marey Map input file should also be written.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(
                sys.argv[1:], 'hf:t:p:c:n:o:m', ["help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        self.table = None
        self.liftover = None
        self.chrom = "chr"
        self.bp = "bp"
        self.ocol = "liftover.bp"
        self.oname = None
        self.marey = False

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ", "").strip()
            opt = opt.replace("-", "")
            if opt in ("h", "help"):
                continue
            elif opt == "f":
                self.liftover = arg
            elif opt == "t":
                self.table = arg
            elif opt == "p":
                self.bp = arg
            elif opt == "c":
                self.chrom = str(arg)
            elif opt == "n":
                self.ocol = str(arg)
            elif opt == "o":
                self.oname = str(arg)
            elif opt == "m":
                self.marey = True
            else:
                self.display_help("Unhandled option %r" % opt)

        #Check mandatory options are set
        if not self.liftover or not self.table:
            self.display_help("No files provided.")

        # Derive the output name only when -o was not given; assigning it
        # unconditionally would discard the user's choice.
        if not self.oname:
            self.oname = self.table + ".liftover"

    def display_help(self, message=None):
        """Print an optional message followed by the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nliftoverCoords.py\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description: Converts a table of physical positions from one genome assembly to another given an \".over.chain.gz\" database")
        print("""
    Arguments:
        -h, --help : Display help menu
        -f         : Path to .over.chain.gz file
        -t         : Tab-delimited table including coordinates
        -p         : Column name in table containing the physical (bp) coordinates
                     [default = \"bp\"]
        -c         : Column name in table containing the chromosome names
                     [default = \"chr\"]
        -n         : Output column name for new table
                     [default = \"liftover.bp\"]
        -o         : Output file name
                     [default = \"<table>.liftover\"]
        -m         : (Boolean) Additionally output Marey-Map input file

    NOTE: Chromosomes should be named e.g. as \"chr1\" or \"chrX\" in the
        .over.chain.gz file, but without the \"chr\" in the table file """)
        print()
        sys.exit()
as \"chr1\" or \"chrX\" in the 133 | .over.chain.gz file, but without the \"chr\" in the table file """) 134 | print() 135 | sys.exit() 136 | 137 | #Call main function 138 | if __name__ == '__main__': 139 | main() 140 | -------------------------------------------------------------------------------- /liftoverFromPafscaff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import os 5 | import getopt 6 | import csv 7 | import pandas as pd 8 | import functools 9 | 10 | #sorry this code isn't really commented 11 | #i'm in a hurry 12 | #and tired 13 | 14 | def main(): 15 | params = parseArgs() 16 | mapper=dict() 17 | coords=pd.read_csv(params.coords, sep="\t", names=["scaffold", "scaffold_pos"]) 18 | 19 | #capture mappings from pafscaff headers 20 | with open(params.paf, "r") as fh: 21 | for line in fh: 22 | l=line.split() 23 | chr=l[0].split(".")[0].replace(">","") 24 | if l[1] == "RevComp": 25 | revcomp=True 26 | scaffold=l[2] 27 | start=int(l[7].split(":")[0].replace(",","")) 28 | end=int(l[7].split(":")[1].replace(";","").replace(",","")) 29 | else: 30 | revcomp=False 31 | scaffold=l[1] 32 | #print(l) 33 | #print(line[6]) 34 | start=int(l[6].split(":")[0].replace(",","")) 35 | end=int(l[6].split(":")[1].replace(";","").replace(",","")) 36 | mapper[scaffold]=[chr, revcomp, start, end] 37 | #print(scaffold, ":", mapper[scaffold]) 38 | 39 | #sys.exit() 40 | 41 | #...watch out for off-by-one errors 42 | def liftover(mapper, row): 43 | #print(row) 44 | if row[0] in mapper: 45 | convert = mapper[row[0]] 46 | #print(convert) 47 | if convert[1]: 48 | #revcomp 49 | new_coord=convert[3]-row[1] 50 | else: 51 | #not revcomp 52 | new_coord=convert[2]+row[1] 53 | row['chrom']=convert[0] 54 | row['chrom_pos']=new_coord 55 | else: 56 | #print("Scaffold",str(row[0]), "not placed in pafscaff output" 57 | row['chrom'] = "NA" 58 | row['chrom_pos'] = 0 59 | return(row) 60 | #return(["NA", 0]) 61 | 
#print(coords) 62 | liftover_call = functools.partial(liftover, mapper) 63 | coords=coords.apply(liftover_call, axis = 1) 64 | print(coords) 65 | 66 | coords.to_csv(params.out, sep="\t", 67 | header=True, quoting=False, 68 | index=False) 69 | 70 | 71 | 72 | 73 | #Object to parse command-line arguments 74 | class parseArgs(): 75 | def __init__(self): 76 | #Define options 77 | try: 78 | options, remainder = getopt.getopt(sys.argv[1:], 'hp:c:o:', \ 79 | ["help", "out=", "paf=", "coords="]) 80 | except getopt.GetoptError as err: 81 | print(err) 82 | self.display_help("\nExiting because getopt returned non-zero exit status.") 83 | #Default values for params 84 | #Input params 85 | self.coords = None 86 | self.paf = None 87 | self.out = "out.txt" 88 | 89 | #First pass to see if help menu was called 90 | for o, a in options: 91 | if o in ("-h", "-help", "--help"): 92 | self.display_help("Exiting because help menu was called.") 93 | 94 | #Second pass to set all args. 95 | for opt, arg_raw in options: 96 | arg = arg_raw.replace(" ","") 97 | arg = arg.strip() 98 | opt = opt.replace("-","") 99 | #print(opt,arg) 100 | if opt == "h" or opt == "help": 101 | continue 102 | elif opt == "c" or opt=="coords": 103 | self.coords=arg 104 | elif opt == "p" or opt=="paf": 105 | self.paf=arg 106 | elif opt =="o" or opt=="out": 107 | self.out=arg 108 | else: 109 | assert False, "Unhandled option %r"%opt 110 | 111 | #Check manditory options are set 112 | if not self.paf: 113 | self.display_help("No paf provided.") 114 | if not self.coords: 115 | self.display_help("No coordinates provided.") 116 | 117 | 118 | def display_help(self, message=None): 119 | if message is not None: 120 | print() 121 | print (message) 122 | print ("\nliftoverFromPafscaff.py\n") 123 | print("Author: Tyler K Chafin, University of Colorado") 124 | print ("Contact: tyler.chafin@colorado.edu") 125 | print ("Description: Converts a given set of coordinates (e.g., from a VCF file) to a new coordinate system, as mapped by 
def main():
    """Write a PHYLIP alignment and matching map file ordered by population.

    Reads a PHYLIP alignment and a popmap, keeps the samples present in
    both (after any --include/--exclude selection), and writes
    ``<out>.phy`` and ``<out>.map`` with samples grouped by population in
    sorted population order.
    """
    params = parseArgs()

    if params.phylip:
        #Get sequences as dict of lists
        seqs = readPhylip(params.phylip)
    else:
        print("No input provided.")
        sys.exit(1)

    #parse popmap file for dictionary of sample assignments
    if params.popmap:
        print("Parsing popmap file...")
        pop_assign = parsePopmap(params.popmap)
    else:
        print("ERROR: Popmap file must be provided.")
        sys.exit(1)

    # With no sequences or no assignments there is nothing to write, so
    # stop with an explicit error instead of falling through.
    if not seqs or not pop_assign:
        print("ERROR: No sequences or no popmap assignments were read; nothing to write.")
        sys.exit(1)

    #Remove samples from pop_assign that do not have data
    pop_assign = cleanPopmap(pop_assign, seqs.keys())

    #Make dict of dicts that splits by population, only retaining pops/samples from the popmap.
    pops = {k: dict() for k in set(pop_assign.values())}

    #Remove pops listed as excluded
    if params.exclude:
        print("Excluding populations:", ", ".join(params.exclude))
        for exc in params.exclude:
            if exc in pops:
                del pops[exc]
    if params.include:
        print("Only keeping populations:", ", ".join(params.include))
        for pop in list(pops):
            if pop not in params.include:
                del pops[pop]

    #make sure we didn't throw out all populations...
    if len(pops) < 1:
        print("Oops! No populations remaining. Check that popmap sample names match those in your data file, or that selections using --include or --exclude are correct! :)")
        sys.exit(1)

    alen = getSeqLen(seqs)
    inum = 0
    for assigned, pop in pop_assign.items():
        if pop in pops:
            pops[pop][assigned] = seqs[assigned]
            inum += 1
    seqs.clear()

    # The per-column matrix the script used to build here was never read,
    # but cost O(samples x sites) memory; only the column count is reported.
    print("Found",alen,"nucleotide columns in the dataset!")

    #Write new ordered output and phylip
    print("Writing outputs...")
    phy = params.out + ".phy"
    omap = params.out + ".map"

    with open(phy, "w") as pfh, open(omap, "w") as mfh:
        pfh.write(str(inum) + "\t" + str(alen) + "\n")
        for pop in sorted(pops):
            for ind, data in pops[pop].items():
                pfh.write(str(ind) + "\t" + "".join(data) + "\n")
                mfh.write(str(ind) + "\t" + str(pop) + "\n")

    print("Done!\n")
#Goes through a dict of sequences and get the alignment length
def getSeqLen(aln):
    """Return the alignment length of a mapping of sample -> sequence.

    The length is taken from the first non-empty sequence; a warning is
    printed for every later sequence of a different length. Returns None
    for an empty mapping.
    """
    length = None
    for seq in aln.values():
        if not length:
            length = len(seq)
        elif length != len(seq):
            print("getSeqLen: Alignment contains sequences of multiple lengths.")
    return length


#function reads a tab-delimited popmap file and return dictionary of assignments
def parsePopmap(popmap):
    """Read a popmap file into a dict of sample -> population.

    Each non-blank line supplies a sample name and a population ID as its
    first two whitespace-delimited fields. Lines with fewer than two
    fields are reported and skipped. Later lines override earlier ones for
    the same sample.

    Raises:
        FileNotFoundError: If ``popmap`` does not exist.
        SystemExit: With status 1 if the file cannot be read.
    """
    if not os.path.exists(popmap):
        raise FileNotFoundError("File %s not found!"%popmap)

    ret = dict()
    try:
        with open(popmap, 'r') as fh:
            for line in fh:
                stuff = line.split()
                if not stuff:
                    continue
                if len(stuff) < 2:
                    print("parsePopmap: Skipping malformed line:", line.strip())
                    continue
                ret[stuff[0]] = stuff[1]
    except IOError:
        # The message names the file actually being read.
        print("Could not read file ", popmap)
        sys.exit(1)
    return ret


#Function to remove samples from a popmap dict, given a list of valid samples (e.g. those to retain)
def cleanPopmap(popmap, names):
    """Return a new popmap dict restricted to the samples listed in ``names``.

    The input mapping is left unmodified.
    """
    valid = set(names)  # one pass over names, then O(1) membership tests
    return {ind: pop for ind, pop in popmap.items() if ind in valid}
#Function to read a phylip file. Returns dict (key=sample) of lists (sequences divided by site)
def readPhylip(phy):
    """Read a sequential PHYLIP file into a dict of sample name -> list of characters.

    The first non-blank line is treated as the dimensions header and skipped.
    Every other non-blank line is split on whitespace; the first field is the
    sample name and the second is the sequence.

    Raises:
        FileNotFoundError: if ``phy`` does not exist.
        ValueError: if a data line has a name but no sequence.
    Exits with status 1 if the file exists but cannot be read.
    """
    if not os.path.exists(phy):
        # The original formatted this message with an undefined name, which
        # turned a missing file into a NameError.
        raise FileNotFoundError("File %s not found!"%phy)

    ret = dict()
    try:
        with open(phy, 'r') as fh:
            seen_header = False
            for line in fh:
                fields = line.split()
                if not fields:
                    continue
                if not seen_header:
                    seen_header = True
                    continue
                if len(fields) < 2:
                    raise ValueError("PHYLIP file %s: no sequence found for sample %r"
                        %(phy, fields[0]))
                ret[fields[0]] = list(fields[1])
    except IOError:
        print("Could not read file ",phy)
        sys.exit(1)
    return ret
#Object to parse command-line arguments
class parseArgs():
    """Parse the makeHyde.py command line from ``sys.argv``.

    Attributes set on the instance:
        phylip  -- path to the input PHYLIP alignment (required)
        popmap  -- path to the population map (required)
        out     -- output prefix (default "out")
        exclude -- list of population IDs to drop (default empty)
        include -- list of population IDs to keep (default empty)

    Any usage problem prints the help menu and exits through ``display_help``.
    """
    def __init__(self):
        #Define options
        try:
            # "help" is registered so that --help is recognised directly
            # instead of being reported as an unknown option.
            options, remainder = getopt.getopt(sys.argv[1:], 'p:i:ho:X:I:',
                ["input=","phylip=","phy=","out=","popmap=","maxN=",
                "popN=","exclude=","include=","help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        self.phylip=None
        self.popmap=None
        self.out="out"
        self.exclude = list()
        self.include = list()

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            opt = opt.lstrip("-")
            if opt in ('i', 'phylip', 'input','phy'):
                # Paths are only trimmed; interior spaces are part of the name.
                self.phylip = arg_raw.strip()
            elif opt in ('p', 'popmap'):
                self.popmap = arg_raw.strip()
            elif opt in ('o','out'):
                self.out = arg_raw.strip()
            elif opt in ('X', 'exclude'):
                # Lists tolerate "Pop1, Pop2": all spaces are removed.
                self.exclude = arg_raw.replace(" ","").split(",")
            elif opt in ('I','include'):
                self.include = arg_raw.replace(" ","").split(",")
            elif opt in ('maxN', 'popN'):
                # Accepted by getopt but never implemented; warn rather than
                # falling through to the unhandled-option error.
                print("Warning: --%s is not implemented and will be ignored."%opt)
            else:
                raise AssertionError("Unhandled option %r"%opt)

        #Check manditory options are set
        if not self.phylip :
            self.display_help("Error: Missing required alignment file (--input)")
        if not self.popmap:
            self.display_help("Error: Missing required popmap file (-p, --popmap)")
        if self.include and self.exclude:
            self.display_help("Don't use both --include and --exclude.")


    def display_help(self, message=None):
        """Print ``message`` (if given) followed by the help menu, then exit."""
        if message is not None:
            print ("\n",message)
        print ("\nmakeHyde.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("\nUsage: ", sys.argv[0], "-i /path/to/phylip -p /path/to/popmap\n")
        print ("Description: Making inputs for HyDe and filtering populations for inclusion/exclusion")

        print("""
    Arguments:
        INPUT FILES [REQUIRED]
        -i,--input  : Input file as PHYLIP
        -p,--popmap : Tab-delimited population map

        PARAMETERS [OPTIONAL]
        -o,--out    : Output file name
        -X,--exclude: List of pops to exclude (format: -X "Pop1,Pop2,Sample4...")
        -I,--include: List of pops to include (removing all others)
        -h,--help   : Displays help menu

""")
        sys.exit()
def _write_lines(path, lines):
    """Write the strings in ``lines`` to ``path``; print an error and exit(1) on I/O failure."""
    try:
        with open(path, "w") as fh:
            fh.writelines(lines)
    except IOError:
        print("Could not open file", path)
        sys.exit(1)


def main():
    """Convert NewHybrids PofZ output into the input files DISTRUCT needs.

    Reads the population-ID list and the PofZ table named on the command
    line, then writes the INDIVQ file (params.out), a dummy POPQ file
    (params.popq), NH_labels.txt, NH_geno.perm and NH_params.txt.
    Individuals whose probabilities contain "nan" are skipped.
    """
    params = parseArgs()
    if not (params.pops and params.pofz):
        print("Missing required inputs.")
        sys.exit(1)

    #get pop IDS (one per PofZ row, in the same order)
    pop_ids = readList(params.pops)

    #get OrderedDict of prob results
    probs = readNewHybs(params.pofz)

    nan = 0
    popCount = dict()   #population ID -> number of retained individuals
    popEnum = dict()    #population ID -> 1-based code, in order of first appearance
    gen_cats = 0        #number of genotype categories per individual
    numInds = 0

    print("\nWriting INDIVQ file for distruct:",params.out)
    indiv_lines = []
    for index, (key, value) in enumerate(probs.items()):
        if "nan" in value:
            nan += 1
            continue #skip individuals which couldn't be assigned

        if index >= len(pop_ids):
            # Without this check a short pops file ends in a bare IndexError.
            print("Error: population file", params.pops,
                "has fewer entries than there are individuals in", params.pofz)
            sys.exit(1)

        numInds += 1
        pop = pop_ids[index]
        if pop not in popEnum:
            popEnum[pop] = len(popEnum) + 1
        popCount[pop] = popCount.get(pop, 0) + 1

        if gen_cats == 0:
            gen_cats = len(value)
        elif gen_cats != len(value):
            print("Warning: Samples don't have the same number of probabilities! Something is wrong")

        #build output line for INDIVQ
        indiv_lines.append(str(key) + "\t" + str(key) + "\t(0)\t" + str(popEnum[pop])
            + "\t: " + "\t".join(value) + "\n")
    _write_lines(params.out, indiv_lines)

    if nan > 0:
        print("Warning:",nan,"individuals had \"nan\" probabilities are were skipped.")

    print("Writing dummy POPQ file:", params.popq)
    _write_lines(params.popq,
        [str(enum) + ":" + "\t0.0" * gen_cats + "\t" + str(popCount[pop]) + "\n"
         for pop, enum in popEnum.items()])

    print("Writing Labels file: NH_labels.txt")
    _write_lines("NH_labels.txt",
        [str(enum) + " " + str(pop) + "\n" for pop, enum in popEnum.items()])

    print("Writing COLOR PERMUTATION file: NH_geno.perm")
    #(genotype class, colour shown to the user, ColorBrewer name), in cluster order
    colours = [
        ("P1", "Red", "RdGy_6_div_1"),
        ("P2", "Blue", "RdBu_6_div_6"),
        ("F1", "Green", "Greens_6_seq_5"),
        ("F2", "Light Green", "Greens_6_seq_2"),
        ("BO1", "Light red", "RdBu_6_div_3"),
        ("BO2", "Light Blue", "RdBu_6_div_4"),
    ]
    print()
    perm_lines = []
    for number, (label, shade, brewer) in enumerate(colours, start=1):
        print(label + ": " + shade + " (" + brewer + ")")
        perm_lines.append(str(number) + " " + brewer + "\n")
    _write_lines("NH_geno.perm", perm_lines)
    print()

    print("Writing Distruct paramsfile: NH_params.txt")
    _write_lines("NH_params.txt", [getParams(len(popEnum), numInds)])

    print("Done!\n")
def getParams(np, ni):
    """Return the text of a DISTRUCT parameter file.

    ``np`` is written as NUMPOPS and ``ni`` as NUMINDS; every other setting
    (file names, K, layout options) is fixed. The text starts with a blank
    line and has no trailing newline.
    """
    template = """
#define INFILE_POPQ NH_popq.txt
#define INFILE_INDIVQ NH_indivq.txt
#define INFILE_LABEL_BELOW NH_labels.txt
#define INFILE_LABEL_ATOP NH_labels.txt
#define INFILE_CLUST_PERM NH_geno.perm
#define OUTFILE NH.ps
#define K 6
#define NUMPOPS {numpops}
#define NUMINDS {numinds}
#define PRINT_INDIVS 1
#define PRINT_LABEL_ATOP 1
#define PRINT_LABEL_BELOW 0
#define PRINT_SEP 1
#define FONTHEIGHT 6
#define DIST_ABOVE -160
#define DIST_BELOW -50
#define BOXHEIGHT 150
#define INDIVWIDTH 2
#define ORIENTATION 1
#define XORIGIN 200
#define YORIGIN 10
#define XSCALE 1
#define YSCALE 1
#define ANGLE_LABEL_ATOP 270
#define ANGLE_LABEL_BELOW 270
#define LINEWIDTH_RIM 3
#define LINEWIDTH_SEP 1
#define LINEWIDTH_IND 3
#define GRAYSCALE 0
#define ECHO_DATA 1
#define REPRINT_DATA 1
#define PRINT_INFILE_NAME 0
#define PRINT_COLOR_BREWER 1"""
    return template.format(numpops=str(np), numinds=str(ni))
#reads and returns a list from a file
def readList(l):
    """Return the non-blank lines of file ``l``, stripped of surrounding whitespace.

    Raises:
        FileNotFoundError: if ``l`` does not exist.
    Exits with status 1 if the file exists but cannot be read.
    """
    if not os.path.exists(l):
        # The original formatted this message with an undefined name, so a
        # missing file surfaced as a NameError.
        raise FileNotFoundError("File %s not found!"%l)
    try:
        with open(l, 'r') as fh:
            return [line.strip() for line in fh if line.strip()]
    except IOError:
        print("Could not read file ",l)
        sys.exit(1)


#reads assignment probabilities from NewHybs PofZ output file
def readNewHybs(p):
    """Read a NewHybrids PofZ table into an OrderedDict.

    The first non-blank line is the header and is skipped. For every other
    non-blank line the first whitespace-separated field is the key and the
    fields from the third onward (as strings) are the value list.

    Raises:
        FileNotFoundError: if ``p`` does not exist.
    Exits with status 1 if the file exists but cannot be read.
    """
    if not os.path.exists(p):
        raise FileNotFoundError("File %s not found!"%p)

    ret = collections.OrderedDict()
    try:
        with open(p, 'r') as fh:
            seen_header = False
            for line in fh:
                arr = line.split()
                if not arr:
                    continue
                if not seen_header:
                    seen_header = True #skip first non-blank line, which is the header
                    continue
                ret[arr[0]] = list(arr[2:])
    except IOError:
        print("Could not read file ",p)
        sys.exit(1)
    return ret
#Object to parse command-line arguments
class parseArgs():
    """Parse the newhybs2distruct.py command line from ``sys.argv``.

    Attributes set on the instance:
        pops -- path to the population-ID list (required)
        pofz -- path to the NewHybrids PofZ file (required)
        out  -- INDIVQ output file name (default "NH_indivq.txt")
        popq -- POPQ output file name (default "NH_popq.txt")

    Any usage problem prints the help menu and exits through ``display_help``.
    """
    def __init__(self):
        #Define options
        try:
            # -h/--help and -o/--out are documented in the help menu, so they
            # are registered here as well.
            options, remainder = getopt.getopt(sys.argv[1:], 'i:p:o:h',
                ["pops=","input=","out=","help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        self.pops = None
        self.pofz=None
        self.out = "NH_indivq.txt"
        self.popq = "NH_popq.txt"

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            # Every value is a path: trim the ends but keep interior spaces.
            arg = arg_raw.strip()
            opt = opt.lstrip("-")
            if opt in ('p', 'pops'):
                self.pops = arg
            elif opt in ('i', 'input'):
                self.pofz = arg
            elif opt in ('o', 'out'):
                self.out = arg
            else:
                raise AssertionError("Unhandled option %r"%opt)

        #Check manditory options are set
        if not self.pops:
            self.display_help("Error: Missing required PopID file (-p, --pops)")
        if not self.pofz:
            self.display_help("Error: Missing required PofZ file (-i, --input)")


    def display_help(self, message=None):
        """Print ``message`` (if given) followed by the help menu, then exit."""
        if message is not None:
            print ("\n",message)
        print ("\nnewhybs2distruct.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("\nUsage: ", sys.argv[0], "-i aa-PofZ.txt -p popmap \n")
        print ("Description: Creates inputs for DISTRUCT from NewHybrids output.")

        print("""
    Arguments:
        -i,--input  : aa-PofZ.txt output from NewHybrids.
        -p,--pops   : Path to population IDs for NewHybrids samples
                      Format: List of population IDs in the SAME ORDER as NewHybrids output.
                      Note: My phylip2newhybrids.pl script will create this for you.
        -o,--out    : Output file name
        -h,--help   : Displays help menu

""")
        sys.exit()
def _record_interval(output, chrom, first, end, params):
    """Append "chrom:start-end" for the run [first, end) to ``output`` if it is long enough.

    The run is kept only when ``end - first`` exceeds ``params.min_length``.
    Both written coordinates are padded by ``params.padding`` (the start is
    clamped to 1). Returns the number of padded bases added, or 0.
    """
    if end - first <= params.min_length:
        return 0
    padded_start = max(1, first - params.padding)
    padded_end = end + params.padding
    output.append(str(chrom)+":"+str(padded_start)+"-"+str(padded_end)+"\n")
    return padded_end - padded_start


def main():
    """Scan a file of per-base scores and write intervals scoring >= the threshold.

    The input alternates header lines (whitespace-separated fields such as
    ``chrom=``, ``start=``, ``step=``) with one score per line. Consecutive
    scores at or above ``params.min_score`` form an interval; intervals longer
    than ``params.min_length`` are written to ``params.output`` as
    ``chrom:start-end`` lines.
    """
    params = parseArgs()
    output=list()
    total=0
    with open(params.input, "r") as fh:
        coord=1             #coordinate of the score line about to be read
        linecount=0
        chrom=None
        start=1
        step=1
        current_start=None  #first coordinate of the interval being extended
        report=True
        for line in fh:
            line = line.strip()
            if not line:
                continue
            linecount += 1
            if linecount == 1 or "=" in line:
                previous_chrom = chrom
                for field in line.split():
                    parts=field.split("=")
                    if parts[0] == "chrom":
                        if chrom and parts[1] != chrom:
                            print("Found new chrom:",parts[1])
                        chrom=parts[1]
                    elif parts[0] == "start":
                        start = int(parts[1])
                    elif parts[0] == "step":
                        step = int(parts[1])
                if not chrom:
                    sys.exit("No chrom field found in header! Exiting script.")

                # An open interval cannot continue across a header unless the
                # new block carries on at exactly the next coordinate of the
                # same chromosome.
                if current_start is not None and (chrom != previous_chrom or start != coord):
                    print("Started new region:",start, "- jumped from",coord)
                    total += _record_interval(output, previous_chrom, current_start, coord, params)
                    current_start = None

                # Scores after a header are numbered from its start= value.
                # Previously this was applied only when an interval happened
                # to be open.
                coord = start

                if report:
                    print("\nChrom is:",chrom)
                    print("Starting coordinate:",start)
                    print("Step size:",step)
                    print("Minimum phaseCons score:",params.min_score)
                    print("Minimum length to report interval:",params.min_length)
                    if params.padding > 0:
                        print("Padding (+/-) for interval coordinates:",params.padding)
                    print("\n--\n")
                    report=False
                continue

            if float(line) >= params.min_score:
                if current_start is None:
                    current_start=coord
            elif current_start is not None:
                #NOT above threshold: the previous interval ends here
                total += _record_interval(output, chrom, current_start, coord, params)
                current_start = None
            coord = coord + step

        # An interval still open at end of file would otherwise be lost.
        if current_start is not None:
            total += _record_interval(output, chrom, current_start, coord, params)

    print("\n--\nDone! Writing output to:", params.output)

    with open(params.output, "w") as ofh:
        ofh.writelines(output)
    print("\nProcess complete. Total bases included in retained intervals:",total, "\n")
#argument parsing
def parseArgs():
    """Parse the parsePhaseCons.py command line and return the argparse namespace.

    The namespace carries ``min_length`` (int, default 10), ``padding`` (int,
    default 0), ``min_score`` (float, default 0.5), ``input`` (str) and
    ``output`` (str, default "phaseCons_intervals.bed"). Exits with the
    message "Missing inputs" when ``--input`` is not given.
    """
    # Named "description" so the builtin help() is not shadowed.
    description = """
parsePhaseCons.py

Author: Tyler K. Chafin
Contact: tkchafin@uark.edu

Description: Processes phaseCons outputs to generate a set of intervals with phaseCons score above X

Input should be a file of phaseCons scores, with a header including the following information:
chrom= start= step=
"""
    # RawDescriptionHelpFormatter keeps the line breaks above; the default
    # formatter would re-wrap the whole description into one paragraph.
    parser = argparse.ArgumentParser(description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('--min_length', dest='min_length', type=int, default=10,
        help='Minimum interval length to report [default=10]')
    parser.add_argument('--padding', dest='padding', type=int, default=0,
        help='Distance to pad interval coordinated (e.g. output=start-padding:end+padding) [default=0]')
    parser.add_argument('--min_score', dest='min_score', type=float, default=0.5,
        help='Minimum phaseCons score [default=0.5]')
    parser.add_argument('--input', dest='input', type=str,
        help='Input .pp.data file')
    parser.add_argument('--output', dest='output', type=str, default="phaseCons_intervals.bed",
        help='Output .bed file [default=phaseCons_intervals.bed]')

    args = parser.parse_args()

    if not args.input:
        sys.exit("Missing inputs")

    return args
|| die "Error\nCan't open $input: $!\n"; 31 | while ( ){ 32 | chomp; 33 | $line++; 34 | @linedata = split /\s+/, $_; 35 | s/\s+//g; 36 | length($_) or next; 37 | $line == 1 and next; 38 | $taxa++; 39 | $name = $linedata[0]; 40 | push @names, "$name"; 41 | push @data, $linedata[1]; 42 | if ($nchar){ 43 | length($linedata[1]) != $nchar and print "Error: Line beginning with $name has a different sequence length.\n"; 44 | }else{ 45 | $nchar = length($linedata[1]); 46 | } 47 | } 48 | close FILE; 49 | 50 | #Capture to use as identifier 51 | my ($filepath, $dirpath) = fileparse("$input"); 52 | $filepath =~ /(\w+)\.\w/; 53 | my $ID = $1; 54 | 55 | open( OUT, '>', "$dirpath$ID.nex" ) || die "Error\nCan't write to $ID.nex\n"; 56 | print OUT "#NEXUS\n\n"; 57 | print OUT "BEGIN DATA; 58 | DIMENSIONS NTAX=$taxa NCHAR=$nchar; 59 | FORMAT DATATYPE=DNA MISSING=? GAP=- ; 60 | 61 | MATRIX\n"; 62 | 63 | for ( my $i = 0; $i \$input 86 | ); 87 | 88 | $input or die "\n\nError: Input not specified!\n\n$usage\n"; 89 | } 90 | 91 | ######################################################################### 92 | -------------------------------------------------------------------------------- /phylip2structure.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long; 6 | use File::Basename; 7 | 8 | #Initialize scalars 9 | my $input; 10 | my $popmap; 11 | my $output="structure.in"; 12 | my $missing="-9"; 13 | my $suppress=0; 14 | my $extra; 15 | my $locnames=0; 16 | my $popN = 1.0; 17 | my $globalN = 1.0; 18 | my $oneLine =0; 19 | #Call sub parseArgs to parse command-line arguments 20 | parseArgs(); 21 | 22 | #Some warnings... 
#Some warnings...
if ($suppress == 0){
	$output eq "structure.in" and print "Warning: Output name not specified, using default of \"structure.in\"\n";
	$missing eq "-9" and print "Warning: Missing data value not given; using default of \"-9\"\n\n";
}

#Default output goes next to the input file
if ($output eq "structure.in"){
	my ($filename, $dirpath) = fileparse($input);
	$output = "$dirpath$output";   #fileparse returns the directory with its trailing separator
}

#IUPAC code => [first allele, second allele] as written to the structure file
my %iupac = (
	'A' => [1, 1],
	'C' => [2, 2],
	'G' => [3, 3],
	'T' => [4, 4],
	'N' => [$missing, $missing],
	'-' => [$missing, $missing],
	'R' => [1, 3],
	'Y' => [2, 4],
	'S' => [2, 3],
	'W' => [1, 4],
	'K' => [3, 4],
	'M' => [1, 2],
);

#Store population identifiers for each individual (from popmap)
my %popmap;    #sample name => numeric population code
my %enum;      #population label => numeric population code
my %popcodes;  #numeric population code => population label
if ($popmap){
	open(my $popfh, '<', $popmap) || die "Derp: Can't open $popmap: $!";
	my $popcount = 1;
	while (my $entry = <$popfh>){
		my ($sample, $label) = split ' ', $entry;
		next unless defined $label;   #blank or one-column line
		if (!exists $enum{$label}){
			#Record the code the first time a label is seen, so that
			#populations with a single sample also appear in the legend.
			$enum{$label} = $popcount;
			$popcodes{$popcount} = $label;
			$popcount++;
		}
		$popmap{$sample} = $enum{$label};
	}
	close $popfh;
	print "Population codes:\n";
	foreach my $p (sort { $a <=> $b } keys %popcodes){
		print $p, ": ", $popcodes{$p}, "\n";
	}
}

#Begin going through phylip file
my $count = 0;

open(my $outfh, '>', $output) || die "Can't open $output: $!";
open(my $phyfh, '<', $input) || die "Can't open $input: $!";

my $samplecount = 0;
my $snpcount = 0;
my $header_printed = 0;

#data structure to hold it, so we can print in order
my %structure;   #population code => list of [line_1, line_2] (or [line_1]) per sample

while (my $line = <$phyfh>){
	next unless $line =~ /\S/;   #ignore blank lines
	$count++;
	$count == 1 and next;        #first line holds the phylip dimensions

	#Split each line, store sequence name and sequence
	my ($name, $seq) = split ' ', $line;
	if (!defined $seq){
		print "Warning: No sequence found for ", $name, "; skipping.\n";
		next;
	}
	my @seq_array = split //, $seq;

	#Build first line of structure file, containing "locus IDs"
	if ($locnames == 1 && !$header_printed){
		print $outfh "\t\t", join("\t", 1 .. scalar @seq_array), "\n";
		$header_printed = 1;
	}

	#Leading columns: sample name, population code (if popmap), extra columns
	my $pop = 0;   #single bucket when no popmap is given
	my $prefix = "$name\t";
	if ($popmap){
		exists $popmap{$name} or next;   #samples absent from the popmap are dropped
		$pop = $popmap{$name};
		$prefix .= "$pop\t";
	}
	$prefix .= "0\t" x $extra if $extra;

	#Check the sequence length once per sample rather than once per site
	if ($samplecount == 0){
		$snpcount = scalar @seq_array;
	}elsif ($snpcount != scalar @seq_array){
		print "Warning: Sample ", $name, " appears to have a different number of nucleotides. Something is wrong.\n";
	}

	#Start adding allele data
	my (@alleles_1, @alleles_2);
	foreach my $base (@seq_array){
		my $codes = $iupac{uc $base};
		#Unrecognised characters get the user's missing-data code
		my ($first, $second) = $codes ? @{$codes} : ($missing, $missing);
		if ($oneLine){
			push @alleles_1, $first, $second;
		}else{
			push @alleles_1, $first;
			push @alleles_2, $second;
		}
	}

	my $line_1 = $prefix . join("\t", @alleles_1);
	my $line_2 = $prefix . join("\t", @alleles_2);

	push @{ $structure{$pop} }, $oneLine ? [$line_1] : [$line_1, $line_2];
	$samplecount++;
}

foreach my $pop_key (sort { $a <=> $b } keys %structure){
	foreach my $sample (@{ $structure{$pop_key} }){
		print $outfh "$_\n" for @{$sample};
	}
}

close $phyfh;
close $outfh;
print "Done! Outputted ", $samplecount, " samples and ", $snpcount, " SNPs.\n";
exit;
#Parse command-line options into the file-level option variables.
#Prints the usage text and dies when -h is given or when no input is named.
sub parseArgs{
	my $help=0;

	my $usage= "\nUsage: $0 -i /path/to/phylip -p /path/to/popmap -o /path/to/output

The purpose of this script is to take a phylip-formatted file of concatenated SNPs (such as that output by the program pyRAD) and convert it to a structure-formatted file, with two lines for each individual representing the phased allele, as well as a column representing the a priori population/ locality assignment (as provided by the user in the form of a tab-delimited table).

Format of population map:
Sample1	1
Sample2	1
Sample3	2
etc


Required Inputs

	-i, --input	-	Path to the input phylip file
	-p, --popmap	-	Path to the input population ID table
	-o, --output	-	Path to output (including desired filename)
	-n, --popN	-	Percent missing data allowed per SNP per population [default=1.0]
				NOTE: Only applies when popmap provided
	-N, --globalN	-	Percent missing data allowed per SNP globally [default=1.0]
				NOTE: N filters not implemented yet.

Optional inputs
	--oneLine	-	Print phased alleles on one line
	-l, --loc	-	Bool, switch on printing of locus names in first row
	-e, --extra	-	Number of extra columns to insert
	-m, --missing	-	Desired code for missing data [Default is \"-9\"]
	-q, --quiet	-	Quiet mode; suppress internal warnings
	-x		-	Exlude samples that are NOT in popmap

NOTE: Both gaps and N\'s will be coded as missing data.\n\n";

	#-x is documented above; it is accepted here so it is not reported as
	#an unknown option. Its value is not used.
	my $exclude_unmapped;

	#NOTE(review): Getopt::Long ignores case by default, so the short
	#forms -n and -N may resolve to the same option; confirm before the N
	#filters are implemented. The long forms are unambiguous.
	my $result = GetOptions
	(
		'input|i=s'	=> \$input,
		'popmap|p=s'	=> \$popmap,
		'output|o=s'	=> \$output,
		'missing|m=s'	=> \$missing,
		'help|h!'	=> \$help,
		'extra|e=i'	=> \$extra,
		'loc|l!'	=> \$locnames,
		'quiet|q!'	=> \$suppress,
		'popN|n=f'	=> \$popN,      #takes a proportion, as documented
		'globalN|N=f'	=> \$globalN,   #takes a proportion, as documented
		'oneLine!'	=> \$oneLine,
		'x!'		=> \$exclude_unmapped,
	);

	$help == 1 and die "$usage";
	$input || die "Input not specified!\n$usage";
};
def main():
    """Entry point: parse the command line (no further processing is implemented)."""
    params = parseArgs()

#Object to parse command-line arguments
class parseArgs():
    """Parse the phylobarcode.py command line from ``sys.argv``.

    Attributes set on the instance:
        files -- positional arguments left over after option parsing

    Prints the help menu and exits when -h/--help is given, when an unknown
    option is passed, or when no positional arguments are supplied.
    """
    def __init__(self):
        # The file-level imports are only sys and os, so getopt is imported
        # here; without it every call failed with a NameError.
        import getopt

        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'h',
                ["help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")

        #Default values for params
        # NOTE(review): the mandatory-option check below reads self.files,
        # which was never assigned. Taking the positional arguments is the
        # presumed intent -- confirm.
        self.files = list(remainder)

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            opt = opt.lstrip("-")
            if opt == "h" or opt == "help":
                continue
            raise AssertionError("Unhandled option %r"%opt)

        #Check manditory options are set
        if not self.files:
            self.display_help("No files provided.")


    def display_help(self, message=None):
        """Print ``message`` (if given) followed by the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\n\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description: ")
        print("""

""")
        print()
        sys.exit()
#!/bin/bash
# Summarise a set of ecoevolity runs that share a file-name prefix.
# Usage: process_ecoevolity.sh <prefix>
# Expects <prefix>.yaml and <prefix>-state-run-*.log in the current directory
# and writes all summaries into <prefix>-output/.

if [ -z "$1" ]; then
	echo "Usage: $0 <prefix>" >&2
	exit 1
fi
pre=$1

# -p: do not complain when the directory is left over from an earlier run
mkdir -p "$pre-output" || exit 1

# Stop here if the directory is unusable, rather than writing results elsewhere
cd "$pre-output" || exit 1

echo "Sumchains...."
pyco-sumchains -s 100 ../"$pre"-state-run-*.log &> "$pre-sumchains.txt"

echo "Getting optimal number for burnin..."
ch="$pre-sumchains.txt"
samps=$(grep "maximized" "$ch" | sed 's/.*: //g' | sed 's/ samples.*//g')
if [ -z "$samps" ]; then
	# Without a burn-in value the -b options below would be malformed
	echo "Could not find a burnin value in $ch" >&2
	exit 1
fi

echo "Removing $samps samples!"

echo "sumcoevolity..."
yam="$pre.yaml"
p="$pre-"
sumcoevolity -b "$samps" -c "../$yam" -p "$p" -n 1000000 ../"$pre"-state-run*.log

echo "pyco-sumevents...."
pyco-sumevents -p "$p" -f "$pre-sumcoevolity-results-nevents.txt"

echo "pyco-sumtimes..."
pyco-sumtimes -p "$p" -f -b "$samps" -z ../"$pre"-state*.log
def main():
    """Randomly resolve every ambiguity code in a FASTA file and write the result.

    Each multi-allele IUPAC code (R, Y, S, W, K, M, D, H, B, V; case
    insensitive) is replaced by one allele drawn by ``sampleAllele``. The
    sequences are written to ``params.out``, or with ``params.split`` to one
    file per sample named ``<sample>_<basename of params.out>`` in the same
    directory as ``params.out``.
    """
    params = parseArgs()

    seqs = dict() #key=FASTA header; val=sequence

    #Now, get the alignment from the FASTA file
    #note that this works fine with interleaved FASTA
    if params.fasta:
        print('Reading alignment from FASTA...')
        for f in read_fasta(params.fasta):
            seqs[f[0]] = list(f[1])

    #codes that stand for more than one allele
    mults = ["R", "Y", "S", "W", "K", "M", "D", "H", "B", "V"]

    #resolve ambiguities sample by sample, left to right, and rebuild the strings
    for samp in seqs:
        seqs[samp] = "".join(
            sampleAllele(c) if c.upper() in mults else c for c in seqs[samp])

    #write new FASTA outputs
    if params.split:
        # Prefix only the file name: prefixing the whole path would put the
        # sample name in front of the directory part of params.out.
        out_dir, out_name = os.path.split(params.out)
        for samp in seqs:
            fname = os.path.join(out_dir, samp + "_" + out_name)
            write_fasta(fname, {samp: seqs[samp]})
    else:
        write_fasta(params.out, seqs)

#Function to write fasta-formatted sequences
def write_fasta(f, aln):
    """Write ``aln`` (sample name -> sequence) to file ``f`` in FASTA format.

    Each record is a ">name" line followed by the sequence on one line.
    Prints an error and exits with status 1 if the file cannot be written.
    """
    try:
        with open(f, 'w') as fh:
            for samp, seq in aln.items():
                fh.write(">" + str(samp) + "\n" + str(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s"%(f,e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error writing file %s: %s"%(f,e))
        sys.exit(1)
#function to randomly sample an allele given an ambiguity code
def sampleAllele(ch):
    """Return one allele chosen uniformly at random from those ``ch`` can stand for.

    ``ch`` is upper-cased before lookup. Raises KeyError for a character that
    ``get_iupac`` does not know.
    """
    return(random.choice(get_iupac(ch.upper())))

#Function to split character to IUPAC codes, assuing diploidy
def get_iupac(char):
    """Return the list of alleles represented by the IUPAC code ``char``.

    ``char`` must be upper case. Raises KeyError for an unknown character.
    """
    iupac = {
        "A" : ["A"],
        "G" : ["G"],
        "C" : ["C"],
        "T" : ["T"],
        "N" : ["N"],
        "-" : ["-"],
        "R" : ["A","G"],
        "Y" : ["C","T"],
        "S" : ["G","C"],
        "W" : ["A","T"],
        "K" : ["G","T"],
        "M" : ["A","C"],
        "B" : ["C","G","T"],
        "D" : ["A","G","T"],
        "H" : ["A","C","T"],
        "V" : ["A","C","G"]
    }
    return iupac[char]

#function returns all indices
def find(str, opts):
    """Return the indices of every character of ``str`` that occurs in ``opts``.

    ``opts`` may be a single character, a string of characters or any other
    container of characters. (The previous body referred to names that were
    not defined, so every call raised NameError.)
    """
    return [i for i, ltr in enumerate(str) if ltr in opts]

#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Yield ``[name, sequence]`` for each record of FASTA file ``fas``.

    The name is the first whitespace-separated token of the header line
    without ">"; sequence lines are concatenated, so interleaved or wrapped
    FASTA is handled. The last record is yielded only if it has a sequence.

    Raises:
        FileNotFoundError: if ``fas`` does not exist (raised on first iteration).
    Exits with status 1 if the file exists but cannot be read.
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    try:
        with open(fas, 'r') as fh:
            contig = ""
            chunks = []   #pieces of the current sequence, joined when yielded
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">": #Found a header line
                    #If we already loaded a contig, yield that contig and
                    #start loading a new one
                    if contig:
                        yield([contig,"".join(chunks)])
                        chunks = []
                    contig = line.split()[0].replace(">","")
                else:
                    chunks.append(line)
            #yield last sequence, if it has both a header and sequence
            if contig and chunks:
                yield([contig,"".join(chunks)])
    except IOError:
        print("Could not read file ",fas)
        sys.exit(1)
self.display_help("\nExiting because getopt returned non-zero exit status.") 140 | #Default values for params 141 | #Input params 142 | self.fasta=None 143 | self.out=None 144 | self.split=False 145 | 146 | #First pass to see if help menu was called 147 | for o, a in options: 148 | if o in ("-h", "-help", "--help"): 149 | self.display_help("Exiting because help menu was called.") 150 | 151 | #Second pass to set all args. 152 | for opt, arg_raw in options: 153 | arg = arg_raw.replace(" ","") 154 | arg = arg.strip() 155 | opt = opt.replace("-","") 156 | #print(opt,arg) 157 | if opt =="f" or opt=="fasta": 158 | self.fasta = arg 159 | elif opt =="o" or opt=="out": 160 | self.out = arg 161 | elif opt == "s" or opt == "split": 162 | self.split=True 163 | elif opt =="h" or opt == "help": 164 | pass 165 | else: 166 | assert False, "Unhandled option %r"%opt 167 | 168 | #Check manditory options are set 169 | if not self.fasta: 170 | self.display_help("Must provide FASTA file <-f,--fasta>") 171 | 172 | #get output prefix if not set by user 173 | if not self.out: 174 | self.out = os.path.splitext(self.fasta)[0] + '_hap.fasta' 175 | 176 | def display_help(self, message=None): 177 | if message is not None: 178 | print() 179 | print (message) 180 | print ("\npseudoHaploidize.py\n") 181 | print ("Contact:Tyler K. 
Chafin, University of Arkansas,tkchafin@uark.edu") 182 | print ("\nUsage: ", sys.argv[0], "-f [-s] [-f example_hap]\n") 183 | print ("Description: Creates a pseudo-haploid sequence from input fasta, randomly resolving heterozygous sites") 184 | 185 | print(""" 186 | Arguments: 187 | -f,--fasta : Input fasta sequence 188 | -s,--split : [Boolean] Write outputs each to their own output file 189 | -o,--out : Output file name [default=input_hap.fasta or samp_input_hap.fasta if -s] 190 | -h,--help : Displays help menu 191 | """) 192 | print() 193 | sys.exit() 194 | 195 | #Call main function 196 | if __name__ == '__main__': 197 | main() 198 | -------------------------------------------------------------------------------- /pyrad2fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Long; 6 | use File::Path; 7 | 8 | # Declare variables 9 | 10 | my $line; 11 | my @loci; 12 | my $workdir=""; 13 | my @fasta; 14 | my $i = 1; 15 | my $input; 16 | my $batch; 17 | 18 | parseArgs(); 19 | 20 | my $output="loci"; 21 | $batch and $output.=$batch; 22 | 23 | # open file and read it in 24 | open( LOCI, $input ) || die "Can't open $input: $!\n"; 25 | # Make the loci directory to put fasta files from pyrad2fasta subroutine 26 | chdir $workdir; 27 | rmtree $output; 28 | mkdir $output; 29 | chdir $output; 30 | 31 | while ( $line = ){ 32 | 33 | 34 | if( $line =~ /^\/\// ){ 35 | if( $line =~ /\*|\-/ ){ 36 | pyrad2fasta( @loci, $i ); 37 | undef( @loci ); 38 | $i++; 39 | 40 | }else{ 41 | undef( @loci ); 42 | } 43 | }else{ 44 | push @loci, $line; 45 | 46 | } 47 | 48 | 49 | } 50 | close LOCI; 51 | exit; 52 | 53 | ###########################SUBROUTINES################################### 54 | 55 | sub parseArgs{ 56 | #Message to print if mandatory variables not declared 57 | my $usage ="\npyrad2fasta.pl takes the custom .loci output from pyRAD and creates a new FASTA file for each locus 
# Write one locus to "<id>.fasta" in the current directory.
#
# Called as pyrad2fasta( @loci, $i ): every argument except the last is a raw
# line from the .loci file, and the last argument is the locus number used to
# name the output file.  Each line is split on whitespace; the first field
# is printed on one line and the second field on the next.  Records are
# appended, so an existing "<id>.fasta" is extended rather than overwritten.
sub pyrad2fasta{

	my @records = @_;
	my $locus_id = pop @records;

	# Open the output once per locus with a lexical handle instead of
	# re-opening a bareword handle for every record without closing it.
	open( my $out_fh, '>>', "$locus_id.fasta" ) || die "Error. Can't write to $locus_id.fasta: $!\n\n";

	for my $element ( @records ){
		# split at whitespace
		my @fields = split( /\s+/, $element );
		# Print the record in FASTA format
		print $out_fh $fields[0], "\n";
		print $out_fh $fields[1], "\n";
	}

	close $out_fh;
}
#!/usr/bin/python

import sys
import os
import getopt
from itertools import product

def main():
    """Write every possible reverse translation of each input protein.

    Uses the codon table given with -c, or the built-in standard code when
    no table is supplied. Output records are named
    "<header>_translation-<n>", numbered from 1 for each input sequence.
    """
    params = parseArgs()

    if params.code is None:
        codon_table = get_standard_code()
    else:
        codon_table = read_code_file(params.code)

    nucs = dict()
    current=None
    curr_index=1
    for aa in read_fasta(params.input):
        #restart numbering whenever the sequence name changes
        if current is None:
            current = aa[0]
        elif current != aa[0]:
            current = aa[0]
            curr_index = 1
        for trans in get_all_revtrans(aa[1], codon_table):
            header=str(aa[0]) + "_translation-" + str(curr_index)
            nucs[header] = trans
            curr_index +=1

    write_fasta(params.out, nucs)

#generator function
def get_all_revtrans(aa, code):
    """Yield every nucleotide string that translates to the protein ``aa``.

    ``code`` maps an upper-case amino-acid letter to its list of codons.
    The number of results is the product of the codon counts of every
    residue, so it grows exponentially with sequence length.
    Raises KeyError if a residue is missing from ``code``.
    """
    possibilities = [list(code[pos.upper()]) for pos in aa]

    for nuc in product(*possibilities):
        yield("".join(nuc))

def read_code_file(file):
    """Read a whitespace-delimited codon table into {amino acid: [codons]}.

    The first non-blank line is treated as a header and skipped. Every other
    non-blank line must hold an amino-acid letter followed by a codon; both
    are upper-cased.

    Raises FileNotFoundError if ``file`` does not exist and ValueError if a
    data line has fewer than two fields.
    """
    if not os.path.exists(file):
        raise FileNotFoundError("File %s not found!"%file)

    d = dict()
    try:
        with open(file, 'r') as fh:
            num=0
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                num += 1
                if num == 1:
                    continue
                arr = line.split()
                if len(arr) < 2:
                    raise ValueError("Malformed codon table line: %r"%line)
                #Look up and store under the same (upper-case) key; testing
                #the raw key would reset the list for lower-case input.
                d.setdefault(arr[0].upper(), list()).append(arr[1].upper())
    except IOError:
        print("Could not read file ",file)
        sys.exit(1)
    return(d)


def get_standard_code():
    """Return the standard genetic code as {amino acid: [codons]} (64 codons)."""
    d = {
        '*' : ['TAA','TAG','TGA'],
        'A' : ['GCA','GCC','GCG','GCT'],
        'C' : ['TGC','TGT'],
        'D' : ['GAC','GAT'],
        'E' : ['GAA','GAG'],
        'F' : ['TTC','TTT'],
        'G' : ['GGA','GGC','GGG','GGT'],
        'H' : ['CAC','CAT'],
        'I' : ['ATA','ATC','ATT'],
        'K' : ['AAA','AAG'],
        'L' : ['CTA','CTC','CTG','CTT','TTA','TTG'],
        'M' : ['ATG'],
        'N' : ['AAC','AAT'],
        'P' : ['CCA','CCC','CCG','CCT'],
        'Q' : ['CAA','CAG'],
        'R' : ['AGA','AGG','CGA','CGC','CGG','CGT'],
        'S' : ['AGC','AGT','TCA','TCC','TCG','TCT'],
        'T' : ['ACA','ACC','ACG','ACT'],
        'V' : ['GTA','GTC','GTG','GTT'],
        'W' : ['TGG'],
        'Y' : ['TAC','TAT']
    }
    return(d)

def write_fasta(f, aln):
    """Write the mapping ``aln`` (header -> sequence) to path ``f`` as FASTA.

    On any failure (including failure to open the file) a message is printed
    and the program exits with status 1.
    """
    try:
        with open(f, 'w') as fh:
            for samp, seq in aln.items():
                fh.write(">" + str(samp) + "\n" + str(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s"%(f,e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error writing file %s: %s"%(f,e))
        sys.exit(1)

def read_fasta(fas):
    """Yield ``[header, sequence]`` for each record in the FASTA file ``fas``.

    The header is the first whitespace-delimited token of the ">" line with
    ">" removed; sequence lines are concatenated. Blank lines are ignored.
    The final record is only yielded if it has a non-empty sequence.

    Raises FileNotFoundError (on first iteration) if ``fas`` does not exist.
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    try:
        with open(fas, 'r') as fh:
            contig = ""
            seq = ""
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">": #Found a header line
                    #If we already loaded a contig, yield that contig and
                    #start loading a new one
                    if contig:
                        yield([contig,seq])
                        seq = ""
                    contig = line.split()[0].replace(">","")
                else:
                    seq += line
            #yield last sequence, if it has both a header and sequence
            if contig and seq:
                yield([contig,seq])
    except IOError:
        print("Could not read file ",fas)
        sys.exit(1)

#Object to parse command-line arguments
class parseArgs():
    """Parse sys.argv into the attributes ``input``, ``code`` and ``out``.

    Prints the help menu and exits when -h is given, when getopt rejects the
    command line, or when no input file is supplied. ``code`` stays None when
    -c is omitted, which tells main() to use the standard genetic code.
    """
    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hi:c:o:', \
            ["help", "in=", "code=", "out="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.input=None
        self.code=None
        self.out="out.fas"


        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt == "h" or opt == "help":
                continue
            elif opt=="i" or opt=="in":
                self.input = arg
            elif opt=="c" or opt=="code":
                self.code = arg
            elif opt=="out" or opt=="o":
                self.out = arg
            else:
                assert False, "Unhandled option %r"%opt

        #Check manditory options are set
        if not self.input:
            self.display_help("No input provided.")

        #The codon table is optional. Only report the fallback here:
        #display_help() exits, which would make the default unreachable.
        if not self.code:
            print("No code provided. Using default.")



    def display_help(self, message=None):
        """Print ``message`` (if any) followed by the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nrevTransAll.py\n")
        print("Author: Tyler Chafin")
        print ("Contact: tyler.chafin@colorado.edu")
        print ("Description: Gives all possible reverse translations for a amino acid sequence")
        print("""
        -i,--in     : Input file name (FASTA format)
                        format:
                        >my_sequence
                        MFLIMVVFPTTAASVMMVMMV...
        -c,--code   : Tab-delimited codon table [default=standard code]
                      NOTE: the first line is treated as a header and skipped
                        format:
                        F	TTT
                        F	TTC
                        F	TTA
                        F	TTG
                        L	CTT
                        ...
                        ...

        -o,--out    : Output file name (default=out.fas)
                        format:
                        >my_sequence_translation-1
                        ATGATGAT...
                        >my_sequence_translation-2
                        ATGATCAT...
                        ...
                        ...
""")
        print()
        sys.exit()

#Call main function
if __name__ == '__main__':
    main()
split /\s+/, $_; 67 | $popmap{$c[0]} = $c[1]; 68 | #print "$c[0] is from pop# $popmap{$c[0]}\n"; 69 | } 70 | close POPMAP; 71 | 72 | #Begin going through phylip file 73 | my $count = 0; 74 | my @b; 75 | my @seq_array; 76 | 77 | open ( OUTFILE, ">$output") || die "Can't open $output: $!"; 78 | open ( PHY, $input ) || die "Can't open $input: $!"; 79 | 80 | while ( my $line = ){ 81 | $count++; 82 | 83 | chomp $line; 84 | #Split each line, store sequence name and sequence 85 | if ($type =~ /p/i){ 86 | $count == 1 and next; #Test if $count=1, if so then skip to next iteration. 87 | @b = split /\s+/, $line; 88 | @seq_array = split //, $b[1]; 89 | 90 | #Build first line of structure file, containing "locus IDs" 91 | if ($count == 2){ 92 | #my $locus_names= "\t\t"; 93 | #for (my $i=1; $i <= scalar @seq_array; $i++){ 94 | #$locus_names .= "$i\t"; 95 | #} 96 | #chop $locus_names; 97 | #print OUTFILE "$message\n"; 98 | #print OUTFILE "$locus_names\n"; 99 | } 100 | } 101 | 102 | if ($type =~ /f/i){ 103 | 104 | if ($count ==2){ 105 | #my $locus_names= "\t\t"; 106 | 107 | #for (my $i=1; $i <= scalar(@seq_array); $i++){ 108 | #$locus_names .= "$i\t"; 109 | #} 110 | #chop $locus_names; 111 | #print OUTFILE "$message\n"; 112 | #print OUTFILE "$locus_names\n"; 113 | } 114 | 115 | if ($line =~ /^\>(\S+)/){ 116 | $b[0] = $1; 117 | next; 118 | }elsif ($line =~ /[ACGT]+/i){ 119 | @seq_array = split //, $line; 120 | }else{ 121 | 122 | next; 123 | } 124 | 125 | } 126 | 127 | 128 | #Begin building structure lines 129 | my $line_1 = "$b[0]\t";#Put in sequence name 130 | my $line_2 = "$b[0]\t"; 131 | 132 | #Add pop codes 133 | $line_1 .= "$popmap{$b[0]}\t"; 134 | $line_2 .= "$popmap{$b[0]}\t"; 135 | 136 | #Start adding allele data 137 | for( my $i=0; $i <= $#seq_array; $i++ ){ 138 | $line_1 .= "$first_line{ uc $seq_array[$i] }\t"; 139 | $line_2 .= "$second_line{ uc $seq_array[$i] }\t"; 140 | } 141 | 142 | chop $line_1; 143 | chop $line_2; 144 | 145 | print OUTFILE $line_1, "\n"; 146 | print 
OUTFILE $line_2, "\n"; 147 | print "Sample $b[0] done...\n"; 148 | $count++; 149 | } 150 | 151 | close PHY; 152 | close OUTFILE; 153 | 154 | #If SNP check toggled on, rewrite file with only snps 155 | my $loci=0; 156 | if ($snp == 1){ 157 | open (STR, "$output") || die "Cannot open $output for reading: $!\n"; 158 | my $comments = ""; 159 | my @data; 160 | my $num=0; 161 | my $locnames; 162 | foreach (){ 163 | chomp; 164 | #If line is a comment, capture to reprint later 165 | $_ =~ /^#/ and $comments .= $_ and next; 166 | #If line contains variable number of spaces and nothing else, skip 167 | $_ =~ /^ *$/ and next; 168 | #If column has locus names 169 | #$num == 0 and $locnames = $_; 170 | #Capture elements in line 171 | my @line = split("\t"); 172 | 173 | #Build array of arrays with secondary arrays as the columns from structure file 174 | for (my $col = 0; $col < scalar(@line); $col++){ 175 | push(@{$data[$col]}, $line[$col]); 176 | } 177 | $num++; 178 | } 179 | close STR; 180 | 181 | # Check each column for unique 182 | for (my $col = 0; $col < scalar(@data); $col++){ 183 | $col < 2 and next; #Skip sample and popID columns 184 | my %counts; 185 | $counts{$_}++ for @{$data[$col]}; 186 | #print keys(%counts) ."\n"; 187 | my $number = keys %counts; 188 | #If column doesn't contain a SNP, delete it. 189 | unless ($number > 1){ 190 | undef $data[$col]; 191 | next; 192 | } 193 | $loci++; 194 | } 195 | 196 | #Build new structure file containing only the SNPs 197 | my $ind = (scalar(@{$data[0]})/2); 198 | print "\n######################################\n\n"; 199 | print "Number of Individuals: $ind\nNumber of SNPs discovered: $loci\n"; 200 | my $outfile = "N" . $ind ."-" . "L" . $loci . "_" . 
"$output"; 201 | print "\nWriting $outfile...\n\n"; 202 | open (NEWOUT, ">$outfile") || die "Can't open $output for re-writing: $!\n"; 203 | $comments and print NEWOUT "$comments\n"; 204 | #print NEWOUT "$locnames\n"; 205 | for (my $row = 0; $row < scalar(@{$data[1]}); $row++){ 206 | for (my $col = 0; $col < scalar(@data); $col++){ 207 | if (defined $data[$col][$row]){ 208 | print NEWOUT $data[$col][$row] . "\t"; 209 | } 210 | } 211 | print NEWOUT "\n"; 212 | } 213 | close NEWOUT; 214 | } 215 | 216 | 217 | exit; 218 | 219 | ############################SUBROUTINES###################################### 220 | 221 | sub parseArgs{ 222 | 223 | my $help=0; 224 | 225 | my $usage= "\nUsage: $0 -i /path/to/seqfile -p /path/to/popmap -o /path/to/output 226 | 227 | The purpose of this script is to take a phylip or fasta-formatted file of concatenated SNPs (such as that output by the program pyRAD) and convert it to a structure-formatted file, with two lines for each individual representing the phased allele, as well as a column representing the a priori population/ locality assignment (as provided by the user in the form of a tab-delimited table). 228 | 229 | Format of population map: 230 | Sample1 1 231 | Sample2 1 232 | Sample3 2 233 | etc 234 | 235 | 236 | Required Inputs 237 | 238 | -i, --input - Path to the input sequencefile 239 | -p, --popmap - Path to the input population ID table 240 | -o, --output - Path to output (including desired filename) 241 | -t, --type - Input file type (phylip or fasta) 242 | 243 | Optional inputs 244 | 245 | -m, --missing - Desired code for missing data [Default is \"-9\"] 246 | -s, --snp - Check for SNPs; only write snps to str file 247 | -q, --quiet - Quiet mode; suppress internal warnings 248 | 249 | NOTE: Both gaps and N\'s will be coded as missing data. 250 | NOTE: Script assumes a perfect alignment (same length, gaps and N's inserted where needed). 251 | NOTE: SNP checking currently not functional. 
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Std;

# Expands a "prefix popmap" (prefix <tab> population) into a full popmap
# (sample <tab> population) by matching the first -c characters of every
# sample name against the prefixes.

my %opts;
# 'o:' has to be declared here; without it -o is rejected by getopts and the
# output file name documented in the help menu can never be set.
getopts('i:s:c:o:h', \%opts);

if ($opts{h}){
	&help;
	die "Exiting because help menu was called.\n\n"
}

my ($names, $short, $num, $out) = &parseArgs(\%opts);

open (my $fhs, '<', $short) or die "Can't open short\n";

# prefix => population; the first occurrence of a prefix wins
my %hash;
while (my $row = <$fhs>){
	chomp $row;
	my @arr = split "\t", $row;
	if (!exists $hash{$arr[0]}){
		$hash{$arr[0]} = $arr[1];
	}
}
close $fhs;


open (my $fhn, '<', $names) or die "Can't open names\n";
open (my $outfh, '>', $out) or die "Can't open output file for writing\n";

while (my $name = <$fhn>){
	chomp $name;
	# the first $num characters of the sample name are its prefix
	my $n = substr $name, 0, $num;
	if (exists $hash{$n}){
		print $outfh $name, "\t", $hash{$n}, "\n";
	}else{
		print "$name ($n) doesn't match anything", "\n";
	}
}
close $fhn;
# close the output handle (not the file-name string) so buffered data is flushed
close $outfh;

exit;

###############################################################################
################################ Subroutines ##################################
###############################################################################

# subroutine to print help
sub help{

	print "\nLazy script to create full popmap from a prefix popmap\n\n";
	print "Program Options:\n";

	print "\t-i:\tText file with list of sample names\n";
	print "\t-s:\tTab-delimited prefix names\n";
	print "\t-c:\tNumber of characters used for prefix\n";
	print "\t-o:\tOutput file name\n";
	print "\t-h:\tBoolean. Calls help menu.\n\n";

}


# subroutine to parse the command line options
# Takes a reference to the getopts hash; returns (names file, prefix popmap,
# prefix length, output file). Dies if -i, -s or -c is missing; -o defaults
# to "output.popmap".
sub parseArgs{

	my( $params ) =  @_;
	my %opts = %$params;

	my $names = $opts{i} or die "File with sample names not given\n";
	my $short = $opts{s} or die "File with prefix popmap not given\n";
	my $num = $opts{c} or die "Number of characters not given\n";
	my $out = $opts{o} || "output.popmap";


	return( $names, $short, $num, $out);

}
#Subroutine to parse command line arguments
#Sets the package variables $input, $win and $inc from the command line.
#Dies with the usage message when no input file is given, when an option
#cannot be parsed, or when the window/increment is not a positive integer.
sub parseArgs{

	my $usage = "\nUsage: $0 --file=file.fasta --window=100 --increment=50

	mandatory
	   --file      - FASTA file containing sequences; every sequence in the file is processed
	                 (--input is accepted as a synonym)
	optional
	   --window    - window length (default=100)
	   --increment - increment length; how far to shift each window (default=50) \n\n";


	# Accept --input as well as --file: the usage line has always advertised
	# --input while only --file was actually parsed.
	my $result = GetOptions
	(
	'file|input=s'	=> \$input,
	'window=i'	=> \$win,
	'increment=i'	=> \$inc,

	);

	$result or die $usage;
	$input eq "" and die $usage; #Die if mandatory variables undefined

	# A non-positive increment never advances the window start, so the
	# recursive sliding-window routine would never terminate.
	( $win > 0 and $inc > 0 ) or die "\nDerp: --window and --increment must be positive integers\n$usage";

	$win==100 and print "\nWarning: No window length defined- using default of 100\n\n";
	$inc==50 and print "Warning: No increment length defined- using default of 50\n\n";

}
#!/bin/bash

#Tyler K. Chafin
#December 4 2015
#Converts .snps file from pyRAD to phylip format
#Email: tkchafin@uark.edu with issues
#
#Usage: snps2phy.sh <.snps>
#Writes the converted alignment next to the input as <.snps>.phy

#Quote the argument so an empty value or a path containing spaces is handled
if [ -n "$1" ]; then
	file="$1";
else
	printf "\nUsage: %s <.snps>\n\n" "$0";
	exit 1;
fi;

#Stop early instead of letting sed create an empty .phy file
if [ ! -r "$file" ]; then
	printf "\nError: cannot read %s\n\n" "$file" >&2;
	exit 1;
fi;

#Format to phylip: tab between sample name and sequence, then strip the
#spaces and underscores left in the sequence
sed -r 's/^(\w+)\s+([A-Z_-]+)/\1\t\2/g' "$file" | sed 's/ //g' | sed 's/_//g' > "$file.phy"
#Replace header line with the first and last numbers it contains
sed -i -r 's/##([0-9]+).+,.*,([0-9]+).*/\1\t\2/g' "$file.phy";
Chafin 5 | #Last Modified: 6 May 2015 6 | #Added: Capability to split file into user-defined number of parts 7 | # 8 | 9 | 10 | use strict; 11 | use warnings; 12 | use Getopt::Long; 13 | 14 | our $pattern=">"; 15 | our $infile=""; 16 | our $suffix="fasta"; 17 | our $breaks; 18 | parseArgs(); 19 | 20 | 21 | 22 | open ( INFILE, "$infile" ) ; 23 | 24 | my $n=0; 25 | my $matches=0; 26 | if ($breaks){ 27 | #print "Breaks = $breaks\n"; 28 | 29 | 30 | #Count num of pattern matches 31 | while (){ 32 | $_ =~ "$pattern" and $matches++; 33 | } 34 | my $num_lines = int($matches/$breaks); 35 | my $count=0; 36 | 37 | #print "Num_lines = $num_lines\n"; 38 | #print "matches = $matches\n"; 39 | #Foreach part, read data and write to appropriate outfile 40 | 41 | seek(INFILE,0,0); #Reset reading position in fh 42 | 43 | while (){ 44 | 45 | if ($_ =~ "$pattern"){ 46 | $n++; 47 | if ($count == 0){ 48 | $count++; 49 | open (OUTFILE, "> $count.$suffix") || die $!; 50 | 51 | } 52 | if ($count >= $breaks){ 53 | print OUTFILE "$_"; 54 | 55 | }else{ 56 | if($n<=($num_lines*$count)){ 57 | print OUTFILE "$_"; 58 | }else{ 59 | close OUTFILE; 60 | $count++; 61 | open (OUTFILE, "> $count.$suffix") || die $!; 62 | 63 | print OUTFILE "$_"; 64 | } 65 | } 66 | }else{ 67 | print OUTFILE "$_"; 68 | } 69 | } 70 | 71 | #If num breaks not defined: break for each contig 72 | }else{ 73 | while () { 74 | 75 | if ( $_ =~ "$pattern" ){ 76 | $n++; 77 | open ( OUTFILE, "> $n.$suffix" ) || die $!; 78 | print OUTFILE "$_"; 79 | }else{ 80 | print OUTFILE "$_"; 81 | } 82 | } 83 | } 84 | 85 | 86 | close INFILE; 87 | close OUTFILE; 88 | 89 | exit; 90 | ################################################### 91 | 92 | sub parseArgs{ 93 | 94 | my $usage = "\nUsage: $0 --file=whole_genome.fasta --pattern=\> --suffix=fasta 95 | 96 | Author: Tyler K. 
#!/usr/bin/python

import sys
import os
import getopt
import collections

def main():
    """Split a FASTA alignment into one "<pop>.fasta" file per population.

    Sample-to-population assignments come from the popmap. Output files are
    written to the current working directory. Samples listed in the popmap
    but absent from the FASTA are reported and skipped.
    """
    params = parseArgs()

    seqs = dict()

    #parse popmap file for dictionary of sample assignments
    if params.popmap:
        print("Parsing popmap file...")
        pop_assign = parsePopmap(params.popmap)
    else:
        print("ERROR: Popmap file must be provided.")
        sys.exit(1)

    #Now, get the alignment from the FASTA file (as another dict)
    if params.fasta:
        print('Reading alignment from FASTA...')
        for header, seq in read_fasta(params.fasta):
            seqs[header] = seq
    else:
        print("ERROR: FASTA file must be provided.")
        sys.exit(1)

    print("Writing new FASTA files...")
    #For each pop, write a new FASTA
    pops = make2Dpopmap(pop_assign)
    for pop, samples in pops.items():
        fas = str(pop) + ".fasta"
        try:
            with open(fas, 'w') as fh:
                print(fas + "....")
                for sample in samples:
                    if sample in seqs:
                        fh.write(">" + str(sample) + "\n" + seqs[sample] + "\n")
                    else:
                        print("Sample not found in FASTA:",sample)
        except IOError as e:
            print("Could not write file:",e)
            sys.exit(1)
        except Exception as e:
            print("Unexpected error:",e)
            sys.exit(1)

#Makes a dict of lists from a popmap
def make2Dpopmap(p):
    """Invert {sample: pop} into {pop: [samples]}, preserving sample order."""
    ret = dict()
    for samp, pop in p.items():
        ret.setdefault(pop, list()).append(samp)
    return(ret)



#function reads a tab-delimited popmap file and return dictionary of assignments
def parsePopmap(popmap):
    """Return {sample: population} read from the popmap file ``popmap``.

    Each non-blank line is split on whitespace; the first field is the
    sample name and the second its population ID. A sample listed more than
    once keeps its last assignment.

    Raises FileNotFoundError if the file does not exist and ValueError if a
    line has fewer than two fields.
    """
    if not os.path.exists(popmap):
        raise FileNotFoundError("File %s not found!"%popmap)

    ret = dict()
    try:
        with open(popmap, 'r') as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                stuff = line.split()
                if len(stuff) < 2:
                    raise ValueError("Malformed popmap line: %r"%line)
                ret[stuff[0]] = stuff[1]
    except IOError:
        print("Could not read file ",popmap)
        sys.exit(1)
    return(ret)


#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Yield ``[header, sequence]`` for each record in the FASTA file ``fas``.

    The header is the first whitespace-delimited token of the ">" line with
    ">" removed; sequence lines are concatenated. Blank lines are ignored.
    The final record is only yielded if it has a non-empty sequence.

    Raises FileNotFoundError (on first iteration) if ``fas`` does not exist.
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    try:
        with open(fas, 'r') as fh:
            contig = ""
            seq = ""
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">": #Found a header line
                    #If we already loaded a contig, yield that contig and
                    #start loading a new one
                    if contig:
                        yield([contig,seq])
                        seq = ""
                    contig = line.split()[0].replace(">","")
                else:
                    seq += line
            #yield last sequence, if it has both a header and sequence
            if contig and seq:
                yield([contig,seq])
    except IOError:
        print("Could not read file ",fas)
        sys.exit(1)

#Object to parse command-line arguments
class parseArgs():
    """Parse sys.argv into the attributes ``popmap`` and ``fasta``.

    Prints the help menu and exits when -h is given, when getopt rejects the
    command line, or when either mandatory file is missing.
    """
    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'f:p:h', \
            ["popmap=","fasta=","help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.popmap=None
        self.fasta=None

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt in ('p', 'popmap'):
                self.popmap = arg
            elif opt in ('h', 'help'):
                pass
            elif opt in ('f', 'fasta'):
                self.fasta = arg
            else:
                assert False, "Unhandled option %r"%opt

        #Check manditory options are set
        if not self.popmap:
            self.display_help("Error: Need popmap")
        if not self.fasta:
            self.display_help("Error: Need fasta")


    def display_help(self, message=None):
        """Print ``message`` (if any) followed by the help menu, then exit."""
        if message is not None:
            print ("\n",message)
        print ("\nsplitFastaPops.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("\nUsage: ", sys.argv[0], "-f <fasta> -p <popmap>\n")
        print ("Description: Splits a FASTA file into 1 file per population (pops from tab-delited popmap)")

        print("""
    Arguments:
        -f,--fasta  : FASTA file
        -p,--popmap : Tab-delimited population map (Sample \\t PopID)
        -h,--help   : Displays help menu

""")
        sys.exit()

#Call main function
if __name__ == '__main__':
    main()
Chafin 4 | # Contact: tkchafin@uark.edu 5 | 6 | use strict; 7 | use warnings; 8 | use Getopt::Std; 9 | 10 | #Die if no arguments given 11 | if( scalar( @ARGV ) == 0 ){ 12 | &help; 13 | die "No options given.\n\n"; 14 | } 15 | 16 | #Parse arguments 17 | my %opts; 18 | getopts( 'i:o:hm:n:x:', \%opts ); 19 | 20 | # kill if help option is true 21 | if( $opts{h} ){ 22 | &help; 23 | die "Printing help menu.\n\n"; 24 | } 25 | 26 | #get options 27 | my ($in, $out, $min, $max, $cap) = &parseArgs(\%opts); 28 | 29 | open (FASTA, "$in") || die "Could not open file $in: $!\n"; 30 | print "\nReading input file <$in>...\n"; 31 | open (OUT, ">$out") || die "Could not open file for output ($out) : $!\n"; 32 | print "Writing output to <$out>...\n"; 33 | my $base; 34 | my $count; 35 | my $num = 0; 36 | while (){ 37 | chomp; 38 | if ($_ =~ /^\>/){ #If header line 39 | my @line = split(/-/, $_); 40 | $line[0] =~ s/\>//g; 41 | $base = $line[0]; 42 | $count = $line[1]; 43 | $num == 1 and die "Error: Header line \"$_\" immediately follows another header line.\n"; 44 | $num = 1; 45 | next; 46 | }else{ 47 | $num == 2 and die "Error: Sequence line \"$_\" immediately follows another sequence line.\n"; 48 | $num = 2; 49 | 50 | if ($count < $min && $min != 0){ 51 | print "Skipping <$base>: Depth (<$count>) is below minimum <$min>!\n"; 52 | undef($count); 53 | undef($base); 54 | next; 55 | }elsif ($count > $cap && $cap != 0){ 56 | print "Skipping <$base>: Depth (<$count>) is above maximum <$cap>!\n"; 57 | undef($count); 58 | undef($base); 59 | next; 60 | }elsif ($max != 0){ 61 | $count > $max and $count = $max; 62 | } 63 | for (my $i=1; $i <= $count; $i++){ 64 | print OUT ">" . $base . "-" . $i . "\n"; 65 | print OUT $_ . 
"\n"; 66 | } 67 | undef($count); 68 | undef($base); 69 | } 70 | } 71 | print "Done!\n\n"; 72 | close FASTA; 73 | close OUT; 74 | exit; 75 | 76 | ########################### SUBROUTINES ############################### 77 | 78 | sub help{ 79 | 80 | print "\nThis perl script is written by Tyler K. Chafin - tkchafin\@uark.edu\n"; 81 | print "\nInput should be a FASTA file of collapsed read clusters where -# at the end of the FASTA header for each sequence indicates the stack depth for the cluster.\n"; 82 | print "\nNOTE: Stack depth counts start at 1.\n"; 83 | print "\nNOTE: Header cannot contain \"-\" except before the read depth, e.g.:\n"; 84 | print "\t>Name-3 85 | AGTAGTAGTAG.... 86 | Where \"Name\" is the sequence name and \"3\" is the depth.\n\n"; 87 | print "Options:\n"; 88 | print "\t-i : Path to input file (fasta)\n"; 89 | print "\t-m : Maximum stack depth to print [default: not set]\n"; 90 | print "\t-n : Skip clusters with less than \"n\" depth [default: not set]\n"; 91 | print "\t-x : Skip clusters with more than \"x\" depth [default: not set]\n"; 92 | print "\t-o : Output file name. 
def main():
    """Split a PhyloNetworks CF table and tree around one clade.

    The clade is given as a file of sample names. The clade member with the
    highest mean ``ngenes`` is kept as a placeholder for the whole clade in
    the "outgroup" outputs; the "ingroup" outputs keep only clade members.
    Writes ingroup_tree.tre, ingroup_cfs.csv, outgroup_tree.tre and
    outgroup_cfs.csv into the current working directory.
    """
    params = parseArgs()

    #read data
    with open(params.samples) as f:
        # Skip blank lines so an empty string is never treated as a sample
        clade = [ln.strip() for ln in f if ln.strip()]
    cf = pd.read_csv(params.cf, header=0)
    tree = tt.tree(params.tree)

    #calculate mean ngenes for each sample
    cov = dict()
    for sample in clade:
        # axis must be passed by keyword on pandas >= 2.0
        rows = cf.eq(sample).any(axis=1)
        cov[sample] = np.mean(cf[rows]["ngenes"])

    #find which sample has best representation; will be used as placeholder for whole list of samples
    placeholder = max(cov, key=cov.get)

    #make subset datasets
    removes = [s for s in clade if s != placeholder]
    left_cf = subset_df_blacklist(cf, removes) #keeps placeholder
    clade_set = set(clade)
    removes2 = [s for s in tree.get_tip_labels() if s not in clade_set]
    right_cf = subset_df_blacklist(cf, removes2)
    left_tree = tree_remove_blacklist(tree, removes)
    right_tree = tree_remove_whitelist(tree, clade)

    #write outputs
    #ingroup_tree
    right_tree.write("ingroup_tree.tre", tree_format=5)
    #ingroup_cfs
    right_cf.to_csv("ingroup_cfs.csv", index=False, index_label=False)
    #outgroup_tree
    left_tree.write("outgroup_tree.tre", tree_format=5)
    #outgroup_cfs
    left_cf.to_csv("outgroup_cfs.csv", index=False, index_label=False)


def tree_remove_whitelist(tree, goodbois):
    """Return ``tree`` with every tip NOT named in ``goodbois`` dropped."""
    keep = set(goodbois)
    rem = [a for a in tree.get_tip_labels() if a not in keep]
    return tree.drop_tips(names=rem)


def tree_remove_blacklist(tree, badbois):
    """Return ``tree`` with the tips named in ``badbois`` dropped.

    Names in ``badbois`` that are not tips of ``tree`` are ignored.
    """
    all_tips = set(tree.get_tip_labels())
    rem = [r for r in badbois if r in all_tips]
    return tree.drop_tips(names=rem)


def subset_df_blacklist(df, badbois):
    """Return a copy of ``df`` without rows that mention any blacklisted value.

    A row is removed when any of its cells equals any element of ``badbois``.
    ``df`` itself is not modified. An empty ``badbois`` returns a full copy.
    """
    # One vectorised pass instead of re-filtering the frame once per name
    hits = df.isin(list(badbois)).any(axis=1)
    return df[~hits].copy()


#Object to parse command-line arguments
class parseArgs():
    """Parse ``sys.argv`` for splitTableCF.py.

    Attributes:
        cf: path to the CF table (``-c``/``--cf``).
        samples: path to the file listing clade samples (``-s``/``--samples``).
        tree: path to the tree file (``-t``/``--tree``).
        write: fixed to "both"; not read anywhere in this script.

    Prints the help menu and exits when an option is unknown, when help is
    requested, or when any of the three input files is missing.
    """

    def __init__(self):
        #Define options
        # -n/--name used to be accepted here but had no handler, so using it
        # tripped the "Unhandled option" assertion; it is now rejected by
        # getopt with the normal help output.
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hc:s:t:',
                ["help", "cf=", "samples=", "tree="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.cf = None
        self.samples = None
        self.write = "both"
        self.tree = None

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            # Only trim the ends: removing inner spaces would corrupt paths
            arg = arg_raw.strip()
            opt = opt.lstrip("-")
            if opt in ("h", "help"):
                continue
            elif opt in ("cf", "c"):
                self.cf = arg
            elif opt in ("samples", "s"):
                self.samples = arg
            elif opt in ("tree", "t"):
                self.tree = arg
            else:
                self.display_help("Unhandled option %r"%opt)

        #Check mandatory options are set
        if not self.tree or not self.samples or not self.cf:
            self.display_help("No files provided.")

    def display_help(self, message=None):
        """Print an optional message and the usage text, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nsplitTableCF.py\n")
        print("Author: Tyler K Chafin, University of Colorado")
        print ("Contact: tyler.chafin@colorado.edu")
        print ("Description: Subsets a TableCF file (PhyloNetworks) given a list of samples comprising a monophyletic clade -- right now only designed for 1 split at a time")
        print("""
		-c,--cf		: CF table
		-s,--samples	: File with list of samples
		-t,--tree	: Tree file
		-h,--help	: Help menu
		""")
        print()
        sys.exit()


#Call main function
if __name__ == '__main__':
    main()
$catalog!\n\n"; 25 | 26 | while (){ 27 | @info = split /\t/, $_; 28 | $ID = $info[2]; 29 | 30 | #If locus ID is already in hash, then skip it 31 | if (exists $whitelist{$ID}){ 32 | next; 33 | }else{ 34 | $whitelist{$ID}=""; 35 | } 36 | } 37 | 38 | close CAT; 39 | 40 | #Parse STACKS output fasta file into loci, query each locus against whitelist 41 | open ( IN, $input ) || die "\nDerp: Can't open $input!\n\n"; 42 | 43 | rmtree $output; 44 | mkdir $output; 45 | chdir $output; 46 | 47 | while (){ 48 | 49 | $_ =~ m/CLocus_(\d+)_Sample_(\d+)/; 50 | 51 | $locus = $1; 52 | $name=$2; 53 | 54 | if (exists $whitelist{$locus}){ 55 | open ( OUT, ">>$locus.fasta"); 56 | if ( $_ =~ />/ ){ 57 | print OUT ">$name\n"; 58 | }else{ 59 | print OUT $_; 60 | } 61 | } 62 | } 63 | 64 | 65 | 66 | 67 | 68 | ##########################################SUBROUTINES######################################### 69 | 70 | sub parseArgs{ 71 | 72 | my $usage="\nstacks2fasta.pl takes the fasta output from STACKS and outputs a new fasta file for each locus containing variation, which are identified by querying the cstacks catalog 73 | 74 | Usage: $0 --i /path/to/infile --w /path/to/workdir --c=/path/to/catalog 75 | 76 | Mandatory Variables 77 | -i, --input - path to input file (absolute path) 78 | -w, --workdir - path to working directory (new fasta files will be placed within /workdir/loci 79 | -c, --catalog - path to STACKS catalog 80 | 81 | Optional 82 | -b, --batch - Provide a batch number to append to output dir name [default=1]\n\n"; 83 | 84 | my $result = GetOptions 85 | ( 86 | 'input|i=s' => \$input, 87 | 'workdir|w=s' => \$workdir, 88 | 'catalog|c=s' => \$catalog, 89 | 'batch|b=i' => \$batch, 90 | ); 91 | 92 | if ( $input eq "" ){ die "\nDerp: Input not specified!\n\n$usage"}; 93 | 94 | } 95 | 96 | ############################################################################################ 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- 
/subsetPhy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import getopt 4 | import sys 5 | import os 6 | 7 | #Object to parse command-line arguments 8 | class parseArgs(): 9 | def __init__(self): 10 | #Define options 11 | try: 12 | options, remainder = getopt.getopt(sys.argv[1:], 'x:p:l:o:h', \ 13 | ["xml=","phy=","list=","out=","help"]) 14 | except getopt.GetoptError as err: 15 | print(err) 16 | self.display_help("\nExiting because getopt returned non-zero exit status.") 17 | #Default values for params 18 | #Input params 19 | self.xml=None 20 | self.phy=None 21 | self.tax=None 22 | self.out="out.phy" 23 | 24 | #First pass to see if help menu was called 25 | for o, a in options: 26 | if o in ("-h", "-help", "--help"): 27 | self.display_help("Exiting because help menu was called.") 28 | 29 | #Second pass to set all args. 30 | for opt, arg_raw in options: 31 | arg = arg_raw.replace(" ","") 32 | arg = arg.strip() 33 | opt = opt.replace("-","") 34 | #print(opt,arg) 35 | if opt in ('x', 'xml'): 36 | self.xml = arg 37 | elif opt in ('h', 'help'): 38 | pass 39 | elif opt in ('p','phy'): 40 | self.phy = arg 41 | elif opt in ('l','list'): 42 | self.tax = arg 43 | elif opt in ('o','out'): 44 | self.out = arg 45 | else: 46 | assert False, "Unhandled option %r"%opt 47 | 48 | #Check manditory options are set 49 | self.phy or self.display_help("INPUT ERROR: No PHYLIP provided") 50 | self.tax or self.display_help("INPUT ERROR: No TAXON LIST provided") 51 | 52 | 53 | def display_help(self, message=None): 54 | if message is not None: 55 | print() 56 | print (message) 57 | print ("\nsubsetPhy.py\n") 58 | print ("Contact:\n\n\tTyler K. 
Chafin\n\tUniversity of Arkansas\n\ttkchafin@uark.edu\n") 59 | print ("\nUsage:\n\t", sys.argv[0], "-p -l \$genome, 55 | 'gff=s' => \$gff, 56 | 57 | ); 58 | 59 | $genome ne "" || die $usage; #Die if mandatory variables undefined 60 | $gff ne "" || die $usage; 61 | 62 | } 63 | 64 | 65 | #Subroutine to parse gff and genome for particular type of element 66 | 67 | sub summaryGFF{ 68 | 69 | 70 | undef @line; 71 | undef $dna; 72 | 73 | 74 | open ( GENOME, "$genome") || die "Derp: Can't open file $genome!"; 75 | 76 | while (){ 77 | $_ ne /^>/ and $dna .= $_; 78 | }; 79 | 80 | close GENOME; 81 | 82 | 83 | open ( GFF, "$gff" ) || die "Derp: Can't open file $gff!"; 84 | 85 | foreach ( ){ 86 | @line = split /\t/, $_; 87 | #print "$line[2]\n"; 88 | $GC=0; 89 | $subseq = substr ( $dna, $line[3]-1, $line[5] ); 90 | $add =()=$subseq =~ /G/gi; 91 | $GC += $add; 92 | $add =()=$subseq =~ /C/gi; 93 | $GC += $add; 94 | #print "$GC\n"; 95 | 96 | #If element is already in hash, then alter values in the arrays by following ref in hash value... 
def main():
    """Convert terminal gap characters of every sequence to N's.

    Reads either a FASTA or a PHYLIP alignment (whichever was supplied on the
    command line), masks leading and trailing runs of ``-`` with ``N``, and
    writes the result in the same format to ``params.out``. Internal gaps are
    left untouched.
    """
    params = parseArgs()

    #read sequences in
    if params.fasta:
        print('Reading alignment from FASTA...')
        records = read_fasta(params.fasta)
    elif params.phylip:
        print('Reading alignment from PHYLIP...')
        records = read_phylip(params.phylip)
    else:
        records = []

    seqs = dict() #key=sample name; val=masked sequence
    for name, seq in records:
        seqs[name] = _mask_terminal_gaps(seq)

    if params.phylip:
        print("Writing new PHYLIP file",params.out)
        write_phylip(params.out, seqs)
    elif params.fasta:
        print("Writing new FASTA file",params.out)
        write_fasta(params.out, seqs)


def _mask_terminal_gaps(seq):
    """Return ``seq`` with its leading and trailing '-' runs replaced by 'N'."""
    no_left = seq.lstrip("-")
    core = no_left.rstrip("-")
    left = len(seq) - len(no_left)
    right = len(no_left) - len(core)
    return "N" * left + core + "N" * right


#Print dict to phylip file
def write_phylip(p, aln):
    """Write ``aln`` ({name: sequence}) to ``p`` as tab-separated PHYLIP.

    Sequences may be strings or iterables of characters. Exits with status 1
    if the file cannot be written.
    """
    try:
        with open(p, 'w') as fh:
            fh.write(getPhylipHeader(aln) + "\n")
            for sample, seq in aln.items():
                fh.write(str(sample) + "\t" + "".join(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s"%(p,e))
        sys.exit(1)


#Function to write fasta-formatted sequences
def write_fasta(f, aln):
    """Write ``aln`` ({name: sequence}) to ``f`` as unwrapped FASTA.

    Exits with status 1 if the file cannot be written.
    """
    try:
        with open(f, 'w') as fh:
            for samp, seq in aln.items():
                fh.write(">" + str(samp) + "\n" + str(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s"%(f,e))
        sys.exit(1)


#Returns header for Phylip file from a dictionary of samples w/ data
def getPhylipHeader(d):
    """Return the PHYLIP header line ``"<n samples> <n sites>"`` for ``d``.

    The site count is the length of the first sequence. An empty alignment
    yields ``"0 0"``. Warnings are printed (once each) for unequal sequence
    lengths, zero sites, or zero samples.
    """
    numSamp = len(d)
    lengths = [len(seq) for seq in d.values()]
    numLoci = lengths[0] if lengths else 0
    if len(set(lengths)) > 1:
        print("getPhylipHeader: Warning: Sequences of unequal length.")
    if numLoci == 0:
        print("getPhylipHeader: Warning: No loci in dictionary.")
    if numSamp == 0:
        print("getPhylipHeader: Warning: No samples in dictionary.")
    return str(numSamp) + " " + str(numLoci)


#Read samples as FASTA. Generator function
def read_fasta(fas):
    """Yield ``[name, sequence]`` for every record in a FASTA file.

    The name is the first whitespace-delimited token of the header with ``>``
    removed; wrapped sequence lines are concatenated. Text that appears before
    the first header is discarded rather than leaking into the first record.
    A final header with no sequence lines is not yielded.

    Raises:
        FileNotFoundError: if ``fas`` does not exist (raised on first iteration).
    """
    if not os.path.exists(fas):
        raise FileNotFoundError("File %s not found!"%fas)

    try:
        with open(fas, 'r') as fh:
            contig = ""
            chunks = []
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line[0] == ">": #Found a header line
                    #If we already loaded a contig, yield it before starting the next
                    if contig:
                        yield [contig, "".join(chunks)]
                    contig = line.split()[0].replace(">", "")
                    # Always reset, so stray text before a header is dropped
                    chunks = []
                else:
                    chunks.append(line)
            #yield last sequence, if it has both a header and sequence
            if contig and chunks:
                yield [contig, "".join(chunks)]
    except IOError:
        print("Could not read file ", fas)
        sys.exit(1)


#Read samples as PHYLIP. Generator function
def read_phylip(phy):
    """Yield ``(name, sequence)`` for every data row of a sequential PHYLIP file.

    The first non-blank line (the header) is skipped. Each remaining row is
    split on whitespace; the first two fields are the name and the sequence.
    Rows with only one field are reported and skipped.

    Raises:
        FileNotFoundError: if ``phy`` does not exist (raised on first iteration).
    """
    if not os.path.exists(phy):
        raise FileNotFoundError("File %s not found!"%phy)

    try:
        with open(phy, 'r') as fh:
            seen_header = False
            for line in fh:
                arr = line.split()
                if not arr:
                    continue
                if not seen_header:
                    seen_header = True
                    continue
                if len(arr) < 2:
                    print("Skipping PHYLIP row without sequence data:", arr[0])
                    continue
                yield (arr[0], arr[1])
    except IOError:
        print("Could not read file ", phy)
        sys.exit(1)


#Object to parse command-line arguments
class parseArgs():
    """Parse ``sys.argv`` for terminalGapRemover.py.

    Attributes:
        fasta: input FASTA path (``-f``/``--fasta``), or None.
        phylip: input PHYLIP path (``-p``/``--phy``), or None.
        out: output path derived from the input: its extension is replaced by
            ``.gapfix.fasta`` or ``.gapfix.phylip``.

    Exactly one of ``fasta``/``phylip`` must be given; otherwise the help menu
    is printed and the program exits.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'f:p:h',
                ["help", "fasta=", "phy="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.fasta = None
        self.phylip = None
        self.out = None

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            # Only trim the ends: removing inner spaces would corrupt paths
            arg = arg_raw.strip()
            opt = opt.lstrip("-")
            if opt in ("f", "fasta"):
                self.fasta = arg
            elif opt in ("p", "phy"):
                self.phylip = arg
            elif opt in ("h", "help"):
                pass
            else:
                self.display_help("Unhandled option %r"%opt)

        #Check mandatory options are set
        if not self.fasta and not self.phylip:
            self.display_help("Must provide either a FASTA or PHYLIP file.")

        if self.fasta and self.phylip:
            self.display_help("Provide only one of FASTA or PHYLIP, not both.")

        #derive output name from the input name
        if self.fasta:
            self.out = os.path.splitext(self.fasta)[0] + '.gapfix.fasta'
        elif self.phylip:
            self.out = os.path.splitext(self.phylip)[0] + '.gapfix.phylip'

    def display_help(self, message=None):
        """Print an optional message and the usage text, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nterminalGapRemover.py\n")
        print ("Contact:Tyler K. Chafin")
        print ("\nUsage: ", sys.argv[0], "[-f <.fasta>] [-p <.phy>]\n")
        print ("Description: Simple script to convert terminal gap characters to N's. Accepts FASTA or PHYLIP")

        print("""
	Arguments:
		-f,--fasta	: Input FASTA to be converted
		-p,--phy	: Input PHYLIP to be converted
		-h,--help	: Displays help menu
""")
        print()
        sys.exit()


#Call main function
if __name__ == '__main__':
    main()
def main():
    """Convert a ``Sample <tab> Trait,Trait,...`` table to 0/1 PHYLIP.

    Optionally expands trait names through a grouping map (``params.map``),
    then writes one presence/absence row per sample to ``params.out``. Samples
    with no traits are reported and left out. Trait columns are written in
    sorted order so the output is reproducible between runs.
    """
    params = parseArgs()

    groups = _read_trait_groups(params.map) if params.map else dict()
    samples = _read_sample_traits(params.tab, groups)

    # Sets iterate in hash order, which changes between interpreter runs for
    # strings; sort once so header and rows share a stable column order.
    traits = sorted(set().union(*samples.values()))

    rows = []
    reported = False
    for samp, present in samples.items():
        #if no traits, report and skip
        if not present:
            if not reported:
                reported = True
                print("Samples were found without any trait data. Skipping samples:")
            print(samp)
            continue
        bits = "".join("1" if t in present else "0" for t in traits)
        rows.append(str(samp) + "\t" + bits + "\n")

    #write lagrange phylip file
    with open(params.out, "w") as ofh:
        header = str(len(rows)) + "\t" + str(len(traits)) + "\t(" + " ".join(traits) + ")\n"
        print("Traits output in this order:")
        print(", ".join(traits))
        ofh.write(header)
        ofh.writelines(rows)


def _read_trait_groups(path):
    """Read the grouping map: ``{name: [replacement, ...]}`` from 2-column lines."""
    groups = dict()
    with open(path, "r") as m:
        for line in m:
            line = line.strip()
            if not line:
                continue
            stuff = line.split()
            if len(stuff) > 2:
                print("ERROR: Too many elements --",line)
            elif len(stuff) < 2:
                # Previously an IndexError; report and keep going instead
                print("ERROR: Too few elements --",line)
            else:
                groups.setdefault(stuff[0], []).append(stuff[1])
    return groups


def _read_sample_traits(path, groups):
    """Read the trait table into ``{sample: set(traits)}``, applying ``groups``."""
    samples = dict()
    with open(path, "r") as t:
        for line in t:
            line = line.strip()
            if not line:
                continue
            stuff = line.split("\t")
            if len(stuff) > 2:
                print("ERROR: Too many elements --",line)
                continue
            found = samples[stuff[0]] = set()
            if len(stuff) < 2:
                continue
            for name in stuff[1].split(","):
                name = name.strip()
                # A trailing or doubled comma must not create an unnamed trait
                if not name:
                    continue
                if name in groups:
                    found.update(groups[name])
                else:
                    found.add(name)
    return samples
def main():
    """Write random sample subsets of a PHYLIP alignment and matching trees.

    For each of ``params.reps`` replicates, draws ``params.samples`` samples
    without replacement (or ``params.freq`` times the alignment size when no
    count was given), prunes every other tip from the tree, and writes
    ``<out>_<rep>.phylip`` and ``<out>_<rep>.tre``.
    """
    params = parseArgs()

    seqs = dict()
    for name, seq in read_phylip(params.phylip):
        seqs[name] = seq

    tree = tt.tree(params.tree, tree_format=0)

    if not params.samples:
        params.samples = int(params.freq*len(seqs))

    # int() is a no-op for ints; it guards against the option parser handing
    # these over as strings (the parser is not visible from here -- confirm).
    n_keep = int(params.samples)
    n_reps = int(params.reps)
    names = list(seqs)
    if n_keep < 1 or n_keep > len(names):
        print("ERROR: Cannot sample", n_keep, "of", len(names), "sequences")
        sys.exit(1)

    print("Generating",n_reps,"random subsets of",n_keep,"samples each")

    for r in range(n_reps):
        print("starting replicate",str(r))
        prefix = params.out + "_" + str(r)
        print("subsetting alignment")
        # random.sample needs a sequence; a dict view raises TypeError on 3.11+
        chosen = random.sample(names, n_keep)
        keeps = {name: seqs[name] for name in chosen}
        bad_bois = [k for k in names if k not in keeps]
        print("subsetting tree")
        stree = tree.drop_tips(names=bad_bois)
        print("writing subset files")
        write_phylip(prefix+".phylip",keeps)
        stree.write(prefix+".tre", tree_format=0)


#Print dict to phylip file
def write_phylip(p, aln):
    """Write ``aln`` ({name: sequence}) to ``p`` as tab-separated PHYLIP.

    Sequences may be strings or iterables of characters. Exits with status 1
    if the file cannot be written.
    """
    try:
        with open(p, 'w') as fh:
            fh.write(getPhylipHeader(aln) + "\n")
            for sample, seq in aln.items():
                fh.write(str(sample) + "\t" + "".join(seq) + "\n")
    except IOError as e:
        print("Could not write file %s: %s"%(p,e))
        sys.exit(1)


#Returns header for Phylip file from a dictionary of samples w/ data
def getPhylipHeader(d):
    """Return the PHYLIP header line ``"<n samples> <n sites>"`` for ``d``.

    The site count is the length of the first sequence. An empty alignment
    yields ``"0 0"``. Warnings are printed (once each) for unequal sequence
    lengths, zero sites, or zero samples.
    """
    numSamp = len(d)
    lengths = [len(seq) for seq in d.values()]
    numLoci = lengths[0] if lengths else 0
    if len(set(lengths)) > 1:
        print("getPhylipHeader: Warning: Sequences of unequal length.")
    if numLoci == 0:
        print("getPhylipHeader: Warning: No loci in dictionary.")
    if numSamp == 0:
        print("getPhylipHeader: Warning: No samples in dictionary.")
    return str(numSamp) + " " + str(numLoci)
#Read samples as PHYLIP. Generator function
def read_phylip(phy):
    """Yield (sample, sequence) pairs from a PHYLIP file.

    Blank lines are ignored and the first non-blank line (the header) is
    skipped. Each record is split on whitespace; the first token is the
    sample name and the second the sequence.

    Args:
        phy: Path to the PHYLIP file.

    Raises:
        FileNotFoundError: If phy does not exist (raised on first iteration).

    A read error is reported and ends the program with exit status 1.
    """
    if not os.path.exists(phy):
        raise FileNotFoundError("File %s not found!"%phy)

    try:
        with open(phy, 'r') as fh:
            seen_header = False
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if not seen_header:
                    seen_header = True
                    continue
                arr = line.split()
                #A name with no sequence would otherwise raise IndexError
                if len(arr) < 2:
                    print("read_phylip: Warning: Skipping malformed line:", line)
                    continue
                yield(arr[0], arr[1])
    except IOError:
        print("Could not read file ",phy)
        sys.exit(1)


#Object to parse command-line arguments
class parseArgs():
    """Command-line options for treeAlignment_subsetter.py.

    Attributes set from sys.argv:
        tree: Path to the input Newick file (-t/--tree).
        phylip: Path to the input PHYLIP file (-p/--phylip).
        reps: Number of replicates (-r/--reps, default 10).
        freq: Sampling frequency (-f/--freq, default 0.1).
        samples: Number of samples to keep (-s/--samples, default None).
        method: Value of -m/--method (default "random").
        out: Output prefix (-o/--out, default "subset").

    Construction prints the help menu and exits when -h is given, when
    getopt rejects the arguments, or when either input file is missing.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hs:f:r:t:p:o:m:', \
            ["help", "reps=","tree=","phylip=","out=", "method=", "samples=", "freq="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.tree=None
        self.reps=10
        self.freq=0.1
        self.samples=None
        self.phylip=None
        self.method="random"
        self.out="subset"

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt == "h" or opt == "help":
                continue
            elif opt=="tree" or opt=="t":
                self.tree=arg
            elif opt=="phylip" or opt=="p":
                self.phylip=arg
            elif opt=="method" or opt=="m":
                self.method=arg
            elif opt=="reps" or opt=="r":
                self.reps=int(arg)
            elif opt=="freq" or opt=="f":
                self.freq=float(arg)
            elif opt=="samples" or opt=="s":
                self.samples=int(arg)
            elif opt=="out" or opt=="o":
                self.out=arg
            else:
                assert False, "Unhandled option %r"%opt

        #Check mandatory options are set: BOTH inputs are required, so
        #a single missing one must already trigger the help menu
        if not self.phylip or not self.tree:
            self.display_help("Must provide input tree (newick) and alignment (phylip) files.")

    def display_help(self, message=None):
        """Print the optional message and the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\ntreeAlignment_subsetter.py\n")
        print ("Description: Generate random subsets of an input phylogenetic dataset (tree and alignment)")
        print("""
        -t,--tree    : Path to input newick file
        -p,--phylip  : Path to input phylip file
        -s,--samples : Number of samples to keep
        -f,--freq    : Sampling frequency (must set either -f or -s)
        -r,--reps    : Number of replicates to generate
        -o,--out     : Output file prefix (default=subset)
""")
        print()
        sys.exit()
#function reads a tab-delimited popmap file and return dictionary of assignments
#function returns dict of pops, each pointing to list of taxa
def parsePopmap_alt(popmap):
    """Read a popmap file into a dict of population -> list of samples.

    Each non-blank line is split on whitespace and must contain exactly
    two fields: sample name, then population ID. Lines with any other
    number of fields are reported and skipped.

    Args:
        popmap: Path to the popmap file.

    Returns:
        Dict mapping each population ID to its samples in file order.

    Any failure to open or read the file is reported and ends the
    program with exit status 1.
    """
    ret = dict()
    try:
        #open() sits inside the try so a missing file is reported too
        with open(popmap, 'r') as fh:
            for line in fh:
                stuff = line.split()
                if not stuff:
                    continue
                if len(stuff) != 2:
                    print("Uh oh! Record missing a field: ",stuff)
                    continue
                ret.setdefault(stuff[1], []).append(stuff[0])
    except IOError as e:
        print("Could not read file %s: %s"%(popmap,e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error reading file %s: %s"%(popmap,e))
        sys.exit(1)
    return(ret)


#function returns first readable line from a file
#good for getting headers etc
def firstLine(f):
    """Return the first non-blank line of a file, stripped of whitespace.

    Args:
        f: Path to the file.

    Returns:
        The stripped line, or None when the file has no non-blank line.

    Any failure to open or read the file is reported and ends the
    program with exit status 1.
    """
    try:
        with open(f, 'r') as fh:
            for line in fh:
                line = line.strip()
                if line:
                    return(line) #returns first real line
    except IOError as e:
        print("Could not read file %s: %s"%(f,e))
        sys.exit(1)
    except Exception as e:
        print("Unexpected error reading file %s: %s"%(f,e))
        sys.exit(1)
    return(None)


#Object to parse command-line arguments
class parseArgs():
    """Command-line options for treeExpansion.py.

    Attributes set from sys.argv:
        tree: Newick string, taken either from the first non-blank line of
            the file given with -t/--tree or verbatim from -s/--stree.
        popmap: Path to the popmap file (-p/--popmap).

    Construction prints the help menu and exits when -h is given, when
    getopt rejects the arguments, or when tree or popmap is missing.
    """

    def __init__(self):
        #Define options
        try:
            #"stree=" and "help" are advertised in the help menu, so they
            #have to be registered here or getopt rejects them
            options, remainder = getopt.getopt(sys.argv[1:], 't:s:p:h', \
            ["tree=","stree=","popmap=","help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.tree=None
        self.popmap=None

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt in ('t','tree'):
                self.tree = firstLine(arg)
            elif opt in ('s','stree'):
                #Tree passed directly on the command line
                self.tree = arg
            elif opt in ('p', 'popmap'):
                self.popmap = arg
            elif opt in ('h', 'help'):
                pass
            else:
                assert False, "Unhandled option %r"%opt

        #Check mandatory options are set
        if not self.tree:
            self.display_help("Error: Missing required tree (--tree or --stree)")
        if not self.popmap:
            self.display_help("Error: Missing required popmap file (-p, --popmap)")

    def display_help(self, message=None):
        """Print the optional message and the help menu, then exit."""
        if message is not None:
            print ("\n",message)
        print ("\ntreeExpansion.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("Description: Expands Newick tree of clades to include all taxa in a popmap file")

        print("""
    Arguments:
        -p,--popmap : Tab-delimited population map
        -t,--tree   : Newick tree in a file
            or
        -s,--stree  : Newick tree given as a string
        -h,--help   : Displays help menu

""")
        sys.exit()
#generator function, reads tsv line by line
def readTSV(tab):
    """Yield each non-blank line of a table as a list of fields.

    Lines are split on any run of whitespace; blank and whitespace-only
    lines are skipped.

    Args:
        tab: Path to the table.

    Any failure to open or read the file is reported and ends the
    program with exit status 1.
    """
    try:
        #open() sits inside the try so a missing file is reported too
        with open(tab, 'r') as fh:
            for line in fh:
                fields = line.split()
                if fields:
                    yield(fields)
    except IOError:
        print("Could not read file ",tab)
        sys.exit(1)


#Object to parse command-line arguments
class parseArgs():
    """Command-line options for utm2latlong.py.

    Attributes set from sys.argv:
        utm: Path to a table of UTM coordinates (-u).
        latlong: Path to a table of lat/long coordinates (-c).
        zone: UTM zone number as an int (-z).
        hemi: UTM zone letter (-l).
        inline: True when the zone is read from the table itself (-i).

    Construction prints the help menu and exits when -h is given, when
    getopt rejects the arguments, or when the option combination is
    incomplete or contradictory.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'hu:c:z:l:i', \
            ["help"])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.utm=None
        self.latlong=None
        self.zone=None
        self.hemi=None
        self.inline=False

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt == "h" or opt == "help":
                continue
            elif opt == "c":
                self.latlong=arg
            elif opt=="l":
                self.hemi=arg
            elif opt == "z":
                try:
                    self.zone=int(arg)
                except ValueError:
                    self.display_help("Zone number <-z> must be an integer")
            elif opt=="u":
                self.utm=arg
            elif opt=="i":
                self.inline=True
            else:
                assert False, "Unhandled option %r"%opt

        #Check mandatory options are set
        if not self.utm and not self.latlong:
            self.display_help("No input file provided (must be one of: <-u> or <-c>)")
        if self.utm and self.latlong:
            self.display_help("Options not compatible: <-u> <-c>")
        if self.utm:
            #A zone number alone is not enough: the conversion needs the
            #number AND the letter, or else the inline 4th column
            if not (self.zone and self.hemi) and not self.inline:
                self.display_help("Must provide zone number <-z> and letter <-l> with UTMs or as inline <-i>")
            if self.zone and self.inline:
                self.display_help("Options not compatible: <-i> <-z>")

    def display_help(self, message=None):
        """Print the optional message and the help menu, then exit."""
        if message is not None:
            print()
            print (message)
        print ("\nutm2latlong.py\n")
        print("Author: Tyler K Chafin, University of Arkansas")
        print ("Contact: tkchafin@uark.edu")
        print ("Description: Converts a table of UTM coordinates to lat/long, or lat/long to UTM")
        print("""
    Arguments
        -u : Tab-delimited table of UTM coordinates (2nd col = Easting; 3rd col = Northing)
          -or-
        -c : Tab-delimited table of lat/long coordinates (2nd col = lat; 3rd col= long)
        -z : If converting UTM to lat/long, provide zone number here (e.g. "15")
        -l : If converting UTM to lat/long, provide zone letter here (e.g. "N")
        -i : If converting from UTMs, zone can be as 4th column (e.g. "15S")

""")
        print()
        sys.exit()
#Read VCF variant calls
#Generator function, yields each locus
def read_vcf(v):
    """Yield every record of a VCF file whose FILTER field is empty.

    Args:
        v: Path to the VCF file, handed to vcf.Reader(filename=...).

    A failure to open the reader is reported and ends the program with
    exit status 1.
    """
    try:
        vfh = vcf.Reader(filename=v)
    except IOError as err:
        print("I/O error({0}): {1}".format(err.errno, err.strerror))
        sys.exit(1)
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
        #Without a reader there is nothing to iterate over
        sys.exit(1)

    for rec in vfh:
        if not rec.FILTER:
            yield(rec)


#Function to return sorted unique string from list of chars
def listToSortUniqueString(l):
    """Return the distinct items of l, sorted and joined into one string."""
    sl = sorted(set(l))
    return(str(''.join(sl)))


#Function to translate a string of bases to an iupac ambiguity code
def reverse_iupac(char):
    """Translate a set of bases into a single IUPAC code.

    Args:
        char: Either a sorted string of distinct bases (e.g. "AG"), or a
            genotype string whose single-base alleles are separated by
            "/" or "|" (e.g. "G|A"). Case is ignored.

    Returns:
        "-" if the input contains a gap, else "N" if it contains an N,
        else "." if it contains a ".", otherwise the IUPAC code.

    Raises:
        KeyError: If the bases are not a recognised combination, or a
            separated genotype holds an allele that is not one base.
    """
    char = char.upper()
    if "-" in char:
        return("-")
    elif "N" in char:
        return("N")
    elif "." in char:
        return(".")

    #Phased calls ("A|G") reach this function unsplit; normalise them to
    #the sorted, de-duplicated form used as lookup key
    if "/" in char or "|" in char:
        alleles = char.replace("|", "/").split("/")
        if any(len(a) != 1 for a in alleles):
            raise KeyError(char)
        char = "".join(sorted(set(alleles)))

    iupac = {
        'A':'A',
        'C':'C',
        'G':'G',
        'T':'T',
        'AG':'R',
        'CT':'Y',
        'AC':'M',
        'GT':'K',
        'AT':'W',
        'CG':'S',
        'CGT':'B',
        'AGT':'D',
        'ACT':'H',
        'ACG':'V',
        'ACGT':'N'
    }
    return iupac[char]


#Object to parse command-line arguments
class parseArgs():
    """Command-line options for vcf2phylip.py.

    Attributes set from sys.argv:
        vcf: Path to the input VCF file (-v/--vcf).
        out: Output file name: the -o/--out prefix with ".phy" appended,
            or "out.phy" when no prefix is given.

    Construction prints the help menu and exits when -h is given, when
    getopt rejects the arguments, or when no VCF file is named.
    """

    def __init__(self):
        #Define options
        try:
            options, remainder = getopt.getopt(sys.argv[1:], 'v:o:h', \
            ["vcf=","help","out="])
        except getopt.GetoptError as err:
            print(err)
            self.display_help("\nExiting because getopt returned non-zero exit status.")
        #Default values for params
        #Input params
        self.vcf=None
        self.out=None

        #First pass to see if help menu was called
        for o, a in options:
            if o in ("-h", "-help", "--help"):
                self.display_help("Exiting because help menu was called.")

        #Second pass to set all args.
        for opt, arg_raw in options:
            arg = arg_raw.replace(" ","")
            arg = arg.strip()
            opt = opt.replace("-","")
            if opt in ('v', 'vcf'):
                self.vcf = arg
            elif opt in ('h', 'help'):
                pass
            elif opt in ('o','out'):
                self.out = arg
            else:
                assert False, "Unhandled option %r"%opt

        #Check mandatory options are set
        if not self.vcf:
            self.display_help("\nError: Missing required input file <-v,--vcf>")

        #The user supplies a prefix; the extension is always .phy
        if self.out:
            self.out = self.out + ".phy"
        else:
            self.out = "out.phy"

    def display_help(self, message=None):
        """Print the optional message and the help menu, then exit."""
        if message is not None:
            print (message)
        print ("\nvcf2phylip.py\n")
        print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
        print ("\nUsage: ", sys.argv[0], "-v \n")
        print ("Description: Extract SNPs from a VCF file and outputs as concatenated Phylip")

        print("""
    Arguments:
        -v,--vcf  : VCF input file
        -o,--out  : Prefix for output file
        -h,--help : Displays help menu

""")
        sys.exit()