├── LICENSE.md
├── README.md
├── alignment_subsetter.py
├── alleles2taghap.pl
├── autoFetcher.py
├── averageFastStructure.pl
├── batchBUCKY.pl
├── bootstrapGeneTrees.sh
├── check_cds.py
├── collapseHaps.pl
├── collapse_baits.py
├── compare2seqs.pl
├── compare_seqs_fasta.py
├── concatFasta.py
├── concatenateNexus.py
├── condenseAlleles.pl
├── count_residues.pl
├── expandSeq.py
├── fast2distruct.pl
├── fasta2gphocs.py
├── fasta2length.pl
├── fasta2nexus.pl
├── fasta2phylip.py
├── fastaFormatter.py
├── fill_quartets.py
├── filterFastaMedianLength.sh
├── filterLoci.py
├── filter_loci.pl
├── findBreaksVCF.py
├── fixedSNP.pl
├── genesFromGFF.pl
├── ipyrad2polyrad.py
├── liftoverCoords.py
├── liftoverFromPafscaff.py
├── makeHyde.py
├── makePopArt.py
├── makeSAMOVA.pl
├── newhybs2distruct.py
├── newhybs2props.py
├── nremover.pl
├── parallelMB.pl
├── parsePhaseCons.py
├── phylip2bgc.pl
├── phylip2biNumNex.py
├── phylip2ecoevolity.pl
├── phylip2introgress.pl
├── phylip2newhybrids.pl
├── phylip2nexus.pl
├── phylip2structure.pl
├── phylipFilterPops.pl
├── phylobarcode.py
├── process_ecoevolity.sh
├── pseudoHaploidize.py
├── pyrad2fasta.pl
├── python_template.py
├── revTransAll.py
├── seq2structure.pl
├── short2fullPopmap.pl
├── slidingWindowGC.pl
├── snps2phy.sh
├── splitFASTA.pl
├── splitFastaPops.py
├── splitStackedFasta.pl
├── splitTableCF.py
├── stacks2fasta.pl
├── structure2newhyb.pl
├── subsetPhy.py
├── subsetSnps.py
├── sumls.sh
├── summaryGFF.pl
├── terminalGapRemover.py
├── test_files
    ├── gtrees.tre
    ├── revTransAll_code.txt
    ├── revTransAll_in.fas
    ├── revTransAll_out.fas
    ├── terminal_gaps.fasta
    ├── terminal_gaps.gapfix.fasta
    ├── variable_length.fas
    └── variable_length.fas.filter
├── traitsList2LagrangePhylip.py
├── treeAlignment_subsetter.py
├── treeExpansion.py
├── trimFastq.pl
├── utm2latlong.py
└── vcf2phylip.py


/README.md:
--------------------------------------------------------------------------------
  1 | # scripts
  2 | A collection of scripts, mostly for manipulating, filtering, and converting the formats of DNA sequence files. Feel free to use.
  3 | 
  4 | ### How to use
  5 | Most scripts are written to accept the <-h> argument to display a help menu which should describe the function of the scripts as well as any optional or mandatory inputs.
  6 | 
  7 | Example:
  8 | The Perl program "alleles2taghap.pl" takes the ".alleles" output from the RADseq assembly program pyRAD and creates the ".taghap" format for the program fineRADstructure. To display the help menu, call the program like so:
  9 | 
 10 |     ./alleles2taghap.pl -h
 11 | 
 12 | Which will display:
 13 | 
 14 |     tkchafin@acamel-linux1:~/scripts$ ./alleles2taghap.pl -h
 15 | 
 16 |     alleles2taghap.pl by Tyler Chafin
 17 | 
 18 |     This script converts from the .alleles file output by pyRAD to create the input for fineRADstructure
 19 | 
 20 |     NOTE:
 21 | 	- All samples are assumed to be diploid.
 22 | 	- Sample names CANNOT contain underscores.
 23 | 	- Columns containing Ns or gaps will be deleted from final output
 24 | 	- Popmap file should be tab-delimited, like so: SampleName [tab] PopID
 25 | 	- If populations to include/exclude are not given, all samples in popmap are used.
 26 | 	- You can specify multiple popIDs as: ID1+ID2+ID3, as long as these match IDs in popmap
 27 | 	- For the -s filter, singletons are evaluated within the selected subset of individuals
 28 | 
 29 |     Options:
 30 | 	-a	: Path to input file (.alleles)
 31 | 	-p	: Path to popmap file (tab-delimited)
 32 | 	-o	: Output file prefix. [Default = out, i.e. out.taghap]
 33 | 	-c	: Min number of samples for which data must be present per locus [Default = 1]
 34 | 	-n	: Minumum proportion of loci an individual must be present at to be retained [def = 0.2]
 35 | 	-i	: PopIDs to include in output file (e.g. -i pop1+pop4)
 36 | 	-x	: PopIDs to exclude (e.g. -x catenatus or -x sistrTX+sistrIN)
 37 | 	-m	: Maximum number of SNPs per locus. Loci exceeding are deleted [default:10]
 38 | 	-s	: Skip SNPs that are singletons [Boolean; Default = false]
 39 | 	-h	: Displays this help message
 40 | 
 41 |     Program killed: Help menu called.
 42 | 
 43 | ### Contents
 44 | Here is a (probably) complete list of the scripts contained here, and generally what they do. All scripts written in Python require Python3.
 45 | ```
 46 | alleles2taghap.pl	: Converts from pyRAD .alleles format to input for fineRadStructure
 47 | averageFastStructure.pl	: Combines multiple replicate runs of FastStructure
 48 | batchBUCKY.pl		: Pipeline for running BUCKy. Old and probably broken.
 49 | collapse_baits.py	: For filtering baits by SNP count from BaitsTools output
 50 | collapseHaps.pl		: Collapse sequences to redundant consensus sequences
 51 | compare2seqs.pl		: This was a learning exercise. Just compares sequences.
 52 | concatFasta.py		: Script to concatenate fastas (No help menu- use argv)
 53 | concatenateNexus.py	: Concatenate Nexus alignments and calculate partitions block
 54 | condenseAlleles.pl	: Creates a consensus of alleles (input as FASTA) per individual
 55 | count_residues.pl	: Counts residues in an amino acid alignment
 56 | fast2distruct.pl	: Tries to parse FastStructure outputs to create DISTRUCT input
 57 | fasta2length.pl		: Calculate non-gap character length of sequences
 58 | fasta2nexus.pl		: Converts FASTA to NEXUS format
 59 | fasta2phylip.py		: Converts from FASTA to PHYLIP and PHYLIP to FASTA, nothing fancy
 60 | findBreaksVCF.py	: Breaks contigs in VCF to chunks of X parsimony-informative SNPs, for running MDL
 61 | fill_quartets.py	: Sorts through TICR output to find missed quartets (for debugging only) (no help menu - use argv)
 62 | filter_loci.pl		: Parses a directory of FASTA alignments, and blacklists those with too low alignment coverage
 63 | filterLoci.py		: Filters a pyRAD .loci file on individual coverage and number of parsimony-informative sites
 64 | fixedSNP.pl		: Parses PHYLIP file to find differentially fixed SNPs between two given populations
 65 | genesFromGFF.pl		: Extracts elements from a FASTA file, given a GFF file of annotations
 66 | liftoverCoords.py	: Converts coordinates between assemblies (e.g. CanFam2 to CanFam3) and makes MareyMap inputs
 67 | makeHyde.py		: Makes inputs for HyDe- Hybrid Detection program
 68 | makePopArt.py		: Python program to make inputs for PopArt (haplotype network program) from FASTA
 69 | makeSAMOVA.pl		: Makes inputs for SAMOVA given FASTA and coordinates, with automatic clustering by distance
 70 | newhybs2distruct.py	: Takes posterior probs (PofZ.txt) from NewHybrids and makes inputs to run DISTRUCT
 71 | newhybs2props.py	: Calculates genealogical assignment proportions from NewHybrids, outputs table and files to spoof DISTRUCT
 72 | nremover.pl		: My version of Steve Mussmann's nremover script, for filtering DNA alignments
 73 | parallelMB.pl		: For running batches of MrBayes on a cluster, in parallel per locus
 74 | phylip2bgc.pl		: Converts PHYLIP alignment to inputs for BGC (inference of Bayesian Genomic CLines)
 75 | phylip2biNumNex.py	: Converts PHYLIP to bi-allelic numerically coded NEXUS for PhyloNet's MLE_BiMarkers
 76 | phylip2ecoevolity.pl	: Converts PHYLIP to the NEXUS format needed for ecoevolity.
 77 | phylip2introgress.pl	: Converts PHYLIP to inputs for R package INTROGRESS (introgression analyses)
 78 | phylip2newhybrids.pl	: Creates inputs for NewHybrids, with missing data filters built in
 79 | phylip2nexus.pl		: Converts PHYLIP to NEXUS
 80 | phylip2structure.pl	: Converts PHYLIP alignment of SNPs to inputs for STRUCTURE
 81 | phylipFilterPops.pl	: Filters SNPs for creating PoMo-IQTREE inputs
 82 | process_ecoevolity.sh	: Runs the post-processing for ecoevolity outputs
 83 | pseudoHaploidize.py	: Script to haploidize FASTA-formatted sequences by randomly sampling alleles at heterozygous sites
 84 | pyrad2fasta.pl		: Extracts genewise alignments from pyRAD .loci format, and writes FASTA for each
 85 | seq2structure.pl	: I assume somehow different than phylip2structure, I don't remember honestly
 86 | short2fullPopmap.pl	: Does a very specific thing to my tab-delimited popmap files
 87 | slidingWindowGC.pl	: Calculates GC content along a sliding window down a sequence
 88 | snps2phy.sh		: Shell script to convert pyRAD .snp output to PHYLIP format
 89 | splitFASTA.pl		: Breaks a FASTA file into a user-defined number of chunks. For helping parse a large genome
 90 | splitFastaPops.py	: Pulls subsets from FASTA file to new FASTA file, given tab-delimited table of population IDs
 91 | splitStackedFasta.pl	: Splits FASTA of specifically-formatted collapsed read clusters
 92 | stacks2fasta.pl		: Formats output of STACKS to a new FASTA for variable loci, by querying the cstacks catalog
 93 | structure2newhyb.pl	: Converts STRUCTURE file to input for NewHybrids
 94 | subsetPhy.py		: Quickly written and shitty script to subset taxa from a PHYLIP alignment
 95 | subsetSnps.py		: Given a list of desired columns, subsets SNPs from a STRUCTURE file
 96 | sumls.sh		: A bash alias for doing something with ls
 97 | summaryGFF.pl		: Something old and incomplete.
 98 | treeExpansion.py	: Converts a Newick tree of clade names, expanded to all taxa in given tab-delimited file
 99 | trimFastq.pl		: Perl script for end-trimming FASTQ reads
100 | utm2latlong.py		: Converts UTM to and from latitude and longitudes
101 | vcf2phylip.py		: VCF to PHYLIP
102 | ```
103 | 


--------------------------------------------------------------------------------
/alignment_subsetter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | #import toytree as tt
  7 | import random
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	seqs=dict()	
 13 | 	for f in read_phylip(params.phylip):
 14 |                         seqs[f[0]] = f[1]
 15 | 
 16 | 	#tree=tt.tree(params.tree, tree_format=0)
 17 | 
 18 | 	if not params.samples:
 19 | 		params.samples=int(params.freq*len(list(seqs.keys())))
 20 | 	
 21 | 	print("Generating",params.reps,"random subsets of",params.samples,"samples eac")
 22 | 
 23 | 	for r in range(params.reps):
 24 | 		print("starting replicate",str(r))
 25 | 		prefix=params.out + "_" + str(r)
 26 | 		print("subsetting alignment")
 27 | 		keeps=dict(random.sample(seqs.items(), params.samples))
 28 | 		bad_bois=[k for k in seqs.keys() if k not in keeps]
 29 | 		#print("subsetting tree")
 30 | 		#stree = tree.drop_tips(names=bad_bois)
 31 | 		print("writing subset file")
 32 | 		write_phylip(prefix+".phylip",keeps)
 33 | 		#stree.write(prefix+".tre", tree_format=0)
 34 | 
 35 | 
 36 | #Print dict to phylip file
 37 | def write_phylip(p, aln):
 38 |         with open(p, 'w') as fh:
 39 |                 try:
 40 |                         header = getPhylipHeader(aln) + "\n"
 41 |                         fh.write(header)
 42 | 
 43 |                         for sample in aln.keys():
 44 |                                 line = str(sample) + "\t" + "".join(aln[sample]) + "\n"
 45 |                                 fh.write(line)
 46 |                 except IOError as e:
 47 |                         print("Could not read file %s: %s"%(p,e))
 48 |                         sys.exit(1)
 49 |                 except Exception as e:
 50 |                         print("Unexpected error reading file %s: %s"%(p,e))
 51 |                         sys.exit(1)
 52 |                 finally:
 53 |                         fh.close()	
 54 | 
 55 | #Returns header for Phylip file from a dictionary of samples w/ data
 56 | def getPhylipHeader(d):
 57 |         numSamp = 0
 58 |         numLoci = None
 59 |         for sample in d:
 60 |                 numSamp = numSamp + 1
 61 |                 if not numLoci:
 62 |                         numLoci = len(d[sample])
 63 |                 else:
 64 |                         if numLoci != len(d[sample]):
 65 |                                 print("getPhylipHeader: Warning: Sequences of unequal length.")
 66 |         header = str(numSamp) + " " + str(numLoci)
 67 |         if numLoci == 0 or not numLoci:
 68 |                 print("getPhylipHeader: Warning: No loci in dictionary.")
 69 |         if numSamp == 0:
 70 |                 print("getPhylipHeader: Warning: No samples in dictionary.")
 71 |         return(header)
 72 | 
 73 | 
 74 | #Read samples as PHYLIP. Generator function
 75 | def read_phylip(phy):
 76 |         if os.path.exists(phy):
 77 |                 with open(phy, 'r') as fh:
 78 |                         try:
 79 |                                 num=0
 80 |                                 for line in fh:
 81 |                                         line = line.strip()
 82 |                                         if not line:
 83 |                                                 continue
 84 |                                         num += 1
 85 |                                         if num == 1:
 86 |                                                 continue
 87 |                                         arr = line.split()
 88 |                                         yield(arr[0], arr[1])
 89 |                         except IOError:
 90 |                                 print("Could not read file ",phy)
 91 |                                 sys.exit(1)
 92 |                         finally:
 93 |                                 fh.close()
 94 |         else:
 95 |                 raise FileNotFoundError("File %s not found!"%phy)
 96 | 
 97 | #Object to parse command-line arguments
 98 | class parseArgs():
 99 | 	def __init__(self):
100 | 		#Define options
101 | 		try:
102 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hs:f:r:p:o:m:', \
103 | 			["help", "reps=","phylip=","out=", "method=", "samples=", "freq="])
104 | 		except getopt.GetoptError as err:
105 | 			print(err)
106 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
107 | 		#Default values for params
108 | 		#Input params
109 | 		#self.tree=None
110 | 		self.reps=10
111 | 		self.freq=0.1
112 | 		self.samples=None
113 | 		self.phylip=None
114 | 		self.method="random"
115 | 		self.out="subset"
116 | 
117 | 
118 | 		#First pass to see if help menu was called
119 | 		for o, a in options:
120 | 			if o in ("-h", "-help", "--help"):
121 | 				self.display_help("Exiting because help menu was called.")
122 | 
123 | 		#Second pass to set all args.
124 | 		for opt, arg_raw in options:
125 | 			arg = arg_raw.replace(" ","")
126 | 			arg = arg.strip()
127 | 			opt = opt.replace("-","")
128 | 			#print(opt,arg)
129 | 			if opt == "h" or opt == "help":
130 | 				continue
131 | 			elif opt=="phylip" or opt=="p":
132 | 				self.phylip=arg
133 | 			elif opt=="method" or opt=="m":
134 | 				self.method=arg
135 | 			elif opt=="reps" or opt=="r":
136 | 				self.reps=int(arg)
137 | 			elif opt=="freq" or opt=="f":
138 | 				self.freq=float(arg)
139 | 			elif opt=="samples" or opt=="s":
140 | 				self.samples=int(arg)
141 | 			elif opt=="out" or opt=="o":
142 | 				self.out=arg
143 | 			else:
144 | 				assert False, "Unhandled option %r"%opt
145 | 
146 | 		#Check manditory options are set
147 | 		if not self.phylip and not self.tree:
148 | 			self.display_help("Must provide input tree (newick) and alignment (phylip) files.")
149 | 
150 | 
151 | 
152 | 	def display_help(self, message=None):
153 | 		if message is not None:
154 | 			print()
155 | 			print (message)
156 | 		print ("\nalignment_subsetter.py\n")
157 | 		print ("Description: Generate random subsets of an input phylip (alignment)")
158 | 		print("""
159 | 		-p,--phylip	: Path to input phylip file
160 | 		-s,--samples	: Number of samples to keep
161 | 		-f,--freq	: Sampling frequency (must set either -f or -s)
162 | 		-r,--reps	: Number of replicates to generate
163 | 		-o,--out	: Output file name (default=out.fas)
164 | """)
165 | 		print()
166 | 		sys.exit()
167 | 
168 | #Call main function
169 | if __name__ == '__main__':
170 |     main()
171 | 


--------------------------------------------------------------------------------
/averageFastStructure.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | use strict; 
  4 | use warnings;
  5 | use Getopt::Long;
  6 | use File::Basename; 
  7 | use Statistics::R; 
  8 | 
  9 | my @meanQ; 
 10 | my @log; 
 11 | my $out = "./avg_k";
 12 | my $help = 0; 
 13 | my $force =0; 
 14 | my $k; 
 15 | my $reps; 
 16 | parseArgs(); 
 17 | 
 18 | my ($filepath, $dirpath) = fileparse($meanQ[0]); 
 19 | @meanQ = glob("@meanQ"); 
 20 | @log = glob("@log"); 
 21 | my $count = 0; 
 22 | my @data; 
 23 | my $fnum = 0; 
 24 | foreach my $file(@meanQ){ 
 25 |   my @line; 
 26 |   if ($force == 0){
 27 |     $file !~ /.*meanQ/ and die "Error: File $file is missing .meanQ extension. Are you sure it is the correct file type? To skip this check, add the -f flag to your command-line call\n"; 
 28 |   }
 29 |   open (my $fh, $file) || die "Can't open $file\n";  
 30 |   $count++;
 31 |   my $lnum = 0; 
 32 |   while (<$fh>){ 
 33 |     $lnum++; 
 34 |     chomp; 
 35 |     @line = split /\s+/; 
 36 |     s/\s+//g; 
 37 |     next unless length; 
 38 |     if ($count==1){
 39 |       if (!defined $k){
 40 |         $k = @line unless $k;
 41 |         print "K value was not supplied; inferring clusters from file $file: $k\n"
 42 |       }
 43 |      } 
 44 |     @line != $k and die "Error: Line $lnum of file $file doesn't have the correct number of clusters ($k)\n";  
 45 |   } 
 46 |   #$fnum++; 
 47 |   close $fh; 
 48 | }
 49 | 
 50 | #Get likelihoods from .log files
 51 | my @lognames; 
 52 | my @likelihoods; 
 53 | foreach my $file (@log){
 54 |   my ($fpath, $dpath) = fileparse($file); 
 55 |   push @lognames, $dpath . $fpath; 
 56 |   open (my $fh, $file) || die "Can't open $file\n"; 
 57 |   while (<$fh>){
 58 |     chomp;
 59 |     if (m/Marginal Likelihood =/){ 
 60 |       s/Marginal Likelihood =//;
 61 |       push @likelihoods, $_;  
 62 |     }
 63 |   }
 64 | }
 65 | 
 66 | #Default use all reps if no subset number provided
 67 | if (!defined $reps){
 68 |   print "Warning: Number of replicates to subset not provided; using all by default\n";
 69 |   $reps = scalar(@lognames);
 70 |   #print $reps . "\n"; 
 71 | }
 72 | 
 73 | my $R = Statistics::R->new(); 
 74 |   $R->start; 
 75 |   $out = $out . $k . ".meanQ";
 76 |   $R->set('lognames', \@lognames); 
 77 |   $R->set('likelihoods', \@likelihoods); 
 78 |   $R->set('reps', $reps);
 79 |   $R->set('out', $out); 
 80 |   $R->send(q`options(scipen=999)`);
 81 |   $R->run(q`likes <- data.frame(lognames, likelihoods)`);
 82 |   $R->run(q`likes[,1] = sub(".log","",likes[,1])`); 
 83 |   #Set up R functions
 84 |   $R->send(q`
 85 |     ################################
 86 |     JSD.pair <- function(x, y){
 87 | 	###Function to compute Shannon-Jensen Divergence
 88 | 	###x and y are the frequencies for the same p categories
 89 | 	u <- x/sum(x)
 90 | 	v <- y/sum(y)
 91 | 	m <- (u+v)/2
 92 | 	if (all(u*v>0)){
 93 | 		d <- (u*log(u/m)+v*log(v/m))/2
 94 | 	} else {
 95 | 		P1 <- u*log(u/m)
 96 | 		P2 <- v*log(v/m)
 97 | 		P1[is.nan(P1)] <- 0
 98 | 		P2[is.nan(P2)] <- 0
 99 | 		d <- (P1+P2)/2
100 | 	}
101 | 	return(sum(d))
102 |     }
103 |     ##############################
104 |     matchPops=function(ga, gb, niter=3000) {
105 | 	### function to match population identifiers between fastStructure runs
106 | 	### based on permutations of column names and Shannon-Jensen divergences
107 | 	minsum=1000
108 | 	for (i in 1:niter) {
109 | 		names(gb)=sample(names(gb))
110 | 		sumjsd=0
111 | 		for (n in names(ga)) { 
112 | 			sumjsd=sumjsd+JSD.pair(ga[,n],gb[,n])
113 | 		}
114 | 		if (sumjsd<minsum) {
115 | 			minsum=sumjsd
116 | 			gbnames=names(gb)
117 | 		}
118 | 	}
119 | 	return(list("pops"=gbnames,"min.JSD"=minsum))
120 |     }
121 |     ##############################
122 |     averageBest=function(likelihoods,top=25) {
123 | 	# matches populations assignments among best-likelihood runs,
124 | 	# averages assignemnt probabilities, returns averaged meanQ table
125 | 	bests=head(likes[order(likes[,2],decreasing=T),1],top)
126 | 	gs=read.table(paste(bests[1],".meanQ",sep=""))
127 | 	g1=gs
128 | 	print("top 1")
129 | 	for (b in 2:top) {
130 | 		gn=read.table(paste(bests[b],".meanQ",sep=""))
131 | 		names(gn)=matchPops(g1,gn)$pops
132 | 		gs=gs+gn[,names(g1)]
133 | 	}
134 | 	return(gs/top)
135 |     }`
136 |   );
137 |   #Run averaging functions
138 |   $R->run(q`means=averageBest(likelihoods=likes, top=reps)`);
139 |   $R->run(q`write.table(means, file=out, sep="   ", quote=FALSE, na="NA", append=FALSE, row.names=FALSE,col.names=FALSE)`); 
140 |  
141 | 
142 | #open (my $ofstream, ">$out") || die "Can't open $out\n"; 
143 | #  for (my $i=0; $i<=$#data; $i++){ 
144 | #    for (my $k=0; $k<=$#{$data[$i]}; $k++){ 
145 | #      print $data[$i][$k]/$count . " ";
146 | #    }
147 | #    print "\n"; 
148 | #  }
149 | 
150 | 
151 | exit;
152 | 
153 | #########################################################################################
154 | 
#Parse the command line into the file-level option variables.
#Dies with the usage text when help is requested, and with an error plus the
#usage text when no .meanQ or .log files were given.
sub parseArgs{

my $message = 
"\n\nAverages multiple fastStructure runs for the same k value.  

If you have problems running the script let me know. It hasn't really been tested fully, and I threw it together quickly. 

Arguments

	-i	- Input fastStructure .meanQ files - wildcard usage is fine
	-o	- Output prefix and path
	-l	- Input fastStructure .log files
	-r	- Number of replicates to use. Script will choose top N reps based on likelihoods
	-k	- Provide a k value, otherwise it will be detected from column counts
	-f	- Shut up and stop checking files for .meanQ extension 
\n\n"; 

	my $result = GetOptions
	(
	'i=s{1,}'	=> \@meanQ,
	'f!'		=> \$force,
	'l=s{1,}'	=> \@log,
	'k=i'		=> \$k,
	'r=i'		=> \$reps,
	'o=s'		=> \$out,
	'h!'		=> \$help
	);

#Help goes first, so that "-h" alone shows the usage text without a
#misleading "No meanQ specified!" error in front of it
$help == 1 and die $message;
@meanQ or die "\n\nNo meanQ specified!" . $message;
@log or die "\n\nNo .log specified!" .  $message;

}
187 | 
188 | 
189 |  
190 | 


--------------------------------------------------------------------------------
/batchBUCKY.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use strict; 
  4 | use warnings; 
  5 | use Getopt::Long; 
  6 | use File::Basename; 
  7 | 
  8 | our $cmd="bucky"; 
  9 | our $input; 
 10 | our $ngen=100000; 
 11 | our $nrun=2;
 12 | our $alpha=1;  
 13 | our $help=0; 
 14 | our $nchain=1; 
 15 | our $rate="";
 16 | our $alpham=""; 
 17 | our @other = ();
 18 | our $cutoff="0.05";
 19 | our $ind=0;
 20 | our $spacesaver=0;
 21 | our $out="bca";
 22 | 
 23 | #Parse command line arguments to define above variables
 24 | 
 25 | parseArgs(); 
 26 | 
 27 | #Format some variables
 28 | 
 29 | my $other = "@other";
 30 | $rate =~ /\S/ and $rate = "-r $rate";
 31 | $alpham =~ /\S/ and $alpham = "-m $alpham"; 
 32 | $ind == 1 and $ind = "--use-independence-prior";
 33 | $ind == 0 and $ind = ""; 
 34 | $spacesaver == 1 and $spacesaver = "--opt-space";
 35 | $spacesaver == 0 and $spacesaver = "";
 36 | my ($filepath, $dirpath) = fileparse($input);
 37 | $input = "$dirpath\*.in";
 38 | my ($outname, $outpath) = fileparse($out); 
 39 | 
 40 | #BUCKy system call 
 41 | print $outpath, "\n"; 
 42 | chdir "$outpath"; 
 43 | 
 44 | system ("$cmd -a $alpha -k $nrun -c $nchain $rate $alpham -o $outname -cf $cutoff $ind $other $spacesaver $input");
 45 | 
 46 | 
 47 | 
 48 | 
 49 | exit;
 50 | 
 51 | #####################################SUBROUTINES##############################################
 52 | 
sub parseArgs{
    #Fills the file-level "our" option variables from the command line.
    #Dies with the usage text when help is requested or no input was given.

    my $usage="\nUsage: $0 -i /path/to/*.in [-option value] or [--option=value] 

batchBUCKY.pl takes as input the summed .t files from mrbayes (summarized per locus via mbsum) and performs a Bayesian Concordance Analysis to assess what proportion of the genome supports different phylogenetic topologies. 

--------------------------------------Mandatory Input-------------------------------
	-i, --input	- Path to .in files created by mbsum (automatically generated by runMRBAYES.pl)


---------------------------------------General Options-------------------------------
	--cmd		- Command to call bucky, if different than default [default=bucky] 
	-o, --out		- Output file root name [Default=bca] Can also include path to output directory [e.g. -o /path/to/bca]
	
----------------------------------------BUCKy Options-------------------------------- 
	-a, --alpha	- Use this option to set the a priori level of discordance among loci [default=1] 
	-n, --ngen	- Number of generations for MCMC. Burnin will automatically be 10% of the desired number of post-burnin updates. [default=100,000] 
	-k, --nrun	- Number of independent analyses to run 
	-f, --cutoff	- Provide a cutoff Concordance Factor value, above which all splits will be retained [default=0.05] 
	--other		- Use this option to set any other bucky parameters or functions [example: --other -s1 1234 --calculate-pairs --create-single-file] 	
	--ind		- Use independent priors. Assumes a priori that loci have independent histories. [Usage: --ind ] 
	--opt-space	- This option accomodates large data sets with space optimization. [Usage: --opt-space ] 

----------------------------------------MCMCMC Options--------------------------------
	-c, --nchain	- This option toggles on Metropolic coupled MCMC. Any chains more than one will be \"hot\" chains which will occasionally swap states with the cold chain to improve mixing [Default=1; i.e. no heated chains]
	-r, --rate	- If MCMCMC is used, this controls the rate at which chains swap [default=100]
	-m, --alpham	- Heated chains in MCMCMC use higher alpha values than the cold chain. This parameter sets the multiplier for the heated alpha value [default=10]\n\n"; 

	#Each option writes straight into the matching file-level variable.
	#$result is not inspected, so a failed parse does not stop the script.
	my $result = GetOptions 
	(	
	'input|i=s'	=> \$input,
	'cmd=s'		=> \$cmd, 
	'alpha|a=s'	=> \$alpha,
	'ngen|n=i'	=> \$ngen, 
	'nrun|k=i'	=> \$nrun,
	'cutoff|f=s'	=> \$cutoff,
	'other=s{1,}'	=> \@other,
	'nchain|c=i'	=> \$nchain,
	'rate|r=i'	=> \$rate,
	'alpham|m=i'	=> \$alpham,
	'help|h!'	=> \$help,
	'out|o=s'	=> \$out,
	'ind!'		=> \$ind,
	'opt-space!'	=> \$spacesaver,
	); 

#Help wins over the missing-input error
$help == 1 and die "\n$usage\n";
$input or die "\nInput not specified!\n$usage\n";

}
103 | 


--------------------------------------------------------------------------------
/bootstrapGeneTrees.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | #Tyler K. Chafin 
 4 | #July 23 2021
 5 | #Generates n bootstrap samples of an input newick-formatted file of trees
 6 | #Email: tylerkchafin@gmail.com with issues
 7 | 
 8 | if [ $1 ] && [ $2 ]; 
 9 | then 
10 |   trees="$1"
11 |   n=$2
12 | else 
13 |   printf "\nUsage: $0 <tree file> <n>\n\n"
14 |   exit 1
15 | fi
16 | 
17 | for i in `seq 1 $n`; 
18 | do
19 |   ofile="b_"$i".tre"
20 |   shuf -r -n $n $trees > $ofile
21 | done


--------------------------------------------------------------------------------
/collapseHaps.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl 
 2 | 
 3 | #Tyler K. Chafin; 14-Dec-15 
 4 | #tkchafin@uark.edu 
 5 | 
 6 | use strict; 
 7 | use warnings; 
 8 | 
 9 | my $usage = "
10 | This script functions to collapse aligned sequences in FASTA format into haplotypes, and sort halpotypes by frequency. 
11 | 
12 | Author: Tyler K. Chafin - tkchafin\@uark.edu
13 | Last Modified: 14-Dec-15
14 | 
15 | Usage: $0 </path/to/.fasta> 
16 | 
17 | "; 
18 | my $file; 
19 | if (defined $ARGV[0]){
20 |   $file = $ARGV[0]; 
21 |   print "Input: $file\n"; 
22 | }else{ 
23 |   die $usage; 
24 | }
25 | 
26 | my %contents; 
27 | my %freq; 
28 | 
29 | open (INPUT, $file) || die "Cannot open $file: $!\n\n"; 
30 | while (<INPUT>){ 
31 |   chomp;   
32 |   if ($_ =~ /^\s*$/){
33 |     next;  
34 |   }elsif ($_ =~ /^>/){ 
35 |     next; 
36 |   }else{
37 |     if ($contents{uc $_}){
38 |       $contents{uc $_}++; 
39 |     }else{ 
40 |       $contents{uc $_} = 1; 
41 |     }
42 |   }   
43 | }
44 | close INPUT;
45 | open (OUT, ">sorted.fasta") || die "Could not open output file: $!\n"; 
46 | print "Output: sorted.fasta\n";
47 | my $count=1; 
48 | foreach my $key (sort {$contents{$b} <=> $contents{$a}}keys %contents){ 
49 |   print OUT ">H$count\n";
50 |   print OUT "$key\n";
51 |   $count++; 
52 | }
53 | close OUT;
54 | 
55 | exit; 
56 | 


--------------------------------------------------------------------------------
/compare2seqs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict; 
 4 | use warnings; 
 5 | 
 6 | my $tax1="Carica"; 
 7 | my $tax2="Vitis"; 
 8 | my $input="cox1.fasta";
 9 | 
10 | #Open input fasta
11 | open (FAS, "$input" ) || die "\n\nI pity the fool that can't open their fasta files\n\n$!\n\n";
12 | 
13 | my $count=0; 
14 | my @dna1; 
15 | my @dna2;
16 | 
17 | 
18 | #Set first and second sequences to exploded arrays
19 | while ( <FAS> ){ 
20 |     if ($_ !~ /^>/)  { 
21 | 	$count++;  
22 | 	$count==1 and @dna1 = split //, "$_";
23 | 	$count==2 and @dna2 = split //, "$_"; 
24 |     }
25 | }
26 | 		  
27 | #print "\n\n@dna1\n\n@dna2\n\n"; 
28 | 
29 | 
30 | if (length(@dna1) ne length(@dna2)){
31 |     print "\nWarning:Sequences to compare are of different length. Check your alignment.\n\n";
32 | }
33 | 
34 | #Print header for table
35 | print "\nPosition $tax1 $tax2\n"; 
36 | 
37 | my $samelen=0;
38 | my $difflen=0;
39 | my $total=0; 
40 | for ( my $i=0; $i <= $#dna1; $i++){ 
41 |     if ($dna1[$i] eq $dna2[$i]){ 
42 | 	$samelen++;
43 | 	$total++; 
44 |     }else{ 
45 | 	$difflen++;
46 | 	$total++;  
47 | 	print $i+1, "\t$dna1[$i]\t$dna2[$i]\n"; 
48 |     } 
49 | } 
50 | 
51 | print "\nnumber of identical sites: $samelen\n"; 
52 | print "number of different sites: $difflen\n"; 
53 | print "percent difference: "; 
54 | printf( "%.2f \n\n", $difflen / $total * 100 ); 
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/compare_seqs_fasta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | from textwrap import wrap
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 
 11 | 	seqs=dict()
 12 | 	seqlen=0
 13 | 	for s in read_fasta(params.infile):
 14 | 		seqs[s[0]]=s[1]
 15 | 		seqlen=len(s[1])
 16 | 
 17 | 	ann=""
 18 | 	for i in range(seqlen):
 19 | 		seen=list()
 20 | 		for s in seqs:
 21 | 			seen.append(seqs[s][i].lower())
 22 | 		vars=set(seen)
 23 | 		# add conditions here if you want to change output e.g. if there are gaps
 24 | 		if len(vars)==1:
 25 | 			ann = ann + "-"
 26 | 		else:
 27 | 			ann = ann + "*"
 28 | 	seqs["annotation"] = ann
 29 | 
 30 | 	write_fasta(params.out, seqs)
 31 | 
 32 | #Function to write fasta-formatted sequences
 33 | def write_fasta(f, aln, width=None):
 34 | 	with open(f, 'w') as fh:
 35 | 		try:
 36 | 			for samp in aln.keys():
 37 | 				if width:
 38 | 					ol = ">" + str(samp) + "\n"
 39 | 					chunks=wrap(aln[samp], width=width, break_on_hyphens=False, drop_whitespace=False)
 40 | 					for chunk in chunks:
 41 | 						ol=ol + str(chunk) + "\n"
 42 | 				else:
 43 | 					ol = ">" + str(samp) + "\n" + str(aln[samp]) + "\n"
 44 | 				fh.write(ol)
 45 | 		except IOError as e:
 46 | 			print("Could not read file %s: %s"%(f,e))
 47 | 			sys.exit(1)
 48 | 		except Exception as e:
 49 | 			print("Unexpected error reading file %s: %s"%(f,e))
 50 | 			sys.exit(1)
 51 | 		finally:
 52 | 			fh.close()
 53 | 
 54 | #Read samples as FASTA. Generator function
 55 | def read_fasta(fas):
 56 | 	if os.path.exists(fas):
 57 | 		with open(fas, 'r') as fh:
 58 | 			try:
 59 | 				contig = ""
 60 | 				seq = ""
 61 | 				for line in fh:
 62 | 					line = line.strip()
 63 | 					if not line:
 64 | 						continue
 65 | 					#print(line)
 66 | 					if line[0] == ">": #Found a header line
 67 | 						#If we already loaded a contig, yield that contig and
 68 | 						#start loading a new one
 69 | 						if contig:
 70 | 							yield([contig,seq]) #yield
 71 | 							contig = "" #reset contig and seq
 72 | 							seq = ""
 73 | 						split_line = line.split()
 74 | 						contig = (split_line[0].replace(">",""))
 75 | 					else:
 76 | 						seq += line
 77 | 				#Iyield last sequence, if it has both a header and sequence
 78 | 				if contig and seq:
 79 | 					yield([contig,seq])
 80 | 			except IOError:
 81 | 				print("Could not read file ",fas)
 82 | 				sys.exit(1)
 83 | 			finally:
 84 | 				fh.close()
 85 | 	else:
 86 | 		raise FileNotFoundError("File %s not found!"%fas)
 87 | 
 88 | #Object to parse command-line arguments
 89 | class parseArgs():
 90 | 	def __init__(self):
 91 | 		#Define options
 92 | 		try:
 93 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hi:o:', \
 94 | 			["help", "infile=","out="])
 95 | 		except getopt.GetoptError as err:
 96 | 			print(err)
 97 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 98 | 		#Default values for params
 99 | 		#Input params
100 | 		self.infile=None
101 | 		self.out="out.fas"
102 | 
103 | 
104 | 		#First pass to see if help menu was called
105 | 		for o, a in options:
106 | 			if o in ("-h", "-help", "--help"):
107 | 				self.display_help("Exiting because help menu was called.")
108 | 
109 | 		#Second pass to set all args.
110 | 		for opt, arg_raw in options:
111 | 			arg = arg_raw.replace(" ","")
112 | 			arg = arg.strip()
113 | 			opt = opt.replace("-","")
114 | 			#print(opt,arg)
115 | 			if opt == "h" or opt == "help":
116 | 				continue
117 | 			elif opt=="i" or opt=="in":
118 | 				self.infile=arg
119 | 			elif opt=="out" or opt=="o":
120 | 				self.out=arg
121 | 			else:
122 | 				assert False, "Unhandled option %r"%opt
123 | 
124 | 		#Check manditory options are set
125 | 		if not self.infile:
126 | 			self.display_help("No files provided.")
127 | 
128 | 
129 | 	def display_help(self, message=None):
130 | 		if message is not None:
131 | 			print()
132 | 			print (message)
133 | 		print ("Description: Annotate differences in a fasta alignment")
134 | 		print("""
135 | 		-i,--in		: Input fasta file
136 | 		-o,--out	: Output file name (default=out.fas)
137 | """)
138 | 		print()
139 | 		sys.exit()
140 | 
141 | #Call main function
142 | if __name__ == '__main__':
143 |     main()
144 | 


--------------------------------------------------------------------------------
/concatFasta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import os
  4 | import sys 
  5 | 
  6 | def main():
  7 | 	if len(sys.argv) <= 1:
  8 | 		print("No files provided!")
  9 | 		print('Usage: ./concatFasta.py *.fasta - or - ./concatFasta.py 1.fas 2.fas...')
 10 | 		sys.exit(1)
 11 | 	files = sys.argv[1:]
 12 | 
 13 | 	#print("concatenating fastas using the following order:")
 14 | 	#print("if this is incorrect, change something")
 15 | 
 16 | 	pre=None
 17 | 	samps=dict()
 18 | 	#loop through and get list of samples
 19 | 	for file in sorted(files):
 20 | 		for s in read_fasta(file):
 21 | 			samps[s[0]] = ""
 22 | 	
 23 | 	for file in sorted(files):
 24 | 		#print(file)
 25 | 		pre=file.split("_")[0]
 26 | 		#get seqlen
 27 | 		seqlen = None
 28 | 		#seen
 29 | 		seen = dict()
 30 | 		for s in read_fasta(file):
 31 | 			seen[s[0]] = 0
 32 | 			seqlen = len(s[1])
 33 | 			samps[s[0]] = samps[s[0]] + s[1]
 34 | 			
 35 | 		for key in samps.keys():
 36 | 			if key not in seen:
 37 | 				samps[key] = samps[key] + Nrepeats("N", seqlen)
 38 | 
 39 | 	print("Using prefix from files to write output:",pre)
 40 | 	oname = pre + ".fasta"
 41 | 	write_fasta(oname, samps)
 42 | 
 43 | def Nrepeats(pattern, N):
 44 | 	ret = ""
 45 | 	for i in range(int(N)):
 46 | 		ret = ret + str(pattern)
 47 | 	return(ret)
 48 | 
 49 | #write fasta from dict
 50 | def write_fasta(name, d):
 51 | 	with open(name, 'w') as fh:
 52 | 		try:
 53 | 			for sample in d.keys():
 54 | 				to_write = ">" + str(sample) + "\n" + d[sample] + "\n"
 55 | 				fh.write(to_write)
 56 | 		except IOError as e:
 57 | 			print("Could not read file:",e)
 58 | 			sys.exit(1)
 59 | 		except Exception as e:
 60 | 			print("Unexpected error:",e)
 61 | 			sys.exit(1)
 62 | 		finally:
 63 | 			fh.close()
 64 | 
 65 | #Read samples as FASTA. Generator function
 66 | def read_fasta(fas):
 67 | 	if os.path.exists(fas):
 68 | 		with open(fas, 'r') as fh:
 69 | 			try:
 70 | 				contig = ""
 71 | 				seq = ""
 72 | 				for line in fh:
 73 | 					line = line.strip()
 74 | 					if not line:
 75 | 						continue
 76 | 					#print(line)
 77 | 					if line[0] == ">": #Found a header line
 78 | 						#If we already loaded a contig, yield that contig and
 79 | 						#start loading a new one
 80 | 						if contig:
 81 | 							yield([contig,seq]) #yield
 82 | 							contig = "" #reset contig and seq
 83 | 							seq = ""
 84 | 						split_line = line.split()
 85 | 						contig = (split_line[0].replace(">",""))
 86 | 					else:
 87 | 						seq += line
 88 | 				#Iyield last sequence, if it has both a header and sequence
 89 | 				if contig and seq:
 90 | 					yield([contig,seq])
 91 | 			except IOError:
 92 | 				print("Could not read file ",fas)
 93 | 				sys.exit(1)
 94 | 			finally:
 95 | 				fh.close()
 96 | 	else:
 97 | 		raise FileNotFoundError("File %s not found!"%fas)
 98 | 		
 99 | 		
100 | 
101 | #Call main function
102 | if __name__ == '__main__':
103 |     main()


--------------------------------------------------------------------------------
/condenseAlleles.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | use strict; 
  4 | use warnings;
  5 | use Getopt::Long;
  6 | use File::Path; 
  7 | use File::Basename; 
  8 | 
  9 | our @input; 
 10 | 
 11 | #Call subroutine to parse command-line arguments
 12 | parseArgs(); 
 13 |  
 14 | 
 15 | 
 16 | #iterate through input files 
 17 | @input = glob "@input";
 18 | foreach my $file ( @input ){ 
 19 | 
 20 | my $name = "";
 21 | my %data;
 22 | 
 23 | #Open each file
 24 |     open ( FAS, "$file" ) || die "Derp: Cannot open $file!";
 25 |  
 26 |     while (<FAS>){ 
 27 | 
 28 | 	chomp $_; 
 29 | 	
 30 | 	if ($_ =~ /\>/ ){ 
 31 | 	    $_ =~ />(\S+)/;
 32 |  	    $name = "$1";
 33 | 	}else{ 
 34 | 	    push @{$data{$name}}, $_;
 35 | 	}
 36 |     }
 37 | 	consense( \%data, $file ); 
 38 |     
 39 | }
 40 | 
 41 | close FAS; 
 42 | 
 43 | ###########################################SUBROUTINES#########################################
 44 | 
 45 | sub parseArgs{
 46 | 
 47 | 	my $usage="\ncondenseAlleles.pl takes FASTA files with individual alleles encoded separately, and generates a consensus sequence for each individual
 48 | 
 49 | Usage: $0 --i /path/to/*.fasta
 50 | 
 51 | Mandatory Variables 
 52 | 	-i, --input	-   path to input files in FASTA format\n\n"; 
 53 | 
 54 | 	my $result = GetOptions 
 55 | 	( 
 56 | 	'input|s=s{1,}'	=> \@input,
 57 | 	); 
 58 | 
 59 | @input or die "\nDerp: Input not specified!\n\n$usage";
 60 | 
 61 | }
 62 | 
 63 | ############################################################################################
 64 | 
 65 | #This subroutine takes a hash of arrays, where the array stores the alleles for each sample (key in the hash), and spits out a consensus sequence for any heterozygote. 
 66 | #
 67 | #WARNING: This script may contain excessive warnings!
 68 | #
 69 | #WARNING: OVERWRITES THE ORIGINAL FILE!!
 70 | #
 71 | #ANOTHER WARNING: Not written to accomodate polyploids, or paralogous loci. I will be implementing this as a part of post-processing the output of Stacks, which should have removed any loci with individuals having more than 2 alleles (as these indicate presence of paralogs, which I don't want). 
 72 | #
 73 | #YET ANOTHER ANOTHER WARNING: This script also assumes that alleles are of the same length, and pre-aligned. 
 74 | 
 75 | sub consense{
 76 | 
 77 | my $datref = $_[0]; 
 78 | my $file = $_[1];
 79 | 
 80 | 
 81 | 
 82 | 
 83 | #Empty current file to start rewriting (all info stored in %info hash now) 
 84 | open ( OUT, ">$file" ) || die "Derp: Oh noes! I'm in the subroutine and cannot open $file!"; 
 85 | 
 86 | foreach my $key ( sort {$a <=> $b } ( keys %$datref ) ) { 
 87 |     
 88 |     print OUT "\>$key\n";  #Print FASTA header 
 89 | 
 90 |     if ( exists $$datref{$key}->[1] ) { #Check if sequence has multiple alleles
 91 | 	#Load sequences into arrays for comparison
 92 | 	my @allele1 = split //, $$datref{$key}->[0];
 93 | 	my @allele2 = split //, $$datref{$key}->[1]; 
 94 | 
 95 | 	for ( my $i=0; $i <= length $$datref{$key}->[0]; $i++ ){
 96 | 		#This is messy and sucks. Fix later with Bio::AlignIO and consensus_iupac function
 97 | 	    if ( uc ( $allele1[$i] ) eq uc ( $allele2[$i] ) ) { 
 98 | 		uc ( $allele1[$i] ) eq "A" and print OUT "A"; 
 99 | 		uc ( $allele1[$i] ) eq "G" and print OUT "G"; 
100 | 		uc ( $allele1[$i] ) eq "T" and print OUT "T"; 
101 | 		uc ( $allele1[$i] ) eq "C" and print OUT "C"; 
102 | 	    }else{ 
103 | 		if ( uc $allele1[$i] eq "A" ){	
104 | 		    uc $allele2[$i] eq "G" and print OUT "R"; 
105 | 		    uc $allele2[$i] eq "C" and print OUT "M"; 
106 | 		    uc $allele2[$i] eq "T" and print OUT "W"; 
107 | 		}elsif ( uc $allele1[$i] eq "G" ){ 
108 | 		    uc $allele2[$i] eq "A" and print OUT "R"; 
109 | 		    uc $allele2[$i] eq "C" and print OUT "S"; 
110 | 		    uc $allele2[$i] eq "T" and print OUT "K"; 
111 | 		}elsif ( uc $allele1[$i] eq "T" ){ 
112 | 		    uc $allele2[$i] eq "A" and print OUT "W";  
113 | 		    uc $allele2[$i] eq "C" and print OUT "Y";
114 | 		    uc $allele2[$i] eq "G" and print OUT "K";
115 | 		}elsif ( uc $allele1[$i] eq "C" ){
116 | 		    uc $allele2[$i] eq "A" and print OUT "M";  
117 | 		    uc $allele2[$i] eq "G" and print OUT "S"; 
118 | 		    uc $allele2[$i] eq "T" and print OUT "Y";
119 | 		}
120 | 	    }
121 | 	} 
122 | 	print OUT "\n"; 
123 |     }else{ 
124 | 	print OUT $$datref{$key}->[0],"\n";
125 |     }    
126 | }
127 | #close OUT;
128 | }
129 | 
130 | 


--------------------------------------------------------------------------------
/count_residues.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl 
 2 | 
 3 | #Author: Tyler K. Chafin
 4 | #Script counts a given residue in provided amino acid alignment
 5 | 
 6 | use strict;
 7 | use warnings;
 8 | 
 9 | if ($#ARGV != 1){
10 | 	die "\nUsage: $0 <fasta> <AA>\n\n";
11 | }
12 | 
13 | my $file = $ARGV[0];
14 | my $res = $ARGV[1];
15 | 
16 | open (FASTA, "$file") || die "Cannot open file $1: $!\n";
17 | my %outhash;
18 | my $temp;
19 | while (<FASTA>){
20 | 	chomp $_;
21 | 	if ($_ =~ "^\>"){
22 | 		$temp = $_;
23 | 		$outhash{$temp} = "";
24 | 		next;
25 | 	}else{
26 | 		$outhash{$temp} .= $_;
27 | 	} 
28 | 
29 | }
30 | close FASTA;
31 | 
32 | open (OUT, ">out.tsv") || die "Cannot open out.tsv: $!\n";
33 | for my $key (keys %outhash){
34 | 	$key =~ m/\>.*\|.*\|(.*?)\s+.*/;
35 | 	my $match = $1;
36 | 	my $count = () = $outhash{$key} =~ /$res/gi;
37 | 	print OUT $match, "\t", $count ,"\n";
38 | }
39 | close OUT;
40 | 
41 | exit;
42 | 


--------------------------------------------------------------------------------
/expandSeq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | from itertools import product
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	if params.fasta:
 13 | 		file_object = open("out.fasta", "w")
 14 | 		for seq in read_fasta(params.fasta):
 15 | 			count = 1
 16 | 			for i in expandAmbiquousDNA(seq[1]):
 17 | 				header = ">" + str(seq[0]) + "." + str(count)+ "\n"
 18 | 				sequence = str(i) + "\n"
 19 | 				file_object.write(header)
 20 | 				file_object.write(sequence)
 21 | 				count += 1
 22 | 		file_object.close()
 23 | 	elif params.seq:
 24 | 		for i in expandAmbiquousDNA(params.seq):
 25 | 			print(i)
 26 | 	else:
 27 | 		sys.exit("No input provided.")
 28 | 
 29 | 
 30 | #Object to parse command-line arguments
 31 | class parseArgs():
 32 | 	def __init__(self):
 33 | 		#Define options
 34 | 		try:
 35 | 			options, remainder = getopt.getopt(sys.argv[1:], 's:f:h', \
 36 | 			["seq=","fasta=","help"])
 37 | 		except getopt.GetoptError as err:
 38 | 			print(err)
 39 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 40 | 		#Default values for params
 41 | 		#Input params
 42 | 		self.seq=None
 43 | 		self.fasta=None
 44 | 
 45 | 		#First pass to see if help menu was called
 46 | 		for o, a in options:
 47 | 			if o in ("-h", "-help", "--help"):
 48 | 				self.display_help("Exiting because help menu was called.")
 49 | 
 50 | 		#Second pass to set all args.
 51 | 		for opt, arg_raw in options:
 52 | 			arg = arg_raw.replace(" ","")
 53 | 			arg = arg.strip()
 54 | 			opt = opt.replace("-","")
 55 | 			#print(opt,arg)
 56 | 			if opt in ('s', 'seq'):
 57 | 				self.seq = arg
 58 | 			elif opt in ('h', 'help'):
 59 | 				pass
 60 | 			elif opt in ('f','fasta'):
 61 | 				self.fasta = arg
 62 | 			else:
 63 | 				assert False, "Unhandled option %r"%opt
 64 | 
 65 | 		#Check manditory options are set
 66 | 		if self.seq and self.fasta:
 67 | 			sys.exit("Error: Input either -s, or -f. Not both.")
 68 | 		if not self.seq and not self.fasta:
 69 | 			sys.exit("Error: Input either -s, or -f.")
 70 | 
 71 | 
 72 | 	def display_help(self, message=None):
 73 | 		if message is not None:
 74 | 			print()
 75 | 			print (message)
 76 | 		print ("\nexpandSeq.py\n")
 77 | 		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
 78 | 		print ("\nUsage: ", sys.argv[0], "-s AGTGATAGTAGTGRRTGAYAGAGT \n")
 79 | 		print ("Description: expandSeq.py expands DNA sequences with ambiguities to a list of all possible variants.")
 80 | 
 81 | 		print("""
 82 | 	Input options:
 83 | 		-s,--seq	: Sequence string to expand (results output to stdout)
 84 | 			   or
 85 | 		-f,--fasta	: You can also specify a FASTA file. Results will be output as FASTA.
 86 | 		-h,--help	: Displays help menu""")
 87 | 		print()
 88 | 		sys.exit()
 89 | 
 90 | #Function to split character to IUPAC codes, assuing diploidy
 91 | def get_iupac_caseless(char):
 92 | 	lower = False
 93 | 	if char.islower():
 94 | 		lower = True
 95 | 		char = char.upper()
 96 | 	iupac = {
 97 | 		"A"	: ["A"],
 98 | 		"G"	: ["G"],
 99 | 		"C"	: ["C"],
100 | 		"T"	: ["T"],
101 | 		"N"	: ["A", "C", "G", "T"],
102 | 		"-"	: ["-"],
103 | 		"R"	: ["A","G"],
104 | 		"Y"	: ["C","T"],
105 | 		"S"	: ["G","C"],
106 | 		"W"	: ["A","T"],
107 | 		"K"	: ["G","T"],
108 | 		"M"	: ["A","C"],
109 | 		"B"	: ["C","G","T"],
110 | 		"D"	: ["A","G","T"],
111 | 		"H"	: ["A","C","T"],
112 | 		"V"	: ["A","C","G"]
113 | 	}
114 | 	ret = iupac[char]
115 | 	if lower:
116 | 		ret = [c.lower() for c in ret]
117 | 	return ret
118 | 
119 | #Read genome as FASTA. FASTA header will be used
120 | #This is a generator function
121 | #Doesn't matter if sequences are interleaved or not.
def read_fasta(fas):
	"""Generator yielding [header, sequence] for each record of a FASTA file.

	All spaces are removed from every line, so the header is the full ">"
	line without spaces and without ">" characters. Sequence lines are
	concatenated (multi-line records are fine), blank lines are skipped,
	and a final record without sequence is not yielded.

	Raises FileNotFoundError (on first iteration) if fas is not a file.
	"""
	if not fileCheck(fas):
		raise FileNotFoundError("Fatal exception, file %s not found."%fas)

	handle = open(fas)
	try:
		with handle as stream:
			name = ""
			bases = ""
			for raw in stream:
				text = raw.strip()
				if not text:
					continue
				text = text.replace(" ","")
				if text.startswith(">"):
					#A new header closes the record collected so far
					if name:
						yield([name,bases])
						bases = ""
					name = text.replace(">","")
				else:
					bases += text
		#Emit the last record, if it has both a header and sequence
		if name and bases:
			yield([name,bases])
	finally:
		handle.close()
152 | 
153 | #Function to check if a file path is valid
def fileCheck(f):
	"""Return True if f is the path of an existing regular file."""
	is_regular_file = os.path.isfile(f)
	return is_regular_file
156 | 
157 | #Function to expand ambiguous sequences
158 | #Generator function
def expandAmbiquousDNA(sequence):
	"""Generator yielding every unambiguous variant of an ambiguous sequence.

	Each character is expanded with get_iupac_caseless and the variants are
	produced as the Cartesian product of the per-position choices.
	"""
	choices_per_site = map(get_iupac_caseless, sequence)
	for combination in product(*choices_per_site):
		yield "".join(combination)
162 | 
163 | 
164 | #Call main function
165 | if __name__ == '__main__':
166 |     main()
167 | 


--------------------------------------------------------------------------------
/fast2distruct.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | use strict; 
  4 | use warnings;
  5 | use Getopt::Long;
  6 | 
  7 | my $struct;  
  8 | my $popmap ;
  9 | my $meanQ  ;
 10 | my $out = "./distruct";
 11 | my $inline = 0;
 12 | my $substrGuess = 0; 
 13 | my $help = 0; 
 14 | 
 15 | parseArgs(); 
 16 | 
 17 | my $popq = $out . ".popq"; 
 18 | my $indivq = $out . ".indivq"; 
 19 | 
 20 | #Capture individual order from structure file
 21 | open (my $str, $struct) || die "\nDerp: Couldn't open $struct!\n\n";
 22 | 
 23 | my @indOrder;
 24 | my @row; 
 25 | my $index = 0; 
 26 | my $skipLine = 0; 
 27 | while (<$str>){
 28 |   #If 2 line structure format, skip every other line
 29 |   if ($inline == 0){
 30 |     if ($skipLine == 1){
 31 |       $skipLine = 0;
 32 |       next; 
 33 |     }
 34 |   }
 35 |   chomp; 
 36 |   @row = split /\t/, $_;
 37 |   s/\s+//g; 
 38 |   next unless length; 
 39 |   $row[0] =~ s/\s+//g;
 40 |   $indOrder[$index] = $row[0]; 
 41 |   $index++;
 42 |   $skipLine = 1;  
 43 | }
 44 | close $str; 
 45 | 
 46 | #Capture population identifiers 
 47 | open (my $pops, $popmap) || die "\nDerp: Couldn't open $popmap!\n\n"; 
 48 | 
 49 | my %popHash;
 50 | my %substrHash; 
 51 | my %seen; #Hash lookup table 
 52 | my $popcount = 1; 
 53 | my $sub;
 54 | my $subCount = 1;  
 55 | while (<$pops>){
 56 |   chomp; 
 57 |   @row = split /\t/, $_; 
 58 |   $row[0] =~ s/\s+//g;
 59 |   $row[1] =~ s/\s+//g; 
 60 |   s/\s+//g; 
 61 |   next unless length; 
 62 |   $row[0] = uc($row[0]); 
 63 |   $row[0] =~ /(\d+[A-Za-z]+)/; 
 64 |   $sub = $1;  
 65 |   if ($seen{$row[1]}){ 
 66 |     $popHash{$row[0]} = $seen{$row[1]};
 67 |     if ($substrGuess == 0){
 68 |       if ($substrHash{$sub}){ 
 69 |         $substrHash{$sub} != $seen{$row[1]} and print "Warning:Found more than one population identifier for the same population substring " . $sub ."!\n"; 
 70 |       }else{
 71 |         #print "2Setting " . $sub ." to " . $seen{$row[1]} . "\n";
 72 |         $substrHash{$sub} = $seen{$row[1]}; 
 73 |       }
 74 |     }
 75 |   }else{ 
 76 |     $seen{$row[1]} = $popcount;
 77 |     $popHash{$row[0]} = $popcount; 
 78 |     $substrGuess == 0 and $substrHash{$sub} = $popcount; 
 79 |     #$substrGuess == 0 and print "1Setting " . $sub ." to " . $popcount . "\n";
 80 |     $popcount++; 
 81 |   }
 82 |     
 83 | } 
 84 | close $pops; 
 85 | 
 86 | #Parse meanQ file and write indivq file and make calculations for popq
 87 | open (my $results, $meanQ) || die "\nDerp: Couldn't open $meanQ!\n\n"; 
 88 | open (my $iq, ">$indivq") || die "\nDerp: Couldn't open $indivq!\n\n";
 89 | 
 90 | my @asn;
 91 | $index = 0;  
 92 | my $popID; 
 93 | my %popq; 
 94 | 
 95 | while (<$results>){ 
 96 |   chomp;
 97 |   @row = split /\s+/, $_;
 98 |   s/\s+//g; 
 99 |   next unless length; 
100 |   $asn[$index] = [@row];  
101 | 
102 |   if ($popHash{uc($indOrder[$index])}){ 
103 |     $popID = $popHash{uc($indOrder[$index])}; 
104 |   }elsif ($substrGuess == 0){
105 |     print "\nWarning: Individual ". $indOrder[$index]; 
106 |     print " not found in popmap! Trying to guess correct population identifier... \n"; 
107 |     $indOrder[$index] =~ /(\d+[A-Za-z]+)/; 
108 |     $sub = uc($1); 
109 |     if ($substrHash{$sub}){ 
110 |       $popID = $substrHash{$sub};
111 |       print "Assigning " . $sub ." to population " . $substrHash{$sub} . "\n";
112 |     }else{
113 |       print "Population substring " . $sub . " not found. Setting popID to ".$popcount .".\n";
114 |       $popID = $popcount;
115 |       $popHash{$indOrder[$index]} = $popcount; 
116 |       $substrHash{$sub} = $popcount; 
117 |       $popcount++;  
118 |     }
119 |   }elsif ($substrGuess == 1){ 
120 |     
121 |     print "\nWarning: Individual " . $indOrder[$index]; 
122 |     print " not found in popmap! Setting popID to ". $popcount . ".\n";
123 |     $popID = $popcount;
124 |     $popHash{$indOrder[$index]} = $popcount; 
125 |     $popcount++;  
126 |   }
127 |   print $iq "  " . $indOrder[$index] .  "      " . $index . "   (0)     "; 
128 |   print $iq $popID. "  : "; 
129 |   printf $iq "%.4f ", $_ for @row; 
130 |   print $iq "\n"; 
131 |   
132 |   #If popID already in popq table, then add to it
133 |   if ($popq{$popID}){ 
134 |     for (my $i =0; $i <= $#row; $i++){ 
135 |       $popq{$popID}[0][$i] += $row[$i];  
136 |     }
137 |     $popq{$popID}[1]++; 
138 |   }else{ 
139 |     $popq{$popID}[0] = [@row]; 
140 |     $popq{$popID}[1] = 1; 
141 |   }
142 | 
143 |   $index++; 
144 | }
145 | close $meanQ; 
146 | close $iq; 
147 | 
148 | #Process popq table and print popq file 
149 | open (my $pq, ">$popq") || die "Derp: Couldn't open $popq!\n";
150 | my $total; 
151 | foreach my $key ( sort {$a<=>$b} keys %popq){ 
152 |    print $pq $key . ":   ";
153 |    $total = $popq{$key}[1]; 
154 |    for (my $i=0; $i<@{$popq{$key}[0]}; $i++){ 
155 |       printf $pq("%.4f",($popq{$key}[0][$i]/$total));
156 |       print $pq " ";
157 |    }
158 |    print $pq $total . "\n"; 
159 | }
160 | 
161 | close $pq; 
162 | exit;
163 | 
164 | #########################################################################################
165 | 
sub parseArgs{

#Fills the file-level option variables from the command line. Dies with the
#help text when parsing fails, when -h is given, or when one of the
#mandatory options (-i, -s, -p) is missing.

my $message = 
"This script converts from the output of fastStructure to the input required for standard Distruct. It requires the structure file output by pyRAD (which was used for the analyses) and a population map in the style SMM required for Astral pipeline. It will use that pop map to determine a priori groupings, for building the popq files. I might add the ability to just pull these from the structure file later, but the pyRAD str doesn't have this so that's why I didn't do that yet. 

If you have problems running the script let me know. It hasn't really been tested fully, and I threw it together quickly. 

Options 

	-i	- Input fastStructure meanQ file 
	-s	- Structure file from pyRAD which was given to fastStructure 
	-p	- Population map, tab-delimted (e.g. 8HBC001 \t cypha or 8HBC001 \t 1) 
	-o	- Output prefix (e.g. ./k3) 
	-e	- Bool, toggle to turn off population estimation based on prefix 
		  e.g. program will guess 9WRW002 goes in same population as 9WRW001
		  if 9WRW002 is missing from pop map. I did this because my pop map was 
 		  missing samples, and I didn't want to go back and fine all of them to add in 

"; 

	#An unknown or malformed option now stops the run instead of being ignored
	GetOptions
	( 
	'i=s'	=> \$meanQ, 
	's=s'	=> \$struct, 
	'p=s'	=> \$popmap, 
	'o=s'	=> \$out, 
	'h!'	=> \$help,
	'e!'	=> \$substrGuess 
	) or die $message;

#Help is checked first so that -h never depends on the other options
$help == 1 and die $message; 
$meanQ or die $message; 
$struct or die $message; 
$popmap or die $message; 

}
201 | 
202 | 
203 |  
204 | 


--------------------------------------------------------------------------------
/fasta2gphocs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | 
  7 | def main():
  8 | 	params = parseArgs()
  9 | 	locnum=1
 10 | 	skipped=0
 11 | 	contents=""
 12 | 	print("Minimum allowable alignment length:",params.minlen)
 13 | 	for file in os.listdir(params.fasdir):
 14 | 		if file.endswith(".fas") or file.endswith(".fasta") or file.endswith(".fsa"):
 15 | 			aln=dict()
 16 | 			tax=0
 17 | 			aln_len=0
 18 | 			skip=False
 19 | 			for line in read_fasta(params.fasdir + "/" + file):
 20 | 				aln[line[0]] = line[1].replace("-", "N")
 21 | 				tax=tax+1
 22 | 				aln_len=len(line[1])
 23 | 				if aln_len < params.minlen:
 24 | 					skip=True
 25 | 					continue
 26 | 			if skip:
 27 | 				skipped=skipped+1
 28 | 				continue
 29 | 			contents=contents+"locus"+str(locnum)+" "+str(tax)+" "+str(aln_len)+"\n"
 30 | 			locnum=locnum+1
 31 | 			#print(locnum)
 32 | 			for samp in sorted(aln):
 33 | 				#print(samp)
 34 | 				contents=contents+str(samp)+" "+aln[samp]+"\n"
 35 | 			contents=contents+"\n"
 36 | 	#print(contents)
 37 | 
 38 | 	print("Skipped alignments smaller than minimum length:",skipped)
 39 | 	print("Total alignments passing filtering:",locnum)
 40 | 	
 41 | 	ofh=open(params.out, "w")
 42 | 	header=str(locnum)+"\n"
 43 | 	ofh.write(header)
 44 | 	ofh.write(contents)
 45 | 	ofh.close()
 46 | 
 47 | #Read genome as FASTA. FASTA header will be used
 48 | #This is a generator function
 49 | #Doesn't matter if sequences are interleaved or not.
 50 | def read_fasta(fas):
 51 | 	fh = open(fas)
 52 | 	try:
 53 | 		with fh as file_object:
 54 | 			contig = ""
 55 | 			seq = ""
 56 | 			for line in file_object:
 57 | 				line = line.strip()
 58 | 				if not line:
 59 | 					continue
 60 | 				line = line.replace(" ","")
 61 | 				#print(line)
 62 | 				if line[0] == ">": #Found a header line
 63 | 					#If we already loaded a contig, yield that contig and
 64 | 					#start loading a new one
 65 | 					if contig:
 66 | 						yield([contig,seq]) #yield
 67 | 						contig = "" #reset contig and seq
 68 | 						seq = ""
 69 | 					contig = (line.replace(">",""))
 70 | 				else:
 71 | 					seq += line
 72 | 		#Iyield last sequence, if it has both a header and sequence
 73 | 		if contig and seq:
 74 | 			yield([contig,seq])
 75 | 	finally:
 76 | 		fh.close()
 77 | 
 78 | 
 79 | #Object to parse command-line arguments
 80 | class parseArgs():
 81 | 	def __init__(self):
 82 | 		#Define options
 83 | 		try:
 84 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hf:o:m:', \
 85 | 			["help"])
 86 | 		except getopt.GetoptError as err:
 87 | 			print(err)
 88 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 89 | 		#Default values for params
 90 | 		#Input params
 91 | 		self.fasdir=None
 92 | 		self.out="gphocs_input.txt"
 93 | 		self.minlen=500
 94 | 
 95 | 		#First pass to see if help menu was called
 96 | 		for o, a in options:
 97 | 			if o in ("-h", "-help", "--help"):
 98 | 				self.display_help("Exiting because help menu was called.")
 99 | 
100 | 		#Second pass to set all args.
101 | 		for opt, arg_raw in options:
102 | 			arg = arg_raw.replace(" ","")
103 | 			arg = arg.strip()
104 | 			opt = opt.replace("-","")
105 | 			#print(opt,arg)
106 | 			if opt == "h" or opt == "help":
107 | 				continue
108 | 			elif opt == "f":
109 | 				self.fasdir=arg
110 | 			elif opt=="o":
111 | 				self.out=arg
112 | 			elif opt=="m":
113 | 				self.minlen=int(arg)
114 | 			else:
115 | 				assert False, "Unhandled option %r"%opt
116 | 
117 | 		#Check manditory options are set
118 | 		if not self.fasdir:
119 | 			self.display_help("No files provided.")
120 | 
121 | 
122 | 
123 | 	def display_help(self, message=None):
124 | 		if message is not None:
125 | 			print()
126 | 			print (message)
127 | 		print ("\nfasta2gphocs.py\n")
128 | 		print("Author: Tyler K Chafin, University of Arkansas")
129 | 		print ("Contact: tkchafin@uark.edu")
130 | 		print ("Description: Converts a set of separate FASTA-formatted gene alignments to g-phocs sequence file format")
131 | 		print("""
132 | 	Arguments:
133 | 	-f	: Directory containing FASTA files
134 | 	-o	: Output file name
135 | 	-m	: Minimum alignment length (default=500)
136 | """)
137 | 		print()
138 | 		sys.exit()
139 | 
140 | #Call main function
141 | if __name__ == '__main__':
142 |     main()
143 | 


--------------------------------------------------------------------------------
/fasta2length.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # gives the non-gap-character-length of each sequence
 4 | 
 5 | use warnings;
 6 | use strict;
 7 | 
 8 | die "usage: $0 fastafile\n" unless $#ARGV == 0;
 9 | 
10 | open A, shift;
11 | 
12 | my ($id, @ids, %seq, $total);
13 | while (<A>) {
14 |   chomp;
15 |   if (/^>(.*)/) {
16 |     $id = $1;
17 |     push @ids, $id;
18 |   } else {
19 |     $seq{$id} .= $_;
20 |   }
21 | }
22 | 
23 | my (%group, $group, $ar, $seq, $len);
24 | foreach $id (@ids) {
25 |   $seq = $seq{$id};
26 |   $seq =~ s/[\s*-]+//g;
27 |   $len = length $seq;
28 |   $total += $len;
29 |   print "$id $len\n";
30 | #  print "\t$len\n";
31 | #  print ">$id ; length $len\n$seq{$id}\n";
32 | }
33 | 
34 | print "$total total sequence length\n";
35 | 


--------------------------------------------------------------------------------
/fasta2nexus.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Getopt::Long;
  6 | use File::Path;
  7 | use File::Basename;
  8 | 
  9 | # Declare variables
 10 | 
 11 | my @input;
 12 | #our $infiletype=1;
 13 | 
 14 | parseArgs();
 15 | 
 16 | my ( $filepath, $dirpath ) = fileparse($input[0]);
 17 | 
 18 | #Iterate through files
 19 | 
 20 | @input = glob "@input";
 21 | 
 22 | foreach my $file ( @input ){
 23 | 
 24 | #Initialize variables within each daughter process
 25 |     my %data;
 26 |     my $taxa = 0;
 27 |     my @fasta;
 28 |     my @loci;
 29 |     my $nchar=0;
 30 |     my $line;
 31 | 		my $name = "";
 32 | 		my $seq = "";
 33 | 
 34 |     open ( FILE, "$file" ) || die "Error\nCan't open $file: $!\n";
 35 | 
 36 | 
 37 | 	while ( <FILE> ){
 38 | 			chomp;
 39 | 			if( $_ =~ /^\>/ ){
 40 | 				$taxa++;
 41 | 				if ($name =~ ""){
 42 | 					$_ =~ /^\>(\S+)/;
 43 | 					$name = "$1";
 44 | 				}else{
 45 | 					$data{$name} = $seq;
 46 | 					$seq = "";
 47 | 					$nchar = length($seq);
 48 | 					$_ =~ /^\>(\S+)/;
 49 | 					$name = "$1";
 50 | 				}
 51 | 			}elsif( $_ =~ /^\s*$/ ){
 52 | 				next;
 53 | 			}elsif( $_ =~ /^\s*#/ ){
 54 | 				next;
 55 | 			}else{
 56 | 				$seq .= $_; #append sequence to line; accounts for multi line fasta
 57 | 			}
 58 |     }
 59 |     close FILE;
 60 | 
 61 |     #Capture taxa name to use as identifier
 62 | 	my $filepath = fileparse("$file");
 63 | 	$filepath =~ /(\w+)\.\w/;
 64 | 	my $ID = $1;
 65 | 
 66 |     open( OUT, '>', "$dirpath$ID.nex" ) || die "Error\nCan't write to $ID.nex\n";
 67 |          print OUT "#NEXUS\n\n";
 68 |          print OUT "BEGIN DATA;
 69 | DIMENSIONS NTAX=$taxa NCHAR=$nchar;
 70 | FORMAT DATATYPE=DNA MISSING=? GAP=- ;
 71 | 
 72 | MATRIX\n";
 73 | 
 74 | 	foreach my $key (keys %data){
 75 | 		print OUT "$key\t$data{$key}\n";
 76 | 	}
 77 |     print OUT ";\n";
 78 | 
 79 |     print OUT "END;\n\n";
 80 | 
 81 |     close OUT;
 82 | }
 83 | 
 84 | 
 85 | exit;
 86 | ###########################SUBROUTINES###################################
 87 | 
 88 | sub parseArgs{
 89 | 	#Message to print if mandatory variables not declared
 90 | 	my $usage ="\nUsage: $0 --i /path/to/input/directory/*.fasta
 91 | Mandatory
 92 | 	-i, --input	-  path to the input files in fasta format
 93 | \n";
 94 | 
 95 | 	my $options = GetOptions
 96 | 		(
 97 | 		'input|i=s{1,}'		=>	\@input,
 98 | 		);
 99 | 
100 | 	@input or die "\n\nError: Input not specified!\n\n$usage\n";
101 | }
102 | 
103 | #########################################################################
104 | 


--------------------------------------------------------------------------------
/fasta2phylip.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | import random
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	seqs = dict() #key=FASTA header; val=sequence
 13 | 
 14 | 	#read sequence in
 15 | 	if params.fasta:
 16 | 		print('Reading alignment from FASTA...')
 17 | 		for f in read_fasta(params.fasta):
 18 | 			seqs[f[0]] = f[1]
 19 | 
 20 | 		print("Writing new PHYLIP file",params.out)
 21 | 		write_phylip(params.out, seqs)
 22 | 	elif params.phylip:
 23 | 		print('Reading alignment from PHYLIP...')
 24 | 		for f in read_phylip(params.phylip):
 25 | 			seqs[f[0]] = f[1]
 26 | 
 27 | 		print("Writing new FASTA file",params.out)
 28 | 		write_fasta(params.out, seqs)
 29 | 
 30 | 
 31 | 
 32 | #Print dict to phylip file
 33 | def write_phylip(p, aln):
 34 | 	with open(p, 'w') as fh:
 35 | 		try:
 36 | 			header = getPhylipHeader(aln) + "\n"
 37 | 			fh.write(header)
 38 | 
 39 | 			for sample in aln.keys():
 40 | 				line = str(sample) + "\t" + "".join(aln[sample]) + "\n"
 41 | 				fh.write(line)
 42 | 		except IOError as e:
 43 | 			print("Could not read file %s: %s"%(p,e))
 44 | 			sys.exit(1)
 45 | 		except Exception as e:
 46 | 			print("Unexpected error reading file %s: %s"%(p,e))
 47 | 			sys.exit(1)
 48 | 		finally:
 49 | 			fh.close()
 50 | 
 51 | #Function to write fasta-formatted sequences
 52 | def write_fasta(f, aln):
 53 | 	with open(f, 'w') as fh:
 54 | 		try:
 55 | 			for samp in aln.keys():
 56 | 				ol = ">" + str(samp) + "\n" + str(aln[samp]) + "\n"
 57 | 				fh.write(ol)
 58 | 		except IOError as e:
 59 | 			print("Could not read file %s: %s"%(f,e))
 60 | 			sys.exit(1)
 61 | 		except Exception as e:
 62 | 			print("Unexpected error reading file %s: %s"%(f,e))
 63 | 			sys.exit(1)
 64 | 		finally:
 65 | 			fh.close()
 66 | 
 67 | #Returns header for Phylip file from a dictionary of samples w/ data
 68 | def getPhylipHeader(d):
 69 | 	numSamp = 0
 70 | 	numLoci = None
 71 | 	for sample in d:
 72 | 		numSamp = numSamp + 1
 73 | 		if not numLoci:
 74 | 			numLoci = len(d[sample])
 75 | 		else:
 76 | 			if numLoci != len(d[sample]):
 77 | 				print("getPhylipHeader: Warning: Sequences of unequal length.")
 78 | 	header = str(numSamp) + " " + str(numLoci)
 79 | 	if numLoci == 0 or not numLoci:
 80 | 		print("getPhylipHeader: Warning: No loci in dictionary.")
 81 | 	if numSamp == 0:
 82 | 		print("getPhylipHeader: Warning: No samples in dictionary.")
 83 | 	return(header)
 84 | 
 85 | #Read samples as FASTA. Generator function
 86 | def read_fasta(fas):
 87 | 
 88 | 	if os.path.exists(fas):
 89 | 		with open(fas, 'r') as fh:
 90 | 			try:
 91 | 				contig = ""
 92 | 				seq = ""
 93 | 				for line in fh:
 94 | 					line = line.strip()
 95 | 					if not line:
 96 | 						continue
 97 | 					#print(line)
 98 | 					if line[0] == ">": #Found a header line
 99 | 						#If we already loaded a contig, yield that contig and
100 | 						#start loading a new one
101 | 						if contig:
102 | 							yield([contig,seq]) #yield
103 | 							contig = "" #reset contig and seq
104 | 							seq = ""
105 | 						split_line = line.split()
106 | 						contig = (split_line[0].replace(">",""))
107 | 					else:
108 | 						seq += line
109 | 				#Iyield last sequence, if it has both a header and sequence
110 | 				if contig and seq:
111 | 					yield([contig,seq])
112 | 			except IOError:
113 | 				print("Could not read file ",fas)
114 | 				sys.exit(1)
115 | 			finally:
116 | 				fh.close()
117 | 	else:
118 | 		raise FileNotFoundError("File %s not found!"%fas)
119 | 
120 | #Read samples as PHYLIP. Generator function
def read_phylip(phy):
	"""Generator yielding (name, sequence) for each row of a PHYLIP file.

	phy -- path to the PHYLIP file

	The first non-blank line (the header) is skipped. Every later non-blank
	line is split on whitespace: the first field is the sample name and all
	remaining fields are joined into the sequence, so sequences written in
	space-separated blocks are read completely.

	Raises FileNotFoundError if the path does not exist and ValueError for a
	row that has a name but no sequence; prints a message and exits with
	status 1 if the file cannot be read.
	"""
	if not os.path.exists(phy):
		raise FileNotFoundError("File %s not found!"%phy)

	try:
		with open(phy, 'r') as fh:
			seen_header = False
			for line in fh:
				line = line.strip()
				if not line:
					continue
				if not seen_header:
					#first non-blank line holds the dimensions, not data
					seen_header = True
					continue
				arr = line.split()
				if len(arr) < 2:
					raise ValueError("PHYLIP row without sequence data in %s: %r"%(phy,line))
				yield(arr[0], "".join(arr[1:]))
	except IOError:
		print("Could not read file ",phy)
		sys.exit(1)
142 | 
143 | #Object to parse command-line arguments
#Object to parse command-line arguments
class parseArgs():
	"""Command-line options of fasta2phylip.py, read from sys.argv.

	Attributes:
		fasta  -- input FASTA path (-f/--fasta), or None
		phylip -- input PHYLIP path (-p/--phy), or None
		out    -- output path: the input path with its extension replaced by
		          .phylip (FASTA input) or .fasta (PHYLIP input)

	Exactly one of fasta/phylip must be given; otherwise the help menu is
	printed and the program exits.
	"""
	def __init__(self):
		#Define options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'f:p:h', \
			["help", "fasta=", "phy="])
		except getopt.GetoptError as err:
			print(err)
			#display_help() exits, so options is always defined below
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.fasta=None
		self.phylip=None
		self.out=None

		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			#Only trim surrounding whitespace: spaces inside a path are part of it
			arg = arg_raw.strip()
			opt = opt.replace("-","")
			if opt =="f" or opt=="fasta":
				self.fasta = arg
			elif opt =="p" or opt=="phy":
				self.phylip = arg
			elif opt =="h" or opt == "help":
				pass
			else:
				raise AssertionError("Unhandled option %r"%opt)

		#Check mandatory options are set
		if not self.fasta and not self.phylip:
			self.display_help("Must provide either a FASTA or PHYLIP file.")

		if self.fasta and self.phylip:
			self.display_help("Provide only one input: either a FASTA or a PHYLIP file, not both.")

		#derive the output name from the input name
		if self.fasta:
			self.out = os.path.splitext(self.fasta)[0] + '.phylip'
		elif self.phylip:
			self.out = os.path.splitext(self.phylip)[0] + '.fasta'

	def display_help(self, message=None):
		"""Print the optional message and the help menu, then exit."""
		if message is not None:
			print()
			print (message)
		print ("\nfasta2phylip.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		print ("\nUsage: ", sys.argv[0], "[-f <.fasta>] [-p <.phy>]\n")
		print ("Description: Simple script to convert between FASTA and PHYLIP formats")

		print("""
	Arguments:
		-f,--fasta	: Input FASTA to be converted to PHYLIP
		-p,--phy	: Input PHYLIP to be converted to FASTA
		-h,--help	: Displays help menu
""")
		print()
		sys.exit()
209 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
213 | 


--------------------------------------------------------------------------------
/fastaFormatter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | from textwrap import wrap
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 	
 11 | 	if params.many2one:
 12 | 		seqs=dict()
 13 | 		for f in read_fasta(params.many2one):
 14 | 			seqs[f[0]] = f[1]
 15 | 		write_fasta(params.out, seqs)
 16 | 	elif params.one2many:
 17 | 		seqs=dict()
 18 | 		for f in read_fasta(params.one2many):
 19 | 			seqs[f[0]] = f[1]
 20 | 		write_fasta(params.out, seqs, params.width)
 21 | 
 22 | #Function to write fasta-formatted sequences
 23 | def write_fasta(f, aln, width=None):
 24 | 	with open(f, 'w') as fh:
 25 | 		try:
 26 | 			for samp in aln.keys():
 27 | 				if width:
 28 | 					ol = ">" + str(samp) + "\n"
 29 | 					chunks=wrap(aln[samp], width=width, break_on_hyphens=False, drop_whitespace=False)
 30 | 					for chunk in chunks:
 31 | 						ol=ol + str(chunk) + "\n"
 32 | 				else:
 33 | 					ol = ">" + str(samp) + "\n" + str(aln[samp]) + "\n"
 34 | 				fh.write(ol)
 35 | 		except IOError as e:
 36 | 			print("Could not read file %s: %s"%(f,e))
 37 | 			sys.exit(1)
 38 | 		except Exception as e:
 39 | 			print("Unexpected error reading file %s: %s"%(f,e))
 40 | 			sys.exit(1)
 41 | 		finally:
 42 | 			fh.close()
 43 | 
 44 | #Read samples as FASTA. Generator function
 45 | def read_fasta(fas):
 46 | 	if os.path.exists(fas):
 47 | 		with open(fas, 'r') as fh:
 48 | 			try:
 49 | 				contig = ""
 50 | 				seq = ""
 51 | 				for line in fh:
 52 | 					line = line.strip()
 53 | 					if not line:
 54 | 						continue
 55 | 					#print(line)
 56 | 					if line[0] == ">": #Found a header line
 57 | 						#If we already loaded a contig, yield that contig and
 58 | 						#start loading a new one
 59 | 						if contig:
 60 | 							yield([contig,seq]) #yield
 61 | 							contig = "" #reset contig and seq
 62 | 							seq = ""
 63 | 						split_line = line.split()
 64 | 						contig = (split_line[0].replace(">",""))
 65 | 					else:
 66 | 						seq += line
 67 | 				#Iyield last sequence, if it has both a header and sequence
 68 | 				if contig and seq:
 69 | 					yield([contig,seq])
 70 | 			except IOError:
 71 | 				print("Could not read file ",fas)
 72 | 				sys.exit(1)
 73 | 			finally:
 74 | 				fh.close()
 75 | 	else:
 76 | 		raise FileNotFoundError("File %s not found!"%fas)
 77 | 
 78 | #Object to parse command-line arguments
 79 | class parseArgs():
 80 | 	def __init__(self):
 81 | 		#Define options
 82 | 		try:
 83 | 			options, remainder = getopt.getopt(sys.argv[1:], 'h1:M:w:o:', \
 84 | 			["help", "one2many=","many2one=","width=","out="])
 85 | 		except getopt.GetoptError as err:
 86 | 			print(err)
 87 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 88 | 		#Default values for params
 89 | 		#Input params
 90 | 		self.one2many=None
 91 | 		self.many2one=None
 92 | 		self.width=60
 93 | 		self.out="out.fas"
 94 | 
 95 | 
 96 | 		#First pass to see if help menu was called
 97 | 		for o, a in options:
 98 | 			if o in ("-h", "-help", "--help"):
 99 | 				self.display_help("Exiting because help menu was called.")
100 | 
101 | 		#Second pass to set all args.
102 | 		for opt, arg_raw in options:
103 | 			arg = arg_raw.replace(" ","")
104 | 			arg = arg.strip()
105 | 			opt = opt.replace("-","")
106 | 			#print(opt,arg)
107 | 			if opt == "h" or opt == "help":
108 | 				continue
109 | 			elif opt=="one2many" or opt=="1":
110 | 				self.one2many=arg
111 | 			elif opt=="many2one" or opt=="M":
112 | 				self.many2one=arg
113 | 			elif opt=="width" or opt=="w":
114 | 				self.width=int(arg)
115 | 			elif opt=="out" or opt=="o":
116 | 				self.out=arg
117 | 			else:
118 | 				assert False, "Unhandled option %r"%opt
119 | 
120 | 		#Check manditory options are set
121 | 		if not self.one2many and not self.many2one:
122 | 			self.display_help("No files provided.")
123 | 
124 | 
125 | 
126 | 	def display_help(self, message=None):
127 | 		if message is not None:
128 | 			print()
129 | 			print (message)
130 | 		print ("\nfastaFormatter.py\n")
131 | 		print("Author: Tyler K Chafin, University of Arkansas")
132 | 		print ("Contact: tkchafin@uark.edu")
133 | 		print ("Description:Right now just converts b/n multi-line and one-line fasta formats, might add later")
134 | 		print("""
135 | 		-1,--one2many	: Path to fasta file to multi-line format
136 | 		-M,--many2one	: Path to fasta file to convert to one-line format
137 | 		-w,--width	: Characters per line for multi-line (default: 60)
138 | 		-o,--out	: Output file name (default=out.fas)
139 | """)
140 | 		print()
141 | 		sys.exit()
142 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
146 | 


--------------------------------------------------------------------------------
/fill_quartets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import os
 4 | import sys
 5 | import itertools
 6 | import collections
 7 | 
 8 | if len(sys.argv) < 3:
 9 | 	print("Usage: fill_quartets.py <CF_file> <taxon list>")
10 | 
11 | CF=sys.argv[1]
12 | all=sys.argv[2]
13 | 
14 | spoof=True #hard coded option
15 | 
16 | #list of lists, capturing sampled quartets
17 | sampled = list()
18 | 
19 | with open(CF, 'r') as fh:
20 | 	try:
21 | 		seen = list()
22 | 		for line in fh:
23 | 			if not line:
24 | 				continue
25 | 			else:
26 | 				stuff = line.split(",")
27 | 				seen = sorted(stuff[0:4])
28 | 				sampled.append(seen)
29 | 	except IOError:
30 | 		print("Could not read file ",CF)
31 | 		sys.exit(1)
32 | 	finally:
33 | 		fh.close()
34 | 
35 | all_quartets=list()
36 | all_tax = list()
37 | with open(all, 'r') as fh:
38 | 	try:
39 | 		all = list()
40 | 		for line in fh:
41 | 			line=line.strip()
42 | 			if not line:
43 | 				continue
44 | 			else:
45 | 				all_tax.append(line)
46 | 	except IOError:
47 | 		print("Could not read file ",CF)
48 | 		sys.exit(1)
49 | 	finally:
50 | 		fh.close()
51 | 
52 | all_comb = list(itertools.combinations(all_tax,4))
53 | for comb in all_comb:
54 | 	all_quartets.append(sorted(list(comb)))
55 | 
56 | #print("Writing all missing quartets to stdout...")
57 | 
58 | for quartet in all_quartets:
59 | 	miss=True
60 | 	for sample in sampled:
61 | 		if set(quartet) == set(sample):
62 | 			miss=False
63 | 	if miss==True:
64 | 		if spoof:
65 | 			oline = "";
66 | 			for tax in quartet:
67 | 				oline = oline + str(tax) + ","
68 | 			oline = oline + "0.333333333333334,0.333333333333333,0.333333333333333"
69 | 			print(oline)
70 | 		else:
71 | 			print(quartet)
72 | 


--------------------------------------------------------------------------------
/filterFastaMedianLength.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | #Tyler K. Chafin 
 4 | #July 23 2021
#Filters a FASTA file, keeping only the sequences at least as long as the median sequence length (output: <fasta file>.filter)
 6 | #Email: tylerkchafin@gmail.com with issues
 7 | 
 8 | if [ $1 ]; 
 9 | then 
10 |   fasta="$1"
11 | else 
12 |   printf "\nUsage: $0 <fasta file>\n\n"
13 |   exit 1
14 | fi
15 | 
16 | #calculate median sequence length
17 | median=`grep -v ">" $fasta | awk 'BEGIN{FS=""}{print NF}' | sort -n | awk '{a[NR]=$0}END{print(NR%2==1)?a[int(NR/2)+1]:(a[NR/2]+a[NR/2+1])/2}'`
18 | 
19 | #select out sequences equal to or above median length 
20 | grep -B1 "^[A-Za-z]\{$median,\}" $fasta | sed "/^--$/d" > $fasta".filter"
21 | 


--------------------------------------------------------------------------------
/filter_loci.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Getopt::Long; 
 6 | use File::Path; 
 7 | use File::Copy; 
 8 | use File::Basename;
 9 | 
my @DIR;          #input file paths / glob patterns from -i
my $cutoff;       #minimum number of samples a locus needs to be kept
my $blacklist=0;  #true: move dropped loci to a directory instead of deleting them

parseArgs();



#Use File::Basename to capture some info
my ($filepath, $dirpath) = fileparse( $DIR[0] );

#If user toggled "blacklist loci" on, start from an empty blacklist directory
#next to the first input file
my $bname = "blacklist";
if ( $blacklist ){
	rmtree "$dirpath/$bname";
	mkdir "$dirpath/$bname" or die "Can't create directory $dirpath/$bname: $!\n";
}


#Iterate through files

@DIR = glob "@DIR";
foreach my $file (@DIR){

	#Number of samples in this locus = number of FASTA header lines
	my $N = 0;

	open ( my $fh, '<', $file ) or die "Can't open $file: $!\n";
	while ( my $line = <$fh> ){
		$line =~ /^\s*>/ and $N++;
	}
	#Close before the file is moved or deleted
	close $fh;

	next unless $N < $cutoff;

	if ( $blacklist ){
		move($file, "$dirpath/$bname/") or warn "Can't move $file to $dirpath/$bname/: $!\n";
	}else{
		unlink $file or warn "Can't delete $file: $!\n";
	}
}
56 | 
57 | 
58 | ##############################################SUBROUTINES###########################################
59 | 
sub parseArgs{

	#Fills the file-level @DIR, $cutoff and $blacklist from the command line.
	#Dies with the usage message when the command line cannot be parsed or a
	#mandatory option is missing.
	my $usage="\nfilter_loci.pl takes a directory full of fasta files, each representing a single locus, and deletes any loci with insufficient coverage across samples, using a user-specified cut off value.

Usage: $0 --i=/path/to/*.fasta --x=# [--b]

Mandatory Variables
	-i, --input		-   Path to fasta files
	-x, --cutoff		-   Integer indicating the minimum number of samples to retain a locus
Options
	-b, --blacklist		-   Move dropped loci to a directory named blacklist instead of deleting them
";

	#GetOptions returns false when it meets an unknown or malformed option
	GetOptions
	(
	'input|i=s{1,}'	=> \@DIR,
	'cutoff|x=i'	=> \$cutoff,
	'blacklist|b!'	=> \$blacklist,
	) or die "\nDerp: Could not parse command line!\n\n$usage";

@DIR or die "\nDerp: Input directory not defined!\n\n$usage";
#defined, not truth: a cutoff of 0 is a value, not a missing option
defined $cutoff or die "\nDerp: Minimum coverage required to retain a locus not defined!\n\n$usage";

}
84 | 


--------------------------------------------------------------------------------
/findBreaksVCF.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | import vcf
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	vfh = vcf.Reader(open(params.vcf, 'r'))
 13 | 
 14 | 	#grab contig sizes
 15 | 	contigs = dict()
 16 | 	for c,s in vfh.contigs.items():
 17 | 		contigs[s.id] = s.length
 18 | 
 19 | 	regions = list()
 20 | 
 21 | 	this_chrom = None
 22 | 	start = int()
 23 | 	stop = int()
 24 | 	count = 0
 25 | 	for rec in vfh:
 26 | 		if not this_chrom:
 27 | 			this_chrom = rec.CHROM
 28 | 			start = 1
 29 | 			stop = 1
 30 | 			count = 0
 31 | 		#If we entered new chromosome, submit old break
 32 | 		elif this_chrom != rec.CHROM:
 33 | 			t = tuple([this_chrom, start, contigs[this_chrom]])
 34 | 			regions.append(t)
 35 | 			this_chrom = rec.CHROM
 36 | 			start = 1
 37 | 			stop = 1
 38 | 			count = 0
 39 | 
 40 | 		#if this SNP is parsimony-informative
 41 | 		if rec.is_snp and not rec.is_monomorphic:
 42 | 			#Check if parsimony-informative
 43 | 			if is_PIS(rec):
 44 | 				count+=1
 45 | 				#if this is the final PIS, submit region to list
 46 | 				if count == params.force:
 47 | 					stop = rec.POS
 48 | 					t = tuple([this_chrom, start, stop])
 49 | 					regions.append(t)
 50 | 					start = stop + 1
 51 | 					count = 0
 52 | 
 53 | 	t = tuple([this_chrom, start, contigs[this_chrom]])
 54 | 	regions.append(t)
 55 | 
 56 | 	print("Writing regions to out.regions...")
 57 | 	write_regions("out.regions", regions)
 58 | 
 59 | #Function to write list of regions tuples, in GATK format
 60 | def write_regions(f, r):
 61 | 
 62 | 	with open(f, 'w') as fh:
 63 | 		try:
 64 | 			for reg in r:
 65 | 				ol = str(reg[0]) + ":" + str(reg[1]) + "-" + str(reg[2]) + "\n"
 66 | 				fh.write(ol)
 67 | 		except IOError as e:
 68 | 			print("Could not read file %s: %s"%(f,e))
 69 | 			sys.exit(1)
 70 | 		except Exception as e:
 71 | 			print("Unexpected error reading file %s: %s"%(f,e))
 72 | 			sys.exit(1)
 73 | 		finally:
 74 | 			fh.close()
 75 | 
 76 | #Function to check pyVCF record for if parsimony informative or not
 77 | def is_PIS(r):
 78 | 	ref=0
 79 | 	alt=0
 80 | 	for call in r.samples:
 81 | 		if call.gt_type:
 82 | 			if call.gt_type == 0:
 83 | 				ref += 1
 84 | 			elif call.gt_type == 1:
 85 | 				alt += 1
 86 | 			elif call.gt_type == 2:
 87 | 				alt += 1
 88 | 				ref += 1
 89 | 		if ref >= 2 and alt >= 2:
 90 | 			return(True)
 91 | 	if ref <= 2 and alt <= 2:
 92 | 		return(False)
 93 | 
 94 | #Object to parse command-line arguments
 95 | class parseArgs():
 96 | 	def __init__(self):
 97 | 		#Define options
 98 | 		try:
 99 | 			options, remainder = getopt.getopt(sys.argv[1:], 'v:f:h', \
100 | 			["vcf=" "help", "force="])
101 | 		except getopt.GetoptError as err:
102 | 			print(err)
103 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
104 | 		#Default values for params
105 | 		#Input params
106 | 		self.vcf=None
107 | 		self.force=100000
108 | 
109 | 		#First pass to see if help menu was called
110 | 		for o, a in options:
111 | 			if o in ("-h", "-help", "--help"):
112 | 				self.display_help("Exiting because help menu was called.")
113 | 
114 | 		#Second pass to set all args.
115 | 		for opt, arg_raw in options:
116 | 			arg = arg_raw.replace(" ","")
117 | 			arg = arg.strip()
118 | 			opt = opt.replace("-","")
119 | 			#print(opt,arg)
120 | 			if opt in ('v', 'vcf'):
121 | 				self.vcf = arg
122 | 			elif opt in ('f','force'):
123 | 				self.force=int(arg)
124 | 			elif opt in ('h', 'help'):
125 | 				pass
126 | 			else:
127 | 				assert False, "Unhandled option %r"%opt
128 | 
129 | 		#Check manditory options are set
130 | 		if not self.vcf:
131 | 			self.display_help("Must provide VCF file <-v,--vcf>")
132 | 
133 | 	def display_help(self, message=None):
134 | 		if message is not None:
135 | 			print()
136 | 			print (message)
137 | 		print ("\nfindBreaksVCF.py\n")
138 | 		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
139 | 		print ("\nUsage: ", sys.argv[0], "-v <input.vcf> -f <100000>\n")
140 | 		print ("Description: Breaks chromosomes into chunks of X parsimony-informative sites, for running MDL")
141 | 
142 | 		print("""
143 | 	Arguments:
144 | 		-v,--vcf	: VCF file for parsing
145 | 		-f,--force	: Number of PIS to force a break
146 | 		-h,--help	: Displays help menu
147 | 
148 | """)
149 | 		print()
150 | 		sys.exit()
151 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
155 | 


--------------------------------------------------------------------------------
/genesFromGFF.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | 
  4 | use strict; 
  5 | use warnings; 
  6 | use Getopt::Long; 
  7 | 
  8 | 
#Paths of the GFF annotation file and the genome FASTA file; both are filled in
#by parseArgs() from --gff and --genome
our $gff="";
our $genome="";

parseArgs(); #Call subroutine to parse arguments...

#File-scoped working variables used by seqsFromGFF(): it resets @line and $dna
#on every call and assigns @info for each matching GFF row.
#$element is not used in the code shown here.
my @line;
my $dna;
my $element;
my @info;





#Call subroutine once per element type: it prints the sequence of every element of that type found in the gff

seqsFromGFF( "CDS" );
seqsFromGFF( "rRNA" );
seqsFromGFF( "tRNA" );


exit;
 31 | 
 32 | ###############################SUBROUTINES######################################
 33 | 
 34 | #Subroutine to parse command line arguments
 35 | sub parseArgs{
 36 | 
 37 |     my $usage = "\nUsage: $0 --genome=whole_genome.fasta --gff=annotations.gff
 38 | 
 39 |     mandatory
 40 |        --genome      -  FASTA file containing sequences to parse
 41 |        --gff         -  GFF file containing gene annotations \n\n";
 42 | 
 43 | 
 44 |                  my $result = GetOptions
 45 |                          (
 46 |                                  'genome=s'  => \$genome,
 47 |                                  'gff=s'     => \$gff,
 48 |                                
 49 |                          );
 50 |              
 51 | 	        $genome ne "" || die $usage;  #Die if mandatory variables undefined
 52 | 		$gff ne "" || die $usage; 
 53 |         
 54 | }
 55 |                                                              
 56 | 
 57 | #Subroutine to parse gff and genome for particular type of element
 58 | 
 59 | sub seqsFromGFF{
 60 | 
 61 | my $type = $_[0]; 
 62 | my %genes; 
 63 | my $subseq; 
 64 | my $name; 
 65 | my $exon; 
 66 | 
 67 | 
 68 | undef @line; 
 69 | undef $dna; 
 70 |  
 71 |     
 72 |     open ( GENOME, "$genome") || die "Derp: Can't open file $genome!";
 73 | 
 74 | 	while (<GENOME>){ 
 75 | 	    if ($_ !~ />/){ 
 76 | 		chomp $_; 
 77 | 		$dna .= $_; 
 78 | 	    }
 79 | 	}
 80 | 
 81 |     close GENOME;
 82 |  
 83 | 
 84 |     open ( GFF, "$gff" ) || die "Derp: Can't open file $gff!"; 
 85 | 
 86 | 	foreach ( <GFF> ){ 
 87 |             @line = split /\t/, $_;
 88 | 			#print "$line[2]\n"; 
 89 | 	        if ( uc $line[2]  eq uc $type ){ 
 90 | 		    $subseq = substr ( $dna, $line[3]-1, $line[5] ); 
 91 |                     
 92 | 		    @info = split /\s/, $line[8]; 
 93 | 		    $name = $info[1]; 
 94 | 		    
 95 |                     #Reverse complement if on opposite strand
 96 | 		    $line[6] =~ "-" and $subseq = revcom( $subseq ); 
 97 | 
 98 | 
 99 | 		    if ( uc $info[2] eq uc "exon"){ 
100 | 			$exon = $info[3]; 
101 | 		    }else{
102 | 			$exon = 1; 
103 | 		    }
104 | 
105 | 		    		 
106 | 
107 | 
108 | #If element is already in hash, then alter values in the arrays by following ref in hash value...
109 | 		    if ( exists $genes{$name} ){  
110 | 			$genes{$name}->[$exon-1] = $subseq;
111 | 		    }else{ 
112 | 		
113 |  #Create array containing length and GC content, then assign array ref to hash key for that element	
114 | 			my @seqs=(); 
115 | 			$genes{$name} = \@seqs;
116 | 		   	$genes{$name}->[0] = "";
117 | 			$genes{$name}->[$exon-1] = $subseq;  
118 | 		    } 	 
119 | 		}
120 | 	}
121 | 
122 | print "\nSequences for element type \"$type\": \n\n";	
123 | 
124 | foreach my $key ( keys %genes ){ 
125 |     print "\>$key\n";
126 |     print "@{$genes{$key}}"."\n";
127 |     
128 | }
129 |         
130 | close GFF; 
131 | }	
132 |  
133 | ###################################################################################################
134 | 
#Returns the reverse complement of a DNA string.
#  $_[0] - sequence; upper and lower case are preserved
#A, C, G, T and the IUPAC ambiguity codes R, Y, K, M, B, V, D, H are
#complemented; every other character (N, S, W, gaps, ...) is its own
#complement and is left unchanged.
sub revcom {

my $DNA = reverse ( $_[0] );

$DNA =~ tr/ACGTRYKMBVDHacgtrykmbvdh/TGCAYRMKVBHDtgcayrmkvbhd/;

return $DNA;

}
144 | 


--------------------------------------------------------------------------------
/ipyrad2polyrad.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | 
  7 | def main():
  8 | 	params = parseArgs()
  9 | 	
 10 | 	f = open(params.out, 'w')
 11 | 	
 12 | 	with open(params.vcf, "r") as vcf:
 13 | 		for line in vcf:
 14 | 			line=line.strip()
 15 | 			#directly transfer header lines
 16 | 			if line[0] == "#":
 17 | 				if line[1] != "#":
 18 | 					f.write("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the reference and alternate alleles in the order listed\">\n")
 19 | 				f.write(line)
 20 | 				f.write("\n")
 21 | 				
 22 | 			else:
 23 | 				fields=line.split("\t")
 24 | 				ref=get_index(fields[3].split(","))
 25 | 				alt=get_index(fields[4].split(","))
 26 | 				#if biallelic filter on and site has >2 alleles, skip
 27 | 				if params.biallelic == True:
 28 | 					if (len(ref) + len(alt) > 2):
 29 | 						continue
 30 | 				if fields[8]=="GT:DP:CATG":
 31 | 					fields[8]="GT:DP:AD"
 32 | 				else:
 33 | 					print("Something wrong with VCF. Field 8 should be GT:DP:CATG")
 34 | 					sys.exit()
 35 | 				for idx, sample in enumerate(fields[9:]):
 36 | 					fixed=str(fix_sample(sample, ref, alt))+":"
 37 | 					fields[idx+9] = fixed
 38 | 					#print(sample, " -- ", fixed)
 39 | 				f.write("\t".join(fields))
 40 | 				f.write("\n")
 41 | 		vcf.close()
 42 | 	f.close()
 43 | 
 44 | def fix_sample(sample, ref, alt):
 45 | 	fields=sample.split(":")
 46 | 	catg=fields[2].split(",")
 47 | 	ad=list()
 48 | 	for r in ref:
 49 | 		ad.append(catg[r])
 50 | 	for a in alt:
 51 | 		ad.append(catg[a])
 52 | 	fields[2]=",".join(ad)
 53 | 	return(":".join(fields))
 54 | 	
 55 | def get_index(char):
 56 | 	ret=list()
 57 | 	for c in char:
 58 | 		if c.lower()=="c":
 59 | 			ret.append(0)
 60 | 		elif c.lower()=="a":
 61 | 			ret.append(1)
 62 | 		elif c.lower()=="t":
 63 | 			ret.append(2)
 64 | 		elif c.lower()=="g":
 65 | 			ret.append(3)
 66 | 		else:
 67 | 			print("Unrecognized character",char)
 68 | 			sys.exit()
 69 | 	return(ret)
 70 | 
 71 | #Object to parse command-line arguments
 72 | class parseArgs():
 73 | 	def __init__(self):
 74 | 		#Define options
 75 | 		try:
 76 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hv:o:b', \
 77 | 			["help", "vcf=", "out=", "biallelic"])
 78 | 		except getopt.GetoptError as err:
 79 | 			print(err)
 80 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 81 | 		#Default values for params
 82 | 		#Input params
 83 | 		self.vcf=None
 84 | 		self.out="polyrad.vcf"
 85 | 		self.biallelic=False
 86 | 
 87 | 
 88 | 		#First pass to see if help menu was called
 89 | 		for o, a in options:
 90 | 			if o in ("-h", "-help", "--help"):
 91 | 				self.display_help("Exiting because help menu was called.")
 92 | 
 93 | 		#Second pass to set all args.
 94 | 		for opt, arg_raw in options:
 95 | 			arg = arg_raw.replace(" ","")
 96 | 			arg = arg.strip()
 97 | 			opt = opt.replace("-","")
 98 | 			#print(opt,arg)
 99 | 			if opt == "h" or opt == "help":
100 | 				continue
101 | 			elif opt=="vcf" or opt=="v":
102 | 				self.vcf=arg
103 | 			elif opt=="out" or opt=="o":
104 | 				self.out=arg
105 | 			elif opt=="biallelic" or opt=="b":
106 | 				self.biallelic=True
107 | 			else:
108 | 				assert False, "Unhandled option %r"%opt
109 | 
110 | 		#Check manditory options are set
111 | 		if not self.vcf:
112 | 			self.display_help("Need an ipyrad VCF file")
113 | 
114 | 
115 | 
116 | 	def display_help(self, message=None):
117 | 		if message is not None:
118 | 			print()
119 | 			print (message)
120 | 		print ("\nipyrad2polyrad.py\n")
121 | 		print("Author: Tyler K Chafin, University of Arkansas")
122 | 		print ("Contact: tkchafin@uark.edu")
123 | 		print ("Description:Converts the ipyrad VCF to a format usable for polyRAD")
124 | 		print("""
125 | 		-v,--vcf	: VCF input with ipyrad "CATG" field
126 | 		-b,--biallelic	: [Boolean] Toggle to skip non-biallelic sites
127 | 		-o,--out	: Output file name (default=polyrad.vcf)
128 | """)
129 | 		print()
130 | 		sys.exit()
131 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
135 | 


--------------------------------------------------------------------------------
/liftoverCoords.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import pyliftover
  7 | import csv
  8 | import pandas as pd
  9 | 
 10 | 
 11 | def main():
 12 | 	params = parseArgs()
 13 | 	if params.liftover:
 14 | 		lo = pyliftover.LiftOver(params.liftover)
 15 | 		if params.table:
 16 | 			tab=pd.read_csv(params.table, sep="\t")
 17 | 			print("Read table:")
 18 | 			print(tab)
 19 | 			def convert(row):
 20 | 				name="chr"+row[params.chrom]
 21 | 				ret=lo.convert_coordinate(name, row[params.bp])
 22 | 				return(int(ret[0][1]))
 23 | 				
 24 | 			tab[params.ocol] = tab.apply(convert,axis = 1)
 25 | 			print("Writing the output table:")
 26 | 			print(tab)
 27 | 			tab.to_csv(params.oname, sep="\t", index=False)
 28 | 			
 29 | 			if params.marey:
 30 | 				marey=make_marey(tab, params.chrom, params.ocol)
 31 | 				print("Created the following Marey Map input:")
 32 | 				print(marey)
 33 | 				mout=params.oname+"_mmap.txt"
 34 | 				marey.to_csv(mout, sep=" ", quoting=csv.QUOTE_NONNUMERIC, index=False)
 35 | 			
 36 | 		else:
 37 | 			params.display_help("Error: No table provided")
 38 | 	else:
 39 | 		params.display_help("Error: No liftover file provided")
 40 | 
 41 | #function writes a spoof marey map file from a table of :
 42 | #chr \t bp \t cM \t liftover.bp
 43 | def make_marey(table, chrom, bp):
 44 | 	ret=pd.DataFrame()
 45 | 	ret["map"] = "chr"+table[chrom].astype(str)
 46 | 	ret["set"] = "fakeset"
 47 | 	ret["mkr"] = "fakemarker"
 48 | 	ret["phys"] = table[bp].astype(int)
 49 | 	ret["gen"] = table["cM"].astype(float)
 50 | 	return(ret)
 51 | 		
 52 | 
 53 | #Object to parse command-line arguments
 54 | class parseArgs():
 55 | 	def __init__(self):
 56 | 		#Define options
 57 | 		try:
 58 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hf:t:p:c:n:o:m', \
 59 | 			["help"])
 60 | 		except getopt.GetoptError as err:
 61 | 			print(err)
 62 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 63 | 		#Default values for params
 64 | 		#Input params
 65 | 		self.table = None
 66 | 		self.liftover = None
 67 | 		self.chrom = "chr"
 68 | 		self.bp = "bp"
 69 | 		self.ocol = "liftover.bp"
 70 | 		self.oname = None
 71 | 		self.marey=False
 72 | 
 73 | 		#First pass to see if help menu was called
 74 | 		for o, a in options:
 75 | 			if o in ("-h", "-help", "--help"):
 76 | 				self.display_help("Exiting because help menu was called.")
 77 | 
 78 | 		#Second pass to set all args.
 79 | 		for opt, arg_raw in options:
 80 | 			arg = arg_raw.replace(" ","")
 81 | 			arg = arg.strip()
 82 | 			opt = opt.replace("-","")
 83 | 			#print(opt,arg)
 84 | 			if opt == "h" or opt == "help":
 85 | 				continue
 86 | 			elif opt == "f":
 87 | 				self.liftover=arg
 88 | 			elif opt == "t":
 89 | 				self.table = arg
 90 | 			elif opt == "p":
 91 | 				self.bp=arg
 92 | 			elif opt == "c":
 93 | 				self.chrom=str(arg)
 94 | 			elif opt == "n":
 95 | 				self.ocol=str(arg)
 96 | 			elif opt == "o":
 97 | 				self.oname=str(arg)
 98 | 			elif opt == "m":
 99 | 				self.marey=True
100 | 			else:
101 | 				assert False, "Unhandled option %r"%opt
102 | 
103 | 		#Check manditory options are set
104 | 		if not self.liftover or not self.table:
105 | 			self.display_help("No files provided.")
106 | 		self.oname=self.table + ".liftover"
107 | 
108 | 
109 | 	def display_help(self, message=None):
110 | 		if message is not None:
111 | 			print()
112 | 			print (message)
113 | 		print ("\nliftoverCoords.py\n")
114 | 		print("Author: Tyler K Chafin, University of Arkansas")
115 | 		print ("Contact: tkchafin@uark.edu")
116 | 		print ("Description: Converts a table of physical positions from one genome assembly to another given an \".over.chain.gz\" database")
117 | 		print("""
118 | Arguments: 
119 | 	-h, --help	: Display help menu
120 | 	-f			: Path to .over.chain.gz file
121 | 	-t			: Tab-delimited table including coordinates
122 | 	-p			: Column name in table containing the physical (bp) coordinates
123 | 				   [default = \"bp\"]
124 | 	-c			: Column name in table containing the chromosome names
125 | 				   [default = \"chr\"]
126 | 	-n			: Output column name for new table
127 | 				   [default = \"liftover.bp\"]
128 | 	-o			: Output file name
129 | 				   [default = \"<infile>.liftover\"]
130 | 	-m			: (Boolean) Additionally output Marey-Map input file
131 | 
132 | 	NOTE: Chromosomes should be named e.g. as \"chr1\" or \"chrX\" in the
133 | 			      .over.chain.gz file, but without the \"chr\" in the table file """)
134 | 		print()
135 | 		sys.exit()
136 | 
137 | #Call main function
138 | if __name__ == '__main__':
139 |     main()
140 | 


--------------------------------------------------------------------------------
/liftoverFromPafscaff.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import csv
  7 | import pandas as pd
  8 | import functools
  9 | 
 10 | #sorry this code isn't really commented
 11 | #i'm in a hurry
 12 | #and tired
 13 | 
 14 | def main():
 15 | 	params = parseArgs()
 16 | 	mapper=dict()
 17 | 	coords=pd.read_csv(params.coords, sep="\t", names=["scaffold", "scaffold_pos"])
 18 | 
 19 | 	#capture mappings from pafscaff headers
 20 | 	with open(params.paf, "r") as fh:
 21 | 		for line in fh:
 22 | 			l=line.split()
 23 | 			chr=l[0].split(".")[0].replace(">","")
 24 | 			if l[1] == "RevComp":
 25 | 				revcomp=True
 26 | 				scaffold=l[2]
 27 | 				start=int(l[7].split(":")[0].replace(",",""))
 28 | 				end=int(l[7].split(":")[1].replace(";","").replace(",",""))
 29 | 			else:
 30 | 				revcomp=False
 31 | 				scaffold=l[1]
 32 | 				#print(l)
 33 | 				#print(line[6])
 34 | 				start=int(l[6].split(":")[0].replace(",",""))
 35 | 				end=int(l[6].split(":")[1].replace(";","").replace(",",""))
 36 | 			mapper[scaffold]=[chr, revcomp, start, end]
 37 | 			#print(scaffold, ":", mapper[scaffold])
 38 | 
 39 | 			#sys.exit()
 40 | 
 41 | 	#...watch out for off-by-one errors
 42 | 	def liftover(mapper, row):
 43 | 		#print(row)
 44 | 		if row[0] in mapper:
 45 | 			convert = mapper[row[0]]
 46 | 			#print(convert)
 47 | 			if convert[1]:
 48 | 				#revcomp
 49 | 				new_coord=convert[3]-row[1]
 50 | 			else:
 51 | 				#not revcomp
 52 | 				new_coord=convert[2]+row[1]
 53 | 			row['chrom']=convert[0]
 54 | 			row['chrom_pos']=new_coord
 55 | 		else:
 56 | 			#print("Scaffold",str(row[0]), "not placed in pafscaff output"
 57 | 			row['chrom'] = "NA"
 58 | 			row['chrom_pos'] = 0
 59 | 		return(row)
 60 | 			#return(["NA", 0])
 61 | 	#print(coords)
 62 | 	liftover_call = functools.partial(liftover, mapper)
 63 | 	coords=coords.apply(liftover_call, axis = 1)
 64 | 	print(coords)
 65 | 
 66 | 	coords.to_csv(params.out, sep="\t",
 67 | 		header=True, quoting=False,
 68 | 		index=False)
 69 | 
 70 | 
 71 | 
 72 | 
 73 | #Object to parse command-line arguments
 74 | class parseArgs():
 75 | 	def __init__(self):
 76 | 		#Define options
 77 | 		try:
 78 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hp:c:o:', \
 79 | 			["help", "out=", "paf=", "coords="])
 80 | 		except getopt.GetoptError as err:
 81 | 			print(err)
 82 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 83 | 		#Default values for params
 84 | 		#Input params
 85 | 		self.coords = None
 86 | 		self.paf = None
 87 | 		self.out = "out.txt"
 88 | 
 89 | 		#First pass to see if help menu was called
 90 | 		for o, a in options:
 91 | 			if o in ("-h", "-help", "--help"):
 92 | 				self.display_help("Exiting because help menu was called.")
 93 | 
 94 | 		#Second pass to set all args.
 95 | 		for opt, arg_raw in options:
 96 | 			arg = arg_raw.replace(" ","")
 97 | 			arg = arg.strip()
 98 | 			opt = opt.replace("-","")
 99 | 			#print(opt,arg)
100 | 			if opt == "h" or opt == "help":
101 | 				continue
102 | 			elif opt == "c" or opt=="coords":
103 | 				self.coords=arg
104 | 			elif opt == "p" or opt=="paf":
105 | 				self.paf=arg
106 | 			elif opt =="o" or opt=="out":
107 | 				self.out=arg
108 | 			else:
109 | 				assert False, "Unhandled option %r"%opt
110 | 
111 | 		#Check manditory options are set
112 | 		if not self.paf:
113 | 			self.display_help("No paf provided.")
114 | 		if not self.coords:
115 | 			self.display_help("No coordinates provided.")
116 | 
117 | 
118 | 	def display_help(self, message=None):
119 | 		if message is not None:
120 | 			print()
121 | 			print (message)
122 | 		print ("\nliftoverFromPafscaff.py\n")
123 | 		print("Author: Tyler K Chafin, University of Colorado")
124 | 		print ("Contact: tyler.chafin@colorado.edu")
125 | 		print ("Description: Converts a given set of coordinates (e.g., from a VCF file) to a new coordinate system, as mapped by pafscaff")
126 | 		print("""
127 | Arguments:
128 | 	-h, --help	: Display help menu
129 | 	-p,--paf	: Path to pafscaff fasta file (can be just headers)
130 | 	-c,--coords	: Tab-delimited table in the format: scaffold_name "\t" position
131 | 	-o, --out	: Output file name [default=out.tsv]
132 | """)
133 | 		print()
134 | 		sys.exit()
135 | 
136 | #Call main function
137 | if __name__ == '__main__':
138 |     main()
139 | 


--------------------------------------------------------------------------------
/makeHyde.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | import operator
  8 | import collections
  9 | import copy
 10 | 
 11 | def main():
 12 | 	params = parseArgs()
 13 | 
 14 | 	if params.phylip:
 15 | 		#Get sequences as dict of lists
 16 | 		seqs = readPhylip(params.phylip)
 17 | 	else:
 18 | 		print("No input provided.")
 19 | 		sys.exit(1)
 20 | 
 21 | 	pop_assign = dict()
 22 | 	#parse popmap file for dictionary of sample assignments
 23 | 	if params.popmap:
 24 | 		print("Parsing popmap file...")
 25 | 		pop_assign = parsePopmap(params.popmap)
 26 | 	else:
 27 | 		print("ERROR: Popmap file must be provided.")
 28 | 		sys.exit(1)
 29 | 
 30 | 	if seqs and pop_assign:
 31 | 
 32 | 		#Remove samples from pop_assign that do not have data
 33 | 		pop_assign = cleanPopmap(pop_assign, seqs.keys())
 34 | 
 35 | 		#Make dict of dicts that splits by population, only retaining pops/samples from the popmap.
 36 | 		#Get unique pop names using one of the worst python lines ever written
 37 | 		pops = dict()
 38 | 		for k in set(pop_assign.values()):
 39 | 			pops[k] = dict()
 40 | 
 41 | 		#Remove pops listed as excluded
 42 | 		if params.exclude:
 43 | 			print("Excluding populations:", ", ".join(params.exclude))
 44 | 			for exc in params.exclude:
 45 | 				if exc in pops:
 46 | 					del pops[exc]
 47 | 		if params.include:
 48 | 			print("Only keeping populations:", ", ".join(params.include))
 49 | 			for pop in list(pops):
 50 | 				if pop not in params.include:
 51 | 					del pops[pop]
 52 | 
 53 | 		#make sure we didn't throw out all populations...
 54 | 		if len(list(pops)) < 1:
 55 | 			print("Oops! No populations remaining. Check that popmap sample names match those in your data file, or that selections using --include or --exclude are correct! :)")
 56 | 			sys.exit(1)
 57 | 
 58 | 		alen = getSeqLen(seqs)
 59 | 		inum = 0
 60 | 		for assigned in pop_assign:
 61 | 			if pop_assign[assigned] in pops:
 62 | 				pops[pop_assign[assigned]][assigned] = seqs[assigned]
 63 | 				inum+=1
 64 | 		seqs.clear()
 65 | 
 66 | 		#Make 2D list to remove columns failing the globalN filter
 67 | 		bad_columns = list() #list of column numbers to delete
 68 | 
 69 | 		#For each pop dict, make 2D list to remove columns failing popN filter
 70 | 		print("Found",alen,"nucleotide columns in the dataset!")
 71 | 		columns = [[]for i in range(alen)] #2D array of global data
 72 | 		for pop, data in pops.items():
 73 | 			for sample, sequence in data.items():
 74 | 				for i, nuc in enumerate(sequence):
 75 | 					columns[i].append(nuc)
 76 | 
 77 | 		#Write new ordered output and phylip
 78 | 		print("Writing outputs...")
 79 | 		phy = params.out + ".phy"
 80 | 		omap = params.out + ".map"
 81 | 
 82 | 		pfh = open(phy, "w")
 83 | 		mfh = open(omap, "w")
 84 | 
 85 | 		header = str(inum) + "\t" + str(alen) + "\n"
 86 | 		pfh.write(header)
 87 | 
 88 | 		for pop in sorted(pops):
 89 | 			for ind, data in pops[pop].items():
 90 | 				indline = str(ind) + "\t" + "".join(data) + "\n"
 91 | 				pfh.write(indline)
 92 | 
 93 | 				mapline = str(ind) + "\t" + str(pop) + "\n"
 94 | 				mfh.write(mapline)
 95 | 		pfh.close()
 96 | 		mfh.close()
 97 | 
 98 | 		print("Done!\n")
 99 | 
100 | #Goes through a dict of sequences and get the alignment length
def getSeqLen(aln):
	"""Return the alignment length of a dict of sequences.

	The length comes from the first sequence with a non-zero length. A
	message is printed for every later sequence whose length differs.
	Returns None for an empty dict.
	"""
	length = None
	for seq in aln.values():
		current = len(seq)
		if length and length != current:
			print("getSeqLen: Alignment contains sequences of multiple lengths.")
		elif not length:
			length = current
	return length
110 | 
111 | #function reads a tab-delimited popmap file and return dictionary of assignments
def parsePopmap(popmap):
	"""Read a whitespace-delimited popmap file into a dict.

	Each non-blank line supplies a sample name (first field) and its
	population (second field); later lines overwrite earlier ones for the
	same sample.

	Returns a dict mapping sample name -> population.
	Raises FileNotFoundError if the file does not exist; exits with status 1
	if the file cannot be read.
	"""
	if not os.path.exists(popmap):
		raise FileNotFoundError("File %s not found!"%popmap)

	ret = dict()
	try:
		#the with-statement closes the handle; no explicit close needed
		with open(popmap, 'r') as fh:
			for line in fh:
				stuff = line.split()
				if not stuff:
					continue #blank line
				ret[stuff[0]] = stuff[1]
	except IOError:
		#report the real path (this used to reference an undefined name)
		print("Could not read file ",popmap)
		sys.exit(1)
	return ret
134 | 
135 | #Function to remove samples from a popmap dict, given a list of valid samples (e.g. those to retain)
def cleanPopmap(popmap, names):
	"""Return a deep copy of popmap restricted to samples found in names.

	popmap -- dict of sample -> population (left unmodified)
	names  -- container of sample names to retain
	"""
	kept = copy.deepcopy(popmap)
	missing = [sample for sample in popmap if sample not in names]
	for sample in sorted(missing, reverse=True):
		kept.pop(sample)
	return kept
146 | 
147 | #Function to read a phylip file. Returns dict (key=sample) of lists (sequences divided by site)
def readPhylip(phy):
	"""Read a sequential PHYLIP file.

	The first non-blank line (the header) is skipped. Every other non-blank
	line supplies a sample name (first field) and its sequence (second
	field).

	Returns a dict mapping sample name -> list of sequence characters.
	Raises FileNotFoundError if the file does not exist; exits with status 1
	if the file cannot be read.
	"""
	if not os.path.exists(phy):
		#this used to reference an undefined name and raise NameError
		raise FileNotFoundError("File %s not found!"%phy)

	ret = dict()
	try:
		#the with-statement closes the handle; no explicit close needed
		with open(phy, 'r') as fh:
			num = 0
			for line in fh:
				line = line.strip()
				if not line:
					continue
				num += 1
				if num == 1:
					continue #header line
				arr = line.split()
				ret[arr[0]] = list(arr[1])
	except IOError:
		print("Could not read file ",phy)
		sys.exit(1)
	return ret
171 | 
172 | 
173 | #Object to parse command-line arguments
class parseArgs():
	"""Parse and hold the command-line options of makeHyde.py.

	Instantiating the class reads sys.argv. Invalid options, a help request,
	a missing alignment or popmap, or using --include together with
	--exclude print the help menu and exit.
	"""
	def __init__(self):
		#Define options
		#"help" is registered so that --help is a valid option; the
		#unimplemented maxN/popN options are no longer accepted (they
		#previously ended in an AssertionError)
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'p:i:ho:X:I:', \
			["input=","phylip=","phy=","out=","popmap=",
			"exclude=","include=","help"])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.phylip=None        #-i/--input: PHYLIP alignment
		self.popmap=None        #-p/--popmap: population map
		self.out="out"          #-o/--out: prefix for the .phy/.map outputs
		self.exclude = list()   #-X/--exclude: populations to drop
		self.include = list()   #-I/--include: populations to keep


		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			arg = arg_raw.replace(" ","")
			arg = arg.strip()
			opt = opt.replace("-","")
			#print(opt,arg)
			if opt in ('i', 'phylip', 'input','phy'):
				self.phylip = arg
			elif opt in ('p', 'popmap'):
				self.popmap = arg
			elif opt in ('h', 'help'):
				pass
			elif opt in ('o','out'):
				self.out = arg
			elif opt in ('X', 'exclude'):
				self.exclude = arg.split(",")
			elif opt in ('I','include'):
				self.include = arg.split(",")
			else:
				assert False, "Unhandled option %r"%opt

		#Check mandatory options are set
		if not self.phylip :
			self.display_help("Error: Missing required alignment file (--input)")
		if not self.popmap:
			self.display_help("Error: Missing required popmap file (-p, --popmap)")
		if self.include and self.exclude:
			self.display_help("Don't use both --include and --exclude.")


	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print ("\n",message)
		print ("\nmakeHyde.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		print ("\nUsage: ", sys.argv[0], "-i /path/to/phylip -p /path/to/popmap\n")
		print ("Description: Making inputs for HyDe and filtering populations for inclusion/exclusion")

		print("""
	Arguments:
		INPUT FILES [REQUIRED]
		-i,--input	: Input file as PHYLIP
		-p,--popmap	: Tab-delimited population map

		PARAMETERS [OPTIONAL]
		-o,--out	: Output file prefix; writes <out>.phy and <out>.map <default = out>
		-X,--exclude: List of pops to exclude (format: -X "Pop1,Pop2,Pop4...")
		-I,--include: List of pops to include (removing all others)
		-h,--help	: Displays help menu

""")
		sys.exit()
250 | 
251 | #Call main function
252 | if __name__ == '__main__':
253 |     main()
254 | 


--------------------------------------------------------------------------------
/newhybs2distruct.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import collections
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 	if params.pops and params.pofz:
 11 | 		#get pop IDS
 12 | 		p = readList(params.pops)
 13 | 
 14 | 		#get OrderedDict of prob results
 15 | 		probs = readNewHybs(params.pofz)
 16 | 
 17 | 		#iterate over probs to make output
 18 | 		nan = 0
 19 | 		index = -1
 20 | 		popCount = dict()
 21 | 		popEnum = dict()
 22 | 		enumCounter = 1
 23 | 		gen_cats = 0
 24 | 
 25 | 		numPops = 0
 26 | 		numInds = 0
 27 | 		print("\nWriting INDIVQ file for distruct:",params.out)
 28 | 
 29 | 		with open (params.out, "w") as IQ:
 30 | 			try:
 31 | 				for key, value in probs.items():
 32 | 					index += 1;
 33 | 					if ("nan" in value):
 34 | 						nan += 1;
 35 | 						continue #skip individuals which couldn't be assigned
 36 | 
 37 | 					numInds += 1
 38 | 					#track population ID and count per pop
 39 | 					if (p[index] not in popEnum):
 40 | 						popEnum[p[index]] = enumCounter
 41 | 						enumCounter += 1
 42 | 						numPops += 1
 43 | 					if (p[index] not in popCount):
 44 | 						popCount[p[index]] = 1
 45 | 					else:
 46 | 						popCount[p[index]] += 1
 47 | 
 48 | 					if gen_cats == 0:
 49 | 						gen_cats = len(value)
 50 | 					elif gen_cats != len(value):
 51 | 						print("Warning: Samples don't have the same number of probabilities! Something is wrong")
 52 | 
 53 | 					#build output line for INDIVQ
 54 | 					indline = str(key) + "\t" + str(key) + "\t(0)\t" + str(popEnum[p[index]]) + "\t: " + "\t".join(value) + "\n"
 55 | 					IQ.write(indline)
 56 | 					#print(key, "(",p[index], "): ", value)
 57 | 
 58 | 				if (nan > 0):
 59 | 					print("Warning:",nan,"individuals had \"nan\" probabilities are were skipped.")
 60 | 			except IOError:
 61 | 				print("Could not open file",params.out)
 62 | 				sys.exit(1)
 63 | 			finally:
 64 | 				IQ.close()
 65 | 
 66 | 		print("Writing dummy POPQ file:", params.popq)
 67 | 		with open(params.popq, "w") as PQ:
 68 | 			try:
 69 | 				for pop, enum in popEnum.items():
 70 | 					out = str(enum) + ":"
 71 | 					for cat in range(gen_cats):
 72 | 						out = out + "\t0.0"
 73 | 					out = out + "\t" + str(popCount[pop]) + "\n"
 74 | 					PQ.write(out)
 75 | 
 76 | 			except IOError:
 77 | 				print("Could not open file",params.popq)
 78 | 				sys.exit(1)
 79 | 			finally:
 80 | 				PQ.close()
 81 | 
 82 | 		print("Writing Labels file: NH_labels.txt")
 83 | 		with open("NH_labels.txt", "w") as ID:
 84 | 			try:
 85 | 				for pop, enum in popEnum.items():
 86 | 					out = str(enum) + " " + str(pop) + "\n"
 87 | 					ID.write(out)
 88 | 
 89 | 			except IOError:
 90 | 				print("Could not open file NH_labels.txt")
 91 | 				sys.exit(1)
 92 | 			finally:
 93 | 				ID.close()
 94 | 
 95 | 		print("Writing COLOR PERMUTATION file: NH_geno.perm")
 96 | 		with open("NH_geno.perm", "w") as PERM:
 97 | 			try:
 98 | 				print()
 99 | 				P1 = "1 RdGy_6_div_1\n"
100 | 				print("P1: Red (RdGy_6_div_1)")
101 | 				PERM.write(P1)
102 | 
103 | 				P2 = "2 RdBu_6_div_6\n"
104 | 				print("P2: Blue (RdBu_6_div_6)")
105 | 				PERM.write(P2)
106 | 
107 | 				F1 = "3 Greens_6_seq_5\n"
108 | 				print("F1: Green (Greens_6_seq_5)")
109 | 				PERM.write(F1)
110 | 
111 | 				F2 = "4 Greens_6_seq_2\n"
112 | 				print("F2: Light Green (Greens_6_seq_2)")
113 | 				PERM.write(F2)
114 | 
115 | 				BO1 = "5 RdBu_6_div_3\n"
116 | 				print("BO1: Light red (RdBu_6_div_3)")
117 | 				PERM.write(BO1)
118 | 
119 | 				BO2 = "6 RdBu_6_div_4\n"
120 | 				print("BO2: Light Blue (RdBu_6_div_4)")
121 | 				PERM.write(BO2)
122 | 
123 | 				print()
124 | 
125 | 			except IOError:
126 | 				print("Could not open file NH_geno.perm")
127 | 				sys.exit(1)
128 | 			finally:
129 | 				PERM.close()
130 | 
131 | 		print("Writing Distruct paramsfile: NH_params.txt")
132 | 		with open("NH_params.txt", "w") as PAR:
133 | 			try:
134 | 				stuff = getParams(numPops, numInds)
135 | 				PAR.write(stuff)
136 | 
137 | 			except IOError:
138 | 				print("Could not open file NH_params.txt")
139 | 				sys.exit(1)
140 | 			finally:
141 | 				PAR.close()
142 | 
143 | 
144 | 		print("Done!\n")
145 | 	else:
146 | 		print("Missing required inputs.")
147 | 		sys.exit(1)
148 | 
def getParams(np, ni):
	"""Return the text of a distruct parameter file.

	np -- value written for NUMPOPS
	ni -- value written for NUMINDS

	Every other setting, including the input/output file names and K, is
	fixed in the template below.
	"""
	template = """
#define INFILE_POPQ NH_popq.txt
#define INFILE_INDIVQ NH_indivq.txt
#define INFILE_LABEL_BELOW NH_labels.txt
#define INFILE_LABEL_ATOP NH_labels.txt
#define INFILE_CLUST_PERM NH_geno.perm
#define OUTFILE NH.ps
#define K 6
#define NUMPOPS {numpops}
#define NUMINDS {numinds}
#define PRINT_INDIVS 1
#define PRINT_LABEL_ATOP 1
#define PRINT_LABEL_BELOW 0
#define PRINT_SEP 1
#define FONTHEIGHT 6
#define DIST_ABOVE -160
#define DIST_BELOW -50
#define BOXHEIGHT 150
#define INDIVWIDTH 2
#define ORIENTATION 1
#define XORIGIN 200
#define YORIGIN 10
#define XSCALE 1
#define YSCALE 1
#define ANGLE_LABEL_ATOP 270
#define ANGLE_LABEL_BELOW 270
#define LINEWIDTH_RIM 3
#define LINEWIDTH_SEP 1
#define LINEWIDTH_IND 3
#define GRAYSCALE 0
#define ECHO_DATA 1
#define REPRINT_DATA 1
#define PRINT_INFILE_NAME 0
#define PRINT_COLOR_BREWER 1"""
	return template.format(numpops=str(np), numinds=str(ni))
186 | 
187 | #reads and returns a list from a file
def readList(l):
	"""Read a file into a list of its non-blank, whitespace-stripped lines.

	Raises FileNotFoundError if the file does not exist; exits with status 1
	if the file cannot be read.
	"""
	if not os.path.exists(l):
		#this used to reference an undefined name and raise NameError
		raise FileNotFoundError("File %s not found!"%l)

	try:
		#the with-statement closes the handle; no explicit close needed
		with open(l, 'r') as fh:
			stripped = (line.strip() for line in fh)
			return [line for line in stripped if line]
	except IOError:
		print("Could not read file ",l)
		sys.exit(1)
206 | 
207 | 
208 | #reads assignment probabilities from NewHybs PofZ output file
def readNewHybs(p):
	"""Read assignment probabilities from a NewHybrids PofZ output file.

	The first non-blank line is treated as the header and skipped. For every
	other non-blank line the first whitespace-delimited field becomes the
	key and the fields from the third onward become the value.

	Returns an OrderedDict (file order) of key -> list of probability
	strings. Raises FileNotFoundError if the file does not exist; exits with
	status 1 if the file cannot be read.
	"""
	if not os.path.exists(p):
		#this used to reference an undefined name and raise NameError
		raise FileNotFoundError("File %s not found!"%p)

	ret = collections.OrderedDict()
	try:
		#the with-statement closes the handle; no explicit close needed
		with open(p, 'r') as fh:
			count = 0
			for line in fh:
				line = line.strip()
				if not line:
					continue
				count += 1
				if count == 1:
					continue #skip first non-blank line, which is the header
				arr = line.split()
				ret[arr[0]] = list(arr[2:])
	except IOError:
		print("Could not read file ",p)
		sys.exit(1)
	return ret
233 | 
234 | 
235 | 
236 | 
237 | #Object to parse command-line arguments
class parseArgs():
	"""Parse and hold the command-line options of newhybs2distruct.py.

	Instantiating the class reads sys.argv. Invalid options, a help request
	or a missing --pops / --input file print the help menu and exit.
	"""
	def __init__(self):
		#Define options
		#-h/--help and the documented -o/--out are registered here; they
		#used to be rejected by getopt as unknown options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'hi:p:o:', \
			["help","pops=","input=","out="])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.pops = None             #-p/--pops: population ID list
		self.pofz=None               #-i/--input: NewHybrids PofZ file
		self.out = "NH_indivq.txt"   #-o/--out: INDIVQ output file
		self.popq = "NH_popq.txt"    #POPQ output file (not configurable)

		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			arg = arg_raw.replace(" ","")
			arg = arg.strip()
			opt = opt.replace("-","")
			#print(opt,arg)
			if opt in ('p', 'pops'):
				self.pops = arg
			elif opt in ('h', 'help'):
				pass
			elif opt in ('i', 'input'):
				self.pofz = arg
			elif opt in ('o', 'out'):
				self.out = arg
			else:
				assert False, "Unhandled option %r"%opt

		#Check mandatory options are set
		if not self.pops:
			self.display_help("Error: Missing required PopID file (-p, --pops)")
		if not self.pofz:
			self.display_help("Error: Missing required PofZ file (-i, --input)")


	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print ("\n",message)
		print ("\nnewhybs2distruct.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		print ("\nUsage: ", sys.argv[0], "-i aa-PofZ.txt -p popmap \n")
		print ("Description: Creates inputs for DISTRUCT from NewHybrids output.")

		print("""
	Arguments:
		-i,--input	: aa-PofZ.txt output from NewHybrids.
		-p,--pops	: Path to population IDs for NewHybrids samples
			Format: List of population IDs in the SAME ORDER as NewHybrids output.
			Note: My phylip2newhybrids.pl script will create this for you.
		-o,--out	: INDIVQ output file name <default = NH_indivq.txt>
			Note: The generated NH_params.txt always refers to NH_indivq.txt.
		-h,--help	: Displays help menu

""")
		sys.exit()
300 | 
301 | #Call main function
302 | if __name__ == '__main__':
303 |     main()
304 | 


--------------------------------------------------------------------------------
/parsePhaseCons.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import os
  4 | import sys
  5 | import argparse
  6 | 
  7 | def main():
  8 | 		params = parseArgs()
  9 | 		output=list()
 10 | 		with open(params.input, "r") as fh:
 11 | 			coord=1
 12 | 			linecount=0
 13 | 			chrom=None
 14 | 			start=1
 15 | 			step=1
 16 | 			current_start=None
 17 | 			report=True
 18 | 			total=0
 19 | 			for line in fh:
 20 | 				line = line.strip()
 21 | 				if not line:
 22 | 					continue
 23 | 				linecount += 1
 24 | 				if linecount == 1 or "=" in line:
 25 | 					header = line.split()
 26 | 					for field in header:
 27 | 						parts=field.split("=")
 28 | 						if parts[0] == "chrom":
 29 | 							if chrom and parts[1] != chrom:
 30 | 								print("Found new chrom:",parts[1])
 31 | 							chrom=parts[1]
 32 | 						elif parts[0] == "start":
 33 | 							start = int(parts[1])
 34 | 						elif parts[0] == "step":
 35 | 							step = int(parts[1])
 36 | 						else:
 37 | 							continue
 38 | 					if current_start and start > coord:
 39 | 						print("Started new region:",start, "- jumped from",coord)
 40 | 						padded_start = current_start - params.padding
 41 | 						if padded_start <= 0:
 42 | 							padded_start = 1
 43 | 						end = coord
 44 | 						padded_end = end + params.padding
 45 | 						if end - current_start > params.min_length:
 46 | 							total += (padded_end-padded_start)
 47 | 							oline=str(chrom)+":"+str(padded_start)+"-"+str(end)+"\n"
 48 | 							output.append(oline)
 49 | 						#print(oline)
 50 | 						current_start = None
 51 | 						coord=start
 52 | 					else:
 53 | 						coord=coord+step
 54 | 					if not chrom:
 55 | 						sys.exit("No chrom field found in header! Exiting script.")
 56 | 					if report:
 57 | 						print("\nChrom is:",chrom)
 58 | 						print("Starting coordinate:",start)
 59 | 						print("Step size:",step)
 60 | 						print("Minimum phaseCons score:",params.min_score)
 61 | 						print("Minimum length to report interval:",params.min_length)
 62 | 						if params.padding > 0:
 63 | 							print("Padding (+/-) for interval coordinates:",params.padding)
 64 | 						print("\n--\n")
 65 | 						report=False
 66 | 					continue
 67 | 
 68 | 				if float(line) >= params.min_score:
 69 | 					if not current_start:
 70 | 						current_start=coord
 71 | 				else:
 72 | 					#NOT above threshold. If there is a previous interval, check it now
 73 | 					if current_start:
 74 | 						padded_start = current_start - params.padding
 75 | 						if padded_start <= 0:
 76 | 							padded_start = 1
 77 | 						end = coord
 78 | 						padded_end = end + params.padding
 79 | 						if end - current_start > params.min_length:
 80 | 							#print(end-current_start+(2*params.padding))
 81 | 							total += (padded_end-padded_start)
 82 | 							oline=str(chrom)+":"+str(padded_start)+"-"+str(end)+"\n"
 83 | 							output.append(oline)
 84 | 							#print(oline)
 85 | 						current_start = None
 86 | 				coord = coord + step
 87 | 		fh.close()
 88 | 
 89 | 		print("\n--\nDone! Writing output to:", params.output)
 90 | 
 91 | 		with open(params.output, "w") as ofh:
 92 | 			if len(output) > 0:
 93 | 				for l in output:
 94 | 					ofh.write(l)
 95 | 		ofh.close()
 96 | 		print("\nProcess complete. Total bases included in retained intervals:",total, "\n")
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | #argument parsing
def parseArgs():
	"""Parse the command line of parsePhaseCons.py.

	Returns the argparse namespace with min_length, padding, min_score,
	input and output. Exits with "Missing inputs" when --input is absent.
	"""
	description = """
	parsePhaseCons.py

	Author: Tyler K. Chafin
	Contact: tkchafin@uark.edu

	Description: Processes phaseCons outputs to generate a set of intervals with phaseCons score above X

	Input should be a file of phaseCons scores, with a header including the following information:
	chrom=<chrom_name> start=<start_coordinate> step=<step_size>
	"""
	parser = argparse.ArgumentParser(description=description)

	#(flag, keyword arguments) for every supported option
	option_specs = (
		('--min_length', dict(dest='min_length', type=int, default=10,
			help='Minimum interval length to report [default=10]')),
		('--padding', dict(dest='padding', type=int, default=0,
			help='Distance to pad interval coordinated (e.g. output=start-padding:end+padding) [default=0]')),
		('--min_score', dict(dest='min_score', type=float, default=0.5,
			help='Minimum phaseCons score [default=0.5]')),
		('--input', dict(dest='input', type=str,
			help='Input .pp.data file')),
		('--output', dict(dest='output', type=str, default="phaseCons_intervals.bed",
			help='Output .bed file [default=phaseCons_intervals.bed]')),
	)
	for flag, settings in option_specs:
		parser.add_argument(flag, **settings)

	args = parser.parse_args()
	if args.input:
		return args
	sys.exit("Missing inputs")
137 | 
138 | #Call main function
139 | if __name__ == '__main__':
140 |     main()
141 | 


--------------------------------------------------------------------------------
/phylip2nexus.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | #Modified from  fasta2nexus.pl written by BTM
 4 | #-TKC
 5 | 
 6 | use strict;
 7 | use warnings;
 8 | use Getopt::Long;
 9 | use File::Path;
10 | use File::Basename;
11 | # Declare variables
12 | 
our $input;
#our $infiletype=1;

parseArgs();

# Convert a sequential PHYLIP alignment into a NEXUS DATA block written
# next to the input file as <ID>.nex.
my @data;        # sequences, in file order
my @names;       # sample names, parallel to @data
my $taxa = 0;    # number of samples read
my $nchar;       # sequence length taken from the first sample
my $line = 0;    # number of lines read so far

# Three-argument open with a lexical handle: the file name can no longer
# be misread as an open mode and the handle is scoped to this script.
open ( my $in_fh, '<', $input ) || die "Error\nCan't open $input: $!\n";
while ( <$in_fh> ){
    chomp;
    $line++;
    my @linedata = split /\s+/, $_;
    s/\s+//g;
    length($_) or next;      # skip blank lines
    $line == 1 and next;     # skip the PHYLIP header line
    $taxa++;
    push @names, "$linedata[0]";
    push @data, $linedata[1];
    if ($nchar){
	length($linedata[1]) != $nchar and print "Error: Line beginning with $linedata[0] has a different sequence length.\n";
    }else{
	$nchar = length($linedata[1]);
    }
}
close $in_fh;

#Capture to use as identifier
my ($filepath, $dirpath) = fileparse("$input");
# Fall back to the whole file name when it has no "<word>.<ext>" part;
# previously $ID was undefined in that case and the output became ".nex".
my $ID = ($filepath =~ /(\w+)\.\w/) ? $1 : $filepath;

open( my $out_fh, '>', "$dirpath$ID.nex" ) || die "Error\nCan't write to $ID.nex: $!\n";
print $out_fh "#NEXUS\n\n";
print $out_fh "BEGIN DATA;
DIMENSIONS NTAX=$taxa NCHAR=$nchar;
FORMAT DATATYPE=DNA MISSING=? GAP=- ;

MATRIX\n";

for ( my $i = 0; $i<scalar @names; $i++ ){
	print $out_fh "$names[$i]\t$data[$i]\n";
}
print $out_fh ";\n";

print $out_fh "END;\n\n";

close $out_fh;


exit;
74 | ###########################SUBROUTINES###################################
75 | 
sub parseArgs{ 
	#Parses the command line into the package variable $input and dies with
	#the usage message when no input was given.
	#Message to print if mandatory variables not declared
	my $usage ="\nUsage: $0 --i /path/to/input/directory/*.phylip
Mandatory 
	-i, --input	-  path to the input files in phylip format
\n";

	#NOTE(review): the spec uses the repeat specifier {1,} but the destination
	#is a scalar, and the main script opens a single file -- confirm whether
	#multiple input files were ever meant to be supported.
	my $options = GetOptions 
		( 
		'input|i=s{1,}'		=>	\$input
		);
		
	#The input path is mandatory
	$input or die "\n\nError: Input not specified!\n\n$usage\n"; 
}
90 | 
91 | #########################################################################
92 | 


--------------------------------------------------------------------------------
/phylip2structure.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Getopt::Long;
  6 | use File::Basename;
  7 | 
#Initialize scalars
#These file-scoped variables are filled in by parseArgs() below.
my $input;                    #path to the input phylip file
my $popmap;                   #path to the population map (optional)
my $output="structure.in";    #output path; default is placed next to the input
my $missing="-9";             #code written for N and gap characters
my $suppress=0;               #set by -q/--quiet; silences the warnings below
my $extra;                    #number of extra "0" columns to insert per row
my $locnames=0;               #set by -l/--loc; print a locus-number header row
my $popN = 1.0;               #parsed but not read anywhere else in this script
my $globalN = 1.0;            #parsed but not read anywhere else in this script
my $oneLine =0;               #set by --oneLine; both alleles on a single row
#Call sub parseArgs to parse command-line arguments
parseArgs();

#Some warnings...
#These fire whenever the value equals the default, even if the user passed
#that same value explicitly.
if ($suppress == 0){
	$output eq "structure.in" and print "Warning: Output name not specified, using default of \"structure.in\"", "\n";
	$missing eq "-9" and print "Warning: Missing data value not given; using default of \"-9\"\n", "\n";
}

#Format output if default used
#The default output file is written into the directory of the input file.
if ($output eq "structure.in"){
	my ($filepath, $dirpath) = fileparse ($input);
	$output = "$dirpath/$output";
};

#Specify iupac abiguity codes and how to write them out to structure file
#Each row is: nucleotide code, first allele, second allele
#(A=1, C=2, G=3, T=4; N and gap use the missing-data code).
my $iupac="A       1 1
C       2 2
G       3 3
T       4 4
N       $missing $missing
-       $missing $missing
R       1 3
Y       2 4
S       2 3
W       1 4
K       3 4
M       1 2 ";

my %first_line;     #nucleotide code => first allele
my %second_line;    #nucleotide code => second allele

#Build hashes of above iupac codes
for my $line (split "\n", $iupac){
	chomp $line;
	my @a = split /\s+/, $line;
	$first_line{ $a[0] } = $a[1];
	$second_line{ $a[0] } = $a[2];

};
 59 | 
 60 | #Store population identifiers for each individual (from popmap)
 61 | my %popmap;
 62 | my %enum;
 63 | my %popcodes;
 64 | if ($popmap){
 65 |     open ( POPMAP, $popmap) || die "Derp: Can't open $popmap: $!";
 66 |     my $popcount = 1;
 67 |     while (<POPMAP>){
 68 |         chomp;
 69 |         my @c = split /\s+/, $_;
 70 |         if ($enum{$c[1]}){
 71 |             $popmap{$c[0]} = $enum{$c[1]};
 72 | 						if (!exists $popcodes{$enum{$c[1]}}){
 73 | 							$popcodes{$enum{$c[1]}} = $c[1];
 74 | 						}
 75 |             #print "$c[0] is from pop# $popmap{$c[0]}\n";
 76 |         }else{
 77 |             $enum{$c[1]} = $popcount;
 78 |             $popcount++;
 79 |             $popmap{$c[0]} = $enum{$c[1]};
 80 |         }
 81 |     }
 82 | 	close POPMAP;
 83 | 	print "Population codes:\n";
 84 | 		foreach my $p (sort keys %popcodes){
 85 | 			print $p, ": ", $popcodes{$p}, "\n";
 86 | 		}
 87 | }
 88 | 
 89 | #Begin going through phylip file
 90 | my $count = 0;
 91 | 
 92 | open ( OUTFILE, ">", $output) || die "Can't open $output: $!";
 93 | open ( PHY, $input ) || die "Can't open $input: $!";
 94 | 
 95 | my $samplecount = 0;
 96 | my $snpcount = 0;
 97 | 
 98 | #data structure to hold it, so we can print in order
 99 | my %structure;
100 | 
101 | while ( my $line = <PHY> ){
102 | 	$count++;
103 | 	$count == 1 and next; #Test if $count=1, if so then skip to next iteration
104 | 
105 | 	#Split each line, store sequence name and sequence
106 | 	my @b = split /\s+/, $line;
107 | 	my @seq_array = split //, $b[1];
108 | 
109 | 	#Build first line of structure file, containing "locus IDs"
110 | 	if ($count == 2){
111 | 	    my $locus_names= "\t\t";
112 | 	    for (my $i=1; $i <= scalar @seq_array; $i++){
113 | 		$locus_names .= "$i\t";
114 | 	    }
115 | 	    chop $locus_names;
116 | 
117 | 	   if ($locnames == 1){
118 | 		 print OUTFILE "$locus_names\n";
119 | 	   }
120 | 	}
121 | 	#Begin building structure lines
122 | 	my $line_1 = "$b[0]\t";#Put in sequence name
123 | 	my $line_2 = "$b[0]\t";
124 | 	my $pop;
125 | 
126 | 	if ($popmap){
127 | 		if (exists $popmap{$b[0]}){
128 | 			$pop = $popmap{$b[0]};
129 | 			#Add pop codes
130 | 			$line_1 .= "$popmap{$b[0]}\t";
131 | 			$line_2 .= "$popmap{$b[0]}\t";
132 | 		}
133 | 		else{
134 | 			next;
135 | 		}
136 | 	}
137 | 
138 | 	if ($extra){
139 | 	   for (my $i=0; $i<$extra; $i++){
140 | 		$line_1 .= "0\t";
141 | 		$line_2 .= "0\t";
142 | 	    }
143 | 	}
144 | 
145 | 	#Start adding allele data
146 | 	for( my $i=0; $i <= $#seq_array; $i++ ){
147 | 		if ($snpcount == 0){
148 | 			$snpcount = $#seq_array;
149 | 		}else{
150 | 			if ($snpcount != $#seq_array){
151 | 				print "Warning: Sample ",$b[0], " appears to have a different number of nucleotides. Something is wrong.\n";
152 | 			}
153 | 		}
154 | 		if ($first_line{ uc $seq_array[$i] }){
155 | 		    $line_1 .= "$first_line{ uc $seq_array[$i] }\t";
156 | 		}else{
157 | 		    $line_1 .= "-9\t";
158 | 		}
159 | 		if ($second_line{ uc $seq_array[$i] }){
160 | 		    if ($oneLine == 0){
161 | 		        $line_2 .= "$second_line{ uc $seq_array[$i] }\t";
162 | 		    }else{
163 | 			$line_1 .= "$second_line{ uc $seq_array[$i] }\t";
164 | 		    }
165 | 		}else{
166 | 		    if ($oneLine==0){
167 | 		   	$line_2 .= "-9\t";
168 | 		    }else{
169 | 			$line_1 .= "-9\t";
170 | 		    }
171 | 		}
172 | 
173 | 	}
174 | 
175 | 	chop $line_1;
176 | 	chop $line_2;
177 | 
178 | 	if (not exists $structure{$pop}){
179 | 		$structure{$pop} = [];
180 | 	}
181 | 
182 | 	if ($oneLine==0){
183 | 		#print OUTFILE $line_1, "\n";
184 | 		#print OUTFILE $line_2, "\n";
185 | 		push @{ $structure{$pop} }, [$line_1, $line_2];
186 | 	}else{
187 | 		push @{ $structure{$pop} }, [$line_1];
188 | 		#print OUTFILE $line_1, "\n";
189 | 	}
190 | 	$samplecount++;
191 | 
192 | }
193 | 
194 | foreach my $pop_key (sort {$a <=> $b} keys %structure) {
195 | 	foreach my $sample (@{$structure{$pop_key}}){
196 | 		foreach my $line (@{$sample}){
197 | 			print OUTFILE $line, "\n";
198 | 		}
199 | 	}
200 | }
201 | 
202 | close PHY;
203 | close OUTFILE;
204 | print ("Done! Outputted ", $samplecount, " samples and ", $snpcount+1, " SNPs.\n");
205 | exit;
206 | 
207 | ############################SUBROUTINES######################################
208 | 
sub parseArgs{
#Parses the command line into the file-scoped option variables, prints the
#usage text and dies when --help is given, and dies when no input file was
#specified.
my $help=0;

my $usage= "\nUsage: $0 -i /path/to/phylip -p /path/to/popmap -o /path/to/output

The purpose of this script is to take a phylip-formatted file of concatenated SNPs (such as that output by the program pyRAD) and convert it to a structure-formatted file, with two lines for each individual representing the phased allele, as well as a column representing the a priori population/ locality assignment (as provided by the user in the form of a tab-delimited table).

Format of population map:
Sample1 	1
Sample2		1
Sample3		2
etc


Required Inputs

	-i, --input	-  Path to the input phylip file
	-p, --popmap	-  Path to the input population ID table
	-o, --output	-  Path to output (including desired filename)
	-n, --popN	- Percent missing data allowed per SNP per population [default=1.0]
			NOTE: Only applies when popmap provided
	-N, --globalN	- Percent missing data allowed per SNP globally [default=1.0]
			NOTE: N filters not implemented yet.

Optional inputs
	--oneLine	- Print phased alleles on one line
	-l, --loc	-  Bool, switch on printing of locus names in first row
	-e, --extra	-  Number of extra columns to insert
	-m, --missing	-  Desired code for missing data [Default is \"-9\"]
	-q, --quiet	-  Quiet mode; suppress internal warnings
	-x	- Exlude samples that are NOT in popmap

NOTE: Both gaps and N\'s will be coded as missing data.\n\n";

	#-n and -N are distinct options, so option names must be matched
	#case-sensitively; with the default (ignore_case) the two collide.
	Getopt::Long::Configure("no_ignore_case");

	my $result = GetOptions
	(
	'input|i=s'	=> \$input,
	'popmap|p=s'	=> \$popmap,
	'output|o=s'	=> \$output,
	'missing|m=s'	=> \$missing,
	'help|h!'	=> \$help,
	'extra|e=i'	=> \$extra,
	'loc|l!'	=> \$locnames,
	'quiet|q!'	=> \$suppress,
	#Both thresholds are documented as taking a proportion, so they need a
	#value type; as bare flags the number given after them was left in @ARGV
	'popN|n=f'	=> \$popN,
	'globalN|N=f' => \$globalN,
	'oneLine!' => \$oneLine
	);

	$help == 1 and die "$usage";
	$input || die "Input not specified!\n$usage";
};
261 | 


--------------------------------------------------------------------------------
/phylobarcode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys
 4 | import os
 5 | 
 6 | 
 7 | def main():
 8 | 	params = parseArgs()
 9 | 
#Object to parse command-line arguments
class parseArgs():
	"""Parse sys.argv and expose the result as attributes.

	Attributes:
		files: list of positional arguments left over after option parsing.

	The constructor prints the help menu and exits (SystemExit) when -h/--help
	is given, when getopt rejects the command line, or when no files are given.
	"""
	def __init__(self):
		#getopt is not imported at the top of this file; import it here so the
		#class works on its own
		import getopt

		#Define options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'h', \
			["help"])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		#No option sets the input files, so they are taken from the positional
		#arguments -- assumption based on the "No files provided." check below;
		#TODO confirm once the script body is written
		self.files = list(remainder)

		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			arg = arg_raw.replace(" ","")
			arg = arg.strip()
			opt = opt.replace("-","")
			#print(opt,arg)
			if opt == "h" or opt == "help":
				continue
			else:
				assert False, "Unhandled option %r"%opt

		#Check manditory options are set
		if not self.files:
			self.display_help("No files provided.")



	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print()
			print (message)
		print ("\n<template.py>\n")
		print("Author: Tyler K Chafin, University of Arkansas")
		print ("Contact: tkchafin@uark.edu")
		print ("Description: ")
		print("""

""")
		print()
		sys.exit()
60 | 
#Script entry point: call main() only when this file is executed directly,
#not when it is imported as a module
if __name__ == '__main__':
    main()
64 | 


--------------------------------------------------------------------------------
/process_ecoevolity.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | pre=$1
 4 | 
 5 | mkdir $pre-output
 6 | 
 7 | cd $pre-output
 8 | 
 9 | echo "Sumchains...."
10 | pyco-sumchains -s 100 ../$pre-state-run-*.log &> $pre-sumchains.txt
11 | 
12 | echo "Getting optimal number for burnin..."
13 | ch=$pre"-sumchains.txt"
14 | samps=`grep "maximized" $ch | sed 's/.*: //g' | sed 's/ samples.*//g'`
15 | 
16 | echo "Removing $samps samples!"
17 | 
18 | echo "sumcoevolity..."
19 | yam=$pre".yaml"
20 | p=$pre"-"
21 | sumcoevolity -b $samps -c ../$yam -p $p -n 1000000 ../$pre-state-run*.log
22 | 
23 | echo "pyco-sumevents...."
24 | pyco-sumevents -p $p -f $pre-sumcoevolity-results-nevents.txt
25 | 
26 | echo "pyco-sumtimes..."
27 | pyco-sumtimes -p $p -f -b $samps -z ../$pre-state*.log
28 | 
29 | 


--------------------------------------------------------------------------------
/pseudoHaploidize.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | import random
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	seqs = dict() #key=FASTA header; val=sequence
 13 | 
 14 | 	#Now, get the alignment from the FASTA file
 15 | 	#note that this works fine with interleaved FASTA
 16 | 	if params.fasta:
 17 | 		print('Reading alignment from FASTA...')
 18 | 		for f in read_fasta(params.fasta):
 19 | 			seqs[f[0]] = list(f[1])
 20 | 
 21 | 
 22 | 	#get indices of all multi-allele sites, then randomly resolve each
 23 | 	mults = ["R", "Y", "S", "W", "K", "M", "D", "H", "B", "V"]
 24 | 
 25 | 	for key in (seqs.keys()):
 26 | 		#get indices of multi-allelic sites
 27 | 		idxs = [i for i, c in enumerate(seqs[key]) if c.upper() in mults]
 28 | 
 29 | 		#loop through amiguities, replace each with a new one
 30 | 		for i in idxs:
 31 | 			#print(seqs[key][i], end=" - ")
 32 | 			seqs[key][i] = sampleAllele(seqs[key][i])
 33 | 			#print(seqs[key][i])
 34 | 
 35 | 	#write new FASTA outputs
 36 | 	for samp in seqs.keys():
 37 | 		seqs[samp] = "".join(seqs[samp])
 38 | 	if (params.split):
 39 | 		for samp in seqs.keys():
 40 | 			fname = samp + "_" + params.out
 41 | 			sd = dict()
 42 | 			sd[samp] = seqs[samp]
 43 | 			write_fasta(fname, sd)
 44 | 	else:
 45 | 		write_fasta(params.out, seqs)
 46 | 
 47 | #Function to write fasta-formatted sequences
 48 | def write_fasta(f, aln):
 49 | 
 50 | 	with open(f, 'w') as fh:
 51 | 		try:
 52 | 			for samp in aln.keys():
 53 | 				ol = ">" + str(samp) + "\n" + str(aln[samp]) + "\n"
 54 | 				fh.write(ol)
 55 | 		except IOError as e:
 56 | 			print("Could not read file %s: %s"%(f,e))
 57 | 			sys.exit(1)
 58 | 		except Exception as e:
 59 | 			print("Unexpected error reading file %s: %s"%(f,e))
 60 | 			sys.exit(1)
 61 | 		finally:
 62 | 			fh.close()
 63 | 
 64 | #function to randomly sample an allele given an ambiguity code
 65 | def sampleAllele(ch):
 66 | 	return(random.choice(get_iupac(ch.upper())))
 67 | 
 68 | #Function to split character to IUPAC codes, assuing diploidy
 69 | def get_iupac(char):
 70 | 	iupac = {
 71 | 		"A"	: ["A"],
 72 | 		"G"	: ["G"],
 73 | 		"C"	: ["C"],
 74 | 		"T"	: ["T"],
 75 | 		"N"	: ["N"],
 76 | 		"-"	: ["-"],
 77 | 		"R"	: ["A","G"],
 78 | 		"Y"	: ["C","T"],
 79 | 		"S"	: ["G","C"],
 80 | 		"W"	: ["A","T"],
 81 | 		"K"	: ["G","T"],
 82 | 		"M"	: ["A","C"],
 83 | 		"B"	: ["C","G","T"],
 84 | 		"D"	: ["A","G","T"],
 85 | 		"H"	: ["A","C","T"],
 86 | 		"V"	: ["A","C","G"]
 87 | 	}
 88 | 	return iupac[char]
 89 | 
 90 | #function returns all indices
 91 | def find(str, opts):
 92 | 	return [i for i, ltr in enumerate(s) if ltr == ch]
 93 | 
 94 | #Read samples as FASTA. Generator function
 95 | def read_fasta(fas):
 96 | 
 97 | 	if os.path.exists(fas):
 98 | 		with open(fas, 'r') as fh:
 99 | 			try:
100 | 				contig = ""
101 | 				seq = ""
102 | 				for line in fh:
103 | 					line = line.strip()
104 | 					if not line:
105 | 						continue
106 | 					#print(line)
107 | 					if line[0] == ">": #Found a header line
108 | 						#If we already loaded a contig, yield that contig and
109 | 						#start loading a new one
110 | 						if contig:
111 | 							yield([contig,seq]) #yield
112 | 							contig = "" #reset contig and seq
113 | 							seq = ""
114 | 						split_line = line.split()
115 | 						contig = (split_line[0].replace(">",""))
116 | 					else:
117 | 						seq += line
118 | 				#Iyield last sequence, if it has both a header and sequence
119 | 				if contig and seq:
120 | 					yield([contig,seq])
121 | 			except IOError:
122 | 				print("Could not read file ",fas)
123 | 				sys.exit(1)
124 | 			finally:
125 | 				fh.close()
126 | 	else:
127 | 		raise FileNotFoundError("File %s not found!"%fas)
128 | 
129 | 
#Object to parse command-line arguments
class parseArgs():
	"""Parse sys.argv and expose the options as attributes.

	Attributes:
		fasta: input FASTA path (-f/--fasta, mandatory).
		out: output file name (-o/--out); defaults to the input name with its
			extension replaced by "_hap.fasta".
		split: True when -s/--split was given.

	The constructor prints the help menu and exits (SystemExit) when -h/--help
	is given, when getopt rejects the command line, or when no FASTA is given.
	"""
	def __init__(self):
		#Define options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'f:so:h', \
			["out=", "help", "fasta=", "split"])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.fasta=None
		self.out=None
		self.split=False

		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			#all spaces are removed from option values
			arg = arg_raw.replace(" ","")
			arg = arg.strip()
			opt = opt.replace("-","")
			#print(opt,arg)
			if opt =="f" or opt=="fasta":
				self.fasta = arg
			elif opt =="o" or opt=="out":
				self.out = arg
			elif opt == "s" or opt == "split":
				self.split=True
			elif opt =="h" or opt == "help":
				pass
			else:
				assert False, "Unhandled option %r"%opt

		#Check manditory options are set
		if not self.fasta:
			self.display_help("Must provide FASTA file <-f,--fasta>")

		#get output prefix if not set by user
		if not self.out:
			self.out = os.path.splitext(self.fasta)[0] + '_hap.fasta'

	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print()
			print (message)
		print ("\npseudoHaploidize.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		#the output name is set with -o (the usage line used to say -f)
		print ("\nUsage: ", sys.argv[0], "-f <input.fasta> [-s] [-o example_hap]\n")
		print ("Description: Creates a pseudo-haploid sequence from input fasta, randomly resolving heterozygous sites")

		print("""
	Arguments:
		-f,--fasta	: Input fasta sequence
		-s,--split	: [Boolean] Write outputs each to their own output file
		-o,--out	: Output file name [default=input_hap.fasta or samp_input_hap.fasta if -s]
		-h,--help	: Displays help menu
""")
		print()
		sys.exit()
194 | 
#Script entry point: call main() only when this file is executed directly,
#not when it is imported as a module
if __name__ == '__main__':
    main()
198 | 


--------------------------------------------------------------------------------
/pyrad2fasta.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Getopt::Long;
 6 | use File::Path;
 7 | 
 8 | # Declare variables
 9 |     
10 |     my $line; 
11 |     my @loci;
12 |     my $workdir=""; 
13 |     my @fasta;
14 |     my $i = 1;
15 |     my $input;
16 |     my $batch;
17 | 	 
18 | parseArgs();
19 | 
20 | my $output="loci";
21 | $batch and $output.=$batch;
22 | 
23 | # open file and read it in
24 | open( LOCI, $input ) || die "Can't open $input: $!\n";
25 | # Make the loci directory to put fasta files from pyrad2fasta subroutine
26 | chdir $workdir;
27 | rmtree $output;
28 |     mkdir $output;
29 |     chdir $output;
30 |     
31 | while ( $line = <LOCI> ){
32 | 
33 |     
34 |     if( $line =~ /^\/\// ){
35 |         if( $line =~ /\*|\-/ ){
36 |             pyrad2fasta( @loci, $i );
37 | 			undef( @loci );
38 | 			$i++;
39 | 
40 |         }else{
41 |             undef( @loci );
42 |         }	
43 |     }else{
44 |         push @loci, $line;
45 | 		
46 |     }
47 |     
48 | 
49 | }
50 | close LOCI;
51 | exit;
52 | 
53 | ###########################SUBROUTINES###################################
54 | 
sub parseArgs{ 
	#Parses the command line into the file-scoped variables $input, $workdir
	#and $batch, and dies when the input file or the working directory is
	#missing.
	#Message to print if mandatory variables not declared
	my $usage ="\npyrad2fasta.pl takes the custom .loci output from pyRAD and creates a new FASTA file for each locus containing at least 1 SNP.

Usage: $0 --i /path/to/*.loci --w /path/to/workdir   

Mandatory 
	-i, --input	-  path to the input file (*.loci from pyRAD)  
	-w, --workdir	-  path to working directoy (new fasta files will be placed within /workdir/loci 

Optional
	-b, --batch	- Provide a batch number

\n";

	my $options = GetOptions 
		( 
		'input|i=s'		=>	\$input,
		'workdir|w=s'		=> 	\$workdir,
		'batch|b=i'		=>	\$batch,
		);
		
	#Both the input file and the working directory are mandatory
	$input or die "\n\nError: Input not specified!\n\n$usage\n"; 
	if ( $workdir eq ""){die "\nDerp: Working directory not specified!\n\n"}; 
}
80 | 
81 | #########################################################################
82 | 
sub pyrad2fasta{
	#Writes the lines collected in the file-scoped @loci to "$i.fasta" in the
	#current directory: the first whitespace-delimited field of each line on
	#one output line, the second field on the next.
	#The arguments passed by the caller are not used; the sub reads the
	#file-scoped @loci and $i directly.

	#Nothing to write for an empty locus (no file is created)
	@loci or return;

	#Open the output once per locus instead of once per line, and close it
	#explicitly when done. Append mode is kept from the original.
	open( my $out_fh, '>>', "$i.fasta" ) || die "Error.  Can't write to $i.fasta: $!\n\n";

	for my $element ( @loci ){
		# split at whitespace (leading whitespace is ignored)
		my @fields = split( ' ', $element );
		#Skip blank or incomplete lines instead of printing undefined values
		@fields >= 2 or next;
		# Print the loci in FASTA format
		print $out_fh $fields[0], "\n";
		print $out_fh $fields[1], "\n";
	}

	close $out_fh or die "Error.  Can't close $i.fasta: $!\n\n";
}
98 | 


--------------------------------------------------------------------------------
/python_template.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys
 4 | import os
 5 | import getopt
 6 | 
 7 | def main():
 8 | 	params = parseArgs()
 9 | 	
10 | 	
11 | 
#Command-line argument parser
class parseArgs():
	"""Parse sys.argv and expose the options as attributes.

	Attributes:
		one2many: value of -1/--one2many, or None.
		many2one: value of -M/--many2one, or None.
		width: value of -w/--width as int (default 60).
		out: value of -o/--out (default "out.fas").

	The constructor prints the help menu and exits (SystemExit) when -h/--help
	is given, when getopt rejects the command line, or when neither input
	option was supplied.
	"""
	def __init__(self):
		short_opts = 'h1:M:w:o:'
		long_opts = ["help", "one2many=","many2one=","width=","out="]
		try:
			options, remainder = getopt.getopt(sys.argv[1:], short_opts, long_opts)
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")

		#defaults
		self.one2many=None
		self.many2one=None
		self.width=60
		self.out="out.fas"

		#a help request wins over everything else
		if any(flag in ("-h", "-help", "--help") for flag, _ in options):
			self.display_help("Exiting because help menu was called.")

		#store the remaining options
		for flag, raw_value in options:
			#spaces are removed from values, hyphens from option names
			value = raw_value.replace(" ","").strip()
			key = flag.replace("-","")
			if key in ("h", "help"):
				continue
			if key in ("one2many", "1"):
				self.one2many=value
			elif key in ("many2one", "M"):
				self.many2one=value
			elif key in ("width", "w"):
				self.width=int(value)
			elif key in ("out", "o"):
				self.out=value
			else:
				assert False, "Unhandled option %r"%key

		#at least one of the two input options is required
		if not (self.one2many or self.many2one):
			self.display_help("No files provided.")



	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print()
			print (message)
		header_lines = (
			"\nfastaFormatter.py\n",
			"Author: Tyler K Chafin, University of Arkansas",
			"Contact: tkchafin@uark.edu",
			"Description:Right now just converts b/n multi-line and one-line fasta formats, might add later",
		)
		for text in header_lines:
			print(text)
		print("""
		-1,--one2many	: Path to fasta file to multi-line format
		-M,--many2one	: Path to fasta file to convert to one-line format
		-w,--width	: Characters per line for multi-line (default: 60)
		-o,--out	: Output file name (default=out.fas)
""")
		print()
		sys.exit()
76 | 
#Script entry point: call main() only when this file is executed directly,
#not when it is imported as a module
if __name__ == '__main__':
    main()
80 | 


--------------------------------------------------------------------------------
/revTransAll.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | from itertools import product
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 	
 11 | 	codon_table = dict()
 12 | 	
 13 | 	if params.code is None:
 14 | 		codon_table = get_standard_code()
 15 | 	else:
 16 | 		codon_table = read_code_file(params.code)
 17 | 	
 18 | 	#print(codon_table)
 19 | 	
 20 | 	nucs = dict()
 21 | 	current=None
 22 | 	curr_index=1
 23 | 	for aa in read_fasta(params.input):
 24 | 		if current is None:
 25 | 			current = aa[0]
 26 | 		elif current != aa[0]:
 27 | 			current = aa[0]
 28 | 			curr_index = 1
 29 | 		for trans in get_all_revtrans(aa[1], codon_table):
 30 | 			header=str(aa[0]) + "_translation-" + str(curr_index)
 31 | 			nucs[header] = trans
 32 | 			curr_index +=1
 33 | 	
 34 | 	write_fasta(params.out, nucs)
 35 | 
 36 | #generator function
 37 | def get_all_revtrans(aa, code):
 38 | 	possibilities = list()
 39 | 	for pos in aa:
 40 | 		possibilities.append(list(code[pos.upper()]))
 41 | 		
 42 | 	for nuc in product(*possibilities):
 43 | 		yield("".join(nuc))
 44 | 
 45 | def read_code_file(file):
 46 | 	d = dict()
 47 | 	if os.path.exists(file):
 48 | 		with open(file, 'r') as fh:
 49 | 			try:
 50 | 				num=0
 51 | 				ret = dict()
 52 | 				for line in fh:
 53 | 					line = line.strip()
 54 | 					if not line:
 55 | 						continue
 56 | 					num += 1
 57 | 					if num == 1:
 58 | 						continue
 59 | 					arr = line.split()
 60 | 					if arr[0] not in d:
 61 | 						d[arr[0].upper()] = list()
 62 | 					d[arr[0].upper()].append(arr[1].upper())
 63 | 					
 64 | 				return(d)
 65 | 			except IOError:
 66 | 				print("Could not read file ",file)
 67 | 				sys.exit(1)
 68 | 			finally:
 69 | 				fh.close()
 70 | 	else:
 71 | 		raise FileNotFoundError("File %s not found!"%file)
 72 | 	return(d)
 73 | 
 74 | 
 75 | def get_standard_code():
 76 | 	d = {
 77 | 		'*' : ['TAA','TAG','TGA'],
 78 | 		'A' : ['GCA','GCC','GCG','GCT'],
 79 | 		'C' : ['TGC','TGT'],
 80 | 		'D' : ['GAC','GAT'],
 81 | 		'E' : ['GAA','GAG'],
 82 | 		'F' : ['TTC'],
 83 | 		'G' : ['GGA','GGC','GGG','GGT'],
 84 | 		'H' : ['CAC','CAT'],
 85 | 		'I' : ['ATA','ATC','ATT'],
 86 | 		'K' : ['AAA','AAG'],
 87 | 		'L' : ['CTA','CTC','CTG','CTT','TTA','TTG'],
 88 | 		'M' : ['ATG'],
 89 | 		'N' : ['AAC','AAT'],
 90 | 		'P' : ['CCA','CCC','CCG','CCT'],
 91 | 		'Q' : ['CAA','CAG'],
 92 | 		'R' : ['AGA','AGG','CGA','CGC','CGG','CGT'],
 93 | 		'S' : ['AGC','AGT','TCA','TCC','TCG','TCT'],
 94 | 		'T' : ['ACA','ACC','ACG','ACT'],
 95 | 		'V' : ['GTA','GTC','GTG','GTT'],
 96 | 		'W' : ['TGG'],
 97 | 		'Y' : ['TAC','TAT']
 98 | 	}
 99 | 	return(d)
100 | 
def write_fasta(f, aln):
	"""Write the sequences in aln to the file f in FASTA format.

	Args:
		f: path of the output file (overwritten if it exists).
		aln: mapping of record name -> sequence; each entry is written as a
			">name" line followed by one sequence line.

	Prints a message and exits with status 1 if the file cannot be opened or
	written.
	"""
	try:
		#open() is inside the try so that a failure to create the file is
		#reported the same way as a failed write; the with-statement closes
		#the handle, no explicit close() is needed
		with open(f, 'w') as fh:
			for samp, seq in aln.items():
				fh.write(">" + str(samp) + "\n" + str(seq) + "\n")
	except IOError as e:
		print("Could not write file %s: %s"%(f,e))
		sys.exit(1)
	except Exception as e:
		print("Unexpected error writing file %s: %s"%(f,e))
		sys.exit(1)
115 | 
def read_fasta(fas):
	"""Yield [name, sequence] for each record in the FASTA file fas.

	The name is the first whitespace-delimited word of the header line with
	every ">" removed; sequence lines of a record are concatenated. Blank
	lines are ignored. The last record is only yielded when it has a sequence.

	Raises FileNotFoundError when fas does not exist; prints a message and
	exits with status 1 on a read error.
	"""
	if not os.path.exists(fas):
		raise FileNotFoundError("File %s not found!"%fas)

	with open(fas, 'r') as handle:
		try:
			record_name = ""
			pieces = []
			for raw_line in handle:
				stripped = raw_line.strip()
				if len(stripped) == 0:
					continue
				if stripped[0] != ">":
					#sequence line: collect it for the current record
					pieces.append(stripped)
					continue
				#header line: emit the record collected so far, then start anew
				if record_name:
					yield([record_name, "".join(pieces)])
					pieces = []
				record_name = stripped.split()[0].replace(">","")
			#last record, if it has both a header and sequence
			if record_name and pieces:
				yield([record_name, "".join(pieces)])
		except IOError:
			print("Could not read file ",fas)
			sys.exit(1)
148 | 
#Object to parse command-line arguments
class parseArgs():
	"""Parse sys.argv and expose the options as attributes.

	Attributes:
		input: input FASTA path (-i/--in, mandatory).
		code: path to a codon table (-c/--code), or None to use the standard
			code.
		out: output file name (-o/--out, default "out.fas").

	The constructor prints the help menu and exits (SystemExit) when -h/--help
	is given, when getopt rejects the command line, or when no input is given.
	"""
	def __init__(self):
		#Define options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'hi:c:o:', \
			["help", "in=", "code=", "out="])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.input=None
		self.code=None
		self.out="out.fas"


		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			#all spaces are removed from option values
			arg = arg_raw.replace(" ","")
			arg = arg.strip()
			opt = opt.replace("-","")
			#print(opt,arg)
			if opt == "h" or opt == "help":
				continue
			elif opt=="i" or opt=="in":
				self.input = arg
			elif opt=="c" or opt=="code":
				self.code = arg
			elif opt=="out" or opt=="o":
				self.out = arg
			else:
				assert False, "Unhandled option %r"%opt

		#Check manditory options are set
		if not self.input:
			self.display_help("No input provided.")

		#The codon table is optional. Only report that the default is used:
		#display_help() exits, which made the documented default unreachable.
		if not self.code:
			print("No code provided. Using default.")



	def display_help(self, message=None):
		"""Print an optional message followed by the help menu, then exit."""
		if message is not None:
			print()
			print (message)
		print ("\nrevTransAll.py\n")
		print("Author: Tyler Chafin")
		print ("Contact: tyler.chafin@colorado.edu")
		print ("Description: Gives all possible reverse translations for a amino acid sequence")
		print("""
		-i,--in 	: Input file name (FASTA format)
			format:
			>my_sequence
			MFLIMVVFPTTAASVMMVMMV...
		-c,--code	: Tab-delimited codon table 
			format:
			F	TTT
			F	TTC
			F	TTA
			F	TTG
			L	CTT
			...
			...
			<NOTE: If none supplied, will use 'standard' code>
		-o,--out	: Output file name (default=out.fas)
			format:
			>my_sequence_translation-1
			ATGATGAT...
			>my_sequence_translation-2
			ATGATCAT...
			...
			...
""")
		print()
		sys.exit()
231 | 
#Script entry point: call main() only when this file is executed directly,
#not when it is imported as a module
if __name__ == '__main__':
	main()
235 | 


--------------------------------------------------------------------------------
/seq2structure.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | use strict; 
  4 | use warnings; 
  5 | use Getopt::Long qw( :config posix_default no_ignore_case );
  6 | use File::Basename;
  7 | 
  8 | #Initialize scalars
  9 | my $input; 
 10 | my $popmap;
 11 | my $output="structure.in";
 12 | my $missing="-9";  
 13 | my $suppress=0; 
 14 | my $type; 
 15 | my $snp = 0; 
 16 | #Call sub parseArgs to parse command-line arguments
 17 | parseArgs();
 18 | 
 19 | 
 20 | my $message = "#File created by seq2structure.pl; script by Tyler K. Chafin last updated 12-Dec-14";
 21 | 
 22 | #Some warnings...
 23 | if ($suppress == 0){
 24 | 	$output eq "structure.in" and print "Warning: Output name not specified, using default of \"structure.in\"", "\n";
 25 | 	$missing eq "-9" and print "Warning: Missing data value not given; using default of \"-9\"\n", "\n";
 26 | }
 27 | 
 28 | #Format output if default used
 29 | if ($output eq "structure.in"){
 30 | 	my ($filepath, $dirpath) = fileparse ($input); 
 31 | 	#$output = "$output"; 
 32 | };
 33 | 
 34 | #Specify iupac abiguity codes and how to write them out to structure file
 35 | my $iupac="A       1 1
 36 | C       2 2
 37 | G       3 3
 38 | T       4 4
 39 | N       $missing $missing
 40 | -       $missing $missing
 41 | R       1 3
 42 | Y       2 4
 43 | S       2 3
 44 | W       1 4
 45 | K       3 4
 46 | M       1 2 ";
 47 | 
 48 | 
 49 | my %first_line; 
 50 | my %second_line;
 51 | 
 52 | #Build hashes of above iupac codes
 53 | for my $line (split "\n", $iupac){
 54 | 	chomp $line; 
 55 | 	my @a = split /\s+/, $line; 
 56 | 	$first_line{ $a[0] } = $a[1]; 
 57 | 	$second_line{ $a[0] } = $a[2];
 58 | 
 59 | };
 60 | 
 61 | #Store population identifiers for each individual (from popmap)
 62 | open ( POPMAP, $popmap) || die "Derp: Can't open $popmap: $!";
 63 | 	my %popmap;
 64 | 	    while (<POPMAP>){
 65 | 	    chomp;
 66 | 	    my @c = split /\s+/, $_;
 67 | 	    $popmap{$c[0]} = $c[1];
 68 | 	#print "$c[0] is from pop# $popmap{$c[0]}\n";
 69 | }
 70 | close POPMAP;
 71 | 
 72 | #Begin going through phylip file
 73 | my $count = 0;
 74 | my @b; 
 75 | my @seq_array;
 76 | 
 77 | open ( OUTFILE, ">$output") || die "Can't open $output: $!";
 78 | open ( PHY, $input ) || die "Can't open $input: $!";
 79 | 
 80 | while ( my $line = <PHY> ){
 81 | 	$count++; 
 82 | 		
 83 | 	chomp $line; 
 84 | 	#Split each line, store sequence name and sequence 
 85 | 	if ($type =~ /p/i){
 86 | 		$count == 1 and next; #Test if $count=1, if so then skip to next iteration.
 87 | 		@b = split /\s+/, $line; 
 88 | 		@seq_array = split //, $b[1]; 
 89 | 	
 90 | 		#Build first line of structure file, containing "locus IDs"
 91 | 		if ($count == 2){
 92 | 			#my $locus_names= "\t\t";
 93 | 			#for (my $i=1; $i <= scalar @seq_array; $i++){
 94 | 				#$locus_names .= "$i\t"; 
 95 | 			#}
 96 | 			#chop $locus_names;
 97 | 			#print OUTFILE "$message\n";
 98 | 			#print OUTFILE "$locus_names\n";
 99 | 		}
100 | 	}
101 | 	
102 | 	if ($type =~ /f/i){
103 | 		
104 | 		if ($count ==2){
105 | 			#my $locus_names= "\t\t";
106 | 		
107 | 			#for (my $i=1; $i <= scalar(@seq_array); $i++){
108 | 				#$locus_names .= "$i\t"; 
109 | 			#}
110 | 			#chop $locus_names;
111 | 			#print OUTFILE "$message\n";
112 | 			#print OUTFILE "$locus_names\n";
113 | 		}
114 | 		
115 | 		if ($line =~ /^\>(\S+)/){ 
116 | 			$b[0] = $1; 
117 | 			next; 
118 | 		}elsif ($line =~ /[ACGT]+/i){ 
119 | 			@seq_array = split //, $line; 
120 | 		}else{
121 | 			
122 | 			next; 
123 | 		}
124 | 		
125 | 	}
126 | 		
127 | 		 
128 | 	#Begin building structure lines
129 | 	my $line_1 = "$b[0]\t";#Put in sequence name
130 | 	my $line_2 = "$b[0]\t";
131 | 
132 | 	#Add pop codes
133 | 	$line_1 .= "$popmap{$b[0]}\t";
134 | 	$line_2 .= "$popmap{$b[0]}\t";
135 | 
136 | 	#Start adding allele data
137 | 	for( my $i=0; $i <= $#seq_array; $i++ ){ 
138 | 		$line_1 .= "$first_line{ uc $seq_array[$i] }\t"; 
139 | 		$line_2 .= "$second_line{ uc $seq_array[$i] }\t";  
140 | 	}
141 | 	
142 | 	chop $line_1; 
143 | 	chop $line_2;
144 | 	
145 | 	print OUTFILE $line_1, "\n"; 
146 | 	print OUTFILE $line_2, "\n"; 
147 | 	print "Sample $b[0] done...\n";
148 | 	$count++;
149 | }
150 | 
151 | close PHY;
152 | close OUTFILE;
153 | 
#If SNP check toggled on, write a second file holding only the variable columns
my $loci=0;
if ($snp == 1){
	open (my $str_fh, "<", $output) || die "Cannot open $output for reading: $!\n";
	my @comments; #Comment lines, reprinted at the top of the new file
	my @rows;     #One array ref of tab-separated fields per genotype row
	while (my $row = <$str_fh>){
		chomp $row;
		#If line is a comment, capture to reprint later
		if ($row =~ /^#/){
			push @comments, $row;
			next;
		}
		#If line contains variable number of spaces and nothing else, skip
		$row =~ /^ *$/ and next;
		push @rows, [ split /\t/, $row ];
	}
	close $str_fh;
	@rows or die "No genotype rows found in $output; nothing to filter.\n";

	#Widest row decides how many columns need checking
	my $width = 0;
	for my $row (@rows){
		scalar(@$row) > $width and $width = scalar(@$row);
	}

	#Columns 0 and 1 (sample and popID) are always kept
	my @keep = (0, 1);
	for my $col (2 .. $width - 1){
		my %alleles;
		for my $row (@rows){
			my $call = $row->[$col];
			defined $call or next;
			#Missing data is not an allele; it must not turn a fixed site into a SNP
			$call eq $missing and next;
			$alleles{$call} = 1;
		}
		#If column doesn't contain a SNP, leave it out
		scalar(keys %alleles) > 1 or next;
		push @keep, $col;
		$loci++;
	}

	#Build new structure file containing only the SNPs
	my $ind = (scalar(@rows)/2); #Two rows per individual
	print "\n######################################\n\n";
	print "Number of Individuals: $ind\nNumber of SNPs discovered: $loci\n";
	#Prefix the file name only, so an output path with directories stays valid
	my ($outname, $outdir) = fileparse($output);
	$output =~ m{[/\\]} or $outdir = ""; #Keep bare file names free of a "./" prefix
	my $outfile = $outdir . "N" . $ind ."-" . "L" . $loci . "_" . $outname;
	print "\nWriting $outfile...\n\n";
	open (my $new_out, ">", $outfile) || die "Can't open $outfile for writing: $!\n";
	@comments and print $new_out join("\n", @comments), "\n";
	for my $row (@rows){
		print $new_out join("\t", grep { defined } @{$row}[@keep]), "\n";
	}
	close $new_out;
}


exit;
218 | 
219 | ############################SUBROUTINES######################################
220 | 
sub parseArgs{
	#Fills the file-scoped option variables ($input, $popmap, $output, $missing,
	#$snp, $type, $suppress) from the command line. Dies with the usage text when
	#help is requested, an option cannot be parsed, or a required option is missing.

	my $help=0;

	my $usage= "\nUsage: $0 -i /path/to/seqfile -p /path/to/popmap -t phylip|fasta -o /path/to/output

The purpose of this script is to take a phylip or fasta-formatted file of concatenated SNPs (such as that output by the program pyRAD) and convert it to a structure-formatted file, with two lines for each individual representing the phased allele, as well as a column representing the a priori population/ locality assignment (as provided by the user in the form of a tab-delimited table).

Format of population map: 
Sample1 	1
Sample2		1
Sample3		2
etc


Required Inputs

	-i, --input	-  Path to the input sequencefile
	-p, --popmap	-  Path to the input population ID table
	-o, --output	-  Path to output (including desired filename)
	-t, --type	-  Input file type (phylip or fasta)

Optional inputs

	-m, --missing	-  Desired code for missing data [Default is \"-9\"]
	-s, --snp	-  Check for SNPs; only write snps to str file
	-q, --quiet	-  Quiet mode; suppress internal warnings

NOTE: Both gaps and N\'s will be coded as missing data.
NOTE: Script assumes a perfect alignment (same length, gaps and N's inserted where needed).
NOTE: SNP checking currently not functional. 
NOTE: Script does not create a row in structure file for locus names. Will add this functionality back in later, if necessary.
TODO: Add built-in check for filetype so it doesn\'t need to be specified.\n\n";

	my $result = GetOptions
	(
	'input|i=s'	=> \$input,
	'popmap|p=s'	=> \$popmap,
	'output|o=s'	=> \$output,
	'missing|m=s'	=> \$missing,
	'help|h!'	=> \$help,
	'snp|s!'	=> \$snp, 
	'type|t=s'	=> \$type,  
	'quiet|q!'	=> \$suppress,
	);

	#GetOptions has already printed what it could not parse
	$result or die "Could not parse command-line options.\n$usage";
	$help == 1 and die "$usage";
	$input || die "Input not specified!\n$usage";
	$popmap || die "Popmap not provided!\n$usage";
	$type || die "Input file type not provided!\n$usage";
	#The converter matches the type loosely against p(hylip) or f(asta)
	$type =~ /[pf]/i or die "Unrecognized input file type \"$type\"!\n$usage";
};
272 | 	
273 | 


--------------------------------------------------------------------------------
/short2fullPopmap.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | use warnings;
 4 | use Getopt::Std;
 5 | 
 6 | my %opts;
 7 | getopts('i:s:c:h', \%opts);
 8 | 
 9 | if ($opts{h}){
10 | 	&help;
11 | 	die "Exiting because help menu was called.\n\n"
12 | }
13 | 
14 | my ($names, $short, $num, $out) = &parseArgs(\%opts);
15 | 
16 | open (my $fhs, $short) or die "Can't open short\n";
17 | 
18 | my %hash;
19 | while (my $row = <$fhs>){
20 | 	chomp $row;
21 | 	my @arr = split "\t", $row;
22 | 	if  (!exists $hash{$arr[0]}){
23 | 		$hash{$arr[0]} = $arr[1];
24 | 		#print $arr[0]," is ", $hash{$arr[0]}, "\n";
25 | 	}
26 | }
27 | close $fhs;
28 | 
29 | 
30 | open (my $fhn, $names) or die "Can't open names\n";
31 | open (my $outfh, ">$out") or die "Can't open output file for writing\n";
32 | 
33 | while (my $name = <$fhn>){
34 | 	chomp $name;
35 | 	my $n = substr $name, 0, $num;
36 | 	if (exists $hash{$n}){
37 | 		print $outfh $name, "\t", $hash{$n}, "\n";
38 | 	}else{
39 | 		print "$name ($n) doesn't match anything", "\n";
40 | 	}
41 | }
42 | close $fhn;
43 | close $out;
44 | 
45 | exit;
46 | 
47 | ###############################################################################
48 | ################################ Subroutines ##################################
49 | ###############################################################################
50 | 
# Prints the help menu to standard output; takes no arguments
sub help{

	my @menu = (
		"\nLazy script to create full popmap from a prefix popmap\n\n",
		"Program Options:\n",
		"\t-i:\tText file with list of sample names\n",
		"\t-s:\tTab-delimited prefix names\n",
		"\t-c:\tNumber of characters used for prefix\n",
		"\t-o:\tOutput file name\n",
		"\t-h:\tBoolean. Calls help menu.\n\n",
	);
	print for @menu;

}
64 | 
65 | 
# Validates the parsed command-line options.
# Takes a reference to the getopts hash and returns, in order: the sample-name
# file (-i), the prefix popmap (-s), the prefix length (-c) and the output name
# (-o, falling back to "output.popmap"). Dies when -i, -s or -c is absent or false.
sub parseArgs{

	my %given = %{ $_[0] };

	$given{i} or die "File with sample names not given\n";
	$given{s} or die "File with prefix popmap not given\n";
	$given{c} or die "Number of characters not given\n";

	my $destination = $given{o} ? $given{o} : "output.popmap";

	return( $given{i}, $given{s}, $given{c}, $destination );

}
81 | 


--------------------------------------------------------------------------------
/slidingWindowGC.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | 
  4 | use strict; 
  5 | use warnings; 
  6 | use Getopt::Long; 
  7 | 
  8 | 
  9 | our $input=""; 
 10 | our $win=100; 
 11 | our $inc=50;
 12 | 
 13 | parseArgs(); #Call subroutine to parse arguments... 
 14 | 
 15 | my $dna; 
 16 | my $name = "";  
 17 | my $header = ""; 
 18 | open ( FAS, "$input" ) || die "\nDerp: Can't open $input!\n$!\n"; 
 19 | 
 20 | 
 21 | #This block submits slidingWindowGC for each separate sequence in fasta file.
 22 | while (<FAS>) { 
 23 |     chomp $_; 
 24 |     if ( $_ =~ m/^>(\w+)/){ 
 25 | 	$header = "$1";  #New sample name stored
 26 | 	if ( $dna ) {
 27 | 	    print "\n$name\n\n";
 28 | 	    slidingWindowGC( $dna, 0 ) ;
 29 | 	}     
 30 | 	undef $dna;
 31 | 	$name = $header; 
 32 |  
 33 |         }else{
 34 | 	    $dna .= $_;  
 35 | 	}
 36 |     }     
 37 | 
 38 | print "\n$name\n\n"; 
 39 | slidingWindowGC( $dna, 0 ); 
 40 | 
 41 | 
 42 | 
 43 | close FAS; 
 44 | 
 45 | 
 46 | 
 47 | 
 48 | #############################################SUBROUTINES###############################################
 49 | 
 50 | #Subroutine to parse command line arguments
 51 | sub parseArgs{
 52 | 
 53 |     my $usage = "\nUsage: $0 --input=file.fasta --window=100 --increment=50
 54 | 
 55 |     mandatory
 56 |        --file        -  FASTA file containing sequences; the first sequence in the file will be used
 57 |        --window      -  window length (default=100) 
 58 |        --increment   -  increment length; how far to shift each window (default=50) \n\n";
 59 | 
 60 | 
 61 |                  my $result = GetOptions
 62 |                          (
 63 |                                  'file=s'     => \$input,
 64 | 				 'window=s'   => \$win,
 65 |                                  'increment=s'=> \$inc,
 66 |                                
 67 |                          );
 68 |              
 69 | 	        $input eq "" and die $usage;  #Die if mandatory variables undefined
 70 | 		$win==100 and print "\nWarning: No window length defined- using default of 100\n\n"; 
 71 | 		$inc==50 and print "Warning: No increment length defined- using default of 50\n\n";
 72 |         
 73 | }
 74 |                                                              
 75 | 
 76 | 
 77 | #Recursive subroutine to perform sliding window through input DNA sequence
 78 | 
 79 | sub slidingWindowGC{ 
 80 |  
 81 |  
 82 | my $DATA = $_[0]; 
 83 | my $subseq;  
 84 | my $GC; 
 85 | my $start=$_[1];  # $start initialized at zero 
 86 | 
 87 | $subseq = substr ($DATA, $start, $win); #the "window"... 
 88 | 
 89 | $GC =()=$subseq =~ /G|C/gi; #Count up Gs and Cs 
 90 | 
 91 | print "$start\t$GC\n";  #print window start coordinate and GC content
 92 | 
 93 | $start+=$inc; #Increment start. The "sliding" part
 94 | 
 95 | #Check if $start is within length of the dna... Sets limit to recursive subroutine and keeps it from going crazy
 96 |     if ($start < length($DATA) ){ 
 97 |         slidingWindowGC( $DATA, $start);
 98 | 
 99 |     }
100 | }
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/snps2phy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | #Tyler K. Chafin 
 4 | #December 4 2015 
 5 | #Converts .snps file from pyRAD to phylip format 
 6 | #Email: tkchafin@uark.edu with issues
 7 | 
 8 | if [ $1 ]; then 
 9 |   file="$1"; 
10 | else 
11 |   printf "\nUsage: $0 <.snps>\n\n";  
12 |   exit 1; 
13 | fi; 
14 | 
15 | #Format to phylip
16 | sed -r 's/^(\w+)\s+([A-Z_-]+)/\1\t\2/g' $file | sed 's/ //g' | sed 's/_//g' > $file.phy
17 | #Replace header line
18 | sed -i -r 's/##([0-9]+).+,.*,([0-9]+).*/\1\t\2/g' $file.phy;
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/splitFASTA.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | #
  4 | #Script by Tyler K. Chafin
  5 | #Last Modified: 6 May 2015
  6 | #Added: Capability to split file into user-defined number of parts
  7 | #
  8 | 
  9 | 
 10 | use strict; 
 11 | use warnings;
 12 | use Getopt::Long;
 13 | 
 14 | our $pattern=">";
 15 | our $infile="";
 16 | our $suffix="fasta";
 17 | our $breaks;  
 18 | parseArgs();
 19 | 
 20 | 
 21 | 
 22 | open ( INFILE, "$infile" ) ; 
 23 | 
 24 | my $n=0; 
 25 | my $matches=0; 
 26 | if ($breaks){
 27 | #print "Breaks = $breaks\n"; 
 28 | 
 29 | 
 30 |     #Count num of pattern matches
 31 |     while (<INFILE>){ 
 32 | 	$_ =~ "$pattern" and $matches++; 
 33 |     }  	
 34 |     my $num_lines = int($matches/$breaks); 
 35 |     my $count=0;
 36 |  
 37 | #print "Num_lines = $num_lines\n"; 
 38 | #print "matches = $matches\n"; 
 39 |     #Foreach part, read data and write to appropriate outfile 
 40 |      
 41 |     seek(INFILE,0,0); #Reset reading position in fh
 42 |     
 43 |     while (<INFILE>){ 
 44 |  
 45 | 	if ($_ =~ "$pattern"){ 
 46 | 	    $n++;
 47 | 	    if ($count == 0){ 
 48 | 		$count++;
 49 | 		open (OUTFILE, "> $count.$suffix") || die $!;  
 50 | 	     
 51 | 	    }
 52 | 	    if ($count >= $breaks){ 
 53 | 		print OUTFILE "$_"; 
 54 | 	 
 55 | 	    }else{ 
 56 | 	    	if($n<=($num_lines*$count)){ 	
 57 | 		    print OUTFILE "$_"; 
 58 | 	    	}else{ 
 59 | 		    close OUTFILE; 
 60 | 		    $count++; 
 61 | 		    open (OUTFILE, "> $count.$suffix") || die $!;
 62 | 		
 63 | 		    print OUTFILE "$_"; 
 64 | 	    	}
 65 | 	    } 
 66 | 	}else{ 
 67 | 	    print OUTFILE "$_"; 
 68 | 	} 
 69 |    }		
 70 | 
 71 | #If num breaks not defined: break for each contig
 72 | }else{
 73 |     while (<INFILE>) { 
 74 | 
 75 |     	if ( $_ =~ "$pattern"  ){
 76 |             $n++; 
 77 | 	    open ( OUTFILE, "> $n.$suffix" ) || die $!;  	
 78 |             print OUTFILE "$_"; 	
 79 |     	}else{
 80 | 	    print OUTFILE "$_";
 81 |         } 
 82 |     }
 83 | }
 84 | 
 85 | 
 86 | close INFILE;
 87 | close OUTFILE; 
 88 | 
 89 | exit;
 90 | ###################################################
 91 | 
 92 | sub parseArgs{ 
 93 | 
 94 |     my $usage = "\nUsage: $0 --file=whole_genome.fasta --pattern=\> --suffix=fasta
 95 | 
 96 | Author: Tyler K. Chafin - tkchafin\@uark.edu
 97 | Last Modified: 6 May 2015
 98 | 
 99 | Purpose of script is to break a given FASTA file into a user-defined number of portions, or into separate files per FASTA header and associated sequence. 
100 | 
101 | 
102 | mandatory
103 |    --file      -  File to break up
104 | 
105 | optional 
106 |    --breaks    -  Break file into n pieces [default is one file for each contig]
107 |    --pattern   -  Pattern to use to divide file [default=>] 
108 |    ---suffix   -  Suffix to use when naming daughter files [default=fasta]\n\n";
109 | 
110 | 
111 |     my $result = GetOptions 
112 | 	(
113 | 	'f|file=s'	=> \$infile, 
114 | 	'p|pattern=s'	=> \$pattern, 
115 | 	's|suffix=s'	=> \$suffix, 
116 | 	'b|breaks=i'	=> \$breaks, 
117 | 	); 
118 |     
119 |     if ( $infile eq "" ){ die $usage}; 
120 | }
121 | 
122 | 


--------------------------------------------------------------------------------
/splitFastaPops.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import collections
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 
 11 | 	pop_assign = dict()
 12 | 	seqs = dict()
 13 | 
 14 | 	#parse popmap file for dictionary of sample assignments
 15 | 	if params.popmap:
 16 | 		print("Parsing popmap file...")
 17 | 		pop_assign = parsePopmap(params.popmap)
 18 | 	else:
 19 | 		print("ERROR: Popmap file must be provided.")
 20 | 		sys.exit(1)
 21 | 
 22 | 	#Now, get the alignment from the FASTA file (as another dict)
 23 | 	if params.fasta:
 24 | 		print('Reading alignment from FASTA...')
 25 | 		for f in read_fasta(params.fasta):
 26 | 			seqs[f[0]] = f[1]
 27 | 	else:
 28 | 		print("ERROR: Popmap file must be provided.")
 29 | 		sys.exit(1)
 30 | 
 31 | 	print("Writing new FASTA files...")
 32 | 	#For each pop, write a new FASTA
 33 | 	seen = list(seqs.keys())
 34 | 	pops = make2Dpopmap(pop_assign)
 35 | 	for pop in pops.keys():
 36 | 		fas = str(pop) + ".fasta"
 37 | 		with open(fas, 'w') as fh:
 38 | 			try:
 39 | 				print(fas + "....")
 40 | 				for sample in pops[pop]:
 41 | 					if sample in seen:
 42 | 						to_write = ">" + str(sample) + "\n" + seqs[sample] + "\n"
 43 | 						fh.write(to_write)
 44 | 					else:
 45 | 						print("Sample not found in FASTA:",sample)
 46 | 			except IOError as e:
 47 | 				print("Could not read file:",e)
 48 | 				sys.exit(1)
 49 | 			except Exception as e:
 50 | 				print("Unexpected error:",e)
 51 | 				sys.exit(1)
 52 | 			finally:
 53 | 				fh.close()
 54 | 
 55 | #Makes a dict of lists from a popmap
 56 | def make2Dpopmap(p):
 57 | 	ret = dict()
 58 | 	for s in p:
 59 | 		if p[s] not in ret:
 60 | 			ret[p[s]] = list()
 61 | 		ret[p[s]].append(s)
 62 | 	return(ret)
 63 | 
 64 | 
 65 | 
 66 | #function reads a tab-delimited popmap file and return dictionary of assignments
 67 | def parsePopmap(popmap):
 68 | 
 69 | 	ret = dict()
 70 | 	if os.path.exists(popmap):
 71 | 		with open(popmap, 'r') as fh:
 72 | 			try:
 73 | 				contig = ""
 74 | 				seq = ""
 75 | 				for line in fh:
 76 | 					line = line.strip()
 77 | 					if not line:
 78 | 						continue
 79 | 					else:
 80 | 						stuff = line.split()
 81 | 						ret[stuff[0]] = stuff[1]
 82 | 				return(ret)
 83 | 			except IOError:
 84 | 				print("Could not read file ",pairs)
 85 | 				sys.exit(1)
 86 | 			finally:
 87 | 				fh.close()
 88 | 	else:
 89 | 		raise FileNotFoundError("File %s not found!"%popmap)
 90 | 
 91 | 
 92 | #Read samples as FASTA. Generator function
 93 | def read_fasta(fas):
 94 | 
 95 | 	if os.path.exists(fas):
 96 | 		with open(fas, 'r') as fh:
 97 | 			try:
 98 | 				contig = ""
 99 | 				seq = ""
100 | 				for line in fh:
101 | 					line = line.strip()
102 | 					if not line:
103 | 						continue
104 | 					#print(line)
105 | 					if line[0] == ">": #Found a header line
106 | 						#If we already loaded a contig, yield that contig and
107 | 						#start loading a new one
108 | 						if contig:
109 | 							yield([contig,seq]) #yield
110 | 							contig = "" #reset contig and seq
111 | 							seq = ""
112 | 						split_line = line.split()
113 | 						contig = (split_line[0].replace(">",""))
114 | 					else:
115 | 						seq += line
116 | 				#Iyield last sequence, if it has both a header and sequence
117 | 				if contig and seq:
118 | 					yield([contig,seq])
119 | 			except IOError:
120 | 				print("Could not read file ",fas)
121 | 				sys.exit(1)
122 | 			finally:
123 | 				fh.close()
124 | 	else:
125 | 		raise FileNotFoundError("File %s not found!"%fas)
126 | 
#Object to parse command-line arguments
class parseArgs():
	"""Command-line options for splitFastaPops.py.

	After construction, self.popmap holds the -p/--popmap value and self.fasta
	the -f/--fasta value. The help menu is printed and the program exits when
	-h is given, an option is not recognised, or either path is missing.
	"""
	def __init__(self):
		#Define options
		try:
			#"popmap=" was previously misspelled, so --popmap was never accepted
			options, remainder = getopt.getopt(sys.argv[1:], 'f:p:h', \
			["popmap=","fasta=","help"])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")
		#Default values for params
		#Input params
		self.popmap=None
		self.fasta=None

		#First pass to see if help menu was called
		for o, a in options:
			if o in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Second pass to set all args.
		for opt, arg_raw in options:
			#Trim the ends only; spaces inside a path are part of the path
			arg = arg_raw.strip()
			opt = opt.lstrip("-")
			if opt in ('p', 'popmap'):
				self.popmap = arg
			elif opt in ('h', 'help'):
				pass
			elif opt in ('f', 'fasta'):
				self.fasta = arg
			else:
				raise ValueError("Unhandled option %r"%opt)

		#Check manditory options are set
		if not self.popmap:
			self.display_help("Error: Need popmap")
		if not self.fasta:
			self.display_help("Error: Need fasta")


	def display_help(self, message=None):
		"""Print the optional message and the help menu, then exit."""
		if message is not None:
			print ("\n",message)
		print ("\nsplitFastaPops.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		print ("\nUsage: ", sys.argv[0], "-f <fasta> -p <popmap> \n")
		print ("Description: Splits a FASTA file into 1 file per population (pops from tab-delimited popmap)")

		print("""
	Arguments:
		-f,--fasta	: FASTA file
		-p,--popmap	: Tab-delimited population map (Sample \\t PopID)
		-h,--help	: Displays help menu

""")
		sys.exit()
185 | 
186 | #Call main function
187 | if __name__ == '__main__':
188 |     main()
189 | 


--------------------------------------------------------------------------------
/splitStackedFasta.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl
  2 | 
  3 | # By Tyler K. Chafin
  4 | # Contact: tkchafin@uark.edu
  5 | 
  6 | use strict; 
  7 | use warnings; 
  8 | use Getopt::Std; 
  9 | 
 10 | #Die if no arguments given
 11 | if( scalar( @ARGV ) == 0 ){
 12 |   &help;
 13 |   die "No options given.\n\n";
 14 | }
 15 | 
 16 | #Parse arguments
 17 | my %opts;
 18 | getopts( 'i:o:hm:n:x:', \%opts );
 19 | 
 20 | # kill if help option is true
 21 | if( $opts{h} ){
 22 |   &help;
 23 |   die "Printing help menu.\n\n";
 24 | }
 25 | 
 26 | #get options 
 27 | my ($in, $out, $min, $max, $cap) = &parseArgs(\%opts); 
 28 | 
 29 | open (FASTA, "$in") || die "Could not open file $in: $!\n";
 30 | print "\nReading input file <$in>...\n";
 31 | open (OUT, ">$out") || die "Could not open file for output ($out) : $!\n";
 32 | print "Writing output to <$out>...\n";
 33 | my $base;
 34 | my $count;
 35 | my $num = 0;
 36 | while (<FASTA>){
 37 | 	chomp;
 38 | 	if ($_ =~ /^\>/){ #If header line
 39 | 		my @line = split(/-/, $_);
 40 | 		$line[0] =~ s/\>//g;
 41 | 		$base = $line[0];
 42 | 		$count = $line[1];
 43 | 		$num == 1 and die "Error: Header line \"$_\" immediately follows another header line.\n";
 44 | 		$num = 1;
 45 | 		next;
 46 | 	}else{
 47 | 		$num == 2 and die "Error: Sequence line \"$_\" immediately follows another sequence line.\n";
 48 | 		$num = 2; 
 49 | 		
 50 | 		if ($count < $min && $min != 0){
 51 | 			print "Skipping <$base>: Depth (<$count>) is below minimum <$min>!\n";
 52 | 			undef($count);
 53 | 			undef($base);
 54 | 			next;
 55 | 		}elsif ($count > $cap && $cap != 0){
 56 | 			print "Skipping <$base>: Depth (<$count>) is above maximum <$cap>!\n";
 57 | 			undef($count);
 58 | 			undef($base);
 59 | 			next;
 60 | 		}elsif ($max != 0){
 61 | 			$count > $max and $count = $max;
 62 | 		}
 63 | 		for (my $i=1; $i <= $count; $i++){
 64 | 			print OUT ">" . $base . "-" . $i . "\n";
 65 | 			print OUT $_ . "\n";
 66 | 		}
 67 | 		undef($count);
 68 | 		undef($base);
 69 | 	}
 70 | }
 71 | print "Done!\n\n";
 72 | close FASTA;
 73 | close OUT;
 74 | exit;
 75 | 
 76 |  ########################### SUBROUTINES ###############################
 77 | 
 #Prints the help menu to standard output; takes no arguments
 sub help{
	 
	print "\nThis perl script is written by Tyler K. Chafin - tkchafin\@uark.edu\n";
	print "\nInput should be a FASTA file of collapsed read clusters where -# at the end of the FASTA header for each sequence indicates the stack depth for the cluster.\n";
	print "\nNOTE: Stack depth counts start at 1.\n";
	print "\nNOTE: Header cannot contain \"-\" except before the read depth, e.g.:\n";
	print "\t>Name-3\n\tAGTAGTAGTAG....\nWhere \"Name\" is the sequence name and \"3\" is the depth.\n\n";
	print "Options:\n";
	print "\t-i\t: Path to input file (fasta)\n";
	print "\t-m\t: Maximum stack depth to print [default: not set]\n";
	print "\t-n\t: Skip clusters with less than \"n\" depth [default: not set]\n";
	print "\t-x\t: Skip clusters with more than \"x\" depth [default: not set]\n";
	#The default written here has to match the one parseArgs falls back to
	print "\t-o\t: Output file name. [Default = out.fasta]\n";
	print "\n\n";
}
 95 | 
 96 | #parse arguments
 97 | sub parseArgs{
 98 | 
 99 |   my( $params ) =  @_;
100 |   my %opts = %$params;
101 |   
102 |   #defaults
103 |   my $in = $opts{i} or die "\nNo input was provided.\n\n";
104 |   my $min = $opts{n} || 0; 
105 |   my $max = $opts{m} || 0; 
106 |   my $cap = $opts{x} || 0;
107 |   my $out = $opts{o} || "out.fasta"; 
108 |   #return
109 |   return ($in, $out, $min, $max, $cap);
110 | }
111 | 


--------------------------------------------------------------------------------
/splitTableCF.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import warnings
  6 | import getopt
  7 | import toytree as tt
  8 | import pandas as pd
  9 | import numpy as np
 10 | 
 11 | warnings.simplefilter(action='ignore', category=FutureWarning)
 12 | 
 13 | def main():
 14 | 	params = parseArgs()
 15 | 	
 16 | 	#read data
 17 | 	with open(params.samples) as f:
 18 | 		l = f.read().splitlines()
 19 | 	cf = pd.read_csv(params.cf, header=0)
 20 | 	tree = tt.tree(params.tree)
 21 | 	
 22 | 	#calculate mean ngenes for each sample
 23 | 	cov=dict()
 24 | 	for f in l:
 25 | 		b=cf.eq(f).any(1)
 26 | 		cov[f] = np.mean(cf[b]["ngenes"])
 27 | 	
 28 | 	#find which sample has best representation; will be used as placehold for whole list of samples
 29 | 	placeholder = max(cov, key = cov.get)
 30 | 	
 31 | 	#make subset datasets
 32 | 	removes = [s for s in l if s != placeholder]
 33 | 	left_cf = subset_df_blacklist(cf, removes) #keeps placeholder
 34 | 	removes2 = [s for s in tree.get_tip_labels() if s not in l]
 35 | 	right_cf = subset_df_blacklist(cf, removes2)
 36 | 	left_tree = tree_remove_blacklist(tree, removes)
 37 | 	#print(left_tree.get_tip_labels())
 38 | 	right_tree = tree_remove_whitelist(tree, l)
 39 | 	#print(right_tree.get_tip_labels())
 40 | 	
 41 | 	#write outputs
 42 | 	#ingroup_tree
 43 | 	right_tree.write("ingroup_tree.tre", tree_format=5)
 44 | 	#ingroup_cfs
 45 | 	right_cf.to_csv("ingroup_cfs.csv", index=False, index_label=False)
 46 | 	#outgroup_tree
 47 | 	left_tree.write("outgroup_tree.tre", tree_format=5)
 48 | 	#outgroup_cfs
 49 | 	left_cf.to_csv("outgroup_cfs.csv", index=False, index_label=False)
 50 | 
 51 | def tree_remove_whitelist(tree, goodbois):
 52 | 	all_tips = tree.get_tip_labels()
 53 | 	rem = [a for a in all_tips if a not in goodbois]
 54 | 	return(tree.drop_tips(names=rem))
 55 | 
 56 | def tree_remove_blacklist(tree, badbois):
 57 | 	all_tips = tree.get_tip_labels()
 58 | 	rem = [r for r in badbois if r in all_tips]
 59 | 	return(tree.drop_tips(names=rem))
 60 | 
 61 | def subset_df_blacklist(df, badbois):
 62 | 	ret = df.copy()
 63 | 	for i in badbois:
 64 | 		bools = ret.eq(i).any(1)
 65 | 		ret = ret[~bools]
 66 | 	return(ret)
 67 | 
 68 | 
 69 | #Object to parse command-line arguments
 70 | class parseArgs():
 71 | 	def __init__(self):
 72 | 		#Define options
 73 | 		try:
 74 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hc:s:n:t:', \
 75 | 			["help", "cf=", "name=", "samples=", "tree="])
 76 | 		except getopt.GetoptError as err:
 77 | 			print(err)
 78 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 79 | 		#Default values for params
 80 | 		#Input params
 81 | 		self.cf = None
 82 | 		self.samples=None
 83 | 		self.write="both"
 84 | 		self.tree=None
 85 | 
 86 | 		#First pass to see if help menu was called
 87 | 		for o, a in options:
 88 | 			if o in ("-h", "-help", "--help"):
 89 | 				self.display_help("Exiting because help menu was called.")
 90 | 
 91 | 		#Second pass to set all args.
 92 | 		for opt, arg_raw in options:
 93 | 			arg = arg_raw.replace(" ","")
 94 | 			arg = arg.strip()
 95 | 			opt = opt.replace("-","")
 96 | 			#print(opt,arg)
 97 | 			if opt == "h" or opt == "help":
 98 | 				continue
 99 | 			elif opt=="cf" or opt=="c":
100 | 				self.cf=arg
101 | 			elif opt=="samples" or opt=="s":
102 | 				self.samples=arg
103 | 			elif opt=="tree" or opt=="t":
104 | 				self.tree=arg
105 | 			else:
106 | 				assert False, "Unhandled option %r"%opt
107 | 
108 | 		#Check manditory options are set
109 | 		if not self.tree or not self.samples or not self.cf:
110 | 			self.display_help("No files provided.")
111 | 
112 | 
113 | 
114 | 	def display_help(self, message=None):
115 | 		if message is not None:
116 | 			print()
117 | 			print (message)
118 | 		print ("\nsplitTableCF.py\n")
119 | 		print("Author: Tyler K Chafin, University of Colorado")
120 | 		print ("Contact: tyler.chafin@colorado.edu")
121 | 		print ("Description: Subsets a TableCF file (PhyloNetworks) given a list of samples comprising a monophyletic clade -- right now only designed for 1 split at a time")
122 | 		print("""
123 | 		-c,--cf		: CF table
124 | 		-s,--samples	: File with list of samples
125 | 		-t,--tree	: Tree file 
126 | 		-h,--help	: Help menu
127 | """)
128 | 		print()
129 | 		sys.exit()
130 | 
131 | #Call main function
132 | if __name__ == '__main__':
133 |     main()
134 | 


--------------------------------------------------------------------------------
/stacks2fasta.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | use strict; 
  4 | use warnings;
  5 | use Getopt::Long;
  6 | use File::Path; 
  7 | 
  8 | our $input; 
  9 | our $workdir="";
 10 | our $catalog;
 11 | our $batch=1; 
 12 | 
 13 | parseArgs(); 
 14 |  
 15 | my $locus;
 16 | my $name; 
 17 | my @info; 
 18 | my %whitelist;
 19 | my $ID;
 20 | my $output="loci.$batch";
 21 | $workdir =~ /\S/ and $output = "$workdir/$output"; 
 22 | 
 23 | #Build list of loci containing SNPs (batch_#.catalog.snps.tsv output from STACKS) 
 24 | open ( CAT, $catalog ) || die "\nDerp: Can't open $catalog!\n\n";
 25 | 
 26 | while (<CAT>){ 
 27 |     @info = split /\t/, $_; 
 28 |     $ID = $info[2];
 29 | 
 30 | 	#If locus ID is already in hash, then skip it 
 31 |     if (exists $whitelist{$ID}){ 
 32 | 	 next;
 33 |     }else{
 34 | 	$whitelist{$ID}=""; 
 35 |     }
 36 | }
 37 | 
 38 | close CAT;
 39 | 
 40 | #Parse STACKS output fasta file into loci, query each locus against whitelist
 41 | open ( IN, $input ) || die "\nDerp: Can't open $input!\n\n"; 
 42 | 
 43 | rmtree $output;
 44 | mkdir $output;
 45 | chdir $output;
 46 |  
 47 | while (<IN>){ 
 48 | 
 49 | $_ =~ m/CLocus_(\d+)_Sample_(\d+)/;
 50 |  
 51 | $locus = $1;
 52 | $name=$2;  
 53 | 
 54 |     if (exists $whitelist{$locus}){
 55 | 	open ( OUT, ">>$locus.fasta");
 56 | 	if ( $_ =~ />/ ){ 
 57 | 	    print OUT ">$name\n";
 58 | 	}else{
 59 | 	    print OUT $_; 
 60 | 	}
 61 |     }
 62 | }
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | ##########################################SUBROUTINES#########################################
 69 | 
 70 | sub parseArgs{
 71 | 
 72 | 	my $usage="\nstacks2fasta.pl takes the fasta output from STACKS and outputs a new fasta file for each locus containing variation, which are identified by querying the cstacks catalog
 73 | 
 74 | Usage: $0 --i /path/to/infile --w /path/to/workdir --c=/path/to/catalog
 75 | 
 76 | Mandatory Variables 
 77 | 	-i, --input	-   path to input file (absolute path)
 78 | 	-w, --workdir	-   path to working directory (new fasta files will be placed within /workdir/loci
 79 | 	-c, --catalog	-   path to STACKS catalog 
 80 | 
 81 | Optional
 82 | 	-b, --batch	-   Provide a batch number to append to output dir name [default=1]\n\n";
 83 | 
 84 | 	my $result = GetOptions 
 85 | 	( 
 86 | 	'input|i=s'	=> \$input, 
 87 | 	'workdir|w=s'	=> \$workdir, 
 88 | 	'catalog|c=s'	=> \$catalog,
 89 | 	'batch|b=i'	=> \$batch,
 90 | 	); 
 91 | 
 92 | if ( $input eq "" ){ die "\nDerp: Input not specified!\n\n$usage"};
 93 | 
 94 | }
 95 | 
 96 | ############################################################################################
 97 | 
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/subsetPhy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import getopt
  4 | import sys
  5 | import os
  6 | 
  7 | #Object to parse command-line arguments
  8 | class parseArgs():
  9 | 	def __init__(self):
 10 | 		#Define options
 11 | 		try:
 12 | 			options, remainder = getopt.getopt(sys.argv[1:], 'x:p:l:o:h', \
 13 | 			["xml=","phy=","list=","out=","help"])
 14 | 		except getopt.GetoptError as err:
 15 | 			print(err)
 16 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 17 | 		#Default values for params
 18 | 		#Input params
 19 | 		self.xml=None
 20 | 		self.phy=None
 21 | 		self.tax=None
 22 | 		self.out="out.phy"
 23 | 
 24 | 		#First pass to see if help menu was called
 25 | 		for o, a in options:
 26 | 			if o in ("-h", "-help", "--help"):
 27 | 				self.display_help("Exiting because help menu was called.")
 28 | 
 29 | 		#Second pass to set all args.
 30 | 		for opt, arg_raw in options:
 31 | 			arg = arg_raw.replace(" ","")
 32 | 			arg = arg.strip()
 33 | 			opt = opt.replace("-","")
 34 | 			#print(opt,arg)
 35 | 			if opt in ('x', 'xml'):
 36 | 				self.xml = arg
 37 | 			elif opt in ('h', 'help'):
 38 | 				pass
 39 | 			elif opt in ('p','phy'):
 40 | 				self.phy = arg
 41 | 			elif opt in ('l','list'):
 42 | 				self.tax = arg
 43 | 			elif opt in ('o','out'):
 44 | 				self.out = arg
 45 | 			else:
 46 | 				assert False, "Unhandled option %r"%opt
 47 | 
 48 | 		#Check manditory options are set
 49 | 		self.phy or self.display_help("INPUT ERROR: No PHYLIP provided")
 50 | 		self.tax or self.display_help("INPUT ERROR: No TAXON LIST provided")
 51 | 
 52 | 
 53 | 	def display_help(self, message=None):
 54 | 		if message is not None:
 55 | 			print()
 56 | 			print (message)
 57 | 		print ("\nsubsetPhy.py\n")
 58 | 		print ("Contact:\n\n\tTyler K. Chafin\n\tUniversity of Arkansas\n\ttkchafin@uark.edu\n")
 59 | 		print ("\nUsage:\n\t", sys.argv[0], "-p </path/to/xml> -l </path/to/.txt\n")
 60 | 		print ("Description:\n")
 61 | 		print("\tsubsetPhy.py is a quickly written and shitty script to help manipulate phylip files\n")
 62 | 
 63 | 		print("""
 64 | 		Input options:
 65 | 
 66 | 			-p,--phy	: Phylip file
 67 | 			-l,--list	: .txt file containing a list of taxa to subset
 68 | 			-o,--out	: (Optional) output prefix [default:out.xml]
 69 | 			-h,--help	: Displays help menu""")
 70 | 		print()
 71 | 		sys.exit()
 72 | 
 73 | 
 74 | ################################# MAIN #########################################
 75 | params = parseArgs()
 76 | 
 77 | #Read TAX LIST into a list
 78 | taxlist = list()
 79 | fullnames = list()
 80 | fh = open(params.tax)
 81 | try:
 82 | 	with fh as file_object:
 83 | 		for line in file_object:
 84 | 			line = line.strip()
 85 | 			if not line:
 86 | 				continue
 87 | 			line = line.replace(" ","")
 88 | 			arr = line.split("_")
 89 | 			taxlist.append(arr[-1])
 90 | 			fullnames.append(line)
 91 | finally:
 92 | 	fh.close()
 93 | 
 94 | #Read phylip file
 95 | data = {}
 96 | numSites = None
 97 | count = 0
 98 | pfh = open(params.phy)
 99 | try:
100 | 	with pfh as file_object:
101 | 		for line in file_object:
102 | 			line = line.strip()
103 | 			if not line:
104 | 				continue
105 | 			count += 1
106 | 			if count == 1:
107 | 				continue
108 | 			arr = line.split()
109 | 			if arr[0] in taxlist:
110 | 				data[fullnames[taxlist.index(arr[0])]] = arr[1]
111 | 				if numSites:
112 | 					if len(arr[1]) != numSites:
113 | 						sys.exit("ERROR: Samples do not have the same sequence length -")
114 | 				else:
115 | 					numSites = len(arr[1])
116 | finally:
117 | 	pfh.close()
118 | 
119 | #Open output file
120 | ofh = open(params.out, "w")
121 | try:
122 | 	with ofh as file_object:
123 | 		header = str(len(data)) + " " + str(numSites) + "\n"
124 | 		file_object.write(header)
125 | 		for key in data:
126 | 			out = key + "\t" + data[key] + "\n"
127 | 			file_object.write(out)
128 | finally:
129 | 	ofh.close()
130 | 


--------------------------------------------------------------------------------
/subsetSnps.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys
 4 | 
 5 | print("Usage: ",sys.argv[0], " SubsetFile StructureFile")
 6 | 
 7 | print(sys.argv[1])
 8 | 
 9 | #Get loci to keep
10 | loci = []
11 | with open(sys.argv[1]) as file_object:
12 | 	count = 0
13 | 	for line in file_object:
14 | 		if count == 0:
15 | 			count += 1
16 | 			continue
17 | 		line = line.strip()
18 | 		t = line.split()
19 | 		loc = t[0].replace("\"","")
20 | 		loci.append(int(loc))
21 | file_object.close()
22 | 
23 | output = open("out.str", "w")
24 | with open(sys.argv[2]) as file_2:
25 | 	for line in file_2:
26 | 		line = line.strip()
27 | 		t = line.split()
28 | 		#print("Line: ", t[0])
29 | 		col = 0
30 | 		snp = 0
31 | 		for c in t:
32 | 			if col in (0,1):
33 | 				#print(c)
34 | 				if col == 0:
35 | 					output.write(c)
36 | 				else:
37 | 					stuff = "\t" + c
38 | 					output.write(stuff)
39 | 				col += 1
40 | 			else:
41 | 				snp += 1
42 | 				if snp in loci:
43 | 					#print(snp)
44 | 					stuff = "\t" + c
45 | 					output.write(stuff)
46 | 		output.write("\n")
47 | output.close()
48 | 


--------------------------------------------------------------------------------
/sumls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ $# -eq 0 ]]; then 
 4 | 	echo "Usage: sumls [-m units in MB][-g units in GB][-k units in KB][/path/to/ls]" 
 5 | 	exit 0;
 6 | fi;
 7 | 
 8 | case $1 in 
 9 | 	-m) mult=.000000953674; unit=MB ;;
10 | 	-g) mult=.00000000093192; unit=GB ;; 
11 | 	-k) mult=.000976563; unit=KB ;;
12 | esac; 
13 |  
14 | 
15 | pre_num=`ls -lR $2 | awk '{sum+=$5}END{print sum}'`;
16 | adj_num=`echo "($pre_num*$mult)/1; scale=3" | bc`;
17 | echo "There are $adj_num $unit in $0"; 
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/summaryGFF.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl 
  2 | 
  3 | 
  4 | use strict; 
  5 | use warnings; 
  6 | use Getopt::Long; 
  7 | 
  8 | 
  9 | our $gff=""; 
 10 | our $genome=""; 
 11 | 
 12 | parseArgs(); #Call subroutine to parse arguments... 
 13 | 
 14 | my @line; 
 15 | my $dna; 
 16 | my $subseq; 
 17 | my $total;
 18 | my $GC;  
 19 | my $element;
 20 | my $add; 
 21 | my @info;  
 22 | my %summary; 
 23 | 
 24 | 	
 25 | 
 26 | 
 27 | 
 28 | #Call subroutine for each type of element...
 29 | 
 30 | 
 31 | 
 32 | summaryGFF(); 
 33 | 
 34 | 
 35 | 
 36 | 
 37 | 
 38 | 
 39 | 
 40 | ################################SUBROUTINES######################################
 41 | 
 42 | #Subroutine to parse command line arguments
 43 | sub parseArgs{
 44 | 
 45 |     my $usage = "\nUsage: $0 --genome=whole_genome.fasta --gff=annotations.gff
 46 | 
 47 |     mandatory
 48 |        --genome      -  FASTA file containing sequences to parse
 49 |        --gff         -  GFF file containing gene annotations \n\n";
 50 | 
 51 | 
 52 |                  my $result = GetOptions
 53 |                          (
 54 |                                  'genome=s'  => \$genome,
 55 |                                  'gff=s'     => \$gff,
 56 |                                
 57 |                          );
 58 |              
 59 | 	        $genome ne "" || die $usage;  #Die if mandatory variables undefined
 60 | 		$gff ne "" || die $usage; 
 61 |         
 62 | }
 63 |                                                              
 64 | 
 65 | #Subroutine to parse gff and genome for particular type of element
 66 | 
 67 | sub summaryGFF{
 68 | 
 69 | 
 70 | undef @line; 
 71 | undef $dna; 
 72 |  
 73 |     
 74 |     open ( GENOME, "$genome") || die "Derp: Can't open file $genome!";
 75 | 
 76 | 	while (<GENOME>){ 
 77 | 	    $_ ne /^>/ and $dna .= $_; 
 78 | 	};
 79 | 
 80 |     close GENOME;
 81 |  
 82 | 
 83 |     open ( GFF, "$gff" ) || die "Derp: Can't open file $gff!"; 
 84 | 
 85 | 	foreach ( <GFF> ){ 
 86 | 		@line = split /\t/, $_;
 87 | 			#print "$line[2]\n"; 
 88 | 	            $GC=0; 
 89 | 		    $subseq = substr ( $dna, $line[3]-1, $line[5] ); 
 90 |                     $add =()=$subseq =~ /G/gi; 
 91 | 		    $GC += $add; 
 92 | 		    $add =()=$subseq =~ /C/gi;
 93 | 		    $GC += $add; 
 94 | 		        #print "$GC\n";
 95 | 
 96 | #If element is already in hash, then alter values in the arrays by following ref in hash value...
 97 | 		    if ( exists $summary{$line[2]} ){ 
 98 | 		            #print "$line[2]\n";
 99 | 			$summary{$line[2]}->[0] += $line[5];
100 | 			   #print "$summary{$line[2]}\n"; 
101 | 			$summary{$line[2]}->[1] += $GC;
102 | 		    }else{ 
103 | 		
104 |  #Create array containing length and GC content, then assign array ref to hash key for that element	
105 | 			my @info=($line[5], $GC); 
106 | 			$summary{$line[2]} = \@info; 
107 | 		    } 	 
108 | 
109 | 	}
110 | 	
111 | 	foreach my $key ( keys %summary ){ 
112 | 	    print "$key \t$summary{$key}->[0]   ";
113 | 	    printf( "(%.1f%%) \t", $summary{$key}->[0] / length($dna) * 100); 
114 | 	    printf( "%.2f \n", $summary{$key}->[1] / $summary{$key}->[0] * 100); 
115 |         }
116 | }	
117 | 
118 | close GFF; 
119 | exit; 
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/terminalGapRemover.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | import random
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	seqs = dict() #key=FASTA header; val=sequence
 13 | 
 14 | 	#read sequence in
 15 | 	if params.fasta:
 16 | 		print('Reading alignment from FASTA...')
 17 | 		for f in read_fasta(params.fasta):
 18 | 			seqs[f[0]] = f[1]
 19 | 
 20 | 		#print("Writing new PHYLIP file",params.out)
 21 | 		#write_phylip(params.out, seqs)
 22 | 	elif params.phylip:
 23 | 		print('Reading alignment from PHYLIP...')
 24 | 		for f in read_phylip(params.phylip):
 25 | 			seqs[f[0]] = f[1]
 26 | 
 27 | 		#print("Writing new FASTA file",params.out)
 28 | 		#write_fasta(params.out, seqs)
 29 | 
 30 | 	for s in seqs.keys():
 31 | 		new=seqs[s]
 32 | 		#print(new)
 33 | 		left_ns = ""
 34 | 		right_ns = ""
 35 | 		for nuc in new:
 36 | 			if nuc == "-":
 37 | 				left_ns = left_ns + "N"
 38 | 			else:
 39 | 				break
 40 | 		if len(left_ns) != len(new):
 41 | 			for nuc in reversed(new):
 42 | 				if nuc == "-":
 43 | 					right_ns = right_ns + "N"
 44 | 				else:
 45 | 					break
 46 | 		if len(left_ns) > 0:
 47 | 			new = left_ns + new[len(left_ns):]
 48 | 		if len(right_ns) > 0:
 49 | 			new = new[0:len(new)-len(right_ns)] + right_ns
 50 | 		seqs[s] = new
 51 | 
 52 | 	if params.phylip:
 53 | 		print("Writing new PHYLIP file",params.out)
 54 | 		write_phylip(params.out, seqs)
 55 | 	elif params.fasta:
 56 | 		print("Writing new FASTA file",params.out)
 57 | 		write_fasta(params.out, seqs)
 58 | 
 59 | 
 60 | #Print dict to phylip file
 61 | def write_phylip(p, aln):
 62 | 	with open(p, 'w') as fh:
 63 | 		try:
 64 | 			header = getPhylipHeader(aln) + "\n"
 65 | 			fh.write(header)
 66 | 
 67 | 			for sample in aln.keys():
 68 | 				line = str(sample) + "\t" + "".join(aln[sample]) + "\n"
 69 | 				fh.write(line)
 70 | 		except IOError as e:
 71 | 			print("Could not read file %s: %s"%(p,e))
 72 | 			sys.exit(1)
 73 | 		except Exception as e:
 74 | 			print("Unexpected error reading file %s: %s"%(p,e))
 75 | 			sys.exit(1)
 76 | 		finally:
 77 | 			fh.close()
 78 | 
 79 | #Function to write fasta-formatted sequences
 80 | def write_fasta(f, aln):
 81 | 	with open(f, 'w') as fh:
 82 | 		try:
 83 | 			for samp in aln.keys():
 84 | 				ol = ">" + str(samp) + "\n" + str(aln[samp]) + "\n"
 85 | 				fh.write(ol)
 86 | 		except IOError as e:
 87 | 			print("Could not read file %s: %s"%(f,e))
 88 | 			sys.exit(1)
 89 | 		except Exception as e:
 90 | 			print("Unexpected error reading file %s: %s"%(f,e))
 91 | 			sys.exit(1)
 92 | 		finally:
 93 | 			fh.close()
 94 | 
 95 | #Returns header for Phylip file from a dictionary of samples w/ data
 96 | def getPhylipHeader(d):
 97 | 	numSamp = 0
 98 | 	numLoci = None
 99 | 	for sample in d:
100 | 		numSamp = numSamp + 1
101 | 		if not numLoci:
102 | 			numLoci = len(d[sample])
103 | 		else:
104 | 			if numLoci != len(d[sample]):
105 | 				print("getPhylipHeader: Warning: Sequences of unequal length.")
106 | 	header = str(numSamp) + " " + str(numLoci)
107 | 	if numLoci == 0 or not numLoci:
108 | 		print("getPhylipHeader: Warning: No loci in dictionary.")
109 | 	if numSamp == 0:
110 | 		print("getPhylipHeader: Warning: No samples in dictionary.")
111 | 	return(header)
112 | 
113 | #Read samples as FASTA. Generator function
def read_fasta(fas):
	"""Generator yielding [name, sequence] for each record in FASTA file *fas*.

	The name is the first whitespace-delimited token of the header line with
	every ">" removed; sequence lines are concatenated. Blank lines are
	ignored. The last record is only yielded when it has both a name and a
	sequence. Raises FileNotFoundError when *fas* does not exist; an IOError
	while reading prints a message and exits with status 1.
	"""
	if not os.path.exists(fas):
		raise FileNotFoundError("File %s not found!"%fas)

	with open(fas, 'r') as fh:
		try:
			name = ""
			chunks = []
			for raw in fh:
				text = raw.strip()
				if not text:
					continue
				if not text.startswith(">"):
					chunks.append(text)
					continue
				#header line: emit the record collected so far, if any
				if name:
					yield([name, "".join(chunks)])
					chunks = []
				name = text.split()[0].replace(">","")
			#final record needs both a header and sequence data
			if name and chunks:
				yield([name, "".join(chunks)])
		except IOError:
			print("Could not read file ",fas)
			sys.exit(1)
147 | 
148 | #Read samples as PHYLIP. Generator function
def read_phylip(phy):
	"""Generator yielding (name, sequence) for each sample in PHYLIP file *phy*.

	The first non-blank line (the header) is skipped. Every other non-blank
	line contributes its first two whitespace-delimited fields. Raises
	FileNotFoundError when *phy* does not exist; an IOError while reading
	prints a message and exits with status 1.
	"""
	if not os.path.exists(phy):
		raise FileNotFoundError("File %s not found!"%phy)

	with open(phy, 'r') as fh:
		try:
			header_seen = False
			for raw in fh:
				fields = raw.split()
				if not fields:
					continue
				if not header_seen:
					#first line with content is the dimensions header
					header_seen = True
					continue
				yield(fields[0], fields[1])
		except IOError:
			print("Could not read file ",phy)
			sys.exit(1)
170 | 
171 | #Object to parse command-line arguments
class parseArgs():
	"""Command-line options for terminalGapRemover.py, read from sys.argv.

	Attributes set on the instance:
		fasta  -- input FASTA path, or None
		phylip -- input PHYLIP path, or None
		out    -- output path: the input path with its extension replaced
		          by '.gapfix.fasta' or '.gapfix.phylip'

	Exactly one of fasta/phylip must be given; otherwise the help menu is
	printed and the program exits.
	"""
	def __init__(self):
		#Define options
		try:
			options, remainder = getopt.getopt(
				sys.argv[1:], 'f:p:h', ["help", "fasta=", "phy="])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.")

		#Defaults
		self.fasta=None
		self.phylip=None
		self.out=None

		#Help request takes priority over every other option
		for flag, _ in options:
			if flag in ("-h", "-help", "--help"):
				self.display_help("Exiting because help menu was called.")

		#Store the remaining options
		for flag, raw in options:
			value = raw.replace(" ","").strip()
			key = flag.replace("-","")
			if key in ("f", "fasta"):
				self.fasta = value
			elif key in ("p", "phy"):
				self.phylip = value
			elif key in ("h", "help"):
				pass
			else:
				assert False, "Unhandled option %r"%key

		#Exactly one input file is required (neither or both is an error)
		if bool(self.fasta) == bool(self.phylip):
			self.display_help("Must provide either a FASTA or PHYLIP file.")

		#Derive the output name from whichever input was given
		if self.fasta:
			self.out = os.path.splitext(self.fasta)[0] + '.gapfix.fasta'
		elif self.phylip:
			self.out = os.path.splitext(self.phylip)[0] + '.gapfix.phylip'

	def display_help(self, message=None):
		"""Print an optional message and the help menu, then exit."""
		if message is not None:
			print("")
			print(message)
		print("\nterminalGapRemover.py\n")
		print("Contact:Tyler K. Chafin")
		print("\nUsage: ", sys.argv[0], "[-f <.fasta>] [-p <.phy>]\n")
		print("Description: Simple script to convert terminal gap characters to N's. Accepts FASTA or PHYLIP")

		print("""
	Arguments:
		-f,--fasta	: Input FASTA to be converted
		-p,--phy	: Input PHYLIP to be converted
		-h,--help	: Displays help menu
""")
		print("")
		sys.exit()
237 | 
238 | #Call main function
239 | if __name__ == '__main__':
240 |     main()
241 | 


--------------------------------------------------------------------------------
/test_files/gtrees.tre:
--------------------------------------------------------------------------------
 1 | (4,(1,(2,3)));
 2 | (4,(2,(1,3)));
 3 | (4,(1,(2,3)));
 4 | (4,(2,(1,3)));
 5 | (1,(4,(2,3)));
 6 | (4,(3,(2,3)));
 7 | (4,(2,(1,3)));
 8 | (4,(1,(2,3)));
 9 | (4,(2,(1,3)));
10 | (1,(4,(2,3)));
11 | 


--------------------------------------------------------------------------------
/test_files/revTransAll_code.txt:
--------------------------------------------------------------------------------
 1 | F	TTT
 2 | S	TCT
 3 | Y	TAT
 4 | C	TGT
 5 | F	TTC
 6 | S	TCC
 7 | Y	TAC
 8 | C	TGC
 9 | L	TTA
10 | S	TCA
11 | *	TAA
12 | *	TGA
13 | L	TTG
14 | S	TCG
15 | *	TAG
16 | W	TGG
17 | L	CTT
18 | P	CCT
19 | H	CAT
20 | R	CGT
21 | L	CTC
22 | P	CCC
23 | H	CAC
24 | R	CGC
25 | L	CTA
26 | P	CCA
27 | Q	CAA
28 | R	CGA
29 | L	CTG
30 | P	CCG
31 | Q	CAG
32 | R	CGG
33 | I	ATT
34 | T	ACT
35 | N	AAT
36 | S	AGT
37 | I	ATC
38 | T	ACC
39 | N	AAC
40 | S	AGC
41 | I	ATA
42 | T	ACA
43 | K	AAA
44 | R	AGA
45 | M	ATG
46 | T	ACG
47 | K	AAG
48 | R	AGG
49 | V	GTT
50 | A	GCT
51 | D	GAT
52 | G	GGT
53 | V	GTC
54 | A	GCC
55 | D	GAC
56 | G	GGC
57 | V	GTA
58 | A	GCA
59 | E	GAA
60 | G	GGA
61 | V	GTG
62 | A	GCG
63 | E	GAG
64 | G	GGG
65 | 


--------------------------------------------------------------------------------
/test_files/revTransAll_in.fas:
--------------------------------------------------------------------------------
1 | >my_protein
2 | MPTTRPNLK
3 | 


--------------------------------------------------------------------------------
/test_files/terminal_gaps.fasta:
--------------------------------------------------------------------------------
 1 | >A1.TEST
 2 | -----TTATA--GGTTG---
 3 | >B1.TEST
 4 | ---TTTTATACTG-TT----
 5 | >C1.TEST
 6 | AAATTT--------GGTTTT
 7 | >C2.TEST
 8 | AAATTTAATGACGTGGGGGG
 9 | >D1.TEST
10 | --------------------
11 | 


--------------------------------------------------------------------------------
/test_files/terminal_gaps.gapfix.fasta:
--------------------------------------------------------------------------------
 1 | >A1.TEST
 2 | NNNNNTTATA--GGTTGNNN
 3 | >B1.TEST
 4 | NNNTTTTATACTG-TTNNNN
 5 | >C1.TEST
 6 | AAATTT--------GGTTTT
 7 | >C2.TEST
 8 | AAATTTAATGACGTGGGGGG
 9 | >D1.TEST
10 | NNNNNNNNNNNNNNNNNNNN
11 | 


--------------------------------------------------------------------------------
/test_files/variable_length.fas:
--------------------------------------------------------------------------------
 1 | >ka001
 2 | AAAAAAAAAAAAAAAAAAAA
 3 | >ka002
 4 | AAAAAA
 5 | >ka003
 6 | AAAAGAGAGAGAGGAGAGAGAGGAGAGAGAGAG
 7 | >ka004
 8 | AGAGAGAGGAGAGAGGAGAGAGAG
 9 | >ka005
10 | AGAGAGAGGAGAGAGGAGAGAGAG
11 | >ka006
12 | AGAGAGAGGAGAGAGGAGAGAGAG
13 | >ka007
14 | AGAGAGAGGAGAGAGGAGAGAGAG
15 | >ka008
16 | AGAGAG
17 | >ka009
18 | AAGA


--------------------------------------------------------------------------------
/test_files/variable_length.fas.filter:
--------------------------------------------------------------------------------
 1 | >ka003
 2 | AAAAGAGAGAGAGGAGAGAGAGGAGAGAGAGAG
 3 | >ka004
 4 | AGAGAGAGGAGAGAGGAGAGAGAG
 5 | >ka005
 6 | AGAGAGAGGAGAGAGGAGAGAGAG
 7 | >ka006
 8 | AGAGAGAGGAGAGAGGAGAGAGAG
 9 | >ka007
10 | AGAGAGAGGAGAGAGGAGAGAGAG
11 | 


--------------------------------------------------------------------------------
/traitsList2LagrangePhylip.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | 
  7 | def main():
  8 | 	params = parseArgs()
  9 | 	
 10 | 	traits=set()
 11 | 	samples=dict()
 12 | 	map=dict()
 13 | 	
 14 | 	if params.map:
 15 | 		with open(params.map, "r") as m:
 16 | 			for line in m:
 17 | 				line=line.strip()
 18 | 				if len(line)==0:
 19 | 					continue
 20 | 				stuff=line.split()
 21 | 				if len(stuff) >2: 
 22 | 					print("ERROR: Too many elements --",line)
 23 | 				else:
 24 | 					if stuff[0] in map.keys():
 25 | 						map[stuff[0]].append(stuff[1])
 26 | 					else:
 27 | 						map[stuff[0]]=list()
 28 | 						map[stuff[0]].append(stuff[1])
 29 | 
 30 | 	with open(params.tab, "r") as t:
 31 | 		for line in t:
 32 | 			line=line.strip()
 33 | 			if len(line) == 0:
 34 | 				continue
 35 | 			stuff=line.split("\t")
 36 | 			if len(stuff) >2: 
 37 | 				print("ERROR: Too many elements --",line)
 38 | 			else:
 39 | 				samples[stuff[0]]=set()
 40 | 				#print(stuff[0])
 41 | 				#print(stuff)
 42 | 				if len(stuff) >1:
 43 | 					splitstuff=stuff[1].split(",")
 44 | 					for s in splitstuff:
 45 | 						loc=s
 46 | 						if params.map and s in map.keys():
 47 | 							loc=map[s]
 48 | 							for l in loc:
 49 | 								samples[stuff[0]].add(l)
 50 | 								traits.add(l)
 51 | 							continue
 52 | 						else:
 53 | 							samples[stuff[0]].add(loc)
 54 | 							traits.add(loc)
 55 | 				#print(samples)
 56 | 				#sys.exit()
 57 | 		t.close()
 58 | 	#print(traits)
 59 | 	#sys.exit()
 60 | 
 61 | 
 62 | 	trlen=len(traits)
 63 | 	slen=len(samples)
 64 | 	output=""
 65 | 	rep=False
 66 | 	for samp in samples:
 67 | 		#print(samples[samp])
 68 | 		oline = str(samp) + "\t"
 69 | 		#if no traits, report and skip
 70 | 		#print(samples[samp])
 71 | 		if len(samples[samp]) < 1:
 72 | 			if not rep:
 73 | 				rep=True
 74 | 				print("Samples were found without any trait data. Skipping samples:")
 75 | 			print(samp)
 76 | 			slen-=1
 77 | 			continue
 78 | 		else:
 79 | 			for t in traits:
 80 | 				if t in samples[samp]:
 81 | 					oline = oline + "1"
 82 | 				else:
 83 | 					oline = oline + "0"
 84 | 			oline+="\n"
 85 | 			output = output+oline
 86 | 		#sys.exit()
 87 | 	#print(output)
 88 | 	#write lagrange phylip file 
 89 | 	with open(params.out, "w") as ofh:
 90 | 		header=str(slen) + "\t" + str(trlen) + "\t(" + str(" ".join(traits)) + ")\n"
 91 | 		print("Traits output in this order:")
 92 | 		print(str(", ".join(traits)))
 93 | 		ofh.write(header)
 94 | 		ofh.write(output)
 95 | 		ofh.close()
 96 | 	
 97 | 
 98 | #Object to parse command-line arguments
 99 | class parseArgs():
100 | 	def __init__(self):
101 | 		#Define options
102 | 		try:
103 | 			options, remainder = getopt.getopt(sys.argv[1:], 'ht:o:m:', \
104 | 			["help"])
105 | 		except getopt.GetoptError as err:
106 | 			print(err)
107 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
108 | 		#Default values for params
109 | 		#Input params
110 | 		self.tab=None
111 | 		self.out="out.phy"
112 | 		self.map=None
113 | 
114 | 
115 | 		#First pass to see if help menu was called
116 | 		for o, a in options:
117 | 			if o in ("-h", "-help", "--help"):
118 | 				self.display_help("Exiting because help menu was called.")
119 | 
120 | 		#Second pass to set all args.
121 | 		for opt, arg_raw in options:
122 | 			arg = arg_raw.replace(" ","")
123 | 			arg = arg.strip()
124 | 			opt = opt.replace("-","")
125 | 			#print(opt,arg)
126 | 			if opt == "h" or opt == "help":
127 | 				continue
128 | 			elif opt == "t":
129 | 				self.tab=arg
130 | 			elif opt=="o":
131 | 				self.out=arg
132 | 			elif opt=="m":
133 | 				self.map=arg
134 | 			else:
135 | 				assert False, "Unhandled option %r"%opt
136 | 
137 | 		#Check manditory options are set
138 | 		if not self.tab:
139 | 			self.display_help("No table provided.")
140 | 
141 | 
142 | 
143 | 	def display_help(self, message=None):
144 | 		if message is not None:
145 | 			print()
146 | 			print (message)
147 | 		print ("\ntraitsList2LagrangePhylip.py\n")
148 | 		print("Author: Tyler K Chafin, University of Arkansas")
149 | 		print ("Contact: tkchafin@uark.edu")
150 | 		print ("Description: Converts table of the form Sample \t Trait,Trait,Trait to phylip 0/1 format, for LAGRANGE of BioGeoBEARS")
151 | 		print("""
152 | 		-t:	Tab-delimited trait table
153 | 		-m: Option tab-delimited map grouping trait names
154 | 		-o: Output file name [default=out.phy]
155 | """)
156 | 		print()
157 | 		sys.exit()
158 | 
159 | #Call main function
160 | if __name__ == '__main__':
161 |     main()
162 | 


--------------------------------------------------------------------------------
/treeAlignment_subsetter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import toytree as tt
  7 | import random
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	seqs=dict()	
 13 | 	for f in read_phylip(params.phylip):
 14 |                         seqs[f[0]] = f[1]
 15 | 
 16 | 	tree=tt.tree(params.tree, tree_format=0)
 17 | 
 18 | 	if not params.samples:
 19 | 		params.samples=int(params.freq*len(list(seqs.keys())))
 20 | 	
 21 | 	print("Generating",params.reps,"random subsets of",params.samples,"samples eac")
 22 | 
 23 | 	for r in range(params.reps):
 24 | 		print("starting replicate",str(r))
 25 | 		prefix=params.out + "_" + str(r)
 26 | 		print("subsetting alignment")
 27 | 		keeps=dict(random.sample(seqs.items(), params.samples))
 28 | 		bad_bois=[k for k in seqs.keys() if k not in keeps]
 29 | 		print("subsetting tree")
 30 | 		stree = tree.drop_tips(names=bad_bois)
 31 | 		print("writing subset files")
 32 | 		write_phylip(prefix+".phylip",keeps)
 33 | 		stree.write(prefix+".tre", tree_format=0)
 34 | 
 35 | 
 36 | #Print dict to phylip file
 37 | def write_phylip(p, aln):
 38 |         with open(p, 'w') as fh:
 39 |                 try:
 40 |                         header = getPhylipHeader(aln) + "\n"
 41 |                         fh.write(header)
 42 | 
 43 |                         for sample in aln.keys():
 44 |                                 line = str(sample) + "\t" + "".join(aln[sample]) + "\n"
 45 |                                 fh.write(line)
 46 |                 except IOError as e:
 47 |                         print("Could not read file %s: %s"%(p,e))
 48 |                         sys.exit(1)
 49 |                 except Exception as e:
 50 |                         print("Unexpected error reading file %s: %s"%(p,e))
 51 |                         sys.exit(1)
 52 |                 finally:
 53 |                         fh.close()	
 54 | 
 55 | #Returns header for Phylip file from a dictionary of samples w/ data
 56 | def getPhylipHeader(d):
 57 |         numSamp = 0
 58 |         numLoci = None
 59 |         for sample in d:
 60 |                 numSamp = numSamp + 1
 61 |                 if not numLoci:
 62 |                         numLoci = len(d[sample])
 63 |                 else:
 64 |                         if numLoci != len(d[sample]):
 65 |                                 print("getPhylipHeader: Warning: Sequences of unequal length.")
 66 |         header = str(numSamp) + " " + str(numLoci)
 67 |         if numLoci == 0 or not numLoci:
 68 |                 print("getPhylipHeader: Warning: No loci in dictionary.")
 69 |         if numSamp == 0:
 70 |                 print("getPhylipHeader: Warning: No samples in dictionary.")
 71 |         return(header)
 72 | 
 73 | 
 74 | #Read samples as PHYLIP. Generator function
 75 | def read_phylip(phy):
 76 |         if os.path.exists(phy):
 77 |                 with open(phy, 'r') as fh:
 78 |                         try:
 79 |                                 num=0
 80 |                                 for line in fh:
 81 |                                         line = line.strip()
 82 |                                         if not line:
 83 |                                                 continue
 84 |                                         num += 1
 85 |                                         if num == 1:
 86 |                                                 continue
 87 |                                         arr = line.split()
 88 |                                         yield(arr[0], arr[1])
 89 |                         except IOError:
 90 |                                 print("Could not read file ",phy)
 91 |                                 sys.exit(1)
 92 |                         finally:
 93 |                                 fh.close()
 94 |         else:
 95 |                 raise FileNotFoundError("File %s not found!"%phy)
 96 | 
 97 | #Object to parse command-line arguments
 98 | class parseArgs():
 99 | 	def __init__(self):
100 | 		#Define options
101 | 		try:
102 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hs:f:r:t:p:o:m:', \
103 | 			["help", "reps=","tree=","phylip=","out=", "method=", "samples=", "freq="])
104 | 		except getopt.GetoptError as err:
105 | 			print(err)
106 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
107 | 		#Default values for params
108 | 		#Input params
109 | 		self.tree=None
110 | 		self.reps=10
111 | 		self.freq=0.1
112 | 		self.samples=None
113 | 		self.phylip=None
114 | 		self.method="random"
115 | 		self.out="subset"
116 | 
117 | 
118 | 		#First pass to see if help menu was called
119 | 		for o, a in options:
120 | 			if o in ("-h", "-help", "--help"):
121 | 				self.display_help("Exiting because help menu was called.")
122 | 
123 | 		#Second pass to set all args.
124 | 		for opt, arg_raw in options:
125 | 			arg = arg_raw.replace(" ","")
126 | 			arg = arg.strip()
127 | 			opt = opt.replace("-","")
128 | 			#print(opt,arg)
129 | 			if opt == "h" or opt == "help":
130 | 				continue
131 | 			elif opt=="tree" or opt=="t":
132 | 				self.tree=arg
133 | 			elif opt=="phylip" or opt=="p":
134 | 				self.phylip=arg
135 | 			elif opt=="method" or opt=="m":
136 | 				self.method=arg
137 | 			elif opt=="reps" or opt=="r":
138 | 				self.reps=int(arg)
139 | 			elif opt=="freq" or opt=="f":
140 | 				self.freq=float(arg)
141 | 			elif opt=="samples" or opt=="s":
142 | 				self.samples=int(arg)
143 | 			elif opt=="out" or opt=="o":
144 | 				self.out=arg
145 | 			else:
146 | 				assert False, "Unhandled option %r"%opt
147 | 
148 | 		#Check manditory options are set
149 | 		if not self.phylip and not self.tree:
150 | 			self.display_help("Must provide input tree (newick) and alignment (phylip) files.")
151 | 
152 | 
153 | 
154 | 	def display_help(self, message=None):
155 | 		if message is not None:
156 | 			print()
157 | 			print (message)
158 | 		print ("\ntreeAlignment_subsetter.py\n")
159 | 		print ("Description: Generate random subsets of an input phylogenetic dataset (tree and alignment)")
160 | 		print("""
161 | 		-t,--tree	: Path to input newick file
162 | 		-p,--phylip	: Path to input phylip file
163 | 		-s,--samples	: Number of samples to keep
164 | 		-f,--freq	: Sampling frequency (must set either -f or -s)
165 | 		-r,--reps	: Number of replicates to generate
166 | 		-o,--out	: Output file name (default=out.fas)
167 | """)
168 | 		print()
169 | 		sys.exit()
170 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
174 | 


--------------------------------------------------------------------------------
/treeExpansion.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import getopt
  7 | 
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	popsList = dict()
 13 | 	#parse popmap file for dictionary of sample assignments
 14 | 	if params.popmap:
 15 | 		#print("Parsing popmap file...")
 16 | 		popsList = parsePopmap_alt(params.popmap)
 17 | 
 18 | 		if not params.tree:
 19 | 
 20 | 			print("ERROR: No tree provided.")
 21 | 			sys.exit(1)
 22 | 
 23 | 
 24 | 		newtree = params.tree
 25 | 		for pop in popsList:
 26 | 			replace = ", ".join(popsList[pop])
 27 | 			newtree = newtree.replace(str(pop), str(replace))
 28 | 		print(newtree)
 29 | 
 30 | 
 31 | 	else:
 32 | 		print("ERROR: Popmap file must be provided.")
 33 | 		sys.exit(1)
 34 | 
 35 | 
#function reads a whitespace-delimited (e.g. tab) popmap file: sample name, then population
#function returns dict of pops, each pointing to a list of its taxa
 38 | def parsePopmap_alt(popmap):
 39 | 	ret = dict()
 40 | 	with open(popmap, 'r') as fh:
 41 | 		try:
 42 | 			contig = ""
 43 | 			seq = ""
 44 | 			for line in fh:
 45 | 				line = line.strip()
 46 | 				if not line:
 47 | 					continue
 48 | 				else:
 49 | 					stuff = line.split()
 50 | 					if len(stuff)!= 2:
 51 | 						print("Uh oh! Record missing a field: ",stuff)
 52 | 						continue
 53 | 					if stuff[1] not in ret:
 54 | 						l = list()
 55 | 						l.append(stuff[0])
 56 | 						ret[stuff[1]] = l
 57 | 					else:
 58 | 						ret[stuff[1]].append(stuff[0])
 59 | 			return(ret)
 60 | 		except IOError as e:
 61 | 			print("Could not read file %s: %s"%(popmap,e))
 62 | 			sys.exit(1)
 63 | 		except Exception as e:
 64 | 			print("Unexpected error reading file %s: %s"%(popmap,e))
 65 | 			sys.exit(1)
 66 | 		finally:
 67 | 			fh.close()
 68 | 
 69 | #function returns first readable line from a file
 70 | #good for getting headers etc
 71 | def firstLine(f):
 72 | 	with open(f, 'r') as fh:
 73 | 		try:
 74 | 			for line in fh:
 75 | 				line = line.strip()
 76 | 				if not line:
 77 | 					continue
 78 | 				else:
 79 | 					return(line) #returns first real line
 80 | 		except IOError as e:
 81 | 			print("Could not read file %s: %s"%(f,e))
 82 | 			sys.exit(1)
 83 | 		except Exception as e:
 84 | 			print("Unexpected error reading file %s: %s"%(f,e))
 85 | 			sys.exit(1)
 86 | 		finally:
 87 | 			fh.close()
 88 | 
 89 | 
 90 | #Object to parse command-line arguments
 91 | class parseArgs():
 92 | 	def __init__(self):
 93 | 		#Define options
 94 | 		try:
 95 | 			options, remainder = getopt.getopt(sys.argv[1:], 't:s:p:h', \
 96 | 			["tree=","popmap="])
 97 | 		except getopt.GetoptError as err:
 98 | 			print(err)
 99 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
100 | 		#Default values for params
101 | 		#Input params
102 | 		self.tree=None
103 | 		self.popmap=None
104 | 
105 | 
106 | 		#First pass to see if help menu was called
107 | 		for o, a in options:
108 | 			if o in ("-h", "-help", "--help"):
109 | 				self.display_help("Exiting because help menu was called.")
110 | 
111 | 		#Second pass to set all args.
112 | 		for opt, arg_raw in options:
113 | 			arg = arg_raw.replace(" ","")
114 | 			arg = arg.strip()
115 | 			opt = opt.replace("-","")
116 | 			#print(opt,arg)
117 | 			if opt in ('t','tree'):
118 | 				self.tree = firstLine(arg)
119 | 			elif opt in ('p', 'popmap'):
120 | 				self.popmap = arg
121 | 			elif opt in ('h', 'help'):
122 | 				pass
123 | 			else:
124 | 				assert False, "Unhandled option %r"%opt
125 | 
126 | 		#Check manditory options are set
127 | 		if not self.tree:
128 | 			self.display_help("Error: Missing required tree (--tree or --stree)")
129 | 		if not self.popmap:
130 | 			self.display_help("Error: Missing required popmap file (-p, --popmap)")
131 | 
132 | 
133 | 	def display_help(self, message=None):
134 | 		if message is not None:
135 | 			print ("\n",message)
136 | 		print ("\ntreeExpansion.py\n")
137 | 		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
138 | 		print ("Description: Expands Newick tree of clades to include all taxa in a popmap file")
139 | 
140 | 		print("""
141 | 	Arguments:
142 | 		-p,--popmap	: Tab-delimited population map
143 | 		-t,--tree	: Newick tree in a file
144 | 		    or
145 | 		-s,--stree	: Newick tree given as a string
146 | 		-h,--help	: Displays help menu
147 | 
148 | """)
149 | 		sys.exit()
150 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
154 | 


--------------------------------------------------------------------------------
/trimFastq.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "\nUsage: $0 input.fq 5-prime-bp 3-prime-bp
 7 | 
 8 |      This script removes a specified number of bp from the 5' and 3' ends of each sequence in the fastq file\n\n";
 9 | 
10 | # print "1: $ARGV[1]\n";
11 | # print "2: $ARGV[2]\n";
12 | 
13 | defined $ARGV[2] or die $usage;
14 | 
15 | my $begTrim = $ARGV[1];
16 | my $endTrim = $ARGV[2];
17 | 
18 | open( FAS, $ARGV[0] ) || die "Couldn't open $ARGV[0]: $!\n";
19 | 
20 | while( my $line = <FAS>){
21 |   if( $line =~ /\A@/ ){
22 |     print $line and next; # skip headers
23 |   }elsif( $line =~ /\A\+/ ){
24 |     print $line and next; # skip "+" line
25 |   }else{
26 |     my $len = length $line;
27 |     print substr( $line, 0 + $begTrim, $len - $endTrim - $begTrim - 1 ), "\n";
28 |   }
29 | }
30 | 
31 | close FAS;
32 | 
33 | exit;
34 | 


--------------------------------------------------------------------------------
/utm2latlong.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import utm
  7 | 
  8 | def main():
  9 | 	params = parseArgs()
 10 | 
 11 | 	if params.utm:
 12 | 		for line in readTSV(params.utm):
 13 | 			if params.zone and params.hemi:
 14 | 				coords = utm.to_latlon(float(line[1]), float(line[2]), params.zone, params.hemi)
 15 | 				oline=str(line[0])+"\t"+str(coords[0])+"\t"+str(coords[1])
 16 | 				print(oline)
 17 | 			elif params.inline:
 18 | 				z=line[3][:-1]
 19 | 				h=line[3][-1]
 20 | 				coords = utm.to_latlon(float(line[1]), float(line[2]), int(z), h)
 21 | 				oline=str(line[0])+"\t"+str(coords[0])+"\t"+str(coords[1])
 22 | 				print(oline)
 23 | 			else:
 24 | 				params.display_help("No UTM zone information provided.")
 25 | 	elif params.latlong:
 26 | 		for line in readTSV(params.latlong):
 27 | 			coords = utm.from_latlon(float(line[1]), float(line[2]))
 28 | 			oline=str(line[0])+"\t"+str(coords[0])+"\t"+str(coords[1]) + "\t" +str(coords[2])+str(coords[3])
 29 | 			print(oline)
 30 | 	else:
 31 | 		params.display_help("No input provided")
 32 | 
 33 | 
 34 | #generator function, reads tsv line by line
 35 | def readTSV(tab):
 36 | 	with open(tab, 'r') as fh:
 37 | 		try:
 38 | 			for line in fh:
 39 | 				line = line.strip()
 40 | 				if not line:
 41 | 					continue
 42 | 				yield(line.split())
 43 | 		except IOError:
 44 | 			print("Could not read file ",tab)
 45 | 			sys.exit(1)
 46 | 		finally:
 47 | 			fh.close()
 48 | 
 49 | #Object to parse command-line arguments
 50 | class parseArgs():
 51 | 	def __init__(self):
 52 | 		#Define options
 53 | 		try:
 54 | 			options, remainder = getopt.getopt(sys.argv[1:], 'hu:c:z:l:i', \
 55 | 			["help"])
 56 | 		except getopt.GetoptError as err:
 57 | 			print(err)
 58 | 			self.display_help("\nExiting because getopt returned non-zero exit status.")
 59 | 		#Default values for params
 60 | 		#Input params
 61 | 		self.utm=None
 62 | 		self.latlong=None
 63 | 		self.zone=None
 64 | 		self.hemi=None
 65 | 		self.inline=False
 66 | 
 67 | 
 68 | 		#First pass to see if help menu was called
 69 | 		for o, a in options:
 70 | 			if o in ("-h", "-help", "--help"):
 71 | 				self.display_help("Exiting because help menu was called.")
 72 | 
 73 | 		#Second pass to set all args.
 74 | 		for opt, arg_raw in options:
 75 | 			arg = arg_raw.replace(" ","")
 76 | 			arg = arg.strip()
 77 | 			opt = opt.replace("-","")
 78 | 			#print(opt,arg)
 79 | 			if opt == "h" or opt == "help":
 80 | 				continue
 81 | 			elif opt == "c":
 82 | 				self.latlong=arg
 83 | 			elif opt=="l":
 84 | 				self.hemi=arg
 85 | 			elif opt == "z":
 86 | 				self.zone=int(arg)
 87 | 			elif opt=="u":
 88 | 				self.utm=arg
 89 | 			elif opt=="i":
 90 | 				self.inline=True
 91 | 			else:
 92 | 				assert False, "Unhandled option %r"%opt
 93 | 
 94 | 		#Check manditory options are set
 95 | 		if not self.utm and not self.latlong:
 96 | 			self.display_help("No input file provided (must be one of: <-u> or <-c>)")
 97 | 		if self.utm and self.latlong:
 98 | 			self.display_help("Options not compatible: <-u> <-c>")
 99 | 		if self.utm:
100 | 			if not self.zone and not self.hemi and not self.inline:
101 | 				self.display_help("Must provide zone number <-z> and letter <-l> with UTMs or as inline <-i>")
102 | 			if self.zone and self.inline:
103 | 				self.display_help("Options not compatible: <-i> <-z>")
104 | 
105 | 
106 | 
107 | 	def display_help(self, message=None):
108 | 		if message is not None:
109 | 			print()
110 | 			print (message)
111 | 		print ("\n<template.py>\n")
112 | 		print("Author: Tyler K Chafin, University of Arkansas")
113 | 		print ("Contact: tkchafin@uark.edu")
114 | 		print ("Description: ")
115 | 		print("""
116 | 		Arguments
117 | 		-u	: Tab-delimited table of UTM coordinates (2nd col = Easting; 3rd col = Northing)
118 | 		  -or-
119 | 		-c	: Tab-delimited table of lat/long coordinates (2nd col = lat; 3rd col= long)
120 | 		-z	: If converting UTM to lat/long, provide zone number here (e.g. "15")
121 | 		-l	: If converting UTM to lat/long, provide zone letter here (e.g. "N")
122 | 		-i	: If converting from UTMs, zone can be as 4th column (e.g. "15S")
123 | 
124 | """)
125 | 		print()
126 | 		sys.exit()
127 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
131 | 


--------------------------------------------------------------------------------
/vcf2phylip.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import re
  4 | import sys
  5 | import os
  6 | import vcf
  7 | import getopt
  8 | 
  9 | def main():
 10 | 	params = parseArgs()
 11 | 
 12 | 	data = dict()
 13 | 
 14 | 	if params.vcf:
 15 | 		#for each record in VCF
 16 | 		for record in read_vcf(params.vcf):
 17 | 			for call in record.samples:
 18 | 				#Get consensus base call
 19 | 				cons = None
 20 | 				if call.gt_bases:
 21 | 					l = (call.gt_bases).split("/")
 22 | 					cons = reverse_iupac(listToSortUniqueString(l))
 23 | 				else:
 24 | 					cons = "N"
 25 | 				if cons:
 26 | 					if call.sample in data:
 27 | 						data[call.sample].append(cons)
 28 | 					else:
 29 | 						data[call.sample] = list()
 30 | 						data[call.sample].append(cons)
 31 | 				else:
 32 | 					print ("Uh oh! No consensus called for %s, something is wrong"%call)
 33 | 
 34 | 		#Print dict to phylip file
 35 | 		with open(params.out, 'w') as fh:
 36 | 			try:
 37 | 				header = getPhylipHeader(data) + "\n"
 38 | 				fh.write(header)
 39 | 
 40 | 				for sample in data:
 41 | 					line = str(sample) + "\t" + "".join(data[sample]) + "\n"
 42 | 					fh.write(line)
 43 | 			except IOError:
 44 | 				print("Could not write to file ",params.out)
 45 | 				sys.exit(1)
 46 | 			finally:
 47 | 				fh.close()
 48 | 
 49 | 	else:
 50 | 		print("Error: No VCF file provided")
 51 | 		sys.exit(1)
 52 | 
 53 | #Returns header for Phylip file from a dictionary of samples w/ data
 54 | def getPhylipHeader(d):
 55 | 	numSamp = 0
 56 | 	numLoci = None
 57 | 	for sample in d:
 58 | 		numSamp = numSamp + 1
 59 | 		if not numLoci:
 60 | 			numLoci = len(d[sample])
 61 | 		else:
 62 | 			if numLoci != len(d[sample]):
 63 | 				print("getPhylipHeader: Warning: Sequences of unequal length.")
 64 | 	header = str(numSamp) + " " + str(numLoci)
 65 | 	if numLoci == 0 or not numLoci:
 66 | 		print("getPhylipHeader: Warning: No loci in dictionary.")
 67 | 	if numSamp == 0:
 68 | 		print("getPhylipHeader: Warning: No samples in dictionary.")
 69 | 	return(header)
 70 | 
 71 | #Read VCF variant calls
 72 | #Generator function, yields each locus
 73 | def read_vcf(v):
 74 | 
 75 | 	try:
 76 | 		vfh = vcf.Reader(filename=v)
 77 | 	except IOError as err:
 78 | 		print("I/O error({0}): {1}".format(err.errno, err.strerror))
 79 | 	except:
 80 | 		print("Unexpected error:", sys.exec_info()[0])
 81 | 
 82 | 	chrom = ""
 83 | 	recs = []
 84 | 	added = 0
 85 | 	for rec in vfh:
 86 | 		if not rec.FILTER:
 87 | 			yield(rec)
 88 | 
 89 | #Function to return sorted unique string from list of chars
 90 | def listToSortUniqueString(l):
 91 | 	sl = sorted(set(l))
 92 | 	return(str(''.join(sl)))
 93 | 
 94 | #Function to translate a string of bases to an iupac ambiguity code
 95 | def reverse_iupac(char):
 96 | 	char = char.upper()
 97 | 	if "-" in char:
 98 | 		return("-")
 99 | 	elif "N" in char:
100 | 		return("N")
101 | 	elif "." in char:
102 | 		return(".")
103 | 	else:
104 | 		iupac = {
105 | 			'A':'A',
106 | 			'N':'N',
107 | 			'-':'-',
108 | 			'C':'C',
109 | 			'G':'G',
110 | 			'T':'T',
111 | 			'AG':'R',
112 | 			'CT':'Y',
113 | 			'AC':'M',
114 | 			'GT':'K',
115 | 			'AT':'W',
116 | 			'CG':'S',
117 | 			'CGT':'B',
118 | 			'AGT':'D',
119 | 			'ACT':'H',
120 | 			'ACG':'V',
121 | 			'ACGT':'N'
122 | 		}
123 | 		return iupac[char]
124 | 
125 | #Object to parse command-line arguments
class parseArgs():
	"""Parse the command line of vcf2phylip.py into attributes.

	Attributes:
		vcf -- value of -v/--vcf (required)
		out -- output file name: the -o/--out prefix plus ".phy", or
		       "out.phy" when no prefix is given

	Constructing the object reads sys.argv. Invalid or missing options print
	the help menu and exit with status 1; -h/--help prints it and exits with
	status 0.
	"""
	def __init__(self):
		#Define options
		try:
			options, remainder = getopt.getopt(sys.argv[1:], 'v:o:h', \
			["vcf=","help","out="])
		except getopt.GetoptError as err:
			print(err)
			self.display_help("\nExiting because getopt returned non-zero exit status.", status=1)
		#Default values for params
		#Input params
		self.vcf=None
		self.out=None

		#Help takes priority over every other option
		if any(o in ("-h", "--help") for o, _ in options):
			self.display_help("Exiting because help menu was called.")

		#Set all args.
		for opt, arg_raw in options:
			#Only trim surrounding whitespace: deleting inner spaces would
			#corrupt file paths that contain them
			arg = arg_raw.strip()
			opt = opt.replace("-","")
			if opt in ('v', 'vcf'):
				self.vcf = arg
			elif opt in ('h', 'help'):
				pass
			elif opt in ('o','out'):
				self.out = arg
			else:
				#Explicit raise (not assert) so the check survives python -O
				raise AssertionError("Unhandled option %r"%opt)

		#Check mandatory options are set
		if not self.vcf:
			self.display_help("\nError: Missing required input file <-v,--vcf>", status=1)

		#The output option is a prefix; the extension is always added here
		if self.out:
			self.out = self.out + ".phy"
		else:
			self.out = "out.phy"


	def display_help(self, message=None, status=0):
		"""Print message (if given) and the help menu, then exit with status."""
		if message is not None:
			print (message)
		print ("\nvcf2phylip.py\n")
		print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
		print ("\nUsage: ", sys.argv[0], "-v </path/to/vcf>\n")
		print ("Description: Extract SNPs from a VCF file and outputs as concatenated Phylip")

		print("""
	Arguments:
		-v,--vcf	: VCF input file
		-o,--out	: Prefix for output file <default = ./out>
		-h,--help	: Displays help menu

""")
		sys.exit(status)
187 | 
#Run main() only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
191 | 


--------------------------------------------------------------------------------