├── pyrad
│   ├── __init__.py
│   ├── potpour.py
│   ├── loci2gphocs.py
│   ├── loci2mig.py
│   ├── loci2vcf.py
│   ├── createfile.py
│   ├── loci2treemix.py
│   ├── tier2clust.py
│   ├── loci2phynex.py
│   ├── loci2SNP.py
│   ├── overlapcheck.py
│   ├── editraw_rads.py
│   ├── editraw_merges.py
│   ├── H_err_dp.py
│   ├── editraw_pairs.py
│   ├── cluster_cons7_shuf.py
│   ├── Dtest.py
│   ├── sortandcheck2.py
│   ├── Dtest_5.py
│   ├── consensdp.py
│   ├── consens_pairs.py
│   └── Dtest_foil.py
├── tox.ini
├── .gitignore
├── setup.py
└── README.rst

/pyrad/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist=py27
3 | 
4 | [testenv]
5 | commands=py.test pyrad
6 | deps=pytest
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | 
3 | # C extensions
4 | *.so
5 | 
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 | 
22 | # Installer logs
23 | pip-log.txt
24 | 
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | requirements = [
4 |     'numpy',
5 |     'scipy',
6 | ]
7 | 
8 | setuptools.setup(
9 |     name="pyrad",
10 |     version="3.0.66",
11 |     url="https://github.com/dereneaton/pyrad",
12 | 
13 |     author="Deren Eaton",
14 |     author_email="deren.eaton@yale.edu",
15 | 
16 |     description="Assembly and analysis of RADseq data sets",
17 |     long_description=open('README.rst').read(),
18 | 
19 |     packages=setuptools.find_packages(),
20 | 
21 |     install_requires=requirements,
22 | 
23 | 
24 |     entry_points={
25 |         'console_scripts': [
26 |             'pyrad = pyrad.pyRAD:main',
27 |         ],
28 |     },
29 | 
30 |     license='GPL',
31 | 
32 |     classifiers=[
33 |         'Programming Language :: Python',
34 |         'Programming Language :: Python :: 2',
35 |         'Programming Language :: Python :: 2.7',
36 |     ],
37 | )
38 | 
--------------------------------------------------------------------------------
/pyrad/potpour.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | 
4 | 
5 | import multiprocessing
6 | 
7 | 
8 | class Worker(multiprocessing.Process):
9 | 
10 |     def __init__(self, work_queue, result_queue, func):
11 | 
12 |         # base class initialization
13 |         multiprocessing.Process.__init__(self)
14 | 
15 |         # job management stuff
16 |         self.work_queue = work_queue
17 |         self.result_queue = result_queue
18 |         self.kill_received = False
19 |         self.func = func
20 | 
21 |     def run(self):
22 |         while not self.kill_received:
23 |             # get a task
24 |             if self.work_queue.empty():
25 |                 break
26 |             else:
27 |                 #job = self.work_queue.get_nowait()
28 |                 job = self.work_queue.get()
29 | 
30 |             # the actual processing
31 |             res = self.func(*job)
32 | 
33 |             # store the result
34 |             self.result_queue.put(res)
35 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | 
2 | pyrad has now been superseded by *ipyrad*
3 | ==========================================
4 | (http://ipyrad.readthedocs.io) <-- see here
5 | 
6 | All new development now takes place in ipyrad,
7 | and I recommend that pyrad users switch over. The new software
8 | offers huge speed improvements and many new features.
9 | 
10 | 
11 | 
12 | pyrad
13 | =====
14 | 
15 | Assembly and analysis of RADseq data sets
16 | 
17 | 
18 | Tutorials
19 | ---------
20 | 
21 | Detailed information and a number of example tutorials are
22 | available `here `_.
23 | 
24 | 
25 | Downloads
26 | ---------
27 | 
28 | Stable release versions can be downloaded `here `_, or you can clone the current development version using git:
29 | 
30 | ::
31 | 
32 |     git clone https://github.com/dereneaton/pyrad.git
33 | 
34 | 
35 | 
36 | Installation (As of v.3.0.6 and newer)
37 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
38 | With the following you can install pyrad so that it is callable as an executable from anywhere on your machine. Using the pip option will also install the dependencies numpy and scipy:
39 | 
40 | ::
41 | 
42 |     cd pyrad
43 |     sudo pip install .
44 |     pyrad -h
45 | 
46 | Or
47 | 
48 | ::
49 | 
50 |     cd pyrad
51 |     sudo python setup.py install
52 |     pyrad -h
53 | 
54 | Alternatively, without having to install you can simply call pyRAD.py from its location using python:
55 | 
56 | ::
57 | 
58 |     python pyrad/pyrad/pyRAD.py -h
59 | 
60 | 
61 | Python requirements
62 | ^^^^^^^^^^^^^^^^^^^
63 | You will need the following two Python dependencies to run `pyrad`:
64 | 
65 | * numpy
66 | * scipy
67 | 
68 | In addition, you will need the following external programs:
69 | 
70 | * `muscle `_
71 | * `vsearch `_
72 | 
73 | Example usage (see tutorials)
74 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
75 | ::
76 | 
77 |     >>> pyrad -n
78 |     new params.txt file created
79 | 
80 | 
81 |     >>> pyrad -p params.txt
82 | 
83 | 
84 | 
85 | Licence
86 | -------
87 | GPLv3
88 | 
89 | 
90 | Authors
91 | -------
92 | 
93 | `pyrad` was written by `Deren Eaton `_.
94 | 
--------------------------------------------------------------------------------
/pyrad/loci2gphocs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | # pyrad .loci to gphocs format conversion
4 | #
5 | # This is a very simple conversion because the formats are very similar.
6 | #
7 | # Isaac Overcast
8 | # March 21, 2015
9 | 
10 | ## import libraries
11 | import sys, os
12 | 
13 | def make(WORK, outname):
14 | 
15 |     # read in loci file
16 |     infile = open(WORK+"outfiles/"+outname+".loci")
17 |     outfile = open(WORK+"outfiles/"+outname+".gphocs",'w')
18 | 
19 |     ## parse the loci
20 |     ## Each set of reads at a locus is appended with a line
21 |     ## beginning with // and ending with |x, where x is the locus id.
22 |     ## so after this call 'loci' will contain a list
23 |     ## of sets of each read per locus.
24 |     loci = infile.read().strip().split("//")[:-1]
25 | 
26 |     # Print the header, the number of loci in this file
27 |     print >>outfile, len(loci)#-1
28 | 
29 |     # iterate through each locus, print out the header for each locus:
30 |     #
31 |     # Then print the data for each sample in this format:
32 |     #
33 |     for i, loc in enumerate(loci):
34 |         # Separate out each sequence within the loc block. 'sequences'
35 |         # will now be a list of strings containing name/sequence pairs.
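        # For illustration only (the sample names and sequences below are
        # hypothetical, not from any real data set), a .loci block for two
        # samples looks roughly like:
        #
        #   >sample_A     TGCAGTGGTACTT...
        #   >sample_B     TGCAGTGGAACTT...
        #   //                    -       |42
        #
        # The "//" line ends with "|<locus id>", and the '-'/'*' characters on
        # it mark variable sites (loci2SNP.py reads that same annotation line).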
36 | # We select each line in the locus string that starts with ">" 37 | names = [line.split()[0] for line in loc.strip().split("\n") if ">" in line] 38 | sequences = [line.split()[-1] for line in loc.strip().split("\n") if ">" in line] 39 | 40 | # Strips off 'nnnn' separator for paired data 41 | # replaces '-' with 'N' 42 | editsequences = [seq.replace("n","").replace('-','N') for seq in sequences] 43 | sequence_length = len(editsequences[0]) 44 | 45 | # get length of longest name and add 4 spaces 46 | longname = max(map(len,names))+4 47 | 48 | # Print out the header for this locus 49 | print >>outfile, 'locus'+ str(i), len(sequences), str( sequence_length ) 50 | 51 | # Iterate through each sequence read at this locus and write it to the file. 52 | for name,sequence in zip(names,sequences): 53 | # Clean up the sequence data to make gphocs happy. Only accepts UPPER 54 | # case chars for bases, and only accepts 'N' for missing data. Also, 55 | # the .loci format prepends a '>' on to the individual names, so we have 56 | # to clean this up which is what the [1:] is doing. 57 | print >>outfile, name[1:]+" "*(longname-len(name))+sequence 58 | 59 | if __name__ == "__main__": 60 | make(WORK, outfile) 61 | -------------------------------------------------------------------------------- /pyrad/loci2mig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy as np 4 | import sys 5 | import gzip 6 | try: 7 | from collections import OrderedDict 8 | except ImportError: 9 | from ordereddict import OrderedDict 10 | try: 11 | from collections import Counter 12 | except ImportError: 13 | from counter import Counter 14 | import alignable 15 | 16 | def make(WORK, outname, taxadict, minhits, seed): 17 | 18 | ## outfile 19 | outfile = open(WORK+"/outfiles/"+outname+".migrate", 'w') 20 | 21 | ## cleanup taxadict 22 | taxa = OrderedDict() 23 | for group in taxadict: 24 | taxa[group] = [] 25 | for samp in taxadict[group]: 26 | a = samp.split("/")[-1].replace(".consens.gz","") 27 | taxa[group].append(a) 28 | 29 | print "\t data set reduced for group coverage minimums" 30 | for i,j in zip(taxa,minhits): 31 | print "\t ",i, taxa[i], "minimum=",j 32 | 33 | #print taxadict.keys() 34 | 35 | ## filter data to only the loci that have data 36 | ## for at least N individuals in each pop 37 | keep = [] 38 | MINS = zip(taxa.keys(), minhits) 39 | 40 | ## read in data to sample names 41 | loci = open(WORK+"/outfiles/"+outname+".loci",'r').read().strip().split("|")[:-1] 42 | for loc in loci: 43 | samps = [i.split()[0].replace(">","") for i in loc.split("\n") if ">" in i] 44 | ## filter for coverage 45 | GG = [] 46 | for group,mins in MINS: 47 | GG.append( sum([i in samps for i in taxa[group]]) >= int(mins) ) 48 | if all(GG): 49 | keep.append(loc) 50 | 51 | ## print data to file 52 | print >>outfile, len(taxa), len(keep), "( npops nloci for data set", outname+".loci",")" 53 | 54 | ## print all data for each population at a time 55 | done = 0 56 | for group in taxadict: 57 | ## print a list of lengths of each locus 58 | if not done: 59 | loclens = [len(loc.split("\n")[1].split()[-1].replace("x","n").replace("n","")) for loc in keep] 60 | print >>outfile, " ".join(map(str,loclens)) 61 | done += 1 62 | 63 | ## print a list of number of individuals in each locus 64 | indslist = [] 65 | for loc in keep: 66 | samps = [i.split()[0].replace(">","") for i in loc.split("\n") if ">" in i] 67 | inds = sum([i in samps for i in taxa[group]]) 68 | 
indslist.append(inds) 69 | print >>outfile, " ".join(map(str,indslist)), group 70 | 71 | ## print sample id, spaces, and sequence data 72 | #for loc in range(len(keep)): 73 | for loc in range(len(keep)): 74 | seqs = [i.split()[-1] for i in keep[loc].split("\n") if \ 75 | i.split()[0].replace(">","") in taxa[group]] 76 | for i in range(len(seqs)): 77 | print >>outfile, group[0:8]+"_"+str(i)+\ 78 | (" "*(10-len(group[0:8]+"_"+str(i))))+seqs[i].replace("x","n").replace("n","") 79 | 80 | outfile.close() 81 | 82 | 83 | # WORK = "/home/deren/Dropbox/Public/PyRAD_TUTORIALS/tutorial_RAD" 84 | # outname = "c85m4p3" 85 | 86 | # pops = ['pop1','pop2','pop3'] 87 | # samps = [ ["1A0","1B0","1C0","1D0"], 88 | # ["2E0","2F0","2G0","2H0"], 89 | # ["3I0","3J0","3K0","3L0"] ] 90 | 91 | # taxadict = OrderedDict(zip(pops,samps)) 92 | # minhits = [4,4,4] 93 | # seed = 112233 94 | 95 | if __name__ == "__main__": 96 | make(WORK, outname, taxadict, minhits, seed) 97 | -------------------------------------------------------------------------------- /pyrad/loci2vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import time 4 | import numpy as np 5 | import alignable 6 | 7 | 8 | def make(WORK, version, outname, mindepth, names): 9 | outfile = open(WORK+"/outfiles/"+outname+".vcf", 'w') 10 | inloci = WORK+"/outfiles/"+outname+".loci" 11 | names = list(names) 12 | names.sort() 13 | 14 | print >>outfile, "##fileformat=VCFv4.1" 15 | print >>outfile, "##fileDate="+time.strftime("%Y%m%d") 16 | print >>outfile, "##source=pyRAD.v."+str(version) 17 | print >>outfile, "##reference=common_allele_at_each_locus" 18 | print >>outfile, "##INFO=" 19 | print >>outfile, "##INFO=" 20 | print >>outfile, "##INFO=" 21 | print >>outfile, "##INFO=" 22 | print >>outfile, "##FORMAT=" 23 | print >>outfile, "##FORMAT=" 24 | print >>outfile, "##FORMAT=" 25 | print >>outfile, "\t".join(["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO ","FORMAT"]+list(names)) 26 | 27 | loci = open(inloci).read().split("|")[:-1] 28 | snps = 0 29 | vcflist = [] 30 | for locusnumber in range(len(loci)): 31 | samps = [i.split()[0][1:] for i in loci[locusnumber].strip().split("\n") if ">" in i] 32 | loc = np.array([tuple(i.split()[-1]) for i in loci[locusnumber].strip().split("\n") if ">" in i]) 33 | NS = str(len(loc)) 34 | DP = str(mindepth) 35 | for base in range(len(loc.T)): 36 | col = [] 37 | site = list(loc.T[base]) 38 | site = list("".join(site).replace("-","").replace("N","")) 39 | if site: 40 | for bb in site: 41 | if bb in list("RKYSWM"): 42 | col += alignable.unstruct(bb)[0] 43 | col += alignable.unstruct(bb)[1] 44 | else: 45 | col += bb 46 | REF = alignable.most_common([i for i in col if i not in list("-RKYSWMN")]) 47 | ALT = set([i for i in col if (i in list("ATGC-N")) and (i!=REF)]) 48 | if ALT: 49 | snps += 1 50 | GENO = [REF]+list(ALT) 51 | GENOS = [] 52 | for samp in names: 53 | if samp in samps: 54 | idx = samps.index(samp) 55 | f = alignable.unstruct(loc.T[base][idx]) 56 | if ('-' in f) or ('N' in f): 57 | GENOS.append("./.") 58 | else: 59 | GENOS.append(str(GENO.index(f[0]))+"|"+str(GENO.index(f[1]))) 60 | else: 61 | GENOS.append("./.") 62 | vcflist.append("\t".join([`locusnumber+1`, `base+1`, '.', REF, ",".join(ALT), "20", "PASS", 63 | ";".join(["NS="+NS, "DP="+DP]), "GT"]+GENOS)) 64 | if not locusnumber % 1000: 65 | outfile.write( "\n".join(vcflist)+"\n" ) 66 | vcflist = [] 67 | 68 | #print >>outfile, "\t".join([`locusnumber+1`, `base+1`, '.', REF, ",".join(ALT), "20", 
"PASS", 69 | # ";".join(["NS="+NS, "DP="+DP]), "GT"]+GENOS) 70 | 71 | 72 | outfile.write( "\n".join(vcflist) ) 73 | outfile.close() 74 | 75 | if __name__ == "__main__": 76 | make(WORK, version, outname, mindepth, names) 77 | -------------------------------------------------------------------------------- /pyrad/createfile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys 4 | 5 | def main(version): 6 | output = """ 7 | ==** parameter inputs for pyRAD version %s **======================== affected step == 8 | ./ ## 1. Working directory (all) 9 | ./*.fastq.gz ## 2. Loc. of non-demultiplexed files (if not line 18) (s1) 10 | ./*.barcodes ## 3. Loc. of barcode file (if not line 18) (s1) 11 | vsearch ## 4. command (or path) to call vsearch (or usearch) (s3,s6) 12 | muscle ## 5. command (or path) to call muscle (s3,s7) 13 | TGCAG ## 6. Restriction overhang (e.g., C|TGCAG -> TGCAG) (s1,s2) 14 | 2 ## 7. N processors (parallel) (all) 15 | 6 ## 8. Mindepth: min coverage for a cluster (s4,s5) 16 | 4 ## 9. NQual: max # sites with qual < 20 (or see line 20)(s2) 17 | .88 ## 10. Wclust: clustering threshold as a decimal (s3,s6) 18 | rad ## 11. Datatype: rad,gbs,pairgbs,pairddrad,(others:see docs)(all) 19 | 4 ## 12. MinCov: min samples in a final locus (s7) 20 | 3 ## 13. MaxSH: max inds with shared hetero site (s7) 21 | c88d6m4p3 ## 14. Prefix name for final output (no spaces) (s7) 22 | ==== optional params below this line =================================== affected step == 23 | ## 15.opt.: select subset (prefix* only selector) (s2-s7) 24 | ## 16.opt.: add-on (outgroup) taxa (list or prefix*) (s6,s7) 25 | ## 17.opt.: exclude taxa (list or prefix*) (s7) 26 | ## 18.opt.: loc. of de-multiplexed data (s2) 27 | ## 19.opt.: maxM: N mismatches in barcodes (def= 1) (s1) 28 | ## 20.opt.: phred Qscore offset (def= 33) (s2) 29 | ## 21.opt.: filter: def=0=NQual 1=NQual+adapters. 2=strict (s2) 30 | ## 22.opt.: a priori E,H (def= 0.001,0.01, if not estimated) (s5) 31 | ## 23.opt.: maxN: max Ns in a cons seq (def=5) (s5) 32 | ## 24.opt.: maxH: max heterozyg. sites in cons seq (def=5) (s5) 33 | ## 25.opt.: ploidy: max alleles in cons seq (def=2;see docs) (s4,s5) 34 | ## 26.opt.: maxSNPs: (def=100). Paired (def=100,100) (s7) 35 | ## 27.opt.: maxIndels: within-clust,across-clust (def. 3,99) (s3,s7) 36 | ## 28.opt.: random number seed (def. 112233) (s3,s6,s7) 37 | ## 29.opt.: trim overhang left,right on final loci, def(0,0) (s7) 38 | ## 30.opt.: output formats: p,n,a,s,v,u,t,m,k,g,* (see docs) (s7) 39 | ## 31.opt.: maj. 
base call at depth>x>sys.stderr, "\tnew params.txt file created" 49 | print >>outfile, "\n".join(output.split("\n")[1:]) 50 | 51 | -------------------------------------------------------------------------------- /pyrad/loci2treemix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy as np 4 | import sys 5 | import gzip 6 | try: 7 | from collections import OrderedDict 8 | except ImportError: 9 | from ordereddict import OrderedDict 10 | try: 11 | from collections import Counter 12 | except ImportError: 13 | from counter import Counter 14 | import alignable 15 | 16 | 17 | def make(WORK, outname, taxadict, minhits): 18 | 19 | ## output files 20 | outfile = gzip.open(WORK+"/outfiles/"+outname+".treemix.gz",'w') 21 | 22 | ## cleanup taxadict to just sample names 23 | taxa = OrderedDict() 24 | for group in taxadict: 25 | taxa[group] = [] 26 | for samp in taxadict[group]: 27 | a = samp.split("/")[-1].replace(".consens.gz","") 28 | taxa[group].append(a) 29 | 30 | print "\t data set reduced for group coverage minimums" 31 | for i,j in zip(taxa,minhits): 32 | print "\t ",i, taxa[i], 'minimum=',j 33 | 34 | ## read in data from unlinked_snps to sample names 35 | infile = open(WORK.rstrip("/")+"/outfiles/"+outname+".unlinked_snps",'r') 36 | dat = infile.readlines() 37 | nsamp,nsnps = dat[0].strip().split(" ") 38 | nsamp = int(nsamp) 39 | nsnps = int(nsnps) 40 | NDATA = np.empty([int(nsamp),int(nsnps)],dtype='object') 41 | excludes = 0 42 | 43 | ## read SNP matrix into a numpy.array 44 | for line in range(len(dat[1:])): 45 | a,b = dat[1:][line].split() 46 | NDATA[line] = list(b) 47 | sites = np.transpose(NDATA) 48 | 49 | ## unpack ambiguity bases and find two most common alleles 50 | ## at every SNP site, save to a list 51 | alleles = [] 52 | for site in sites: 53 | ds = [] 54 | for s in site: 55 | if s in list("RKSYWM"): 56 | ds.append(alignable.unstruct(s)[0]) 57 | ds.append(alignable.unstruct(s)[1]) 58 | else: 59 | ds.append(s) 60 | ds.append(s) 61 | snp = [s for s in ds if s not in ["N",'-']] 62 | a = Counter(snp).most_common(3) 63 | alleles.append([a[0][0],a[1][0]]) 64 | 65 | ## create a dictionary mapping sample names to SNPs 66 | SNPS = OrderedDict() 67 | for line in dat[1:]: 68 | a,b = line.split() 69 | SNPS[a] = b 70 | 71 | ## reduce Taxa dict to only samples that are in the unlinkedsnps alignment 72 | for key in taxa: 73 | replacement = [] 74 | for val in taxa[key]: 75 | if val in SNPS.keys(): 76 | replacement.append(val) 77 | taxa[key] = replacement 78 | 79 | ## create a dictionary with empty lists for each taxon 80 | FREQ = OrderedDict() 81 | for tax in taxa: 82 | FREQ[tax] = [] 83 | 84 | ## fill the FREQ dictionary with SNPs for all 85 | ## samples in that taxon 86 | keeps = [] 87 | for snp in range(int(nsnps)): 88 | GG = [] 89 | ## if snp meets minhits requirement 90 | for tax,mins in zip(taxa,minhits): 91 | GG.append( sum([SNPS[i][snp] not in ["N","-"] for i in taxa[tax]]) >= int(mins)) 92 | if all(GG): 93 | keeps.append(snp) 94 | 95 | 96 | for keep in keeps: 97 | for tax in FREQ: 98 | bunch = [] 99 | for i in taxa[tax]: 100 | bunch.append(alignable.unstruct(SNPS[i][keep])[0]) 101 | bunch.append(alignable.unstruct(SNPS[i][keep])[1]) 102 | #print tax, i, SNPS[i][keep], bunch 103 | FREQ[tax].append("".join(bunch)) 104 | 105 | ## header 106 | print >>outfile, " ".join(FREQ.keys()) 107 | 108 | ## data to file 109 | for i,j in enumerate(keeps): 110 | a1 = alleles[j][0] 111 | a2 = alleles[j][1] 112 | H = 
[str(FREQ[tax][i].count(a1))+","+str(FREQ[tax][i].count(a2)) for tax in FREQ] 113 | HH = " ".join(H) 114 | 115 | ## exclude non-biallelic SNPs 116 | if " 0,0 " not in HH: 117 | ## exclude invariable sites given this sampling 118 | if not all([zz.split(",")[1] in '0' for zz in H]): 119 | print >>outfile, " ".join(H) 120 | else: 121 | excludes += 1 122 | 123 | outfile.close() 124 | 125 | 126 | if __name__ == "__main__": 127 | make(WORK, outname, taxadict) 128 | -------------------------------------------------------------------------------- /pyrad/tier2clust.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import os 4 | import sys 5 | import itertools 6 | import numpy 7 | import random 8 | import glob 9 | import subprocess 10 | import pickle 11 | import gzip 12 | from cluster_cons7_shuf import comp 13 | 14 | 15 | def cluster(UCLUST, ID, datatype, WORK, MASK): 16 | C = " -cluster_smallmem "+WORK+"prefix/cat.consens_" 17 | 18 | if datatype in ['gbs','pairgbs','mergegbs']: 19 | P = " -strand both" 20 | COV = ".90" 21 | else: 22 | P = " -leftjust " 23 | COV = ".90" 24 | if 'vsearch' not in UCLUST: 25 | Q = "" 26 | T = " -threads 1" 27 | else: 28 | Q = " -qmask "+MASK 29 | ## TODO: figure out optimized threads setting... 30 | T = " -threads 6" 31 | U = " -userout "+WORK+"prefix/cat.u" 32 | cmd = UCLUST+\ 33 | C+\ 34 | P+\ 35 | " -id "+ID+\ 36 | Q+\ 37 | T+\ 38 | U+\ 39 | " -userfields query+target+id+gaps+qstrand+qcov"+\ 40 | " -maxaccepts 1"+\ 41 | " -maxrejects 0"+\ 42 | " -fulldp"+\ 43 | " -query_cov "+str(COV)+\ 44 | " -notmatched "+WORK+"prefix/cat._tempU" 45 | os.system(cmd) 46 | #subprocess.call(cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE) 47 | 48 | 49 | def flip(a): 50 | if a == "+": 51 | return "-" 52 | elif a == "-": 53 | return "+" 54 | 55 | 56 | def makeclust(ID, datatype, WORK): 57 | 58 | " load tier 2 hits (names,direction) into a Dic with seeds as keys" 59 | Uin = open(WORK+"prefix/cat.u") 60 | Fseeds = {} 61 | for line in [line.split("\t") for line in Uin.readlines()]: 62 | if line[1] not in Fseeds: 63 | Fseeds[line[1]] = [(line[0],line[4])] 64 | else: 65 | Fseeds[line[1]].append((line[0],line[4])) 66 | Uin.close() 67 | 68 | 69 | " load tier 1 hits (names,direction) into a Dictionary with seeds as keys" 70 | FS = glob.glob(WORK+"prefix/cat.u_*") 71 | Useeds = {} 72 | for f in FS: 73 | infile = open(f) 74 | for line in [line.split("\t") for line in infile.readlines()]: 75 | if line[1] not in Useeds: 76 | Useeds[line[1]] = [(line[0],line[4])] 77 | else: 78 | Useeds[line[1]].append((line[0],line[4])) 79 | infile.close() 80 | 81 | 82 | " Make one dictionary with combining Fseeds and Useeds matching to Fseeds" 83 | D = {} 84 | for seed in Fseeds: 85 | # add matches to seed to D[seed] 86 | Fhits = Useeds.get(seed) 87 | # add matches to hits to seed to D[seed] 88 | Mhits = [] 89 | for hit in Fseeds[seed]: 90 | Mhits.append(hit) 91 | ugh = Useeds.get(hit[0]) 92 | if ugh: 93 | if hit[1] == "-": 94 | if len(ugh) == 1: 95 | Mhits += [(ugh[0][0],flip(ugh[0][1]))] 96 | elif len(ugh) > 1: 97 | for child in ugh: 98 | Mhits += [(child[0], flip(child[1]))] 99 | else: 100 | Mhits += ugh 101 | if Fhits: 102 | D[(seed,'s')] = Fhits+Mhits 103 | else: 104 | D[(seed,'s')] = Mhits 105 | 106 | 107 | " load seeds of tier 2 into D and set its Useed hits" 108 | f = open(WORK+"prefix/cat._tempU") 109 | lines = f.readlines() 110 | for line in lines: 111 | if ">" in line: 112 | if (line.strip()[1:],'s') not in D: 113 | 
if Useeds.get(line.strip()[1:]): 114 | D[(line.strip()[1:],'s')] = Useeds.get(line.strip()[1:]) 115 | f.close() 116 | 117 | " load .consens files into Dics " 118 | FS = glob.glob(WORK+"clust"+ID+"/cat.consens_*.gz") 119 | Seqs = {} 120 | for f in FS: 121 | with gzip.open(f) as ff: 122 | k = itertools.izip(*[iter(ff)]*2) 123 | while 1: 124 | try: a = k.next() 125 | except StopIteration: break 126 | Seqs[a[0].strip()] = a[1].strip() 127 | 128 | 129 | " write clust file " 130 | outfile = gzip.open(WORK+"prefix/cat.clust_.gz", 'w') 131 | for i in D: 132 | thisclust = [] 133 | outfile.write(">"+i[0]+'\n'+Seqs[">"+i[0]].upper()+'\n') 134 | thisclust.append(">"+i[0]+'\n'+Seqs[">"+i[0]].upper()) 135 | for m in D[i]: 136 | if ">"+m[0]+'\n'+Seqs[">"+m[0]].upper() not in thisclust: 137 | if m[1] == "-": 138 | outfile.write(">"+m[0]+'\n'+comp(Seqs[">"+m[0]].upper())[::-1]+'\n') 139 | thisclust.append(">"+m[0]+'\n'+comp(Seqs[">"+m[0]].upper())[::-1]) 140 | else: 141 | outfile.write(">"+m[0]+'\n'+Seqs[">"+m[0]].upper()+'\n') 142 | thisclust.append(">"+m[0]+'\n'+Seqs[">"+m[0]].upper()) 143 | outfile.write("//\n") 144 | outfile.close() 145 | 146 | 147 | 148 | def main(UCLUST, ID, datatype, 149 | gids, seed, WORK, MASK): 150 | 151 | sys.stderr.write('\n\tstep 6: clustering across cons-samples at '+`ID`+' similarity \n') 152 | 153 | " read in all seeds and hits " 154 | seeds = [WORK+"prefix/cat.seed_"+gid for gid in gids] 155 | temps = [WORK+"prefix/cat._temp_"+gid for gid in gids] 156 | 157 | #print seeds 158 | #print temps 159 | 160 | " read in all seeds and make same length for randomizing " 161 | out = gzip.open(WORK+'prefix/cat.group_.gz','wb') 162 | for handle in seeds: 163 | f = open(handle,'r') 164 | k = itertools.izip(*[iter(f)]*3) 165 | while 1: 166 | try: a = k.next() 167 | except StopIteration: break 168 | if len(a[0].strip()) < 100: 169 | " seriously, don't have names longer than 100 chars " 170 | out.write(a[0].strip()+" "*(100-len(a[0].strip()))+a[1]) 171 | else: 172 | out.write(a[0].strip()+" "*((len(a[0].strip())+3)-len(a[0].strip()))+a[1]) 173 | print "long name lengths may cause errors" 174 | f.close() 175 | out.close() 176 | 177 | """ randomize input order """ 178 | if seed: 179 | random.seed(seed) 180 | with gzip.open(WORK+'prefix/cat.group_.gz','rb') as source: 181 | data = [ (random.random(), line) for line in source ] 182 | data.sort() 183 | 184 | """ sort by length while preserving randomization within size classes """ 185 | D = [line for _,line in data] 186 | D.sort(key=len, reverse=True) 187 | k = iter(D) 188 | out = open(WORK+'prefix/cat.consens_','w') 189 | while 1: 190 | try: a = k.next().split(" ") 191 | except StopIteration: break 192 | ss = a[-1].replace("a","A").replace("g","G").replace("c","C").replace("t","T").strip() 193 | print >>out, a[0]+'\n'+ss 194 | out.close() 195 | 196 | cluster(UCLUST, ID, datatype, WORK, MASK) 197 | makeclust(ID, datatype, WORK) 198 | 199 | 200 | #if glob.glob(WORK+"prefix/*.seed_*") or glob.glob(WORK+"prefix/*._temp_*"): 201 | # os.system("rm "+WORK+"prefix/*.seed_*") 202 | # os.system("rm "+WORK+"prefix/*._temp_*") 203 | # os.system("rm "+WORK+"prefix/*.u_*") 204 | 205 | -------------------------------------------------------------------------------- /pyrad/loci2phynex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy as np 4 | import sys 5 | import os 6 | import glob 7 | 8 | 9 | def update(idict, count, WORK, outname): 10 | """ updates dictionary with the next 
.5M reads 11 | from the super long string phylip file. Makes 12 | for faster reading. """ 13 | 14 | data = iter(open(WORK+"outfiles/"+outname+".phy")) 15 | ntax, nchar = data.next().strip().split() 16 | 17 | ## read in max N bp at a time 18 | for line in data: 19 | tax, seq = line.strip().split() 20 | idict[tax] = idict[tax][100000:] 21 | idict[tax] += seq[count:count+100000] 22 | del line 23 | 24 | return idict 25 | 26 | 27 | 28 | def makephy(WORK, outname, names, longname): 29 | """ builds phy output. If large files writes 50000 loci 30 | at a time to tmp files and rebuilds at the end""" 31 | 32 | " order names " 33 | names = list(names) 34 | names.sort() 35 | 36 | " read in loci file " 37 | locus = iter(open(WORK+"outfiles/"+outname+".loci", 'rb')) 38 | 39 | " dict for saving the full matrix " 40 | fdict = {name:[] for name in names} 41 | 42 | " list for saving locus number and locus range for partitions " 43 | partitions = [] 44 | loc_number = 1 45 | initial_pos = 1 46 | 47 | " remove empty column sites and append edited seqs to dict F " 48 | done = 0 49 | nloci = 0 50 | nbases = 0 51 | while not done: #nloci < 50000: #not done: 52 | seqs = [] 53 | #arrayed = np.array([]) 54 | anames = [] 55 | while 1: 56 | ## get next locus 57 | try: 58 | samp = locus.next() 59 | except StopIteration: 60 | done = 1 61 | break 62 | if "//" in samp: 63 | nloci += 1 64 | break 65 | else: 66 | try: 67 | name, seq = samp.split() 68 | except ValueError: 69 | print samp 70 | anames.append(name[1:]) 71 | seqs.append(seq.strip()) 72 | ## reset 73 | arrayed = np.array([list(i) for i in seqs]) 74 | if done: 75 | break 76 | ## create mask for columns that are empty or 77 | ## that are paired-end separators (compatible w/ pyrad v2 and v3) 78 | #mask = [i for i in range(len(arrayed.T)) if np.any([ 79 | ## still surely a better way to vectorize this... 80 | mask = [i for i in arrayed.T if any([j not in list("-Nn") for j in i])] 81 | masked = np.dstack(mask)[0] 82 | 83 | ## partition information 84 | loc_name = "p"+str(nloci) 85 | loc_range = str(initial_pos) + "-" +\ 86 | str(len(masked[0]) + initial_pos -1) 87 | initial_pos += len(masked[0]) 88 | partitions.append(loc_name+"="+loc_range) 89 | 90 | ## uncomment to print block info (used to partition by locus) 91 | #blockend += minray 92 | #print blockend, 93 | #print loc 94 | #print arrayed 95 | 96 | ## append data to dict 97 | for name in names: 98 | if name in anames: 99 | #fdict[name].append(arrayed[anames.index(name), mask].tostring()) 100 | fdict[name].append(masked[anames.index(name),:].tostring()) 101 | else: 102 | fdict[name].append("N"*masked.shape[1]) 103 | #fdict[name].append("N"*len(arrayed[0, mask])) 104 | ## add len to total length 105 | nbases += len(fdict[name][-1]) 106 | 107 | ## after x iterations tmp pickle fdict? 
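        ## Clarifying note: every 10,000 loci (int(1e4)) the per-sample strings
        ## accumulated in fdict are flushed to WORK/tmp/<name>_<nloci>.tmp and
        ## fdict is reset, keeping memory use bounded; for large data sets the
        ## .phy writer below re-reads and concatenates those tmp files.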
108 | if not nloci % int(1e4): 109 | ## concat strings 110 | for name in fdict: 111 | with open(os.path.join(WORK, "tmp", 112 | "{}_{}.tmp".format(name, nloci)), 'wb') as wout: 113 | wout.write("".join(fdict[name])) 114 | del fdict 115 | fdict = {name:[] for name in names} 116 | 117 | ## print out .PHY file, if really big, pull form multiple tmp pickle 118 | superout = open(WORK+"outfiles/"+outname+".phy", 'wb') 119 | print >>superout, len(names), nbases 120 | if nloci < 1e4: 121 | for name in names: 122 | print >>superout, name+(" "*((longname+3)-\ 123 | len(name)))+"".join(fdict[name]) 124 | else: 125 | for name in names: 126 | superout.write("{}{}{}".format( 127 | name, 128 | " "*((longname+3)-len(name)), 129 | "".join(fdict[name]))) 130 | tmpfiles = glob.glob(os.path.join(WORK, "tmp", name+"*.tmp")) 131 | tmpfiles.sort() 132 | for tmpf in tmpfiles: 133 | with open(tmpf, 'rb') as tmpin: 134 | superout.write(tmpin.read()) 135 | superout.write("\n") 136 | superout.close() 137 | raxml_part_out = open(WORK+"outfiles/"+outname+".phy.partitions", 'w') 138 | for partition in partitions: 139 | print >>raxml_part_out, "DNA, %s" % (partition) 140 | raxml_part_out.close() 141 | 142 | return partitions 143 | 144 | 145 | def makenex(WORK, outname, names, longname, partitions): 146 | """ PRINT NEXUS """ 147 | 148 | " make nexus output " 149 | data = iter(open(WORK+"outfiles/"+outname+".phy")) 150 | nexout = open(WORK+"outfiles/"+outname+".nex", 'wb') 151 | 152 | ntax, nchar = data.next().strip().split(" ") 153 | 154 | print >>nexout, "#NEXUS" 155 | print >>nexout, "BEGIN DATA;" 156 | print >>nexout, " DIMENSIONS NTAX=%s NCHAR=%s;" % (ntax,nchar) 157 | print >>nexout, " FORMAT DATATYPE=DNA MISSING=N GAP=- INTERLEAVE=YES;" 158 | print >>nexout, " MATRIX" 159 | 160 | idict = {} 161 | 162 | ## read in max 1M bp at a time 163 | for line in data: 164 | tax, seq = line.strip().split() 165 | idict[tax] = seq[0:100000] 166 | del line 167 | 168 | nameorder = idict.keys() 169 | nameorder.sort() 170 | 171 | n=0 172 | tempn=0 173 | sz = 100 174 | while n < len(seq): 175 | for tax in nameorder: 176 | print >>nexout, " "+tax+" "*\ 177 | ((longname-len(tax))+3)+\ 178 | idict[tax][tempn:tempn+sz] 179 | n += sz 180 | tempn += sz 181 | print >>nexout, "" 182 | 183 | if not n % 100000: 184 | #print idict[tax][tempn:tempn+sz] 185 | idict = update(idict, n, WORK, outname) 186 | tempn -= 100000 187 | 188 | print >>nexout, ';' 189 | print >>nexout, 'END;' 190 | 191 | ### partitions info 192 | print >>nexout, "BEGIN SETS;" 193 | for partition in partitions: 194 | print >>nexout, " CHARSET %s;" % (partition) 195 | print >>nexout, "END;" 196 | 197 | nexout.close() 198 | 199 | 200 | def make(WORK, outfile, names, longname, formats): 201 | partitions = makephy(WORK, outfile, names, longname) 202 | makenex(WORK, outfile, names, longname, partitions) 203 | 204 | 205 | if __name__ == "__main__": 206 | make() 207 | -------------------------------------------------------------------------------- /pyrad/loci2SNP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy as np 4 | import sys 5 | import gzip 6 | try: 7 | from collections import Counter 8 | except ImportError: 9 | from counter import Counter 10 | from itertools import chain 11 | import alignable 12 | 13 | 14 | def make(WORK, outname, names, formats, seed, ploidy): 15 | np.random.seed(int(seed)) 16 | finalfile = open(WORK+"outfiles/"+outname+".loci").read() 17 | longname = max(map(len,names)) 18 | 
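    ## Overview of what follows: S collects every SNP per sample (with " "
    ## spacers between loci), while Si keeps one randomly chosen SNP per locus,
    ## preferring sites that are bi-allelic after resolving ambiguity codes.
    ## These feed the .snps output (when 's' is among the requested formats),
    ## the .unlinked_snps output, and the optional .str / .geno formats written
    ## at the end of this function.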
19 | " output .snps and .unlinked_snps" 20 | S = {} ## snp dict 21 | Si = {} ## unlinked snp dict 22 | for name in list(names): 23 | S[name] = [] 24 | Si[name] = [] 25 | 26 | " record bi-allelic snps" 27 | nobis = 0 28 | 29 | " for each locus select out the SNPs" 30 | for loc in finalfile.strip().split("|")[:-1]: 31 | pis = "" 32 | ns = [] 33 | ss = [] 34 | cov = {} ## record coverage for each SNP 35 | for line in loc.split("\n"): 36 | if ">" in line: 37 | ns.append(line.split()[0].replace(">","")) 38 | ss.append(line.split()[-1]) 39 | else: 40 | pis = [i[0] for i in enumerate(line) if i[1] in list('*-')] 41 | 42 | " assign snps to S, and record coverage for usnps" 43 | for tax in S: 44 | if tax in ns: 45 | if pis: 46 | for snpsite in pis: 47 | snpsite -= (longname+5) 48 | S[tax].append(ss[ns.index(tax)][snpsite]) 49 | if snpsite not in cov: 50 | cov[snpsite] = 1 51 | else: 52 | cov[snpsite] += 1 53 | "downweight selection of gap sites " 54 | if ss[ns.index(tax)][snpsite] != '-': 55 | cov[snpsite] += 1 56 | else: 57 | if pis: 58 | for snpsite in pis: 59 | S[tax].append("N") 60 | Si[tax].append("N") 61 | 62 | " randomly select among snps w/ greatest coverage for unlinked snp " 63 | maxlist = [] 64 | for j,k in cov.items(): 65 | if k == max(cov.values()): 66 | maxlist.append(j) 67 | 68 | " Is bi-allelic after resolution of ambigs? " 69 | bisnps = [] 70 | for maxl in maxlist: 71 | bases = [ss[ns.index(tax)][maxl] for tax in S if tax in ns] 72 | ambigs = list(chain(*[alignable.unstruct(i) for i in bases if i in "RSWYMK"])) 73 | bases = set(bases+ambigs) 74 | for ambig in "RSWYMKN-": 75 | bases.discard(ambig) 76 | if len(bases) <= 2: 77 | bisnps.append(maxl) 78 | 79 | #rando = pis[np.random.randint(len(pis))] 80 | #rando -= (longname+5) 81 | if bisnps: 82 | rando = bisnps[np.random.randint(len(bisnps))] 83 | elif maxlist: 84 | rando = maxlist[np.random.randint(len(maxlist))] 85 | 86 | ## record how many loci have no 87 | tbi = 0 88 | for tax in S: 89 | if tax in ns: 90 | if pis: 91 | " if none are bi-allelic " 92 | if not bisnps: 93 | tbi = 1 94 | Si[tax].append(ss[ns.index(tax)][rando]) 95 | if pis: 96 | " add spacer between loci " 97 | S[tax].append(" ") 98 | else: 99 | " invariable locus " 100 | S[tax].append("_ ") 101 | nobis += tbi 102 | " names " 103 | SF = list(S.keys()) 104 | SF.sort() 105 | 106 | " print out .SNP file " 107 | if 's' in formats: 108 | snpsout = open(WORK+'outfiles/'+outname+".snps",'w') 109 | print >>snpsout, "## %s taxa, %s loci, %s snps" % (len(S), 110 | len("".join(S.values()[0]).split(" "))-1, 111 | len("".join(S[SF[0]]).replace(" ",""))) 112 | for i in SF: 113 | print >>snpsout, i+(" "*(longname-len(i)+3))+"".join(S[i]) 114 | snpsout.close() 115 | 116 | 117 | " print out .USNP file " 118 | snpout = open(WORK+'outfiles/'+outname+".unlinked_snps",'w') 119 | print >>snpout, len(Si),len("".join(Si.values()[0])) 120 | for i in SF: 121 | print >>snpout, i+(" "*(longname-len(i)+3))+"".join(Si[i]) 122 | snpout.close() 123 | 124 | statsout = open(WORK+"stats/"+outname+".stats",'a') 125 | print >>statsout, "sampled unlinked SNPs=",len(Si.values()[0]) 126 | print >>statsout, "sampled unlinked bi-allelic SNPs=", len(Si.values()[0])-nobis 127 | statsout.close() 128 | 129 | if 'k' in formats: 130 | "print out .str (structure) file " 131 | structout = open(WORK+'outfiles/'+outname+".str", 'w') 132 | 133 | B = {'A': '0', 134 | 'T': '1', 135 | 'G': '2', 136 | 'C': '3', 137 | 'N': '-9', 138 | '-': '-9'} 139 | if ploidy > 1: 140 | for line in SF: 141 | print >>structout, line+(" 
"*(longname-len(line)+3))+\ 142 | "\t"*6+"\t".join([B[alignable.unstruct(j)[0]] for j in Si[line]]) 143 | print >>structout, line+(" "*(longname-len(line)+3))+\ 144 | "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]]) 145 | else: 146 | for line in SF: 147 | print >>structout, line+(" "*(longname-len(line)+3))+\ 148 | "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]]) 149 | structout.close() 150 | 151 | 152 | if 'g' in formats: 153 | "print out .geno file " 154 | genoout = open(WORK+'outfiles/'+outname+".usnps.geno", 'w') 155 | for i in range(len(Si.values()[0])): 156 | getref = 0 157 | ref = "N" 158 | while ref == "N": 159 | ref = alignable.unstruct(Si[SF[getref]][i])[0] 160 | getref += 1 161 | SNProw = "".join(map(str,[alignable.unstruct(Si[j][i]).count(ref) if Si[j][i] != "N" \ 162 | else "9" for j in SF])) 163 | ## print ref,SNProw 164 | if len(set(SNProw)) > 1: 165 | print >>genoout, SNProw 166 | genoout.close() 167 | 168 | if 'g' in formats: 169 | "print out .geno file " 170 | genoout = open(WORK+'outfiles/'+outname+".snps.geno", 'w') 171 | for i in range(len(S.values()[0])): 172 | if S[SF[0]][i].strip("_").strip(): 173 | getref = 0 174 | ref = "N" 175 | while ref == "N": 176 | #print i, S[SF[0]][i] 177 | ref = alignable.unstruct(S[SF[getref]][i])[0] 178 | getref += 1 179 | SNProw = "".join(map(str,[alignable.unstruct(S[j][i]).count(ref) if \ 180 | S[j][i] != "N" else "9" for j in SF])) 181 | ## print ref,SNProw 182 | if len(set(SNProw)) > 1: 183 | print >>genoout, SNProw 184 | genoout.close() 185 | 186 | 187 | if __name__ == "__main__": 188 | make(WORK, outname, names, formats, seed, ploidy) 189 | -------------------------------------------------------------------------------- /pyrad/overlapcheck.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import glob 4 | import multiprocessing 5 | import gzip 6 | import subprocess 7 | import sys 8 | import os 9 | import itertools 10 | from potpour import Worker 11 | 12 | 13 | def mergepairs(WORK, UCLUST, handle, match, Q): 14 | 15 | handle1 = handle 16 | handle2 = handle.replace("_R1.","_R2.") 17 | outfile = handle.replace("_R1.","M.") 18 | 19 | while outfile.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 20 | outfile = outfile.replace('.'+outfile.split(".")[-1], "") 21 | outfile = outfile.split("/")[-1] 22 | outfile = WORK+"mergedreads/"+outfile+".fq" 23 | 24 | if [handle1 and handle2]: 25 | if ".gz" in handle1[-4:]: 26 | k1 = itertools.izip(*[iter(gzip.open(handle1))]*4) 27 | k2 = itertools.izip(*[iter(gzip.open(handle2))]*4) 28 | thandle1 = WORK+"mergedreads/"+handle1.split("/")[-1].replace(".gz",".temp2") 29 | thandle2 = WORK+"mergedreads/"+handle2.split("/")[-1].replace(".gz",".temp2") 30 | numout1 = open(thandle1, 'w') 31 | numout2 = open(thandle2, 'w') 32 | else: 33 | k1 = itertools.izip(*[iter(open(handle1))]*4) 34 | k2 = itertools.izip(*[iter(open(handle2))]*4) 35 | thandle1 = WORK+"mergedreads/"+handle1.split("/")[-1]+".temp2" 36 | thandle2 = WORK+"mergedreads/"+handle2.split("/")[-1]+".temp2" 37 | numout1 = open(thandle1, 'w') 38 | numout2 = open(thandle2, 'w') 39 | else: 40 | print "pair missing" 41 | sys.exit() 42 | 43 | N1 = [] 44 | N2 = [] 45 | cnt = 0 46 | 47 | while 1: 48 | try: d = k1.next() 49 | except StopIteration: break 50 | e = k2.next() 51 | N1.append("".join([d[0].strip()+"_"+str(cnt)+"\n",d[1],d[2],d[3]])) 52 | N2.append("".join([e[0].strip()+"_"+str(cnt)+"\n",e[1],e[2],e[3]])) 53 | cnt+=1 54 | if not cnt % 
50000: 55 | numout1.write("".join(N1)) 56 | numout2.write("".join(N2)) 57 | N1 = [] 58 | N2 = [] 59 | numout1.write("".join(N1)) 60 | numout2.write("".join(N2)) 61 | numout1.close() 62 | numout2.close() 63 | 64 | cmd = UCLUST+\ 65 | " -fastq_mergepairs "+thandle1 +\ 66 | " -reverse "+thandle2 +\ 67 | " -fastq_maxdiffs 6 " +\ 68 | " -fastq_truncqual 2 " +\ 69 | " -fastq_minlen 36 " +\ 70 | " -fastq_minmergelen 50 "+\ 71 | " -fastqout "+outfile +\ 72 | " -fastq_allowmergestagger" +\ 73 | " -quiet " 74 | subprocess.call(cmd, shell=True) 75 | 76 | 77 | stats = statsout(thandle1, thandle2, outfile, WORK) 78 | sys.stderr.write(".") 79 | return stats 80 | 81 | 82 | def statsout(h1,h2,m,WORK): 83 | " remove merged reads from 1st & 2nd read files " 84 | 85 | " stat counters " 86 | cnt = 0 87 | mcnt = 0 88 | 89 | " create list of merged IDs " 90 | MIDS = [] 91 | if os.path.exists(m): 92 | merged = open(m, 'r') 93 | for line in itertools.izip(*[iter(merged)]*4): 94 | MIDS.append(int(line[0].strip().split("_")[-1])) 95 | merged.close() 96 | ## if not... 97 | 98 | if ".gz" in h1[-5:]: 99 | hand1 = gzip.open(h1, 'rb') 100 | hand2 = gzip.open(h2, 'rb') 101 | else: 102 | hand1 = open(h1, 'r') 103 | hand2 = open(h2, 'r') 104 | 105 | r1 = itertools.izip(*[iter(hand1)]*4) 106 | r2 = itertools.izip(*[iter(hand2)]*4) 107 | 108 | " lists to write output " 109 | ONE = [] 110 | TWO = [] 111 | 112 | " outfile names for mergeless reads " 113 | outname = WORK+"fastq/"+h1.split("/")[-1].replace(".temp2",".nomerge")+".gz" 114 | 115 | if os.path.exists(outname): 116 | os.remove(outname) 117 | outname2 = outname.replace("_R1.","_R2.") 118 | if os.path.exists(outname2): 119 | os.remove(outname2) 120 | 121 | while 1: 122 | try: one = r1.next() 123 | except StopIteration: break 124 | two = r2.next() 125 | cnt += 1 126 | find = int(one[0].strip().split("_")[-1]) 127 | if MIDS: 128 | if find == MIDS[0]: 129 | "reads were merged, don't write to file" 130 | mcnt += 1 131 | MIDS.pop(0) 132 | else: 133 | ONE.append(one) #[i.strip() for i in one]) 134 | TWO.append(two) #[i.strip() for i in two]) 135 | else: 136 | ONE.append(one) #[i.strip() for i in one]) 137 | TWO.append(two) #[i.strip() for i in two]) 138 | 139 | if not cnt % 10000: 140 | outfile = gzip.open(outname, 'ab') 141 | outfile.write("".join(["".join(i) for i in ONE])) 142 | outfile.close() 143 | outfile2 = gzip.open(outname2, 'ab') 144 | outfile2.write("".join(["".join(i) for i in TWO])) 145 | outfile2.close() 146 | ONE = [] 147 | TWO = [] 148 | 149 | 150 | if os.path.exists(h1): 151 | cmd1 = "/bin/rm "+h1 152 | cmd2 = "/bin/rm "+h2 153 | subprocess.call(cmd1, shell=True) 154 | subprocess.call(cmd2, shell=True) 155 | 156 | outfile = gzip.open(outname, 'ab') 157 | outfile.write("".join(["".join(i) for i in ONE])) 158 | outfile.close() 159 | outfile2 = gzip.open(outname2, 'ab') 160 | outfile2.write("".join(["".join(i) for i in TWO])) 161 | outfile2.close() 162 | sys.stderr.write(".") 163 | return [outname,mcnt] 164 | 165 | 166 | 167 | def main(WORK, UCLUST, FQs, match, Q, Parallel): 168 | 169 | " create output directories " 170 | if not os.path.exists(WORK+'fastq/'): 171 | os.makedirs(WORK+'fastq') 172 | if not os.path.exists(WORK+'mergedreads'): 173 | os.makedirs(WORK+'mergedreads') 174 | if not os.path.exists(WORK+'stats'): 175 | os.makedirs(WORK+'stats') 176 | 177 | 178 | submitted = 0 179 | work_queue = multiprocessing.Queue() 180 | 181 | names = [i for i in glob.glob(FQs) if "_R1.fq" in i] 182 | 183 | " submit jobs to queue " 184 | if len(names) > 1: 185 | for 
handle in names: 186 | if "nomerge." not in handle: 187 | n = str(handle.split('/')[-1]).replace("_R1.",".") 188 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 189 | n = n.replace('.'+n.split(".")[-1], "") 190 | finder = WORK+'edits/'+n+".edit" 191 | if finder not in glob.glob(WORK+"edits/*"): 192 | if os.stat(handle).st_size > 0: ## exclude empty files 193 | if os.path.exists(handle.replace("_R1.","_R2.")): 194 | if not os.path.exists(handle.replace(".fq",".nomerge.fq")): 195 | args = [WORK, UCLUST, handle, match, Q] 196 | work_queue.put(args) 197 | submitted += 1 198 | else: 199 | print "merge file already created for", handle.split("/")[-1] 200 | else: 201 | print "cannot find 2nd read file for", handle.split("/")[-1] 202 | else: 203 | print "\t"+finder+" already in edits/" 204 | else: 205 | if not names: 206 | if [i for i in glob.glob(FQs) if "_R1_." in i]: 207 | print "\n\tfile names should have _R1. not _R1_." 208 | print "\n\tcannot find input files" 209 | sys.exit() 210 | else: 211 | work_queue.put([WORK, UCLUST, names[0], match, Q]) 212 | submitted += 1 213 | 214 | " create a queue to pass to workers to store the results " 215 | result_queue = multiprocessing.Queue() 216 | 217 | 218 | " spawn workers, give function " 219 | jobs = [] 220 | for i in range( min(Parallel,submitted) ): 221 | worker = Worker(work_queue, result_queue, mergepairs) 222 | worker.start() 223 | jobs.append(worker) 224 | for job in jobs: 225 | job.join() 226 | 227 | if submitted > 0: 228 | statout = open(WORK+"stats/s2.mergedreads.txt",'w') 229 | print >>statout, "\t".join(["taxon","mergedreads"]) 230 | 231 | for i in range(submitted): 232 | stat = result_queue.get() 233 | a,b = stat 234 | n = a.strip().split("/")[-1].replace(".nomerge.gz","") 235 | print >>statout, "\t".join([n,str(b)]) 236 | print >>statout, "\nmerged reads written to", WORK+"mergedreads/ " 237 | statout.close() 238 | -------------------------------------------------------------------------------- /pyrad/editraw_rads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import multiprocessing 4 | import itertools 5 | import sys 6 | import os 7 | import glob 8 | import operator 9 | import gzip 10 | from potpour import Worker 11 | from sortandcheck2 import unambig 12 | from cluster_cons7_shuf import comp 13 | 14 | 15 | 16 | def unambar(CUT): 17 | if any([i in CUT for i in list("RKYSWM")]): 18 | CUTa, CUTb = unambig(CUT) 19 | return [CUTa,CUTb] 20 | else: 21 | return False 22 | 23 | 24 | def Afilter(CUT,s,strict): 25 | a = b = wheretocut = None 26 | " lookfor cut site " 27 | if unambar(CUT): 28 | " if ambiguity in cutter " 29 | CUTa,CUTb = unambar(CUT) 30 | if strict == 2: 31 | lookfor1 = CUTa+"A" 32 | lookfor2 = CUTb+"A" 33 | else: 34 | lookfor1 = CUTa+"AGA" 35 | lookfor2 = CUTb+"AGA" 36 | if lookfor1 in s: 37 | a = s.rindex(lookfor1) 38 | if lookfor2 in s: 39 | b = s.rindex(lookfor2) 40 | if (a or b): 41 | wheretocut = min([i for i in [a,b] if i]) 42 | else: 43 | wheretocut = None 44 | else: 45 | if strict == 2: 46 | lookfor = CUT+"A" 47 | else: 48 | lookfor = CUT+"AGA" 49 | if lookfor in s: 50 | wheretocut = s.rindex(lookfor) 51 | else: 52 | wheretocut = None 53 | 54 | if not wheretocut: 55 | " look for adapter sequence " 56 | if strict == 2: 57 | lookfor1 = "AGATCG" 58 | else: 59 | lookfor1 = "AGATCGGA" 60 | if lookfor1 in s: 61 | wheretocut = s.rindex(lookfor1)-(len(CUT)+1) 62 | else: 63 | wheretocut = None 64 | 65 | " look for CUT at end of seq " 66 | if 
not wheretocut: 67 | if CUT in s[-len(CUT)-5:]: 68 | wheretocut = s.rindex(CUT) 69 | return wheretocut 70 | 71 | 72 | 73 | 74 | 75 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q, datatype): 76 | """ three functions: 77 | (1) replaces low quality base calls with Ns, 78 | (2) checks for adapter sequence if strict set to 1 or 2 """ 79 | 80 | if "," in CUT: 81 | CUT1,CUT2 = CUT.split(',') 82 | else: 83 | CUT1=CUT2=CUT 84 | 85 | if ".gz" in infile: 86 | f = gzip.open(infile, 'r') 87 | else: 88 | f = open(infile,'r') 89 | n = str(infile.split('/')[-1]).replace("_R1.",".") 90 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 91 | n = n.replace('.'+n.split(".")[-1], "") 92 | k = itertools.izip(*[iter(f)]*4) 93 | writing_r = [] 94 | writing_c = [] 95 | 96 | orig = keep = keepcut = 0 97 | handle = WORK+'edits/'+str(n)+".edit" 98 | 99 | while 1: 100 | try: d = k.next() 101 | except StopIteration: break 102 | orig += 1 103 | SS = d[1].strip() 104 | 105 | ph = map(ord,d[3].strip('\n')) 106 | offset = int(Q) 107 | phred = map(lambda x:x-offset,ph) 108 | seq = ["N"]*len(phred) 109 | for base in range(len(phred)): 110 | if base >= len(CUT1): ## don't quality check cut site 111 | if phred[base] >= 20: ## quality threshold 112 | try: seq[base] = SS[base] 113 | except IndexError: 114 | None 115 | else: 116 | seq[base] = "N" 117 | else: 118 | if unambar(CUT1): 119 | seq[base] = unambar(CUT1)[0][base] 120 | else: 121 | seq[base] = CUT1[base] 122 | #try: seq[base] = SS[base] 123 | #except IndexError: 124 | # None 125 | 126 | if not orig % 5000: 127 | if trimkeep: 128 | " write full length and fragment reads " 129 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 130 | outfile.write("".join([z for z in writing_r])) 131 | outfile.write("".join([z for z in writing_c])) 132 | else: 133 | " write only full length reads " 134 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 135 | outfile.write("".join([z for z in writing_r])) 136 | writing_r = [] 137 | writing_c = [] 138 | 139 | s = "".join(seq) 140 | wheretocut1 = None 141 | if strict: 142 | wheretocut1 = Afilter(comp(CUT2)[::-1],s,strict) 143 | s = s[:wheretocut1] 144 | 145 | if datatype == 'merged': 146 | " remove extra forward base so forwards match reverse length" 147 | s = s[:-1] 148 | 149 | if s.count("N") <= pN: ## max allowed Ns 150 | if len(s) >= max(32,trimkeep): ## if read is trimmed, must be minlen long 151 | if wheretocut1: ## if it was trimmed... 
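                    ## Note: adapter-trimmed reads are tagged "_c1" and are
                    ## written to the .edit file only when trimkeep is set;
                    ## untrimmed full-length reads are tagged "_r1" below.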
152 | writing_c.append(">"+n+"_"+str(keepcut)+"_c1"+"\n"+s+"\n") 153 | keepcut += 1 154 | else: 155 | writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n") 156 | keep += 1 157 | 158 | if trimkeep: 159 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 160 | outfile.write("".join([z for z in writing_r])) 161 | outfile.write("".join([z for z in writing_c])) 162 | else: 163 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 164 | outfile.write("".join([z for z in writing_r])) 165 | writing_r = [] 166 | writing_c = [] 167 | 168 | f.close() 169 | sys.stderr.write(".") 170 | if not trimkeep: 171 | keepcut = 0 172 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keep),str(keepcut)] 173 | 174 | 175 | 176 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype): 177 | print >>sys.stderr, "\tstep 2: editing raw reads \n\t", 178 | 179 | " create output directories " 180 | if not os.path.exists(WORK+'stats'): 181 | os.makedirs(WORK+'stats') 182 | if not os.path.exists(WORK+'edits'): 183 | os.makedirs(WORK+'edits') 184 | 185 | " load up work queue " 186 | submitted = 0 187 | work_queue = multiprocessing.Queue() 188 | if len(glob.glob(FQs)) > 1: 189 | FS = glob.glob(FQs) 190 | 191 | " order files by size " 192 | for i in range(len(FS)): 193 | statinfo = os.stat(FS[i]) 194 | FS[i] = FS[i],statinfo.st_size 195 | FS.sort(key=operator.itemgetter(1)) 196 | FS = [i[0] for i in FS][::-1] 197 | 198 | " submit jobs to queue " 199 | for handle in FS: 200 | finder = WORK+'edits/'+handle.split("/")[-1] 201 | while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 202 | finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","") 203 | if finder+".edit" not in glob.glob(WORK+"edits/*"): 204 | if os.stat(handle).st_size > 0: ## exclude empty files 205 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype] 206 | work_queue.put(args) 207 | submitted += 1 208 | else: 209 | print "skipping",handle,", file is empty" 210 | else: 211 | print "\t"+finder+" already in edits/" 212 | 213 | elif len(glob.glob(FQs)) == 1: 214 | " if only one file " 215 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype]) 216 | submitted += 1 217 | 218 | else: 219 | print "\tNo demultiplexed files found. Check path." 
220 | sys.exit() 221 | 222 | " create a queue to pass to workers to store the results " 223 | result_queue = multiprocessing.Queue() 224 | 225 | " spawn workers, give function " 226 | jobs = [] 227 | for i in range( min(Parallel,submitted) ): 228 | worker = Worker(work_queue, result_queue, rawedit) 229 | worker.start() 230 | jobs.append(worker) 231 | for job in jobs: 232 | job.join() 233 | 234 | 235 | " collect the results off the queue " 236 | outstats = open(WORK+"stats/s2.rawedit.txt",'a') 237 | print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"]) 238 | STATS = [] 239 | for i in range(submitted): 240 | STATS.append(result_queue.get()) 241 | 242 | STATS.sort(key = lambda x: x[0]) 243 | for i in range(submitted): 244 | a,b,c,d = STATS[i] 245 | print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))]) 246 | 247 | print >>outstats, """ 248 | Nreads = total number of reads for a sample 249 | passed = retained reads that passed quality filtering at full length 250 | passed.w.trim= retained reads that were trimmed due to detection of adapters 251 | passed.total = total kept reads of sufficient length 252 | note: you can set the option in params file to include trimmed reads of xx length. """ 253 | outstats.close() 254 | 255 | #" zip files to save size " 256 | #for ff in glob.glob(WORK+"edits/*"): 257 | # os.system("gzip "+ff) 258 | -------------------------------------------------------------------------------- /pyrad/editraw_merges.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import multiprocessing 4 | import itertools 5 | import sys 6 | import os 7 | import glob 8 | import operator 9 | import gzip 10 | from potpour import Worker 11 | from sortandcheck2 import unambig 12 | from cluster_cons7_shuf import comp 13 | 14 | 15 | def unambar(CUT): 16 | if any([i in CUT for i in list("RKYSWM")]): 17 | CUTa, CUTb = unambig(CUT) 18 | return [CUTa,CUTb] 19 | else: 20 | return CUT 21 | 22 | 23 | def most_common(L): 24 | return max(itertools.groupby(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0] 25 | 26 | 27 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q): 28 | """ three functions: 29 | (1) replaces low quality base calls with Ns, 30 | (2) checks for adapter sequence if strict set to 1 or 2 """ 31 | 32 | if "," in CUT: 33 | CUT1,CUT2 = CUT.split(',') 34 | else: 35 | CUT1=CUT2=CUT 36 | 37 | if ".gz" in infile: 38 | f = gzip.open(infile, 'r') 39 | else: 40 | f = open(infile,'r') 41 | 42 | " remove name suffix" 43 | n = str(infile.split('/')[-1]).replace("_R1.",".") 44 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 45 | n = n.replace('.'+n.split(".")[-1], "") 46 | 47 | " read infile 4 lines at a time, setup counters and lists" 48 | k = itertools.izip(*[iter(f)]*4) 49 | writing_r = [] 50 | writing_c = [] 51 | orig = keep = keepcut = 0 52 | handle = WORK+'edits/'+str(n)+".edit" 53 | 54 | 55 | " do a test run on first 1000 reads to find if extra bases on right end of reads" 56 | rightend = [] 57 | while len(rightend) < 1000: 58 | try: d = k.next() 59 | except StopIteration: break 60 | s = "".join(d[1].strip()) 61 | 62 | " cutters " 63 | find1 = CUT1 64 | find2 = comp(CUT2)[::-1] 65 | 66 | " are cutters found on both ends? 
A correct merge" 67 | a = s[:len(find1)] 68 | b = s[-len(find2)-2:] ## w/ wiggle room 69 | if (find1 in a) and (find2 in b) : 70 | xtra = s.rindex(find2)+len(find2) 71 | rightend.append(len(s)-xtra) 72 | 73 | " find most common element in rightend " 74 | if rightend: 75 | a = most_common(rightend) 76 | if a>3: 77 | Roffset = 0 78 | else: 79 | Roffset = a 80 | else: 81 | Roffset = 0 82 | 83 | " reset iterable " 84 | if ".gz" in infile: 85 | f = gzip.open(infile, 'r') 86 | else: 87 | f = open(infile,'r') 88 | k = itertools.izip(*[iter(f)]*4) 89 | 90 | " iterate over each read " 91 | while 1: 92 | try: d = k.next() 93 | except StopIteration: break 94 | orig += 1 95 | SS = d[1].strip() 96 | 97 | " apply Phred Q filter " 98 | ph = map(ord,d[3].strip('\n')) 99 | offset = int(Q) 100 | phred = map(lambda x:x-offset,ph) 101 | seq = ["N"]*len(phred) 102 | for base in range(len(phred)): 103 | "don't quality check cut sites " 104 | if (base >= len(CUT1)) and (base < len(phred)-len(CUT2)): 105 | if phred[base] >= 20: 106 | try: seq[base] = SS[base] 107 | except IndexError: 108 | None 109 | else: 110 | seq[base] = "N" 111 | else: 112 | try: seq[base] = SS[base] 113 | except IndexError: 114 | None 115 | 116 | " write to file " 117 | if not orig % 5000: 118 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 119 | outfile.write("".join([z for z in writing_r])) 120 | writing_r = [] 121 | 122 | s = "".join(seq) 123 | 124 | wheretocut = [None,None,None] 125 | " filter for N" 126 | if s.count("N") <= pN: 127 | 128 | " apply filter for Adapters " 129 | find1 = CUT1 130 | find2 = comp(CUT2)[::-1] 131 | 132 | if "trim" in d[0]: 133 | " filters for non-merged, trimmed reads from s2 " 134 | if (find1 in s[:len(find1)]) or (find2 in s[len(find2)-2:]): 135 | None 136 | else: 137 | " CUT1 rarely missing, CUT2 sometimes missing" 138 | s = s[:-len(CUT2)-Roffset] 139 | 140 | else: 141 | " merged reads. Are cutters found on both ends? A correct merge" 142 | a = s[:len(find1)] 143 | b = s[-len(find2)-2:] ## w/ wiggle room 144 | if (find1 in a) and (find2 in b) : 145 | " find end of read2 " 146 | xtra = s.rindex(find2)+len(find2) 147 | wheretocut = [None, len(s)-Roffset, 'complete'] 148 | else: 149 | " look for CUT2 from right side " 150 | if find2 in s[len(s)/2:]: ## check that this is a good general number... 
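                        ## Presumably the reverse complement of CUT2 is searched
                        ## only in the right half of the merged read so that a
                        ## chance internal match within read 1 is not mistaken
                        ## for the read-2 end.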
151 | a = s.rindex(find2)+len(find2) 152 | wheretocut = [None, a, 'find2 in s'] 153 | else: 154 | "couldn't find cut2, maybe has error, look for adapter" 155 | if 'AGATCG' in s: 156 | a = s.rindex('AGATCG')-len(CUT2) 157 | wheretocut = [None, a, 'AGATCG in s'] 158 | else: 159 | if "TCGGAAG" in s: 160 | a = s.rindex('TCGGAAG')-len(CUT2)-3 161 | wheretocut = [None, a, 'TCGGAAG in s'] 162 | else: 163 | "no sign of overshoot to right --->" 164 | " look for overshoot on left <---- " 165 | wheretocut = [None, len(s)-Roffset, "None"] 166 | 167 | " look for CUT1 from left side " 168 | if CUT1 in s: 169 | a = s.index(CUT1) 170 | wheretocut[0] = a 171 | else: 172 | "exclude read" 173 | wheretocut[0] = wheretocut[1] 174 | 175 | w1,w2,reason = wheretocut 176 | if len(s[w1:w2]) > trimkeep: 177 | #print s[w1:w2], reason, len(s[w1:w2]), trimkeep 178 | s = s[w1:w2] 179 | else: 180 | s = "" 181 | 182 | if len(s) >= max(36,trimkeep): ## if read is trimmed, must be minlen long 183 | writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n") 184 | keep += 1 185 | 186 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 187 | outfile.write("".join([z for z in writing_r])) 188 | writing_r = [] 189 | 190 | f.close() 191 | sys.stderr.write(".") 192 | if not trimkeep: 193 | keepcut = 0 194 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keep),str(keepcut)] 195 | 196 | 197 | 198 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep): 199 | print >>sys.stderr, "\tstep 2: editing raw reads \n\t", 200 | 201 | " create output directories " 202 | if not os.path.exists(WORK+'stats'): 203 | os.makedirs(WORK+'stats') 204 | if not os.path.exists(WORK+'edits'): 205 | os.makedirs(WORK+'edits') 206 | 207 | " used to find if files already exist " 208 | lookfor = ".edit" 209 | 210 | " load up work queue " 211 | submitted = 0 212 | work_queue = multiprocessing.Queue() 213 | if len(glob.glob(FQs)) > 1: 214 | FS = [f for f in glob.glob(FQs)] 215 | 216 | " order files by size " 217 | for i in range(len(FS)): 218 | statinfo = os.stat(FS[i]) 219 | FS[i] = FS[i],statinfo.st_size 220 | FS.sort(key=operator.itemgetter(1)) 221 | FS = [i[0] for i in FS][::-1] 222 | 223 | " submit jobs to queue " 224 | for handle in FS: 225 | finder = WORK+'edits/'+handle.split("/")[-1] 226 | while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: 227 | finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","") 228 | if finder+".edit" not in glob.glob(WORK+"edits/*"): 229 | if os.stat(handle).st_size > 0: ## exclude empty files 230 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q] 231 | work_queue.put(args) 232 | submitted += 1 233 | else: 234 | print "\t"+finder+" already in edits/" 235 | 236 | elif len(glob.glob(FQs)) == 1: 237 | " if only one file " 238 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q]) 239 | submitted += 1 240 | 241 | else: 242 | print "\tNo demultiplexed files found. Check path." 
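    # --- Illustrative sketch (not part of pyrad) ----------------------------
    # The block below drains work_queue with a small pool of Worker
    # processes: each one pulls an argument list, calls rawedit(*args), and
    # pushes the returned stats list onto result_queue.  A minimal standalone
    # version of the same producer/worker-pool pattern, with a hypothetical
    # per-sample function standing in for rawedit, could look like:
    #
    #   import multiprocessing, Queue
    #
    #   def edit_sample(work, handle):            # hypothetical stand-in
    #       return [handle, "0", "0", "0"]
    #
    #   work_queue = multiprocessing.Queue()
    #   result_queue = multiprocessing.Queue()
    #   for handle in ["A_R1.fastq", "B_R1.fastq"]:
    #       work_queue.put(["./", handle])
    #
    #   def worker():
    #       while True:
    #           try:
    #               job = work_queue.get_nowait()
    #           except Queue.Empty:
    #               break
    #           result_queue.put(edit_sample(*job))
    #
    #   procs = [multiprocessing.Process(target=worker) for _ in range(2)]
    #   for p in procs: p.start()
    #   for p in procs: p.join()
    #   stats = [result_queue.get() for _ in range(2)]
    # -------------------------------------------------------------------------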
243 | 244 | " create a queue to pass to workers to store the results " 245 | result_queue = multiprocessing.Queue() 246 | 247 | " spawn workers, give function " 248 | jobs = [] 249 | for i in range( min(Parallel,submitted) ): 250 | worker = Worker(work_queue, result_queue, rawedit) 251 | worker.start() 252 | jobs.append(worker) 253 | for job in jobs: 254 | job.join() 255 | 256 | 257 | " collect the results off the queue " 258 | outstats = open(WORK+"stats/s2.rawedit.txt",'a') 259 | print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"]) 260 | STATS = [] 261 | for i in range(submitted): 262 | STATS.append(result_queue.get()) 263 | 264 | STATS.sort(key = lambda x: x[0]) 265 | for i in range(submitted): 266 | a,b,c,d = STATS[i] 267 | print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))]) 268 | 269 | print >>outstats, """ 270 | Nreads = total number of reads for a sample 271 | passed = retained reads that passed quality filtering at full length 272 | passed.w.trim= retained reads that were trimmed due to detection of adapters 273 | passed.total = total kept reads of sufficient length 274 | note: you can set the option in params file to include trimmed reads of xx length. """ 275 | outstats.close() 276 | 277 | #" zip files to save size " 278 | #for ff in glob.glob(WORK+"edits/*"): 279 | # os.system("gzip "+ff) 280 | -------------------------------------------------------------------------------- /pyrad/H_err_dp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import scipy.stats 4 | import scipy.optimize 5 | import numpy 6 | import itertools 7 | import sys 8 | import glob 9 | import multiprocessing 10 | import os 11 | import gzip 12 | from potpour import Worker 13 | 14 | 15 | 16 | def makeP(N): 17 | """ returns a list of freq. for ATGC""" 18 | sump = sum([sum(i) for i in N]) 19 | try: p1 = sum([float(i[0]) for i in N])/sump 20 | except ZeroDivisionError: p1 = 0.0 21 | try: p2 = sum([float(i[1]) for i in N])/sump 22 | except ZeroDivisionError: p2 = 0.0 23 | try: p3 = sum([float(i[2]) for i in N])/sump 24 | except ZeroDivisionError: p3 = 0.0 25 | try: p4 = sum([float(i[3]) for i in N])/sump 26 | except ZeroDivisionError: p4 = 0.0 27 | return [p1,p2,p3,p4] 28 | 29 | 30 | def L1(E,P,N): 31 | """probability homozygous""" 32 | h = [] 33 | s = sum(N) 34 | for i,l in enumerate(N): 35 | p = P[i] 36 | b = scipy.stats.binom.pmf(s-l,s,E) 37 | h.append(p*b) 38 | return sum(h) 39 | 40 | 41 | def L2(E,P,N): 42 | """probability of heterozygous""" 43 | h = [] 44 | s = sum(N) 45 | for l,i in enumerate(N): 46 | for j,k in enumerate(N): 47 | if j>l: 48 | one = 2.*P[l]*P[j] 49 | two = scipy.stats.binom.pmf(s-i-k,s,(2.*E)/3.) 50 | three = scipy.stats.binom.pmf(i,k+i,0.5) 51 | four = 1.-(sum([q**2. for q in P])) 52 | h.append(one*two*(three/four)) 53 | return sum(h) 54 | 55 | 56 | def totlik(E,P,H,N): 57 | """ total probability """ 58 | lik = ((1-H)*L1(E,P,N)) + (H*L2(E,P,N)) 59 | return lik 60 | 61 | def LL(x0,P,Tab): 62 | """ Log likelihood score given values [H,E] """ 63 | H = x0[0] 64 | E = x0[1] 65 | L = [] 66 | if (H <= 0.) or (E <= 0.): 67 | r = numpy.exp(100) 68 | else: 69 | for i in Tab: 70 | ll = totlik(E,P,H,i[0]) 71 | if ll > 0: 72 | L.append(i[1] * numpy.log(ll)) 73 | r = -sum(L) 74 | #print "\t".join(map(str,[r, H, E])) 75 | return r 76 | 77 | 78 | def LL_haploid(E,P,Tab): 79 | """ Log likelihood score given values [H,E] """ 80 | H = 0. 
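    # With H fixed at 0 the mixture in totlik() collapses to the homozygote
    # term alone, i.e. totlik(E, P, 0, N) == L1(E, P, N), so this variant
    # estimates only the sequencing-error rate E.  For a single site with
    # counts N = (18, 2, 0, 0) and depth s = 20, L1 sums, over each base i,
    # P[i] * Binom(s - N[i]; s, E): the chance that every read not matching
    # base i is a sequencing error.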
81 | L = [] 82 | if (E <= 0.): 83 | r = numpy.exp(100) 84 | else: 85 | for i in Tab: 86 | ll = totlik(E,P,H,i[0]) 87 | if ll > 0: 88 | L.append(i[1] * numpy.log(ll)) 89 | r = -sum(L) 90 | #print "\t".join(map(str,[r, H, E])) 91 | return r 92 | 93 | 94 | 95 | def table_c(N): 96 | """ makes a dictionary with counts of base counts [x,x,x,x]:x, 97 | speeds up Likelihood calculation""" 98 | Tab = {} 99 | k = iter(N) 100 | while 1: 101 | try: 102 | d = k.next() 103 | except StopIteration: break 104 | if tuple(d) in Tab: 105 | Tab[tuple(d)] += 1 106 | else: 107 | Tab[tuple(d)] = 1 108 | L = [] 109 | for i,j in Tab.items(): 110 | [i,j] 111 | L.append([i,j]) 112 | return [i for i in L if (0,0,0,0) not in i] 113 | 114 | 115 | def stack(D): 116 | """ 117 | from list of bases at a site D, 118 | returns an ordered list of counts of bases 119 | """ 120 | L = len(D) 121 | counts = [] 122 | for i in range(len(D[0])): 123 | A=C=T=G=N=S=0 124 | for nseq in range(L): 125 | A += D[nseq][i].count("A") 126 | C += D[nseq][i].count("C") 127 | T += D[nseq][i].count("T") 128 | G += D[nseq][i].count("G") 129 | N += D[nseq][i].count("N") 130 | S += D[nseq][i].count("-") 131 | counts.append( [[A,C,T,G],N,S] ) 132 | return counts 133 | 134 | 135 | 136 | def consensus(f, minsamp, CUT1, CUT2, datatype): 137 | """ makes a list of lists of reads at each site """ 138 | f = gzip.open(f) 139 | k = itertools.izip(*[iter(f)]*2) 140 | L = [] 141 | locus = 0 142 | while 1: 143 | try: 144 | first = k.next() 145 | except StopIteration: break 146 | itera = [first[0],first[1]] 147 | fname = first[0] 148 | S = [] 149 | rights = [] 150 | lefts = [] 151 | leftjust = rightjust = None 152 | while itera[0] != "//\n": 153 | nreps = int(itera[0].strip().split(";")[1].replace("size=","")) 154 | 155 | " record left and right most for cutting if gbs merge data " 156 | if datatype in ['mergegbs','gbs']: 157 | if itera[0].strip().split(";")[-1] == "": 158 | leftjust = itera[1].index([i for i in itera[1] if i not in list("-N")][0]) 159 | rightjust = itera[1].rindex([i for i in itera[1] if i not in list("-N")][0]) 160 | lefts.append(itera[1].index([i for i in itera[1] if i not in list("-N")][0])) 161 | rights.append(itera[1].rindex([i for i in itera[1] if i not in list("-N")][0])) 162 | 163 | " append sequence * number of dereps " 164 | for i in range(nreps): 165 | S.append(tuple(itera[1].strip())) 166 | itera = k.next() 167 | 168 | " trim off overhang edges of gbs reads " 169 | if datatype in ['mergegbs','gbs']: 170 | if any([i < leftjust for i in lefts]): 171 | rightjust = min(rights) 172 | if any([i < rightjust for i in rights]): 173 | leftjust = max(lefts) 174 | 175 | for s in range(len(S)): 176 | if rightjust: 177 | S[s] = S[s][leftjust:rightjust+1] 178 | if leftjust: 179 | S[s] = S[s][leftjust:rightjust+1] ## +1? 
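        # Worked example (illustrative only, hypothetical cutters): with
        # CUT1 = "TGCAG" and CUT2 = "AATT", the merged/paired/gbs branch
        # below keeps S[s][len(CUT1):-(len(CUT2)+1)], i.e. it drops the
        # first 5 bases and the last 5 (the 4-base cutter plus one buffer
        # base); for the other datatypes only the leading CUT1 bases are
        # removed.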
180 | 181 | " trim off restriction sites from end/s " 182 | if datatype in ['merged','pairddrad','pairgbs','gbs']: 183 | for s in range(len(S)): 184 | S[s] = S[s][len(CUT1):-(len(CUT2)+1)] 185 | else: 186 | for s in range(len(S)): 187 | S[s] = S[s][len(CUT1):] 188 | 189 | if len(S) >= minsamp: 190 | " make list for each site in sequences " 191 | res = stack(S) 192 | " exclude sites with indels " 193 | L += [i[0] for i in res if i[2] == 0] 194 | locus += 1 195 | return L 196 | 197 | 198 | 199 | 200 | 201 | def optim(WORK,handle, minsamp, CUT1, CUT2, datatype, haplos): 202 | name = handle.split("/")[-1].replace(".clustS.gz","") 203 | D = consensus(handle, minsamp, CUT1, CUT2, datatype) 204 | P = makeP(D) 205 | Tab = table_c(D) 206 | del D 207 | #H,E = scipy.optimize.fmin(LL,x0,(P,Tab),maxiter=500,maxfun=200,ftol=0.0001,disp=False,full_output=False) 208 | if haplos == 1: 209 | x0 = [0.001] 210 | H = 0. 211 | E = scipy.optimize.fmin(LL_haploid,x0,(P,Tab),disp=False,full_output=False) 212 | else: 213 | x0 = [0.01,0.001] 214 | H,E = scipy.optimize.fmin(LL,x0,(P,Tab),disp=False,full_output=False) 215 | del Tab 216 | outfile = open(WORK+"stats/."+name+".temp",'w') 217 | outfile.write("\t".join([name.strip(".gz"),str(round(H,8))[0:10],str(round(E,8))[0:10],"\n"])) 218 | outfile.close() 219 | sys.stderr.write(".") 220 | 221 | 222 | 223 | 224 | def main(Parallel,ID,minsamp,subset,haplos,WORK,CUT,datatype): 225 | sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t") 226 | 227 | " find clust.xx directory " 228 | if not os.path.exists(WORK+'clust'+ID): 229 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \ 230 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \ 231 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold" 232 | sys.exit() 233 | 234 | 235 | # warning message for low minsamp 236 | if minsamp < 5: 237 | sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n 238 | If you intend to make low coverage base calls use a high mindepth in 239 | step 4 to accurately infer H & E parameters, and then use a low mindepth 240 | in conjunction with the line 31 params file option to make low coverage 241 | base calls""") 242 | 243 | # if haploid data 244 | if haplos == 1: 245 | sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t") 246 | 247 | # if double digest use first cut site 248 | if "," in CUT: 249 | CUT1, CUT2 = CUT.strip().split(",") 250 | else: 251 | CUT1 = CUT2 = CUT 252 | 253 | # load up work queue 254 | work_queue = multiprocessing.Queue() 255 | 256 | # iterate over files 257 | HH = glob.glob(WORK+"clust"+ID+"/"+subset+"*.clustS*") 258 | submitted = 0 259 | FS = [] 260 | if len(HH) > 1: 261 | ## sort files by size 262 | for i in range(len(HH)): 263 | statinfo = os.stat(HH[i]) 264 | if statinfo.st_size > 1000: 265 | FS.append((HH[i],statinfo.st_size)) 266 | else: 267 | print "excluding ",HH[i],"file is too small\n" 268 | FS.sort(key=lambda x: x[1]) 269 | FS = [i[0] for i in FS] 270 | else: 271 | FS = HH 272 | REMOVE = glob.glob(WORK+'clust'+ID+"/cat.*") 273 | FS = [f for f in FS if f not in REMOVE] 274 | for handle in FS: 275 | work_queue.put([WORK,handle, minsamp, CUT1, CUT2, datatype, haplos]) 276 | submitted += 1 277 | 278 | " remove temp files if previous run " 279 | for ff in FS: 280 | end = ff.split("/")[-1].replace(".clustS.gz","") 281 | ff = WORK+"stats/."+end+".temp" 282 | if os.path.exists(ff): 283 | os.remove(ff) 
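    # --- Illustrative sketch (not part of pyrad) ----------------------------
    # Each queued job runs optim(), which collapses a sample's clusters into
    # a table of per-site base counts and then minimizes the negative log-
    # likelihood LL over [H, E] with scipy's Nelder-Mead simplex.  A toy run
    # on made-up counts (assuming this module is importable as H_err_dp):
    #
    #   import scipy.optimize
    #   from H_err_dp import LL, makeP
    #
    #   D   = [[90, 2, 0, 0]] * 40 + [[45, 47, 0, 0]] * 10   # fake site counts
    #   P   = makeP(D)                                        # base frequencies
    #   Tab = [[(90, 2, 0, 0), 40], [(45, 47, 0, 0), 10]]     # collapsed table
    #   H, E = scipy.optimize.fmin(LL, [0.01, 0.001], (P, Tab), disp=False)
    # -------------------------------------------------------------------------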
284 | 285 | " create a queue to pass to workers to store the results " 286 | result_queue = multiprocessing.Queue() 287 | results = [] 288 | 289 | " spawn workers " 290 | jobs = [] 291 | for i in range( min(Parallel,submitted) ): 292 | worker = Worker(work_queue, result_queue, optim) 293 | worker.start() 294 | jobs.append(worker) 295 | for job in jobs: 296 | job.join() 297 | 298 | " write results to stats file " 299 | if not os.path.exists(WORK+"stats/Pi_E_estimate.txt"): 300 | outstats = open(WORK+"stats/Pi_E_estimate.txt",'w') 301 | outstats.write("taxa\tH\tE\n") 302 | else: 303 | outstats = open(WORK+"stats/Pi_E_estimate.txt",'a') 304 | for ff in FS: 305 | end = ff.split("/")[-1].replace(".clustS.gz","") 306 | ft = WORK+"stats/."+end+".temp" 307 | line = open(ft).readlines() 308 | outstats.write(line[0]) 309 | os.remove(ft) 310 | # n,h,e = line[0].strip().split("\t") 311 | # H.append(float(h)) 312 | # E.append(float(e)) 313 | #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n") 314 | #outstats.write(" ".join(["mean H =",str(numpy.mean(H))])) 315 | outstats.close() 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /pyrad/editraw_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import multiprocessing 4 | import itertools 5 | import sys 6 | import os 7 | import glob 8 | import operator 9 | import gzip 10 | from potpour import Worker 11 | from sortandcheck2 import unambig 12 | 13 | 14 | def revcomp(s): 15 | ss = s[::-1].strip().replace("A","t").replace("T","a").\ 16 | replace("C","g").replace("G","c").replace("n","Z").upper().replace("Z","n") 17 | return ss 18 | 19 | 20 | def unambar(CUT): 21 | if any([i in CUT for i in list("RKYSWM")]): 22 | CUTa, CUTb = unambig(CUT) 23 | return [CUTa,CUTb] 24 | else: 25 | return False 26 | 27 | 28 | def Afilter(CUT,s,strict,read): 29 | read1 = read==1 30 | a = b = lookfor = wheretocut = None 31 | " lookfor cut site " 32 | 33 | " if ambiguity in cutter " 34 | if unambar(CUT): 35 | CUTa,CUTb = unambar(CUT) 36 | if strict == 2: 37 | if read1: 38 | lookfor1 = CUTa+"A" 39 | lookfor2 = CUTb+"A" 40 | else: 41 | lookfor1 = CUTa 42 | lookfor2 = CUTb 43 | else: 44 | if read1: 45 | lookfor1 = CUTa+"AGAT" 46 | lookfor2 = CUTb+"AGAT" 47 | else: 48 | lookfor1 = "A"*50 49 | lookfor2 = "A"*50 50 | if lookfor1 in s: 51 | a = s.rindex(lookfor1) 52 | if lookfor2 in s: 53 | b = s.rindex(lookfor2) 54 | if (a or b): 55 | wheretocut = min([i for i in [a,b] if i]) 56 | else: 57 | wheretocut = None 58 | else: 59 | "no ambiguity in cutter " 60 | if strict == 2: 61 | if read1: 62 | lookfor1 = CUT+"A" 63 | else: 64 | lookfor1 = CUT 65 | else: 66 | if read1: 67 | lookfor1 = CUT+"AGA" 68 | else: 69 | lookfor1 = "A"*50 70 | if lookfor1 in s: 71 | wheretocut = s.rindex(lookfor1) 72 | else: 73 | wheretocut = None 74 | 75 | " look for adapter sequence " 76 | if not wheretocut: 77 | if strict == 2: 78 | lookfor1 = "AGATCG" 79 | else: 80 | lookfor1 = "AGATCGGA" 81 | if lookfor1 in s: 82 | if read1: 83 | wheretocut = s.rindex(lookfor1)-(len(CUT)+1) 84 | else: 85 | wheretocut = s.rindex(lookfor1)-(len(CUT)+6) 86 | else: 87 | wheretocut = None 88 | 89 | " look for CUT and end of seq " 90 | if not wheretocut: 91 | if CUT in s[-(len(CUT)+5):]: 92 | wheretocut = s.rindex(CUT) 93 | return wheretocut 94 | 95 | 96 | 97 | 98 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q, datatype): 99 | """ three functions: 100 | (1) replaces low quality base calls 
with Ns, 101 | (2) checks for adapter sequence and xxbarcodesxx if strict set to 1 or 2 102 | (3) concatenate paired reads with a separator and write to file """ 103 | 104 | if CUT: 105 | if "," in CUT: 106 | CUT1,CUT2 = CUT.split(",") 107 | CUT2=revcomp(CUT2) 108 | x = 0 109 | else: 110 | CUT1=CUT 111 | CUT2=revcomp(CUT1) 112 | x = 1 ## trims garbage base off gbs 113 | 114 | " create iterators for R1 and R2 files " 115 | if ".gz" in infile: 116 | f1 = gzip.open(infile, 'rb') 117 | if ".forward." in infile: 118 | f2 = gzip.open(infile.replace(".forward.",".reverse."), 'r') 119 | else: 120 | f2 = gzip.open(infile.replace("_R1.","_R2."), 'r') 121 | else: 122 | f1 = open(infile,'r') 123 | if ".forward." in infile: 124 | f2 = open(infile.replace(".forward.",".reverse."), 'r') 125 | else: 126 | f2 = open(infile.replace("_R1.","_R2."), 'r') 127 | n = str(infile.split('/')[-1]) 128 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ","nomerge"]: 129 | n = n.replace('.'+n.split(".")[-1], "") 130 | if '.forward' in n: 131 | n = n.split(".forward")[0] 132 | None 133 | else: 134 | n = n.replace("_R1","") 135 | 136 | k1 = itertools.izip(*[iter(f1)]*4) 137 | k2 = itertools.izip(*[iter(f2)]*4) 138 | writing_r = [] 139 | writing_c = [] 140 | 141 | orig = keep = keepcut = 0 142 | handle = WORK+'edits/'+str(n)+".edit" 143 | 144 | "iterate over paired reads, edits 1st, if OK, append both to .edit file" 145 | while 1: 146 | try: d = k1.next() 147 | except StopIteration: break 148 | dd = k2.next() 149 | 150 | orig += 1 151 | SS = d[1].strip() 152 | ph = map(ord,d[3].strip()) 153 | offset = int(Q) 154 | phred = map(lambda x:x-offset,ph) 155 | seq = ["N"]*len(phred) 156 | for base in range(len(phred)): 157 | if base >= len(CUT1): ## don't quality check cut site 158 | if phred[base] >= 20: ## quality threshold 159 | try: seq[base] = SS[base] 160 | except IndexError: 161 | None 162 | else: 163 | seq[base] = "N" 164 | else: 165 | if unambar(CUT1): 166 | seq[base] = unambar(CUT1)[0][base] 167 | else: 168 | seq[base] = CUT1[base] 169 | 170 | s = "".join(seq) 171 | wheretocut1 = None 172 | 173 | " apply filters for adapter sequences " 174 | " if GBS CUT2 = revcomp(CUT1) ex: CTGCA" 175 | " if ddRAD CUT2 = revcomp(CUT2) ex: AATT " 176 | if strict: 177 | wheretocut1 = Afilter(CUT2,s,strict,1) 178 | 179 | if s.count("N") <= pN: ## max allowed Ns 180 | if len(s) >= max(32,trimkeep): ## if trimmed read1 length atleast t 181 | 182 | " first read is (maybe) good, now filter second reads " 183 | SS = dd[1].strip() 184 | ph = map(ord,dd[3].strip()) 185 | " if PEAR filtered then seqs are revcomps " 186 | if '.forward' in infile: 187 | SS = revcomp(SS) 188 | ph = ph[::-1] 189 | 190 | offset = int(Q) 191 | phred = map(lambda x:x-offset,ph) 192 | seq = ["N"]*len(phred) 193 | for base in range(len(phred)): 194 | if base > len(CUT2): ## don't quality check cut site 195 | if phred[base] >= 20: ## quality threshold 196 | try: seq[base] = SS[base] 197 | except IndexError: None 198 | else: 199 | seq[base] = "N" 200 | else: 201 | try: seq[base] = SS[base] 202 | except IndexError: None 203 | s2 = "".join(seq) 204 | 205 | " filter for gbs read2s, b/c they will be clustered" 206 | badread = 0 207 | if datatype == "pairgbs": 208 | s2 = s2[:len(s)] 209 | if s2.count("N")>pN: 210 | badread = 1 211 | 212 | " apply adapter filter to read 2 " 213 | wheretocut2 = None 214 | if strict: 215 | wheretocut2 = Afilter(revcomp(CUT1),s2,strict,2) 216 | 217 | if (wheretocut1 and wheretocut2): 218 | cutter = min(wheretocut1,wheretocut2) 219 | 
else: 220 | cutter = max(wheretocut1,wheretocut2) 221 | if strict: 222 | if not cutter: 223 | if (revcomp(CUT1) in s2[-16:]) or (CUT2 in s[-10:]): 224 | cutter = len(s)-16 225 | 226 | if not badread: 227 | if cutter: 228 | ## second read was trimmed 229 | if cutter > max(32,trimkeep): 230 | ## include only the first read, with an N placeholder for read2 231 | ## since it was trimmed off 232 | sout = ">"+n+"_"+str(keepcut)+"_trim1"+"\n"+s[:cutter]+\ 233 | "nnnnN\n"#+d[2]+d[3][:cutter]+"\n" 234 | writing_c.append(sout) 235 | keepcut += 1 236 | ## cannot keep trimmed second read in pairddrad method 237 | ## but can in pairgbs 238 | if datatype == 'pairgbs': 239 | sout = ">"+n+"_"+str(keepcut)+"_trim2"+"\nNnnnn"+revcomp(s2[x:cutter+5])+\ 240 | "\n"#+d[2]+d[3][x:cutter+5]+"\n" 241 | writing_c.append(sout) 242 | keepcut += 1 243 | else: 244 | ## second read is good, not trimmed 245 | sout = ">"+n+"_"+str(keep)+"_pair"+"\n"+s[:-1]+"nnnn"+revcomp(s2[x:])+"\n" 246 | writing_r.append(sout) 247 | keep += 1 248 | 249 | if not orig % 5000: 250 | #if trimkeep: 251 | # with open(WORK+'mergedreads/'+str(n)+"M.fq",'a') as outfile: 252 | # outfile.write("".join([z for z in writing_c])) 253 | " writes only full length reads " 254 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 255 | outfile.write("".join([z for z in writing_r])) 256 | " writes only full length reads " 257 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 258 | outfile.write("".join([z for z in writing_c])) 259 | writing_r = [] 260 | writing_c = [] 261 | 262 | #if trimkeep: 263 | # with open(WORK+'mergedreads/'+str(n)+"M.fq",'a') as outfile: 264 | # outfile.write("".join([z for z in writing_c])) 265 | " writes only full length reads " 266 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 267 | outfile.write("".join([z for z in writing_r])) 268 | " writes only full length reads " 269 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile: 270 | outfile.write("".join([z for z in writing_c])) 271 | writing_r = [] 272 | writing_c = [] 273 | 274 | f1.close() 275 | f2.close() 276 | sys.stderr.write(".") 277 | if datatype=='pairgbs': 278 | keepcut = keepcut*2 279 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keepcut),str(keep)] 280 | 281 | 282 | 283 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype): 284 | 285 | print >>sys.stderr, "\n\tstep 2: quality filtering \n\t", 286 | 287 | " create output directories " 288 | if not os.path.exists(WORK+'stats'): 289 | os.makedirs(WORK+'stats') 290 | if not os.path.exists(WORK+'edits'): 291 | os.makedirs(WORK+'edits') 292 | 293 | " load up work queue " 294 | submitted = 0 295 | work_queue = multiprocessing.Queue() 296 | 297 | " do not select merged or discarded reads if PEAR was used on data" 298 | FQs = glob.glob(FQs) 299 | fqs = [i for i in FQs if not any([j in i for j in ["discarded",".assembled."]])] 300 | 301 | if len(fqs) > 1: 302 | " subselect only the first reads " 303 | if any([".unassembled.forward." in i for i in fqs]): 304 | FS = [i for i in fqs if '.forward.' in i] 305 | else: 306 | FS = [i for i in fqs if '_R1.' 
in i] 307 | 308 | " order files by size " 309 | for i in range(len(FS)): 310 | statinfo = os.stat(FS[i]) 311 | FS[i] = FS[i],statinfo.st_size 312 | FS.sort(key=operator.itemgetter(1)) 313 | FS = [i[0] for i in FS][::-1] 314 | 315 | " submit jobs to queue " 316 | for handle in FS: 317 | n = handle.split('/')[-1] 318 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ","nomerge"]: 319 | n = n.replace('.'+n.split(".")[-1], "") 320 | if '.forward.' in n: 321 | n = n.split(".forward")[0] 322 | None 323 | else: 324 | "_".join(n.split('_R')[:-1]) 325 | if WORK+"edits/"+n+".edit" not in glob.glob(WORK+"edits/*"): 326 | if os.stat(handle).st_size > 0: ## exclude empty files 327 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype] 328 | work_queue.put(args) 329 | submitted += 1 330 | else: 331 | print 'skipping',handle,", file is empty" 332 | else: 333 | print "\t"+n+'.edit'+" already in edits/" 334 | elif len(fqs) == 1: 335 | " if only one file " 336 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype]) 337 | submitted += 1 338 | 339 | else: 340 | print "no _paired_ de-multiplexed files found in this location." 341 | sys.exit() 342 | 343 | " create a queue to pass to workers to store the results " 344 | result_queue = multiprocessing.Queue() 345 | 346 | " spawn workers, give function " 347 | jobs = [] 348 | for i in range( min(Parallel,submitted) ): 349 | worker = Worker(work_queue, result_queue, rawedit) 350 | worker.start() 351 | jobs.append(worker) 352 | for job in jobs: 353 | job.join() 354 | 355 | 356 | " collect the results off the queue " 357 | outstats = open(WORK+"stats/s2.rawedit.txt",'a') 358 | print >> outstats, "\t".join(["sample","Nreads","exclude","trimmed","passed"]) 359 | for i in range(submitted): 360 | a,b,c,d = result_queue.get() 361 | print >> outstats, "\t".join([a,b, str(int(b)-int(d)), c, d]) 362 | 363 | print >>outstats, """ 364 | Nreads = total number of reads for a sample 365 | exclude = reads that were excluded 366 | trimmed = reads that had adapter trimmed but were kept 367 | passed = total kept reads 368 | """ 369 | outstats.close() 370 | 371 | -------------------------------------------------------------------------------- /pyrad/cluster_cons7_shuf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | import os 3 | import sys 4 | import itertools 5 | import numpy 6 | import random 7 | import glob 8 | import subprocess 9 | import gzip 10 | import copy 11 | from consensdp import unhetero, uplow, breakalleles 12 | 13 | 14 | 15 | def comp(seq): 16 | """ returns complement of sequence including ambiguity characters, 17 | and saves lower case info for multiple hetero sequences""" 18 | seq = seq.replace("A",'u')\ 19 | .replace('T','v')\ 20 | .replace('C','p')\ 21 | .replace('G','z')\ 22 | .replace('u','T')\ 23 | .replace('v','A')\ 24 | .replace('p','G')\ 25 | .replace('z','C') 26 | seq = seq.replace('R','u')\ 27 | .replace('Y','v')\ 28 | .replace('K','p')\ 29 | .replace('M','z')\ 30 | .replace('u','Y')\ 31 | .replace('v','R')\ 32 | .replace('p','M')\ 33 | .replace('z','K') 34 | seq = seq.replace('r','u')\ 35 | .replace('y','v')\ 36 | .replace('k','p')\ 37 | .replace('m','z')\ 38 | .replace('u','y')\ 39 | .replace('v','r')\ 40 | .replace('p','m')\ 41 | .replace('z','k') 42 | return seq 43 | 44 | 45 | def cmd_exists(cmd): 46 | return subprocess.call("type " + cmd, shell=True, 47 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 48 | 49 | 50 | def 
cluster(vsearch, handle, ID, datatype, 51 | quiet, WORK, gid, MASK): 52 | 53 | if datatype == 'pairddrad': 54 | " use first files for split clustering " 55 | if gid: 56 | "hierarchical clustering save temps " 57 | N = " -notmatched "+WORK+"prefix/"+handle.split("/")[-1].replace(".firsts_","._temp_") 58 | U = " -userout "+WORK+"prefix/"+handle.split("/")[-1].replace(".firsts_",".u_") 59 | else: 60 | N = "" 61 | U = " -userout "+handle.replace(".firsts_",".u") 62 | else: 63 | " use haplos files " 64 | if gid: 65 | "hierarchical clustering save temps " 66 | N = " -notmatched "+WORK+"prefix/"+handle.split("/")[-1].replace(".haplos_","._temp_") 67 | U = " -userout "+WORK+"prefix/"+handle.split("/")[-1].replace(".haplos_",".u_") 68 | else: 69 | N = "" 70 | U = " -userout "+handle.replace(".haplos_",".u") 71 | 72 | C = " -cluster_smallmem "+handle 73 | if datatype in ['gbs','pairgbs','merged']: 74 | P = " -strand both" 75 | COV = " -query_cov .90 " ## this can vary 76 | else: 77 | P = " -leftjust " 78 | COV = " -query_cov .90 " 79 | if 'vsearch' not in vsearch: 80 | Q = "" 81 | T = " -threads 1" 82 | else: 83 | Q = " -qmask "+MASK 84 | T = " -threads 6" 85 | cmd = vsearch+\ 86 | C+\ 87 | P+\ 88 | Q+\ 89 | T+\ 90 | " -id "+ID+\ 91 | U+\ 92 | " -userfields query+target+id+gaps+qstrand+qcov"+\ 93 | " -maxaccepts 1"+\ 94 | " -maxrejects 0"+\ 95 | " -fulldp"+\ 96 | " -usersort"+\ 97 | COV+\ 98 | N 99 | #os.system(cmd) 100 | subprocess.call(cmd, shell=True) 101 | 102 | 103 | def makeclust(handle,datatype,gid, 104 | minmatch,WORK): 105 | 106 | " read in cluster hits and seeds files " 107 | if not gid: 108 | Userout = open(handle.replace(".haplos_",".u"),'r') 109 | outfile = gzip.open(handle.replace(".haplos_"+gid,".clust_"+gid+".gz"),'w') 110 | else: 111 | Userout = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".u_") ,'r') 112 | nomatch = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_","._temp_"),'r') 113 | outfile = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".seed_"),'w') 114 | outfilename = WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".seed_") 115 | 116 | " load full fasta file into a Dic " 117 | D = {} 118 | if datatype == 'pairddrad': 119 | if gid: 120 | f = open(handle.replace(".haplos_"+gid,".firsts_"+gid)) 121 | else: 122 | f = gzip.open(handle.replace(".haplos_"+gid,".consens_"+gid+".gz")) 123 | else: 124 | f = gzip.open(handle.replace(".haplos_"+gid,".consens_"+gid+".gz")) 125 | 126 | L = itertools.izip(*[iter(f)]*2) 127 | while 1: 128 | try: a,b = L.next() 129 | except StopIteration: break 130 | D[a.strip()] = b.strip() 131 | f.close() 132 | 133 | " load .u info into a Dic " 134 | U = {} 135 | for line in [line.split("\t") for line in Userout.readlines()]: 136 | if ">"+line[1] in U: 137 | U[">"+line[1]].append([">"+line[0],line[4]]) 138 | else: 139 | U[">"+line[1]] = [[">"+line[0],line[4]]] 140 | 141 | " if tier 1 of hierarchical clustering " 142 | if gid: 143 | if int(minmatch) == 1: 144 | " no reduction, write seeds only " 145 | # if datatype == 'pairddrad': 146 | # singles = itertools.izip(*[iter(open(handle.replace(".haplos_",".firsts_")))]*2) 147 | # else: 148 | # singles = itertools.izip(*[iter(open(handle))]*2) 149 | singles = nomatch.read().split(">")[1:] 150 | for i in singles: 151 | i = i.split("\n")[0]+"\n"+"".join(i.split("\n")[1:]).upper() 152 | #print ">"+i+"\n//" 153 | print >>outfile, ">"+i+"\n//" 154 | #print "//\n".join(i) 155 | #outfile.write("//\n".join(i)) 156 | # i,j = i.split('\n')[0], 
"\n".join(i.split('\n')[1:]) 157 | # outfile.write("//\n".join(i+j)) 158 | del singles 159 | #outfile.write("//\n".join(LLL)) 160 | # LLL = [] 161 | # while 1: 162 | # try: a,b = singles.next() 163 | # except StopIteration: break 164 | # LLL.append(a+b) 165 | #outfile.write("//\n".join(LLL)) 166 | #del LLL 167 | else: 168 | for key,values in U.items(): 169 | ## reduction, only write seed if minimum hits reached 170 | if (len(values)+1) >= int(minmatch): 171 | ## fix for if short seqs are excluded during clustering 172 | if D.get(key): 173 | seq = key+"\n"+D[key]+"\n" 174 | seq += "//\n" 175 | outfile.write(seq) 176 | 177 | else: 178 | " map sequences to clust file in order " 179 | seq = "" 180 | for key,values in U.items(): 181 | if D.get(key): ## fix for if short seqs are excluded during clustering 182 | seq = key+"\n"+D[key]+'\n' 183 | S = [i[0] for i in values] 184 | R = [i[1] for i in values] 185 | for i in range(len(S)): 186 | if D.get(S[i]): ## testing as fix for removed short reads... 187 | if R[i] == "+": 188 | seq += S[i] + '\n' + D[S[i]] + "\n" 189 | else: 190 | seq += S[i] + '\n' + comp(D[S[i]][::-1]) + "\n" 191 | seq += "//\n" 192 | outfile.write(seq) 193 | outfile.close() 194 | Userout.close() 195 | if gid: nomatch.close() 196 | 197 | 198 | 199 | def splitter(handle): 200 | infile = open(handle) 201 | if os.path.exists(handle.replace(".haplos",".firsts")): 202 | os.remove(handle.replace(".haplos",".firsts")) 203 | 204 | orderfirsts = open(handle.replace(".haplos",".firsts"),'w') 205 | dp = itertools.izip(*[iter(infile)]*2) 206 | ff = [] 207 | cnts = 0 208 | for d in dp: 209 | n,s = d 210 | ## checking fix to pairddrad splitting problem... 211 | ## backwards compatible with pyrad v2 212 | s1 = s.replace("X","x").replace("x","n").split("nn")[0] 213 | ff.append(n+s1+"\n") 214 | cnts += 1 215 | orderfirsts.write("".join(ff)) 216 | orderfirsts.close() 217 | return handle.replace(".haplos",".firsts") 218 | 219 | 220 | 221 | def makecons(vsearch, ID, datatype, 222 | outg, seed, gid, minmatch, inlist, 223 | WORK, quiet, outhandle): 224 | 225 | " find usearch" 226 | if not cmd_exists(vsearch): 227 | print "\tcannot find usearch (or vsearch), edit path in param file" 228 | sys.exit() 229 | 230 | " make list of consens files " 231 | FS = [i for i in inlist if "/cat.cons" not in i] 232 | FS = [i for i in FS if "/cat.group" not in i] 233 | if not FS: 234 | print "no consens files found" 235 | sys.exit() 236 | 237 | " and a list including outgroups " 238 | fs = copy.copy(inlist) 239 | 240 | " are files gzipped ? 
" 241 | if any(['.gz' in i[-4:] for i in FS]): 242 | gz = ".gz" 243 | else: 244 | gz = "" 245 | 246 | " remove previous files if present " 247 | if os.path.exists(WORK+'clust'+ID+'/cat.consens_'+gid+gz): 248 | os.remove(WORK+'clust'+ID+'/cat.consens_'+gid+gz) 249 | if os.path.exists(WORK+'clust'+ID+'/cat.group_'+gid+gz): 250 | os.remove(WORK+'clust'+ID+'/cat.group_'+gid+gz) 251 | 252 | 253 | " remove outgroup sequences, add back in later to bottom after shuffling " 254 | if outg: 255 | outgroup = outg.strip().split(",") 256 | if len(outgroup) > 1: 257 | for s in outgroup: 258 | if WORK+"clust"+ID+"/"+s+".consens"+gz in FS: 259 | FS.remove(WORK+"clust"+ID+"/"+s+".consens"+gz) 260 | else: 261 | outgroup = WORK+"clust"+ID+"/"+outg+".consens"+gz 262 | if outgroup in FS: 263 | FS.remove(outgroup) 264 | 265 | " create file with consens seqs from all taxa in list " 266 | out = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+gz,'w') 267 | 268 | for qhandle in FS: 269 | if gz: 270 | f = gzip.open(qhandle) 271 | else: 272 | f = open(qhandle) 273 | k = itertools.izip(*[iter(f)]*2) 274 | while 1: 275 | try: a = k.next() 276 | except StopIteration: break 277 | print >>out, a[0].strip()+" "+a[1].strip() 278 | f.close() 279 | out.close() 280 | 281 | " message to shell " 282 | if gid: 283 | sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+\ 284 | ' similarity \n\tfor group ('+str(gid)+') retaining seeds w/ minimum of '+str(minmatch)+' hits\n\n') 285 | else: 286 | sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+' similarity \n\n') 287 | 288 | " make list of random number and data " 289 | if seed: 290 | random.seed(seed) 291 | source = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+".gz",'r') 292 | data = [ (random.random(), line) for line in source ] 293 | source.close() 294 | " sort by random number " 295 | data.sort() 296 | 297 | " order by size while retaining randomization within size classes " 298 | D = [line.split(' ') for _, line in data] 299 | DD = ["".join([i[0]+" "*(100-len(i[0])),i[1]]) for i in D] 300 | DD.sort(key=len, reverse=True) 301 | k = iter(["**".join([i.split(" ")[0],i.split(" ")[-1]]) for i in DD]) 302 | 303 | " write output to .consens_.gz file " 304 | out = gzip.open(WORK+'clust'+ID+'/cat.consens_'+gid+".gz",'w') 305 | while 1: 306 | try: a,b = k.next().split("**") 307 | except StopIteration: break 308 | print >>out, a+'\n'+b.strip() 309 | 310 | 311 | """ add outgroup taxa back onto end of file.""" 312 | if outg: 313 | " append to existing consens_ file " 314 | outgroup = outg.strip().split(',') 315 | if len(outgroup) > 1: 316 | for s in outgroup: 317 | xoutg = WORK+"clust"+ID+"/"+s+".consens.gz" 318 | if xoutg in fs: 319 | f = gzip.open(xoutg) 320 | k = itertools.izip(*[iter(f)]*2) 321 | while 1: 322 | try: a = k.next() 323 | except StopIteration: break 324 | print >>out, a[0].strip()+"\n"+a[1].strip() 325 | f.close() 326 | elif len(outgroup) == 1: 327 | xoutg = WORK+"clust"+ID+"/"+outgroup[0]+".consens.gz" 328 | if xoutg in fs: 329 | f = gzip.open(xoutg) 330 | k = itertools.izip(*[iter(f)]*2) 331 | while 1: 332 | try: a = k.next() 333 | except StopIteration: break 334 | print >>out, a[0].strip()+"\n"+a[1].strip() 335 | f.close() 336 | else: 337 | None 338 | out.close() 339 | 340 | 341 | """ convert ambiguity codes into a sampled haplotype for any sample 342 | to use for clustering, but save ambiguities for later """ 343 | 344 | " output file" 345 | outhaplos = open(outhandle,'w') 346 | 347 | " input file " 348 | infile 
= gzip.open(WORK+"clust"+ID+"/cat.consens_"+gid+".gz") 349 | lines = iter(infile.readlines()) 350 | infile.close() 351 | 352 | " write to haplo files in fasta format " 353 | writinghaplos = [] 354 | 355 | for line in lines: 356 | if ">" in line: 357 | writinghaplos.append(line.strip()) 358 | else: 359 | allele = breakalleles(line)[0] 360 | writinghaplos.append(allele.strip()) 361 | outhaplos.write("\n".join(writinghaplos)) 362 | outhaplos.close() 363 | 364 | 365 | def main(vsearch, ID, datatype, 366 | outg, seed, gid, minmatch, inlist, 367 | WORK, MASK, quiet): 368 | 369 | outhandle = WORK+"clust"+ID+"/cat.haplos_"+gid 370 | 371 | makecons(vsearch,ID,datatype, 372 | outg,seed,gid,minmatch, 373 | inlist,WORK,quiet,outhandle) 374 | 375 | if datatype == 'pairddrad': 376 | splithandle = splitter(outhandle) 377 | cluster(vsearch,splithandle,ID,datatype,quiet,WORK, gid, MASK) 378 | else: 379 | cluster(vsearch,outhandle,ID,datatype,quiet,WORK, gid, MASK) 380 | 381 | " remake clusters with .haplos, .u, and .temp files" 382 | makeclust(outhandle,datatype,gid,minmatch,WORK) 383 | 384 | 385 | 386 | 387 | 388 | -------------------------------------------------------------------------------- /pyrad/Dtest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import os 4 | import numpy 5 | import sys 6 | import random 7 | import itertools 8 | import glob 9 | import multiprocessing 10 | import cPickle as pickle 11 | from potpour import Worker 12 | 13 | 14 | def IUPAC(one): 15 | """ 16 | returns IUPAC symbol for ambiguity bases, 17 | used for polymorphic sites. 18 | """ 19 | D = {"R":['G','A'], 20 | "K":['G','T'], 21 | "S":['G','C'], 22 | "Y":['T','C'], 23 | "W":['T','A'], 24 | "M":['C','A']} 25 | return D[one] 26 | 27 | 28 | def makefreq(patlist): 29 | " identify which allele is derived in P3 relative to outgroup " 30 | " and is the most frequent and use that as the SNP " 31 | P = {} 32 | for tax in patlist: 33 | P[tax] = [] 34 | 35 | for tax in patlist: 36 | for base in patlist[tax]: 37 | if base in list('ATGC'): 38 | P[tax].append(base[0]) 39 | P[tax].append(base[0]) 40 | elif base in list("RKSYWM"): 41 | hh = IUPAC(base[0]) 42 | for i in hh: 43 | P[tax].append(i) 44 | 45 | major = [i for i in set(P['p3']) if i not in set(P['o'])] 46 | " in case of multiple bases " 47 | if len(major) > 1: 48 | cc = [P['p3'].count(base) for base in major] 49 | major = major[cc.index(max(cc))] ## maybe [0] 50 | elif not major: 51 | major = [i for i in set(P['o']) if i in set(P['p3'])] 52 | elif len(major) == 1: 53 | major = major[0] 54 | 55 | ret = [float(P[i].count(major))/len(P[i]) for i in ['p1','p2','p3','o']] 56 | return ret 57 | 58 | 59 | def Dstat(Loc, pat): 60 | if pat[0] != pat[1]: 61 | if pat[0] == pat[3]: 62 | if pat[1] == pat[2]: 63 | Loc.abba += 1 64 | else: 65 | if pat[0] == pat[2]: 66 | if pat[1] == pat[3]: 67 | Loc.baba += 1 68 | return Loc 69 | 70 | 71 | def polyDstat(Loc,patlist): 72 | ## calculate frequencies 73 | " look at the P3 taxon first for a derived allele " 74 | p1,p2,p3,o = makefreq(patlist) #[a0,a1,a2,a3]) 75 | Loc.abba += ((1.-p1)*p2*p3*(1.-o)) 76 | Loc.baba += (p1*(1.-p2)*p3*(1.-o)) 77 | return Loc 78 | 79 | 80 | def fillin(ll,name,col,ulnames,patlist): 81 | if len(ll)>1: 82 | for i in ll: 83 | patlist[name] = col[ [ulnames.index(i) for i in ll if i in ulnames] ] 84 | else: 85 | patlist[name] = col[ [ulnames.index(i) for i in ll ] ] 86 | return patlist 87 | 88 | 89 | 90 | def IUA(Loc,L): 91 | Loc.abba = 0. 92 | Loc.baba = 0. 
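    # Columns are scored only when every focal taxon has a fixed base; with
    # rows ordered (P1, P2, P3, O), Dstat() counts a column as ABBA when
    # P1 != P2, P1 == O, and P2 == P3 (e.g. A, G, G, A), and as BABA when
    # P1 == P3 and P2 == O (e.g. G, A, G, A).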
93 | for col in Loc.seq.transpose(): 94 | if all(i in list("ATGC") for i in col): 95 | Loc = Dstat(Loc,col) 96 | return Loc 97 | 98 | 99 | def IUAfreq(Loc,L): 100 | patlist = {} 101 | Loc.abba = 0. 102 | Loc.baba = 0. 103 | for col in Loc.seq.transpose(): 104 | patlist = fillin(L[0],'p1',col,Loc.names,patlist) 105 | patlist = fillin(L[1],'p2',col,Loc.names,patlist) 106 | patlist = fillin(L[2],'p3',col,Loc.names,patlist) 107 | patlist = fillin(L[3],'o', col,Loc.names,patlist) 108 | 109 | #print Loc.seq, Loc.number 110 | #print patlist 111 | if not any([all([i in ["N",'-'] for i in patlist['p1']]), 112 | all([i in ["N",'-'] for i in patlist['p2']]), 113 | all([i in ["N",'-'] for i in patlist['p3']]), 114 | all([i in ["N",'-'] for i in patlist['o']])]): 115 | if any([i not in patlist['o'] for i in patlist['p3']]): 116 | Loc = polyDstat(Loc,patlist) 117 | else: 118 | None 119 | else: 120 | None 121 | return Loc 122 | 123 | 124 | 125 | def sample_wr(population, k): 126 | "used for bootstrap sampling" 127 | "Chooses k random elements (with replacement) from a population" 128 | n = len(population) 129 | _random, _int = random.random, int # speed hack 130 | return [_int(_random() * n) for i in itertools.repeat(None, k)] 131 | 132 | 133 | def bootfreq(Ldict, which): 134 | Dftop = Dfbot = 0 135 | while 1: 136 | try: Lx = Ldict[Ldict.keys()[which.next()]] 137 | except StopIteration: break 138 | Dftop += Lx.abba - Lx.baba 139 | Dfbot += Lx.abba + Lx.baba 140 | D = 0. 141 | if Dfbot > 0: 142 | D = Dftop/float(Dfbot) 143 | return D 144 | 145 | 146 | def bootfixed(Ldict, which): 147 | abba = baba = 0 148 | while 1: 149 | try: Lx = Ldict[Ldict.keys()[which.next()]] 150 | except StopIteration: break 151 | abba += Lx.abba 152 | baba += Lx.baba 153 | D = 0. 154 | if abba+baba > 0: 155 | D = float(abba-baba)/float(abba+baba) 156 | return D 157 | 158 | 159 | def makeSNP(L,snpfreq,loci): 160 | Ndict = {} 161 | num = 0 162 | for loc in loci: 163 | Loc = Locus() 164 | Loc.number = num 165 | " only select loci that have data for all four tiptaxa " 166 | names = [i.split()[0].replace(">","") for i in loc.lstrip().rstrip().split("\n")[1:-1]] 167 | if snpfreq: 168 | Loc.names = [i for i in names if i in list(itertools.chain(*L))] 169 | else: 170 | Loc.names = L # [i for i in names if i in L] 171 | 172 | " if snpfreq only need one of possibly multiple individuals " 173 | keep = 0 174 | 175 | if snpfreq: 176 | for tax in L: 177 | z = any([tax in names for tax in L[0]]) 178 | y = any([tax in names for tax in L[1]]) 179 | w = any([tax in names for tax in L[2]]) 180 | u = any([tax in names for tax in L[3]]) 181 | if all([z,y,w,u]): 182 | keep = 1 183 | else: 184 | if all(tax in names for tax in Loc.names): 185 | keep = 1 186 | 187 | if keep: 188 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]]) 189 | 190 | " only select sites with synapomorphies " 191 | ## may want to keep autapomorphies in the future, or more 192 | ## when making a parameterized version of D-statistic 193 | ## only pyrad 2.1+ finds synapormorphies btwn hetero and fixed sites 194 | N[-1] = list(N[-1].tostring().replace("-","*")) 195 | N = N[:, N[-1] == "*"] 196 | " only select rows with focal taxa" 197 | if snpfreq: 198 | Loc.seq = N[[names.index(i) for i in Loc.names],:] 199 | else: 200 | Loc.seq = N[[names.index(i) for i in Loc.names],:] 201 | #print names 202 | #print N, "______________" 203 | #print Loc.number 204 | #print Loc.seq 205 | #print Loc.names 206 | #print [names.index(i) for i in Loc.names] 207 | Ndict[num] = Loc 208 | num += 1 209 
| return Ndict 210 | 211 | 212 | 213 | class Locus(): 214 | """locus keeps track of position in input file, 215 | variable sites, and D-statistics""" 216 | def _init_(self): 217 | self.number = number 218 | self.names = names 219 | self.seq = sequence 220 | self.abba = abba 221 | self.baba = baba 222 | def D(self): 223 | """ just used to check if abba > baba 224 | not a global genomic measure of D """ 225 | if self.abba+self.baba > 0: 226 | return float(self.abba-self.baba)/(self.abba+self.baba) 227 | else: 228 | return 0.0 229 | 230 | 231 | 232 | def runtest(infile, L, nboots, snpfreq, submitted): 233 | " print test" 234 | 235 | " split each locus " 236 | loci = open(infile).read().strip().split("|")[:-1] 237 | loci[0] = "xx\n"+loci[0] 238 | 239 | " returns a {} of Locus objects with data from tiptaxa L" 240 | Ldict = makeSNP(L,snpfreq,loci) 241 | 242 | " calculate ABBA/BABA for each locus" 243 | for loc in Ldict: 244 | if snpfreq: 245 | Ldict[loc] = IUAfreq(Ldict[loc],L) 246 | else: 247 | Ldict[loc] = IUA(Ldict[loc],L) 248 | 249 | " calculate final D " 250 | dftfinal = sum([Ldict[l].abba-Ldict[l].baba for l in Ldict]) 251 | dbtfinal = sum([Ldict[l].abba+Ldict[l].baba for l in Ldict]) 252 | if dbtfinal > 0: 253 | Dfinal = float(dftfinal)/dbtfinal 254 | else: 255 | Dfinal = 0. 256 | 257 | " proportion of discordant loci " 258 | try: pdisc = len([i for i in Ldict if Ldict[i].D()]) / float(len(Ldict)) 259 | except ZeroDivisionError: 260 | pdisc = 0.0 261 | 262 | " do bootstrapping " 263 | BB = [] 264 | for i in xrange(nboots): 265 | which = iter(sample_wr(xrange(len(Ldict)),len(Ldict))) 266 | if snpfreq: 267 | bb = bootfreq(Ldict, which) 268 | else: 269 | bb = bootfixed(Ldict, which) 270 | BB.append(bb) 271 | STD = numpy.std(BB) 272 | 273 | " out stats " 274 | if STD < 0.00001: 275 | STD = 0.0 276 | if Dfinal != 0.0: 277 | if STD != 0.0: 278 | Z = (abs(Dfinal/STD)) 279 | else: 280 | Z = 0.0 281 | else: 282 | Dfinal = 0. 283 | Z = 0. 
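    # The D and Z computed above, worked through on made-up totals: with
    # sum(ABBA) = 120 and sum(BABA) = 80, D = (120 - 80) / (120 + 80) = 0.2;
    # if the bootstrap replicates in BB have std = 0.05, then Z = |0.2 / 0.05|
    # = 4, i.e. Z is the number of bootstrap standard deviations that D sits
    # away from zero.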
284 | 285 | ABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D() > 0] 286 | BABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D() < 0] 287 | 288 | ret = [L,Dfinal,STD,Z, 289 | len(Ldict), 290 | sum([Ldict[l].abba for l in Ldict]), 291 | sum([Ldict[l].baba for l in Ldict]), 292 | pdisc,submitted, 293 | ABBAloci,BABAloci, BB ] 294 | pickle.dump(ret, open(".save.D4temp"+str(submitted),'wb')) 295 | 296 | 297 | 298 | 299 | def makesortfiles(outn,locifile,n,loci,outfile,makesort,sub,ps): 300 | locifile.sort() 301 | "write to ABBA file all loci indexed in ABBAloci list" 302 | with open(outfile+"_"+str(sub+1)+"."+outn[0:n]+".txt",'w') as out: 303 | print >>out, " ".join(ps) 304 | print >>out, "//" 305 | print >>out, ",".join(map(str,locifile)) 306 | print >>out, "//" 307 | if makesort == 2: 308 | for loc in xrange(len(loci)): 309 | if loc in locifile: 310 | out.write(loci[loc]+"| locus: "+str(loc)) 311 | 312 | 313 | 314 | 315 | 316 | def checktaxa(taxalist,alignfile): 317 | with open(alignfile) as infile: 318 | data = infile.readlines() 319 | taxainfile = set() 320 | for line in data: 321 | if ">" in line: 322 | tax = line.split(" ")[0].replace(">","") 323 | if tax not in taxainfile: 324 | taxainfile.add(tax) 325 | if not set(taxalist).difference(taxainfile): 326 | return 1 327 | 328 | 329 | 330 | 331 | 332 | def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort, makeboots): 333 | 334 | " submit jobs to processors " 335 | work_queue = multiprocessing.Queue() 336 | result_queue = multiprocessing.Queue() 337 | submitted = 0 338 | Notes = [] 339 | for rep in tests: 340 | notes = "" 341 | if len(rep) == 2: 342 | rep,notes = rep 343 | p1,p2,p3,o = rep 344 | if any(["[" in i for i in rep]): 345 | p1 = p1[1:-1].split(",") 346 | p2 = p2[1:-1].split(",") 347 | p3 = p3[1:-1].split(",") 348 | o = o[1:-1].split(",") 349 | taxalist = list(itertools.chain(*[p1+p2+p3+o])) 350 | if checktaxa(taxalist,alignfile): 351 | work_queue.put([alignfile,[p1,p2,p3,o],nboots,1, submitted]) 352 | submitted += 1 353 | else: 354 | print 'a taxon name was found that is not in the sequence file' 355 | else: 356 | if checktaxa([p1,p2,p3,o],alignfile): 357 | work_queue.put([alignfile,[p1,p2,p3,o],nboots,0, submitted]) 358 | submitted += 1 359 | else: 360 | print 'a taxon name was found that is not in the sequence file' 361 | 362 | Notes.append(notes) 363 | jobs = [] 364 | for i in range(nproc): 365 | worker = Worker(work_queue, result_queue, runtest) 366 | jobs.append(worker) 367 | worker.start() 368 | for j in jobs: 369 | j.join() 370 | 371 | " read results back in " 372 | #Results = [result_queue.get() for i in range(submitted)] 373 | Results = [pickle.load(open(".save.D4temp"+str(i),'rb')) for i in xrange(submitted)] 374 | Results.sort(key = lambda x:x[8]) 375 | 376 | "setup results file " 377 | outs = open(outfile+".D4.txt", 'w') 378 | header = "\t".join([ 'P1'+" "*(namelen[0]-2), 379 | 'P2'+" "*(namelen[1]-2), 380 | 'P3'+" "*(namelen[2]-2), 381 | 'O'+" "*(namelen[3]-1), 382 | 'D','std(D)','Z', 383 | 'BABA','ABBA', 384 | 'nloci','nboot','pdisc', 'notes']) 385 | print >>outs, header 386 | 387 | for i in range(len(Results)): 388 | ps,D,STD,Z,nloci,ABBA,BABA,pdisc,sub,ABBAloci,BABAloci,boots = Results[i] 389 | ps = [str(x).replace("['","[").replace("']","]").replace("', '",",").replace(">","") for x in ps] 390 | print >>outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (ps[0]+" "*(namelen[0]-len(ps[0])), 391 | ps[1]+" "*(namelen[1]-len(ps[1])), 392 | ps[2]+" 
"*(namelen[2]-len(ps[2])), 393 | ps[3]+" "*(namelen[3]-len(ps[3])), 394 | D,STD,Z, 395 | BABA,ABBA, 396 | nloci,nboots, 397 | pdisc,Notes[i]) 398 | 399 | 400 | 401 | #loci = open(alignfile).read().strip().split("|")[:-1] 402 | loci = open(alignfile).read().strip().split("|")[:-1] 403 | loci[0] = "xx\n"+loci[0] 404 | 405 | if makesort: 406 | makesortfiles('ABBA',ABBAloci,4,loci,outfile,makesort,sub,ps) 407 | makesortfiles('BABA',BABAloci,4,loci,outfile,makesort,sub,ps) 408 | 409 | if makeboots: 410 | with open(outfile+"_"+str(sub+1)+".boots",'w') as out: 411 | out.write(",".join(map(str,boots))) 412 | 413 | for oldpickle in glob.glob(".save.D4temp*"): 414 | os.remove(oldpickle) 415 | 416 | 417 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots): 418 | 419 | P1namelen = max(map(len,[str(i[0][0]) for i in tests])) 420 | P2namelen = max(map(len,[str(i[0][1]) for i in tests])) 421 | P3namelen = max(map(len,[str(i[0][2]) for i in tests])) 422 | Onamelen = max(map(len,[str(i[0][3]).strip() for i in tests])) 423 | namelen = [P1namelen,P2namelen,P3namelen,Onamelen] 424 | 425 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots) 426 | 427 | 428 | 429 | if __name__ == '__main__': 430 | main() 431 | 432 | -------------------------------------------------------------------------------- /pyrad/sortandcheck2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import gzip 4 | import itertools 5 | import sys 6 | import glob 7 | import os 8 | import cPickle as pickle 9 | import multiprocessing 10 | from potpour import Worker 11 | 12 | 13 | def combinefiles(GLOB): 14 | "combines first and second reads file names" 15 | if len(glob.glob(GLOB)) > 1: 16 | FS = glob.glob(GLOB) 17 | else: 18 | FS = glob.glob(GLOB) 19 | firsts = [i for i in FS if "_R1_" in i] 20 | if len(firsts) < 1: 21 | print "\n\tFirst read files names must contain '_R1_'." 22 | sys.exit() 23 | seconds = [ff.replace("_R1_","_R2_") for ff in firsts] 24 | if len(firsts) != len(seconds): 25 | print "different numbers of first and second read files. 
Check that the names of files are correct" 26 | sys.exit() 27 | return zip(firsts,seconds) 28 | 29 | 30 | def revcomp(s): 31 | "returns reverse complement of a string" 32 | ss = s[::-1].strip().replace("A","t").replace("T","a").\ 33 | replace("C","g").replace("G","c").upper() 34 | return ss 35 | 36 | 37 | def matching(a,b, maxmismatch): 38 | "allows for N base difference between barcodes" 39 | if len(a) == len(b): 40 | t = [a[i]==b[i] for i in range(len(a))] 41 | if t.count(False) <= maxmismatch: 42 | return 1 43 | else: 44 | return 0 45 | else: 46 | return 0 47 | 48 | 49 | def unambig(seq): 50 | """ returns both resolutions of a cut site 51 | that has an ambiguous base in it """ 52 | resos = [] 53 | D = {"R":("G","A"), 54 | "K":("G","T"), 55 | "S":("G","C"), 56 | "Y":("T","C"), 57 | "W":("T","A"), 58 | "M":("C","A")} 59 | for base in list("RKSYWM"): 60 | if base in seq: 61 | resos.append(seq.replace(base,D[base][0])) 62 | resos.append(seq.replace(base,D[base][1])) 63 | return resos 64 | 65 | 66 | def findbcode(CUT,longB,l): 67 | barcode = 'N'*20 68 | " in case ambiguous base in CUT " 69 | if any([i in CUT for i in list("RKYSWM")]): 70 | CUT = unambig(CUT) 71 | Bs = [] 72 | for cut in CUT: 73 | if l[1][0:longB+len(cut)].count(cut) == 1: 74 | barcode = l[1].split(cut)[0].strip() 75 | elif l[1][0:longB+len(cut)].count(cut) == 2: 76 | barcode = cut.join(l[1].split(cut)[0:2]).strip() 77 | else: 78 | barcode = "" 79 | Bs.append(barcode) 80 | longestbar = Bs[map(len,Bs).index(max(map(len,Bs)))] 81 | return longestbar 82 | else: 83 | if l[1][0:longB+len(CUT)].count(CUT) == 1: 84 | barcode = l[1].split(CUT)[0].strip() 85 | elif l[1][0:longB+len(CUT)].count(CUT) == 2: 86 | barcode = CUT.join(l[1].split(CUT)[0:2]).strip() 87 | else: 88 | barcode = "" 89 | return barcode 90 | 91 | 92 | 93 | def barmatch(C, Raws, CUT, datatype, num, maxmismatch, WORK, longB): 94 | """matches reads to barcodes in barcode file 95 | and writes to individual temp files, after all 96 | read files have been split, temp files are collated 97 | into .fq files""" 98 | 99 | #CUT1 = CUT = unambig(CUT)[0] 100 | locus = 0 101 | match = 0 102 | match2 = 0 103 | barcodehits = set() 104 | 105 | " dictionary to record barcode misses" 106 | M = {} 107 | M['_'] = 0 108 | 109 | "read in paired end read files" 110 | if 'pair' in datatype: 111 | if '.gz' in Raws[0][-3:]: 112 | fr1 = gzip.open(Raws[0]) 113 | else: 114 | fr1 = open(Raws[0]) 115 | if '.gz' in Raws[1][-3:]: 116 | fr2 = gzip.open(Raws[1]) 117 | else: 118 | fr2 = open(Raws[1]) 119 | R1 = itertools.izip(*[iter(fr1)]*4) 120 | R2 = itertools.izip(*[iter(fr2)]*4) 121 | else: 122 | "read in single end read file" 123 | if '.gz' in Raws[-3:]: 124 | fr1 = gzip.open(Raws) 125 | else: 126 | fr1 = open(Raws) 127 | R1 = itertools.izip(*[iter(fr1)]*4) 128 | 129 | D = {} 130 | DD = {} 131 | while 1: 132 | try: r1 = R1.next() 133 | except StopIteration: break 134 | 135 | "match paired end reads together, for now" 136 | if 'pair' in datatype: 137 | r2 = R2.next() 138 | l = [r.strip() for r in r1] 139 | l = [l[0],l[1],l[2],l[3]] 140 | ll = [r.strip() for r in r2] 141 | ll = [ll[0],ll[1],ll[2],ll[3]] 142 | else: 143 | "make list of four fastq line elements" 144 | l = [r.strip() for r in r1] 145 | l = [l[0],l[1],l[2],l[3]] 146 | 147 | locus += 1 148 | 149 | if 'pair' in datatype: 150 | if longB[1] == 'same': 151 | " bars are all same length" 152 | barcode = l[1][:longB[0]] 153 | else: 154 | " find barcodes" 155 | barcode = findbcode(CUT,longB[0],l) 156 | if barcode: 157 | if barcode in M: 158 | 
M[barcode] += 1 159 | else: 160 | M[barcode] = 1 161 | 162 | "exclude the read if no cutsite/barcode found" 163 | if barcode in D: 164 | l[1] = l[1][len(barcode):] #barcode.join(l[1].split(barcode)[1:]) 165 | l[3] = l[3][len(barcode):] 166 | D[barcode].append("\n".join(l).strip()) 167 | DD[barcode].append("\n".join(ll).strip()) 168 | match += 1 169 | else: 170 | l[1] = l[1][len(barcode):] #barcode.join(l[1].split(barcode)[1:]) 171 | l[3] = l[3][len(barcode):] 172 | D[barcode] = l 173 | DD[barcode] = ll 174 | match += 1 175 | 176 | else: 177 | M["_"] += 1 178 | 179 | else: 180 | if longB[1] == 'same': 181 | if datatype=='2brad': 182 | barcode = l[1][-longB[0]:] 183 | else: 184 | barcode = l[1][:longB[0]] 185 | else: 186 | barcode = findbcode(CUT,longB[0],l) 187 | if barcode: 188 | " tracker of number of occurrences of each barcode" 189 | if barcode in M: 190 | M[barcode] += 1 191 | else: 192 | M[barcode] = 1 193 | 194 | "exclude the read if no cutsite/barcode found" 195 | "saves reads from barcodes to a dictionary D" 196 | if barcode in D: 197 | #l[1] = CUT1+l[1][len(barcode)+len(CUT):] 198 | if datatype=='2brad': 199 | l[1] = l[1][:-len(barcode)] 200 | l[3] = l[3][:-len(barcode)] 201 | else: 202 | l[1] = l[1][len(barcode):] 203 | l[3] = l[3][len(barcode):] 204 | D[barcode].append("\n".join(l).strip()) 205 | match += 1 206 | else: 207 | l[1] = l[1][len(barcode):] 208 | l[3] = l[3][len(barcode):] 209 | D[barcode] = l 210 | match += 1 211 | else: 212 | M["_"] += 1 213 | 214 | 215 | "write to file every 50Kth read" 216 | " only writes reads that match to a barcode in C by less than some N differences " 217 | if not locus % 50000: 218 | for bar in C: 219 | outF1 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R1_'+str(num)+'.gz','ab') 220 | if 'pair' in datatype: 221 | outF2 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R2_'+str(num)+'.gz','ab') 222 | for barcode in D: 223 | if matching(bar,barcode,maxmismatch): 224 | barcodehits.add(barcode) 225 | if D[barcode]: 226 | match2 += len(D[barcode]) ## -3 227 | outF1.write("\n".join(D[barcode])+'\n') 228 | if 'pair' in datatype: 229 | if DD[barcode]: 230 | outF2.write("\n".join(DD[barcode])+'\n') 231 | D[barcode] = [] 232 | DD[barcode] = [] 233 | outF1.close() 234 | if 'pair' in datatype: 235 | outF2.close() 236 | D[bar] = [] 237 | DD[bar] = [] 238 | 239 | 240 | "write the remaining reads to file" 241 | for bar in C: 242 | outF1 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R1_'+str(num)+'.gz','ab') 243 | if 'pair' in datatype: 244 | outF2 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R2_'+str(num)+'.gz','ab') 245 | for barcode in D: 246 | if matching(bar,barcode,maxmismatch): 247 | barcodehits.add(barcode) 248 | if D[barcode]: 249 | match2 += len(D[barcode]) ## -3 250 | outF1.write("\n".join(D[barcode])+'\n') 251 | if 'pair' in datatype: 252 | if DD[barcode]: 253 | outF2.write("\n".join(DD[barcode])+'\n') 254 | D[barcode] = [] 255 | DD[barcode] = [] 256 | outF1.close() 257 | if 'pair' in datatype: 258 | outF2.close() 259 | D[bar] = [] 260 | DD[bar] = [] 261 | 262 | 263 | sys.stderr.write(".") 264 | fr1.close() 265 | if 'pair' in datatype: 266 | fr2.close() 267 | 268 | "writes statistics out" 269 | statout = open(WORK+"stats/s1.sorting.txt",'a') 270 | if 'pair' in datatype: 271 | name = Raws[0].split("/")[-1].replace("_R1_","_") 272 | else: 273 | name = Raws.split("/")[-1].replace("_R1_","_") 274 | 275 | match2 = sum([M[i] for i in M if i in barcodehits]) 276 | writeit = "%s\t%i\t%i\t%i\n" % (name, locus, match, match2) 277 | statout.write(writeit) 278 | statout.close() 
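    # Each worker pickles its own barcode-count dictionary M to a hidden
    # .pickle file; writefunc() below merges these after all workers finish.
    # Sample assignment tolerates up to maxmismatch differences via
    # matching(), e.g. (illustrative) matching("TTAGGC", "TTAGGG", 1) -> 1
    # but matching("TTAGGC", "TTAAAG", 1) -> 0.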
279 | pickout = open(WORK+"fastq/."+name+".pickle","wb") 280 | pickle.dump( M, pickout) 281 | pickout.close() 282 | 283 | 284 | def writefunc(GLOB,Parallel,Bcode,CUT,datatype,maxmismatch,WORK): 285 | "create barcode dictionary" 286 | codetable = open(Bcode, 'r') 287 | codes = [line.strip().split() for line in codetable.readlines()] 288 | C = {} 289 | for line in codes: 290 | if line[0]: 291 | C[line[1].strip().upper()] = line[0] 292 | 293 | " find longest barcode " 294 | keylens = map(len,C.keys()) 295 | if len(set(keylens)) == 1: 296 | longB = (keylens[0],'same') 297 | else: 298 | longB = (max(keylens),'diff') 299 | 300 | " check for CUT in barcodes " 301 | CCC = unambig(CUT) 302 | if len(CCC)>1: 303 | for cut in CCC: 304 | if any([cut in i for i in C.keys()]): 305 | print "\n\twarning: CUT site matches within one of the barcodes, "+\ 306 | "I suggest double \n\tchecking the file to make sure it properly demultiplexes" 307 | else: 308 | if any([CUT in i for i in C.keys()]): 309 | print "\n\twarning: CUT site matches within one of the barcodes, "+\ 310 | "I suggest double \n\tchecking the file to make sure it properly demultiplexes" 311 | 312 | " read in sequence files " 313 | if len(glob.glob(GLOB)) > 1: 314 | FS = [f for f in glob.glob(GLOB)] 315 | else: 316 | FS = glob.glob(GLOB) 317 | if 'pair' in datatype: 318 | Raws = combinefiles(GLOB) 319 | else: 320 | Raws = FS 321 | 322 | "send jobs to multiprocess queue" 323 | num = 0 324 | work_queue = multiprocessing.Queue() 325 | submitted = 0 326 | for fs in Raws: 327 | if 'pair' in datatype: 328 | work_queue.put([C, [fs[0],fs[1]], CUT, datatype, num, maxmismatch, WORK, longB]) 329 | submitted += 1 330 | else: 331 | work_queue.put([C, fs, CUT, datatype, num, maxmismatch, WORK, longB]) 332 | submitted += 1 333 | num += 1 334 | 335 | result_queue = multiprocessing.Queue() 336 | 337 | "spawn workers, give function" 338 | jobs = [] 339 | for i in range( min(Parallel,submitted) ): 340 | worker = Worker(work_queue, result_queue, barmatch) 341 | worker.start() 342 | jobs.append(worker) 343 | for job in jobs: 344 | job.join() 345 | 346 | Ms = {} 347 | 348 | if len(glob.glob(WORK+"fastq/.*.pickle")) > 1: 349 | for pick in glob.glob(WORK+"fastq/.*.pickle"): 350 | pickin = open(pick, "rb") 351 | M = pickle.load( pickin ) 352 | pickin.close() 353 | for key in M: 354 | if key not in Ms: 355 | Ms[key] = M[key] 356 | else: 357 | Ms[key] += M[key] 358 | os.remove(pick) 359 | elif len(glob.glob(WORK+"fastq/.*.pickle")) == 1: 360 | pick = glob.glob(WORK+"fastq/.*.pickle")[0] 361 | pickin = open(pick, 'rb') 362 | Ms = pickle.load( pickin ) 363 | pickin.close() 364 | os.remove(pick) 365 | else: 366 | print "\nno stats file generated" 367 | 368 | Mkeys = Ms.keys() 369 | Mkeys.sort(key=lambda x: Ms[x], reverse=True) 370 | 371 | statout = open(WORK+"stats/s1.sorting.txt",'a') 372 | statout.write("\n\n") 373 | statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n") 374 | 375 | Cnames = C.keys() 376 | Cnames.sort() 377 | try: maxl = max(map(len,map(str,Ms.values()))) 378 | except ValueError: maxl = 5 379 | 380 | hits = [] 381 | for bar in Cnames: 382 | for barcode in Mkeys: 383 | if matching(bar, barcode, maxmismatch): 384 | print >>statout, "%s \t%s \t%s\t%s" % (C[bar], bar, barcode, 385 | str(Ms[barcode])+" "*(maxl+3-len(str(Ms[barcode])))) 386 | hits.append(barcode) 387 | 388 | statout.write("\n") 389 | maxl = max(map(len,Mkeys)) 390 | for barcode in Mkeys: 391 | if barcode not in hits: 392 | print >>statout, "nomatch \t%s \t%i" % (barcode+" "*(maxl+3-len(barcode)), 
Ms[barcode]) 393 | statout.close() 394 | 395 | 396 | 397 | def main(Bcode, GLOB, CUT, datatype, Parallel, maxmismatch, WORK): 398 | 399 | if not len(glob.glob(GLOB)) > 0: 400 | sys.stderr.write("\tno data found in "+GLOB+" fix path to the data files\n") 401 | sys.exit() 402 | 403 | "check for previous output" 404 | if not os.path.exists(WORK+'stats'): 405 | os.makedirs(WORK+'stats') 406 | if os.path.exists(WORK+'fastq'): 407 | if os.listdir(WORK+'fastq'): 408 | print ("\n\tfastq/ directory in working directory contains data, move/remove it before running step 1\n") 409 | sys.exit() 410 | else: 411 | os.makedirs(WORK+'fastq') 412 | 413 | if "*" in Bcode: 414 | if len(glob.glob(Bcode)) == 1: 415 | Bcode = glob.glob(Bcode)[0] 416 | 417 | sys.stderr.write("\n\tstep 1: sorting reads by barcode\n\t ") 418 | 419 | " seperate double digest cut sites, only need first read one for now " 420 | if "," in CUT: 421 | CUT,CUT2 = CUT.split(",") 422 | 423 | statout = open(WORK+"stats/s1.sorting.txt",'w') 424 | statout.write("\t".join(["file ","Nreads","cut_found","bar_matched"])+"\n") 425 | statout.close() 426 | 427 | " DO THE BARCODE SORTING " 428 | writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK) 429 | names = [line.split()[0] for line in open(Bcode).readlines()] 430 | 431 | # " remove tiny sorted temp files " 432 | # if len(glob.glob(GLOB)) > 1: 433 | # for name in names: 434 | # if len(glob.glob(WORK+"fastq/."+name+"*")) > 0: 435 | # "remove very small files, probably errors" 436 | # for ff in glob.glob(WORK+'fastq/.'+name+"*"): 437 | # statinfo = os.stat(ff) 438 | # s = statinfo.st_size 439 | # if s < 1000: 440 | # os.remove(ff) 441 | 442 | 443 | " concatenate temp files " 444 | for name in names: 445 | if len(glob.glob(WORK+"fastq/."+name+"*")) > 0: 446 | os.system("/bin/cat "+WORK+"fastq/."+name+".temp_R1_*.gz > "+WORK+"fastq/"+name+"_R1.fq.gz") 447 | if datatype in ['pairgbs','pairddrad']: 448 | os.system("/bin/cat "+WORK+"fastq/."+name+".temp_R2_*.gz > "+WORK+"fastq/"+name+"_R2.fq.gz") 449 | 450 | if len(glob.glob(WORK+"fastq/*")) > 0: 451 | os.system("/bin/ls "+WORK+"fastq/.*temp_* | xargs /bin/rm" ) 452 | if len(glob.glob(WORK+"fastq/*.pickle")) > 0: 453 | os.system("/bin/ls "+WORK+"fastq/.*pickle | xargs /bin/rm" ) 454 | -------------------------------------------------------------------------------- /pyrad/Dtest_5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy 4 | import sys 5 | import random 6 | import itertools 7 | import multiprocessing 8 | import cPickle as pickle 9 | from potpour import Worker 10 | from Dtest import IUPAC, sample_wr, fillin, makesortfiles 11 | 12 | 13 | def most_common(lst): 14 | return max(set(lst), key=lst.count) 15 | 16 | 17 | def makefreq(patlist): 18 | " identify which allele is derived in P3 relative to outgroup " 19 | " and is the most frequent and use that as the SNP." 20 | " Also, split up alleles into those that are P3a & vs. 
or P3b " 21 | P = {} 22 | for tax in patlist: 23 | P[tax] = [] 24 | 25 | for tax in patlist: 26 | for base in patlist[tax]: 27 | if base in list('ATGC'): 28 | P[tax].append(base[0]) 29 | P[tax].append(base[0]) 30 | elif base in list("RKSYWM"): 31 | hh = IUPAC(base[0]) 32 | for i in hh: 33 | P[tax].append(i) 34 | 35 | " select most common element in outgroup " 36 | if len(set(P['o'])) > 1: 37 | minor = most_common(P['o']) 38 | else: 39 | minor = P['o'][0] 40 | 41 | " select most common element that is not minor " 42 | bases = list(itertools.chain(*P.values())) 43 | majors = [i for i in bases if i != minor] 44 | major = most_common(majors) 45 | 46 | ret = [float(P[i].count(major)) / len(P[i]) for i in ['p1','p2','p3a','p3b','o']] 47 | ret += [float(P['p3a'].count(major)+P['p3b'].count(major))/(len(P['p3a'])+len(P['p3b']))] 48 | return ret 49 | 50 | 51 | 52 | def Dstat5(Loc,pat): 53 | " check site for patterns and add to Locus object if found" 54 | if len(set(pat)) < 3: 55 | if len(set(pat[2:])) > 1: 56 | minor = pat[-1] 57 | major = [i for i in [pat[2],pat[3]] if i not in pat[4]][0] 58 | 59 | o = 0. 60 | p3ab = 1. if (pat[3] == major) & (pat[2] == major) else 0. 61 | p3b = 1. if pat[3] == major else 0. 62 | p3a = 1. if pat[2] == major else 0. 63 | p2 = 1. if pat[1] == major else 0. 64 | p1 = 1. if pat[0] == major else 0. 65 | 66 | Loc.abbba += ( (1.-p1)*p2*p3ab*(1.-o) ) 67 | Loc.babba += ( p1*(1.-p2)*p3ab*(1.-o) ) 68 | 69 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) 70 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) 71 | 72 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) 73 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) 74 | 75 | return Loc 76 | 77 | 78 | 79 | def polyDstat5(Loc, pat): 80 | ## calculate frequencies 81 | " look at the P3 taxon first for a derived allele " 82 | p1,p2,p3a,p3b,o,p3ab = makefreq(pat) 83 | 84 | Loc.abbba += ( (1.-p1)*p2*p3ab*(1.-o) ) 85 | Loc.babba += ( p1*(1.-p2)*p3ab*(1.-o) ) 86 | 87 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) 88 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) 89 | 90 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) 91 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) 92 | 93 | return Loc 94 | 95 | 96 | 97 | def IUAfreq(Loc, L): 98 | patlist = {} 99 | Loc.abbba = 0. 100 | Loc.babba = 0. 101 | Loc.abbaa = 0. 102 | Loc.babaa = 0. 103 | Loc.ababa = 0. 104 | Loc.baaba = 0. 105 | 106 | for col in Loc.seq.transpose(): 107 | patlist = fillin(L[0], 'p1', col, Loc.names, patlist) 108 | patlist = fillin(L[1], 'p2', col, Loc.names, patlist) 109 | patlist = fillin(L[2], 'p3a', col, Loc.names, patlist) 110 | patlist = fillin(L[3], 'p3b', col, Loc.names, patlist) 111 | patlist = fillin(L[4], 'o', col, Loc.names, patlist) 112 | 113 | if not any([ all([i in ["N",'-'] for i in patlist['p1']]), 114 | all([i in ["N",'-'] for i in patlist['p2']]), 115 | all([i in ["N",'-'] for i in patlist['p3a']]), 116 | all([i in ["N",'-'] for i in patlist['p3b']]), 117 | all([i in ["N",'-'] for i in patlist['o']]) ]): 118 | if any([ i not in patlist['o'] for i in numpy.dstack((patlist['p3a'],patlist['p3b']))[0][0] ]): 119 | Loc = polyDstat5(Loc, patlist) 120 | else: 121 | None 122 | else: 123 | None 124 | return Loc 125 | 126 | 127 | 128 | def IUA(Loc,L): 129 | Loc.abbba = 0. 130 | Loc.babba = 0. 131 | Loc.abbaa = 0. 132 | Loc.babaa = 0. 133 | Loc.ababa = 0. 134 | Loc.baaba = 0. 
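    # Fixed-site counting: a column is scored only when every base is an
    # unambiguous A/C/G/T, so heterozygous IUPAC codes and missing data are
    # skipped, and Dstat5() tallies the ABBBA/BABBA, ABBAA/BABAA and
    # ABABA/BAABA patterns for this locus.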
135 | for col in Loc.seq.transpose(): 136 | if all(i in list("ATGC") for i in col): 137 | Loc = Dstat5(Loc,col) 138 | return Loc 139 | 140 | 141 | 142 | def bootfreq(Ldict, which): 143 | Dft_12 = Dfb_12 = 0 144 | Dft_1 = Dfb_1 = 0 145 | Dft_2 = Dfb_2 = 0 146 | while 1: 147 | try: Lx = Ldict[Ldict.keys()[which.next()]] 148 | except StopIteration: break 149 | Dft_12 += Lx.abbba - Lx.babba 150 | Dfb_12 += Lx.abbba + Lx.babba 151 | Dft_1 += Lx.abbaa - Lx.babaa 152 | Dfb_1 += Lx.abbaa + Lx.babaa 153 | Dft_2 += Lx.ababa - Lx.baaba 154 | Dfb_2 += Lx.ababa + Lx.baaba 155 | D12 = 0. 156 | if Dfb_12 > 0: 157 | D12 = Dft_12/float(Dfb_12) 158 | D1 = 0. 159 | if Dfb_1 > 0: 160 | D1 = Dft_1/float(Dfb_1) 161 | D2 = 0. 162 | if Dfb_2 > 0: 163 | D2 = Dft_2/float(Dfb_2) 164 | return D12, D1, D2 165 | 166 | 167 | 168 | def bootfixed(Ldict, which): 169 | abbba = babba = 0 170 | abbaa = babaa = 0 171 | ababa = baaba = 0 172 | while 1: 173 | try: Lx = Ldict[Ldict.keys()[which.next()]] 174 | except StopIteration: break 175 | abbba += Lx.abbba 176 | babba += Lx.babba 177 | abbaa += Lx.abbaa 178 | babaa += Lx.babaa 179 | ababa += Lx.ababa 180 | baaba += Lx.baaba 181 | D12 = 0. 182 | if abbba + babba > 0: 183 | D12 = float(abbba-babba)/(abbba+babba) 184 | D1 = 0. 185 | if abbaa + babaa > 0: 186 | D1 = float(abbaa-babaa)/(abbaa+babaa) 187 | D2 = 0. 188 | if ababa + baaba > 0: 189 | D2 = float(ababa-baaba)/(ababa+baaba) 190 | return D12, D1, D2 191 | 192 | 193 | 194 | class Locus5(): 195 | """locus keeps track of position in input file, 196 | variable sites, and D-statistics""" 197 | def _init_(self): 198 | self.number = number 199 | self.taxa = names 200 | self.seq = sequence 201 | self.abbba = abbba 202 | self.babba = abbba 203 | self.abbaa = abbaa 204 | self.babaa = babaa 205 | self.ababa = ababa 206 | self.baaba = baaba 207 | " D-stats for an individual locus " 208 | def D1(self): 209 | if self.abbaa+self.babaa > 0: 210 | return float(self.abbaa-self.babaa)/(self.abbaa+self.babaa) 211 | else: 212 | return 0.0 213 | def D2(self): 214 | if self.ababa+self.baaba > 0: 215 | return float(self.ababa-self.baaba)/(self.ababa+self.baaba) 216 | else: 217 | return 0.0 218 | def D12(self): 219 | if self.abbba+self.babba > 0: 220 | return float(self.abbba-self.babba)/(self.abbba+self.babba) 221 | else: 222 | return 0.0 223 | 224 | 225 | 226 | 227 | 228 | 229 | def makeSNP(L, snpfreq, loci): 230 | Ndict = {} 231 | num = 0 232 | for loc in loci: 233 | Loc = Locus5() 234 | Loc.number = num 235 | 236 | " only select loci that have data for all five tiptaxa " 237 | names = [i.split()[0].replace(">","") for i in loc.lstrip().rstrip().split("\n")[1:-1]] 238 | if snpfreq: 239 | Loc.names = [i for i in names if i in list(itertools.chain(*L))] 240 | else: 241 | Loc.names = L #[i for i in names if i in L] 242 | 243 | " if snpfreq only need one of possibly multiple individuals" 244 | keep = 0 245 | 246 | if snpfreq: 247 | for tax in L: 248 | z = any([tax in Loc.names for tax in L[0]]) 249 | y = any([tax in Loc.names for tax in L[1]]) 250 | x = any([tax in Loc.names for tax in L[2]]) 251 | w = any([tax in Loc.names for tax in L[3]]) 252 | u = any([tax in Loc.names for tax in L[4]]) 253 | if all([z,y,x,w,u]): 254 | keep = 1 255 | 256 | else: 257 | if all(tax in names for tax in Loc.names): 258 | keep = 1 259 | 260 | if keep: 261 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]]) 262 | " only select sites with synapomorphies " 263 | # select all variable 264 | N[-1] = list(N[-1].tostring().replace("-","*")) 265 | N = N[:, N[-1] == 
"*"] 266 | 267 | " only select rows with focal taxa " 268 | Loc.seq = N[[names.index(i) for i in Loc.names],:] 269 | Ndict[num] = Loc 270 | num += 1 271 | return Ndict 272 | 273 | 274 | 275 | def runtest(infile, L, nboots, snpfreq, submitted): 276 | " print test " 277 | print L 278 | 279 | " split each locus " 280 | loci = open(infile).read().strip().split("|")[:-1] 281 | loci[0] = "\n"+loci[0] 282 | 283 | " returns a {} of Locus5 objects with data for tiptaxa L " 284 | Ldict = makeSNP(L, snpfreq, loci) 285 | 286 | " calculate discordant patterns for each locus " 287 | for loc in Ldict: 288 | if snpfreq: 289 | Ldict[loc] = IUAfreq(Ldict[loc],L) 290 | else: 291 | Ldict[loc] = IUA(Ldict[loc],L) 292 | ################################################ 293 | 294 | " final D12 " 295 | dft_12 = sum([Ldict[l].abbba - Ldict[l].babba for l in Ldict]) 296 | dbt_12 = sum([Ldict[l].abbba + Ldict[l].babba for l in Ldict]) 297 | if dbt_12 > 0: 298 | D12 = float(dft_12)/dbt_12 299 | else: D12 = 0. 300 | 301 | " final D1 " 302 | dft_1 = sum([Ldict[l].abbaa - Ldict[l].babaa for l in Ldict]) 303 | dbt_1 = sum([Ldict[l].abbaa + Ldict[l].babaa for l in Ldict]) 304 | if dbt_1 > 0: 305 | D1 = float(dft_1)/dbt_1 306 | else: D1 = 0. 307 | 308 | " final D2 " 309 | dft_2 = sum([Ldict[l].ababa - Ldict[l].baaba for l in Ldict]) 310 | dbt_2 = sum([Ldict[l].ababa + Ldict[l].baaba for l in Ldict]) 311 | if dbt_2 > 0: 312 | D2 = float(dft_2)/dbt_2 313 | else: D2 = 0. 314 | 315 | " proportion of discordant loci " 316 | try: pdisc = len([i for i in Ldict if any([Ldict[i].D12(),Ldict[i].D1(),Ldict[i].D2()])]) / float(len(Ldict)) 317 | except ValueError: 318 | pdisc = 0.0 319 | 320 | ################################################# 321 | 322 | " do bootstrapping " 323 | BB12 = [] 324 | BB1 = [] 325 | BB2 = [] 326 | for i in xrange(nboots): 327 | which = iter(sample_wr(xrange(len(Ldict)), len(Ldict))) 328 | if snpfreq: 329 | bb12,bb1,bb2 = bootfreq(Ldict, which) 330 | else: 331 | #bb12,bb1,bb2 = bootfixed(Ldict, which) 332 | bb12,bb1,bb2 = bootfreq(Ldict, which) 333 | BB12.append(bb12) 334 | BB1.append(bb1) 335 | BB2.append(bb2) 336 | STD12 = numpy.std(BB12) 337 | STD1 = numpy.std(BB1) 338 | STD2 = numpy.std(BB2) 339 | ################################################## 340 | 341 | " stats out " 342 | if STD12 > 0: 343 | Z12 = (abs(D12/STD12)) 344 | else: Z12 = 0. 345 | if STD1 > 0: 346 | Z1 = (abs(D1/STD1)) 347 | else: Z1 = 0. 348 | if STD2 > 0: 349 | Z2 = (abs(D2/STD2)) 350 | else: Z2 = 0. 
351 | 352 | ## make loci files here 353 | ABBBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() > 0] 354 | BABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() < 0] 355 | ABBAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() > 0] 356 | BABAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() < 0] 357 | ABABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() > 0] 358 | BAABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() < 0] 359 | 360 | " pickle to prevent multiprocessing from freezing on large returns " 361 | ret = [L, 362 | D12,Z12, 363 | D1,Z1, 364 | D2,Z2, 365 | len(Ldict), 366 | sum([Ldict[l].abbba for l in Ldict]), 367 | sum([Ldict[l].babba for l in Ldict]), 368 | sum([Ldict[l].abbaa for l in Ldict]), 369 | sum([Ldict[l].babaa for l in Ldict]), 370 | sum([Ldict[l].ababa for l in Ldict]), 371 | sum([Ldict[l].baaba for l in Ldict]), 372 | pdisc, submitted, 373 | ABBBAloci, BABBAloci, 374 | ABBAAloci, BABAAloci, 375 | ABABAloci, BAABAloci, 376 | BB12, BB1, BB2] 377 | pickle.dump(ret, open(".save."+str(submitted),'wb')) 378 | 379 | 380 | 381 | def checktaxa(taxalist,alignfile): 382 | with open(alignfile) as infile: 383 | data = infile.readlines() 384 | taxainfile = set() 385 | for line in data: 386 | if ">" in line: 387 | tax = line.split(" ")[0].replace(">","") 388 | if tax not in taxainfile: 389 | taxainfile.add(tax) 390 | if not set(taxainfile).difference(taxainfile): 391 | return 1 392 | 393 | 394 | 395 | 396 | def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots): 397 | work_queue = multiprocessing.Queue() 398 | result_queue = multiprocessing.Queue() 399 | submitted = 0 400 | Notes = [] 401 | for rep in subtests: 402 | notes = "" 403 | if len(rep) == 2: 404 | rep,notes = rep 405 | p1,p2,p3a,p3b,o = rep 406 | if all(["[" in i for i in rep[1:]]): 407 | p1 = p1[1:-1].split(",") 408 | p2 = p2[1:-1].split(",") 409 | p3a = p3a[1:-1].split(",") 410 | p3b = p3b[1:-1].split(",") 411 | o = o[1:-1].split(",") 412 | if checktaxa([p1,p2,p3a,p3b,o],alignfile): 413 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted]) 414 | submitted += 1 415 | else: 416 | print 'a taxon name was found that is not in the sequence file' 417 | else: 418 | if checktaxa([p1,p2,p3a,p3b,o],alignfile): 419 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted]) 420 | submitted += 1 421 | else: 422 | print 'a taxon name was found that is not in the sequence file' 423 | Notes.append(notes) 424 | 425 | jobs = [] 426 | for i in range(min(submitted,nproc)): 427 | worker = Worker(work_queue, result_queue, runtest) 428 | jobs.append(worker) 429 | worker.start() 430 | for j in jobs: 431 | j.join() 432 | 433 | 434 | " read results back in " 435 | #Results = [result_queue.get() for i in range(submitted)] 436 | Results = [pickle.load(open(".save."+str(i),'rb')) for i in range(submitted)] 437 | Results.sort(key = lambda x:x[15]) 438 | 439 | 440 | " setup results file " 441 | outs = open(outfile+".partD.txt", 'w') 442 | header = "\t".join([ 'p1'+" "*(namelen[0]-2), 443 | 'p2'+" "*(namelen[1]-2), 444 | 'p3_1'+" "*(namelen[2]-4), 445 | 'p3_2'+" "*(namelen[3]-4), 446 | 'O'+" "*(namelen[4]-1), 447 | 'D_12','D_1','D_2', 448 | 'Z_12','Z_1','Z_2', 449 | 'BABBA','ABBBA', 450 | 'BABAA','ABBAA', 451 | 'BAABA','ABABA', 452 | 'nloci','pdisc', 'notes']) 453 | 454 | print >>outs, header 455 | 456 | 457 | for i in range(len(Results)): 458 | 
L,D12,Z12,D1,Z1,D2,Z2,nloc,ABBBA,BABBA,ABBAA,BABAA,ABABA,BAABA,pdisc,sub,ABBBAloci,BABBAloci,ABBAAloci,BABAAloci,ABABAloci,BAABAloci,BB12,BB1,BB2 = Results[i] 459 | L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L] 460 | 461 | resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))), 462 | str(L[1])+" "*(namelen[1]-len(str(L[1]))), 463 | str(L[2])+" "*(namelen[2]-len(str(L[2]))), 464 | str(L[3])+" "*(namelen[3]-len(str(L[3]))), 465 | str(L[4])+" "*(namelen[4]-len(str(L[4]))), 466 | D12, D1, D2, Z12, Z1, Z2, 467 | BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA, 468 | nloc, pdisc, Notes[i]]) 469 | 470 | print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin 471 | 472 | loci = open(alignfile).read().strip().split("|")[:-1] 473 | if makesort: 474 | makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L) 475 | makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L) 476 | makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L) 477 | makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L) 478 | makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L) 479 | makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L) 480 | 481 | if makeboots: 482 | with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out: 483 | out.write(",".join(map(str,BB12))) 484 | with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out: 485 | out.write(",".join(map(str,BB1))) 486 | with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out: 487 | out.write(",".join(map(str,BB2))) 488 | 489 | 490 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots): 491 | import sys 492 | 493 | P1namelen = max(map(len,[str(i[0][0]) for i in tests])) 494 | P2namelen = max(map(len,[str(i[0][1]) for i in tests])) 495 | P3anamelen = max(map(len,[str(i[0][2]) for i in tests])) 496 | P3bnamelen = max(map(len,[str(i[0][3]) for i in tests])) 497 | Onamelen = max(map(len,[str(i[0][4]).strip() for i in tests])) 498 | namelen = [P1namelen,P2namelen,P3anamelen,P3bnamelen,Onamelen] 499 | 500 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots) 501 | 502 | 503 | if __name__ == '__main__': 504 | main() 505 | 506 | 507 | 508 | 509 | -------------------------------------------------------------------------------- /pyrad/consensdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | import multiprocessing 3 | import glob 4 | import itertools 5 | import sys 6 | import scipy.stats 7 | import scipy.misc 8 | import numpy 9 | import os 10 | import operator 11 | import gzip 12 | from potpour import Worker 13 | 14 | 15 | def binomprobr(n1,n2,e,r): 16 | """ 17 | given two bases are observed at a site 18 | n1 and n2, and the error rate e, the 19 | probability the site is truly aa,bb,ab 20 | is calculated using binomial distribution 21 | as in Li_et al 2009, 2011, and if 22 | coverage > 500, 500 reads were randomly 23 | sampled. 24 | """ 25 | maf = n1/(n1+n2) 26 | prior_homo = ((1.-r)/2.) 
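    # Genotype priors: the heterozygosity estimate r is used as the prior for a
    # heterozygous site, and the remaining mass is split evenly between the two
    # homozygous genotypes, e.g. r = 0.01 gives prior_het = 0.01 and
    # prior_homo = 0.495 each (illustrative value of r).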
27 | prior_het = r 28 | ab = scipy.misc.comb(n1+n2,n1)/(2.**(n1+n2)) 29 | aa= scipy.stats.binom.pmf(n1,n1+n2,e) 30 | bb= scipy.stats.binom.pmf(n2,n1+n2,e) 31 | aa = aa*prior_homo 32 | bb = bb*prior_homo 33 | ab = ab*prior_het 34 | Q = [aa,bb,ab] 35 | Qn = ['aa','bb','ab'] 36 | P = max(Q)/float(aa+bb+ab) 37 | return [P,maf,Qn[Q.index(max(Q))]] 38 | 39 | 40 | def simpleconsens(n1,n2): 41 | """ 42 | majority consensus calling for sites 43 | with too low of coverage for 44 | statistical calling. Only used 45 | with 'lowcounts' option. 46 | """ 47 | Qn = ['aa','bb','ab'] 48 | maf = n1/(n1+n2) 49 | # if not n2: 50 | # P = 1.0 51 | # aa = 1.0 52 | # ab = bb = 0.0 53 | # else: 54 | # P = 0.99 55 | # aa = bb = 0.0 56 | # ab = 1.0 57 | ## create an option that saves 58 | ## frequencies. Useful for pooled sample data sets. 59 | #Q = [aa,bb,ab] 60 | #return [P,Qn[Q.index(max(Q))]] 61 | return [1.0,maf,'aa'] 62 | 63 | 64 | def hetero(n1,n2): 65 | """ 66 | returns IUPAC symbol for ambiguity bases, 67 | used for polymorphic sites. 68 | """ 69 | D = {('G','A'):"R", 70 | ('G','T'):"K", 71 | ('G','C'):"S", 72 | ('T','C'):"Y", 73 | ('T','A'):"W", 74 | ('C','A'):"M"} 75 | a = D.get((n1,n2)) 76 | b = D.get((n2,n1)) 77 | if a: 78 | return a 79 | else: 80 | return b 81 | 82 | 83 | def unhetero(amb): 84 | amb = amb.upper() 85 | " returns bases from ambiguity code" 86 | D = {"R":("G","A"), 87 | "K":("G","T"), 88 | "S":("G","C"), 89 | "Y":("T","C"), 90 | "W":("T","A"), 91 | "M":("C","A")} 92 | return D.get(amb) 93 | 94 | 95 | def uplow(b1): 96 | " allele precedence " 97 | " G > T > C > A " 98 | D = {('G','A'):"G", 99 | ('A','G'):"G", 100 | ('G','T'):"G", 101 | ('T','G'):"G", 102 | ('G','C'):"G", 103 | ('C','G'):"G", 104 | ('T','C'):"T", 105 | ('C','T'):"T", 106 | ('T','A'):"T", 107 | ('A','T'):"T", 108 | ('C','A'):"C", 109 | ('A','C'):"C"} 110 | r = D.get(b1) 111 | if not r: 112 | r = b1[0] 113 | return r 114 | 115 | 116 | 117 | def findalleles(consensus,sss,bbb): 118 | cons = list(consensus) 119 | " relative to first base " 120 | bigbase = uplow(tuple([i.split("_")[0] for i in bbb])) 121 | bigallele = bbb.index([i for i in bbb if i.split("_")[0] == bigbase][0]) 122 | for k in range(1,len(sss)): 123 | c = uplow(tuple([i.split("_")[k] for i in bbb])) 124 | which = bbb.index([i for i in bbb if i.split("_")[k] == c][0]) 125 | if bbb[bigallele] != bbb[which]: 126 | cons[sss[k]] = cons[sss[k]].lower() 127 | return "".join(cons) 128 | 129 | 130 | def breakalleles(consensus): 131 | """ 132 | break ambiguity code consensus seqs 133 | into two alleles 134 | """ 135 | a1 = "" 136 | a2 = "" 137 | #bigbase = "" 138 | for base in consensus: 139 | if base in tuple("RKSYWM"): 140 | a,b = unhetero(base) 141 | d = set([a,b]) 142 | a1 += uplow((a,b)) 143 | a2 += d.difference(uplow((a,b))).pop() 144 | #if not bigbase: 145 | # bigbase = uplow((a,b)) 146 | elif base in tuple("rksywm"): 147 | a,b = unhetero(base) 148 | d = set([a,b]) 149 | a2 += uplow((a,b)) 150 | a1 += d.difference(uplow((a,b))).pop() 151 | else: 152 | a1 += base 153 | a2 += base 154 | return a1, a2 155 | 156 | 157 | # correct G A C 158 | # >1D_0_0 CCTGCGTCGGGG G ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT T CAAC G ACGTGACGCCGAACACCACCTCTAACC 159 | # >1D_0_1 CCTGCGTCGGGG T ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT A CAAC C ACGTGACGCCGAACACCACCTCTAACC 160 | 161 | # >1D_0 CCTGCGTCGGGG K ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT W CAAC S ACGTGACGCCGAACACCACCTCTAACC 162 | 163 | def stack(D): 164 | """ 165 | from list of bases at a site D, 166 | returns an ordered list 
of counts of bases 167 | """ 168 | L = len(D) 169 | counts = [] 170 | for i in range(len(D[0])): 171 | A=C=T=G=N=M=X=0 172 | for nseq in range(L): 173 | A += D[nseq][i].count("A") 174 | C += D[nseq][i].count("C") 175 | T += D[nseq][i].count("T") 176 | G += D[nseq][i].count("G") 177 | N += D[nseq][i].count("N") 178 | M += D[nseq][i].count("-") 179 | X += D[nseq][i].count("X") 180 | counts.append( [[A,C,T,G],N,M,X] ) 181 | return counts 182 | 183 | 184 | # def ffmin(x): 185 | # d = [] 186 | # for i,j in enumerate(x): 187 | # if j not in ["-","N"]: 188 | # d.append(i) 189 | # return min(d) 190 | 191 | # def ffmax(x): 192 | # d = [] 193 | # for i,j in enumerate(x): 194 | # if j not in ["-","N"]: 195 | # d.append(i) 196 | # return max(d) 197 | 198 | 199 | def removerepeat_Ns(shortcon): 200 | """ 201 | checks for interior Ns in consensus seqs 202 | remove those that arise next to *single repeats* 203 | of at least 3 bases on either side, which may be 204 | sequencing errors on deep coverage repeats """ 205 | Nlocs = [i for i,j in enumerate(shortcon) if j=="N"] 206 | repeats = set() 207 | for n in Nlocs: 208 | r1 = len(set(list(shortcon)[n-3:n])) 209 | if r1 < 2: 210 | repeats.add(n) 211 | r2 = len(set(list(shortcon)[n+1:n+4])) 212 | if r2 < 2: 213 | repeats.add(n) 214 | return "".join([j for (i,j) in enumerate(shortcon) if i not in repeats]) 215 | 216 | 217 | 218 | 219 | def consensus(infile,E,H,mindepth,maxN,maxH,datatype, 220 | haplos,CUT,upperSD,strict,lowcounts): 221 | """ 222 | from a clust file f, 223 | reads in all copies at a locus and sorts 224 | bases at each site, tests for errors at the 225 | site according to error rate, calls consensus 226 | """ 227 | f = gzip.open(infile) 228 | k = itertools.izip(*[iter(f)]*2) 229 | bases = ['A','C','T','G'] 230 | Dic = {} 231 | Errors = [] 232 | haplo = [] 233 | #Plist = [] 234 | locus = minsamplocus = npoly = P = 0 235 | while 1: 236 | try: first = k.next() 237 | except StopIteration: break 238 | itera = [first[0],first[1]] 239 | fname = itera[0].strip().split(";")[0] 240 | leftjust = rightjust = None 241 | 242 | " lists and variables for this locus" 243 | S = [] ## list for sequence data 244 | alleles = [] ## for measuring # alleles, detect paralogs 245 | locus += 1 ## recording n loci 246 | ploidy = 0 ## for measuring # alleles, detect paralogs 247 | nHs = 0 ## will record heterozygous sites in this locus 248 | consensus = "" ## empty vector for consensus sequence 249 | basenumber = 1 ## for recording error locations 250 | lefts = [] 251 | rights = [] 252 | while itera[0] != "//\n": 253 | " append sequence * number of dereps " 254 | nreps = int(itera[0].strip().split(";")[1].replace("size=","")) 255 | for i in xrange(nreps): 256 | S.append(tuple(itera[1].strip())) 257 | #print i, itera[1].strip(), itera[0].strip()[-1], leftjust, rights 258 | 259 | " record left and right most index of seed and hits (for GBS) " 260 | if datatype in ['gbs','merged']: 261 | " leftjust is seed's left " 262 | if itera[0].strip()[-1] == ";": 263 | leftjust = itera[1].index([i for i in itera[1] if i not in list("-N")][0]) 264 | 265 | " rightjust is the shortest reverse hit " 266 | if itera[0].strip()[-1] == "-": 267 | rights.append(max(-1,[itera[1].rindex(i) for i in itera[1] if i in list("ATGC")])) 268 | #if rights == -1: 269 | # print itera 270 | #lefts.append(itera[1].index([i for i in itera[1] if i not in list("-N")][0])) 271 | 272 | itera = k.next() 273 | 274 | " trim off overhang edges of gbs reads " 275 | if datatype in ['gbs','merged']: 276 | if rights: 277 | 
" record in name that there was a reverse hit" 278 | fname = "_".join(fname.split("_")[0:-1])+"_c1" 279 | try: rightjust = min([min(i) for i in rights]) 280 | except ValueError: 281 | S = "" 282 | 283 | for s in xrange(len(S)): 284 | S[s] = S[s][leftjust:] 285 | if rightjust: 286 | #print rights, rightjust, 'right,just' 287 | S[s] = S[s][:rightjust+1] 288 | 289 | #for i in S: 290 | # print "".join(i) 291 | 292 | #if any([i < leftjust for i in lefts]): 293 | # fname = "_".join(fname.split("_")[0:-1])+"_c1" 294 | #print "".join(list(S[s])), "new" 295 | 296 | " Apply depth and paralog filters " 297 | if (len(S) >= min(lowcounts,mindepth)) and (len(S) < upperSD): 298 | minsamplocus += 1 299 | RAD = stack(S) 300 | for site in RAD: 301 | nchanged = 0 302 | 303 | " minimum depth of coverage for base calling " 304 | depthofcoverage = sum(site[0]) 305 | if depthofcoverage < min(mindepth,lowcounts): 306 | cons = "N"; n1 = depthofcoverage-1; n2=0 ## prevents zero division error. 307 | else: 308 | n1,n2,n3,n4 = sorted(site[0],reverse=True) 309 | 310 | " speed hack = if diploid exclude if a third base present at > 20% " 311 | quickthirdbasetest = 0 312 | if haplos == 2: 313 | if float(n3)/(n1+n2+n3+n4) > 0.20: 314 | quickthirdbasetest = 1 315 | if not quickthirdbasetest: 316 | 317 | """ if depth > 500 reduce by some factor for base calling """ 318 | if n1+n2 >= 500: ## if > 500, random sample 500 319 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2)) 320 | numpy.random.shuffle(firstfivehundred) 321 | nchanged = 1 322 | oldn1 = n1 323 | oldn2 = n2 324 | n1 = list(firstfivehundred[:500]).count("A") 325 | n2 = list(firstfivehundred[:500]).count("B") 326 | 327 | """ make base calls using... """ 328 | if n1+n2 >= mindepth: 329 | """ if above stat minimum """ 330 | P,maf,who = binomprobr(n1,n2,float(E),H) 331 | elif n1+n2 >= lowcounts: 332 | """ if above maj rule minimum""" 333 | P,maf,who = simpleconsens(n1,n2) 334 | 335 | """ if the base could be called with 95% probability """ 336 | if float(P) >= 0.95: 337 | if who in 'ab': 338 | if nchanged: 339 | a = [i for i,l in enumerate(site[0]) if l == oldn1] 340 | else: 341 | a = [i for i,l in enumerate(site[0]) if l == n1] 342 | if len(a)==2: ## alleles came up equal freq. 343 | cons = hetero(bases[a[0]],bases[a[1]]) 344 | alleles.append(basenumber) 345 | else: ## alleles came up diff freq. 
346 | if nchanged: 347 | b= [i for i,l in enumerate(site[0]) if l == oldn2] 348 | else: 349 | b= [i for i,l in enumerate(site[0]) if l == n2] 350 | "if three alleles came up equal, only need if diploid paralog filter off" 351 | if a == b: 352 | cons = hetero(bases[a[0]],bases[a[1]]) 353 | else: 354 | cons = hetero(bases[a[0]],bases[b[0]]) 355 | alleles.append(basenumber) 356 | nHs += 1 357 | else: 358 | if nchanged: 359 | cons = bases[site[0].index(oldn1)] 360 | else: 361 | cons = bases[site[0].index(n1)] 362 | else: 363 | cons = "N" 364 | else: 365 | "paralog flag" 366 | cons = "@" 367 | consensus += cons 368 | basenumber += 1 369 | 370 | " only allow maxH polymorphic sites in a locus " 371 | if "@" not in consensus: 372 | if nHs <= maxH: 373 | " filter to limit to N haplotypes (e.g., diploid) " 374 | if haplos: 375 | al = [] 376 | if len(alleles) > 1: 377 | for i in S: 378 | d = "" 379 | for z in alleles: 380 | if i[z-1] in unhetero(consensus[z-1]): 381 | d += i[z-1]+"_" 382 | if "N" not in d: 383 | if d.count("_") == len(alleles): 384 | al.append(d.rstrip("_")) 385 | 386 | " remove very rare thirds representing a possible error at a heterozygous site \ 387 | that changed the base to the alternate allele at that site " 388 | #if len(al) >= 50: 389 | al = [i for i in al if al.count(i) > len(al)*.25] 390 | 391 | AL = sorted(set(al), key=al.count) 392 | ploidy = len(AL) 393 | #Plist.append(ploidy) 394 | 395 | " set correct alleles relative to first polymorphic base" 396 | if AL: 397 | if ploidy <= haplos: 398 | sss = [zz-1 for zz in alleles] 399 | consensus = findalleles(consensus,sss,AL) 400 | else: 401 | consensus += "@E" 402 | # print ploidy, haplos 403 | # print alleles 404 | # print "AL", AL 405 | # print "al", al 406 | 407 | #else: Plist.append(1) 408 | 409 | if "@" not in consensus: 410 | " strip N's from either end " 411 | shortcon = consensus.lstrip("N").rstrip("N").replace("-","N") 412 | shortcon = removerepeat_Ns(shortcon) 413 | if shortcon.count("N") <= maxN: ## only allow maxN internal "N"s in a locus 414 | if len(shortcon) >= 32: ## minimum length set to 36 415 | #print shortcon, 'keep' 416 | npoly += nHs 417 | Dic[fname] = shortcon 418 | 419 | 420 | #with open(infile.replace(".clustS",".ploids"),'w+') as ploidout: 421 | # ploidout.write(",".join(map(str,Plist))) 422 | 423 | consens = gzip.open(infile.replace(".clustS",".consens"),'w+') 424 | for i in Dic.items(): 425 | consens.write(str(i[0])+'\n'+str(i[1])+"\n") 426 | consens.close() 427 | sys.stderr.write(".") 428 | 429 | 430 | if datatype in ['pairgbs','pairddrad']: 431 | " -4 for the xxxx " 432 | nsites = sum([len(i)-len(CUT)-4 for i in Dic.values()]) 433 | else: 434 | nsites = sum([len(i)-len(CUT) for i in Dic.values()]) 435 | ldic = len(Dic) 436 | try: NP = npoly/float(nsites) 437 | except ZeroDivisionError: NP = 0 438 | return [infile.split('/')[-1], locus, minsamplocus, ldic, nsites, npoly, round(NP,7)] 439 | 440 | 441 | 442 | def upSD(handle,mindepth): 443 | " function to calculate mean and SD of clustersize" 444 | infile = gzip.open(handle) 445 | L = itertools.izip(*[iter(infile)]*2) 446 | a = L.next()[0].strip() 447 | depth = [] 448 | d = int(a.split(";")[1].replace("size=","")) 449 | while 1: 450 | try: a = L.next()[0].strip() 451 | except StopIteration: break 452 | if a != "//": 453 | d += int(a.split(";")[1].replace("size=","")) 454 | else: 455 | depth.append(d) 456 | d = 0 457 | infile.close() 458 | keep = [i for i in depth if i>=(mindepth)] 459 | if keep: 460 | me = numpy.mean(keep) 461 | std = numpy.std(keep) 
462 | else: 463 | me = 0.0 464 | std = 0.0 465 | return me, std 466 | 467 | 468 | def main(Parallel, E, H, ID, mindepth, subset, 469 | maxN, maxH, haplos, CUT, datatype, 470 | lowcounts, strict, WORK, maxstack): 471 | 472 | " find clust.xx directory " 473 | if not os.path.exists(WORK+'clust'+ID): 474 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \ 475 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \ 476 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold" 477 | sys.exit() 478 | 479 | " load up work queue" 480 | work_queue = multiprocessing.Queue() 481 | 482 | " iterate over files" 483 | outfolder = WORK+'clust'+str(ID) 484 | HH = glob.glob(outfolder+"/"+subset+".clustS*") 485 | stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5)) 486 | sys.stderr.write(stringout) 487 | 488 | if len(HH) > 1: 489 | " sort files by size" 490 | for i in xrange(len(HH)): 491 | statinfo = os.stat(HH[i]) 492 | HH[i] = HH[i],statinfo.st_size 493 | HH.sort(key=operator.itemgetter(1)) 494 | FS = [f[0] for f in HH][::-1] 495 | else: FS = HH 496 | REMOVE = glob.glob('clust'+ID+"/cat.*") 497 | FS = [f for f in FS if f not in REMOVE] 498 | submitted = 0 499 | for handle in FS: 500 | if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"): 501 | m,sd = upSD(handle,mindepth) 502 | if maxstack == "2SD": 503 | upperSD = max(500,m+(sd*2.5)) 504 | else: 505 | upperSD = int(maxstack) 506 | work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype, 507 | haplos,CUT,upperSD,strict,lowcounts]) 508 | submitted += 1 509 | else: 510 | print "\tskipping "+handle.replace(".clustS",".consens")+\ 511 | ', it already exists in '+outfolder+"/" 512 | 513 | 514 | " create a queue to pass to workers to store the results" 515 | result_queue = multiprocessing.Queue() 516 | 517 | " spawn workers" 518 | jobs = [] 519 | for i in xrange( min(Parallel,submitted) ): 520 | worker = Worker(work_queue, result_queue, consensus) 521 | jobs.append(worker) 522 | worker.start() 523 | for j in jobs: 524 | j.join() 525 | 526 | " get results" 527 | stats = open(WORK+'stats/s5.consens.txt','a+') 528 | print >>stats, "taxon \tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly" 529 | for i in range(submitted): 530 | a,b,c,d,e,f,g = result_queue.get() 531 | print >> stats, "\t".join(map(str,[a.replace(".clustS.gz","")+" "*(10-len(a)),b,c,d,e,f,g])) 532 | print >>stats, """ 533 | ## nloci = number of loci 534 | ## f1loci = number of loci with >N depth coverage 535 | ## f2loci = number of loci with >N depth and passed paralog filter 536 | ## nsites = number of sites across f loci 537 | ## npoly = number of polymorphic sites in nsites 538 | ## poly = frequency of polymorphic sites""" 539 | stats.close() 540 | 541 | 542 | 543 | 544 | -------------------------------------------------------------------------------- /pyrad/consens_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import multiprocessing 4 | import glob 5 | import itertools 6 | import sys 7 | import scipy.stats 8 | import scipy.misc 9 | import numpy 10 | import os 11 | import operator 12 | import gzip 13 | from potpour import Worker 14 | 15 | from consensdp import binomprobr, simpleconsens, hetero, unhetero, uplow, findalleles,breakalleles, removerepeat_Ns 16 | 17 | 18 | def stack(D): 19 | """ 20 | from list of bases at a site D, 
21 | returns an ordered list of counts of bases 22 | """ 23 | ## TODO: replace with Counter 24 | L = len(D) 25 | counts = [] 26 | for i in range(len(D[0])): 27 | A=C=T=G=N=M=X=n=0 28 | for nseq in range(L): 29 | A += D[nseq][i].count("A") 30 | C += D[nseq][i].count("C") 31 | T += D[nseq][i].count("T") 32 | G += D[nseq][i].count("G") 33 | N += D[nseq][i].count("N") 34 | M += D[nseq][i].count("-") 35 | X += D[nseq][i].count("X") 36 | n += D[nseq][i].count("n") 37 | counts.append( [[A,C,T,G],N,M,X,n] ) 38 | return counts 39 | 40 | 41 | def consensus(infile,E,H,mindepth,maxN,maxH,datatype, 42 | ploidy,CUT,upperSD,strict,lowcounts): 43 | """ 44 | from a clust file f, 45 | reads in all copies at a locus and sorts 46 | bases at each site, tests for errors at the 47 | site according to error rate, calls consensus 48 | """ 49 | f = gzip.open(infile,'r') 50 | k = itertools.izip(*[iter(f)]*2) 51 | bases = ['A','C','T','G'] 52 | Dic = {} 53 | Errors = [] 54 | haplo = [] 55 | Plist = [] 56 | locus = minsamplocus = npoly = P = 0 57 | while 1: 58 | try: first = k.next() 59 | except StopIteration: break 60 | itera = [first[0],first[1]] 61 | fname = itera[0].strip().split(";")[0] 62 | leftjust = rightjust = None 63 | 64 | " lists and variables for this locus" 65 | S = [] ## list for sequence data 66 | S2 = [] ## list for sequence data 67 | alleles = [] ## for measuring # alleles, detect paralogs 68 | locus += 1 ## recording n loci 69 | ploidy = 0 ## for measuring # alleles, detect paralogs 70 | nHs = 0 ## will record heterozygous sites in this locus 71 | consensus = "" ## empty vector for consensus sequence 72 | consensus1 = "" ## empty vector for consensus sequence 73 | consensus2 = "" ## empty vector for consensus sequence 74 | basenumber = 1 ## for recording error locations 75 | 76 | while itera[0] != "//\n": 77 | nreps = int(itera[0].strip().split(";")[1].replace("size=","")) 78 | 79 | " append sequence * number of dereps " 80 | for i in xrange(nreps): 81 | " compatibility from pyrad 2 -> 3 " 82 | ss = itera[1].strip().replace("X","n") 83 | S.append(ss) 84 | S2.append(ss) 85 | itera = k.next() 86 | 87 | " separate first and second read clusters " 88 | firsts = [tuple(i.split("n")[0]) for i in S] 89 | seconds = [tuple(i.split("n")[-1]) for i in S] 90 | 91 | " call first read consensus " 92 | " Apply depth and paralog filters " 93 | if (len(firsts) >= min(mindepth,lowcounts)) and (len(firsts) < upperSD): ## upper limit is meandepth + 2 SD 94 | minsamplocus += 1 95 | RAD = stack(firsts) 96 | for site in RAD: 97 | nchanged = 0 98 | 99 | " minimum depth of coverage for base calling at each site" 100 | depthofcoverage = sum(site[0]) 101 | if depthofcoverage < min(mindepth,lowcounts): 102 | cons = "N"; n1 = depthofcoverage-1; n2=0 ## prevents zero division error. 
103 | else: 104 | n1,n2,n3,n4 = sorted(site[0],reverse=True) 105 | 106 | " speed hack = if diploid exclude if a third base present at > 20% " 107 | quickthirdbasetest = 0 108 | if ploidy == 2: 109 | if float(n3)/(n1+n2+n3+n4) > 0.20: 110 | quickthirdbasetest = 1 111 | if not quickthirdbasetest: 112 | 113 | """ if depth > 500 reduce by some factor for base calling """ 114 | if n1+n2 >= 500: ## if > 500, random sample 500 115 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2)) 116 | numpy.random.shuffle(firstfivehundred) 117 | nchanged = 1 118 | oldn1 = n1 119 | oldn2 = n2 120 | n1 = list(firstfivehundred[:500]).count("A") 121 | n2 = list(firstfivehundred[:500]).count("B") 122 | 123 | """ if lowcounts, make base calls by majority instead of statistics 124 | when depth is below mindepth """ 125 | # if lowcounts: ## include low count sites or no 126 | # if n1+n2 >= 5: 127 | # P,who = binomprobr(n1,n2,float(E),H) 128 | # else: 129 | # P,who = simpleconsens(n1,n2) 130 | # else: 131 | # P,who = binomprobr(n1,n2,float(E),H) 132 | """ make base calls using... """ 133 | if n1+n2 >= mindepth: 134 | """ if above stat minimum """ 135 | P,maf,who = binomprobr(n1,n2,float(E),H) 136 | elif n1+n2 >= lowcounts: 137 | """ if above maj rule minimum""" 138 | P,maf,who = simpleconsens(n1,n2) 139 | 140 | """ if the base could be called with 95% probability """ 141 | if float(P) >= 0.95: 142 | if who in 'ab': 143 | if nchanged: 144 | a = [i for i,l in enumerate(site[0]) if l == oldn1] 145 | else: 146 | a = [i for i,l in enumerate(site[0]) if l == n1] 147 | if len(a)==2: ## alleles came up equal freq. 148 | cons = hetero(bases[a[0]],bases[a[1]]) 149 | alleles.append(basenumber) 150 | else: ## alleles came up diff freq. 151 | if nchanged: 152 | b= [i for i,l in enumerate(site[0]) if l == oldn2] 153 | else: 154 | b= [i for i,l in enumerate(site[0]) if l == n2] 155 | "if three alleles came up equal, only need if diploid paralog filter off" 156 | if a == b: 157 | cons = hetero(bases[a[0]],bases[a[1]]) 158 | else: 159 | cons = hetero(bases[a[0]],bases[b[0]]) 160 | alleles.append(basenumber) 161 | nHs += 1 162 | else: 163 | if nchanged: 164 | cons = bases[site[0].index(oldn1)] 165 | else: 166 | cons = bases[site[0].index(n1)] 167 | else: 168 | cons = "N" ## poor base call 169 | else: 170 | cons = "@" ## third base freq fail 171 | consensus1 += cons 172 | basenumber += 1 173 | 174 | 175 | if "@" not in consensus1: 176 | if consensus1.count("N") <= maxN: ## only allow maxN internal "N"s in a locus 177 | if nHs < maxH: ## only allow maxH Hs, shortcut if first read fail 178 | basenumber += 4 ## separator length 179 | 180 | " call second read consensus " 181 | RAD = stack(seconds) 182 | for site in RAD: 183 | nchanged = 0 184 | " minimum depth of coverage for base calling at each site" 185 | depthofcoverage = sum(site[0]) 186 | if depthofcoverage < mindepth: 187 | cons = "N"; n1 = depthofcoverage-1; n2=0 188 | else: 189 | n1,n2,n3,n4 = sorted(site[0],reverse=True) 190 | 191 | " speed hack = if diploid exclude if a third base present at > 20% " 192 | quickthirdbasetest = 0 193 | if ploidy == 2: 194 | if float(n3)/(n1+n2+n3+n4) > 0.20: 195 | quickthirdbasetest = 1 196 | if not quickthirdbasetest: 197 | 198 | """ if depth > 500 reduce by some factor for base calling """ 199 | if n1+n2 >= 500: ## if > 500, random sample 500 200 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2)) 201 | numpy.random.shuffle(firstfivehundred) 202 | nchanged = 1 203 | oldn1 = n1 204 | oldn2 = n2 205 | n1 = 
list(firstfivehundred[:500]).count("A") 206 | n2 = list(firstfivehundred[:500]).count("B") 207 | 208 | """ make base calls using... """ 209 | if n1+n2 >= mindepth: 210 | """ if above stat minimum """ 211 | P,maf,who = binomprobr(n1,n2,float(E),H) 212 | elif n1+n2 >= lowcounts: 213 | """ if above maj rule minimum""" 214 | P,maf,who = simpleconsens(n1,n2) 215 | 216 | """ if the base could be called with 95% probability """ 217 | if float(P) >= 0.95: 218 | if who in 'ab': 219 | if nchanged: 220 | a = [i for i,l in enumerate(site[0]) if l == oldn1] 221 | else: 222 | a = [i for i,l in enumerate(site[0]) if l == n1] 223 | if len(a)==2: ## alleles came up equal freq. 224 | cons = hetero(bases[a[0]],bases[a[1]]) 225 | alleles.append(basenumber) 226 | else: ## alleles came up diff freq. 227 | if nchanged: 228 | b= [i for i,l in enumerate(site[0]) if l == oldn2] 229 | else: 230 | b= [i for i,l in enumerate(site[0]) if l == n2] 231 | "if three alleles came up equal, only need if diploid paralog filter off" 232 | if a == b: 233 | cons = hetero(bases[a[0]],bases[a[1]]) 234 | else: 235 | cons = hetero(bases[a[0]],bases[b[0]]) 236 | alleles.append(basenumber) 237 | nHs += 1 238 | else: 239 | if nchanged: 240 | cons = bases[site[0].index(oldn1)] 241 | else: 242 | cons = bases[site[0].index(n1)] 243 | else: 244 | cons = "N" 245 | else: 246 | "paralog flag" 247 | cons = "@" 248 | consensus2 += cons 249 | basenumber += 1 250 | 251 | 252 | "create concatenated consensus sequence from pairs " 253 | if "@" not in consensus2: 254 | consensus2.replace("-","N") 255 | consensus = consensus1 + "nnnn" + consensus2 256 | 257 | 258 | " filter applies to concatenated sequence " 259 | if consensus: 260 | if "@" not in consensus: 261 | " only allow maxH polymorphic sites in a locus " 262 | if nHs <= maxH: 263 | " filter for number of 2 alleles - diploids " 264 | if ploidy: 265 | al = [] 266 | " only check if more than one hetero site present " 267 | if len(alleles) > 1: 268 | for i in S2: 269 | d = "" 270 | for z in alleles: 271 | if i[z-1] in unhetero(consensus[z-1]): 272 | d += i[z-1]+"_" 273 | if "N" not in d: 274 | if d.count("_") == len(alleles): 275 | al.append(d.rstrip("_")) 276 | 277 | " remove allele if it came up less than one in ten " 278 | " in which case it is likely a true heterozygous site " 279 | " but contains a sequencing error also " 280 | ## a hack for now. But very conservative. 281 | #if len(al) >= 5: 282 | # al = [i for i in al if al.count(i) > len(al)/10.] 
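                        # AL below collects the distinct allele strings recovered
                        # across the heterozygous sites, ordered by how often each
                        # was observed; a locus showing more alleles than the
                        # allowed ploidy is flagged "@E" and excluded as a
                        # putative paralog.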
283 | #TODO allow only 1 bp difference for excludes 284 | 285 | AL = sorted(set(al), key=al.count) 286 | diploid = len(AL) 287 | 288 | " set correct alleles relative to first polymorphic base" 289 | if AL: 290 | if diploid <= ploidy: 291 | sss = [zz-1 for zz in alleles] 292 | consensus = findalleles(consensus,sss,AL) 293 | ## TODO: incorporate option to output alleles for haplos>2 294 | else: 295 | consensus += "@E" 296 | else: 297 | None 298 | else: 299 | consensus += "@P" 300 | 301 | if "@" not in consensus: 302 | #print consensus, nHs 303 | " strip terminal N's from either end " 304 | shortcon1 = consensus1.rstrip("N").replace("-","N") 305 | " remove internal - or N, if low count " 306 | shortcon1 = removerepeat_Ns(shortcon1) 307 | " check for length not trimmed " 308 | if (len(shortcon1) >= 32) and (len(consensus2) >= 32): 309 | Dic[fname] = shortcon1 + "nnnn" +consensus2 310 | npoly += nHs 311 | 312 | 313 | 314 | if ".gz" in infile[-5:]: 315 | consens = gzip.open(infile.replace(".clustS",".consens"),'w') 316 | else: 317 | consens = open(infile.replace(".clustS",".consens"),'w') 318 | for i in Dic.items(): 319 | consens.write(str(i[0])+'\n'+str(i[1])+"\n") 320 | consens.close() 321 | sys.stderr.write(".") 322 | 323 | if 'pair' in datatype: 324 | nsites = sum([len(i)-len(CUT)-4 for i in Dic.values()]) 325 | else: 326 | nsites = sum([len(i)-len(CUT) for i in Dic.values()]) 327 | ldic = len(Dic) 328 | try: NP = npoly/float(nsites) 329 | except ZeroDivisionError: NP = 0 330 | return [infile.split('/')[-1], locus, minsamplocus, ldic, nsites, npoly, round(NP,7)] 331 | 332 | 333 | 334 | def upSD(handle,mindepth): 335 | " function to calculate mean and SD of clustersize" 336 | if ".gz" in handle[-5:]: 337 | infile = gzip.open(handle) 338 | else: 339 | infile = open(handle) 340 | L = itertools.izip(*[iter(infile)]*2) 341 | a = L.next()[0].strip() 342 | depth = [] 343 | d = int(a.split(";")[1].replace("size=","")) 344 | while 1: 345 | try: a = L.next()[0].strip() 346 | except StopIteration: break 347 | if a != "//": 348 | d += int(a.split(";")[1].replace("size=","")) 349 | else: 350 | depth.append(d) 351 | d = 0 352 | infile.close() 353 | keep = [i for i in depth if i>=(mindepth)] 354 | if keep: 355 | me = numpy.mean(keep) 356 | std = numpy.std(keep) 357 | else: 358 | me = 0.0 359 | std = 0.0 360 | return me, std 361 | 362 | 363 | def main(Parallel, E, H, ID, mindepth, subset, 364 | maxN, maxH, ploidy, CUT, datatype, 365 | lowcounts, strict, WORK, maxstack): 366 | 367 | " find clust.xx directory " 368 | if not os.path.exists(WORK+'clust'+ID): 369 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \ 370 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \ 371 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold" 372 | sys.exit() 373 | 374 | " create work queue" 375 | work_queue = multiprocessing.Queue() 376 | 377 | " iterate over files" 378 | outfolder = WORK+'clust'+str(ID) 379 | HH = glob.glob(outfolder+"/"+subset+".clustS*") 380 | stringout = "\n\tstep 5: created consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5)) 381 | sys.stderr.write(stringout) 382 | 383 | if len(HH) > 1: 384 | " sort files by size" 385 | for i in range(len(HH)): 386 | statinfo = os.stat(HH[i]) 387 | HH[i] = HH[i],statinfo.st_size 388 | HH.sort(key=operator.itemgetter(1)) 389 | FS = [f[0] for f in HH][::-1] 390 | else: FS = HH 391 | REMOVE = glob.glob('clust'+ID+"/cat.*") 392 | FS = [f for f in FS 
if f not in REMOVE] 393 | submitted = 0 394 | for handle in FS: 395 | if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"): 396 | m,sd = upSD(handle,mindepth) 397 | if maxstack == "2SD": 398 | upperSD = max(500,m+(sd*2.5)) 399 | else: 400 | upperSD = int(maxstack) 401 | work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype, 402 | ploidy,CUT,upperSD,strict,lowcounts]) 403 | submitted += 1 404 | else: 405 | print "\tskipping "+handle.replace(".clustS",".consens")+\ 406 | ', it already exists in '+outfolder+"/" 407 | 408 | 409 | " create a queue to pass to workers to store the results" 410 | result_queue = multiprocessing.Queue() 411 | 412 | " spawn workers" 413 | jobs = [] 414 | for i in range( min(Parallel,submitted) ): 415 | worker = Worker(work_queue, result_queue, consensus) 416 | jobs.append(worker) 417 | worker.start() 418 | for j in jobs: 419 | j.join() 420 | 421 | " get results" 422 | stats = open(WORK+'stats/s5.consens.txt','a+') 423 | print >>stats, "taxon\tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly" 424 | for i in range(submitted): 425 | a,b,c,d,e,f,g = result_queue.get() 426 | nn = a.replace(".clustS.gz","") 427 | print >> stats, "\t".join(map(str,[nn,b,c,d,e,f,g])) 428 | print >>stats, """ 429 | ## nloci = number of loci 430 | ## f1loci = number of loci with >N depth coverage 431 | ## f2loci = number of loci with >N depth and passed paralog filter 432 | ## nsites = number of sites across f loci 433 | ## npoly = number of polymorphic sites in nsites 434 | ## poly = frequency of polymorphic sites""" 435 | stats.close() 436 | 437 | 438 | 439 | 440 | -------------------------------------------------------------------------------- /pyrad/Dtest_foil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import numpy 4 | import sys 5 | import itertools 6 | import multiprocessing 7 | from potpour import Worker 8 | from Dtest import IUPAC, sample_wr, fillin, makesortfiles 9 | 10 | 11 | 12 | def most_common(lst): 13 | return max(set(lst), key=lst.count) 14 | 15 | 16 | def makefreq(patlist): 17 | " identify which allele is derived in P3 relative to outgroup " 18 | " and is the most frequent and use that as the SNP." 19 | " Also, split up alleles into those that are P3a & vs. or P3b " 20 | P = {} 21 | for tax in patlist: 22 | P[tax] = [] 23 | 24 | for tax in patlist: 25 | for base in patlist[tax]: 26 | if base in list('ATGC'): 27 | P[tax].append(base[0]) 28 | P[tax].append(base[0]) 29 | elif base in list("RKSYWM"): 30 | hh = IUPAC(base[0]) 31 | for i in hh: 32 | P[tax].append(i) 33 | 34 | """ select most common element in outgroup if multiple individuals, 35 | if only one ind but two alleles, select the first one """ 36 | if len(set(P['o'])) > 1: 37 | minor = most_common(P['o']) 38 | else: 39 | minor = P['o'][0] 40 | 41 | " select most common element that is not minor " 42 | bases = list(itertools.chain(*P.values())) 43 | majors = [i for i in bases if i != minor] 44 | major = most_common(majors) 45 | 46 | ret = [float(P[i].count(major)) / len(P[i]) for i in ['p1','p2','p3a','p3b','o']] 47 | return ret 48 | 49 | 50 | 51 | 52 | 53 | def Dstatfoil(Loc,pat): 54 | " check site for patterns and add to Locus object if found" 55 | if len(set(pat)) < 3: 56 | " only allow biallelic " 57 | minor = pat[-1] 58 | " select only alternative to the outgroup allele " 59 | major = [i for i in pat if i!= pat[-1]][0] 60 | 61 | o = 0. 62 | p3b = 1. if pat[3] == major else 0. 63 | p3a = 1. 
if pat[2] == major else 0. 64 | p2 = 1. if pat[1] == major else 0. 65 | p1 = 1. if pat[0] == major else 0. 66 | 67 | ## from partitioned D-stat 68 | Loc.abbba += ( (1.-p1)*p2*p3a*p3b*(1.-o) ) # DFI[5] DOL[5] 69 | Loc.babba += ( p1*(1.-p2)*p3a*p3b*(1.-o) ) # DFI[1] DOL[1] 70 | 71 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[6] DIL[0] DFI[4] DOL[2] 72 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[0] DIL[6] DFI[0] DOL[6] 73 | 74 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[2] DIL[4] DFI[2] DOL[4] 75 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[4] DIL[2] DFI[6] DOL[0] 76 | 77 | ## new to foil, contrast of bbxxa 78 | Loc.bbbaa += ( p1*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[1] DIL[1] 79 | Loc.bbaba += ( p1*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[5] DIL[5] 80 | 81 | ## terminal branch patterns 82 | if not Loc.noterminals: 83 | Loc.aaaba += ( (1.-p1)*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[3] DIL[3] 84 | Loc.aabaa += ( (1.-p1)*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[7] DIL[7] 85 | Loc.abaaa += ( (1.-p1)*p2*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[3] DOL[3] 86 | Loc.baaaa += ( p1*(1.-p2)*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[7] DOL[7] 87 | return Loc 88 | 89 | 90 | 91 | def polyDstatfoil(Loc, pat): 92 | ## calculate frequencies 93 | " look at the P3 taxon first for a derived allele " 94 | p1,p2,p3a,p3b,o = makefreq(pat) 95 | # else: 96 | # pat = [1. if base!=pat[-1] else 0. for base in pat] 97 | # p1,p2,p3a,p3b,o = pat 98 | 99 | ## from partitioned D-stat 100 | Loc.abbba += ( (1.-p1)*p2*p3a*p3b*(1.-o) ) # DFI[5] DOL[5] 101 | Loc.babba += ( p1*(1.-p2)*p3a*p3b*(1.-o) ) # DFI[1] DOL[1] 102 | 103 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[6] DIL[0] DFI[4] DOL[2] 104 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[0] DIL[6] DFI[0] DOL[6] 105 | 106 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[2] DIL[4] DFI[2] DOL[4] 107 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[4] DIL[2] DFI[6] DOL[0] 108 | 109 | ## new to foil, contrast of xxbba 110 | Loc.bbbaa += ( p1*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[1] DIL[1] 111 | Loc.bbaba += ( p1*p2*(1-p3a)*p3b*(1.-o) ) # DFO[5] DIL[5] 112 | 113 | ## terminal branch patterns 114 | if not Loc.noterminals: 115 | Loc.aaaba += ( (1.-p1)*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[3] DIL[3] 116 | Loc.aabaa += ( (1.-p1)*(1-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[7] DIL[7] 117 | Loc.abaaa += ( (1.-p1)*p2*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[3] DOL[3] 118 | Loc.baaaa += ( p1*(1.-p2)*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[7] DOL[7] 119 | 120 | return Loc 121 | 122 | 123 | 124 | def IUAfreq(Loc, L): 125 | patlist = {} 126 | Loc.abbba = 0. 127 | Loc.babba = 0. 128 | Loc.abbaa = 0. 129 | Loc.babaa = 0. 130 | Loc.ababa = 0. 131 | Loc.baaba = 0. 132 | 133 | Loc.bbbaa = 0. 134 | Loc.bbaba = 0. 135 | Loc.aaaba = 0. 136 | Loc.aabaa = 0. 137 | Loc.abaaa = 0. 138 | Loc.baaaa = 0. 
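    # The twelve pattern counters above feed the four D-FOIL statistics
    # (DFO, DIL, DFI, DOL): the first six are shared with the partitioned
    # D-statistic, bbbaa/bbaba are the contrasts added by D-FOIL, and the four
    # terminal-branch patterns (aaaba, aabaa, abaaa, baaaa) are only tallied
    # when noterminals is off.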
139 | 140 | for col in Loc.seq.transpose(): 141 | patlist = fillin(L[0], 'p1', col, Loc.names, patlist) 142 | patlist = fillin(L[1], 'p2', col, Loc.names, patlist) 143 | patlist = fillin(L[2], 'p3a', col, Loc.names, patlist) 144 | patlist = fillin(L[3], 'p3b', col, Loc.names, patlist) 145 | patlist = fillin(L[4], 'o', col, Loc.names, patlist) 146 | 147 | " exclude sites with missing data " 148 | if not any([ all([i in ["N",'-'] for i in patlist['p1']]), 149 | all([i in ["N",'-'] for i in patlist['p2']]), 150 | all([i in ["N",'-'] for i in patlist['p3a']]), 151 | all([i in ["N",'-'] for i in patlist['p3b']]), 152 | all([i in ["N",'-'] for i in patlist['o']]) ]): 153 | " if site in not invariable " 154 | isvar = len(set(col)-set(["N","-"])) > 1 155 | if isvar: 156 | " look for patterns in site " 157 | Loc = polyDstatfoil(Loc, patlist) 158 | return Loc 159 | 160 | 161 | 162 | def IUA(Loc,L): 163 | Loc.abbba = 0. 164 | Loc.babba = 0. 165 | Loc.abbaa = 0. 166 | Loc.babaa = 0. 167 | Loc.ababa = 0. 168 | Loc.baaba = 0. 169 | 170 | Loc.bbbaa = 0. 171 | Loc.bbaba = 0. 172 | Loc.aaaba = 0. 173 | Loc.aabaa = 0. 174 | Loc.abaaa = 0. 175 | Loc.baaaa = 0. 176 | 177 | for col in Loc.seq.transpose(): 178 | " exclude heterozygous sites " 179 | if all(i in list("ATGC") for i in col): 180 | " if site is not invariable " 181 | if len(set(col)) > 1: 182 | " look for patterns in site " 183 | Loc = Dstatfoil(Loc,col) 184 | return Loc 185 | 186 | 187 | 188 | def bootfreq(Ldict, which): 189 | Dfo_t = Dfo_b = 0. 190 | Dil_t = Dil_b = 0. 191 | Dfi_t = Dfi_b = 0. 192 | Dol_t = Dol_b = 0. 193 | while 1: 194 | try: Lx = Ldict[Ldict.keys()[which.next()]] 195 | except StopIteration: break 196 | " iterate over loci summing top and bottom values of Ds" 197 | Dfo_t += Lx.DFO_t() 198 | Dfo_b += Lx.DFO_b() 199 | Dil_t += Lx.DIL_t() 200 | Dil_b += Lx.DIL_b() 201 | Dfi_t += Lx.DFI_t() 202 | Dfi_b += Lx.DFI_b() 203 | Dol_t += Lx.DOL_t() 204 | Dol_b += Lx.DOL_b() 205 | " take top over bottom values to calc Ds " 206 | DFO = 0. 207 | if Dfo_b > 0: 208 | DFO = Dfo_t/float(Dfo_b) 209 | DIL = 0. 210 | if Dil_b > 0: 211 | DIL = Dil_t/float(Dil_b) 212 | DFI = 0. 213 | if Dfi_b > 0: 214 | DFI = Dfi_t/float(Dfi_b) 215 | DOL = 0. 
216 |     if Dol_b > 0:
217 |         DOL = Dol_t/float(Dol_b)
218 | 
219 |     return DFO,DIL,DFI,DOL
220 | 
221 | 
222 | 
223 | class Locusfoil():
224 |     """locus keeps track of position in input file,
225 |     variable sites, and D-statistics"""
226 |     def __init__(self):  ## defaults only; real values are set by makeSNP() and IUA()/IUAfreq()
227 |         self.number = None
228 |         self.names = []
229 |         self.seq = None
230 |         self.noterminals = False
231 | 
232 |         self.abbba = 0.
233 |         self.babba = 0.
234 |         self.abbaa = 0.
235 |         self.babaa = 0.
236 |         self.ababa = 0.
237 |         self.baaba = 0.
238 | 
239 |         self.bbbaa = 0.
240 |         self.bbaba = 0.
241 | 
242 |         self.aaaba = 0.
243 |         self.aabaa = 0.
244 |         self.abaaa = 0.
245 |         self.baaaa = 0.
246 | 
247 |     """ per-locus top or bottom values of Dstats """
248 |     def DFO_t(self):
249 |         part1 = [self.babaa,self.bbbaa,self.ababa,self.aaaba]
250 |         part2 = [self.baaba,self.bbaba,self.abbaa,self.aabaa]
251 |         if self.noterminals:
252 |             part1 = part1[:-1]
253 |             part2 = part2[:-1]
254 |         return float(sum(part1)-sum(part2))
255 | 
256 |     def DFO_b(self):
257 |         part1 = [self.babaa,self.bbbaa,self.ababa,self.aaaba]
258 |         part2 = [self.baaba,self.bbaba,self.abbaa,self.aabaa]
259 |         if self.noterminals:
260 |             part1 = part1[:-1]
261 |             part2 = part2[:-1]
262 |         return float(sum(part1)+sum(part2))
263 | 
264 | 
265 |     def DIL_t(self):
266 |         part1 = [self.abbaa,self.bbbaa,self.baaba,self.aaaba]
267 |         part2 = [self.ababa,self.bbaba,self.babaa,self.aabaa]
268 |         if self.noterminals:
269 |             part1 = part1[:-1]
270 |             part2 = part2[:-1]
271 |         return float(sum(part1)-sum(part2))
272 | 
273 | 
274 |     def DIL_b(self):
275 |         part1 = [self.abbaa,self.bbbaa,self.baaba,self.aaaba]
276 |         part2 = [self.ababa,self.bbaba,self.babaa,self.aabaa]
277 |         if self.noterminals:
278 |             part1 = part1[:-1]
279 |             part2 = part2[:-1]
280 |         return float(sum(part1)+sum(part2))
281 | 
282 | 
283 |     def DFI_t(self):
284 |         part1 = [self.babaa,self.babba,self.ababa,self.abaaa]
285 |         part2 = [self.abbaa,self.abbba,self.baaba,self.baaaa]
286 |         if self.noterminals:
287 |             part1 = part1[:-1]
288 |             part2 = part2[:-1]
289 |         return float(sum(part1)-sum(part2))
290 | 
291 | 
292 |     def DFI_b(self):
293 |         part1 = [self.babaa,self.babba,self.ababa,self.abaaa]
294 |         part2 = [self.abbaa,self.abbba,self.baaba,self.baaaa]
295 |         if self.noterminals:
296 |             part1 = part1[:-1]
297 |             part2 = part2[:-1]
298 |         return float(sum(part1)+sum(part2))
299 | 
300 | 
301 |     def DOL_t(self):
302 |         part1 = [self.baaba,self.babba,self.abbaa,self.abaaa]
303 |         part2 = [self.ababa,self.abbba,self.babaa,self.baaaa]
304 |         if self.noterminals:
305 |             part1 = part1[:-1]
306 |             part2 = part2[:-1]
307 |         return float(sum(part1)-sum(part2))
308 | 
309 | 
310 |     def DOL_b(self):
311 |         part1 = [self.baaba,self.babba,self.abbaa,self.abaaa]
312 |         part2 = [self.ababa,self.abbba,self.babaa,self.baaaa]
313 |         if self.noterminals:
314 |             part1 = part1[:-1]
315 |             part2 = part2[:-1]
316 |         return float(sum(part1)+sum(part2))
317 | 
318 | 
319 | 
320 | 
321 | 
322 | def makeSNP(L, snpfreq, loci, noterminals):
323 |     Ndict = {}
324 |     num = 0
325 |     for loc in loci:
326 |         Loc = Locusfoil()
327 |         Loc.noterminals = noterminals
328 |         Loc.number = num
329 | 
330 |         " only select loci that have data for all five tiptaxa "
331 |         names = [i.split()[0].replace(">","") for i in loc.lstrip().rstrip().split("\n")[:-1]]
332 |         if snpfreq:
333 |             Loc.names = [i for i in names if i in list(itertools.chain(*L))]
334 |         else:
335 |             Loc.names = L #[i for i in names if i in L]
336 | 
337 |         " if snpfreq only need one of possibly multiple individuals"
338 |         keep = 0
339 | 
340 |         if snpfreq:
341 | for tax in L: 342 | z = any([tax in Loc.names for tax in L[0]]) 343 | y = any([tax in Loc.names for tax in L[1]]) 344 | x = any([tax in Loc.names for tax in L[2]]) 345 | w = any([tax in Loc.names for tax in L[3]]) 346 | u = any([tax in Loc.names for tax in L[4]]) 347 | if all([z,y,x,w,u]): 348 | keep = 1 349 | 350 | else: 351 | if all(tax in names for tax in Loc.names): 352 | keep = 1 353 | 354 | if keep: 355 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]]) 356 | " only select sites with synapomorphies " 357 | # select all variable sites 358 | N[-1] = list(N[-1].tostring().replace("-","*")) 359 | N = N[:, N[-1] == "*"] 360 | 361 | " only select rows with focal taxa " 362 | Loc.seq = N[[names.index(i) for i in Loc.names],:] 363 | Ndict[num] = Loc 364 | num += 1 365 | return Ndict 366 | 367 | 368 | 369 | def runtest(infile, L, nboots, snpfreq, submitted, noterminals): 370 | " print test " 371 | print L 372 | 373 | " split each locus " 374 | loci = open(infile).read().strip().split("|")[:-1] 375 | loci[0] = "\n"+loci[0] 376 | 377 | " returns a {} of Locusfoil objects with data for tiptaxa L " 378 | Ldict = makeSNP(L, snpfreq, loci, noterminals) 379 | 380 | " calculate discordant patterns for each locus " 381 | for loc in Ldict: 382 | if snpfreq: 383 | Ldict[loc] = IUAfreq(Ldict[loc],L) 384 | else: 385 | Ldict[loc] = IUA(Ldict[loc],L) 386 | ################################################ 387 | 388 | " final DFO " 389 | DFO_t = sum([(Ldict[l].babaa + Ldict[l].bbbaa + Ldict[l].ababa + Ldict[l].aaaba) -\ 390 | (Ldict[l].baaba + Ldict[l].bbaba + Ldict[l].abbaa + Ldict[l].aabaa) for l in Ldict]) 391 | DFO_b = sum([(Ldict[l].babaa + Ldict[l].bbbaa + Ldict[l].ababa + Ldict[l].aaaba) + \ 392 | (Ldict[l].baaba + Ldict[l].bbaba + Ldict[l].abbaa + Ldict[l].aabaa) for l in Ldict]) 393 | if DFO_b > 0: 394 | DFO = float(DFO_t)/DFO_b 395 | else: DFO = 0. 396 | 397 | " final DIL " 398 | DIL_t = sum([(Ldict[l].abbaa + Ldict[l].bbbaa + Ldict[l].baaba + Ldict[l].aaaba) - \ 399 | (Ldict[l].ababa + Ldict[l].bbaba + Ldict[l].babaa + Ldict[l].aabaa) for l in Ldict]) 400 | DIL_b = sum([(Ldict[l].abbaa + Ldict[l].bbbaa + Ldict[l].baaba + Ldict[l].aaaba) + \ 401 | (Ldict[l].ababa + Ldict[l].bbaba + Ldict[l].babaa + Ldict[l].aabaa) for l in Ldict]) 402 | if DIL_b > 0: 403 | DIL = float(DIL_t)/DIL_b 404 | else: DIL = 0. 405 | 406 | " final DFI " 407 | DFI_t = sum([(Ldict[l].babaa + Ldict[l].babba + Ldict[l].ababa + Ldict[l].abaaa) - \ 408 | (Ldict[l].abbaa + Ldict[l].abbba + Ldict[l].baaba + Ldict[l].baaaa) for l in Ldict]) 409 | DFI_b = sum([(Ldict[l].babaa + Ldict[l].babba + Ldict[l].ababa + Ldict[l].abaaa) + \ 410 | (Ldict[l].abbaa + Ldict[l].abbba + Ldict[l].baaba + Ldict[l].baaaa) for l in Ldict]) 411 | if DFI_b > 0: 412 | DFI = float(DFI_t)/DFI_b 413 | else: DFI = 0. 414 | 415 | " final DOL " 416 | DOL_t = sum([(Ldict[l].baaba + Ldict[l].babba + Ldict[l].abbaa + Ldict[l].abaaa) - \ 417 | (Ldict[l].ababa + Ldict[l].abbba + Ldict[l].babaa + Ldict[l].baaaa) for l in Ldict]) 418 | DOL_b = sum([(Ldict[l].baaba + Ldict[l].babba + Ldict[l].abbaa + Ldict[l].abaaa) + \ 419 | (Ldict[l].ababa + Ldict[l].abbba + Ldict[l].babaa + Ldict[l].baaaa) for l in Ldict]) 420 | if DOL_b > 0: 421 | DOL = float(DOL_t)/DOL_b 422 | else: DOL = 0. 
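    ## Each statistic above is a normalized difference of pooled site-pattern
    ## counts: D = (sum of supporting patterns - sum of opposing patterns) /
    ## (sum of both), accumulated over all retained loci, matching the
    ## per-locus *_t()/*_b() methods of Locusfoil. Values near zero indicate
    ## no asymmetry; the combination of signs across DFO, DIL, DFI and DOL is
    ## what the D-foil framework (cf. Pease & Hahn 2015) uses to polarize
    ## introgression among P1, P2, P3a and P3b.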
423 | 424 | " proportion of discordant loci " 425 | #try: pdisc = len([i for i in Ldict if any([Ldict[i].D12(),Ldict[i].D1(),Ldict[i].D2()])]) / float(len(Ldict)) 426 | #except ValueError: 427 | # pdisc = 0.0 428 | 429 | " TODO " 430 | pdisc = 0.0 431 | 432 | ################################################# 433 | 434 | " do bootstrapping " 435 | BBFO = [] 436 | BBIL = [] 437 | BBFI = [] 438 | BBOL = [] 439 | for i in xrange(nboots): 440 | which = iter(sample_wr(xrange(len(Ldict)), len(Ldict))) 441 | bbfo,bbil,bbfi,bbol = bootfreq(Ldict, which) 442 | BBFO.append(bbfo) 443 | BBIL.append(bbil) 444 | BBFI.append(bbfi) 445 | BBOL.append(bbol) 446 | STDfo = numpy.std(BBFO) 447 | STDil = numpy.std(BBIL) 448 | STDfi = numpy.std(BBFI) 449 | STDol = numpy.std(BBOL) 450 | ################################################## 451 | 452 | " stats out " 453 | if STDfo > 0: 454 | ZFO = (abs(DFO/STDfo)) 455 | else: ZFO = 0. 456 | if STDil > 0: 457 | ZIL = (abs(DIL/STDil)) 458 | else: ZIL = 0. 459 | if STDfi > 0: 460 | ZFI = (abs(DFI/STDfi)) 461 | else: ZFI = 0. 462 | if STDol > 0: 463 | ZOL = (abs(DOL/STDol)) 464 | else: ZOL = 0. 465 | 466 | ## make loci files here 467 | #ABBBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() > 0] 468 | #BABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() < 0] 469 | #ABBAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() > 0] 470 | #BABAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() < 0] 471 | #ABABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() > 0] 472 | #BAABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() < 0] 473 | 474 | return [L, 475 | DFO,ZFO, 476 | DIL,ZIL, 477 | DFI,ZFI, 478 | DOL,ZOL, 479 | len(Ldict), 480 | sum([Ldict[l].babba for l in Ldict]), 481 | sum([Ldict[l].abbba for l in Ldict]), 482 | sum([Ldict[l].babaa for l in Ldict]), 483 | sum([Ldict[l].abbaa for l in Ldict]), 484 | sum([Ldict[l].baaba for l in Ldict]), 485 | sum([Ldict[l].ababa for l in Ldict]), 486 | sum([Ldict[l].bbbaa for l in Ldict]), 487 | sum([Ldict[l].bbaba for l in Ldict]), 488 | sum([Ldict[l].aabaa for l in Ldict]), 489 | sum([Ldict[l].aaaba for l in Ldict]), 490 | sum([Ldict[l].baaaa for l in Ldict]), 491 | sum([Ldict[l].abaaa for l in Ldict]), 492 | pdisc, submitted, 493 | BBFO, BBIL, BBFI, BBOL] 494 | 495 | 496 | def checktaxa(taxalist,alignfile): 497 | with open(alignfile) as infile: 498 | data = infile.readlines() 499 | taxainfile = set() 500 | for line in data: 501 | if ">" in line: 502 | tax = line.split(" ")[0].replace(">","") 503 | if tax not in taxainfile: 504 | taxainfile.add(tax) 505 | if not set(taxainfile).difference(taxainfile): 506 | return 1 507 | 508 | 509 | 510 | 511 | def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots,noterminals): 512 | work_queue = multiprocessing.Queue() 513 | result_queue = multiprocessing.Queue() 514 | submitted = 0 515 | Notes = [] 516 | for rep in subtests: 517 | notes = "" 518 | if len(rep) == 2: 519 | rep,notes = rep 520 | p1,p2,p3a,p3b,o = rep 521 | if all(["[" in i for i in rep[1:]]): 522 | p1 = p1[1:-1].split(",") 523 | p2 = p2[1:-1].split(",") 524 | p3a = p3a[1:-1].split(",") 525 | p3b = p3b[1:-1].split(",") 526 | o = o[1:-1].split(",") 527 | if checktaxa([p1,p2,p3a,p3b,o],alignfile): 528 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted, noterminals]) 529 | submitted += 1 530 | else: 531 | print 'a taxon name was found that is not in the sequence file' 532 | else: 533 | if checktaxa([p1,p2,p3a,p3b,o],alignfile): 534 | 
work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted, noterminals]) 535 | submitted += 1 536 | else: 537 | print 'a taxon name was found that is not in the sequence file' 538 | Notes.append(notes) 539 | 540 | jobs = [] 541 | for i in range(min(submitted,nproc)): 542 | worker = Worker(work_queue, result_queue, runtest) 543 | jobs.append(worker) 544 | worker.start() 545 | for j in jobs: 546 | j.join() 547 | 548 | " read results back in " 549 | Results = [result_queue.get() for i in range(submitted)] 550 | Results.sort(key = lambda x:x[15]) 551 | 552 | 553 | 554 | " setup results file " 555 | if noterminals: 556 | outs = open(outfile+".Dfoilalt.txt", 'w') 557 | else: 558 | outs = open(outfile+".Dfoil.txt", 'w') 559 | header = "\t".join([ 'p1'+" "*(namelen[0]-2), 560 | 'p2'+" "*(namelen[1]-2), 561 | 'p3'+" "*(namelen[2]-2), 562 | 'p4'+" "*(namelen[3]-2), 563 | 'O'+" "*(namelen[4]-1), 564 | 'Dfo','Dil','Dfi','Dol', 565 | 'Z_fo','Z_il','Z_fi','Z_ol', 566 | 'BABBA','ABBBA', 567 | 'BABAA','ABBAA', 568 | 'BAABA','ABABA', 569 | 'BBBAA','BBABA', 570 | 'AABAA','AAABA', 571 | 'BAAAA','ABAAA', 572 | 'nloci','sign', 'notes']) 573 | print >>outs, header 574 | 575 | for i in range(len(Results)): 576 | L,DFO,ZFO,DIL,ZIL,DFI,ZFI,DOL,ZOL,nloc,BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,pdisc,sub,BBFO,BBIL,BBFI,BBOL = Results[i] 577 | L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L] 578 | 579 | sign = [] 580 | for s,d in zip([ZFO,ZIL,ZFI,ZOL],[DFO,DIL,DFI,DOL]): 581 | if s>3.5: 582 | if d>0: 583 | sign.append("+") 584 | else: 585 | sign.append("-") 586 | else: 587 | sign.append("0") 588 | #print sign 589 | 590 | resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))), 591 | str(L[1])+" "*(namelen[1]-len(str(L[1]))), 592 | str(L[2])+" "*(namelen[2]-len(str(L[2]))), 593 | str(L[3])+" "*(namelen[3]-len(str(L[3]))), 594 | str(L[4])+" "*(namelen[4]-len(str(L[4]))), 595 | DFO,DIL,DFI,DOL, 596 | ZFO,ZIL,ZFI,ZOL, 597 | BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA, 598 | nloc, "".join(sign), Notes[i]]) 599 | 600 | print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin 601 | 602 | loci = open(alignfile).read().strip().split("|")[:-1] 603 | if makesort: 604 | None 605 | # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L) 606 | # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L) 607 | # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L) 608 | # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L) 609 | # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L) 610 | # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L) 611 | 612 | if makeboots: 613 | None 614 | # with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out: 615 | # out.write(",".join(map(str,BB12))) 616 | # with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out: 617 | # out.write(",".join(map(str,BB1))) 618 | # with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out: 619 | # out.write(",".join(map(str,BB2))) 620 | 621 | 622 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots,noterminals): 623 | import sys 624 | 625 | P1namelen = max(map(len,[str(i[0][0]) for i in tests])) 626 | P2namelen = max(map(len,[str(i[0][1]) for i in tests])) 627 | P3anamelen = max(map(len,[str(i[0][2]) for i in tests])) 628 | P3bnamelen = max(map(len,[str(i[0][3]) for i in 
tests])) 629 | Onamelen = max(map(len,[str(i[0][4]).strip() for i in tests])) 630 | namelen = [P1namelen,P2namelen,P3anamelen,P3bnamelen,Onamelen] 631 | 632 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots,noterminals) 633 | 634 | 635 | if __name__ == '__main__': 636 | main() 637 | 638 | 639 | 640 | 641 | --------------------------------------------------------------------------------
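The per-locus arithmetic behind each D-foil statistic reduces to a normalized difference of site-pattern counts. The sketch below mirrors Locusfoil.DFO_t()/DFO_b() with made-up totals (not real data); DIL, DFI and DOL follow the same template with their own pattern groupings:

    ## toy site-pattern totals for one test (invented numbers)
    counts = {"babaa": 12., "bbbaa": 3., "ababa": 5., "aaaba": 7.,
              "baaba": 4.,  "bbaba": 2., "abbaa": 11., "aabaa": 6.}

    ## supporting patterns minus opposing patterns, over their sum
    part1 = counts["babaa"] + counts["bbbaa"] + counts["ababa"] + counts["aaaba"]
    part2 = counts["baaba"] + counts["bbaba"] + counts["abbaa"] + counts["aabaa"]
    DFO = (part1 - part2) / (part1 + part2) if (part1 + part2) > 0 else 0.
    print DFO    ## 0.08 with these counts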
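The Z-scores in the output table come from resampling whole loci with replacement (bootfreq() together with sample_wr() imported from Dtest) and dividing each D by the standard deviation of its bootstrap distribution. A minimal standalone sketch of that logic, using toy per-locus numerators and denominators in place of Locusfoil objects:

    import random
    import numpy

    ## toy per-locus numerator/denominator pairs; in the real code these are
    ## Locusfoil.DFO_t()/DFO_b() values
    tops    = [1., -2., 3., 0., 2.]
    bottoms = [4.,  5., 6., 2., 3.]

    D = sum(tops) / sum(bottoms)

    boots = []
    for _ in xrange(1000):
        idx = [random.randrange(len(tops)) for _ in xrange(len(tops))]  ## resample loci
        t = sum(tops[i] for i in idx)
        b = sum(bottoms[i] for i in idx)
        boots.append(t / b if b > 0 else 0.)

    Z = abs(D / numpy.std(boots)) if numpy.std(boots) > 0 else 0.
    print D, Z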
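Dtest_foil.py is normally driven through pyRAD's D-statistic workflow, but main() can also be called directly. A minimal sketch, run from inside the pyrad/ directory under Python 2; the alignment path, output prefix and sample names below are placeholders:

    import Dtest_foil

    ## one five-taxon test: ([p1, p2, p3a, p3b, outgroup], notes).
    ## Passing each taxon as a bracketed string of sample names,
    ## e.g. "[indA,indB]", switches the test to SNP-frequency mode instead.
    tests = [(["sampleA", "sampleB", "sampleC", "sampleD", "outg"], "example")]

    Dtest_foil.main(tests,
                    "example.loci",   ## pyrad .loci alignment
                    "dfoil_out",      ## writes dfoil_out.Dfoil.txt
                    100,              ## bootstrap replicates
                    2,                ## processors
                    0, 0,             ## makesort, makeboots (not used here)
                    0)                ## noterminals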