├── pyrad
│   ├── __init__.py
│   ├── potpour.py
│   ├── loci2gphocs.py
│   ├── loci2mig.py
│   ├── loci2vcf.py
│   ├── createfile.py
│   ├── loci2treemix.py
│   ├── tier2clust.py
│   ├── loci2phynex.py
│   ├── loci2SNP.py
│   ├── overlapcheck.py
│   ├── editraw_rads.py
│   ├── editraw_merges.py
│   ├── H_err_dp.py
│   ├── editraw_pairs.py
│   ├── cluster_cons7_shuf.py
│   ├── Dtest.py
│   ├── sortandcheck2.py
│   ├── Dtest_5.py
│   ├── consensdp.py
│   ├── consens_pairs.py
│   └── Dtest_foil.py
├── tox.ini
├── .gitignore
├── setup.py
└── README.rst
/pyrad/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist=py27
3 |
4 | [testenv]
5 | commands=py.test pyrad
6 | deps=pytest
7 |
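8 | # Usage sketch (assumes tox itself is installed): running `tox` from the repo
9 | # root builds a py27 virtualenv, installs pytest, and runs `py.test pyrad`.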
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 |
30 | # Translations
31 | *.mo
32 |
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | requirements = [
4 | 'numpy',
5 | 'scipy',
6 | ]
7 |
8 | setuptools.setup(
9 | name="pyrad",
10 | version="3.0.66",
11 | url="https://github.com/dereneaton/pyrad",
12 |
13 | author="Deren Eaton",
14 | author_email="deren.eaton@yale.edu",
15 |
16 | description="Assembly and analysis of RADseq data sets",
17 | long_description=open('README.rst').read(),
18 |
19 | packages=setuptools.find_packages(),
20 |
21 |     install_requires=requirements,
22 |
23 |
24 | entry_points={
25 | 'console_scripts': [
26 | 'pyrad = pyrad.pyRAD:main',
27 | ],
28 | },
29 |
30 | license='GPL',
31 |
32 | classifiers=[
33 | 'Programming Language :: Python',
34 | 'Programming Language :: Python :: 2',
35 | 'Programming Language :: Python :: 2.7',
36 | ],
37 | )
38 |
--------------------------------------------------------------------------------
/pyrad/potpour.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 |
4 |
5 | import multiprocessing
6 |
7 |
8 | class Worker(multiprocessing.Process):
9 |
10 | def __init__(self, work_queue, result_queue, func):
11 |
12 | # base class initialization
13 | multiprocessing.Process.__init__(self)
14 |
15 | # job management stuff
16 | self.work_queue = work_queue
17 | self.result_queue = result_queue
18 | self.kill_received = False
19 | self.func = func
20 |
21 | def run(self):
22 | while not self.kill_received:
23 | # get a task
24 | if self.work_queue.empty():
25 | break
26 | else:
27 | #job = self.work_queue.get_nowait()
28 | job = self.work_queue.get()
29 |
30 | # the actual processing
31 | res = self.func(*job)
32 |
33 | # store the result
34 | self.result_queue.put(res)
35 |
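36 |
37 | # Example usage: a minimal sketch, not part of the original pipeline, showing
38 | # how the pyRAD steps drive this class. Real callers (e.g. editraw_rads.main,
39 | # overlapcheck.main) put argument lists on a work queue, hand Worker the
40 | # function to apply to each job, and then read results off the result queue.
41 | if __name__ == "__main__":
42 |
43 |     def demo_add(x, y):
44 |         "toy stand-in for a real job function such as rawedit or mergepairs"
45 |         return x + y
46 |
47 |     work_queue = multiprocessing.Queue()
48 |     result_queue = multiprocessing.Queue()
49 |
50 |     # each queued item is an argument list, unpacked by Worker.run as func(*job)
51 |     jobs = [(1, 2), (3, 4), (5, 6)]
52 |     for job in jobs:
53 |         work_queue.put(job)
54 |
55 |     # one worker here for simplicity; real callers start min(Parallel, submitted)
56 |     worker = Worker(work_queue, result_queue, demo_add)
57 |     worker.start()
58 |     worker.join()
59 |
60 |     print [result_queue.get() for _ in jobs]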
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 |
2 | pyrad has now been superseded by *ipyrad*
3 | ==========================================
4 | (http://ipyrad.readthedocs.io) <-- see here
5 |
6 | All new development will take place in ipyrad,
7 | and I recommend that pyrad users switch over. The new software
8 | offers huge speed improvements and many new features.
9 |
10 |
11 |
12 | pyrad
13 | =====
14 |
15 | Assembly and analysis of RADseq data sets
16 |
17 |
18 | Tutorials
19 | ---------
20 |
21 | Detailed information and a number of example tutorials are
22 | available `here `_.
23 |
24 |
25 | Downloads
26 | ---------
27 |
28 | Stable release versions can be downloaded `here <https://github.com/dereneaton/pyrad/releases>`_, or you can clone the current development version using git:
29 |
30 | ::
31 |
32 | git clone https://github.com/dereneaton/pyrad.git
33 |
34 |
35 |
36 | Installation (As of v.3.0.6 and newer)
37 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
38 | With either of the following commands you can install pyrad so that it is callable as an executable from anywhere on your machine. Installing with pip will also install the dependencies numpy and scipy:
39 |
40 | ::
41 |
42 | cd pyrad
43 | sudo pip install .
44 | pyrad -h
45 |
46 | Or
47 |
48 | ::
49 |
50 | cd pyrad
51 | sudo python setup.py install
52 | pyrad -h
53 |
54 | Alternatively, without installing you can simply call pyRAD.py from its location using Python:
55 |
56 | ::
57 |
58 | python pyrad/pyrad/pyRAD.py -h
59 |
60 |
61 | Python requirements
62 | ^^^^^^^^^^^^^^^^^^^
63 | You will need the following two Python dependencies to run `pyrad`:
64 |
65 | * numpy
66 | * scipy
67 |
68 | In addition, you will need the following external programs:
69 |
70 | * `muscle <http://www.drive5.com/muscle/>`_
71 | * `vsearch <https://github.com/torognes/vsearch>`_
72 |
73 | Example usage (see tutorials)
74 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
75 | ::
76 |
77 | >>> pyrad -n
78 | new params.txt file created
79 |
80 |
81 | >>> pyrad -p params.txt
82 |
83 |
84 |
85 | Licence
86 | -------
87 | GPLv3
88 |
89 |
90 | Authors
91 | -------
92 |
93 | `pyrad` was written by `Deren Eaton `_.
94 |
--------------------------------------------------------------------------------
/pyrad/loci2gphocs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | # pyrad .loci to gphocs format conversion
4 | #
5 | # This is a very simple conversion because the formats are very similar.
6 | #
7 | # Isaac Overcast
8 | # March 21, 2015
9 |
10 | ## import libraries
11 | import sys, os
12 |
13 | def make(WORK, outname):
14 |
15 | #read in loci file
16 | infile = open(WORK+"outfiles/"+outname+".loci")
17 | outfile = open(WORK+"outfiles/"+outname+".gphocs",'w')
18 |
19 | ## parse the loci
20 | ## Each set of reads at a locus is appended with a line
21 |     ## beginning with // and ending with |x, where x is the locus id.
22 | ## so after this call 'loci' will contain an array
23 | ## of sets of each read per locus.
24 | loci = infile.read().strip().split("//")[:-1]
25 |
26 | # Print the header, the number of loci in this file
27 | print >>outfile, len(loci)#-1
28 |
29 |     # iterate through each locus, print out the header for each locus:
30 |     #     locus<i> <number of sequences> <sequence length>
31 |     # Then print the data for each sample in this format:
32 |     #     <individual name>    <sequence>
33 | for i, loc in enumerate(loci):
34 | # Separate out each sequence within the loc block. 'sequences'
35 |         # will now be a list of strings containing name/sequence pairs.
36 | # We select each line in the locus string that starts with ">"
37 | names = [line.split()[0] for line in loc.strip().split("\n") if ">" in line]
38 | sequences = [line.split()[-1] for line in loc.strip().split("\n") if ">" in line]
39 |
40 | # Strips off 'nnnn' separator for paired data
41 | # replaces '-' with 'N'
42 | editsequences = [seq.replace("n","").replace('-','N') for seq in sequences]
43 | sequence_length = len(editsequences[0])
44 |
45 | # get length of longest name and add 4 spaces
46 | longname = max(map(len,names))+4
47 |
48 | # Print out the header for this locus
49 | print >>outfile, 'locus'+ str(i), len(sequences), str( sequence_length )
50 |
51 | # Iterate through each sequence read at this locus and write it to the file.
52 | for name,sequence in zip(names,sequences):
53 | # Clean up the sequence data to make gphocs happy. Only accepts UPPER
54 | # case chars for bases, and only accepts 'N' for missing data. Also,
55 | # the .loci format prepends a '>' on to the individual names, so we have
56 | # to clean this up which is what the [1:] is doing.
57 | print >>outfile, name[1:]+" "*(longname-len(name))+sequence
58 |
59 | if __name__ == "__main__":
60 |     make(WORK, outname)
61 |
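62 | ## Illustrative sketch of the conversion (sample names and sequences made up).
63 | ## A single .loci block like:
64 | ##   >1A0    TTCCTGCAGTGCAANNTTGACA
65 | ##   >1B0    TTCCTGCAGTGCAAGTTTGACA
66 | ##   //                    -*      |3|
67 | ## is written to the .gphocs file (whose first line is the locus count) as:
68 | ##   locus0 2 22
69 | ##   1A0     TTCCTGCAGTGCAANNTTGACA
70 | ##   1B0     TTCCTGCAGTGCAAGTTTGACA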
--------------------------------------------------------------------------------
/pyrad/loci2mig.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy as np
4 | import sys
5 | import gzip
6 | try:
7 | from collections import OrderedDict
8 | except ImportError:
9 | from ordereddict import OrderedDict
10 | try:
11 | from collections import Counter
12 | except ImportError:
13 | from counter import Counter
14 | import alignable
15 |
16 | def make(WORK, outname, taxadict, minhits, seed):
17 |
18 | ## outfile
19 | outfile = open(WORK+"/outfiles/"+outname+".migrate", 'w')
20 |
21 | ## cleanup taxadict
22 | taxa = OrderedDict()
23 | for group in taxadict:
24 | taxa[group] = []
25 | for samp in taxadict[group]:
26 | a = samp.split("/")[-1].replace(".consens.gz","")
27 | taxa[group].append(a)
28 |
29 | print "\t data set reduced for group coverage minimums"
30 | for i,j in zip(taxa,minhits):
31 | print "\t ",i, taxa[i], "minimum=",j
32 |
33 | #print taxadict.keys()
34 |
35 | ## filter data to only the loci that have data
36 | ## for at least N individuals in each pop
37 | keep = []
38 | MINS = zip(taxa.keys(), minhits)
39 |
40 | ## read in data to sample names
41 | loci = open(WORK+"/outfiles/"+outname+".loci",'r').read().strip().split("|")[:-1]
42 | for loc in loci:
43 | samps = [i.split()[0].replace(">","") for i in loc.split("\n") if ">" in i]
44 | ## filter for coverage
45 | GG = []
46 | for group,mins in MINS:
47 | GG.append( sum([i in samps for i in taxa[group]]) >= int(mins) )
48 | if all(GG):
49 | keep.append(loc)
50 |
51 | ## print data to file
52 | print >>outfile, len(taxa), len(keep), "( npops nloci for data set", outname+".loci",")"
53 |
54 | ## print all data for each population at a time
55 | done = 0
56 | for group in taxadict:
57 | ## print a list of lengths of each locus
58 | if not done:
59 | loclens = [len(loc.split("\n")[1].split()[-1].replace("x","n").replace("n","")) for loc in keep]
60 | print >>outfile, " ".join(map(str,loclens))
61 | done += 1
62 |
63 | ## print a list of number of individuals in each locus
64 | indslist = []
65 | for loc in keep:
66 | samps = [i.split()[0].replace(">","") for i in loc.split("\n") if ">" in i]
67 | inds = sum([i in samps for i in taxa[group]])
68 | indslist.append(inds)
69 | print >>outfile, " ".join(map(str,indslist)), group
70 |
71 | ## print sample id, spaces, and sequence data
72 | #for loc in range(len(keep)):
73 | for loc in range(len(keep)):
74 | seqs = [i.split()[-1] for i in keep[loc].split("\n") if \
75 | i.split()[0].replace(">","") in taxa[group]]
76 | for i in range(len(seqs)):
77 | print >>outfile, group[0:8]+"_"+str(i)+\
78 | (" "*(10-len(group[0:8]+"_"+str(i))))+seqs[i].replace("x","n").replace("n","")
79 |
80 | outfile.close()
81 |
82 |
83 | # WORK = "/home/deren/Dropbox/Public/PyRAD_TUTORIALS/tutorial_RAD"
84 | # outname = "c85m4p3"
85 |
86 | # pops = ['pop1','pop2','pop3']
87 | # samps = [ ["1A0","1B0","1C0","1D0"],
88 | # ["2E0","2F0","2G0","2H0"],
89 | # ["3I0","3J0","3K0","3L0"] ]
90 |
91 | # taxadict = OrderedDict(zip(pops,samps))
92 | # minhits = [4,4,4]
93 | # seed = 112233
94 |
95 | if __name__ == "__main__":
96 | make(WORK, outname, taxadict, minhits, seed)
97 |
--------------------------------------------------------------------------------
/pyrad/loci2vcf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import time
4 | import numpy as np
5 | import alignable
6 |
7 |
8 | def make(WORK, version, outname, mindepth, names):
9 | outfile = open(WORK+"/outfiles/"+outname+".vcf", 'w')
10 | inloci = WORK+"/outfiles/"+outname+".loci"
11 | names = list(names)
12 | names.sort()
13 |
14 | print >>outfile, "##fileformat=VCFv4.1"
15 | print >>outfile, "##fileDate="+time.strftime("%Y%m%d")
16 | print >>outfile, "##source=pyRAD.v."+str(version)
17 | print >>outfile, "##reference=common_allele_at_each_locus"
18 |     print >>outfile, '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">'
19 |     print >>outfile, '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">'
20 |     print >>outfile, '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'
21 |     print >>outfile, '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">'
22 |     print >>outfile, '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
23 |     print >>outfile, '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">'
24 |     print >>outfile, '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">'
25 |     print >>outfile, "\t".join(["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+list(names))
26 |
27 | loci = open(inloci).read().split("|")[:-1]
28 | snps = 0
29 | vcflist = []
30 | for locusnumber in range(len(loci)):
31 | samps = [i.split()[0][1:] for i in loci[locusnumber].strip().split("\n") if ">" in i]
32 | loc = np.array([tuple(i.split()[-1]) for i in loci[locusnumber].strip().split("\n") if ">" in i])
33 | NS = str(len(loc))
34 | DP = str(mindepth)
35 | for base in range(len(loc.T)):
36 | col = []
37 | site = list(loc.T[base])
38 | site = list("".join(site).replace("-","").replace("N",""))
39 | if site:
40 | for bb in site:
41 | if bb in list("RKYSWM"):
42 | col += alignable.unstruct(bb)[0]
43 | col += alignable.unstruct(bb)[1]
44 | else:
45 | col += bb
46 | REF = alignable.most_common([i for i in col if i not in list("-RKYSWMN")])
47 | ALT = set([i for i in col if (i in list("ATGC-N")) and (i!=REF)])
48 | if ALT:
49 | snps += 1
50 | GENO = [REF]+list(ALT)
51 | GENOS = []
52 | for samp in names:
53 | if samp in samps:
54 | idx = samps.index(samp)
55 | f = alignable.unstruct(loc.T[base][idx])
56 | if ('-' in f) or ('N' in f):
57 | GENOS.append("./.")
58 | else:
59 | GENOS.append(str(GENO.index(f[0]))+"|"+str(GENO.index(f[1])))
60 | else:
61 | GENOS.append("./.")
62 | vcflist.append("\t".join([`locusnumber+1`, `base+1`, '.', REF, ",".join(ALT), "20", "PASS",
63 | ";".join(["NS="+NS, "DP="+DP]), "GT"]+GENOS))
64 | if not locusnumber % 1000:
65 | outfile.write( "\n".join(vcflist)+"\n" )
66 | vcflist = []
67 |
68 | #print >>outfile, "\t".join([`locusnumber+1`, `base+1`, '.', REF, ",".join(ALT), "20", "PASS",
69 | # ";".join(["NS="+NS, "DP="+DP]), "GT"]+GENOS)
70 |
71 |
72 | outfile.write( "\n".join(vcflist) )
73 | outfile.close()
74 |
75 | if __name__ == "__main__":
76 | make(WORK, version, outname, mindepth, names)
77 |
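78 | ## Illustrative sketch of one output row (values made up): CHROM is the locus
79 | ## number, POS is the site within that locus, and each sample genotype is an
80 | ## index pair into [REF]+ALT, with "./." for missing data, e.g.
81 | ##   3   17   .   C   T   20   PASS   NS=4;DP=6   GT   0|0   0|1   ./.   1|1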
--------------------------------------------------------------------------------
/pyrad/createfile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import sys
4 |
5 | def main(version):
6 | output = """
7 | ==** parameter inputs for pyRAD version %s **======================== affected step ==
8 | ./ ## 1. Working directory (all)
9 | ./*.fastq.gz ## 2. Loc. of non-demultiplexed files (if not line 18) (s1)
10 | ./*.barcodes ## 3. Loc. of barcode file (if not line 18) (s1)
11 | vsearch ## 4. command (or path) to call vsearch (or usearch) (s3,s6)
12 | muscle ## 5. command (or path) to call muscle (s3,s7)
13 | TGCAG ## 6. Restriction overhang (e.g., C|TGCAG -> TGCAG) (s1,s2)
14 | 2 ## 7. N processors (parallel) (all)
15 | 6 ## 8. Mindepth: min coverage for a cluster (s4,s5)
16 | 4 ## 9. NQual: max # sites with qual < 20 (or see line 20)(s2)
17 | .88 ## 10. Wclust: clustering threshold as a decimal (s3,s6)
18 | rad ## 11. Datatype: rad,gbs,pairgbs,pairddrad,(others:see docs)(all)
19 | 4 ## 12. MinCov: min samples in a final locus (s7)
20 | 3 ## 13. MaxSH: max inds with shared hetero site (s7)
21 | c88d6m4p3 ## 14. Prefix name for final output (no spaces) (s7)
22 | ==== optional params below this line =================================== affected step ==
23 | ## 15.opt.: select subset (prefix* only selector) (s2-s7)
24 | ## 16.opt.: add-on (outgroup) taxa (list or prefix*) (s6,s7)
25 | ## 17.opt.: exclude taxa (list or prefix*) (s7)
26 | ## 18.opt.: loc. of de-multiplexed data (s2)
27 | ## 19.opt.: maxM: N mismatches in barcodes (def= 1) (s1)
28 | ## 20.opt.: phred Qscore offset (def= 33) (s2)
29 | ## 21.opt.: filter: def=0=NQual 1=NQual+adapters. 2=strict (s2)
30 | ## 22.opt.: a priori E,H (def= 0.001,0.01, if not estimated) (s5)
31 | ## 23.opt.: maxN: max Ns in a cons seq (def=5) (s5)
32 | ## 24.opt.: maxH: max heterozyg. sites in cons seq (def=5) (s5)
33 | ## 25.opt.: ploidy: max alleles in cons seq (def=2;see docs) (s4,s5)
34 | ## 26.opt.: maxSNPs: (def=100). Paired (def=100,100) (s7)
35 | ## 27.opt.: maxIndels: within-clust,across-clust (def. 3,99) (s3,s7)
36 | ## 28.opt.: random number seed (def. 112233) (s3,s6,s7)
37 | ## 29.opt.: trim overhang left,right on final loci, def(0,0) (s7)
38 | ## 30.opt.: output formats: p,n,a,s,v,u,t,m,k,g,* (see docs) (s7)
39 | ## 31.opt.: maj. base call at depth>x>sys.stderr, "\tnew params.txt file created"
49 | print >>outfile, "\n".join(output.split("\n")[1:])
50 |
51 |
--------------------------------------------------------------------------------
/pyrad/loci2treemix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy as np
4 | import sys
5 | import gzip
6 | try:
7 | from collections import OrderedDict
8 | except ImportError:
9 | from ordereddict import OrderedDict
10 | try:
11 | from collections import Counter
12 | except ImportError:
13 | from counter import Counter
14 | import alignable
15 |
16 |
17 | def make(WORK, outname, taxadict, minhits):
18 |
19 | ## output files
20 | outfile = gzip.open(WORK+"/outfiles/"+outname+".treemix.gz",'w')
21 |
22 | ## cleanup taxadict to just sample names
23 | taxa = OrderedDict()
24 | for group in taxadict:
25 | taxa[group] = []
26 | for samp in taxadict[group]:
27 | a = samp.split("/")[-1].replace(".consens.gz","")
28 | taxa[group].append(a)
29 |
30 | print "\t data set reduced for group coverage minimums"
31 | for i,j in zip(taxa,minhits):
32 | print "\t ",i, taxa[i], 'minimum=',j
33 |
34 | ## read in data from unlinked_snps to sample names
35 | infile = open(WORK.rstrip("/")+"/outfiles/"+outname+".unlinked_snps",'r')
36 | dat = infile.readlines()
37 | nsamp,nsnps = dat[0].strip().split(" ")
38 | nsamp = int(nsamp)
39 | nsnps = int(nsnps)
40 | NDATA = np.empty([int(nsamp),int(nsnps)],dtype='object')
41 | excludes = 0
42 |
43 | ## read SNP matrix into a numpy.array
44 | for line in range(len(dat[1:])):
45 | a,b = dat[1:][line].split()
46 | NDATA[line] = list(b)
47 | sites = np.transpose(NDATA)
48 |
49 | ## unpack ambiguity bases and find two most common alleles
50 | ## at every SNP site, save to a list
51 | alleles = []
52 | for site in sites:
53 | ds = []
54 | for s in site:
55 | if s in list("RKSYWM"):
56 | ds.append(alignable.unstruct(s)[0])
57 | ds.append(alignable.unstruct(s)[1])
58 | else:
59 | ds.append(s)
60 | ds.append(s)
61 | snp = [s for s in ds if s not in ["N",'-']]
62 | a = Counter(snp).most_common(3)
63 | alleles.append([a[0][0],a[1][0]])
64 |
65 | ## create a dictionary mapping sample names to SNPs
66 | SNPS = OrderedDict()
67 | for line in dat[1:]:
68 | a,b = line.split()
69 | SNPS[a] = b
70 |
71 | ## reduce Taxa dict to only samples that are in the unlinkedsnps alignment
72 | for key in taxa:
73 | replacement = []
74 | for val in taxa[key]:
75 | if val in SNPS.keys():
76 | replacement.append(val)
77 | taxa[key] = replacement
78 |
79 | ## create a dictionary with empty lists for each taxon
80 | FREQ = OrderedDict()
81 | for tax in taxa:
82 | FREQ[tax] = []
83 |
84 | ## fill the FREQ dictionary with SNPs for all
85 | ## samples in that taxon
86 | keeps = []
87 | for snp in range(int(nsnps)):
88 | GG = []
89 | ## if snp meets minhits requirement
90 | for tax,mins in zip(taxa,minhits):
91 | GG.append( sum([SNPS[i][snp] not in ["N","-"] for i in taxa[tax]]) >= int(mins))
92 | if all(GG):
93 | keeps.append(snp)
94 |
95 |
96 | for keep in keeps:
97 | for tax in FREQ:
98 | bunch = []
99 | for i in taxa[tax]:
100 | bunch.append(alignable.unstruct(SNPS[i][keep])[0])
101 | bunch.append(alignable.unstruct(SNPS[i][keep])[1])
102 | #print tax, i, SNPS[i][keep], bunch
103 | FREQ[tax].append("".join(bunch))
104 |
105 | ## header
106 | print >>outfile, " ".join(FREQ.keys())
107 |
108 | ## data to file
109 | for i,j in enumerate(keeps):
110 | a1 = alleles[j][0]
111 | a2 = alleles[j][1]
112 | H = [str(FREQ[tax][i].count(a1))+","+str(FREQ[tax][i].count(a2)) for tax in FREQ]
113 | HH = " ".join(H)
114 |
115 | ## exclude non-biallelic SNPs
116 | if " 0,0 " not in HH:
117 | ## exclude invariable sites given this sampling
118 | if not all([zz.split(",")[1] in '0' for zz in H]):
119 | print >>outfile, " ".join(H)
120 | else:
121 | excludes += 1
122 |
123 | outfile.close()
124 |
125 |
126 | if __name__ == "__main__":
127 |     make(WORK, outname, taxadict, minhits)
128 |
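129 | ## Illustrative sketch of the (gzipped) output written above, values made up:
130 | ## a header of population names, then one line per kept SNP giving counts of
131 | ## the two most common alleles ("first,second") in each population:
132 | ##   pop1 pop2 pop3
133 | ##   6,2 8,0 4,4
134 | ##   2,6 0,8 7,1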
--------------------------------------------------------------------------------
/pyrad/tier2clust.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import os
4 | import sys
5 | import itertools
6 | import numpy
7 | import random
8 | import glob
9 | import subprocess
10 | import pickle
11 | import gzip
12 | from cluster_cons7_shuf import comp
13 |
14 |
15 | def cluster(UCLUST, ID, datatype, WORK, MASK):
16 | C = " -cluster_smallmem "+WORK+"prefix/cat.consens_"
17 |
18 | if datatype in ['gbs','pairgbs','mergegbs']:
19 | P = " -strand both"
20 | COV = ".90"
21 | else:
22 | P = " -leftjust "
23 | COV = ".90"
24 | if 'vsearch' not in UCLUST:
25 | Q = ""
26 | T = " -threads 1"
27 | else:
28 | Q = " -qmask "+MASK
29 | ## TODO: figure out optimized threads setting...
30 | T = " -threads 6"
31 | U = " -userout "+WORK+"prefix/cat.u"
32 | cmd = UCLUST+\
33 | C+\
34 | P+\
35 | " -id "+ID+\
36 | Q+\
37 | T+\
38 | U+\
39 | " -userfields query+target+id+gaps+qstrand+qcov"+\
40 | " -maxaccepts 1"+\
41 | " -maxrejects 0"+\
42 | " -fulldp"+\
43 | " -query_cov "+str(COV)+\
44 | " -notmatched "+WORK+"prefix/cat._tempU"
45 | os.system(cmd)
46 | #subprocess.call(cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
47 |
48 |
49 | def flip(a):
50 | if a == "+":
51 | return "-"
52 | elif a == "-":
53 | return "+"
54 |
55 |
56 | def makeclust(ID, datatype, WORK):
57 |
58 | " load tier 2 hits (names,direction) into a Dic with seeds as keys"
59 | Uin = open(WORK+"prefix/cat.u")
60 | Fseeds = {}
61 | for line in [line.split("\t") for line in Uin.readlines()]:
62 | if line[1] not in Fseeds:
63 | Fseeds[line[1]] = [(line[0],line[4])]
64 | else:
65 | Fseeds[line[1]].append((line[0],line[4]))
66 | Uin.close()
67 |
68 |
69 | " load tier 1 hits (names,direction) into a Dictionary with seeds as keys"
70 | FS = glob.glob(WORK+"prefix/cat.u_*")
71 | Useeds = {}
72 | for f in FS:
73 | infile = open(f)
74 | for line in [line.split("\t") for line in infile.readlines()]:
75 | if line[1] not in Useeds:
76 | Useeds[line[1]] = [(line[0],line[4])]
77 | else:
78 | Useeds[line[1]].append((line[0],line[4]))
79 | infile.close()
80 |
81 |
82 | " Make one dictionary with combining Fseeds and Useeds matching to Fseeds"
83 | D = {}
84 | for seed in Fseeds:
85 | # add matches to seed to D[seed]
86 | Fhits = Useeds.get(seed)
87 | # add matches to hits to seed to D[seed]
88 | Mhits = []
89 | for hit in Fseeds[seed]:
90 | Mhits.append(hit)
91 | ugh = Useeds.get(hit[0])
92 | if ugh:
93 | if hit[1] == "-":
94 | if len(ugh) == 1:
95 | Mhits += [(ugh[0][0],flip(ugh[0][1]))]
96 | elif len(ugh) > 1:
97 | for child in ugh:
98 | Mhits += [(child[0], flip(child[1]))]
99 | else:
100 | Mhits += ugh
101 | if Fhits:
102 | D[(seed,'s')] = Fhits+Mhits
103 | else:
104 | D[(seed,'s')] = Mhits
105 |
106 |
107 | " load seeds of tier 2 into D and set its Useed hits"
108 | f = open(WORK+"prefix/cat._tempU")
109 | lines = f.readlines()
110 | for line in lines:
111 | if ">" in line:
112 | if (line.strip()[1:],'s') not in D:
113 | if Useeds.get(line.strip()[1:]):
114 | D[(line.strip()[1:],'s')] = Useeds.get(line.strip()[1:])
115 | f.close()
116 |
117 | " load .consens files into Dics "
118 | FS = glob.glob(WORK+"clust"+ID+"/cat.consens_*.gz")
119 | Seqs = {}
120 | for f in FS:
121 | with gzip.open(f) as ff:
122 | k = itertools.izip(*[iter(ff)]*2)
123 | while 1:
124 | try: a = k.next()
125 | except StopIteration: break
126 | Seqs[a[0].strip()] = a[1].strip()
127 |
128 |
129 | " write clust file "
130 | outfile = gzip.open(WORK+"prefix/cat.clust_.gz", 'w')
131 | for i in D:
132 | thisclust = []
133 | outfile.write(">"+i[0]+'\n'+Seqs[">"+i[0]].upper()+'\n')
134 | thisclust.append(">"+i[0]+'\n'+Seqs[">"+i[0]].upper())
135 | for m in D[i]:
136 | if ">"+m[0]+'\n'+Seqs[">"+m[0]].upper() not in thisclust:
137 | if m[1] == "-":
138 | outfile.write(">"+m[0]+'\n'+comp(Seqs[">"+m[0]].upper())[::-1]+'\n')
139 | thisclust.append(">"+m[0]+'\n'+comp(Seqs[">"+m[0]].upper())[::-1])
140 | else:
141 | outfile.write(">"+m[0]+'\n'+Seqs[">"+m[0]].upper()+'\n')
142 | thisclust.append(">"+m[0]+'\n'+Seqs[">"+m[0]].upper())
143 | outfile.write("//\n")
144 | outfile.close()
145 |
146 |
147 |
148 | def main(UCLUST, ID, datatype,
149 | gids, seed, WORK, MASK):
150 |
151 | sys.stderr.write('\n\tstep 6: clustering across cons-samples at '+`ID`+' similarity \n')
152 |
153 | " read in all seeds and hits "
154 | seeds = [WORK+"prefix/cat.seed_"+gid for gid in gids]
155 | temps = [WORK+"prefix/cat._temp_"+gid for gid in gids]
156 |
157 | #print seeds
158 | #print temps
159 |
160 | " read in all seeds and make same length for randomizing "
161 | out = gzip.open(WORK+'prefix/cat.group_.gz','wb')
162 | for handle in seeds:
163 | f = open(handle,'r')
164 | k = itertools.izip(*[iter(f)]*3)
165 | while 1:
166 | try: a = k.next()
167 | except StopIteration: break
168 | if len(a[0].strip()) < 100:
169 | " seriously, don't have names longer than 100 chars "
170 | out.write(a[0].strip()+" "*(100-len(a[0].strip()))+a[1])
171 | else:
172 | out.write(a[0].strip()+" "*((len(a[0].strip())+3)-len(a[0].strip()))+a[1])
173 | print "long name lengths may cause errors"
174 | f.close()
175 | out.close()
176 |
177 | """ randomize input order """
178 | if seed:
179 | random.seed(seed)
180 | with gzip.open(WORK+'prefix/cat.group_.gz','rb') as source:
181 | data = [ (random.random(), line) for line in source ]
182 | data.sort()
183 |
184 | """ sort by length while preserving randomization within size classes """
185 | D = [line for _,line in data]
186 | D.sort(key=len, reverse=True)
187 | k = iter(D)
188 | out = open(WORK+'prefix/cat.consens_','w')
189 | while 1:
190 | try: a = k.next().split(" ")
191 | except StopIteration: break
192 | ss = a[-1].replace("a","A").replace("g","G").replace("c","C").replace("t","T").strip()
193 | print >>out, a[0]+'\n'+ss
194 | out.close()
195 |
196 | cluster(UCLUST, ID, datatype, WORK, MASK)
197 | makeclust(ID, datatype, WORK)
198 |
199 |
200 | #if glob.glob(WORK+"prefix/*.seed_*") or glob.glob(WORK+"prefix/*._temp_*"):
201 | # os.system("rm "+WORK+"prefix/*.seed_*")
202 | # os.system("rm "+WORK+"prefix/*._temp_*")
203 | # os.system("rm "+WORK+"prefix/*.u_*")
204 |
205 |
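206 | ## Illustrative sketch of the command that cluster() assembles (paths and
207 | ## values made up; a non-GBS datatype, vsearch, and MASK="dust" assumed):
208 | ##   vsearch -cluster_smallmem /work/prefix/cat.consens_ -leftjust -id .85
209 | ##     -qmask dust -threads 6 -userout /work/prefix/cat.u
210 | ##     -userfields query+target+id+gaps+qstrand+qcov -maxaccepts 1
211 | ##     -maxrejects 0 -fulldp -query_cov .90 -notmatched /work/prefix/cat._tempU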
--------------------------------------------------------------------------------
/pyrad/loci2phynex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy as np
4 | import sys
5 | import os
6 | import glob
7 |
8 |
9 | def update(idict, count, WORK, outname):
10 | """ updates dictionary with the next .5M reads
11 | from the super long string phylip file. Makes
12 | for faster reading. """
13 |
14 | data = iter(open(WORK+"outfiles/"+outname+".phy"))
15 | ntax, nchar = data.next().strip().split()
16 |
17 | ## read in max N bp at a time
18 | for line in data:
19 | tax, seq = line.strip().split()
20 | idict[tax] = idict[tax][100000:]
21 | idict[tax] += seq[count:count+100000]
22 | del line
23 |
24 | return idict
25 |
26 |
27 |
28 | def makephy(WORK, outname, names, longname):
29 |     """ builds phy output. For large data sets, writes sequences to tmp
30 |     files 10,000 loci at a time and rebuilds the full matrix at the end """
31 |
32 | " order names "
33 | names = list(names)
34 | names.sort()
35 |
36 | " read in loci file "
37 | locus = iter(open(WORK+"outfiles/"+outname+".loci", 'rb'))
38 |
39 | " dict for saving the full matrix "
40 | fdict = {name:[] for name in names}
41 |
42 | " list for saving locus number and locus range for partitions "
43 | partitions = []
44 | loc_number = 1
45 | initial_pos = 1
46 |
47 | " remove empty column sites and append edited seqs to dict F "
48 | done = 0
49 | nloci = 0
50 | nbases = 0
51 | while not done: #nloci < 50000: #not done:
52 | seqs = []
53 | #arrayed = np.array([])
54 | anames = []
55 | while 1:
56 | ## get next locus
57 | try:
58 | samp = locus.next()
59 | except StopIteration:
60 | done = 1
61 | break
62 | if "//" in samp:
63 | nloci += 1
64 | break
65 | else:
66 | try:
67 | name, seq = samp.split()
68 | except ValueError:
69 | print samp
70 | anames.append(name[1:])
71 | seqs.append(seq.strip())
72 | ## reset
73 | arrayed = np.array([list(i) for i in seqs])
74 | if done:
75 | break
76 | ## create mask for columns that are empty or
77 | ## that are paired-end separators (compatible w/ pyrad v2 and v3)
78 | #mask = [i for i in range(len(arrayed.T)) if np.any([
79 | ## still surely a better way to vectorize this...
80 | mask = [i for i in arrayed.T if any([j not in list("-Nn") for j in i])]
81 | masked = np.dstack(mask)[0]
82 |
83 | ## partition information
84 | loc_name = "p"+str(nloci)
85 | loc_range = str(initial_pos) + "-" +\
86 | str(len(masked[0]) + initial_pos -1)
87 | initial_pos += len(masked[0])
88 | partitions.append(loc_name+"="+loc_range)
89 |
90 | ## uncomment to print block info (used to partition by locus)
91 | #blockend += minray
92 | #print blockend,
93 | #print loc
94 | #print arrayed
95 |
96 | ## append data to dict
97 | for name in names:
98 | if name in anames:
99 | #fdict[name].append(arrayed[anames.index(name), mask].tostring())
100 | fdict[name].append(masked[anames.index(name),:].tostring())
101 | else:
102 | fdict[name].append("N"*masked.shape[1])
103 | #fdict[name].append("N"*len(arrayed[0, mask]))
104 | ## add len to total length
105 | nbases += len(fdict[name][-1])
106 |
107 | ## after x iterations tmp pickle fdict?
108 | if not nloci % int(1e4):
109 | ## concat strings
110 | for name in fdict:
111 | with open(os.path.join(WORK, "tmp",
112 | "{}_{}.tmp".format(name, nloci)), 'wb') as wout:
113 | wout.write("".join(fdict[name]))
114 | del fdict
115 | fdict = {name:[] for name in names}
116 |
117 |     ## print out .PHY file; if really big, pull from multiple tmp files
118 | superout = open(WORK+"outfiles/"+outname+".phy", 'wb')
119 | print >>superout, len(names), nbases
120 | if nloci < 1e4:
121 | for name in names:
122 | print >>superout, name+(" "*((longname+3)-\
123 | len(name)))+"".join(fdict[name])
124 | else:
125 | for name in names:
126 | superout.write("{}{}{}".format(
127 | name,
128 | " "*((longname+3)-len(name)),
129 | "".join(fdict[name])))
130 | tmpfiles = glob.glob(os.path.join(WORK, "tmp", name+"*.tmp"))
131 | tmpfiles.sort()
132 | for tmpf in tmpfiles:
133 | with open(tmpf, 'rb') as tmpin:
134 | superout.write(tmpin.read())
135 | superout.write("\n")
136 | superout.close()
137 | raxml_part_out = open(WORK+"outfiles/"+outname+".phy.partitions", 'w')
138 | for partition in partitions:
139 | print >>raxml_part_out, "DNA, %s" % (partition)
140 | raxml_part_out.close()
141 |
142 | return partitions
143 |
144 |
145 | def makenex(WORK, outname, names, longname, partitions):
146 | """ PRINT NEXUS """
147 |
148 | " make nexus output "
149 | data = iter(open(WORK+"outfiles/"+outname+".phy"))
150 | nexout = open(WORK+"outfiles/"+outname+".nex", 'wb')
151 |
152 | ntax, nchar = data.next().strip().split(" ")
153 |
154 | print >>nexout, "#NEXUS"
155 | print >>nexout, "BEGIN DATA;"
156 | print >>nexout, " DIMENSIONS NTAX=%s NCHAR=%s;" % (ntax,nchar)
157 | print >>nexout, " FORMAT DATATYPE=DNA MISSING=N GAP=- INTERLEAVE=YES;"
158 | print >>nexout, " MATRIX"
159 |
160 | idict = {}
161 |
162 | ## read in max 1M bp at a time
163 | for line in data:
164 | tax, seq = line.strip().split()
165 | idict[tax] = seq[0:100000]
166 | del line
167 |
168 | nameorder = idict.keys()
169 | nameorder.sort()
170 |
171 | n=0
172 | tempn=0
173 | sz = 100
174 | while n < len(seq):
175 | for tax in nameorder:
176 | print >>nexout, " "+tax+" "*\
177 | ((longname-len(tax))+3)+\
178 | idict[tax][tempn:tempn+sz]
179 | n += sz
180 | tempn += sz
181 | print >>nexout, ""
182 |
183 | if not n % 100000:
184 | #print idict[tax][tempn:tempn+sz]
185 | idict = update(idict, n, WORK, outname)
186 | tempn -= 100000
187 |
188 | print >>nexout, ';'
189 | print >>nexout, 'END;'
190 |
191 | ### partitions info
192 | print >>nexout, "BEGIN SETS;"
193 | for partition in partitions:
194 | print >>nexout, " CHARSET %s;" % (partition)
195 | print >>nexout, "END;"
196 |
197 | nexout.close()
198 |
199 |
200 | def make(WORK, outfile, names, longname, formats):
201 | partitions = makephy(WORK, outfile, names, longname)
202 | makenex(WORK, outfile, names, longname, partitions)
203 |
204 |
205 | if __name__ == "__main__":
206 | make()
207 |
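208 | ## Illustrative sketch of the outputs (taxa and numbers made up). The .phy
209 | ## file starts with "<ntax> <nchar>" and has one padded row per sample:
210 | ##   4 210
211 | ##   1A0        TTCCTGCAGTG...NNNNNNN...
212 | ## and the .phy.partitions file lists one RAxML-style range per locus:
213 | ##   DNA, p1=1-92
214 | ##   DNA, p2=93-210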
--------------------------------------------------------------------------------
/pyrad/loci2SNP.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy as np
4 | import sys
5 | import gzip
6 | try:
7 | from collections import Counter
8 | except ImportError:
9 | from counter import Counter
10 | from itertools import chain
11 | import alignable
12 |
13 |
14 | def make(WORK, outname, names, formats, seed, ploidy):
15 | np.random.seed(int(seed))
16 | finalfile = open(WORK+"outfiles/"+outname+".loci").read()
17 | longname = max(map(len,names))
18 |
19 | " output .snps and .unlinked_snps"
20 | S = {} ## snp dict
21 | Si = {} ## unlinked snp dict
22 | for name in list(names):
23 | S[name] = []
24 | Si[name] = []
25 |
26 | " record bi-allelic snps"
27 | nobis = 0
28 |
29 | " for each locus select out the SNPs"
30 | for loc in finalfile.strip().split("|")[:-1]:
31 | pis = ""
32 | ns = []
33 | ss = []
34 | cov = {} ## record coverage for each SNP
35 | for line in loc.split("\n"):
36 | if ">" in line:
37 | ns.append(line.split()[0].replace(">",""))
38 | ss.append(line.split()[-1])
39 | else:
40 | pis = [i[0] for i in enumerate(line) if i[1] in list('*-')]
41 |
42 | " assign snps to S, and record coverage for usnps"
43 | for tax in S:
44 | if tax in ns:
45 | if pis:
46 | for snpsite in pis:
47 | snpsite -= (longname+5)
48 | S[tax].append(ss[ns.index(tax)][snpsite])
49 | if snpsite not in cov:
50 | cov[snpsite] = 1
51 | else:
52 | cov[snpsite] += 1
53 | "downweight selection of gap sites "
54 | if ss[ns.index(tax)][snpsite] != '-':
55 | cov[snpsite] += 1
56 | else:
57 | if pis:
58 | for snpsite in pis:
59 | S[tax].append("N")
60 | Si[tax].append("N")
61 |
62 | " randomly select among snps w/ greatest coverage for unlinked snp "
63 | maxlist = []
64 | for j,k in cov.items():
65 | if k == max(cov.values()):
66 | maxlist.append(j)
67 |
68 | " Is bi-allelic after resolution of ambigs? "
69 | bisnps = []
70 | for maxl in maxlist:
71 | bases = [ss[ns.index(tax)][maxl] for tax in S if tax in ns]
72 | ambigs = list(chain(*[alignable.unstruct(i) for i in bases if i in "RSWYMK"]))
73 | bases = set(bases+ambigs)
74 | for ambig in "RSWYMKN-":
75 | bases.discard(ambig)
76 | if len(bases) <= 2:
77 | bisnps.append(maxl)
78 |
79 | #rando = pis[np.random.randint(len(pis))]
80 | #rando -= (longname+5)
81 | if bisnps:
82 | rando = bisnps[np.random.randint(len(bisnps))]
83 | elif maxlist:
84 | rando = maxlist[np.random.randint(len(maxlist))]
85 |
86 |         ## record how many loci have no bi-allelic SNPs
87 | tbi = 0
88 | for tax in S:
89 | if tax in ns:
90 | if pis:
91 | " if none are bi-allelic "
92 | if not bisnps:
93 | tbi = 1
94 | Si[tax].append(ss[ns.index(tax)][rando])
95 | if pis:
96 | " add spacer between loci "
97 | S[tax].append(" ")
98 | else:
99 | " invariable locus "
100 | S[tax].append("_ ")
101 | nobis += tbi
102 | " names "
103 | SF = list(S.keys())
104 | SF.sort()
105 |
106 | " print out .SNP file "
107 | if 's' in formats:
108 | snpsout = open(WORK+'outfiles/'+outname+".snps",'w')
109 | print >>snpsout, "## %s taxa, %s loci, %s snps" % (len(S),
110 | len("".join(S.values()[0]).split(" "))-1,
111 | len("".join(S[SF[0]]).replace(" ","")))
112 | for i in SF:
113 | print >>snpsout, i+(" "*(longname-len(i)+3))+"".join(S[i])
114 | snpsout.close()
115 |
116 |
117 | " print out .USNP file "
118 | snpout = open(WORK+'outfiles/'+outname+".unlinked_snps",'w')
119 | print >>snpout, len(Si),len("".join(Si.values()[0]))
120 | for i in SF:
121 | print >>snpout, i+(" "*(longname-len(i)+3))+"".join(Si[i])
122 | snpout.close()
123 |
124 | statsout = open(WORK+"stats/"+outname+".stats",'a')
125 | print >>statsout, "sampled unlinked SNPs=",len(Si.values()[0])
126 | print >>statsout, "sampled unlinked bi-allelic SNPs=", len(Si.values()[0])-nobis
127 | statsout.close()
128 |
129 | if 'k' in formats:
130 | "print out .str (structure) file "
131 | structout = open(WORK+'outfiles/'+outname+".str", 'w')
132 |
133 | B = {'A': '0',
134 | 'T': '1',
135 | 'G': '2',
136 | 'C': '3',
137 | 'N': '-9',
138 | '-': '-9'}
139 | if ploidy > 1:
140 | for line in SF:
141 | print >>structout, line+(" "*(longname-len(line)+3))+\
142 | "\t"*6+"\t".join([B[alignable.unstruct(j)[0]] for j in Si[line]])
143 | print >>structout, line+(" "*(longname-len(line)+3))+\
144 | "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]])
145 | else:
146 | for line in SF:
147 | print >>structout, line+(" "*(longname-len(line)+3))+\
148 | "\t"*6+"\t".join([B[alignable.unstruct(j)[1]] for j in Si[line]])
149 | structout.close()
150 |
151 |
152 | if 'g' in formats:
153 | "print out .geno file "
154 | genoout = open(WORK+'outfiles/'+outname+".usnps.geno", 'w')
155 | for i in range(len(Si.values()[0])):
156 | getref = 0
157 | ref = "N"
158 | while ref == "N":
159 | ref = alignable.unstruct(Si[SF[getref]][i])[0]
160 | getref += 1
161 | SNProw = "".join(map(str,[alignable.unstruct(Si[j][i]).count(ref) if Si[j][i] != "N" \
162 | else "9" for j in SF]))
163 | ## print ref,SNProw
164 | if len(set(SNProw)) > 1:
165 | print >>genoout, SNProw
166 | genoout.close()
167 |
168 | if 'g' in formats:
169 | "print out .geno file "
170 | genoout = open(WORK+'outfiles/'+outname+".snps.geno", 'w')
171 | for i in range(len(S.values()[0])):
172 | if S[SF[0]][i].strip("_").strip():
173 | getref = 0
174 | ref = "N"
175 | while ref == "N":
176 | #print i, S[SF[0]][i]
177 | ref = alignable.unstruct(S[SF[getref]][i])[0]
178 | getref += 1
179 | SNProw = "".join(map(str,[alignable.unstruct(S[j][i]).count(ref) if \
180 | S[j][i] != "N" else "9" for j in SF]))
181 | ## print ref,SNProw
182 | if len(set(SNProw)) > 1:
183 | print >>genoout, SNProw
184 | genoout.close()
185 |
186 |
187 | if __name__ == "__main__":
188 | make(WORK, outname, names, formats, seed, ploidy)
189 |
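190 | ## Illustrative sketch of two of the outputs (names and bases made up). The
191 | ## .unlinked_snps file holds one randomly sampled SNP per variable locus,
192 | ## after a "<ntaxa> <nsnps>" header line:
193 | ##   4 3
194 | ##   1A0     TAC
195 | ##   1B0     TGC
196 | ## and each .geno row gives the per-sample count (0/1/2) of the reference
197 | ## allele at one unlinked SNP, with 9 for missing data:
198 | ##   2209
199 | ##   0212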
--------------------------------------------------------------------------------
/pyrad/overlapcheck.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import glob
4 | import multiprocessing
5 | import gzip
6 | import subprocess
7 | import sys
8 | import os
9 | import itertools
10 | from potpour import Worker
11 |
12 |
13 | def mergepairs(WORK, UCLUST, handle, match, Q):
14 |
15 | handle1 = handle
16 | handle2 = handle.replace("_R1.","_R2.")
17 | outfile = handle.replace("_R1.","M.")
18 |
19 | while outfile.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
20 | outfile = outfile.replace('.'+outfile.split(".")[-1], "")
21 | outfile = outfile.split("/")[-1]
22 | outfile = WORK+"mergedreads/"+outfile+".fq"
23 |
24 |     if handle1 and handle2:
25 | if ".gz" in handle1[-4:]:
26 | k1 = itertools.izip(*[iter(gzip.open(handle1))]*4)
27 | k2 = itertools.izip(*[iter(gzip.open(handle2))]*4)
28 | thandle1 = WORK+"mergedreads/"+handle1.split("/")[-1].replace(".gz",".temp2")
29 | thandle2 = WORK+"mergedreads/"+handle2.split("/")[-1].replace(".gz",".temp2")
30 | numout1 = open(thandle1, 'w')
31 | numout2 = open(thandle2, 'w')
32 | else:
33 | k1 = itertools.izip(*[iter(open(handle1))]*4)
34 | k2 = itertools.izip(*[iter(open(handle2))]*4)
35 | thandle1 = WORK+"mergedreads/"+handle1.split("/")[-1]+".temp2"
36 | thandle2 = WORK+"mergedreads/"+handle2.split("/")[-1]+".temp2"
37 | numout1 = open(thandle1, 'w')
38 | numout2 = open(thandle2, 'w')
39 | else:
40 | print "pair missing"
41 | sys.exit()
42 |
43 | N1 = []
44 | N2 = []
45 | cnt = 0
46 |
47 | while 1:
48 | try: d = k1.next()
49 | except StopIteration: break
50 | e = k2.next()
51 | N1.append("".join([d[0].strip()+"_"+str(cnt)+"\n",d[1],d[2],d[3]]))
52 | N2.append("".join([e[0].strip()+"_"+str(cnt)+"\n",e[1],e[2],e[3]]))
53 | cnt+=1
54 | if not cnt % 50000:
55 | numout1.write("".join(N1))
56 | numout2.write("".join(N2))
57 | N1 = []
58 | N2 = []
59 | numout1.write("".join(N1))
60 | numout2.write("".join(N2))
61 | numout1.close()
62 | numout2.close()
63 |
64 | cmd = UCLUST+\
65 | " -fastq_mergepairs "+thandle1 +\
66 | " -reverse "+thandle2 +\
67 | " -fastq_maxdiffs 6 " +\
68 | " -fastq_truncqual 2 " +\
69 | " -fastq_minlen 36 " +\
70 | " -fastq_minmergelen 50 "+\
71 | " -fastqout "+outfile +\
72 | " -fastq_allowmergestagger" +\
73 | " -quiet "
74 | subprocess.call(cmd, shell=True)
75 |
76 |
77 | stats = statsout(thandle1, thandle2, outfile, WORK)
78 | sys.stderr.write(".")
79 | return stats
80 |
81 |
82 | def statsout(h1,h2,m,WORK):
83 | " remove merged reads from 1st & 2nd read files "
84 |
85 | " stat counters "
86 | cnt = 0
87 | mcnt = 0
88 |
89 | " create list of merged IDs "
90 | MIDS = []
91 | if os.path.exists(m):
92 | merged = open(m, 'r')
93 | for line in itertools.izip(*[iter(merged)]*4):
94 | MIDS.append(int(line[0].strip().split("_")[-1]))
95 | merged.close()
96 | ## if not...
97 |
98 | if ".gz" in h1[-5:]:
99 | hand1 = gzip.open(h1, 'rb')
100 | hand2 = gzip.open(h2, 'rb')
101 | else:
102 | hand1 = open(h1, 'r')
103 | hand2 = open(h2, 'r')
104 |
105 | r1 = itertools.izip(*[iter(hand1)]*4)
106 | r2 = itertools.izip(*[iter(hand2)]*4)
107 |
108 | " lists to write output "
109 | ONE = []
110 | TWO = []
111 |
112 | " outfile names for mergeless reads "
113 | outname = WORK+"fastq/"+h1.split("/")[-1].replace(".temp2",".nomerge")+".gz"
114 |
115 | if os.path.exists(outname):
116 | os.remove(outname)
117 | outname2 = outname.replace("_R1.","_R2.")
118 | if os.path.exists(outname2):
119 | os.remove(outname2)
120 |
121 | while 1:
122 | try: one = r1.next()
123 | except StopIteration: break
124 | two = r2.next()
125 | cnt += 1
126 | find = int(one[0].strip().split("_")[-1])
127 | if MIDS:
128 | if find == MIDS[0]:
129 | "reads were merged, don't write to file"
130 | mcnt += 1
131 | MIDS.pop(0)
132 | else:
133 | ONE.append(one) #[i.strip() for i in one])
134 | TWO.append(two) #[i.strip() for i in two])
135 | else:
136 | ONE.append(one) #[i.strip() for i in one])
137 | TWO.append(two) #[i.strip() for i in two])
138 |
139 | if not cnt % 10000:
140 | outfile = gzip.open(outname, 'ab')
141 | outfile.write("".join(["".join(i) for i in ONE]))
142 | outfile.close()
143 | outfile2 = gzip.open(outname2, 'ab')
144 | outfile2.write("".join(["".join(i) for i in TWO]))
145 | outfile2.close()
146 | ONE = []
147 | TWO = []
148 |
149 |
150 | if os.path.exists(h1):
151 | cmd1 = "/bin/rm "+h1
152 | cmd2 = "/bin/rm "+h2
153 | subprocess.call(cmd1, shell=True)
154 | subprocess.call(cmd2, shell=True)
155 |
156 | outfile = gzip.open(outname, 'ab')
157 | outfile.write("".join(["".join(i) for i in ONE]))
158 | outfile.close()
159 | outfile2 = gzip.open(outname2, 'ab')
160 | outfile2.write("".join(["".join(i) for i in TWO]))
161 | outfile2.close()
162 | sys.stderr.write(".")
163 | return [outname,mcnt]
164 |
165 |
166 |
167 | def main(WORK, UCLUST, FQs, match, Q, Parallel):
168 |
169 | " create output directories "
170 | if not os.path.exists(WORK+'fastq/'):
171 | os.makedirs(WORK+'fastq')
172 | if not os.path.exists(WORK+'mergedreads'):
173 | os.makedirs(WORK+'mergedreads')
174 | if not os.path.exists(WORK+'stats'):
175 | os.makedirs(WORK+'stats')
176 |
177 |
178 | submitted = 0
179 | work_queue = multiprocessing.Queue()
180 |
181 | names = [i for i in glob.glob(FQs) if "_R1.fq" in i]
182 |
183 | " submit jobs to queue "
184 | if len(names) > 1:
185 | for handle in names:
186 | if "nomerge." not in handle:
187 | n = str(handle.split('/')[-1]).replace("_R1.",".")
188 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
189 | n = n.replace('.'+n.split(".")[-1], "")
190 | finder = WORK+'edits/'+n+".edit"
191 | if finder not in glob.glob(WORK+"edits/*"):
192 | if os.stat(handle).st_size > 0: ## exclude empty files
193 | if os.path.exists(handle.replace("_R1.","_R2.")):
194 | if not os.path.exists(handle.replace(".fq",".nomerge.fq")):
195 | args = [WORK, UCLUST, handle, match, Q]
196 | work_queue.put(args)
197 | submitted += 1
198 | else:
199 | print "merge file already created for", handle.split("/")[-1]
200 | else:
201 | print "cannot find 2nd read file for", handle.split("/")[-1]
202 | else:
203 | print "\t"+finder+" already in edits/"
204 | else:
205 | if not names:
206 | if [i for i in glob.glob(FQs) if "_R1_." in i]:
207 | print "\n\tfile names should have _R1. not _R1_."
208 | print "\n\tcannot find input files"
209 | sys.exit()
210 | else:
211 | work_queue.put([WORK, UCLUST, names[0], match, Q])
212 | submitted += 1
213 |
214 | " create a queue to pass to workers to store the results "
215 | result_queue = multiprocessing.Queue()
216 |
217 |
218 | " spawn workers, give function "
219 | jobs = []
220 | for i in range( min(Parallel,submitted) ):
221 | worker = Worker(work_queue, result_queue, mergepairs)
222 | worker.start()
223 | jobs.append(worker)
224 | for job in jobs:
225 | job.join()
226 |
227 | if submitted > 0:
228 | statout = open(WORK+"stats/s2.mergedreads.txt",'w')
229 | print >>statout, "\t".join(["taxon","mergedreads"])
230 |
231 | for i in range(submitted):
232 | stat = result_queue.get()
233 | a,b = stat
234 | n = a.strip().split("/")[-1].replace(".nomerge.gz","")
235 | print >>statout, "\t".join([n,str(b)])
236 | print >>statout, "\nmerged reads written to", WORK+"mergedreads/ "
237 | statout.close()
238 |
--------------------------------------------------------------------------------
/pyrad/editraw_rads.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import multiprocessing
4 | import itertools
5 | import sys
6 | import os
7 | import glob
8 | import operator
9 | import gzip
10 | from potpour import Worker
11 | from sortandcheck2 import unambig
12 | from cluster_cons7_shuf import comp
13 |
14 |
15 |
16 | def unambar(CUT):
17 | if any([i in CUT for i in list("RKYSWM")]):
18 | CUTa, CUTb = unambig(CUT)
19 | return [CUTa,CUTb]
20 | else:
21 | return False
22 |
23 |
24 | def Afilter(CUT,s,strict):
25 | a = b = wheretocut = None
26 | " lookfor cut site "
27 | if unambar(CUT):
28 | " if ambiguity in cutter "
29 | CUTa,CUTb = unambar(CUT)
30 | if strict == 2:
31 | lookfor1 = CUTa+"A"
32 | lookfor2 = CUTb+"A"
33 | else:
34 | lookfor1 = CUTa+"AGA"
35 | lookfor2 = CUTb+"AGA"
36 | if lookfor1 in s:
37 | a = s.rindex(lookfor1)
38 | if lookfor2 in s:
39 | b = s.rindex(lookfor2)
40 | if (a or b):
41 | wheretocut = min([i for i in [a,b] if i])
42 | else:
43 | wheretocut = None
44 | else:
45 | if strict == 2:
46 | lookfor = CUT+"A"
47 | else:
48 | lookfor = CUT+"AGA"
49 | if lookfor in s:
50 | wheretocut = s.rindex(lookfor)
51 | else:
52 | wheretocut = None
53 |
54 | if not wheretocut:
55 | " look for adapter sequence "
56 | if strict == 2:
57 | lookfor1 = "AGATCG"
58 | else:
59 | lookfor1 = "AGATCGGA"
60 | if lookfor1 in s:
61 | wheretocut = s.rindex(lookfor1)-(len(CUT)+1)
62 | else:
63 | wheretocut = None
64 |
65 | " look for CUT at end of seq "
66 | if not wheretocut:
67 | if CUT in s[-len(CUT)-5:]:
68 | wheretocut = s.rindex(CUT)
69 | return wheretocut
70 |
71 |
72 |
73 |
74 |
75 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q, datatype):
76 |     """ three functions: (1) replaces low quality base calls with Ns,
77 |     (2) checks for adapter sequence if strict is set to 1 or 2,
78 |     (3) discards reads with too many Ns or below the minimum length """
79 |
80 | if "," in CUT:
81 | CUT1,CUT2 = CUT.split(',')
82 | else:
83 | CUT1=CUT2=CUT
84 |
85 | if ".gz" in infile:
86 | f = gzip.open(infile, 'r')
87 | else:
88 | f = open(infile,'r')
89 | n = str(infile.split('/')[-1]).replace("_R1.",".")
90 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
91 | n = n.replace('.'+n.split(".")[-1], "")
92 | k = itertools.izip(*[iter(f)]*4)
93 | writing_r = []
94 | writing_c = []
95 |
96 | orig = keep = keepcut = 0
97 | handle = WORK+'edits/'+str(n)+".edit"
98 |
99 | while 1:
100 | try: d = k.next()
101 | except StopIteration: break
102 | orig += 1
103 | SS = d[1].strip()
104 |
105 | ph = map(ord,d[3].strip('\n'))
106 | offset = int(Q)
107 | phred = map(lambda x:x-offset,ph)
108 | seq = ["N"]*len(phred)
109 | for base in range(len(phred)):
110 | if base >= len(CUT1): ## don't quality check cut site
111 | if phred[base] >= 20: ## quality threshold
112 | try: seq[base] = SS[base]
113 | except IndexError:
114 | None
115 | else:
116 | seq[base] = "N"
117 | else:
118 | if unambar(CUT1):
119 | seq[base] = unambar(CUT1)[0][base]
120 | else:
121 | seq[base] = CUT1[base]
122 | #try: seq[base] = SS[base]
123 | #except IndexError:
124 | # None
125 |
126 | if not orig % 5000:
127 | if trimkeep:
128 | " write full length and fragment reads "
129 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
130 | outfile.write("".join([z for z in writing_r]))
131 | outfile.write("".join([z for z in writing_c]))
132 | else:
133 | " write only full length reads "
134 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
135 | outfile.write("".join([z for z in writing_r]))
136 | writing_r = []
137 | writing_c = []
138 |
139 | s = "".join(seq)
140 | wheretocut1 = None
141 | if strict:
142 | wheretocut1 = Afilter(comp(CUT2)[::-1],s,strict)
143 | s = s[:wheretocut1]
144 |
145 | if datatype == 'merged':
146 | " remove extra forward base so forwards match reverse length"
147 | s = s[:-1]
148 |
149 | if s.count("N") <= pN: ## max allowed Ns
150 | if len(s) >= max(32,trimkeep): ## if read is trimmed, must be minlen long
151 | if wheretocut1: ## if it was trimmed...
152 | writing_c.append(">"+n+"_"+str(keepcut)+"_c1"+"\n"+s+"\n")
153 | keepcut += 1
154 | else:
155 | writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n")
156 | keep += 1
157 |
158 | if trimkeep:
159 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
160 | outfile.write("".join([z for z in writing_r]))
161 | outfile.write("".join([z for z in writing_c]))
162 | else:
163 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
164 | outfile.write("".join([z for z in writing_r]))
165 | writing_r = []
166 | writing_c = []
167 |
168 | f.close()
169 | sys.stderr.write(".")
170 | if not trimkeep:
171 | keepcut = 0
172 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keep),str(keepcut)]
173 |
174 |
175 |
176 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
177 | print >>sys.stderr, "\tstep 2: editing raw reads \n\t",
178 |
179 | " create output directories "
180 | if not os.path.exists(WORK+'stats'):
181 | os.makedirs(WORK+'stats')
182 | if not os.path.exists(WORK+'edits'):
183 | os.makedirs(WORK+'edits')
184 |
185 | " load up work queue "
186 | submitted = 0
187 | work_queue = multiprocessing.Queue()
188 | if len(glob.glob(FQs)) > 1:
189 | FS = glob.glob(FQs)
190 |
191 | " order files by size "
192 | for i in range(len(FS)):
193 | statinfo = os.stat(FS[i])
194 | FS[i] = FS[i],statinfo.st_size
195 | FS.sort(key=operator.itemgetter(1))
196 | FS = [i[0] for i in FS][::-1]
197 |
198 | " submit jobs to queue "
199 | for handle in FS:
200 | finder = WORK+'edits/'+handle.split("/")[-1]
201 | while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
202 | finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","")
203 | if finder+".edit" not in glob.glob(WORK+"edits/*"):
204 | if os.stat(handle).st_size > 0: ## exclude empty files
205 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype]
206 | work_queue.put(args)
207 | submitted += 1
208 | else:
209 | print "skipping",handle,", file is empty"
210 | else:
211 | print "\t"+finder+" already in edits/"
212 |
213 | elif len(glob.glob(FQs)) == 1:
214 | " if only one file "
215 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype])
216 | submitted += 1
217 |
218 | else:
219 | print "\tNo demultiplexed files found. Check path."
220 | sys.exit()
221 |
222 | " create a queue to pass to workers to store the results "
223 | result_queue = multiprocessing.Queue()
224 |
225 | " spawn workers, give function "
226 | jobs = []
227 | for i in range( min(Parallel,submitted) ):
228 | worker = Worker(work_queue, result_queue, rawedit)
229 | worker.start()
230 | jobs.append(worker)
231 | for job in jobs:
232 | job.join()
233 |
234 |
235 | " collect the results off the queue "
236 | outstats = open(WORK+"stats/s2.rawedit.txt",'a')
237 | print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"])
238 | STATS = []
239 | for i in range(submitted):
240 | STATS.append(result_queue.get())
241 |
242 | STATS.sort(key = lambda x: x[0])
243 | for i in range(submitted):
244 | a,b,c,d = STATS[i]
245 | print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))])
246 |
247 | print >>outstats, """
248 | Nreads = total number of reads for a sample
249 | passed = retained reads that passed quality filtering at full length
250 | passed.w.trim= retained reads that were trimmed due to detection of adapters
251 | passed.total = total kept reads of sufficient length
252 | note: you can set the option in params file to include trimmed reads of xx length. """
253 | outstats.close()
254 |
255 | #" zip files to save size "
256 | #for ff in glob.glob(WORK+"edits/*"):
257 | # os.system("gzip "+ff)
258 |
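259 | ## Note on the Phred filter in rawedit above: quality characters are converted
260 | ## to scores by subtracting the offset Q (33 by default, 64 for some older
261 | ## Illumina data). For example, ord('I') - 33 = 40 keeps the base call, while
262 | ## ord('#') - 33 = 2 falls below the threshold of 20 and the base becomes "N".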
--------------------------------------------------------------------------------
/pyrad/editraw_merges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import multiprocessing
4 | import itertools
5 | import sys
6 | import os
7 | import glob
8 | import operator
9 | import gzip
10 | from potpour import Worker
11 | from sortandcheck2 import unambig
12 | from cluster_cons7_shuf import comp
13 |
14 |
15 | def unambar(CUT):
16 | if any([i in CUT for i in list("RKYSWM")]):
17 | CUTa, CUTb = unambig(CUT)
18 | return [CUTa,CUTb]
19 | else:
20 | return CUT
21 |
22 |
23 | def most_common(L):
24 | return max(itertools.groupby(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]
25 |
26 |
27 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q):
28 |     """ three functions: (1) replaces low quality base calls with Ns,
29 |     (2) checks for adapter sequence if strict is set to 1 or 2,
30 |     (3) discards reads with too many Ns or below the minimum length """
31 |
32 | if "," in CUT:
33 | CUT1,CUT2 = CUT.split(',')
34 | else:
35 | CUT1=CUT2=CUT
36 |
37 | if ".gz" in infile:
38 | f = gzip.open(infile, 'r')
39 | else:
40 | f = open(infile,'r')
41 |
42 | " remove name suffix"
43 | n = str(infile.split('/')[-1]).replace("_R1.",".")
44 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
45 | n = n.replace('.'+n.split(".")[-1], "")
46 |
47 | " read infile 4 lines at a time, setup counters and lists"
48 | k = itertools.izip(*[iter(f)]*4)
49 | writing_r = []
50 | writing_c = []
51 | orig = keep = keepcut = 0
52 | handle = WORK+'edits/'+str(n)+".edit"
53 |
54 |
55 | " do a test run on first 1000 reads to find if extra bases on right end of reads"
56 | rightend = []
57 | while len(rightend) < 1000:
58 | try: d = k.next()
59 | except StopIteration: break
60 | s = "".join(d[1].strip())
61 |
62 | " cutters "
63 | find1 = CUT1
64 | find2 = comp(CUT2)[::-1]
65 |
66 | " are cutters found on both ends? A correct merge"
67 | a = s[:len(find1)]
68 | b = s[-len(find2)-2:] ## w/ wiggle room
69 | if (find1 in a) and (find2 in b) :
70 | xtra = s.rindex(find2)+len(find2)
71 | rightend.append(len(s)-xtra)
72 |
73 | " find most common element in rightend "
74 | if rightend:
75 | a = most_common(rightend)
76 | if a>3:
77 | Roffset = 0
78 | else:
79 | Roffset = a
80 | else:
81 | Roffset = 0
82 |
83 | " reset iterable "
84 | if ".gz" in infile:
85 | f = gzip.open(infile, 'r')
86 | else:
87 | f = open(infile,'r')
88 | k = itertools.izip(*[iter(f)]*4)
89 |
90 | " iterate over each read "
91 | while 1:
92 | try: d = k.next()
93 | except StopIteration: break
94 | orig += 1
95 | SS = d[1].strip()
96 |
97 | " apply Phred Q filter "
98 | ph = map(ord,d[3].strip('\n'))
99 | offset = int(Q)
100 | phred = map(lambda x:x-offset,ph)
101 | seq = ["N"]*len(phred)
102 | for base in range(len(phred)):
103 | "don't quality check cut sites "
104 | if (base >= len(CUT1)) and (base < len(phred)-len(CUT2)):
105 | if phred[base] >= 20:
106 | try: seq[base] = SS[base]
107 | except IndexError:
108 | None
109 | else:
110 | seq[base] = "N"
111 | else:
112 | try: seq[base] = SS[base]
113 | except IndexError:
114 | None
115 |
116 | " write to file "
117 | if not orig % 5000:
118 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
119 | outfile.write("".join([z for z in writing_r]))
120 | writing_r = []
121 |
122 | s = "".join(seq)
123 |
124 | wheretocut = [None,None,None]
125 | " filter for N"
126 | if s.count("N") <= pN:
127 |
128 | " apply filter for Adapters "
129 | find1 = CUT1
130 | find2 = comp(CUT2)[::-1]
131 |
132 | if "trim" in d[0]:
133 | " filters for non-merged, trimmed reads from s2 "
134 |                 if (find1 in s[:len(find1)]) or (find2 in s[-len(find2)-2:]):
135 | None
136 | else:
137 | " CUT1 rarely missing, CUT2 sometimes missing"
138 | s = s[:-len(CUT2)-Roffset]
139 |
140 | else:
141 | " merged reads. Are cutters found on both ends? A correct merge"
142 | a = s[:len(find1)]
143 | b = s[-len(find2)-2:] ## w/ wiggle room
144 | if (find1 in a) and (find2 in b) :
145 | " find end of read2 "
146 | xtra = s.rindex(find2)+len(find2)
147 | wheretocut = [None, len(s)-Roffset, 'complete']
148 | else:
149 | " look for CUT2 from right side "
150 | if find2 in s[len(s)/2:]: ## check that this is a good general number...
151 | a = s.rindex(find2)+len(find2)
152 | wheretocut = [None, a, 'find2 in s']
153 | else:
154 | "couldn't find cut2, maybe has error, look for adapter"
155 | if 'AGATCG' in s:
156 | a = s.rindex('AGATCG')-len(CUT2)
157 | wheretocut = [None, a, 'AGATCG in s']
158 | else:
159 | if "TCGGAAG" in s:
160 | a = s.rindex('TCGGAAG')-len(CUT2)-3
161 | wheretocut = [None, a, 'TCGGAAG in s']
162 | else:
163 | "no sign of overshoot to right --->"
164 | " look for overshoot on left <---- "
165 | wheretocut = [None, len(s)-Roffset, "None"]
166 |
167 | " look for CUT1 from left side "
168 | if CUT1 in s:
169 | a = s.index(CUT1)
170 | wheretocut[0] = a
171 | else:
172 | "exclude read"
173 | wheretocut[0] = wheretocut[1]
174 |
175 | w1,w2,reason = wheretocut
176 | if len(s[w1:w2]) > trimkeep:
177 | #print s[w1:w2], reason, len(s[w1:w2]), trimkeep
178 | s = s[w1:w2]
179 | else:
180 | s = ""
181 |
182 | if len(s) >= max(36,trimkeep): ## if read is trimmed, must be minlen long
183 | writing_r.append(">"+n+"_"+str(keep)+"_r1"+"\n"+s+"\n")
184 | keep += 1
185 |
186 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
187 | outfile.write("".join([z for z in writing_r]))
188 | writing_r = []
189 |
190 | f.close()
191 | sys.stderr.write(".")
192 | if not trimkeep:
193 | keepcut = 0
194 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keep),str(keepcut)]
195 |
196 |
197 |
198 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep):
199 | print >>sys.stderr, "\tstep 2: editing raw reads \n\t",
200 |
201 | " create output directories "
202 | if not os.path.exists(WORK+'stats'):
203 | os.makedirs(WORK+'stats')
204 | if not os.path.exists(WORK+'edits'):
205 | os.makedirs(WORK+'edits')
206 |
207 | " used to find if files already exist "
208 | lookfor = ".edit"
209 |
210 | " load up work queue "
211 | submitted = 0
212 | work_queue = multiprocessing.Queue()
213 | if len(glob.glob(FQs)) > 1:
214 | FS = [f for f in glob.glob(FQs)]
215 |
216 | " order files by size "
217 | for i in range(len(FS)):
218 | statinfo = os.stat(FS[i])
219 | FS[i] = FS[i],statinfo.st_size
220 | FS.sort(key=operator.itemgetter(1))
221 | FS = [i[0] for i in FS][::-1]
222 |
223 | " submit jobs to queue "
224 | for handle in FS:
225 | finder = WORK+'edits/'+handle.split("/")[-1]
226 | while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
227 | finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","")
228 | if finder+".edit" not in glob.glob(WORK+"edits/*"):
229 | if os.stat(handle).st_size > 0: ## exclude empty files
230 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q]
231 | work_queue.put(args)
232 | submitted += 1
233 | else:
234 | print "\t"+finder+" already in edits/"
235 |
236 | elif len(glob.glob(FQs)) == 1:
237 | " if only one file "
238 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q])
239 | submitted += 1
240 |
241 | else:
242 | print "\tNo demultiplexed files found. Check path."
243 |
244 | " create a queue to pass to workers to store the results "
245 | result_queue = multiprocessing.Queue()
246 |
247 | " spawn workers, give function "
248 | jobs = []
249 | for i in range( min(Parallel,submitted) ):
250 | worker = Worker(work_queue, result_queue, rawedit)
251 | worker.start()
252 | jobs.append(worker)
253 | for job in jobs:
254 | job.join()
255 |
256 |
257 | " collect the results off the queue "
258 | outstats = open(WORK+"stats/s2.rawedit.txt",'a')
259 | print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"])
260 | STATS = []
261 | for i in range(submitted):
262 | STATS.append(result_queue.get())
263 |
264 | STATS.sort(key = lambda x: x[0])
265 | for i in range(submitted):
266 | a,b,c,d = STATS[i]
267 | print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))])
268 |
269 | print >>outstats, """
270 | Nreads = total number of reads for a sample
271 | passed = retained reads that passed quality filtering at full length
272 | passed.w.trim= retained reads that were trimmed due to detection of adapters
273 | passed.total = total kept reads of sufficient length
274 | note: you can set the option in params file to include trimmed reads of xx length. """
275 | outstats.close()
276 |
277 | #" zip files to save size "
278 | #for ff in glob.glob(WORK+"edits/*"):
279 | # os.system("gzip "+ff)
280 |
--------------------------------------------------------------------------------
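
The most_common() helper above returns the most frequent element of a list and, on ties, prefers the element whose first occurrence comes earliest; it is used to estimate the typical number of extra bases (Roffset) beyond the second cut site. A quick illustrative check with made-up values, with the function body copied verbatim from the module above:

## illustrative check of most_common() tie-breaking (hypothetical inputs);
## the function body is copied verbatim from editraw_merges.py above
import itertools

def most_common(L):
    return max(itertools.groupby(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]

print most_common([2, 0, 0, 3, 3])    # -> 0 (0 and 3 tie; 0 appears first)
print most_common([1, 4, 4, 4, 2])    # -> 4 (most frequent element)
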
/pyrad/H_err_dp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import scipy.stats
4 | import scipy.optimize
5 | import numpy
6 | import itertools
7 | import sys
8 | import glob
9 | import multiprocessing
10 | import os
11 | import gzip
12 | from potpour import Worker
13 |
14 |
15 |
16 | def makeP(N):
17 | """ returns a list of freq. for ATGC"""
18 | sump = sum([sum(i) for i in N])
19 | try: p1 = sum([float(i[0]) for i in N])/sump
20 | except ZeroDivisionError: p1 = 0.0
21 | try: p2 = sum([float(i[1]) for i in N])/sump
22 | except ZeroDivisionError: p2 = 0.0
23 | try: p3 = sum([float(i[2]) for i in N])/sump
24 | except ZeroDivisionError: p3 = 0.0
25 | try: p4 = sum([float(i[3]) for i in N])/sump
26 | except ZeroDivisionError: p4 = 0.0
27 | return [p1,p2,p3,p4]
28 |
29 |
30 | def L1(E,P,N):
31 | """probability homozygous"""
32 | h = []
33 | s = sum(N)
34 | for i,l in enumerate(N):
35 | p = P[i]
36 | b = scipy.stats.binom.pmf(s-l,s,E)
37 | h.append(p*b)
38 | return sum(h)
39 |
40 |
41 | def L2(E,P,N):
42 | """probability of heterozygous"""
43 | h = []
44 | s = sum(N)
45 | for l,i in enumerate(N):
46 | for j,k in enumerate(N):
47 | if j>l:
48 | one = 2.*P[l]*P[j]
49 | two = scipy.stats.binom.pmf(s-i-k,s,(2.*E)/3.)
50 | three = scipy.stats.binom.pmf(i,k+i,0.5)
51 | four = 1.-(sum([q**2. for q in P]))
52 | h.append(one*two*(three/four))
53 | return sum(h)
54 |
55 |
56 | def totlik(E,P,H,N):
57 | """ total probability """
58 | lik = ((1-H)*L1(E,P,N)) + (H*L2(E,P,N))
59 | return lik
60 |
61 | def LL(x0,P,Tab):
62 | """ Log likelihood score given values [H,E] """
63 | H = x0[0]
64 | E = x0[1]
65 | L = []
66 | if (H <= 0.) or (E <= 0.):
67 | r = numpy.exp(100)
68 | else:
69 | for i in Tab:
70 | ll = totlik(E,P,H,i[0])
71 | if ll > 0:
72 | L.append(i[1] * numpy.log(ll))
73 | r = -sum(L)
74 | #print "\t".join(map(str,[r, H, E]))
75 | return r
76 |
77 |
78 | def LL_haploid(E,P,Tab):
79 |     """ Log likelihood score given error rate E, with H fixed at 0 """
80 | H = 0.
81 | L = []
82 | if (E <= 0.):
83 | r = numpy.exp(100)
84 | else:
85 | for i in Tab:
86 | ll = totlik(E,P,H,i[0])
87 | if ll > 0:
88 | L.append(i[1] * numpy.log(ll))
89 | r = -sum(L)
90 | #print "\t".join(map(str,[r, H, E]))
91 | return r
92 |
93 |
94 |
95 | def table_c(N):
96 | """ makes a dictionary with counts of base counts [x,x,x,x]:x,
97 | speeds up Likelihood calculation"""
98 | Tab = {}
99 | k = iter(N)
100 | while 1:
101 | try:
102 | d = k.next()
103 | except StopIteration: break
104 | if tuple(d) in Tab:
105 | Tab[tuple(d)] += 1
106 | else:
107 | Tab[tuple(d)] = 1
108 | L = []
109 |     for i,j in Tab.items():
110 |         ## store as [base-count tuple, number of occurrences]
111 |         L.append([i,j])
112 | return [i for i in L if (0,0,0,0) not in i]
113 |
114 |
115 | def stack(D):
116 | """
117 | from list of bases at a site D,
118 | returns an ordered list of counts of bases
119 | """
120 | L = len(D)
121 | counts = []
122 | for i in range(len(D[0])):
123 | A=C=T=G=N=S=0
124 | for nseq in range(L):
125 | A += D[nseq][i].count("A")
126 | C += D[nseq][i].count("C")
127 | T += D[nseq][i].count("T")
128 | G += D[nseq][i].count("G")
129 | N += D[nseq][i].count("N")
130 | S += D[nseq][i].count("-")
131 | counts.append( [[A,C,T,G],N,S] )
132 | return counts
133 |
134 |
135 |
136 | def consensus(f, minsamp, CUT1, CUT2, datatype):
137 | """ makes a list of lists of reads at each site """
138 | f = gzip.open(f)
139 | k = itertools.izip(*[iter(f)]*2)
140 | L = []
141 | locus = 0
142 | while 1:
143 | try:
144 | first = k.next()
145 | except StopIteration: break
146 | itera = [first[0],first[1]]
147 | fname = first[0]
148 | S = []
149 | rights = []
150 | lefts = []
151 | leftjust = rightjust = None
152 | while itera[0] != "//\n":
153 | nreps = int(itera[0].strip().split(";")[1].replace("size=",""))
154 |
155 | " record left and right most for cutting if gbs merge data "
156 | if datatype in ['mergegbs','gbs']:
157 | if itera[0].strip().split(";")[-1] == "":
158 | leftjust = itera[1].index([i for i in itera[1] if i not in list("-N")][0])
159 | rightjust = itera[1].rindex([i for i in itera[1] if i not in list("-N")][0])
160 | lefts.append(itera[1].index([i for i in itera[1] if i not in list("-N")][0]))
161 | rights.append(itera[1].rindex([i for i in itera[1] if i not in list("-N")][0]))
162 |
163 | " append sequence * number of dereps "
164 | for i in range(nreps):
165 | S.append(tuple(itera[1].strip()))
166 | itera = k.next()
167 |
168 | " trim off overhang edges of gbs reads "
169 | if datatype in ['mergegbs','gbs']:
170 | if any([i < leftjust for i in lefts]):
171 | rightjust = min(rights)
172 | if any([i < rightjust for i in rights]):
173 | leftjust = max(lefts)
174 |
175 | for s in range(len(S)):
176 | if rightjust:
177 | S[s] = S[s][leftjust:rightjust+1]
178 | if leftjust:
179 | S[s] = S[s][leftjust:rightjust+1] ## +1?
180 |
181 | " trim off restriction sites from end/s "
182 | if datatype in ['merged','pairddrad','pairgbs','gbs']:
183 | for s in range(len(S)):
184 | S[s] = S[s][len(CUT1):-(len(CUT2)+1)]
185 | else:
186 | for s in range(len(S)):
187 | S[s] = S[s][len(CUT1):]
188 |
189 | if len(S) >= minsamp:
190 | " make list for each site in sequences "
191 | res = stack(S)
192 | " exclude sites with indels "
193 | L += [i[0] for i in res if i[2] == 0]
194 | locus += 1
195 | return L
196 |
197 |
198 |
199 |
200 |
201 | def optim(WORK,handle, minsamp, CUT1, CUT2, datatype, haplos):
202 | name = handle.split("/")[-1].replace(".clustS.gz","")
203 | D = consensus(handle, minsamp, CUT1, CUT2, datatype)
204 | P = makeP(D)
205 | Tab = table_c(D)
206 | del D
207 | #H,E = scipy.optimize.fmin(LL,x0,(P,Tab),maxiter=500,maxfun=200,ftol=0.0001,disp=False,full_output=False)
208 | if haplos == 1:
209 | x0 = [0.001]
210 | H = 0.
211 | E = scipy.optimize.fmin(LL_haploid,x0,(P,Tab),disp=False,full_output=False)
212 | else:
213 | x0 = [0.01,0.001]
214 | H,E = scipy.optimize.fmin(LL,x0,(P,Tab),disp=False,full_output=False)
215 | del Tab
216 | outfile = open(WORK+"stats/."+name+".temp",'w')
217 |     outfile.write("\t".join([name.replace(".gz",""),str(round(H,8))[0:10],str(round(E,8))[0:10],"\n"]))
218 | outfile.close()
219 | sys.stderr.write(".")
220 |
221 |
222 |
223 |
224 | def main(Parallel,ID,minsamp,subset,haplos,WORK,CUT,datatype):
225 | sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t")
226 |
227 | " find clust.xx directory "
228 | if not os.path.exists(WORK+'clust'+ID):
229 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
230 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
231 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
232 | sys.exit()
233 |
234 |
235 | # warning message for low minsamp
236 | if minsamp < 5:
237 | sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n
238 | If you intend to make low coverage base calls use a high mindepth in
239 | step 4 to accurately infer H & E parameters, and then use a low mindepth
240 | in conjunction with the line 31 params file option to make low coverage
241 | base calls""")
242 |
243 | # if haploid data
244 | if haplos == 1:
245 | sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t")
246 |
247 | # if double digest use first cut site
248 | if "," in CUT:
249 | CUT1, CUT2 = CUT.strip().split(",")
250 | else:
251 | CUT1 = CUT2 = CUT
252 |
253 | # load up work queue
254 | work_queue = multiprocessing.Queue()
255 |
256 | # iterate over files
257 | HH = glob.glob(WORK+"clust"+ID+"/"+subset+"*.clustS*")
258 | submitted = 0
259 | FS = []
260 | if len(HH) > 1:
261 | ## sort files by size
262 | for i in range(len(HH)):
263 | statinfo = os.stat(HH[i])
264 | if statinfo.st_size > 1000:
265 | FS.append((HH[i],statinfo.st_size))
266 | else:
267 |                 print "excluding", HH[i], "- file is too small\n"
268 | FS.sort(key=lambda x: x[1])
269 | FS = [i[0] for i in FS]
270 | else:
271 | FS = HH
272 | REMOVE = glob.glob(WORK+'clust'+ID+"/cat.*")
273 | FS = [f for f in FS if f not in REMOVE]
274 | for handle in FS:
275 | work_queue.put([WORK,handle, minsamp, CUT1, CUT2, datatype, haplos])
276 | submitted += 1
277 |
278 | " remove temp files if previous run "
279 | for ff in FS:
280 | end = ff.split("/")[-1].replace(".clustS.gz","")
281 | ff = WORK+"stats/."+end+".temp"
282 | if os.path.exists(ff):
283 | os.remove(ff)
284 |
285 | " create a queue to pass to workers to store the results "
286 | result_queue = multiprocessing.Queue()
287 | results = []
288 |
289 | " spawn workers "
290 | jobs = []
291 | for i in range( min(Parallel,submitted) ):
292 | worker = Worker(work_queue, result_queue, optim)
293 | worker.start()
294 | jobs.append(worker)
295 | for job in jobs:
296 | job.join()
297 |
298 | " write results to stats file "
299 | if not os.path.exists(WORK+"stats/Pi_E_estimate.txt"):
300 | outstats = open(WORK+"stats/Pi_E_estimate.txt",'w')
301 | outstats.write("taxa\tH\tE\n")
302 | else:
303 | outstats = open(WORK+"stats/Pi_E_estimate.txt",'a')
304 | for ff in FS:
305 | end = ff.split("/")[-1].replace(".clustS.gz","")
306 | ft = WORK+"stats/."+end+".temp"
307 | line = open(ft).readlines()
308 | outstats.write(line[0])
309 | os.remove(ft)
310 | # n,h,e = line[0].strip().split("\t")
311 | # H.append(float(h))
312 | # E.append(float(e))
313 | #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
314 | #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
315 | outstats.close()
316 |
317 |
318 |
319 |
--------------------------------------------------------------------------------
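
The functions above jointly estimate heterozygosity (H) and sequencing error rate (E) by maximizing the likelihood of observed base counts across clustered sites. A toy sketch of how the pieces fit together, using made-up count data instead of real .clustS input and assuming the pyrad/ source directory is on the Python path:

## toy sketch: fit H and E to made-up base-count data
## (real input comes from consensus() over .clustS.gz files)
import scipy.optimize
from H_err_dp import makeP, table_c, LL

## each row is one site: counts of [A,C,T,G] across reads in a cluster
D = [[9, 0, 1, 0],     ## mostly A, one likely sequencing error
     [5, 5, 0, 0],     ## balanced site, consistent with a heterozygote
     [0, 10, 0, 0]]    ## all C

P = makeP(D)           ## overall base frequencies
Tab = table_c(D)       ## collapse identical count patterns
H, E = scipy.optimize.fmin(LL, [0.01, 0.001], (P, Tab),
                           disp=False, full_output=False)
print "heterozygosity ~", H, "error rate ~", E
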
/pyrad/editraw_pairs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import multiprocessing
4 | import itertools
5 | import sys
6 | import os
7 | import glob
8 | import operator
9 | import gzip
10 | from potpour import Worker
11 | from sortandcheck2 import unambig
12 |
13 |
14 | def revcomp(s):
15 | ss = s[::-1].strip().replace("A","t").replace("T","a").\
16 | replace("C","g").replace("G","c").replace("n","Z").upper().replace("Z","n")
17 | return ss
18 |
19 |
20 | def unambar(CUT):
21 | if any([i in CUT for i in list("RKYSWM")]):
22 | CUTa, CUTb = unambig(CUT)
23 | return [CUTa,CUTb]
24 | else:
25 | return False
26 |
27 |
28 | def Afilter(CUT,s,strict,read):
29 | read1 = read==1
30 | a = b = lookfor = wheretocut = None
31 | " lookfor cut site "
32 |
33 | " if ambiguity in cutter "
34 | if unambar(CUT):
35 | CUTa,CUTb = unambar(CUT)
36 | if strict == 2:
37 | if read1:
38 | lookfor1 = CUTa+"A"
39 | lookfor2 = CUTb+"A"
40 | else:
41 | lookfor1 = CUTa
42 | lookfor2 = CUTb
43 | else:
44 | if read1:
45 | lookfor1 = CUTa+"AGAT"
46 | lookfor2 = CUTb+"AGAT"
47 | else:
48 | lookfor1 = "A"*50
49 | lookfor2 = "A"*50
50 | if lookfor1 in s:
51 | a = s.rindex(lookfor1)
52 | if lookfor2 in s:
53 | b = s.rindex(lookfor2)
54 | if (a or b):
55 | wheretocut = min([i for i in [a,b] if i])
56 | else:
57 | wheretocut = None
58 | else:
59 | "no ambiguity in cutter "
60 | if strict == 2:
61 | if read1:
62 | lookfor1 = CUT+"A"
63 | else:
64 | lookfor1 = CUT
65 | else:
66 | if read1:
67 | lookfor1 = CUT+"AGA"
68 | else:
69 | lookfor1 = "A"*50
70 | if lookfor1 in s:
71 | wheretocut = s.rindex(lookfor1)
72 | else:
73 | wheretocut = None
74 |
75 | " look for adapter sequence "
76 | if not wheretocut:
77 | if strict == 2:
78 | lookfor1 = "AGATCG"
79 | else:
80 | lookfor1 = "AGATCGGA"
81 | if lookfor1 in s:
82 | if read1:
83 | wheretocut = s.rindex(lookfor1)-(len(CUT)+1)
84 | else:
85 | wheretocut = s.rindex(lookfor1)-(len(CUT)+6)
86 | else:
87 | wheretocut = None
88 |
89 | " look for CUT and end of seq "
90 | if not wheretocut:
91 | if CUT in s[-(len(CUT)+5):]:
92 | wheretocut = s.rindex(CUT)
93 | return wheretocut
94 |
95 |
96 |
97 |
98 | def rawedit(WORK, infile, CUT, pN, trimkeep, strict, Q, datatype):
99 | """ three functions:
100 | (1) replaces low quality base calls with Ns,
101 |     (2) checks for adapter sequences if strict is set to 1 or 2,
102 |     (3) concatenates paired reads with a separator and writes them to file """
103 |
104 | if CUT:
105 | if "," in CUT:
106 | CUT1,CUT2 = CUT.split(",")
107 | CUT2=revcomp(CUT2)
108 | x = 0
109 | else:
110 | CUT1=CUT
111 | CUT2=revcomp(CUT1)
112 | x = 1 ## trims garbage base off gbs
113 |
114 | " create iterators for R1 and R2 files "
115 | if ".gz" in infile:
116 | f1 = gzip.open(infile, 'rb')
117 | if ".forward." in infile:
118 | f2 = gzip.open(infile.replace(".forward.",".reverse."), 'r')
119 | else:
120 | f2 = gzip.open(infile.replace("_R1.","_R2."), 'r')
121 | else:
122 | f1 = open(infile,'r')
123 | if ".forward." in infile:
124 | f2 = open(infile.replace(".forward.",".reverse."), 'r')
125 | else:
126 | f2 = open(infile.replace("_R1.","_R2."), 'r')
127 | n = str(infile.split('/')[-1])
128 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ","nomerge"]:
129 | n = n.replace('.'+n.split(".")[-1], "")
130 | if '.forward' in n:
131 |         ## PEAR output name; keep everything before ".forward"
132 |         n = n.split(".forward")[0]
133 | else:
134 | n = n.replace("_R1","")
135 |
136 | k1 = itertools.izip(*[iter(f1)]*4)
137 | k2 = itertools.izip(*[iter(f2)]*4)
138 | writing_r = []
139 | writing_c = []
140 |
141 | orig = keep = keepcut = 0
142 | handle = WORK+'edits/'+str(n)+".edit"
143 |
144 | "iterate over paired reads, edits 1st, if OK, append both to .edit file"
145 | while 1:
146 | try: d = k1.next()
147 | except StopIteration: break
148 | dd = k2.next()
149 |
150 | orig += 1
151 | SS = d[1].strip()
152 | ph = map(ord,d[3].strip())
153 | offset = int(Q)
154 | phred = map(lambda x:x-offset,ph)
155 | seq = ["N"]*len(phred)
156 | for base in range(len(phred)):
157 | if base >= len(CUT1): ## don't quality check cut site
158 | if phred[base] >= 20: ## quality threshold
159 | try: seq[base] = SS[base]
160 | except IndexError:
161 | None
162 | else:
163 | seq[base] = "N"
164 | else:
165 | if unambar(CUT1):
166 | seq[base] = unambar(CUT1)[0][base]
167 | else:
168 | seq[base] = CUT1[base]
169 |
170 | s = "".join(seq)
171 | wheretocut1 = None
172 |
173 | " apply filters for adapter sequences "
174 | " if GBS CUT2 = revcomp(CUT1) ex: CTGCA"
175 | " if ddRAD CUT2 = revcomp(CUT2) ex: AATT "
176 | if strict:
177 | wheretocut1 = Afilter(CUT2,s,strict,1)
178 |
179 | if s.count("N") <= pN: ## max allowed Ns
180 | if len(s) >= max(32,trimkeep): ## if trimmed read1 length atleast t
181 |
182 | " first read is (maybe) good, now filter second reads "
183 | SS = dd[1].strip()
184 | ph = map(ord,dd[3].strip())
185 | " if PEAR filtered then seqs are revcomps "
186 | if '.forward' in infile:
187 | SS = revcomp(SS)
188 | ph = ph[::-1]
189 |
190 | offset = int(Q)
191 | phred = map(lambda x:x-offset,ph)
192 | seq = ["N"]*len(phred)
193 | for base in range(len(phred)):
194 | if base > len(CUT2): ## don't quality check cut site
195 | if phred[base] >= 20: ## quality threshold
196 | try: seq[base] = SS[base]
197 | except IndexError: None
198 | else:
199 | seq[base] = "N"
200 | else:
201 | try: seq[base] = SS[base]
202 | except IndexError: None
203 | s2 = "".join(seq)
204 |
205 | " filter for gbs read2s, b/c they will be clustered"
206 | badread = 0
207 | if datatype == "pairgbs":
208 | s2 = s2[:len(s)]
209 | if s2.count("N")>pN:
210 | badread = 1
211 |
212 | " apply adapter filter to read 2 "
213 | wheretocut2 = None
214 | if strict:
215 | wheretocut2 = Afilter(revcomp(CUT1),s2,strict,2)
216 |
217 | if (wheretocut1 and wheretocut2):
218 | cutter = min(wheretocut1,wheretocut2)
219 | else:
220 | cutter = max(wheretocut1,wheretocut2)
221 | if strict:
222 | if not cutter:
223 | if (revcomp(CUT1) in s2[-16:]) or (CUT2 in s[-10:]):
224 | cutter = len(s)-16
225 |
226 | if not badread:
227 | if cutter:
228 | ## second read was trimmed
229 | if cutter > max(32,trimkeep):
230 | ## include only the first read, with an N placeholder for read2
231 | ## since it was trimmed off
232 | sout = ">"+n+"_"+str(keepcut)+"_trim1"+"\n"+s[:cutter]+\
233 | "nnnnN\n"#+d[2]+d[3][:cutter]+"\n"
234 | writing_c.append(sout)
235 | keepcut += 1
236 | ## cannot keep trimmed second read in pairddrad method
237 | ## but can in pairgbs
238 | if datatype == 'pairgbs':
239 | sout = ">"+n+"_"+str(keepcut)+"_trim2"+"\nNnnnn"+revcomp(s2[x:cutter+5])+\
240 | "\n"#+d[2]+d[3][x:cutter+5]+"\n"
241 | writing_c.append(sout)
242 | keepcut += 1
243 | else:
244 | ## second read is good, not trimmed
245 | sout = ">"+n+"_"+str(keep)+"_pair"+"\n"+s[:-1]+"nnnn"+revcomp(s2[x:])+"\n"
246 | writing_r.append(sout)
247 | keep += 1
248 |
249 | if not orig % 5000:
250 | #if trimkeep:
251 | # with open(WORK+'mergedreads/'+str(n)+"M.fq",'a') as outfile:
252 | # outfile.write("".join([z for z in writing_c]))
253 | " writes only full length reads "
254 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
255 | outfile.write("".join([z for z in writing_r]))
256 |             " writes trimmed reads "
257 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
258 | outfile.write("".join([z for z in writing_c]))
259 | writing_r = []
260 | writing_c = []
261 |
262 | #if trimkeep:
263 | # with open(WORK+'mergedreads/'+str(n)+"M.fq",'a') as outfile:
264 | # outfile.write("".join([z for z in writing_c]))
265 | " writes only full length reads "
266 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
267 | outfile.write("".join([z for z in writing_r]))
268 |     " writes trimmed reads "
269 | with open(WORK+'edits/'+str(n)+".edit",'a') as outfile:
270 | outfile.write("".join([z for z in writing_c]))
271 | writing_r = []
272 | writing_c = []
273 |
274 | f1.close()
275 | f2.close()
276 | sys.stderr.write(".")
277 | if datatype=='pairgbs':
278 | keepcut = keepcut*2
279 | return [handle.split("/")[-1].replace(".edit",""),str(orig),str(keepcut),str(keep)]
280 |
281 |
282 |
283 | def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
284 |
285 | print >>sys.stderr, "\n\tstep 2: quality filtering \n\t",
286 |
287 | " create output directories "
288 | if not os.path.exists(WORK+'stats'):
289 | os.makedirs(WORK+'stats')
290 | if not os.path.exists(WORK+'edits'):
291 | os.makedirs(WORK+'edits')
292 |
293 | " load up work queue "
294 | submitted = 0
295 | work_queue = multiprocessing.Queue()
296 |
297 | " do not select merged or discarded reads if PEAR was used on data"
298 | FQs = glob.glob(FQs)
299 | fqs = [i for i in FQs if not any([j in i for j in ["discarded",".assembled."]])]
300 |
301 | if len(fqs) > 1:
302 | " subselect only the first reads "
303 | if any([".unassembled.forward." in i for i in fqs]):
304 | FS = [i for i in fqs if '.forward.' in i]
305 | else:
306 | FS = [i for i in fqs if '_R1.' in i]
307 |
308 | " order files by size "
309 | for i in range(len(FS)):
310 | statinfo = os.stat(FS[i])
311 | FS[i] = FS[i],statinfo.st_size
312 | FS.sort(key=operator.itemgetter(1))
313 | FS = [i[0] for i in FS][::-1]
314 |
315 | " submit jobs to queue "
316 | for handle in FS:
317 | n = handle.split('/')[-1]
318 | while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ","nomerge"]:
319 | n = n.replace('.'+n.split(".")[-1], "")
320 |             if '.forward' in n:
321 |                 ## PEAR output name; keep everything before ".forward"
322 |                 n = n.split(".forward")[0]
323 |             else:
324 |                 n = "_".join(n.split('_R')[:-1])
325 | if WORK+"edits/"+n+".edit" not in glob.glob(WORK+"edits/*"):
326 | if os.stat(handle).st_size > 0: ## exclude empty files
327 | args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype]
328 | work_queue.put(args)
329 | submitted += 1
330 | else:
331 | print 'skipping',handle,", file is empty"
332 | else:
333 | print "\t"+n+'.edit'+" already in edits/"
334 | elif len(fqs) == 1:
335 | " if only one file "
336 | work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype])
337 | submitted += 1
338 |
339 | else:
340 | print "no _paired_ de-multiplexed files found in this location."
341 | sys.exit()
342 |
343 | " create a queue to pass to workers to store the results "
344 | result_queue = multiprocessing.Queue()
345 |
346 | " spawn workers, give function "
347 | jobs = []
348 | for i in range( min(Parallel,submitted) ):
349 | worker = Worker(work_queue, result_queue, rawedit)
350 | worker.start()
351 | jobs.append(worker)
352 | for job in jobs:
353 | job.join()
354 |
355 |
356 | " collect the results off the queue "
357 | outstats = open(WORK+"stats/s2.rawedit.txt",'a')
358 | print >> outstats, "\t".join(["sample","Nreads","exclude","trimmed","passed"])
359 | for i in range(submitted):
360 | a,b,c,d = result_queue.get()
361 | print >> outstats, "\t".join([a,b, str(int(b)-int(d)), c, d])
362 |
363 | print >>outstats, """
364 | Nreads = total number of reads for a sample
365 | exclude = reads that were excluded
366 | trimmed = reads that had adapter trimmed but were kept
367 | passed = total kept reads
368 | """
369 | outstats.close()
370 |
371 |
--------------------------------------------------------------------------------
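
Each .edit entry written above stores a read pair on a single line: read 1, an "nnnn" separator, and the reverse complement of read 2 (trimmed entries carry a single "N" placeholder on the side that was removed). A minimal sketch for splitting such an entry back apart; split_pair is a hypothetical helper and the sequences are made up:

## minimal sketch: split a concatenated ".edit" pair entry back into its parts
## (split_pair is hypothetical; sequences below are made up)
def split_pair(seq):
    if "nnnn" in seq:
        left, right = seq.split("nnnn", 1)
        return left, right
    return seq, ""

r1, r2 = split_pair("TGCAGAAATTCC" + "nnnn" + "GGTTCCAAGATC")
print r1    # TGCAGAAATTCC
print r2    # GGTTCCAAGATC  (stored as the reverse complement of read 2)
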
/pyrad/cluster_cons7_shuf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | import os
3 | import sys
4 | import itertools
5 | import numpy
6 | import random
7 | import glob
8 | import subprocess
9 | import gzip
10 | import copy
11 | from consensdp import unhetero, uplow, breakalleles
12 |
13 |
14 |
15 | def comp(seq):
16 | """ returns complement of sequence including ambiguity characters,
17 | and saves lower case info for multiple hetero sequences"""
18 | seq = seq.replace("A",'u')\
19 | .replace('T','v')\
20 | .replace('C','p')\
21 | .replace('G','z')\
22 | .replace('u','T')\
23 | .replace('v','A')\
24 | .replace('p','G')\
25 | .replace('z','C')
26 | seq = seq.replace('R','u')\
27 | .replace('Y','v')\
28 | .replace('K','p')\
29 | .replace('M','z')\
30 | .replace('u','Y')\
31 | .replace('v','R')\
32 | .replace('p','M')\
33 | .replace('z','K')
34 | seq = seq.replace('r','u')\
35 | .replace('y','v')\
36 | .replace('k','p')\
37 | .replace('m','z')\
38 | .replace('u','y')\
39 | .replace('v','r')\
40 | .replace('p','m')\
41 | .replace('z','k')
42 | return seq
43 |
44 |
45 | def cmd_exists(cmd):
46 | return subprocess.call("type " + cmd, shell=True,
47 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
48 |
49 |
50 | def cluster(vsearch, handle, ID, datatype,
51 | quiet, WORK, gid, MASK):
52 |
53 | if datatype == 'pairddrad':
54 | " use first files for split clustering "
55 | if gid:
56 | "hierarchical clustering save temps "
57 | N = " -notmatched "+WORK+"prefix/"+handle.split("/")[-1].replace(".firsts_","._temp_")
58 | U = " -userout "+WORK+"prefix/"+handle.split("/")[-1].replace(".firsts_",".u_")
59 | else:
60 | N = ""
61 | U = " -userout "+handle.replace(".firsts_",".u")
62 | else:
63 | " use haplos files "
64 | if gid:
65 | "hierarchical clustering save temps "
66 | N = " -notmatched "+WORK+"prefix/"+handle.split("/")[-1].replace(".haplos_","._temp_")
67 | U = " -userout "+WORK+"prefix/"+handle.split("/")[-1].replace(".haplos_",".u_")
68 | else:
69 | N = ""
70 | U = " -userout "+handle.replace(".haplos_",".u")
71 |
72 | C = " -cluster_smallmem "+handle
73 | if datatype in ['gbs','pairgbs','merged']:
74 | P = " -strand both"
75 | COV = " -query_cov .90 " ## this can vary
76 | else:
77 | P = " -leftjust "
78 | COV = " -query_cov .90 "
79 | if 'vsearch' not in vsearch:
80 | Q = ""
81 | T = " -threads 1"
82 | else:
83 | Q = " -qmask "+MASK
84 | T = " -threads 6"
85 | cmd = vsearch+\
86 | C+\
87 | P+\
88 | Q+\
89 | T+\
90 | " -id "+ID+\
91 | U+\
92 | " -userfields query+target+id+gaps+qstrand+qcov"+\
93 | " -maxaccepts 1"+\
94 | " -maxrejects 0"+\
95 | " -fulldp"+\
96 | " -usersort"+\
97 | COV+\
98 | N
99 | #os.system(cmd)
100 | subprocess.call(cmd, shell=True)
101 |
102 |
103 | def makeclust(handle,datatype,gid,
104 | minmatch,WORK):
105 |
106 | " read in cluster hits and seeds files "
107 | if not gid:
108 | Userout = open(handle.replace(".haplos_",".u"),'r')
109 | outfile = gzip.open(handle.replace(".haplos_"+gid,".clust_"+gid+".gz"),'w')
110 | else:
111 | Userout = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".u_") ,'r')
112 | nomatch = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_","._temp_"),'r')
113 | outfile = open(WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".seed_"),'w')
114 | outfilename = WORK+'prefix/'+handle.split("/")[-1].replace(".haplos_",".seed_")
115 |
116 | " load full fasta file into a Dic "
117 | D = {}
118 | if datatype == 'pairddrad':
119 | if gid:
120 | f = open(handle.replace(".haplos_"+gid,".firsts_"+gid))
121 | else:
122 | f = gzip.open(handle.replace(".haplos_"+gid,".consens_"+gid+".gz"))
123 | else:
124 | f = gzip.open(handle.replace(".haplos_"+gid,".consens_"+gid+".gz"))
125 |
126 | L = itertools.izip(*[iter(f)]*2)
127 | while 1:
128 | try: a,b = L.next()
129 | except StopIteration: break
130 | D[a.strip()] = b.strip()
131 | f.close()
132 |
133 | " load .u info into a Dic "
134 | U = {}
135 | for line in [line.split("\t") for line in Userout.readlines()]:
136 | if ">"+line[1] in U:
137 | U[">"+line[1]].append([">"+line[0],line[4]])
138 | else:
139 | U[">"+line[1]] = [[">"+line[0],line[4]]]
140 |
141 | " if tier 1 of hierarchical clustering "
142 | if gid:
143 | if int(minmatch) == 1:
144 | " no reduction, write seeds only "
145 | # if datatype == 'pairddrad':
146 | # singles = itertools.izip(*[iter(open(handle.replace(".haplos_",".firsts_")))]*2)
147 | # else:
148 | # singles = itertools.izip(*[iter(open(handle))]*2)
149 | singles = nomatch.read().split(">")[1:]
150 | for i in singles:
151 | i = i.split("\n")[0]+"\n"+"".join(i.split("\n")[1:]).upper()
152 | #print ">"+i+"\n//"
153 | print >>outfile, ">"+i+"\n//"
154 | #print "//\n".join(i)
155 | #outfile.write("//\n".join(i))
156 | # i,j = i.split('\n')[0], "\n".join(i.split('\n')[1:])
157 | # outfile.write("//\n".join(i+j))
158 | del singles
159 | #outfile.write("//\n".join(LLL))
160 | # LLL = []
161 | # while 1:
162 | # try: a,b = singles.next()
163 | # except StopIteration: break
164 | # LLL.append(a+b)
165 | #outfile.write("//\n".join(LLL))
166 | #del LLL
167 | else:
168 | for key,values in U.items():
169 | ## reduction, only write seed if minimum hits reached
170 | if (len(values)+1) >= int(minmatch):
171 | ## fix for if short seqs are excluded during clustering
172 | if D.get(key):
173 | seq = key+"\n"+D[key]+"\n"
174 | seq += "//\n"
175 | outfile.write(seq)
176 |
177 | else:
178 | " map sequences to clust file in order "
179 | seq = ""
180 | for key,values in U.items():
181 | if D.get(key): ## fix for if short seqs are excluded during clustering
182 | seq = key+"\n"+D[key]+'\n'
183 | S = [i[0] for i in values]
184 | R = [i[1] for i in values]
185 | for i in range(len(S)):
186 | if D.get(S[i]): ## testing as fix for removed short reads...
187 | if R[i] == "+":
188 | seq += S[i] + '\n' + D[S[i]] + "\n"
189 | else:
190 | seq += S[i] + '\n' + comp(D[S[i]][::-1]) + "\n"
191 | seq += "//\n"
192 | outfile.write(seq)
193 | outfile.close()
194 | Userout.close()
195 | if gid: nomatch.close()
196 |
197 |
198 |
199 | def splitter(handle):
200 | infile = open(handle)
201 | if os.path.exists(handle.replace(".haplos",".firsts")):
202 | os.remove(handle.replace(".haplos",".firsts"))
203 |
204 | orderfirsts = open(handle.replace(".haplos",".firsts"),'w')
205 | dp = itertools.izip(*[iter(infile)]*2)
206 | ff = []
207 | cnts = 0
208 | for d in dp:
209 | n,s = d
210 | ## checking fix to pairddrad splitting problem...
211 | ## backwards compatible with pyrad v2
212 | s1 = s.replace("X","x").replace("x","n").split("nn")[0]
213 | ff.append(n+s1+"\n")
214 | cnts += 1
215 | orderfirsts.write("".join(ff))
216 | orderfirsts.close()
217 | return handle.replace(".haplos",".firsts")
218 |
219 |
220 |
221 | def makecons(vsearch, ID, datatype,
222 | outg, seed, gid, minmatch, inlist,
223 | WORK, quiet, outhandle):
224 |
225 | " find usearch"
226 | if not cmd_exists(vsearch):
227 | print "\tcannot find usearch (or vsearch), edit path in param file"
228 | sys.exit()
229 |
230 | " make list of consens files "
231 | FS = [i for i in inlist if "/cat.cons" not in i]
232 | FS = [i for i in FS if "/cat.group" not in i]
233 | if not FS:
234 | print "no consens files found"
235 | sys.exit()
236 |
237 | " and a list including outgroups "
238 | fs = copy.copy(inlist)
239 |
240 | " are files gzipped ? "
241 | if any(['.gz' in i[-4:] for i in FS]):
242 | gz = ".gz"
243 | else:
244 | gz = ""
245 |
246 | " remove previous files if present "
247 | if os.path.exists(WORK+'clust'+ID+'/cat.consens_'+gid+gz):
248 | os.remove(WORK+'clust'+ID+'/cat.consens_'+gid+gz)
249 | if os.path.exists(WORK+'clust'+ID+'/cat.group_'+gid+gz):
250 | os.remove(WORK+'clust'+ID+'/cat.group_'+gid+gz)
251 |
252 |
253 | " remove outgroup sequences, add back in later to bottom after shuffling "
254 | if outg:
255 | outgroup = outg.strip().split(",")
256 | if len(outgroup) > 1:
257 | for s in outgroup:
258 | if WORK+"clust"+ID+"/"+s+".consens"+gz in FS:
259 | FS.remove(WORK+"clust"+ID+"/"+s+".consens"+gz)
260 | else:
261 | outgroup = WORK+"clust"+ID+"/"+outg+".consens"+gz
262 | if outgroup in FS:
263 | FS.remove(outgroup)
264 |
265 | " create file with consens seqs from all taxa in list "
266 | out = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+gz,'w')
267 |
268 | for qhandle in FS:
269 | if gz:
270 | f = gzip.open(qhandle)
271 | else:
272 | f = open(qhandle)
273 | k = itertools.izip(*[iter(f)]*2)
274 | while 1:
275 | try: a = k.next()
276 | except StopIteration: break
277 | print >>out, a[0].strip()+" "+a[1].strip()
278 | f.close()
279 | out.close()
280 |
281 | " message to shell "
282 | if gid:
283 | sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+\
284 | ' similarity \n\tfor group ('+str(gid)+') retaining seeds w/ minimum of '+str(minmatch)+' hits\n\n')
285 | else:
286 | sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+' similarity \n\n')
287 |
288 | " make list of random number and data "
289 | if seed:
290 | random.seed(seed)
291 | source = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+".gz",'r')
292 | data = [ (random.random(), line) for line in source ]
293 | source.close()
294 | " sort by random number "
295 | data.sort()
296 |
297 | " order by size while retaining randomization within size classes "
298 | D = [line.split(' ') for _, line in data]
299 | DD = ["".join([i[0]+" "*(100-len(i[0])),i[1]]) for i in D]
300 | DD.sort(key=len, reverse=True)
301 | k = iter(["**".join([i.split(" ")[0],i.split(" ")[-1]]) for i in DD])
302 |
303 | " write output to .consens_.gz file "
304 | out = gzip.open(WORK+'clust'+ID+'/cat.consens_'+gid+".gz",'w')
305 | while 1:
306 | try: a,b = k.next().split("**")
307 | except StopIteration: break
308 | print >>out, a+'\n'+b.strip()
309 |
310 |
311 | """ add outgroup taxa back onto end of file."""
312 | if outg:
313 | " append to existing consens_ file "
314 | outgroup = outg.strip().split(',')
315 | if len(outgroup) > 1:
316 | for s in outgroup:
317 | xoutg = WORK+"clust"+ID+"/"+s+".consens.gz"
318 | if xoutg in fs:
319 | f = gzip.open(xoutg)
320 | k = itertools.izip(*[iter(f)]*2)
321 | while 1:
322 | try: a = k.next()
323 | except StopIteration: break
324 | print >>out, a[0].strip()+"\n"+a[1].strip()
325 | f.close()
326 | elif len(outgroup) == 1:
327 | xoutg = WORK+"clust"+ID+"/"+outgroup[0]+".consens.gz"
328 | if xoutg in fs:
329 | f = gzip.open(xoutg)
330 | k = itertools.izip(*[iter(f)]*2)
331 | while 1:
332 | try: a = k.next()
333 | except StopIteration: break
334 | print >>out, a[0].strip()+"\n"+a[1].strip()
335 | f.close()
336 | else:
337 | None
338 | out.close()
339 |
340 |
341 | """ convert ambiguity codes into a sampled haplotype for any sample
342 | to use for clustering, but save ambiguities for later """
343 |
344 | " output file"
345 | outhaplos = open(outhandle,'w')
346 |
347 | " input file "
348 | infile = gzip.open(WORK+"clust"+ID+"/cat.consens_"+gid+".gz")
349 | lines = iter(infile.readlines())
350 | infile.close()
351 |
352 | " write to haplo files in fasta format "
353 | writinghaplos = []
354 |
355 | for line in lines:
356 | if ">" in line:
357 | writinghaplos.append(line.strip())
358 | else:
359 | allele = breakalleles(line)[0]
360 | writinghaplos.append(allele.strip())
361 | outhaplos.write("\n".join(writinghaplos))
362 | outhaplos.close()
363 |
364 |
365 | def main(vsearch, ID, datatype,
366 | outg, seed, gid, minmatch, inlist,
367 | WORK, MASK, quiet):
368 |
369 | outhandle = WORK+"clust"+ID+"/cat.haplos_"+gid
370 |
371 | makecons(vsearch,ID,datatype,
372 | outg,seed,gid,minmatch,
373 | inlist,WORK,quiet,outhandle)
374 |
375 | if datatype == 'pairddrad':
376 | splithandle = splitter(outhandle)
377 | cluster(vsearch,splithandle,ID,datatype,quiet,WORK, gid, MASK)
378 | else:
379 | cluster(vsearch,outhandle,ID,datatype,quiet,WORK, gid, MASK)
380 |
381 | " remake clusters with .haplos, .u, and .temp files"
382 | makeclust(outhandle,datatype,gid,minmatch,WORK)
383 |
384 |
385 |
386 |
387 |
388 |
--------------------------------------------------------------------------------
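
The comp() function above complements a sequence while preserving IUPAC ambiguity codes and their case; callers combine it with [::-1] slicing to get a reverse complement. A short illustration with hypothetical sequences, assuming the pyrad/ source directory (including consensdp.py) is importable:

## short illustration of comp() (hypothetical sequences)
from cluster_cons7_shuf import comp

print comp("ATGCRYKM")          # -> TACGYRMK
print comp("rykm")              # -> yrmk (lower-case ambiguities keep their case)
print comp("ATGCA"[::-1])       # -> TGCAT, i.e. the reverse complement of ATGCA
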
/pyrad/Dtest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import os
4 | import numpy
5 | import sys
6 | import random
7 | import itertools
8 | import glob
9 | import multiprocessing
10 | import cPickle as pickle
11 | from potpour import Worker
12 |
13 |
14 | def IUPAC(one):
15 | """
16 | returns IUPAC symbol for ambiguity bases,
17 | used for polymorphic sites.
18 | """
19 | D = {"R":['G','A'],
20 | "K":['G','T'],
21 | "S":['G','C'],
22 | "Y":['T','C'],
23 | "W":['T','A'],
24 | "M":['C','A']}
25 | return D[one]
26 |
27 |
28 | def makefreq(patlist):
29 | " identify which allele is derived in P3 relative to outgroup "
30 | " and is the most frequent and use that as the SNP "
31 | P = {}
32 | for tax in patlist:
33 | P[tax] = []
34 |
35 | for tax in patlist:
36 | for base in patlist[tax]:
37 | if base in list('ATGC'):
38 | P[tax].append(base[0])
39 | P[tax].append(base[0])
40 | elif base in list("RKSYWM"):
41 | hh = IUPAC(base[0])
42 | for i in hh:
43 | P[tax].append(i)
44 |
45 | major = [i for i in set(P['p3']) if i not in set(P['o'])]
46 | " in case of multiple bases "
47 | if len(major) > 1:
48 | cc = [P['p3'].count(base) for base in major]
49 | major = major[cc.index(max(cc))] ## maybe [0]
50 | elif not major:
51 | major = [i for i in set(P['o']) if i in set(P['p3'])]
52 | elif len(major) == 1:
53 | major = major[0]
54 |
55 | ret = [float(P[i].count(major))/len(P[i]) for i in ['p1','p2','p3','o']]
56 | return ret
57 |
58 |
59 | def Dstat(Loc, pat):
60 | if pat[0] != pat[1]:
61 | if pat[0] == pat[3]:
62 | if pat[1] == pat[2]:
63 | Loc.abba += 1
64 | else:
65 | if pat[0] == pat[2]:
66 | if pat[1] == pat[3]:
67 | Loc.baba += 1
68 | return Loc
69 |
70 |
71 | def polyDstat(Loc,patlist):
72 | ## calculate frequencies
73 | " look at the P3 taxon first for a derived allele "
74 | p1,p2,p3,o = makefreq(patlist) #[a0,a1,a2,a3])
75 | Loc.abba += ((1.-p1)*p2*p3*(1.-o))
76 | Loc.baba += (p1*(1.-p2)*p3*(1.-o))
77 | return Loc
78 |
79 |
80 | def fillin(ll,name,col,ulnames,patlist):
81 | if len(ll)>1:
82 | for i in ll:
83 | patlist[name] = col[ [ulnames.index(i) for i in ll if i in ulnames] ]
84 | else:
85 | patlist[name] = col[ [ulnames.index(i) for i in ll ] ]
86 | return patlist
87 |
88 |
89 |
90 | def IUA(Loc,L):
91 | Loc.abba = 0.
92 | Loc.baba = 0.
93 | for col in Loc.seq.transpose():
94 | if all(i in list("ATGC") for i in col):
95 | Loc = Dstat(Loc,col)
96 | return Loc
97 |
98 |
99 | def IUAfreq(Loc,L):
100 | patlist = {}
101 | Loc.abba = 0.
102 | Loc.baba = 0.
103 | for col in Loc.seq.transpose():
104 | patlist = fillin(L[0],'p1',col,Loc.names,patlist)
105 | patlist = fillin(L[1],'p2',col,Loc.names,patlist)
106 | patlist = fillin(L[2],'p3',col,Loc.names,patlist)
107 | patlist = fillin(L[3],'o', col,Loc.names,patlist)
108 |
109 | #print Loc.seq, Loc.number
110 | #print patlist
111 | if not any([all([i in ["N",'-'] for i in patlist['p1']]),
112 | all([i in ["N",'-'] for i in patlist['p2']]),
113 | all([i in ["N",'-'] for i in patlist['p3']]),
114 | all([i in ["N",'-'] for i in patlist['o']])]):
115 | if any([i not in patlist['o'] for i in patlist['p3']]):
116 | Loc = polyDstat(Loc,patlist)
117 | else:
118 | None
119 | else:
120 | None
121 | return Loc
122 |
123 |
124 |
125 | def sample_wr(population, k):
126 |     """used for bootstrap sampling:
127 |     chooses k random elements (with replacement) from a population"""
128 | n = len(population)
129 | _random, _int = random.random, int # speed hack
130 | return [_int(_random() * n) for i in itertools.repeat(None, k)]
131 |
132 |
133 | def bootfreq(Ldict, which):
134 | Dftop = Dfbot = 0
135 | while 1:
136 | try: Lx = Ldict[Ldict.keys()[which.next()]]
137 | except StopIteration: break
138 | Dftop += Lx.abba - Lx.baba
139 | Dfbot += Lx.abba + Lx.baba
140 | D = 0.
141 | if Dfbot > 0:
142 | D = Dftop/float(Dfbot)
143 | return D
144 |
145 |
146 | def bootfixed(Ldict, which):
147 | abba = baba = 0
148 | while 1:
149 | try: Lx = Ldict[Ldict.keys()[which.next()]]
150 | except StopIteration: break
151 | abba += Lx.abba
152 | baba += Lx.baba
153 | D = 0.
154 | if abba+baba > 0:
155 | D = float(abba-baba)/float(abba+baba)
156 | return D
157 |
158 |
159 | def makeSNP(L,snpfreq,loci):
160 | Ndict = {}
161 | num = 0
162 | for loc in loci:
163 | Loc = Locus()
164 | Loc.number = num
165 | " only select loci that have data for all four tiptaxa "
166 | names = [i.split()[0].replace(">","") for i in loc.lstrip().rstrip().split("\n")[1:-1]]
167 | if snpfreq:
168 | Loc.names = [i for i in names if i in list(itertools.chain(*L))]
169 | else:
170 | Loc.names = L # [i for i in names if i in L]
171 |
172 | " if snpfreq only need one of possibly multiple individuals "
173 | keep = 0
174 |
175 | if snpfreq:
176 | for tax in L:
177 | z = any([tax in names for tax in L[0]])
178 | y = any([tax in names for tax in L[1]])
179 | w = any([tax in names for tax in L[2]])
180 | u = any([tax in names for tax in L[3]])
181 | if all([z,y,w,u]):
182 | keep = 1
183 | else:
184 | if all(tax in names for tax in Loc.names):
185 | keep = 1
186 |
187 | if keep:
188 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]])
189 |
190 | " only select sites with synapomorphies "
191 | ## may want to keep autapomorphies in the future, or more
192 | ## when making a parameterized version of D-statistic
193 | ## only pyrad 2.1+ finds synapormorphies btwn hetero and fixed sites
194 | N[-1] = list(N[-1].tostring().replace("-","*"))
195 | N = N[:, N[-1] == "*"]
196 | " only select rows with focal taxa"
197 | if snpfreq:
198 | Loc.seq = N[[names.index(i) for i in Loc.names],:]
199 | else:
200 | Loc.seq = N[[names.index(i) for i in Loc.names],:]
201 | #print names
202 | #print N, "______________"
203 | #print Loc.number
204 | #print Loc.seq
205 | #print Loc.names
206 | #print [names.index(i) for i in Loc.names]
207 | Ndict[num] = Loc
208 | num += 1
209 | return Ndict
210 |
211 |
212 |
213 | class Locus():
214 | """locus keeps track of position in input file,
215 | variable sites, and D-statistics"""
216 |     def __init__(self):
217 |         ## defaults; real values are assigned by makeSNP and IUA/IUAfreq
218 |         self.number = 0
219 |         self.names = []
220 |         self.seq = None
221 |         self.abba = self.baba = 0.
222 | def D(self):
223 | """ just used to check if abba > baba
224 | not a global genomic measure of D """
225 | if self.abba+self.baba > 0:
226 | return float(self.abba-self.baba)/(self.abba+self.baba)
227 | else:
228 | return 0.0
229 |
230 |
231 |
232 | def runtest(infile, L, nboots, snpfreq, submitted):
233 |     " run a single ABBA-BABA test and pickle its results to a temp file "
234 |
235 | " split each locus "
236 | loci = open(infile).read().strip().split("|")[:-1]
237 | loci[0] = "xx\n"+loci[0]
238 |
239 | " returns a {} of Locus objects with data from tiptaxa L"
240 | Ldict = makeSNP(L,snpfreq,loci)
241 |
242 | " calculate ABBA/BABA for each locus"
243 | for loc in Ldict:
244 | if snpfreq:
245 | Ldict[loc] = IUAfreq(Ldict[loc],L)
246 | else:
247 | Ldict[loc] = IUA(Ldict[loc],L)
248 |
249 | " calculate final D "
250 | dftfinal = sum([Ldict[l].abba-Ldict[l].baba for l in Ldict])
251 | dbtfinal = sum([Ldict[l].abba+Ldict[l].baba for l in Ldict])
252 | if dbtfinal > 0:
253 | Dfinal = float(dftfinal)/dbtfinal
254 | else:
255 | Dfinal = 0.
256 |
257 | " proportion of discordant loci "
258 | try: pdisc = len([i for i in Ldict if Ldict[i].D()]) / float(len(Ldict))
259 | except ZeroDivisionError:
260 | pdisc = 0.0
261 |
262 | " do bootstrapping "
263 | BB = []
264 | for i in xrange(nboots):
265 | which = iter(sample_wr(xrange(len(Ldict)),len(Ldict)))
266 | if snpfreq:
267 | bb = bootfreq(Ldict, which)
268 | else:
269 | bb = bootfixed(Ldict, which)
270 | BB.append(bb)
271 | STD = numpy.std(BB)
272 |
273 | " out stats "
274 | if STD < 0.00001:
275 | STD = 0.0
276 | if Dfinal != 0.0:
277 | if STD != 0.0:
278 | Z = (abs(Dfinal/STD))
279 | else:
280 | Z = 0.0
281 | else:
282 | Dfinal = 0.
283 | Z = 0.
284 |
285 | ABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D() > 0]
286 | BABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D() < 0]
287 |
288 | ret = [L,Dfinal,STD,Z,
289 | len(Ldict),
290 | sum([Ldict[l].abba for l in Ldict]),
291 | sum([Ldict[l].baba for l in Ldict]),
292 | pdisc,submitted,
293 | ABBAloci,BABAloci, BB ]
294 | pickle.dump(ret, open(".save.D4temp"+str(submitted),'wb'))
295 |
296 |
297 |
298 |
299 | def makesortfiles(outn,locifile,n,loci,outfile,makesort,sub,ps):
300 | locifile.sort()
301 | "write to ABBA file all loci indexed in ABBAloci list"
302 | with open(outfile+"_"+str(sub+1)+"."+outn[0:n]+".txt",'w') as out:
303 | print >>out, " ".join(ps)
304 | print >>out, "//"
305 | print >>out, ",".join(map(str,locifile))
306 | print >>out, "//"
307 | if makesort == 2:
308 | for loc in xrange(len(loci)):
309 | if loc in locifile:
310 | out.write(loci[loc]+"| locus: "+str(loc))
311 |
312 |
313 |
314 |
315 |
316 | def checktaxa(taxalist,alignfile):
317 | with open(alignfile) as infile:
318 | data = infile.readlines()
319 | taxainfile = set()
320 | for line in data:
321 | if ">" in line:
322 | tax = line.split(" ")[0].replace(">","")
323 | if tax not in taxainfile:
324 | taxainfile.add(tax)
325 | if not set(taxalist).difference(taxainfile):
326 | return 1
327 |
328 |
329 |
330 |
331 |
332 | def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort, makeboots):
333 |
334 | " submit jobs to processors "
335 | work_queue = multiprocessing.Queue()
336 | result_queue = multiprocessing.Queue()
337 | submitted = 0
338 | Notes = []
339 | for rep in tests:
340 | notes = ""
341 | if len(rep) == 2:
342 | rep,notes = rep
343 | p1,p2,p3,o = rep
344 | if any(["[" in i for i in rep]):
345 | p1 = p1[1:-1].split(",")
346 | p2 = p2[1:-1].split(",")
347 | p3 = p3[1:-1].split(",")
348 | o = o[1:-1].split(",")
349 | taxalist = list(itertools.chain(*[p1+p2+p3+o]))
350 | if checktaxa(taxalist,alignfile):
351 | work_queue.put([alignfile,[p1,p2,p3,o],nboots,1, submitted])
352 | submitted += 1
353 | else:
354 | print 'a taxon name was found that is not in the sequence file'
355 | else:
356 | if checktaxa([p1,p2,p3,o],alignfile):
357 | work_queue.put([alignfile,[p1,p2,p3,o],nboots,0, submitted])
358 | submitted += 1
359 | else:
360 | print 'a taxon name was found that is not in the sequence file'
361 |
362 | Notes.append(notes)
363 | jobs = []
364 | for i in range(nproc):
365 | worker = Worker(work_queue, result_queue, runtest)
366 | jobs.append(worker)
367 | worker.start()
368 | for j in jobs:
369 | j.join()
370 |
371 | " read results back in "
372 | #Results = [result_queue.get() for i in range(submitted)]
373 | Results = [pickle.load(open(".save.D4temp"+str(i),'rb')) for i in xrange(submitted)]
374 | Results.sort(key = lambda x:x[8])
375 |
376 | "setup results file "
377 | outs = open(outfile+".D4.txt", 'w')
378 | header = "\t".join([ 'P1'+" "*(namelen[0]-2),
379 | 'P2'+" "*(namelen[1]-2),
380 | 'P3'+" "*(namelen[2]-2),
381 | 'O'+" "*(namelen[3]-1),
382 | 'D','std(D)','Z',
383 | 'BABA','ABBA',
384 | 'nloci','nboot','pdisc', 'notes'])
385 | print >>outs, header
386 |
387 | for i in range(len(Results)):
388 | ps,D,STD,Z,nloci,ABBA,BABA,pdisc,sub,ABBAloci,BABAloci,boots = Results[i]
389 | ps = [str(x).replace("['","[").replace("']","]").replace("', '",",").replace(">","") for x in ps]
390 | print >>outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (ps[0]+" "*(namelen[0]-len(ps[0])),
391 | ps[1]+" "*(namelen[1]-len(ps[1])),
392 | ps[2]+" "*(namelen[2]-len(ps[2])),
393 | ps[3]+" "*(namelen[3]-len(ps[3])),
394 | D,STD,Z,
395 | BABA,ABBA,
396 | nloci,nboots,
397 | pdisc,Notes[i])
398 |
399 |
400 |
401 |         ## reload loci to write the sorted ABBA/BABA locus files below
402 | loci = open(alignfile).read().strip().split("|")[:-1]
403 | loci[0] = "xx\n"+loci[0]
404 |
405 | if makesort:
406 | makesortfiles('ABBA',ABBAloci,4,loci,outfile,makesort,sub,ps)
407 | makesortfiles('BABA',BABAloci,4,loci,outfile,makesort,sub,ps)
408 |
409 | if makeboots:
410 | with open(outfile+"_"+str(sub+1)+".boots",'w') as out:
411 | out.write(",".join(map(str,boots)))
412 |
413 | for oldpickle in glob.glob(".save.D4temp*"):
414 | os.remove(oldpickle)
415 |
416 |
417 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots):
418 |
419 | P1namelen = max(map(len,[str(i[0][0]) for i in tests]))
420 | P2namelen = max(map(len,[str(i[0][1]) for i in tests]))
421 | P3namelen = max(map(len,[str(i[0][2]) for i in tests]))
422 | Onamelen = max(map(len,[str(i[0][3]).strip() for i in tests]))
423 | namelen = [P1namelen,P2namelen,P3namelen,Onamelen]
424 |
425 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots)
426 |
427 |
428 |
429 | if __name__ == '__main__':
430 | main()
431 |
432 |
--------------------------------------------------------------------------------
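
For reference, the D-statistic reported by runtest() above reduces to a simple ratio of site-pattern counts, and Z is the number of bootstrap standard deviations that D sits away from zero. A small worked example with hypothetical counts:

## worked example with hypothetical counts: a site pattern (P1,P2,P3,O) =
## ("G","A","A","G") is an ABBA site (P2 and P3 share the derived allele),
## while ("A","G","A","G") is a BABA site
abba, baba = 120., 80.
D = (abba - baba) / (abba + baba)
print D     # 0.2  -> an excess of ABBA sites
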
/pyrad/sortandcheck2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import gzip
4 | import itertools
5 | import sys
6 | import glob
7 | import os
8 | import cPickle as pickle
9 | import multiprocessing
10 | from potpour import Worker
11 |
12 |
13 | def combinefiles(GLOB):
14 | "combines first and second reads file names"
15 |     " collect all input files matching the glob pattern "
16 |     FS = glob.glob(GLOB)
17 |
18 |     " first-read files are identified by the _R1_ tag in their names "
19 | firsts = [i for i in FS if "_R1_" in i]
20 | if len(firsts) < 1:
21 |         print "\n\tFirst-read file names must contain '_R1_'."
22 | sys.exit()
23 | seconds = [ff.replace("_R1_","_R2_") for ff in firsts]
24 | if len(firsts) != len(seconds):
25 | print "different numbers of first and second read files. Check that the names of files are correct"
26 | sys.exit()
27 | return zip(firsts,seconds)
28 |
29 |
30 | def revcomp(s):
31 | "returns reverse complement of a string"
32 | ss = s[::-1].strip().replace("A","t").replace("T","a").\
33 | replace("C","g").replace("G","c").upper()
34 | return ss
35 |
36 |
37 | def matching(a,b, maxmismatch):
38 | "allows for N base difference between barcodes"
39 | if len(a) == len(b):
40 | t = [a[i]==b[i] for i in range(len(a))]
41 | if t.count(False) <= maxmismatch:
42 | return 1
43 | else:
44 | return 0
45 | else:
46 | return 0
47 |
48 |
49 | def unambig(seq):
50 | """ returns both resolutions of a cut site
51 | that has an ambiguous base in it """
52 | resos = []
53 | D = {"R":("G","A"),
54 | "K":("G","T"),
55 | "S":("G","C"),
56 | "Y":("T","C"),
57 | "W":("T","A"),
58 | "M":("C","A")}
59 | for base in list("RKSYWM"):
60 | if base in seq:
61 | resos.append(seq.replace(base,D[base][0]))
62 | resos.append(seq.replace(base,D[base][1]))
63 | return resos
64 |
65 |
66 | def findbcode(CUT,longB,l):
67 | barcode = 'N'*20
68 | " in case ambiguous base in CUT "
69 | if any([i in CUT for i in list("RKYSWM")]):
70 | CUT = unambig(CUT)
71 | Bs = []
72 | for cut in CUT:
73 | if l[1][0:longB+len(cut)].count(cut) == 1:
74 | barcode = l[1].split(cut)[0].strip()
75 | elif l[1][0:longB+len(cut)].count(cut) == 2:
76 | barcode = cut.join(l[1].split(cut)[0:2]).strip()
77 | else:
78 | barcode = ""
79 | Bs.append(barcode)
80 | longestbar = Bs[map(len,Bs).index(max(map(len,Bs)))]
81 | return longestbar
82 | else:
83 | if l[1][0:longB+len(CUT)].count(CUT) == 1:
84 | barcode = l[1].split(CUT)[0].strip()
85 | elif l[1][0:longB+len(CUT)].count(CUT) == 2:
86 | barcode = CUT.join(l[1].split(CUT)[0:2]).strip()
87 | else:
88 | barcode = ""
89 | return barcode
90 |
91 |
92 |
93 | def barmatch(C, Raws, CUT, datatype, num, maxmismatch, WORK, longB):
94 | """matches reads to barcodes in barcode file
95 | and writes to individual temp files, after all
96 | read files have been split, temp files are collated
97 | into .fq files"""
98 |
99 | #CUT1 = CUT = unambig(CUT)[0]
100 | locus = 0
101 | match = 0
102 | match2 = 0
103 | barcodehits = set()
104 |
105 | " dictionary to record barcode misses"
106 | M = {}
107 | M['_'] = 0
108 |
109 | "read in paired end read files"
110 | if 'pair' in datatype:
111 | if '.gz' in Raws[0][-3:]:
112 | fr1 = gzip.open(Raws[0])
113 | else:
114 | fr1 = open(Raws[0])
115 | if '.gz' in Raws[1][-3:]:
116 | fr2 = gzip.open(Raws[1])
117 | else:
118 | fr2 = open(Raws[1])
119 | R1 = itertools.izip(*[iter(fr1)]*4)
120 | R2 = itertools.izip(*[iter(fr2)]*4)
121 | else:
122 | "read in single end read file"
123 | if '.gz' in Raws[-3:]:
124 | fr1 = gzip.open(Raws)
125 | else:
126 | fr1 = open(Raws)
127 | R1 = itertools.izip(*[iter(fr1)]*4)
128 |
129 | D = {}
130 | DD = {}
131 | while 1:
132 | try: r1 = R1.next()
133 | except StopIteration: break
134 |
135 | "match paired end reads together, for now"
136 | if 'pair' in datatype:
137 | r2 = R2.next()
138 | l = [r.strip() for r in r1]
139 | l = [l[0],l[1],l[2],l[3]]
140 | ll = [r.strip() for r in r2]
141 | ll = [ll[0],ll[1],ll[2],ll[3]]
142 | else:
143 | "make list of four fastq line elements"
144 | l = [r.strip() for r in r1]
145 | l = [l[0],l[1],l[2],l[3]]
146 |
147 | locus += 1
148 |
149 | if 'pair' in datatype:
150 | if longB[1] == 'same':
151 | " bars are all same length"
152 | barcode = l[1][:longB[0]]
153 | else:
154 | " find barcodes"
155 | barcode = findbcode(CUT,longB[0],l)
156 | if barcode:
157 | if barcode in M:
158 | M[barcode] += 1
159 | else:
160 | M[barcode] = 1
161 |
162 | "exclude the read if no cutsite/barcode found"
163 | if barcode in D:
164 | l[1] = l[1][len(barcode):] #barcode.join(l[1].split(barcode)[1:])
165 | l[3] = l[3][len(barcode):]
166 | D[barcode].append("\n".join(l).strip())
167 | DD[barcode].append("\n".join(ll).strip())
168 | match += 1
169 | else:
170 | l[1] = l[1][len(barcode):] #barcode.join(l[1].split(barcode)[1:])
171 | l[3] = l[3][len(barcode):]
172 | D[barcode] = l
173 | DD[barcode] = ll
174 | match += 1
175 |
176 | else:
177 | M["_"] += 1
178 |
179 | else:
180 | if longB[1] == 'same':
181 | if datatype=='2brad':
182 | barcode = l[1][-longB[0]:]
183 | else:
184 | barcode = l[1][:longB[0]]
185 | else:
186 | barcode = findbcode(CUT,longB[0],l)
187 | if barcode:
188 | " tracker of number of occurrences of each barcode"
189 | if barcode in M:
190 | M[barcode] += 1
191 | else:
192 | M[barcode] = 1
193 |
194 | "exclude the read if no cutsite/barcode found"
195 | "saves reads from barcodes to a dictionary D"
196 | if barcode in D:
197 | #l[1] = CUT1+l[1][len(barcode)+len(CUT):]
198 | if datatype=='2brad':
199 | l[1] = l[1][:-len(barcode)]
200 | l[3] = l[3][:-len(barcode)]
201 | else:
202 | l[1] = l[1][len(barcode):]
203 | l[3] = l[3][len(barcode):]
204 | D[barcode].append("\n".join(l).strip())
205 | match += 1
206 | else:
207 | l[1] = l[1][len(barcode):]
208 | l[3] = l[3][len(barcode):]
209 | D[barcode] = l
210 | match += 1
211 | else:
212 | M["_"] += 1
213 |
214 |
215 |         " flush stored reads to file every 50,000 reads "
216 |         " only writes reads whose observed barcode matches a barcode in C within maxmismatch differences "
217 | if not locus % 50000:
218 | for bar in C:
219 | outF1 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R1_'+str(num)+'.gz','ab')
220 | if 'pair' in datatype:
221 | outF2 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R2_'+str(num)+'.gz','ab')
222 | for barcode in D:
223 | if matching(bar,barcode,maxmismatch):
224 | barcodehits.add(barcode)
225 | if D[barcode]:
226 | match2 += len(D[barcode]) ## -3
227 | outF1.write("\n".join(D[barcode])+'\n')
228 | if 'pair' in datatype:
229 | if DD[barcode]:
230 | outF2.write("\n".join(DD[barcode])+'\n')
231 | D[barcode] = []
232 | DD[barcode] = []
233 | outF1.close()
234 | if 'pair' in datatype:
235 | outF2.close()
236 | D[bar] = []
237 | DD[bar] = []
238 |
239 |
240 | "write the remaining reads to file"
241 | for bar in C:
242 | outF1 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R1_'+str(num)+'.gz','ab')
243 | if 'pair' in datatype:
244 | outF2 = gzip.open(WORK+"fastq/."+C[bar]+'.temp_R2_'+str(num)+'.gz','ab')
245 | for barcode in D:
246 | if matching(bar,barcode,maxmismatch):
247 | barcodehits.add(barcode)
248 | if D[barcode]:
249 | match2 += len(D[barcode]) ## -3
250 | outF1.write("\n".join(D[barcode])+'\n')
251 | if 'pair' in datatype:
252 | if DD[barcode]:
253 | outF2.write("\n".join(DD[barcode])+'\n')
254 | D[barcode] = []
255 | DD[barcode] = []
256 | outF1.close()
257 | if 'pair' in datatype:
258 | outF2.close()
259 | D[bar] = []
260 | DD[bar] = []
261 |
262 |
263 | sys.stderr.write(".")
264 | fr1.close()
265 | if 'pair' in datatype:
266 | fr2.close()
267 |
268 | "writes statistics out"
269 | statout = open(WORK+"stats/s1.sorting.txt",'a')
270 | if 'pair' in datatype:
271 | name = Raws[0].split("/")[-1].replace("_R1_","_")
272 | else:
273 | name = Raws.split("/")[-1].replace("_R1_","_")
274 |
275 | match2 = sum([M[i] for i in M if i in barcodehits])
276 | writeit = "%s\t%i\t%i\t%i\n" % (name, locus, match, match2)
277 | statout.write(writeit)
278 | statout.close()
279 | pickout = open(WORK+"fastq/."+name+".pickle","wb")
280 | pickle.dump( M, pickout)
281 | pickout.close()
282 |
283 |
284 | def writefunc(GLOB,Parallel,Bcode,CUT,datatype,maxmismatch,WORK):
285 | "create barcode dictionary"
286 | codetable = open(Bcode, 'r')
287 | codes = [line.strip().split() for line in codetable.readlines()]
288 | C = {}
289 | for line in codes:
290 |         if len(line) >= 2:   ## skip blank or malformed lines
291 | C[line[1].strip().upper()] = line[0]
292 |
293 | " find longest barcode "
294 | keylens = map(len,C.keys())
295 | if len(set(keylens)) == 1:
296 | longB = (keylens[0],'same')
297 | else:
298 | longB = (max(keylens),'diff')
299 |
300 | " check for CUT in barcodes "
301 | CCC = unambig(CUT)
302 | if len(CCC)>1:
303 | for cut in CCC:
304 | if any([cut in i for i in C.keys()]):
305 | print "\n\twarning: CUT site matches within one of the barcodes, "+\
306 | "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
307 | else:
308 | if any([CUT in i for i in C.keys()]):
309 | print "\n\twarning: CUT site matches within one of the barcodes, "+\
310 | "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
311 |
312 | " read in sequence files "
313 |     FS = glob.glob(GLOB)
317 | if 'pair' in datatype:
318 | Raws = combinefiles(GLOB)
319 | else:
320 | Raws = FS
321 |
322 | "send jobs to multiprocess queue"
323 | num = 0
324 | work_queue = multiprocessing.Queue()
325 | submitted = 0
326 | for fs in Raws:
327 | if 'pair' in datatype:
328 | work_queue.put([C, [fs[0],fs[1]], CUT, datatype, num, maxmismatch, WORK, longB])
329 | submitted += 1
330 | else:
331 | work_queue.put([C, fs, CUT, datatype, num, maxmismatch, WORK, longB])
332 | submitted += 1
333 | num += 1
334 |
335 | result_queue = multiprocessing.Queue()
336 |
337 | "spawn workers, give function"
338 | jobs = []
339 | for i in range( min(Parallel,submitted) ):
340 | worker = Worker(work_queue, result_queue, barmatch)
341 | worker.start()
342 | jobs.append(worker)
343 | for job in jobs:
344 | job.join()
345 |
346 | Ms = {}
347 |
348 | if len(glob.glob(WORK+"fastq/.*.pickle")) > 1:
349 | for pick in glob.glob(WORK+"fastq/.*.pickle"):
350 | pickin = open(pick, "rb")
351 | M = pickle.load( pickin )
352 | pickin.close()
353 | for key in M:
354 | if key not in Ms:
355 | Ms[key] = M[key]
356 | else:
357 | Ms[key] += M[key]
358 | os.remove(pick)
359 | elif len(glob.glob(WORK+"fastq/.*.pickle")) == 1:
360 | pick = glob.glob(WORK+"fastq/.*.pickle")[0]
361 | pickin = open(pick, 'rb')
362 | Ms = pickle.load( pickin )
363 | pickin.close()
364 | os.remove(pick)
365 | else:
366 | print "\nno stats file generated"
367 |
368 | Mkeys = Ms.keys()
369 | Mkeys.sort(key=lambda x: Ms[x], reverse=True)
370 |
371 | statout = open(WORK+"stats/s1.sorting.txt",'a')
372 | statout.write("\n\n")
373 | statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")
374 |
375 | Cnames = C.keys()
376 | Cnames.sort()
377 | try: maxl = max(map(len,map(str,Ms.values())))
378 | except ValueError: maxl = 5
379 |
380 | hits = []
381 | for bar in Cnames:
382 | for barcode in Mkeys:
383 | if matching(bar, barcode, maxmismatch):
384 | print >>statout, "%s \t%s \t%s\t%s" % (C[bar], bar, barcode,
385 | str(Ms[barcode])+" "*(maxl+3-len(str(Ms[barcode]))))
386 | hits.append(barcode)
387 |
388 | statout.write("\n")
389 | maxl = max(map(len,Mkeys))
390 | for barcode in Mkeys:
391 | if barcode not in hits:
392 | print >>statout, "nomatch \t%s \t%i" % (barcode+" "*(maxl+3-len(barcode)), Ms[barcode])
393 | statout.close()
394 |
395 |
396 |
397 | def main(Bcode, GLOB, CUT, datatype, Parallel, maxmismatch, WORK):
398 |
399 | if not len(glob.glob(GLOB)) > 0:
400 | sys.stderr.write("\tno data found in "+GLOB+" fix path to the data files\n")
401 | sys.exit()
402 |
403 | "check for previous output"
404 | if not os.path.exists(WORK+'stats'):
405 | os.makedirs(WORK+'stats')
406 | if os.path.exists(WORK+'fastq'):
407 | if os.listdir(WORK+'fastq'):
408 | print ("\n\tfastq/ directory in working directory contains data, move/remove it before running step 1\n")
409 | sys.exit()
410 | else:
411 | os.makedirs(WORK+'fastq')
412 |
413 | if "*" in Bcode:
414 | if len(glob.glob(Bcode)) == 1:
415 | Bcode = glob.glob(Bcode)[0]
416 |
417 | sys.stderr.write("\n\tstep 1: sorting reads by barcode\n\t ")
418 |
419 |     " separate double digest cut sites; only the first read's cut site is needed for now "
420 | if "," in CUT:
421 | CUT,CUT2 = CUT.split(",")
422 |
423 | statout = open(WORK+"stats/s1.sorting.txt",'w')
424 | statout.write("\t".join(["file ","Nreads","cut_found","bar_matched"])+"\n")
425 | statout.close()
426 |
427 | " DO THE BARCODE SORTING "
428 | writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK)
429 |     names = [line.split()[0] for line in open(Bcode).readlines() if line.strip()]
430 |
431 | # " remove tiny sorted temp files "
432 | # if len(glob.glob(GLOB)) > 1:
433 | # for name in names:
434 | # if len(glob.glob(WORK+"fastq/."+name+"*")) > 0:
435 | # "remove very small files, probably errors"
436 | # for ff in glob.glob(WORK+'fastq/.'+name+"*"):
437 | # statinfo = os.stat(ff)
438 | # s = statinfo.st_size
439 | # if s < 1000:
440 | # os.remove(ff)
441 |
442 |
443 | " concatenate temp files "
444 | for name in names:
445 | if len(glob.glob(WORK+"fastq/."+name+"*")) > 0:
446 | os.system("/bin/cat "+WORK+"fastq/."+name+".temp_R1_*.gz > "+WORK+"fastq/"+name+"_R1.fq.gz")
447 | if datatype in ['pairgbs','pairddrad']:
448 | os.system("/bin/cat "+WORK+"fastq/."+name+".temp_R2_*.gz > "+WORK+"fastq/"+name+"_R2.fq.gz")
449 |
450 | if len(glob.glob(WORK+"fastq/*")) > 0:
451 | os.system("/bin/ls "+WORK+"fastq/.*temp_* | xargs /bin/rm" )
452 | if len(glob.glob(WORK+"fastq/*.pickle")) > 0:
453 | os.system("/bin/ls "+WORK+"fastq/.*pickle | xargs /bin/rm" )
454 |
--------------------------------------------------------------------------------
/pyrad/Dtest_5.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy
4 | import sys
5 | import random
6 | import itertools
7 | import multiprocessing
8 | import cPickle as pickle
9 | from potpour import Worker
10 | from Dtest import IUPAC, sample_wr, fillin, makesortfiles
11 |
12 |
13 | def most_common(lst):
14 | return max(set(lst), key=lst.count)
15 |
16 |
17 | def makefreq(patlist):
18 |     " identify which allele in P3 is derived relative to the outgroup "
19 |     " and most frequent, and use that as the SNP. "
20 |     " Also, split alleles into those sampled from P3a vs. P3b "
21 | P = {}
22 | for tax in patlist:
23 | P[tax] = []
24 |
25 | for tax in patlist:
26 | for base in patlist[tax]:
27 | if base in list('ATGC'):
28 | P[tax].append(base[0])
29 | P[tax].append(base[0])
30 | elif base in list("RKSYWM"):
31 | hh = IUPAC(base[0])
32 | for i in hh:
33 | P[tax].append(i)
34 |
35 | " select most common element in outgroup "
36 | if len(set(P['o'])) > 1:
37 | minor = most_common(P['o'])
38 | else:
39 | minor = P['o'][0]
40 |
41 | " select most common element that is not minor "
42 | bases = list(itertools.chain(*P.values()))
43 | majors = [i for i in bases if i != minor]
44 | major = most_common(majors)
45 |
46 | ret = [float(P[i].count(major)) / len(P[i]) for i in ['p1','p2','p3a','p3b','o']]
47 | ret += [float(P['p3a'].count(major)+P['p3b'].count(major))/(len(P['p3a'])+len(P['p3b']))]
48 | return ret
49 |
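   | ## a worked sketch (assuming IUPAC('R') expands to the two bases G and A):
   | ## for patlist = {'p1':['A'], 'p2':['R'], 'p3a':['G'], 'p3b':['G'], 'o':['A']}
   | ## the outgroup allele A is taken as ancestral, G as derived, and makefreq
   | ## returns the derived-allele frequencies [0.0, 0.5, 1.0, 1.0, 0.0] for
   | ## p1, p2, p3a, p3b, o, plus 1.0 for the pooled p3a+p3b frequency.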
50 |
51 |
52 | def Dstat5(Loc,pat):
53 | " check site for patterns and add to Locus object if found"
54 | if len(set(pat)) < 3:
55 | if len(set(pat[2:])) > 1:
56 | minor = pat[-1]
57 | major = [i for i in [pat[2],pat[3]] if i not in pat[4]][0]
58 |
59 | o = 0.
60 | p3ab = 1. if (pat[3] == major) & (pat[2] == major) else 0.
61 | p3b = 1. if pat[3] == major else 0.
62 | p3a = 1. if pat[2] == major else 0.
63 | p2 = 1. if pat[1] == major else 0.
64 | p1 = 1. if pat[0] == major else 0.
65 |
66 | Loc.abbba += ( (1.-p1)*p2*p3ab*(1.-o) )
67 | Loc.babba += ( p1*(1.-p2)*p3ab*(1.-o) )
68 |
69 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) )
70 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) )
71 |
72 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) )
73 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) )
74 |
75 |     return Loc
76 |
77 |
78 |
79 | def polyDstat5(Loc, pat):
80 | ## calculate frequencies
81 | " look at the P3 taxon first for a derived allele "
82 | p1,p2,p3a,p3b,o,p3ab = makefreq(pat)
83 |
84 | Loc.abbba += ( (1.-p1)*p2*p3ab*(1.-o) )
85 | Loc.babba += ( p1*(1.-p2)*p3ab*(1.-o) )
86 |
87 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) )
88 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) )
89 |
90 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) )
91 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) )
92 |
93 | return Loc
94 |
95 |
96 |
97 | def IUAfreq(Loc, L):
98 | patlist = {}
99 | Loc.abbba = 0.
100 | Loc.babba = 0.
101 | Loc.abbaa = 0.
102 | Loc.babaa = 0.
103 | Loc.ababa = 0.
104 | Loc.baaba = 0.
105 |
106 | for col in Loc.seq.transpose():
107 | patlist = fillin(L[0], 'p1', col, Loc.names, patlist)
108 | patlist = fillin(L[1], 'p2', col, Loc.names, patlist)
109 | patlist = fillin(L[2], 'p3a', col, Loc.names, patlist)
110 | patlist = fillin(L[3], 'p3b', col, Loc.names, patlist)
111 | patlist = fillin(L[4], 'o', col, Loc.names, patlist)
112 |
113 | if not any([ all([i in ["N",'-'] for i in patlist['p1']]),
114 | all([i in ["N",'-'] for i in patlist['p2']]),
115 | all([i in ["N",'-'] for i in patlist['p3a']]),
116 | all([i in ["N",'-'] for i in patlist['p3b']]),
117 | all([i in ["N",'-'] for i in patlist['o']]) ]):
118 | if any([ i not in patlist['o'] for i in numpy.dstack((patlist['p3a'],patlist['p3b']))[0][0] ]):
119 | Loc = polyDstat5(Loc, patlist)
120 | else:
121 | None
122 | else:
123 | None
124 | return Loc
125 |
126 |
127 |
128 | def IUA(Loc,L):
129 | Loc.abbba = 0.
130 | Loc.babba = 0.
131 | Loc.abbaa = 0.
132 | Loc.babaa = 0.
133 | Loc.ababa = 0.
134 | Loc.baaba = 0.
135 | for col in Loc.seq.transpose():
136 | if all(i in list("ATGC") for i in col):
137 | Loc = Dstat5(Loc,col)
138 | return Loc
139 |
140 |
141 |
142 | def bootfreq(Ldict, which):
143 | Dft_12 = Dfb_12 = 0
144 | Dft_1 = Dfb_1 = 0
145 | Dft_2 = Dfb_2 = 0
146 | while 1:
147 | try: Lx = Ldict[Ldict.keys()[which.next()]]
148 | except StopIteration: break
149 | Dft_12 += Lx.abbba - Lx.babba
150 | Dfb_12 += Lx.abbba + Lx.babba
151 | Dft_1 += Lx.abbaa - Lx.babaa
152 | Dfb_1 += Lx.abbaa + Lx.babaa
153 | Dft_2 += Lx.ababa - Lx.baaba
154 | Dfb_2 += Lx.ababa + Lx.baaba
155 | D12 = 0.
156 | if Dfb_12 > 0:
157 | D12 = Dft_12/float(Dfb_12)
158 | D1 = 0.
159 | if Dfb_1 > 0:
160 | D1 = Dft_1/float(Dfb_1)
161 | D2 = 0.
162 | if Dfb_2 > 0:
163 | D2 = Dft_2/float(Dfb_2)
164 | return D12, D1, D2
165 |
166 |
167 |
168 | def bootfixed(Ldict, which):
169 | abbba = babba = 0
170 | abbaa = babaa = 0
171 | ababa = baaba = 0
172 | while 1:
173 | try: Lx = Ldict[Ldict.keys()[which.next()]]
174 | except StopIteration: break
175 | abbba += Lx.abbba
176 | babba += Lx.babba
177 | abbaa += Lx.abbaa
178 | babaa += Lx.babaa
179 | ababa += Lx.ababa
180 | baaba += Lx.baaba
181 | D12 = 0.
182 | if abbba + babba > 0:
183 | D12 = float(abbba-babba)/(abbba+babba)
184 | D1 = 0.
185 | if abbaa + babaa > 0:
186 | D1 = float(abbaa-babaa)/(abbaa+babaa)
187 | D2 = 0.
188 | if ababa + baaba > 0:
189 | D2 = float(ababa-baaba)/(ababa+baaba)
190 | return D12, D1, D2
191 |
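   | ## bootstrap sketch: each replicate draws len(Ldict) locus indices with
   | ## replacement via sample_wr, then bootfreq/bootfixed re-sum the site
   | ## patterns over that resample and return the three D-statistics, e.g.
   | ##     which = iter(sample_wr(xrange(len(Ldict)), len(Ldict)))
   | ##     d12, d1, d2 = bootfreq(Ldict, which)
   | ## the standard deviation of the replicates is used in runtest() below as
   | ## the denominator of Z = |D| / std(bootstraps).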
192 |
193 |
194 | class Locus5():
195 | """locus keeps track of position in input file,
196 | variable sites, and D-statistics"""
197 |     def __init__(self):
198 |         self.number = 0
199 |         self.names = []
200 |         self.seq = None
201 |         self.abbba = 0.
202 |         self.babba = 0.
203 |         self.abbaa = 0.
204 |         self.babaa = 0.
205 |         self.ababa = 0.
206 |         self.baaba = 0.
207 | " D-stats for an individual locus "
208 | def D1(self):
209 | if self.abbaa+self.babaa > 0:
210 | return float(self.abbaa-self.babaa)/(self.abbaa+self.babaa)
211 | else:
212 | return 0.0
213 | def D2(self):
214 | if self.ababa+self.baaba > 0:
215 | return float(self.ababa-self.baaba)/(self.ababa+self.baaba)
216 | else:
217 | return 0.0
218 | def D12(self):
219 | if self.abbba+self.babba > 0:
220 | return float(self.abbba-self.babba)/(self.abbba+self.babba)
221 | else:
222 | return 0.0
223 |
224 |
225 |
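   | ## the per-locus counts above are summed over loci in runtest() to give the
   | ## partitioned D-statistics:
   | ##     D12 = (ABBBA - BABBA) / (ABBBA + BABBA)
   | ##     D1  = (ABBAA - BABAA) / (ABBAA + BABAA)
   | ##     D2  = (ABABA - BAABA) / (ABABA + BAABA)
   | ## e.g. a positive D1 reflects an excess of sites where P2 shares the derived
   | ## allele with P3a (ABBAA) over sites where P1 does (BABAA).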
226 |
227 |
228 |
229 | def makeSNP(L, snpfreq, loci):
230 | Ndict = {}
231 | num = 0
232 | for loc in loci:
233 | Loc = Locus5()
234 | Loc.number = num
235 |
236 | " only select loci that have data for all five tiptaxa "
237 |         names = [i.split()[0].replace(">","") for i in loc.rstrip().split("\n")[1:-1]]  ## rows stay aligned with N below
238 | if snpfreq:
239 | Loc.names = [i for i in names if i in list(itertools.chain(*L))]
240 | else:
241 | Loc.names = L #[i for i in names if i in L]
242 |
243 | " if snpfreq only need one of possibly multiple individuals"
244 | keep = 0
245 |
246 | if snpfreq:
247 |             " require data from at least one individual in each of the five groups "
248 |             z = any([tax in Loc.names for tax in L[0]])
249 |             y = any([tax in Loc.names for tax in L[1]])
250 |             x = any([tax in Loc.names for tax in L[2]])
251 |             w = any([tax in Loc.names for tax in L[3]])
252 |             u = any([tax in Loc.names for tax in L[4]])
253 |             if all([z,y,x,w,u]):
254 |                 keep = 1
255 |
256 | else:
257 | if all(tax in names for tax in Loc.names):
258 | keep = 1
259 |
260 | if keep:
261 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]])
262 | " only select sites with synapomorphies "
263 | # select all variable
264 | N[-1] = list(N[-1].tostring().replace("-","*"))
265 | N = N[:, N[-1] == "*"]
266 |
267 | " only select rows with focal taxa "
268 | Loc.seq = N[[names.index(i) for i in Loc.names],:]
269 | Ndict[num] = Loc
270 | num += 1
271 | return Ndict
272 |
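   | ## input sketch: each element of 'loci' is one block of the .loci file, with
   | ## one ">name sequence" line per sample followed by a marker line in which
   | ## variable sites are flagged with "-" or "*"; makeSNP converts "-" to "*"
   | ## and keeps only the flagged (variable) columns for the requested taxa.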
273 |
274 |
275 | def runtest(infile, L, nboots, snpfreq, submitted):
276 | " print test "
277 | print L
278 |
279 | " split each locus "
280 | loci = open(infile).read().strip().split("|")[:-1]
281 | loci[0] = "\n"+loci[0]
282 |
283 | " returns a {} of Locus5 objects with data for tiptaxa L "
284 | Ldict = makeSNP(L, snpfreq, loci)
285 |
286 | " calculate discordant patterns for each locus "
287 | for loc in Ldict:
288 | if snpfreq:
289 | Ldict[loc] = IUAfreq(Ldict[loc],L)
290 | else:
291 | Ldict[loc] = IUA(Ldict[loc],L)
292 | ################################################
293 |
294 | " final D12 "
295 | dft_12 = sum([Ldict[l].abbba - Ldict[l].babba for l in Ldict])
296 | dbt_12 = sum([Ldict[l].abbba + Ldict[l].babba for l in Ldict])
297 | if dbt_12 > 0:
298 | D12 = float(dft_12)/dbt_12
299 | else: D12 = 0.
300 |
301 | " final D1 "
302 | dft_1 = sum([Ldict[l].abbaa - Ldict[l].babaa for l in Ldict])
303 | dbt_1 = sum([Ldict[l].abbaa + Ldict[l].babaa for l in Ldict])
304 | if dbt_1 > 0:
305 | D1 = float(dft_1)/dbt_1
306 | else: D1 = 0.
307 |
308 | " final D2 "
309 | dft_2 = sum([Ldict[l].ababa - Ldict[l].baaba for l in Ldict])
310 | dbt_2 = sum([Ldict[l].ababa + Ldict[l].baaba for l in Ldict])
311 | if dbt_2 > 0:
312 | D2 = float(dft_2)/dbt_2
313 | else: D2 = 0.
314 |
315 | " proportion of discordant loci "
316 | try: pdisc = len([i for i in Ldict if any([Ldict[i].D12(),Ldict[i].D1(),Ldict[i].D2()])]) / float(len(Ldict))
317 |     except ZeroDivisionError:
318 | pdisc = 0.0
319 |
320 | #################################################
321 |
322 | " do bootstrapping "
323 | BB12 = []
324 | BB1 = []
325 | BB2 = []
326 | for i in xrange(nboots):
327 | which = iter(sample_wr(xrange(len(Ldict)), len(Ldict)))
328 | if snpfreq:
329 | bb12,bb1,bb2 = bootfreq(Ldict, which)
330 | else:
331 | #bb12,bb1,bb2 = bootfixed(Ldict, which)
332 | bb12,bb1,bb2 = bootfreq(Ldict, which)
333 | BB12.append(bb12)
334 | BB1.append(bb1)
335 | BB2.append(bb2)
336 | STD12 = numpy.std(BB12)
337 | STD1 = numpy.std(BB1)
338 | STD2 = numpy.std(BB2)
339 | ##################################################
340 |
341 | " stats out "
342 | if STD12 > 0:
343 | Z12 = (abs(D12/STD12))
344 | else: Z12 = 0.
345 | if STD1 > 0:
346 | Z1 = (abs(D1/STD1))
347 | else: Z1 = 0.
348 | if STD2 > 0:
349 | Z2 = (abs(D2/STD2))
350 | else: Z2 = 0.
351 |
352 | ## make loci files here
353 | ABBBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() > 0]
354 | BABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() < 0]
355 | ABBAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() > 0]
356 | BABAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() < 0]
357 | ABABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() > 0]
358 | BAABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() < 0]
359 |
360 | " pickle to prevent multiprocessing from freezing on large returns "
361 | ret = [L,
362 | D12,Z12,
363 | D1,Z1,
364 | D2,Z2,
365 | len(Ldict),
366 | sum([Ldict[l].abbba for l in Ldict]),
367 | sum([Ldict[l].babba for l in Ldict]),
368 | sum([Ldict[l].abbaa for l in Ldict]),
369 | sum([Ldict[l].babaa for l in Ldict]),
370 | sum([Ldict[l].ababa for l in Ldict]),
371 | sum([Ldict[l].baaba for l in Ldict]),
372 | pdisc, submitted,
373 | ABBBAloci, BABBAloci,
374 | ABBAAloci, BABAAloci,
375 | ABABAloci, BAABAloci,
376 | BB12, BB1, BB2]
377 | pickle.dump(ret, open(".save."+str(submitted),'wb'))
378 |
379 |
380 |
381 | def checktaxa(taxalist,alignfile):
382 |     " confirm that every requested taxon name occurs in the sequence file "
383 |     with open(alignfile) as infile:
384 |         data = infile.readlines()
385 |     taxainfile = set()
386 |     for line in data:
387 |         if ">" in line:
388 |             taxainfile.add(line.split(" ")[0].replace(">",""))
389 |     " flatten nested taxon lists before checking membership "
390 |     wanted = set(itertools.chain(*[i if isinstance(i,list) else [i] for i in taxalist]))
391 |     if not wanted.difference(taxainfile):
392 |         return 1
392 |
393 |
394 |
395 |
396 | def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots):
397 | work_queue = multiprocessing.Queue()
398 | result_queue = multiprocessing.Queue()
399 | submitted = 0
400 | Notes = []
401 | for rep in subtests:
402 | notes = ""
403 | if len(rep) == 2:
404 | rep,notes = rep
405 | p1,p2,p3a,p3b,o = rep
406 | if all(["[" in i for i in rep[1:]]):
407 | p1 = p1[1:-1].split(",")
408 | p2 = p2[1:-1].split(",")
409 | p3a = p3a[1:-1].split(",")
410 | p3b = p3b[1:-1].split(",")
411 | o = o[1:-1].split(",")
412 | if checktaxa([p1,p2,p3a,p3b,o],alignfile):
413 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted])
414 | submitted += 1
415 | else:
416 | print 'a taxon name was found that is not in the sequence file'
417 | else:
418 | if checktaxa([p1,p2,p3a,p3b,o],alignfile):
419 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted])
420 | submitted += 1
421 | else:
422 | print 'a taxon name was found that is not in the sequence file'
423 | Notes.append(notes)
424 |
425 | jobs = []
426 | for i in range(min(submitted,nproc)):
427 | worker = Worker(work_queue, result_queue, runtest)
428 | jobs.append(worker)
429 | worker.start()
430 | for j in jobs:
431 | j.join()
432 |
433 |
434 | " read results back in "
435 | #Results = [result_queue.get() for i in range(submitted)]
436 | Results = [pickle.load(open(".save."+str(i),'rb')) for i in range(submitted)]
437 | Results.sort(key = lambda x:x[15])
438 |
439 |
440 | " setup results file "
441 | outs = open(outfile+".partD.txt", 'w')
442 | header = "\t".join([ 'p1'+" "*(namelen[0]-2),
443 | 'p2'+" "*(namelen[1]-2),
444 | 'p3_1'+" "*(namelen[2]-4),
445 | 'p3_2'+" "*(namelen[3]-4),
446 | 'O'+" "*(namelen[4]-1),
447 | 'D_12','D_1','D_2',
448 | 'Z_12','Z_1','Z_2',
449 | 'BABBA','ABBBA',
450 | 'BABAA','ABBAA',
451 | 'BAABA','ABABA',
452 | 'nloci','pdisc', 'notes'])
453 |
454 | print >>outs, header
455 |
456 |
457 | for i in range(len(Results)):
458 | L,D12,Z12,D1,Z1,D2,Z2,nloc,ABBBA,BABBA,ABBAA,BABAA,ABABA,BAABA,pdisc,sub,ABBBAloci,BABBAloci,ABBAAloci,BABAAloci,ABABAloci,BAABAloci,BB12,BB1,BB2 = Results[i]
459 | L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]
460 |
461 | resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
462 | str(L[1])+" "*(namelen[1]-len(str(L[1]))),
463 | str(L[2])+" "*(namelen[2]-len(str(L[2]))),
464 | str(L[3])+" "*(namelen[3]-len(str(L[3]))),
465 | str(L[4])+" "*(namelen[4]-len(str(L[4]))),
466 | D12, D1, D2, Z12, Z1, Z2,
467 | BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA,
468 | nloc, pdisc, Notes[i]])
469 |
470 | print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin
471 |
472 | loci = open(alignfile).read().strip().split("|")[:-1]
473 | if makesort:
474 | makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
475 | makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)
476 | makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)
477 | makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
478 | makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
479 | makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)
480 |
481 | if makeboots:
482 | with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out:
483 | out.write(",".join(map(str,BB12)))
484 | with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out:
485 | out.write(",".join(map(str,BB1)))
486 | with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out:
487 | out.write(",".join(map(str,BB2)))
488 |
489 |
490 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots):
491 | import sys
492 |
493 | P1namelen = max(map(len,[str(i[0][0]) for i in tests]))
494 | P2namelen = max(map(len,[str(i[0][1]) for i in tests]))
495 | P3anamelen = max(map(len,[str(i[0][2]) for i in tests]))
496 | P3bnamelen = max(map(len,[str(i[0][3]) for i in tests]))
497 | Onamelen = max(map(len,[str(i[0][4]).strip() for i in tests]))
498 | namelen = [P1namelen,P2namelen,P3anamelen,P3bnamelen,Onamelen]
499 |
500 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots)
501 |
502 |
503 | if __name__ == '__main__':
504 | main()
505 |
506 |
507 |
508 |
509 |
--------------------------------------------------------------------------------
/pyrad/consensdp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | import multiprocessing
3 | import glob
4 | import itertools
5 | import sys
6 | import scipy.stats
7 | import scipy.misc
8 | import numpy
9 | import os
10 | import operator
11 | import gzip
12 | from potpour import Worker
13 |
14 |
15 | def binomprobr(n1,n2,e,r):
16 | """
17 |     given the counts n1 and n2 of the two most
18 |     common bases observed at a site, the error
19 |     rate e, and the heterozygosity prior r, the
20 |     probability that the site is truly aa, bb,
21 |     or ab is calculated from the binomial
22 |     distribution as in Li et al. 2009, 2011. If
23 |     coverage is > 500, 500 reads are randomly sampled.
24 | """
25 |     maf = n1/float(n1+n2)
26 | prior_homo = ((1.-r)/2.)
27 | prior_het = r
28 | ab = scipy.misc.comb(n1+n2,n1)/(2.**(n1+n2))
29 | aa= scipy.stats.binom.pmf(n1,n1+n2,e)
30 | bb= scipy.stats.binom.pmf(n2,n1+n2,e)
31 | aa = aa*prior_homo
32 | bb = bb*prior_homo
33 | ab = ab*prior_het
34 | Q = [aa,bb,ab]
35 | Qn = ['aa','bb','ab']
36 | P = max(Q)/float(aa+bb+ab)
37 | return [P,maf,Qn[Q.index(max(Q))]]
38 |
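   | ## a rough worked example of the caller above: with n1=8, n2=2, e=0.001 and
   | ## heterozygosity prior r=0.01 the heterozygote term dominates
   | ## (comb(10,8)/2**10 * r ~ 4.4e-4 versus ~ 2.2e-5 for the best homozygote),
   | ## so the call is 'ab' with P ~ 0.95; with n1=9, n2=1 the homozygote term
   | ## wins and the single minor read is treated as a likely error (P ~ 0.98).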
39 |
40 | def simpleconsens(n1,n2):
41 | """
42 | majority consensus calling for sites
43 | with too low of coverage for
44 | statistical calling. Only used
45 | with 'lowcounts' option.
46 | """
47 | Qn = ['aa','bb','ab']
48 |     maf = n1/float(n1+n2)
49 | # if not n2:
50 | # P = 1.0
51 | # aa = 1.0
52 | # ab = bb = 0.0
53 | # else:
54 | # P = 0.99
55 | # aa = bb = 0.0
56 | # ab = 1.0
57 | ## create an option that saves
58 | ## frequencies. Useful for pooled sample data sets.
59 | #Q = [aa,bb,ab]
60 | #return [P,Qn[Q.index(max(Q))]]
61 | return [1.0,maf,'aa']
62 |
63 |
64 | def hetero(n1,n2):
65 | """
66 | returns IUPAC symbol for ambiguity bases,
67 | used for polymorphic sites.
68 | """
69 | D = {('G','A'):"R",
70 | ('G','T'):"K",
71 | ('G','C'):"S",
72 | ('T','C'):"Y",
73 | ('T','A'):"W",
74 | ('C','A'):"M"}
75 | a = D.get((n1,n2))
76 | b = D.get((n2,n1))
77 | if a:
78 | return a
79 | else:
80 | return b
81 |
82 |
83 | def unhetero(amb):
84 | amb = amb.upper()
85 | " returns bases from ambiguity code"
86 | D = {"R":("G","A"),
87 | "K":("G","T"),
88 | "S":("G","C"),
89 | "Y":("T","C"),
90 | "W":("T","A"),
91 | "M":("C","A")}
92 | return D.get(amb)
93 |
94 |
95 | def uplow(b1):
96 | " allele precedence "
97 | " G > T > C > A "
98 | D = {('G','A'):"G",
99 | ('A','G'):"G",
100 | ('G','T'):"G",
101 | ('T','G'):"G",
102 | ('G','C'):"G",
103 | ('C','G'):"G",
104 | ('T','C'):"T",
105 | ('C','T'):"T",
106 | ('T','A'):"T",
107 | ('A','T'):"T",
108 | ('C','A'):"C",
109 | ('A','C'):"C"}
110 | r = D.get(b1)
111 | if not r:
112 | r = b1[0]
113 | return r
114 |
115 |
116 |
117 | def findalleles(consensus,sss,bbb):
118 | cons = list(consensus)
119 | " relative to first base "
120 | bigbase = uplow(tuple([i.split("_")[0] for i in bbb]))
121 | bigallele = bbb.index([i for i in bbb if i.split("_")[0] == bigbase][0])
122 | for k in range(1,len(sss)):
123 | c = uplow(tuple([i.split("_")[k] for i in bbb]))
124 | which = bbb.index([i for i in bbb if i.split("_")[k] == c][0])
125 | if bbb[bigallele] != bbb[which]:
126 | cons[sss[k]] = cons[sss[k]].lower()
127 | return "".join(cons)
128 |
129 |
130 | def breakalleles(consensus):
131 | """
132 | break ambiguity code consensus seqs
133 | into two alleles
134 | """
135 | a1 = ""
136 | a2 = ""
137 | #bigbase = ""
138 | for base in consensus:
139 | if base in tuple("RKSYWM"):
140 | a,b = unhetero(base)
141 | d = set([a,b])
142 | a1 += uplow((a,b))
143 | a2 += d.difference(uplow((a,b))).pop()
144 | #if not bigbase:
145 | # bigbase = uplow((a,b))
146 | elif base in tuple("rksywm"):
147 | a,b = unhetero(base)
148 | d = set([a,b])
149 | a2 += uplow((a,b))
150 | a1 += d.difference(uplow((a,b))).pop()
151 | else:
152 | a1 += base
153 | a2 += base
154 | return a1, a2
155 |
156 |
157 | # correct G A C
158 | # >1D_0_0 CCTGCGTCGGGG G ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT T CAAC G ACGTGACGCCGAACACCACCTCTAACC
159 | # >1D_0_1 CCTGCGTCGGGG T ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT A CAAC C ACGTGACGCCGAACACCACCTCTAACC
160 |
161 | # >1D_0 CCTGCGTCGGGG K ATCCGTCTTATCTAAGCGGACAATAGCGGCAAACGCTCATAGT W CAAC S ACGTGACGCCGAACACCACCTCTAACC
162 |
163 | def stack(D):
164 | """
165 | from list of bases at a site D,
166 | returns an ordered list of counts of bases
167 | """
168 | L = len(D)
169 | counts = []
170 | for i in range(len(D[0])):
171 | A=C=T=G=N=M=X=0
172 | for nseq in range(L):
173 | A += D[nseq][i].count("A")
174 | C += D[nseq][i].count("C")
175 | T += D[nseq][i].count("T")
176 | G += D[nseq][i].count("G")
177 | N += D[nseq][i].count("N")
178 | M += D[nseq][i].count("-")
179 | X += D[nseq][i].count("X")
180 | counts.append( [[A,C,T,G],N,M,X] )
181 | return counts
182 |
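   | ## e.g. stack([("A","T"), ("A","T"), ("A","C")]) returns
   | ##     [[[3, 0, 0, 0], 0, 0, 0], [[0, 1, 2, 0], 0, 0, 0]]
   | ## i.e. per-site counts ordered [A,C,T,G] followed by the N, "-" and X tallies.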
183 |
184 | # def ffmin(x):
185 | # d = []
186 | # for i,j in enumerate(x):
187 | # if j not in ["-","N"]:
188 | # d.append(i)
189 | # return min(d)
190 |
191 | # def ffmax(x):
192 | # d = []
193 | # for i,j in enumerate(x):
194 | # if j not in ["-","N"]:
195 | # d.append(i)
196 | # return max(d)
197 |
198 |
199 | def removerepeat_Ns(shortcon):
200 | """
201 | checks for interior Ns in consensus seqs
202 | remove those that arise next to *single repeats*
203 | of at least 3 bases on either side, which may be
204 | sequencing errors on deep coverage repeats """
205 | Nlocs = [i for i,j in enumerate(shortcon) if j=="N"]
206 | repeats = set()
207 | for n in Nlocs:
208 | r1 = len(set(list(shortcon)[n-3:n]))
209 | if r1 < 2:
210 | repeats.add(n)
211 | r2 = len(set(list(shortcon)[n+1:n+4]))
212 | if r2 < 2:
213 | repeats.add(n)
214 | return "".join([j for (i,j) in enumerate(shortcon) if i not in repeats])
215 |
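   | ## e.g. removerepeat_Ns("TTTNCAGG") returns "TTTCAGG" because the interior N
   | ## borders a single-base repeat (TTT) and is treated as a likely error, while
   | ## removerepeat_Ns("ATGNCAGG") is returned unchanged since both flanks of the
   | ## N contain more than one distinct base.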
216 |
217 |
218 |
219 | def consensus(infile,E,H,mindepth,maxN,maxH,datatype,
220 | haplos,CUT,upperSD,strict,lowcounts):
221 | """
222 | from a clust file f,
223 | reads in all copies at a locus and sorts
224 | bases at each site, tests for errors at the
225 | site according to error rate, calls consensus
226 | """
227 | f = gzip.open(infile)
228 | k = itertools.izip(*[iter(f)]*2)
229 | bases = ['A','C','T','G']
230 | Dic = {}
231 | Errors = []
232 | haplo = []
233 | #Plist = []
234 | locus = minsamplocus = npoly = P = 0
235 | while 1:
236 | try: first = k.next()
237 | except StopIteration: break
238 | itera = [first[0],first[1]]
239 | fname = itera[0].strip().split(";")[0]
240 | leftjust = rightjust = None
241 |
242 | " lists and variables for this locus"
243 | S = [] ## list for sequence data
244 | alleles = [] ## for measuring # alleles, detect paralogs
245 | locus += 1 ## recording n loci
246 | ploidy = 0 ## for measuring # alleles, detect paralogs
247 | nHs = 0 ## will record heterozygous sites in this locus
248 | consensus = "" ## empty vector for consensus sequence
249 | basenumber = 1 ## for recording error locations
250 | lefts = []
251 | rights = []
252 | while itera[0] != "//\n":
253 | " append sequence * number of dereps "
254 | nreps = int(itera[0].strip().split(";")[1].replace("size=",""))
255 | for i in xrange(nreps):
256 | S.append(tuple(itera[1].strip()))
257 | #print i, itera[1].strip(), itera[0].strip()[-1], leftjust, rights
258 |
259 | " record left and right most index of seed and hits (for GBS) "
260 | if datatype in ['gbs','merged']:
261 | " leftjust is seed's left "
262 | if itera[0].strip()[-1] == ";":
263 | leftjust = itera[1].index([i for i in itera[1] if i not in list("-N")][0])
264 |
265 | " rightjust is the shortest reverse hit "
266 | if itera[0].strip()[-1] == "-":
267 |                     " record the rightmost called base of this reverse-oriented hit "
268 |                     hit = [itera[1].rindex(i) for i in "ATGC" if i in itera[1]]
269 |                     if hit:
270 |                         rights.append(max(hit))
271 |
272 | itera = k.next()
273 |
274 | " trim off overhang edges of gbs reads "
275 | if datatype in ['gbs','merged']:
276 | if rights:
277 | " record in name that there was a reverse hit"
278 | fname = "_".join(fname.split("_")[0:-1])+"_c1"
279 |                 try: rightjust = min(rights)
280 | except ValueError:
281 | S = ""
282 |
283 | for s in xrange(len(S)):
284 | S[s] = S[s][leftjust:]
285 | if rightjust:
286 | #print rights, rightjust, 'right,just'
287 | S[s] = S[s][:rightjust+1]
288 |
289 | #for i in S:
290 | # print "".join(i)
291 |
292 | #if any([i < leftjust for i in lefts]):
293 | # fname = "_".join(fname.split("_")[0:-1])+"_c1"
294 | #print "".join(list(S[s])), "new"
295 |
296 | " Apply depth and paralog filters "
297 | if (len(S) >= min(lowcounts,mindepth)) and (len(S) < upperSD):
298 | minsamplocus += 1
299 | RAD = stack(S)
300 | for site in RAD:
301 | nchanged = 0
302 |
303 | " minimum depth of coverage for base calling "
304 | depthofcoverage = sum(site[0])
305 | if depthofcoverage < min(mindepth,lowcounts):
306 | cons = "N"; n1 = depthofcoverage-1; n2=0 ## prevents zero division error.
307 | else:
308 | n1,n2,n3,n4 = sorted(site[0],reverse=True)
309 |
310 | " speed hack = if diploid exclude if a third base present at > 20% "
311 | quickthirdbasetest = 0
312 | if haplos == 2:
313 | if float(n3)/(n1+n2+n3+n4) > 0.20:
314 | quickthirdbasetest = 1
315 | if not quickthirdbasetest:
316 |
317 | """ if depth > 500 reduce by some factor for base calling """
318 | if n1+n2 >= 500: ## if > 500, random sample 500
319 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2))
320 | numpy.random.shuffle(firstfivehundred)
321 | nchanged = 1
322 | oldn1 = n1
323 | oldn2 = n2
324 | n1 = list(firstfivehundred[:500]).count("A")
325 | n2 = list(firstfivehundred[:500]).count("B")
326 |
327 | """ make base calls using... """
328 | if n1+n2 >= mindepth:
329 | """ if above stat minimum """
330 | P,maf,who = binomprobr(n1,n2,float(E),H)
331 | elif n1+n2 >= lowcounts:
332 | """ if above maj rule minimum"""
333 | P,maf,who = simpleconsens(n1,n2)
334 |
335 | """ if the base could be called with 95% probability """
336 | if float(P) >= 0.95:
337 | if who in 'ab':
338 | if nchanged:
339 | a = [i for i,l in enumerate(site[0]) if l == oldn1]
340 | else:
341 | a = [i for i,l in enumerate(site[0]) if l == n1]
342 | if len(a)==2: ## alleles came up equal freq.
343 | cons = hetero(bases[a[0]],bases[a[1]])
344 | alleles.append(basenumber)
345 | else: ## alleles came up diff freq.
346 | if nchanged:
347 | b= [i for i,l in enumerate(site[0]) if l == oldn2]
348 | else:
349 | b= [i for i,l in enumerate(site[0]) if l == n2]
350 | "if three alleles came up equal, only need if diploid paralog filter off"
351 | if a == b:
352 | cons = hetero(bases[a[0]],bases[a[1]])
353 | else:
354 | cons = hetero(bases[a[0]],bases[b[0]])
355 | alleles.append(basenumber)
356 | nHs += 1
357 | else:
358 | if nchanged:
359 | cons = bases[site[0].index(oldn1)]
360 | else:
361 | cons = bases[site[0].index(n1)]
362 | else:
363 | cons = "N"
364 | else:
365 | "paralog flag"
366 | cons = "@"
367 | consensus += cons
368 | basenumber += 1
369 |
370 | " only allow maxH polymorphic sites in a locus "
371 | if "@" not in consensus:
372 | if nHs <= maxH:
373 | " filter to limit to N haplotypes (e.g., diploid) "
374 | if haplos:
375 | al = []
376 | if len(alleles) > 1:
377 | for i in S:
378 | d = ""
379 | for z in alleles:
380 | if i[z-1] in unhetero(consensus[z-1]):
381 | d += i[z-1]+"_"
382 | if "N" not in d:
383 | if d.count("_") == len(alleles):
384 | al.append(d.rstrip("_"))
385 |
386 | " remove very rare thirds representing a possible error at a heterozygous site \
387 | that changed the base to the alternate allele at that site "
388 | #if len(al) >= 50:
389 | al = [i for i in al if al.count(i) > len(al)*.25]
390 |
391 | AL = sorted(set(al), key=al.count)
392 | ploidy = len(AL)
393 | #Plist.append(ploidy)
394 |
395 | " set correct alleles relative to first polymorphic base"
396 | if AL:
397 | if ploidy <= haplos:
398 | sss = [zz-1 for zz in alleles]
399 | consensus = findalleles(consensus,sss,AL)
400 | else:
401 | consensus += "@E"
402 | # print ploidy, haplos
403 | # print alleles
404 | # print "AL", AL
405 | # print "al", al
406 |
407 | #else: Plist.append(1)
408 |
409 | if "@" not in consensus:
410 | " strip N's from either end "
411 | shortcon = consensus.lstrip("N").rstrip("N").replace("-","N")
412 | shortcon = removerepeat_Ns(shortcon)
413 | if shortcon.count("N") <= maxN: ## only allow maxN internal "N"s in a locus
414 |                 if len(shortcon) >= 32:  ## minimum length set to 32
415 | #print shortcon, 'keep'
416 | npoly += nHs
417 | Dic[fname] = shortcon
418 |
419 |
420 | #with open(infile.replace(".clustS",".ploids"),'w+') as ploidout:
421 | # ploidout.write(",".join(map(str,Plist)))
422 |
423 | consens = gzip.open(infile.replace(".clustS",".consens"),'w+')
424 | for i in Dic.items():
425 | consens.write(str(i[0])+'\n'+str(i[1])+"\n")
426 | consens.close()
427 | sys.stderr.write(".")
428 |
429 |
430 | if datatype in ['pairgbs','pairddrad']:
431 |         " -4 for the 4-base separator between paired reads "
432 | nsites = sum([len(i)-len(CUT)-4 for i in Dic.values()])
433 | else:
434 | nsites = sum([len(i)-len(CUT) for i in Dic.values()])
435 | ldic = len(Dic)
436 | try: NP = npoly/float(nsites)
437 | except ZeroDivisionError: NP = 0
438 | return [infile.split('/')[-1], locus, minsamplocus, ldic, nsites, npoly, round(NP,7)]
439 |
440 |
441 |
442 | def upSD(handle,mindepth):
443 | " function to calculate mean and SD of clustersize"
444 | infile = gzip.open(handle)
445 | L = itertools.izip(*[iter(infile)]*2)
446 | a = L.next()[0].strip()
447 | depth = []
448 | d = int(a.split(";")[1].replace("size=",""))
449 | while 1:
450 | try: a = L.next()[0].strip()
451 | except StopIteration: break
452 | if a != "//":
453 | d += int(a.split(";")[1].replace("size=",""))
454 | else:
455 | depth.append(d)
456 | d = 0
457 | infile.close()
458 | keep = [i for i in depth if i>=(mindepth)]
459 | if keep:
460 | me = numpy.mean(keep)
461 | std = numpy.std(keep)
462 | else:
463 | me = 0.0
464 | std = 0.0
465 | return me, std
466 |
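   | ## e.g. for clusters whose summed "size=" fields are 10, 12 and 50 with
   | ## mindepth=6, upSD returns roughly (24.0, 18.4); in main() the "2SD"
   | ## maxstack option then sets the upper depth cutoff to max(500, mean + 2.5*SD),
   | ## so small samples are effectively floored at a cutoff of 500 reads.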
467 |
468 | def main(Parallel, E, H, ID, mindepth, subset,
469 | maxN, maxH, haplos, CUT, datatype,
470 | lowcounts, strict, WORK, maxstack):
471 |
472 | " find clust.xx directory "
473 | if not os.path.exists(WORK+'clust'+ID):
474 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
475 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
476 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
477 | sys.exit()
478 |
479 | " load up work queue"
480 | work_queue = multiprocessing.Queue()
481 |
482 | " iterate over files"
483 | outfolder = WORK+'clust'+str(ID)
484 | HH = glob.glob(outfolder+"/"+subset+".clustS*")
485 | stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
486 | sys.stderr.write(stringout)
487 |
488 | if len(HH) > 1:
489 | " sort files by size"
490 | for i in xrange(len(HH)):
491 | statinfo = os.stat(HH[i])
492 | HH[i] = HH[i],statinfo.st_size
493 | HH.sort(key=operator.itemgetter(1))
494 | FS = [f[0] for f in HH][::-1]
495 | else: FS = HH
496 |     REMOVE = glob.glob(outfolder+"/cat.*")
497 | FS = [f for f in FS if f not in REMOVE]
498 | submitted = 0
499 | for handle in FS:
500 | if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
501 | m,sd = upSD(handle,mindepth)
502 | if maxstack == "2SD":
503 | upperSD = max(500,m+(sd*2.5))
504 | else:
505 | upperSD = int(maxstack)
506 | work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
507 | haplos,CUT,upperSD,strict,lowcounts])
508 | submitted += 1
509 | else:
510 | print "\tskipping "+handle.replace(".clustS",".consens")+\
511 | ', it already exists in '+outfolder+"/"
512 |
513 |
514 | " create a queue to pass to workers to store the results"
515 | result_queue = multiprocessing.Queue()
516 |
517 | " spawn workers"
518 | jobs = []
519 | for i in xrange( min(Parallel,submitted) ):
520 | worker = Worker(work_queue, result_queue, consensus)
521 | jobs.append(worker)
522 | worker.start()
523 | for j in jobs:
524 | j.join()
525 |
526 | " get results"
527 | stats = open(WORK+'stats/s5.consens.txt','a+')
528 | print >>stats, "taxon \tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
529 | for i in range(submitted):
530 | a,b,c,d,e,f,g = result_queue.get()
531 | print >> stats, "\t".join(map(str,[a.replace(".clustS.gz","")+" "*(10-len(a)),b,c,d,e,f,g]))
532 | print >>stats, """
533 | ## nloci = number of loci
534 | ## f1loci = number of loci with >N depth coverage
535 | ## f2loci = number of loci with >N depth and passed paralog filter
536 | ## nsites = number of sites across f loci
537 | ## npoly = number of polymorphic sites in nsites
538 | ## poly = frequency of polymorphic sites"""
539 | stats.close()
540 |
541 |
542 |
543 |
544 |
--------------------------------------------------------------------------------
/pyrad/consens_pairs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import multiprocessing
4 | import glob
5 | import itertools
6 | import sys
7 | import scipy.stats
8 | import scipy.misc
9 | import numpy
10 | import os
11 | import operator
12 | import gzip
13 | from potpour import Worker
14 |
15 | from consensdp import binomprobr, simpleconsens, hetero, unhetero, uplow, findalleles,breakalleles, removerepeat_Ns
16 |
17 |
18 | def stack(D):
19 | """
20 | from list of bases at a site D,
21 | returns an ordered list of counts of bases
22 | """
23 | ## TODO: replace with Counter
24 | L = len(D)
25 | counts = []
26 | for i in range(len(D[0])):
27 | A=C=T=G=N=M=X=n=0
28 | for nseq in range(L):
29 | A += D[nseq][i].count("A")
30 | C += D[nseq][i].count("C")
31 | T += D[nseq][i].count("T")
32 | G += D[nseq][i].count("G")
33 | N += D[nseq][i].count("N")
34 | M += D[nseq][i].count("-")
35 | X += D[nseq][i].count("X")
36 | n += D[nseq][i].count("n")
37 | counts.append( [[A,C,T,G],N,M,X,n] )
38 | return counts
39 |
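   | ## e.g. stack([("A","n"), ("A","n")]) returns
   | ##     [[[2, 0, 0, 0], 0, 0, 0, 0], [[0, 0, 0, 0], 0, 0, 0, 2]]
   | ## the trailing tally counts the lowercase "n" spacer that joins read pairs.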
40 |
41 | def consensus(infile,E,H,mindepth,maxN,maxH,datatype,
42 | ploidy,CUT,upperSD,strict,lowcounts):
43 | """
44 | from a clust file f,
45 | reads in all copies at a locus and sorts
46 | bases at each site, tests for errors at the
47 | site according to error rate, calls consensus
48 | """
49 | f = gzip.open(infile,'r')
50 | k = itertools.izip(*[iter(f)]*2)
51 | bases = ['A','C','T','G']
52 | Dic = {}
53 | Errors = []
54 | haplo = []
55 | Plist = []
56 | locus = minsamplocus = npoly = P = 0
57 | while 1:
58 | try: first = k.next()
59 | except StopIteration: break
60 | itera = [first[0],first[1]]
61 | fname = itera[0].strip().split(";")[0]
62 | leftjust = rightjust = None
63 |
64 | " lists and variables for this locus"
65 | S = [] ## list for sequence data
66 | S2 = [] ## list for sequence data
67 | alleles = [] ## for measuring # alleles, detect paralogs
68 | locus += 1 ## recording n loci
69 |             ## 'ploidy' here is the max-alleles parameter passed in; it gates the paralog filters below
70 | nHs = 0 ## will record heterozygous sites in this locus
71 | consensus = "" ## empty vector for consensus sequence
72 | consensus1 = "" ## empty vector for consensus sequence
73 | consensus2 = "" ## empty vector for consensus sequence
74 | basenumber = 1 ## for recording error locations
75 |
76 | while itera[0] != "//\n":
77 | nreps = int(itera[0].strip().split(";")[1].replace("size=",""))
78 |
79 | " append sequence * number of dereps "
80 | for i in xrange(nreps):
81 | " compatibility from pyrad 2 -> 3 "
82 | ss = itera[1].strip().replace("X","n")
83 | S.append(ss)
84 | S2.append(ss)
85 | itera = k.next()
86 |
87 | " separate first and second read clusters "
88 | firsts = [tuple(i.split("n")[0]) for i in S]
89 | seconds = [tuple(i.split("n")[-1]) for i in S]
90 |
91 | " call first read consensus "
92 | " Apply depth and paralog filters "
93 |         if (len(firsts) >= min(mindepth,lowcounts)) and (len(firsts) < upperSD): ## upper limit is mean depth + 2.5*SD (or the fixed maxstack value)
94 | minsamplocus += 1
95 | RAD = stack(firsts)
96 | for site in RAD:
97 | nchanged = 0
98 |
99 | " minimum depth of coverage for base calling at each site"
100 | depthofcoverage = sum(site[0])
101 | if depthofcoverage < min(mindepth,lowcounts):
102 | cons = "N"; n1 = depthofcoverage-1; n2=0 ## prevents zero division error.
103 | else:
104 | n1,n2,n3,n4 = sorted(site[0],reverse=True)
105 |
106 | " speed hack = if diploid exclude if a third base present at > 20% "
107 | quickthirdbasetest = 0
108 | if ploidy == 2:
109 | if float(n3)/(n1+n2+n3+n4) > 0.20:
110 | quickthirdbasetest = 1
111 | if not quickthirdbasetest:
112 |
113 | """ if depth > 500 reduce by some factor for base calling """
114 | if n1+n2 >= 500: ## if > 500, random sample 500
115 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2))
116 | numpy.random.shuffle(firstfivehundred)
117 | nchanged = 1
118 | oldn1 = n1
119 | oldn2 = n2
120 | n1 = list(firstfivehundred[:500]).count("A")
121 | n2 = list(firstfivehundred[:500]).count("B")
122 |
123 | """ if lowcounts, make base calls by majority instead of statistics
124 | when depth is below mindepth """
125 | # if lowcounts: ## include low count sites or no
126 | # if n1+n2 >= 5:
127 | # P,who = binomprobr(n1,n2,float(E),H)
128 | # else:
129 | # P,who = simpleconsens(n1,n2)
130 | # else:
131 | # P,who = binomprobr(n1,n2,float(E),H)
132 | """ make base calls using... """
133 | if n1+n2 >= mindepth:
134 | """ if above stat minimum """
135 | P,maf,who = binomprobr(n1,n2,float(E),H)
136 | elif n1+n2 >= lowcounts:
137 | """ if above maj rule minimum"""
138 | P,maf,who = simpleconsens(n1,n2)
139 |
140 | """ if the base could be called with 95% probability """
141 | if float(P) >= 0.95:
142 | if who in 'ab':
143 | if nchanged:
144 | a = [i for i,l in enumerate(site[0]) if l == oldn1]
145 | else:
146 | a = [i for i,l in enumerate(site[0]) if l == n1]
147 | if len(a)==2: ## alleles came up equal freq.
148 | cons = hetero(bases[a[0]],bases[a[1]])
149 | alleles.append(basenumber)
150 | else: ## alleles came up diff freq.
151 | if nchanged:
152 | b= [i for i,l in enumerate(site[0]) if l == oldn2]
153 | else:
154 | b= [i for i,l in enumerate(site[0]) if l == n2]
155 | "if three alleles came up equal, only need if diploid paralog filter off"
156 | if a == b:
157 | cons = hetero(bases[a[0]],bases[a[1]])
158 | else:
159 | cons = hetero(bases[a[0]],bases[b[0]])
160 | alleles.append(basenumber)
161 | nHs += 1
162 | else:
163 | if nchanged:
164 | cons = bases[site[0].index(oldn1)]
165 | else:
166 | cons = bases[site[0].index(n1)]
167 | else:
168 | cons = "N" ## poor base call
169 | else:
170 | cons = "@" ## third base freq fail
171 | consensus1 += cons
172 | basenumber += 1
173 |
174 |
175 | if "@" not in consensus1:
176 | if consensus1.count("N") <= maxN: ## only allow maxN internal "N"s in a locus
177 | if nHs < maxH: ## only allow maxH Hs, shortcut if first read fail
178 | basenumber += 4 ## separator length
179 |
180 | " call second read consensus "
181 | RAD = stack(seconds)
182 | for site in RAD:
183 | nchanged = 0
184 | " minimum depth of coverage for base calling at each site"
185 | depthofcoverage = sum(site[0])
186 | if depthofcoverage < mindepth:
187 | cons = "N"; n1 = depthofcoverage-1; n2=0
188 | else:
189 | n1,n2,n3,n4 = sorted(site[0],reverse=True)
190 |
191 | " speed hack = if diploid exclude if a third base present at > 20% "
192 | quickthirdbasetest = 0
193 | if ploidy == 2:
194 | if float(n3)/(n1+n2+n3+n4) > 0.20:
195 | quickthirdbasetest = 1
196 | if not quickthirdbasetest:
197 |
198 | """ if depth > 500 reduce by some factor for base calling """
199 | if n1+n2 >= 500: ## if > 500, random sample 500
200 | firstfivehundred = numpy.array(tuple("A"*n1+"B"*n2))
201 | numpy.random.shuffle(firstfivehundred)
202 | nchanged = 1
203 | oldn1 = n1
204 | oldn2 = n2
205 | n1 = list(firstfivehundred[:500]).count("A")
206 | n2 = list(firstfivehundred[:500]).count("B")
207 |
208 | """ make base calls using... """
209 | if n1+n2 >= mindepth:
210 | """ if above stat minimum """
211 | P,maf,who = binomprobr(n1,n2,float(E),H)
212 | elif n1+n2 >= lowcounts:
213 | """ if above maj rule minimum"""
214 | P,maf,who = simpleconsens(n1,n2)
215 |
216 | """ if the base could be called with 95% probability """
217 | if float(P) >= 0.95:
218 | if who in 'ab':
219 | if nchanged:
220 | a = [i for i,l in enumerate(site[0]) if l == oldn1]
221 | else:
222 | a = [i for i,l in enumerate(site[0]) if l == n1]
223 | if len(a)==2: ## alleles came up equal freq.
224 | cons = hetero(bases[a[0]],bases[a[1]])
225 | alleles.append(basenumber)
226 | else: ## alleles came up diff freq.
227 | if nchanged:
228 | b= [i for i,l in enumerate(site[0]) if l == oldn2]
229 | else:
230 | b= [i for i,l in enumerate(site[0]) if l == n2]
231 | "if three alleles came up equal, only need if diploid paralog filter off"
232 | if a == b:
233 | cons = hetero(bases[a[0]],bases[a[1]])
234 | else:
235 | cons = hetero(bases[a[0]],bases[b[0]])
236 | alleles.append(basenumber)
237 | nHs += 1
238 | else:
239 | if nchanged:
240 | cons = bases[site[0].index(oldn1)]
241 | else:
242 | cons = bases[site[0].index(n1)]
243 | else:
244 | cons = "N"
245 | else:
246 | "paralog flag"
247 | cons = "@"
248 | consensus2 += cons
249 | basenumber += 1
250 |
251 |
252 | "create concatenated consensus sequence from pairs "
253 | if "@" not in consensus2:
254 |                     consensus2 = consensus2.replace("-","N")
255 | consensus = consensus1 + "nnnn" + consensus2
256 |
257 |
258 | " filter applies to concatenated sequence "
259 | if consensus:
260 | if "@" not in consensus:
261 | " only allow maxH polymorphic sites in a locus "
262 | if nHs <= maxH:
263 | " filter for number of 2 alleles - diploids "
264 | if ploidy:
265 | al = []
266 | " only check if more than one hetero site present "
267 | if len(alleles) > 1:
268 | for i in S2:
269 | d = ""
270 | for z in alleles:
271 | if i[z-1] in unhetero(consensus[z-1]):
272 | d += i[z-1]+"_"
273 | if "N" not in d:
274 | if d.count("_") == len(alleles):
275 | al.append(d.rstrip("_"))
276 |
277 | " remove allele if it came up less than one in ten "
278 | " in which case it is likely a true heterozygous site "
279 | " but contains a sequencing error also "
280 | ## a hack for now. But very conservative.
281 | #if len(al) >= 5:
282 | # al = [i for i in al if al.count(i) > len(al)/10.]
283 | #TODO allow only 1 bp difference for excludes
284 |
285 | AL = sorted(set(al), key=al.count)
286 | diploid = len(AL)
287 |
288 | " set correct alleles relative to first polymorphic base"
289 | if AL:
290 | if diploid <= ploidy:
291 | sss = [zz-1 for zz in alleles]
292 | consensus = findalleles(consensus,sss,AL)
293 | ## TODO: incorporate option to output alleles for haplos>2
294 | else:
295 | consensus += "@E"
296 | else:
297 | None
298 | else:
299 | consensus += "@P"
300 |
301 | if "@" not in consensus:
302 | #print consensus, nHs
303 | " strip terminal N's from either end "
304 | shortcon1 = consensus1.rstrip("N").replace("-","N")
305 | " remove internal - or N, if low count "
306 | shortcon1 = removerepeat_Ns(shortcon1)
307 | " check for length not trimmed "
308 | if (len(shortcon1) >= 32) and (len(consensus2) >= 32):
309 | Dic[fname] = shortcon1 + "nnnn" +consensus2
310 | npoly += nHs
311 |
312 |
313 |
314 | if ".gz" in infile[-5:]:
315 | consens = gzip.open(infile.replace(".clustS",".consens"),'w')
316 | else:
317 | consens = open(infile.replace(".clustS",".consens"),'w')
318 | for i in Dic.items():
319 | consens.write(str(i[0])+'\n'+str(i[1])+"\n")
320 | consens.close()
321 | sys.stderr.write(".")
322 |
323 | if 'pair' in datatype:
324 | nsites = sum([len(i)-len(CUT)-4 for i in Dic.values()])
325 | else:
326 | nsites = sum([len(i)-len(CUT) for i in Dic.values()])
327 | ldic = len(Dic)
328 | try: NP = npoly/float(nsites)
329 | except ZeroDivisionError: NP = 0
330 | return [infile.split('/')[-1], locus, minsamplocus, ldic, nsites, npoly, round(NP,7)]
331 |
332 |
333 |
334 | def upSD(handle,mindepth):
335 | " function to calculate mean and SD of clustersize"
336 | if ".gz" in handle[-5:]:
337 | infile = gzip.open(handle)
338 | else:
339 | infile = open(handle)
340 | L = itertools.izip(*[iter(infile)]*2)
341 | a = L.next()[0].strip()
342 | depth = []
343 | d = int(a.split(";")[1].replace("size=",""))
344 | while 1:
345 | try: a = L.next()[0].strip()
346 | except StopIteration: break
347 | if a != "//":
348 | d += int(a.split(";")[1].replace("size=",""))
349 | else:
350 | depth.append(d)
351 | d = 0
352 | infile.close()
353 | keep = [i for i in depth if i>=(mindepth)]
354 | if keep:
355 | me = numpy.mean(keep)
356 | std = numpy.std(keep)
357 | else:
358 | me = 0.0
359 | std = 0.0
360 | return me, std
361 |
362 |
363 | def main(Parallel, E, H, ID, mindepth, subset,
364 | maxN, maxH, ploidy, CUT, datatype,
365 | lowcounts, strict, WORK, maxstack):
366 |
367 | " find clust.xx directory "
368 | if not os.path.exists(WORK+'clust'+ID):
369 | print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
370 | "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
371 | "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
372 | sys.exit()
373 |
374 | " create work queue"
375 | work_queue = multiprocessing.Queue()
376 |
377 | " iterate over files"
378 | outfolder = WORK+'clust'+str(ID)
379 | HH = glob.glob(outfolder+"/"+subset+".clustS*")
380 |     stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
381 | sys.stderr.write(stringout)
382 |
383 | if len(HH) > 1:
384 | " sort files by size"
385 | for i in range(len(HH)):
386 | statinfo = os.stat(HH[i])
387 | HH[i] = HH[i],statinfo.st_size
388 | HH.sort(key=operator.itemgetter(1))
389 | FS = [f[0] for f in HH][::-1]
390 | else: FS = HH
391 |     REMOVE = glob.glob(outfolder+"/cat.*")
392 | FS = [f for f in FS if f not in REMOVE]
393 | submitted = 0
394 | for handle in FS:
395 | if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
396 | m,sd = upSD(handle,mindepth)
397 | if maxstack == "2SD":
398 | upperSD = max(500,m+(sd*2.5))
399 | else:
400 | upperSD = int(maxstack)
401 | work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
402 | ploidy,CUT,upperSD,strict,lowcounts])
403 | submitted += 1
404 | else:
405 | print "\tskipping "+handle.replace(".clustS",".consens")+\
406 | ', it already exists in '+outfolder+"/"
407 |
408 |
409 | " create a queue to pass to workers to store the results"
410 | result_queue = multiprocessing.Queue()
411 |
412 | " spawn workers"
413 | jobs = []
414 | for i in range( min(Parallel,submitted) ):
415 | worker = Worker(work_queue, result_queue, consensus)
416 | jobs.append(worker)
417 | worker.start()
418 | for j in jobs:
419 | j.join()
420 |
421 | " get results"
422 | stats = open(WORK+'stats/s5.consens.txt','a+')
423 | print >>stats, "taxon\tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
424 | for i in range(submitted):
425 | a,b,c,d,e,f,g = result_queue.get()
426 | nn = a.replace(".clustS.gz","")
427 | print >> stats, "\t".join(map(str,[nn,b,c,d,e,f,g]))
428 | print >>stats, """
429 | ## nloci = number of loci
430 | ## f1loci = number of loci with >N depth coverage
431 | ## f2loci = number of loci with >N depth and passed paralog filter
432 | ## nsites = number of sites across f loci
433 | ## npoly = number of polymorphic sites in nsites
434 | ## poly = frequency of polymorphic sites"""
435 | stats.close()
436 |
437 |
438 |
439 |
440 |
--------------------------------------------------------------------------------
/pyrad/Dtest_foil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 |
3 | import numpy
4 | import sys
5 | import itertools
6 | import multiprocessing
7 | from potpour import Worker
8 | from Dtest import IUPAC, sample_wr, fillin, makesortfiles
9 |
10 |
11 |
12 | def most_common(lst):
13 | return max(set(lst), key=lst.count)
14 |
15 |
16 | def makefreq(patlist):
17 |     " identify which allele in P3 is derived relative to the outgroup "
18 |     " and most frequent, and use that as the SNP. "
19 |     " Also, split alleles into those sampled from P3a vs. P3b "
20 | P = {}
21 | for tax in patlist:
22 | P[tax] = []
23 |
24 | for tax in patlist:
25 | for base in patlist[tax]:
26 | if base in list('ATGC'):
27 | P[tax].append(base[0])
28 | P[tax].append(base[0])
29 | elif base in list("RKSYWM"):
30 | hh = IUPAC(base[0])
31 | for i in hh:
32 | P[tax].append(i)
33 |
34 | """ select most common element in outgroup if multiple individuals,
35 | if only one ind but two alleles, select the first one """
36 | if len(set(P['o'])) > 1:
37 | minor = most_common(P['o'])
38 | else:
39 | minor = P['o'][0]
40 |
41 | " select most common element that is not minor "
42 | bases = list(itertools.chain(*P.values()))
43 | majors = [i for i in bases if i != minor]
44 | major = most_common(majors)
45 |
46 | ret = [float(P[i].count(major)) / len(P[i]) for i in ['p1','p2','p3a','p3b','o']]
47 | return ret
48 |
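## Editor's note: a self-contained sketch (not part of pyrad) that roughly
## mirrors the frequency logic of makefreq() above, assuming a simplified
## ambiguity map in place of the IUPAC() helper imported from Dtest.py.
## Homozygous calls contribute two copies of an allele, ambiguity codes one of
## each, the outgroup defines the ancestral ("minor") allele, and the most
## common remaining allele is treated as derived.
AMBIG = {"R": "AG", "K": "GT", "S": "CG", "Y": "CT", "W": "AT", "M": "AC"}

def sketch_makefreq(patlist):
    P = {}
    for tax, calls in patlist.items():
        alleles = []
        for base in calls:
            if base in "ATGC":
                alleles += [base, base]        # homozygote: two copies
            elif base in AMBIG:
                alleles += list(AMBIG[base])   # heterozygote: one of each
        P[tax] = alleles
    minor = max(set(P["o"]), key=P["o"].count)                # ancestral allele
    pooled = [b for tax in P for b in P[tax] if b != minor]
    major = max(set(pooled), key=pooled.count)                # derived allele
    return [float(P[t].count(major)) / len(P[t]) for t in ["p1", "p2", "p3a", "p3b", "o"]]

## e.g. sketch_makefreq({"p1": ["A", "A"], "p2": ["R"], "p3a": ["G"],
##                       "p3b": ["G"], "o": ["A"]}) -> [0.0, 0.5, 1.0, 1.0, 0.0]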
49 |
50 |
51 |
52 |
53 | def Dstatfoil(Loc,pat):
54 | " check site for patterns and add to Locus object if found"
55 | if len(set(pat)) < 3:
56 | " only allow biallelic "
57 | minor = pat[-1]
58 |         " the derived allele is whichever base differs from the outgroup base "
59 |         major = [i for i in pat if i != pat[-1]][0]
60 |
61 | o = 0.
62 | p3b = 1. if pat[3] == major else 0.
63 | p3a = 1. if pat[2] == major else 0.
64 | p2 = 1. if pat[1] == major else 0.
65 | p1 = 1. if pat[0] == major else 0.
66 |
67 | ## from partitioned D-stat
68 | Loc.abbba += ( (1.-p1)*p2*p3a*p3b*(1.-o) ) # DFI[5] DOL[5]
69 | Loc.babba += ( p1*(1.-p2)*p3a*p3b*(1.-o) ) # DFI[1] DOL[1]
70 |
71 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[6] DIL[0] DFI[4] DOL[2]
72 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[0] DIL[6] DFI[0] DOL[6]
73 |
74 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[2] DIL[4] DFI[2] DOL[4]
75 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[4] DIL[2] DFI[6] DOL[0]
76 |
77 | ## new to foil, contrast of bbxxa
78 | Loc.bbbaa += ( p1*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[1] DIL[1]
79 | Loc.bbaba += ( p1*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[5] DIL[5]
80 |
81 | ## terminal branch patterns
82 | if not Loc.noterminals:
83 | Loc.aaaba += ( (1.-p1)*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[3] DIL[3]
84 | Loc.aabaa += ( (1.-p1)*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[7] DIL[7]
85 | Loc.abaaa += ( (1.-p1)*p2*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[3] DOL[3]
86 | Loc.baaaa += ( p1*(1.-p2)*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[7] DOL[7]
87 | return Loc
88 |
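## Editor's note: a hedged illustration (not part of pyrad) of how a biallelic,
## homozygous column maps onto the site-pattern counters incremented by
## Dstatfoil() above: 'b' marks the allele that differs from the outgroup base,
## 'a' marks the outgroup base itself.
def site_pattern(pat):
    " pat = (p1, p2, p3a, p3b, o) bases observed at one biallelic site "
    major = [i for i in pat if i != pat[-1]][0]
    return "".join("b" if base == major else "a" for base in pat)

## e.g. site_pattern(("A", "G", "G", "A", "A")) -> "abbaa", so that column adds
## 1.0 to Loc.abbaa; site_pattern(("A", "G", "G", "G", "A")) -> "abbba", which
## adds to Loc.abbba.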
89 |
90 |
91 | def polyDstatfoil(Loc, pat):
92 | ## calculate frequencies
93 | " look at the P3 taxon first for a derived allele "
94 | p1,p2,p3a,p3b,o = makefreq(pat)
95 | # else:
96 | # pat = [1. if base!=pat[-1] else 0. for base in pat]
97 | # p1,p2,p3a,p3b,o = pat
98 |
99 | ## from partitioned D-stat
100 | Loc.abbba += ( (1.-p1)*p2*p3a*p3b*(1.-o) ) # DFI[5] DOL[5]
101 | Loc.babba += ( p1*(1.-p2)*p3a*p3b*(1.-o) ) # DFI[1] DOL[1]
102 |
103 | Loc.abbaa += ( (1.-p1)*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[6] DIL[0] DFI[4] DOL[2]
104 | Loc.babaa += ( p1*(1.-p2)*p3a*(1.-p3b)*(1.-o) ) # DFO[0] DIL[6] DFI[0] DOL[6]
105 |
106 | Loc.ababa += ( (1.-p1)*p2*(1.-p3a)*p3b*(1.-o) ) # DFO[2] DIL[4] DFI[2] DOL[4]
107 | Loc.baaba += ( p1*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[4] DIL[2] DFI[6] DOL[0]
108 |
109 |     ## new to foil, contrast of bbxxa
110 | Loc.bbbaa += ( p1*p2*p3a*(1.-p3b)*(1.-o) ) # DFO[1] DIL[1]
111 |     Loc.bbaba += ( p1*p2*(1.-p3a)*p3b*(1.-o) )      # DFO[5] DIL[5]
112 |
113 | ## terminal branch patterns
114 | if not Loc.noterminals:
115 | Loc.aaaba += ( (1.-p1)*(1.-p2)*(1.-p3a)*p3b*(1.-o) ) # DFO[3] DIL[3]
116 |         Loc.aabaa += ( (1.-p1)*(1.-p2)*p3a*(1.-p3b)*(1.-o) )  # DFO[7] DIL[7]
117 | Loc.abaaa += ( (1.-p1)*p2*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[3] DOL[3]
118 | Loc.baaaa += ( p1*(1.-p2)*(1.-p3a)*(1.-p3b)*(1.-o) ) # DFI[7] DOL[7]
119 |
120 | return Loc
121 |
122 |
123 |
124 | def IUAfreq(Loc, L):
125 | patlist = {}
126 | Loc.abbba = 0.
127 | Loc.babba = 0.
128 | Loc.abbaa = 0.
129 | Loc.babaa = 0.
130 | Loc.ababa = 0.
131 | Loc.baaba = 0.
132 |
133 | Loc.bbbaa = 0.
134 | Loc.bbaba = 0.
135 | Loc.aaaba = 0.
136 | Loc.aabaa = 0.
137 | Loc.abaaa = 0.
138 | Loc.baaaa = 0.
139 |
140 | for col in Loc.seq.transpose():
141 | patlist = fillin(L[0], 'p1', col, Loc.names, patlist)
142 | patlist = fillin(L[1], 'p2', col, Loc.names, patlist)
143 | patlist = fillin(L[2], 'p3a', col, Loc.names, patlist)
144 | patlist = fillin(L[3], 'p3b', col, Loc.names, patlist)
145 | patlist = fillin(L[4], 'o', col, Loc.names, patlist)
146 |
147 | " exclude sites with missing data "
148 | if not any([ all([i in ["N",'-'] for i in patlist['p1']]),
149 | all([i in ["N",'-'] for i in patlist['p2']]),
150 | all([i in ["N",'-'] for i in patlist['p3a']]),
151 | all([i in ["N",'-'] for i in patlist['p3b']]),
152 | all([i in ["N",'-'] for i in patlist['o']]) ]):
153 |             " if the site is not invariant "
154 | isvar = len(set(col)-set(["N","-"])) > 1
155 | if isvar:
156 | " look for patterns in site "
157 | Loc = polyDstatfoil(Loc, patlist)
158 | return Loc
159 |
160 |
161 |
162 | def IUA(Loc,L):
163 | Loc.abbba = 0.
164 | Loc.babba = 0.
165 | Loc.abbaa = 0.
166 | Loc.babaa = 0.
167 | Loc.ababa = 0.
168 | Loc.baaba = 0.
169 |
170 | Loc.bbbaa = 0.
171 | Loc.bbaba = 0.
172 | Loc.aaaba = 0.
173 | Loc.aabaa = 0.
174 | Loc.abaaa = 0.
175 | Loc.baaaa = 0.
176 |
177 | for col in Loc.seq.transpose():
178 | " exclude heterozygous sites "
179 | if all(i in list("ATGC") for i in col):
180 | " if site is not invariable "
181 | if len(set(col)) > 1:
182 | " look for patterns in site "
183 | Loc = Dstatfoil(Loc,col)
184 | return Loc
185 |
186 |
187 |
188 | def bootfreq(Ldict, which):
189 | Dfo_t = Dfo_b = 0.
190 | Dil_t = Dil_b = 0.
191 | Dfi_t = Dfi_b = 0.
192 | Dol_t = Dol_b = 0.
193 | while 1:
194 | try: Lx = Ldict[Ldict.keys()[which.next()]]
195 | except StopIteration: break
196 | " iterate over loci summing top and bottom values of Ds"
197 | Dfo_t += Lx.DFO_t()
198 | Dfo_b += Lx.DFO_b()
199 | Dil_t += Lx.DIL_t()
200 | Dil_b += Lx.DIL_b()
201 | Dfi_t += Lx.DFI_t()
202 | Dfi_b += Lx.DFI_b()
203 | Dol_t += Lx.DOL_t()
204 | Dol_b += Lx.DOL_b()
205 | " take top over bottom values to calc Ds "
206 | DFO = 0.
207 | if Dfo_b > 0:
208 | DFO = Dfo_t/float(Dfo_b)
209 | DIL = 0.
210 | if Dil_b > 0:
211 | DIL = Dil_t/float(Dil_b)
212 | DFI = 0.
213 | if Dfi_b > 0:
214 | DFI = Dfi_t/float(Dfi_b)
215 | DOL = 0.
216 | if Dol_b > 0:
217 | DOL = Dol_t/float(Dol_b)
218 |
219 | return DFO,DIL,DFI,DOL
220 |
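## Editor's note: a minimal, self-contained sketch (not part of pyrad) of the
## bootstrap performed above. bootfreq() resamples whole loci with replacement
## (via sample_wr() from Dtest.py) and recomputes each D from the summed
## per-locus numerators and denominators; the Z-score reported later is |D|
## divided by the standard deviation of the replicates. The helper name and
## defaults here are illustrative only.
import random

def bootstrap_Z(D_obs, locus_tops, locus_bottoms, nboots=1000):
    " locus_tops/locus_bottoms: per-locus numerator and denominator sums "
    nloci = len(locus_tops)
    reps = []
    for _ in range(nboots):
        idx = [random.randint(0, nloci - 1) for _ in range(nloci)]  # with replacement
        top = sum(locus_tops[i] for i in idx)
        bot = sum(locus_bottoms[i] for i in idx)
        reps.append(top / float(bot) if bot > 0 else 0.)
    sd = numpy.std(reps)
    return abs(D_obs / sd) if sd > 0 else 0.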
221 |
222 |
223 | class Locusfoil():
224 | """locus keeps track of position in input file,
225 | variable sites, and D-statistics"""
226 |     def __init__(self):
227 |         " counters start at zero and are filled in by makeSNP and IUA/IUAfreq "
228 |         self.number = None
229 |         self.names = []
230 |         self.seq = None
231 |         self.noterminals = False
232 |         self.abbba = 0.
233 |         self.babba = 0.
234 |         self.abbaa = 0.
235 |         self.babaa = 0.
236 |         self.ababa = 0.
237 |         self.baaba = 0.
238 | 
239 |         self.bbbaa = 0.
240 |         self.bbaba = 0.
241 | 
242 |         self.aaaba = 0.
243 |         self.aabaa = 0.
244 |         self.abaaa = 0.
245 |         self.baaaa = 0.
246 |
247 | """ per-locus top or bottom values of Dstats """
248 | def DFO_t(self):
249 | part1 = [self.babaa,self.bbbaa,self.ababa,self.aaaba]
250 | part2 = [self.baaba,self.bbaba,self.abbaa,self.aabaa]
251 | if self.noterminals:
252 | part1 = part1[:-1]
253 | part2 = part2[:-1]
254 | return float(sum(part1)-sum(part2))
255 |
256 | def DFO_b(self):
257 | part1 = [self.babaa,self.bbbaa,self.ababa,self.aaaba]
258 | part2 = [self.baaba,self.bbaba,self.abbaa,self.aabaa]
259 | if self.noterminals:
260 | part1 = part1[:-1]
261 | part2 = part2[:-1]
262 | return float(sum(part1)+sum(part2))
263 |
264 |
265 | def DIL_t(self):
266 | part1 = [self.abbaa,self.bbbaa,self.baaba,self.aaaba]
267 | part2 = [self.ababa,self.bbaba,self.babaa,self.aabaa]
268 | if self.noterminals:
269 | part1 = part1[:-1]
270 | part2 = part2[:-1]
271 | return float(sum(part1)-sum(part2))
272 |
273 |
274 | def DIL_b(self):
275 | part1 = [self.abbaa,self.bbbaa,self.baaba,self.aaaba]
276 | part2 = [self.ababa,self.bbaba,self.babaa,self.aabaa]
277 | if self.noterminals:
278 | part1 = part1[:-1]
279 | part2 = part2[:-1]
280 | return float(sum(part1)+sum(part2))
281 |
282 |
283 | def DFI_t(self):
284 | part1 = [self.babaa,self.babba,self.ababa,self.abaaa]
285 | part2 = [self.abbaa,self.abbba,self.baaba,self.baaaa]
286 | if self.noterminals:
287 | part1 = part1[:-1]
288 | part2 = part2[:-1]
289 | return float(sum(part1)-sum(part2))
290 |
291 |
292 | def DFI_b(self):
293 | part1 = [self.babaa,self.babba,self.ababa,self.abaaa]
294 | part2 = [self.abbaa,self.abbba,self.baaba,self.baaaa]
295 | if self.noterminals:
296 | part1 = part1[:-1]
297 | part2 = part2[:-1]
298 | return float(sum(part1)+sum(part2))
299 |
300 |
301 | def DOL_t(self):
302 | part1 = [self.baaba,self.babba,self.abbaa,self.abaaa]
303 | part2 = [self.ababa,self.abbba,self.babaa,self.baaaa]
304 | if self.noterminals:
305 | part1 = part1[:-1]
306 | part2 = part2[:-1]
307 | return float(sum(part1)-sum(part2))
308 |
309 |
310 | def DOL_b(self):
311 | part1 = [self.baaba,self.babba,self.abbaa,self.abaaa]
312 | part2 = [self.ababa,self.abbba,self.babaa,self.baaaa]
313 | if self.noterminals:
314 | part1 = part1[:-1]
315 | part2 = part2[:-1]
316 | return float(sum(part1)+sum(part2))
317 |
318 |
319 |
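## Editor's note: a hedged worked example (not part of pyrad) of how the
## per-locus counters feed one statistic, relying on the zero-initialised
## defaults from __init__ above; the counts are hypothetical. With
## babaa=3, bbbaa=1, ababa=2, aaaba=0 versus baaba=1, bbaba=0, abbaa=1,
## aabaa=0, the DFO numerator is (3+1+2+0) - (1+0+1+0) = 4 and the denominator
## is (3+1+2+0) + (1+0+1+0) = 8, so this locus contributes DFO = 4/8 = 0.5.
def _dfo_worked_example():
    loc = Locusfoil()
    loc.babaa, loc.bbbaa, loc.ababa, loc.aaaba = 3., 1., 2., 0.
    loc.baaba, loc.bbaba, loc.abbaa, loc.aabaa = 1., 0., 1., 0.
    return loc.DFO_t(), loc.DFO_b()   # -> (4.0, 8.0)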
320 |
321 |
322 | def makeSNP(L, snpfreq, loci, noterminals):
323 | Ndict = {}
324 | num = 0
325 | for loc in loci:
326 | Loc = Locusfoil()
327 | Loc.noterminals = noterminals
328 | Loc.number = num
329 |
330 | " only select loci that have data for all five tiptaxa "
331 | names = [i.split()[0].replace(">","") for i in loc.lstrip().rstrip().split("\n")[:-1]]
332 | if snpfreq:
333 | Loc.names = [i for i in names if i in list(itertools.chain(*L))]
334 | else:
335 | Loc.names = L #[i for i in names if i in L]
336 |
337 | " if snpfreq only need one of possibly multiple individuals"
338 | keep = 0
339 |
340 |         if snpfreq:
341 |             " require at least one sampled individual from each of the five groups "
342 |             z = any([tax in Loc.names for tax in L[0]])
343 |             y = any([tax in Loc.names for tax in L[1]])
344 |             x = any([tax in Loc.names for tax in L[2]])
345 |             w = any([tax in Loc.names for tax in L[3]])
346 |             u = any([tax in Loc.names for tax in L[4]])
347 |             if all([z,y,x,w,u]):
348 |                 keep = 1
349 | 
350 | else:
351 | if all(tax in names for tax in Loc.names):
352 | keep = 1
353 |
354 | if keep:
355 | N = numpy.array([tuple(i) for i in loc.split("\n")[1:]])
356 |             " keep every variable site: both '-' (autapomorphy) and '*' "
357 |             " (shared polymorphism) marks in the pyrad SNP line are used "
358 | N[-1] = list(N[-1].tostring().replace("-","*"))
359 | N = N[:, N[-1] == "*"]
360 |
361 | " only select rows with focal taxa "
362 | Loc.seq = N[[names.index(i) for i in Loc.names],:]
363 | Ndict[num] = Loc
364 | num += 1
365 | return Ndict
366 |
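## Editor's note: a hedged illustration (not part of pyrad) of the .loci block
## that makeSNP() parses after splitting the file on '|': '>name  sequence'
## rows followed by an annotation row in which '-' marks autapomorphies and
## '*' marks shared polymorphic sites. The toy locus below is illustrative only.
example_locus = "\n".join([
    ">p1    AAGT",
    ">p2    AAGT",
    ">p3a   AGGT",
    ">p3b   AGGT",
    ">o     AAGT",
    "//      *  ",
])
example_names = [row.split()[0].replace(">", "")
                 for row in example_locus.strip().split("\n")[:-1]]
## example_names -> ['p1', 'p2', 'p3a', 'p3b', 'o']; after the column filter in
## makeSNP() only the '*'-marked column (the A/G site shared by p3a and p3b)
## would be retained in Loc.seq.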
367 |
368 |
369 | def runtest(infile, L, nboots, snpfreq, submitted, noterminals):
370 | " print test "
371 | print L
372 |
373 | " split each locus "
374 | loci = open(infile).read().strip().split("|")[:-1]
375 | loci[0] = "\n"+loci[0]
376 |
377 | " returns a {} of Locusfoil objects with data for tiptaxa L "
378 | Ldict = makeSNP(L, snpfreq, loci, noterminals)
379 |
380 | " calculate discordant patterns for each locus "
381 | for loc in Ldict:
382 | if snpfreq:
383 | Ldict[loc] = IUAfreq(Ldict[loc],L)
384 | else:
385 | Ldict[loc] = IUA(Ldict[loc],L)
386 | ################################################
387 |
388 | " final DFO "
389 | DFO_t = sum([(Ldict[l].babaa + Ldict[l].bbbaa + Ldict[l].ababa + Ldict[l].aaaba) -\
390 | (Ldict[l].baaba + Ldict[l].bbaba + Ldict[l].abbaa + Ldict[l].aabaa) for l in Ldict])
391 | DFO_b = sum([(Ldict[l].babaa + Ldict[l].bbbaa + Ldict[l].ababa + Ldict[l].aaaba) + \
392 | (Ldict[l].baaba + Ldict[l].bbaba + Ldict[l].abbaa + Ldict[l].aabaa) for l in Ldict])
393 | if DFO_b > 0:
394 | DFO = float(DFO_t)/DFO_b
395 | else: DFO = 0.
396 |
397 | " final DIL "
398 | DIL_t = sum([(Ldict[l].abbaa + Ldict[l].bbbaa + Ldict[l].baaba + Ldict[l].aaaba) - \
399 | (Ldict[l].ababa + Ldict[l].bbaba + Ldict[l].babaa + Ldict[l].aabaa) for l in Ldict])
400 | DIL_b = sum([(Ldict[l].abbaa + Ldict[l].bbbaa + Ldict[l].baaba + Ldict[l].aaaba) + \
401 | (Ldict[l].ababa + Ldict[l].bbaba + Ldict[l].babaa + Ldict[l].aabaa) for l in Ldict])
402 | if DIL_b > 0:
403 | DIL = float(DIL_t)/DIL_b
404 | else: DIL = 0.
405 |
406 | " final DFI "
407 | DFI_t = sum([(Ldict[l].babaa + Ldict[l].babba + Ldict[l].ababa + Ldict[l].abaaa) - \
408 | (Ldict[l].abbaa + Ldict[l].abbba + Ldict[l].baaba + Ldict[l].baaaa) for l in Ldict])
409 | DFI_b = sum([(Ldict[l].babaa + Ldict[l].babba + Ldict[l].ababa + Ldict[l].abaaa) + \
410 | (Ldict[l].abbaa + Ldict[l].abbba + Ldict[l].baaba + Ldict[l].baaaa) for l in Ldict])
411 | if DFI_b > 0:
412 | DFI = float(DFI_t)/DFI_b
413 | else: DFI = 0.
414 |
415 | " final DOL "
416 | DOL_t = sum([(Ldict[l].baaba + Ldict[l].babba + Ldict[l].abbaa + Ldict[l].abaaa) - \
417 | (Ldict[l].ababa + Ldict[l].abbba + Ldict[l].babaa + Ldict[l].baaaa) for l in Ldict])
418 | DOL_b = sum([(Ldict[l].baaba + Ldict[l].babba + Ldict[l].abbaa + Ldict[l].abaaa) + \
419 | (Ldict[l].ababa + Ldict[l].abbba + Ldict[l].babaa + Ldict[l].baaaa) for l in Ldict])
420 | if DOL_b > 0:
421 | DOL = float(DOL_t)/DOL_b
422 | else: DOL = 0.
423 |
424 | " proportion of discordant loci "
425 | #try: pdisc = len([i for i in Ldict if any([Ldict[i].D12(),Ldict[i].D1(),Ldict[i].D2()])]) / float(len(Ldict))
426 | #except ValueError:
427 | # pdisc = 0.0
428 |
429 | " TODO "
430 | pdisc = 0.0
431 |
432 | #################################################
433 |
434 | " do bootstrapping "
435 | BBFO = []
436 | BBIL = []
437 | BBFI = []
438 | BBOL = []
439 | for i in xrange(nboots):
440 | which = iter(sample_wr(xrange(len(Ldict)), len(Ldict)))
441 | bbfo,bbil,bbfi,bbol = bootfreq(Ldict, which)
442 | BBFO.append(bbfo)
443 | BBIL.append(bbil)
444 | BBFI.append(bbfi)
445 | BBOL.append(bbol)
446 | STDfo = numpy.std(BBFO)
447 | STDil = numpy.std(BBIL)
448 | STDfi = numpy.std(BBFI)
449 | STDol = numpy.std(BBOL)
450 | ##################################################
451 |
452 | " stats out "
453 | if STDfo > 0:
454 | ZFO = (abs(DFO/STDfo))
455 | else: ZFO = 0.
456 | if STDil > 0:
457 | ZIL = (abs(DIL/STDil))
458 | else: ZIL = 0.
459 | if STDfi > 0:
460 | ZFI = (abs(DFI/STDfi))
461 | else: ZFI = 0.
462 | if STDol > 0:
463 | ZOL = (abs(DOL/STDol))
464 | else: ZOL = 0.
465 |
466 | ## make loci files here
467 | #ABBBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() > 0]
468 | #BABBAloci = [Ldict[l].number for l in Ldict if Ldict[l].D12() < 0]
469 | #ABBAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() > 0]
470 | #BABAAloci = [Ldict[l].number for l in Ldict if Ldict[l].D1() < 0]
471 | #ABABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() > 0]
472 | #BAABAloci = [Ldict[l].number for l in Ldict if Ldict[l].D2() < 0]
473 |
474 | return [L,
475 | DFO,ZFO,
476 | DIL,ZIL,
477 | DFI,ZFI,
478 | DOL,ZOL,
479 | len(Ldict),
480 | sum([Ldict[l].babba for l in Ldict]),
481 | sum([Ldict[l].abbba for l in Ldict]),
482 | sum([Ldict[l].babaa for l in Ldict]),
483 | sum([Ldict[l].abbaa for l in Ldict]),
484 | sum([Ldict[l].baaba for l in Ldict]),
485 | sum([Ldict[l].ababa for l in Ldict]),
486 | sum([Ldict[l].bbbaa for l in Ldict]),
487 | sum([Ldict[l].bbaba for l in Ldict]),
488 | sum([Ldict[l].aabaa for l in Ldict]),
489 | sum([Ldict[l].aaaba for l in Ldict]),
490 | sum([Ldict[l].baaaa for l in Ldict]),
491 | sum([Ldict[l].abaaa for l in Ldict]),
492 | pdisc, submitted,
493 | BBFO, BBIL, BBFI, BBOL]
494 |
495 |
496 | def checktaxa(taxalist, alignfile):
497 |     " check that every requested taxon name occurs in the alignment file "
498 |     taxainfile = set()
499 |     with open(alignfile) as infile:
500 |         for line in infile:
501 |             if ">" in line:
502 |                 taxainfile.add(line.split(" ")[0].replace(">",""))
503 |     " entries of taxalist may be single names or lists of names "
504 |     requested = set(itertools.chain(*[i if isinstance(i, list) else [i] for i in taxalist]))
505 |     if not requested.difference(taxainfile):
506 |         return 1
507 | 
508 |
509 |
510 |
511 | def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots,noterminals):
512 | work_queue = multiprocessing.Queue()
513 | result_queue = multiprocessing.Queue()
514 | submitted = 0
515 | Notes = []
516 | for rep in subtests:
517 | notes = ""
518 | if len(rep) == 2:
519 | rep,notes = rep
520 | p1,p2,p3a,p3b,o = rep
521 |         if all(["[" in i for i in rep]):
522 | p1 = p1[1:-1].split(",")
523 | p2 = p2[1:-1].split(",")
524 | p3a = p3a[1:-1].split(",")
525 | p3b = p3b[1:-1].split(",")
526 | o = o[1:-1].split(",")
527 | if checktaxa([p1,p2,p3a,p3b,o],alignfile):
528 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted, noterminals])
529 | submitted += 1
530 | else:
531 | print 'a taxon name was found that is not in the sequence file'
532 | else:
533 | if checktaxa([p1,p2,p3a,p3b,o],alignfile):
534 | work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted, noterminals])
535 | submitted += 1
536 | else:
537 | print 'a taxon name was found that is not in the sequence file'
538 | Notes.append(notes)
539 |
540 | jobs = []
541 | for i in range(min(submitted,nproc)):
542 | worker = Worker(work_queue, result_queue, runtest)
543 | jobs.append(worker)
544 | worker.start()
545 | for j in jobs:
546 | j.join()
547 |
548 | " read results back in "
549 | Results = [result_queue.get() for i in range(submitted)]
550 |     Results.sort(key=lambda x: x[23])   ## order by submission index so rows match Notes
551 |
552 |
553 |
554 | " setup results file "
555 | if noterminals:
556 | outs = open(outfile+".Dfoilalt.txt", 'w')
557 | else:
558 | outs = open(outfile+".Dfoil.txt", 'w')
559 | header = "\t".join([ 'p1'+" "*(namelen[0]-2),
560 | 'p2'+" "*(namelen[1]-2),
561 | 'p3'+" "*(namelen[2]-2),
562 | 'p4'+" "*(namelen[3]-2),
563 | 'O'+" "*(namelen[4]-1),
564 | 'Dfo','Dil','Dfi','Dol',
565 | 'Z_fo','Z_il','Z_fi','Z_ol',
566 | 'BABBA','ABBBA',
567 | 'BABAA','ABBAA',
568 | 'BAABA','ABABA',
569 | 'BBBAA','BBABA',
570 | 'AABAA','AAABA',
571 | 'BAAAA','ABAAA',
572 | 'nloci','sign', 'notes'])
573 | print >>outs, header
574 |
575 | for i in range(len(Results)):
576 | L,DFO,ZFO,DIL,ZIL,DFI,ZFI,DOL,ZOL,nloc,BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,pdisc,sub,BBFO,BBIL,BBFI,BBOL = Results[i]
577 | L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]
578 |
579 | sign = []
580 | for s,d in zip([ZFO,ZIL,ZFI,ZOL],[DFO,DIL,DFI,DOL]):
581 | if s>3.5:
582 | if d>0:
583 | sign.append("+")
584 | else:
585 | sign.append("-")
586 | else:
587 | sign.append("0")
588 | #print sign
589 |
590 | resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
591 | str(L[1])+" "*(namelen[1]-len(str(L[1]))),
592 | str(L[2])+" "*(namelen[2]-len(str(L[2]))),
593 | str(L[3])+" "*(namelen[3]-len(str(L[3]))),
594 | str(L[4])+" "*(namelen[4]-len(str(L[4]))),
595 | DFO,DIL,DFI,DOL,
596 | ZFO,ZIL,ZFI,ZOL,
597 | BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,
598 | nloc, "".join(sign), Notes[i]])
599 |
600 | print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin
601 |
602 | loci = open(alignfile).read().strip().split("|")[:-1]
603 | if makesort:
604 | None
605 | # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
606 | # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)
607 | # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)
608 | # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
609 | # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
610 | # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)
611 |
612 | if makeboots:
613 | None
614 | # with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out:
615 | # out.write(",".join(map(str,BB12)))
616 | # with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out:
617 | # out.write(",".join(map(str,BB1)))
618 | # with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out:
619 | # out.write(",".join(map(str,BB2)))
620 |
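## Editor's note: a small sketch (not part of pyrad) of the significance coding
## written to the 'sign' column above: each of DFO/DIL/DFI/DOL is marked '+',
## '-', or '0' according to its sign whenever the corresponding Z-score exceeds
## the hard-coded 3.5 cutoff. The helper name and example values are
## hypothetical.
def sign_code(Z, D, cutoff=3.5):
    if Z > cutoff:
        return "+" if D > 0 else "-"
    return "0"

## e.g. [sign_code(z, d) for z, d in zip([4.2, 1.1, 3.6, 0.0],
##                                       [0.12, -0.05, -0.20, 0.0])]
## -> ['+', '0', '-', '0']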
621 |
622 | def main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots,noterminals):
623 | import sys
624 |
625 | P1namelen = max(map(len,[str(i[0][0]) for i in tests]))
626 | P2namelen = max(map(len,[str(i[0][1]) for i in tests]))
627 | P3anamelen = max(map(len,[str(i[0][2]) for i in tests]))
628 | P3bnamelen = max(map(len,[str(i[0][3]) for i in tests]))
629 | Onamelen = max(map(len,[str(i[0][4]).strip() for i in tests]))
630 | namelen = [P1namelen,P2namelen,P3anamelen,P3bnamelen,Onamelen]
631 |
632 | multiproc_it(tests,alignfile,outfile,nboots,nproc,namelen,makesort,makeboots,noterminals)
633 |
634 |
635 | if __name__ == '__main__':
636 | main()
637 |
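## Editor's note: main() is normally called by pyRAD itself when D-statistic
## (foil) tests are requested, so the bare main() call above is not intended
## for direct command-line use. The guarded example below only illustrates the
## argument shapes expected by multiproc_it(); every name and path in it is
## hypothetical.
def _example_run():
    example_tests = [[["1A0", "1B0", "1C0", "1D0", "outg"], "an optional note"]]
    main(example_tests, "outfiles/data.loci", "outfiles/dfoil_out",
         nboots=1000, nproc=4, makesort=0, makeboots=0, noterminals=False)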
638 |
639 |
640 |
641 |
--------------------------------------------------------------------------------