├── .gitignore
├── LICENSE
├── SeqRecordLite.py
├── ab1.py
├── analyze_oligos.py
├── bin
│   ├── compute_divergence.py
│   ├── explode_fasta.py
│   ├── fasta2idt.py
│   ├── fasta2lenhist.py
│   ├── fasta2tiles.py
│   ├── fasta2uniq.py
│   ├── fasta_rm_newlines.py
│   ├── fasta_sort_by_abundance.py
│   ├── generate_otu_table.py
│   ├── generic_script.py
│   ├── idt2fasta.py
│   ├── make_timeseries_figures.py
│   ├── qiime_cluster_jobs_LSF.py
│   ├── quality_hist.py
│   ├── sff2fastq_trimmed.py
│   ├── streamgraph_html.py
│   ├── timeseries2json.py
│   └── timeseries2streamgraph.py
├── blast.py
├── blat.py
├── countdata.py
├── daemonize.py
├── degex.py
├── exonerate.py
├── graphtools.py
├── lsf.py
├── mplextensions.py
├── oligoTm.py
├── primers.py
├── pyutils.py
├── qPCR2melting.py
├── qPCR2quantitation.py
├── sanger.py
├── scale.py
├── seqtools.py
├── statstools.py
├── stitch.py
├── streamgraph.py
├── timeseries.py
└── unafold.py
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.o 4 | build/ 5 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /SeqRecordLite.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from Bio.Seq import Seq, UnknownSeq 4 | from Bio.SeqRecord import SeqRecord 5 | from Bio.SeqFeature import SeqFeature, FeatureLocation 6 | from Bio.Alphabet import NucleotideAlphabet 7 | 8 | 9 | class SeqRecordLite(object): 10 | """SeqRecord wrapper that allows simpler attribute access. 11 | 12 | The underlying data structure is actually a biopython `SeqRecord` object. 
13 | This class wraps it in a way that maintains the simple-to-use interface to 14 | get at some common annotations. It also knows how to print out it's data 15 | as IMGT-flavored INSDC (e.g., GenBank/EMBL). 16 | """ 17 | 18 | def __init__(self, biopython_object=None): 19 | 20 | # first we define our underlying SeqRecord object 21 | if biopython_object == None: 22 | self._record = SeqRecord(seq=UnknownSeq(0,alphabet=NucleotideAlphabet()),id='',name='',description='') 23 | elif isinstance(biopython_object,Seq): 24 | self._record = SeqRecord(seq=copy.deepcopy(biopython_object),id='',name='',description='') 25 | elif isinstance(biopython_object,SeqRecord): 26 | self._record = copy.deepcopy(biopython_object) 27 | 28 | # define dictionary of features for faster lookup 29 | self._features = {} 30 | for (i,feature) in enumerate(self._record.features): 31 | self._features.setdefault(feature.type,[]).append(i) 32 | 33 | 34 | def __getattr__(self,name): 35 | # This function should only get called if I am looking for an attribute that 36 | # didn't already have a getter defined or a default method. In this case, I 37 | # search the annotations dictionary or the features table of the underlying 38 | # SeqRecord to try to find the information. 39 | if name in self._record.annotations: 40 | return self._record.annotations[name] 41 | elif name in self._features: 42 | return [self._record.features[i] for i in self._features[name]] 43 | raise AttributeError 44 | 45 | 46 | # define properties to access some common SeqRecord interface 47 | 48 | @property 49 | def seq(self): 50 | return self._record.seq 51 | 52 | @seq.setter 53 | def seq(self,s): 54 | self._record.seq = s 55 | 56 | @property 57 | def annotations(self): 58 | return self._record.annotations 59 | 60 | @property 61 | def id(self): 62 | return self._record.id 63 | 64 | @id.setter 65 | def id(self,i): 66 | self._record.id = i 67 | 68 | @property 69 | def description(self): 70 | return self._record.description 71 | 72 | @description.setter 73 | def description(self,d): 74 | self._record.description = d 75 | 76 | @property 77 | def name(self): 78 | return self._record.name 79 | 80 | @name.setter 81 | def name(self,n): 82 | self._record.name = n 83 | 84 | @property 85 | def features(self): 86 | return self._record.features 87 | 88 | def format(self,*args,**kw): 89 | return self._record.format(*args,**kw) 90 | 91 | 92 | # manipulation of SeqRecord parts 93 | 94 | def add_feature(self,start=None,end=None,type='',strand=None,qualifiers=None): 95 | if start == None or end == None: 96 | raise ValueError, "if there is no spanning location...use an annotation?" 
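        # Coordinates are handed straight to Biopython's FeatureLocation, which is
        # 0-based with an exclusive end. A minimal, hypothetical usage sketch:
        #   rec = SeqRecordLite()
        #   rec.add_feature(start=0, end=9, type='V-REGION', strand=1)  # first nine bases
        #   rec.has_feature('V-REGION')  # -> True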
97 | location = FeatureLocation(start,end) 98 | feature = SeqFeature(location=location,type=type,strand=strand,qualifiers=qualifiers) 99 | self._record.features.append(feature) 100 | self._features.setdefault(feature.type,[]).append(len(self._record.features) - 1) 101 | return self 102 | 103 | def has_feature(self,type): 104 | return type in self._features 105 | 106 | def del_feature(self,type): 107 | idxs = self._features.pop(type) 108 | idxs.sort(reverse=True) 109 | for i in idxs: 110 | self._record.features.pop(i) 111 | return self 112 | 113 | 114 | # some standard interface 115 | 116 | def __len__(self): 117 | return len(self.seq) 118 | 119 | def __str__(self): 120 | return self.__repr__() 121 | 122 | def __repr__(self): 123 | return self.format('imgt') 124 | -------------------------------------------------------------------------------- /ab1.py: -------------------------------------------------------------------------------- 1 | # Downloaded from http://www.interactive-biosoftware.com/open-source/ABIFReader.py 2 | # on 14 November 2010. 3 | # 4 | # Python implementation of an ABIF file reader according to Applied Biosystems' specificatons, 5 | # see http://www.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf 6 | # 7 | # This code is published by Interactive Biosoftware, France, 8 | # see http://www.interactive-biosoftware.com/ 9 | # under GPL license, 10 | # see http://www.gnu.org/licenses/gpl.html 11 | # 12 | # Author: Francis Wolinski 13 | # Version: 1.0, March 2007 14 | # Copyright (c) Francis Wolinski 2007 15 | # 16 | # User Manual 17 | # 18 | # Conversion of ABIF data types to Python types (see struct.unpack method): 19 | # type 1 = byte -> integer 20 | # type 2 = char -> string 21 | # type 3 = word -> long 22 | # type 4 = short -> integer 23 | # type 5 = long -> integer 24 | # type 7 = float -> float 25 | # type 8 = double -> float 26 | # type 10 = date -> datetime.date instance 27 | # type 11 = time -> datetime.time instance 28 | # type 12 = thumb -> tuple 29 | # type 13 = bool -> True or False 30 | # type 18 = pString -> string 31 | # type 19 = cString -> string 32 | # type = 1024+ = user -> NotImplemented: to be overwritten in user's code in ABIFReader.readNextUserData method 33 | # type = other -> NotImplemented 34 | # 35 | # from ABIFReader import * 36 | # reader = ABIFReader() # creates an instance of ABIFReader 37 | # reader.version # version of ABIF file 38 | # reader.showEntries() # print all entries of ABIF file " () / ()" 39 | # data = reader.getData([, ]) # read data for entry named with number , by default is 1 40 | # reader.close() # close the file, since it is kept open 41 | # 42 | 43 | import struct 44 | import datetime 45 | 46 | ABIF_TYPES = {1: 'byte', 2: 'char', 3: 'word', 4: 'short', 5: 'long', 7: 'float', 8: 'double',\ 47 | 10: 'date', 11: 'time', 12: 'thumb', 13: 'bool', 18: 'pString', 19: 'cString'} 48 | 49 | class ABIFReader: 50 | def __init__(self, fn): 51 | self.filename = fn 52 | self.file = open(fn, 'rb') 53 | self.type = self.readNextString(4) 54 | if self.type != 'ABIF': 55 | self.close() 56 | raise SystemExit("error: No ABIF file '%s'" % fn) 57 | self.version = self.readNextShort() 58 | dir = DirEntry(self) 59 | self.seek(dir.dataoffset) 60 | self.entries = [DirEntry(self) for i in range(dir.numelements)] 61 | 62 | def getData(self, name, num = 1): 63 | entry = self.getEntry(name, num) 64 | if not entry: 65 | raise SystemExit("error: Entry '%s (%i)' not found in '%s'" % (name, num, self.filename)) 66 | 
self.seek(entry.mydataoffset()) 67 | data = self.readData(entry.elementtype, entry.numelements) 68 | if data != NotImplemented and len(data) == 1: 69 | return data[0] 70 | else: 71 | return data 72 | 73 | def showEntries(self): 74 | for e in self.entries: 75 | print e 76 | 77 | def getEntry(self, name, num): 78 | for e in self.entries: 79 | if e.name == name and e.number == num: 80 | return e 81 | return None 82 | 83 | def readData(self, type, num): 84 | if type == 1: 85 | return [self.readNextByte() for i in range(num)] 86 | elif type == 2: 87 | return self.readNextString(num) 88 | elif type == 3: 89 | return [self.readNextUnsignedInt() for i in range(num)] 90 | elif type == 4: 91 | return [self.readNextShort() for i in range(num)] 92 | elif type == 5: 93 | return [self.readNextLong() for i in range(num)] 94 | elif type == 7: 95 | return [self.readNextFloat() for i in range(num)] 96 | elif type == 8: 97 | return [self.readNextDouble() for i in range(num)] 98 | elif type == 10: 99 | return [self.readNextDate() for i in range(num)] 100 | elif type == 11: 101 | return [self.readNextTime() for i in range(num)] 102 | elif type == 12: 103 | return [self.readNextThumb() for i in range(num)] 104 | elif type == 13: 105 | return [self.readNextBool() for i in range(num)] 106 | elif type == 18: 107 | return self.readNextpString() 108 | elif type == 19: 109 | return self.readNextcString() 110 | elif type >= 1024: 111 | return self.readNextUserData(type, num) 112 | else: 113 | return NotImplemented 114 | 115 | def readNextBool(self): 116 | return readNextByte(self) == 1 117 | 118 | def readNextByte(self): 119 | return self.primUnpack('B', 1) 120 | 121 | def readNextChar(self): 122 | return self.primUnpack('c', 1) 123 | 124 | def readNextcString(self): 125 | chars = [] 126 | while True: 127 | c = self.readNextChar() 128 | if ord(c) == 0: 129 | return ''.join(chars) 130 | else: 131 | chars.append(c) 132 | 133 | def readNextDate(self): 134 | return datetime.date(self.readNextShort(), self.readNextByte(), self.readNextByte()) 135 | 136 | def readNextDouble(self): 137 | return self.primUnpack('>d', 8) 138 | 139 | def readNextInt(self): 140 | return self.primUnpack('>i', 4) 141 | 142 | def readNextFloat(self): 143 | return self.primUnpack('>f', 4) 144 | 145 | def readNextLong(self): 146 | return self.primUnpack('>l', 4) 147 | 148 | def readNextpString(self): 149 | nb = self.readNextByte() 150 | chars = [self.readNextChar() for i in range(nb)] 151 | return ''.join(chars) 152 | 153 | def readNextShort(self): 154 | return self.primUnpack('>h', 2) 155 | 156 | def readNextString(self, size): 157 | chars = [self.readNextChar() for i in range(size)] 158 | return ''.join(chars) 159 | 160 | def readNextThumb(self): 161 | return (self.readNextLong(), self.readNextLong(), self.readNextByte(), self.readNextByte()) 162 | 163 | def readNextTime(self): 164 | return datetime.time(self.readNextByte(), self.readNextByte(), self.readNextByte(), self.readNextByte()) 165 | 166 | def readNextUnsignedInt(self): 167 | return self.primUnpack('>I', 4) 168 | 169 | def readNextUserData(self, type, num): 170 | # to be overwritten in user's code 171 | return NotImplemented 172 | 173 | def primUnpack(self, format, nb): 174 | x = struct.unpack(format, self.file.read(nb)) 175 | return x[0] 176 | 177 | def close(self): 178 | self.file.close() 179 | 180 | def seek(self, pos): 181 | self.file.seek(pos) 182 | 183 | def tell(self): 184 | return self.file.tell() 185 | 186 | class DirEntry: 187 | def __init__(self, reader): 188 | self.name = 
reader.readNextString(4) 189 | self.number = reader.readNextInt() 190 | self.elementtype = reader.readNextShort() 191 | self.elementsize = reader.readNextShort() 192 | self.numelements = reader.readNextInt() 193 | self.datasize = reader.readNextInt() 194 | self.dataoffsetpos = reader.tell() 195 | self.dataoffset = reader.readNextInt() 196 | self.datahandle = reader.readNextInt() 197 | 198 | def __str__(self): 199 | return "%s (%i) / %s (%i)" % (self.name, self.number, self.mytype(), self.numelements) 200 | 201 | def mydataoffset(self): 202 | if self.datasize <= 4: 203 | return self.dataoffsetpos 204 | else: 205 | return self.dataoffset 206 | 207 | def mytype(self): 208 | if self.elementtype < 1024: 209 | return ABIF_TYPES.get(self.elementtype, 'unknown') 210 | else: 211 | return 'user' -------------------------------------------------------------------------------- /analyze_oligos.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from Bio import SeqIO 4 | import numpy as np 5 | 6 | import seqtools 7 | import oligoTm 8 | import unafold 9 | import blat 10 | 11 | # ================== 12 | # = Output primers = 13 | # ================== 14 | 15 | def output_primers(primers,names): 16 | datum = ('name','sequence','len','Tm',r'%GC','ss-dG','BLAT') 17 | header = "\n%-25s %-30s %-4s %-5s %-4s %-7s %-5s\n" % datum 18 | sys.stdout.write(header) 19 | 20 | lens = map(len,primers) 21 | Tms = map(oligoTm.oligo_Tm,primers) 22 | gcs = map(lambda p: seqtools.gc_content(p)*100,primers) 23 | dGs = map(lambda p: unafold.hybrid_ss_min(p,NA='DNA',sodium=0.05),primers) 24 | # trunc_primers = [p[-min(18,min(lens)):] for p in primers] 25 | trunc_primers = primers # NO TRUNCATION 26 | seqrecords = map(lambda t: seqtools.make_SeqRecord(*t),zip(names,trunc_primers)) 27 | # blat_hits = map(blat.search_sequence,seqrecords) 28 | 29 | for datum in zip(names,primers,lens,Tms,gcs,dGs): #,blat_hits): 30 | primer_string = "%-25s %-30s %-4i %-5.1f %-4.0f %-7.1f\n" % datum 31 | sys.stdout.write(primer_string) 32 | 33 | summary_data = lambda d: (np.mean(d),np.std(d),np.min(d),np.max(d)) 34 | 35 | sys.stdout.write('\nsummary:\n') 36 | sys.stdout.write('num primers: %i\n' % len(primers)) 37 | sys.stdout.write('len mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(lens)) 38 | sys.stdout.write('Tm mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(Tms)) 39 | sys.stdout.write('%%GC mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(gcs)) 40 | sys.stdout.write('dGs mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(dGs)) 41 | # sys.stdout.write('BLAT mean: %5.1f std: %5.1f min: %5.1f max %5.1f total: %5.1f\n' % (summary_data(blat_hits)+(np.sum(blat_hits),))) 42 | 43 | if __name__ == '__main__': 44 | 45 | if len(sys.argv) == 3: 46 | inhandle = open(sys.argv[1],'r') 47 | outhandle = open(sys.argv[2],'w') 48 | elif len(sys.argv) == 2: 49 | inhandle = open(sys.argv[1],'r') 50 | outhandle = sys.stdout 51 | elif len(sys.argv) == 1: 52 | inhandle = sys.stdin 53 | outhandle = sys.stdout 54 | 55 | seqrecords = list(SeqIO.parse(inhandle,'fasta')) 56 | names = [rec.id for rec in seqrecords] 57 | primers = [seqtools.get_string(rec) for rec in seqrecords] 58 | 59 | if not blat.is_server_running(): 60 | blat_server = blat.start_gfServer() 61 | 62 | output_primers(primers,names) 63 | 64 | # if blat.is_server_running(): 65 | # blat.stop_gfServer( blat_server ) 66 | 67 | -------------------------------------------------------------------------------- 
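A minimal sketch (not part of the repository) of how the per-oligo metrics tabulated by output_primers() above can be computed for a single primer, using the same helper calls the script makes; it assumes the repo's seqtools, oligoTm, and unafold modules are importable, that UNAFold's hybrid-ss-min is installed, and the primer sequence is a placeholder:

    import seqtools, oligoTm, unafold
    primer = 'ACGTGACCTAGGCATTCGAT'  # placeholder 20-mer
    tm = oligoTm.oligo_Tm(primer)                               # melting temperature
    gc = seqtools.gc_content(primer) * 100                      # %GC
    dg = unafold.hybrid_ss_min(primer, NA='DNA', sodium=0.05)   # single-strand folding dG
    print "len=%i Tm=%.1f %%GC=%.0f ss-dG=%.1f" % (len(primer), tm, gc, dg)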
/bin/compute_divergence.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import subprocess 4 | import argparse 5 | 6 | argparser = argparse.ArgumentParser(description=None) 7 | argparser.add_argument('-q','--query',required=True) 8 | argparser.add_argument('-t','--target',required=True) 9 | argparser.add_argument('-o','--output',required=True) 10 | argparser.add_argument('-u','--usearch',default='usearch') 11 | args = argparser.parse_args() 12 | 13 | usearch_cmd = "%s --query %s --db %s --nofastalign --nousort --minlen 1 --maxaccepts 0 --maxrejects 0 --global --id 0 --userout %s --userfields query+target+id0+id1+id2+id3+id4+gaps+intgaps+qloz+qhiz+tloz+thiz+ql+tl+cols+intcols" 14 | 15 | p = subprocess.Popen(usearch_cmd % (args.usearch,args.query,args.target,args.output),shell=True) 16 | p.wait() 17 | -------------------------------------------------------------------------------- /bin/explode_fasta.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | from Bio import SeqIO 8 | 9 | from pyutils import cleanup_id 10 | 11 | argparser = argparse.ArgumentParser(description=None) 12 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 13 | argparser.add_argument('output_dir',nargs='?',default=os.getcwd()) 14 | args = argparser.parse_args() 15 | 16 | for record in SeqIO.parse(args.input_file,'fasta'): 17 | output_file = os.path.join(args.output_dir,'%s.fasta' % cleanup_id(record.id)) 18 | with open(output_file,'w') as op: 19 | print >>op, record.format('fasta') 20 | -------------------------------------------------------------------------------- /bin/fasta2idt.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | 5 | import seqtools 6 | 7 | if len(sys.argv) == 3: 8 | inhandle = open(sys.argv[1],'r') 9 | outhandle = open(sys.argv[2],'w') 10 | elif len(sys.argv) == 2: 11 | inhandle = open(sys.argv[1],'r') 12 | outhandle = sys.stdout 13 | elif len(sys.argv) == 1: 14 | inhandle = sys.stdin 15 | outhandle = sys.stdout 16 | 17 | for (descr,seq) in seqtools.FastaIterator(inhandle): 18 | print >>outhandle, "%s\t%s" % (descr,seq) 19 | -------------------------------------------------------------------------------- /bin/fasta2lenhist.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | import numpy as np 8 | import matplotlib as mpl 9 | mpl.use('agg') 10 | import matplotlib.pyplot as plt 11 | 12 | import seqtools 13 | 14 | argparser = argparse.ArgumentParser(description=None) 15 | argparser.add_argument('positional',nargs='*') 16 | argparser.add_argument('--log',action='store_true') 17 | args = argparser.parse_args() 18 | 19 | if len(args.positional) == 2: 20 | inhandle = open(args.positional[0],'r') 21 | outfile = args.positional[1] 22 | elif len(args.positional) == 1: 23 | inhandle = open(args.positional[0],'r') 24 | outfile = 'lenhist.png' 25 | elif len(args.positional) == 0: 26 | inhandle = sys.stdin 27 | outfile = 'lenhist.png' 28 | 29 | read_lengths = [] 30 | for (name,read) in seqtools.FastaIterator(inhandle): 31 | read_lengths.append(len(read)) 32 | 33 | print "Number of reads: %i" % len(read_lengths) 34 | print "Shortest read length: %i bp" % min(read_lengths) 35 | print "Longest read length: %i bp" % max(read_lengths) 36 | print "Median read length: %i bp" % np.median(read_lengths) 37 | print "Mean read length: %i bp" % np.mean(read_lengths) 38 | 39 | if not args.log: 40 | fig = plt.figure() 41 | ax = fig.add_subplot(111) 42 | ax.hist(read_lengths,bins=range(max(read_lengths)+1),linewidth=0,log=False) 43 | ax.set_xlabel('Read length') 44 | fig.savefig(outfile) 45 | else: 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111) 48 | ax.hist(read_lengths,bins=range(max(read_lengths)+1),linewidth=0,log=True) 49 | ax.set_xlabel('Read length') 50 | fig.savefig(outfile) 51 | -------------------------------------------------------------------------------- /bin/fasta2tiles.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | import optparse 5 | 6 | import blast 7 | 8 | parser = optparse.OptionParser() 9 | parser.add_option('-s','--size',type='int') 10 | parser.add_option('-o','--offset',type='int') 11 | parser.add_option('-p','--blastp',action='store_true') 12 | (options, args) = parser.parse_args() 13 | 14 | if len(args) == 2: 15 | inhandle = open(args[0],'r') 16 | outhandle = open(args[1],'w') 17 | elif len(args) == 1: 18 | inhandle = open(args[0],'r') 19 | outhandle = sys.stdout 20 | elif len(args) == 0: 21 | inhandle = sys.stdin 22 | outhandle = sys.stdout 23 | 24 | 25 | #----------------------------------------------------------------------------- 26 | 27 | def fasta_parser(handle): 28 | # taken from biopython 29 | 30 | #Skip any text before the first record (e.g. blank lines, comments) 31 | while True: 32 | line = handle.readline() 33 | if line == "" : return #Premature end of file, or just empty? 
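        # scan forward until the first '>' header line; any preamble lines are ignored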
34 | if line[0] == ">": 35 | break 36 | 37 | while True: 38 | if line[0]!=">": 39 | raise ValueError("Records in Fasta files should start with '>' character") 40 | descr = line[1:].rstrip() 41 | 42 | lines = [] 43 | line = handle.readline() 44 | while True: 45 | if not line : break 46 | if line[0] == ">": break 47 | lines.append(line.rstrip().replace(" ","").replace("\r","")) 48 | line = handle.readline() 49 | 50 | yield (descr,"".join(lines)) 51 | 52 | if not line : return #StopIteration 53 | assert False, "Should not reach this line" 54 | 55 | #----------------------------------------------------------------------------- 56 | 57 | tile_size = options.size 58 | tile_offset = options.offset 59 | 60 | for (descr,seq) in fasta_parser(inhandle): 61 | pos = 0 62 | num = 1 63 | while pos < len(seq): 64 | if pos+tile_size >= len(seq): # last tile in seq 65 | tile = seq[-tile_size:] 66 | start = len(seq) - tile_size 67 | end = len(seq) 68 | else: 69 | tile = seq[pos:pos+tile_size] 70 | start = pos 71 | end = pos+tile_size 72 | 73 | if options.blastp == True: 74 | num_hits = blast.number_genome_qblast_protein_hits(tile) 75 | print >>outhandle, '>%s|tile%03i|%i|%i|%i\n%s' % (descr,num,start,end,num_hits,tile) 76 | else: 77 | print >>outhandle, '>%s|tile%03i|%i|%i\n%s' % (descr,num,start,end,tile) 78 | 79 | pos += tile_offset 80 | num += 1 81 | -------------------------------------------------------------------------------- /bin/fasta2uniq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import seqtools 4 | 5 | if len(sys.argv) == 3: 6 | inhandle = open(sys.argv[1],'r') 7 | outhandle = open(sys.argv[2],'w') 8 | elif len(sys.argv) == 2: 9 | inhandle = open(sys.argv[1],'r') 10 | outhandle = sys.stdout 11 | elif len(sys.argv) == 1: 12 | inhandle = sys.stdin 13 | outhandle = sys.stdout 14 | 15 | all_seqs = [] 16 | uniq_seqs = set() 17 | 18 | for (descr,seq) in seqtools.FastaIterator(inhandle): 19 | all_seqs.append((descr,seq)) 20 | uniq_seqs.add(seq) 21 | 22 | for (descr,seq) in all_seqs: 23 | if seq in uniq_seqs: 24 | outhandle.write('>%s\n%s\n' % (descr,seq)) 25 | uniq_seqs.remove(seq) 26 | 27 | -------------------------------------------------------------------------------- /bin/fasta_rm_newlines.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | 5 | from Bio import SeqIO 6 | 7 | if len(sys.argv) == 3: 8 | inhandle = open(sys.argv[1],'r') 9 | outhandle = open(sys.argv[2],'w') 10 | elif len(sys.argv) == 2: 11 | inhandle = open(sys.argv[1],'r') 12 | outhandle = sys.stdout 13 | elif len(sys.argv) == 1: 14 | inhandle = sys.stdin 15 | outhandle = sys.stdout 16 | 17 | # SeqIO.write does not allow access to the wrap parameter 18 | # SeqIO.write(SeqIO.parse(inhandle,'fasta'),outhandle,'fasta') 19 | 20 | print_fasta = lambda r: outhandle.write(">%s\n%s\n" % (r.description,r.seq.tostring())) 21 | map(print_fasta,SeqIO.parse(inhandle,'fasta')) 22 | -------------------------------------------------------------------------------- /bin/fasta_sort_by_abundance.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | from collections import defaultdict 6 | 7 | from seqtools import FastaIterator 8 | 9 | argparser = argparse.ArgumentParser(description=None) 10 | argparser.add_argument('input',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 11 | argparser.add_argument('output',nargs='?',type=argparse.FileType('w'),default=sys.stdout) 12 | args = argparser.parse_args() 13 | 14 | counts = defaultdict(list) 15 | for (name,seq) in FastaIterator(args.input): 16 | counts[seq].append(name) 17 | 18 | for seq in sorted(counts.keys(), key=lambda k: len(counts[k]), reverse=True): 19 | for name in counts[seq]: 20 | args.output.write(">%s\n%s\n" % (name,seq)) 21 | -------------------------------------------------------------------------------- /bin/generate_otu_table.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import optparse 4 | import os 5 | 6 | import vdj.analysis 7 | 8 | option_parser = optparse.OptionParser() 9 | option_parser.add_option('-m','--mapping_file') 10 | (options,args) = option_parser.parse_args() 11 | 12 | if len(args) != 2: 13 | raise ValueError, "need input and output filenames" 14 | 15 | # Read sample mapping file 16 | mapping_handle = open(options.mapping_file,'r') 17 | samples = [(line.split('\t')[0].strip(),line.split('\t')[2].strip()) for line in mapping_handle if not line.startswith('#')] 18 | mapping_handle.close() 19 | 20 | # Load count data 21 | infilename = args[0] 22 | inhandle = open(infilename,'r') 23 | (uniq_feature_values,countdict) = vdj.analysis.vdjxml2countdict(inhandle,['barcode','clone']) 24 | inhandle.close() 25 | 26 | # Convert to matrix form 27 | countmatrix = vdj.analysis.countdict2matrix(['barcode','clone'],uniq_feature_values,countdict).transpose() 28 | 29 | # Reorder columns to correspond to mapping file order 30 | sample_idxs = dict([(v,i) for (i,v) in enumerate(uniq_feature_values['barcode'])]) 31 | argsort = [sample_idxs[sample] for (sample,descr) in samples] 32 | countmatrix = countmatrix[:,argsort] 33 | 34 | # Dump OTU table 35 | basename = os.path.basename(args[0]) 36 | outfilename = args[1] 37 | outhandle = open(outfilename,'w') 38 | 39 | print >>outhandle, "#OTU counts %s" % basename 40 | header = "OTU ID" 41 | # for (sample,descr) in samples: header += "\t%s" % ('_'.join([sample,descr])) 42 | for (sample,descr) in samples: header += "\t%s" % sample 43 | print >>outhandle, header 44 | 45 | for (label,countvector) in zip(uniq_feature_values['clone'],countmatrix): 46 | line = label 47 | for count in countvector: 48 | line += "\t%i" % int(count) 49 | print >>outhandle, line 50 | 51 | outhandle.close() 52 | -------------------------------------------------------------------------------- /bin/generic_script.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | if __name__ == '__main__': 4 | import sys 5 | import argparse 6 | import os 7 | argparser = argparse.ArgumentParser(description=None) 8 | argparser.add_argument('positional',type=int,nargs='*') 9 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 10 | argparser.add_argument('output_dir',nargs='?',default=os.getcwd()) 11 | argparser.add_argument('--option',dest='xxx',action='store_const',const=5,default=None) 12 | args = argparser.parse_args() 13 | 14 | if len(args.positional) == 2: 15 | inhandle = open(args.positional[0],'r') 16 | outhandle = open(args.positional[1],'w') 17 | elif len(args.positional) == 1: 18 | inhandle = open(args.positional[0],'r') 19 | outhandle = sys.stdout 20 | elif len(args.positional) == 0: 21 | inhandle = sys.stdin 22 | outhandle = sys.stdout 23 | 24 | 25 | 26 | # OR 27 | 28 | if __name__ == '__main__': 29 | import sys 30 | 31 | if len(sys.argv) == 3: 32 | inhandle = open(sys.argv[1],'r') 33 | outhandle = open(sys.argv[2],'w') 34 | elif len(sys.argv) == 2: 35 | inhandle = open(sys.argv[1],'r') 36 | outhandle = sys.stdout 37 | elif len(sys.argv) == 1: 38 | inhandle = sys.stdin 39 | outhandle = sys.stdout 40 | -------------------------------------------------------------------------------- /bin/idt2fasta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import seqtools 4 | 5 | if len(sys.argv) == 3: 6 | inhandle = open(sys.argv[1],'r') 7 | outhandle = open(sys.argv[2],'w') 8 | elif len(sys.argv) == 2: 9 | inhandle = open(sys.argv[1],'r') 10 | outhandle = sys.stdout 11 | elif len(sys.argv) == 1: 12 | inhandle = sys.stdin 13 | outhandle = sys.stdout 14 | 15 | for line in inhandle: 16 | if line.strip() == '': 17 | print >>outhandle, '' 18 | continue 19 | descr = line.split()[0] 20 | seq = line.split()[1] 21 | print >>outhandle, ">%s\n%s" % (descr,seq) 22 | -------------------------------------------------------------------------------- /bin/make_timeseries_figures.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | import optparse 4 | 5 | import numpy as np 6 | import matplotlib as mpl 7 | mpl.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import matplotlib.collections 10 | 11 | import vdj 12 | import vdj.analysis 13 | import timeseries 14 | 15 | option_parser = optparse.OptionParser() 16 | option_parser.add_option('-r','--threshold',type='float') 17 | option_parser.add_option('-o','--outputbasename') 18 | option_parser.add_option('-q','--quantify') 19 | option_parser.add_option('-n','--normalize',action='store_true') 20 | (options,args) = option_parser.parse_args() 21 | 22 | if len(args) == 1: 23 | inhandle = open(args[0],'r') 24 | else: 25 | raise ValueError, "Must give a single argument that is a timeseries data file" 26 | 27 | data = timeseries.load_timeseries(inhandle) 28 | labels = data['labels'] 29 | times = data['times'] 30 | timeseriesmatrix = data['matrix'] 31 | 32 | try: 33 | sums = data['sums'] 34 | except KeyError: 35 | sums = timeseriesmatrix.sum(axis=0) 36 | 37 | # normalize if desired 38 | if options.normalize: 39 | timeseriesmatrix = np.float_(timeseriesmatrix) / np.asarray(sums) 40 | 41 | # define which time series to plot 42 | if options.threshold: 43 | idxs = np.sum(timeseriesmatrix>=options.threshold,axis=1)>0 # breaks threshold at least once 44 | else: 45 | idxs = np.asarray([True]*timeseriesmatrix.shape[0]) 46 | # idxs = np.sum(time_series_freqs>0,axis=1)>2 # seen at least twice 47 | # idxs_bool = np.logical_and(idxs_bool_1,idxs_bool_2) 48 | # idxs_bool = np.array([False]*len(reference_clones)) 49 | print "Number of lines plotted: %i" % np.sum(idxs) 50 | 51 | # ================== 52 | # = Make the plots = 53 | # ================== 54 | 55 | # get output names 56 | if options.outputbasename: 57 | outputbasename = options.outputbasename 58 | else: 59 | outputbasename = '.'.join(args[0].split('.')[:-1]) 60 | 61 | random_color = lambda: '#%02x%02x%02x' % tuple(np.random.randint(0,256,3)) 62 | 63 | segments = [zip(times,timeseries) for timeseries in timeseriesmatrix[idxs]] 64 | colors = [random_color() for i in xrange(len(segments))] 65 | lines = mpl.collections.LineCollection(segments,colors=colors,linewidths=0.5) 66 | lines.set_alpha(0.75) 67 | 68 | fig = plt.figure() 69 | ax = fig.add_subplot(111) 70 | ax.add_collection(lines) 71 | ax.spines['top'].set_visible(False) 72 | ax.spines['right'].set_visible(False) 73 | ax.spines['bottom'].set_position(('outward',5)) 74 | ax.spines['left'].set_position(('outward',5)) 75 | ax.xaxis.set_ticks_position('bottom') 76 | ax.yaxis.set_ticks_position('left') 77 | ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(times)) 78 | ax.set_xlim([times.min(),times.max()]) 79 | ax.autoscale_view(scalex=False,scaley=True) 80 | # ax.set_yscale('log') 81 | ax.set_xlabel('time') 82 | ax.set_ylabel(options.quantify+' frequency') 83 | # fig.show() 84 | fig.savefig(outputbasename+'.%stimeseries.png' % options.quantify) 85 | fig.savefig(outputbasename+'.%stimeseries.pdf' % options.quantify) 86 | 87 | # segments = [np.asarray(zip(times,timeseries)) for timeseries in timeseriesmatrix[idxs]] 88 | # segments = [segment[segment[:,1]>0] for segment in segments if segment[:,1].sum()>0] 89 | # lines = mpl.collections.LineCollection(segments,colors=colors,linewidths=0.5) 90 | # lines.set_alpha(0.75) 91 | 92 | figlog = plt.figure() 93 | ax = figlog.add_subplot(111) 94 | ax.add_collection(lines) 95 | ax.spines['top'].set_visible(False) 96 | ax.spines['right'].set_visible(False) 97 | ax.spines['bottom'].set_position(('outward',5)) 98 | 
ax.spines['left'].set_position(('outward',5)) 99 | ax.xaxis.set_ticks_position('bottom') 100 | ax.yaxis.set_ticks_position('left') 101 | ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(times)) 102 | ax.set_yscale('log') 103 | ax.set_xlim([times.min(),times.max()]) 104 | ax.set_xlabel('time') 105 | ax.set_ylabel(options.quantify+' frequency') 106 | # fig.show() 107 | figlog.savefig(outputbasename+'.%stimeseries.log.png' % options.quantify) 108 | figlog.savefig(outputbasename+'.%stimeseries.log.pdf' % options.quantify) 109 | -------------------------------------------------------------------------------- /bin/qiime_cluster_jobs_LSF.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import datetime 5 | import optparse 6 | 7 | import lsf 8 | 9 | option_parser = optparse.OptionParser() 10 | option_parser.add_option('-m','--make_jobs',action='store_true') 11 | option_parser.add_option('-s','--submit_jobs',action='store_true') 12 | option_parser.add_option('-q','--queue',default='normal_serial') 13 | option_parser.add_option('-l','--log_dir') 14 | (options,args) = option_parser.parse_args() 15 | 16 | # check that we get the qiime-required arguments 17 | if len(args) == 2: 18 | jobs_list_file = args[0] 19 | job_id = args[1] 20 | else: 21 | raise ValueError, "Didn't get the right command line arguments" 22 | 23 | # make a directory for holding LSF log files 24 | if options.log_dir == None: 25 | log_dir = os.path.join(os.environ['HOME'],'qiime_parallel_logs') 26 | else: 27 | log_dir = options.log_dir 28 | 29 | if not os.path.exists(log_dir): 30 | os.mkdir(log_dir,0755) 31 | 32 | # submit the jobs 33 | jobs_handle = open(jobs_list_file,'r') 34 | job_ids = [] 35 | logs = [] 36 | for (i,line) in enumerate(jobs_handle): 37 | datetimestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 38 | log = os.path.join( log_dir, 'job_%i_%s.log' % (i,datetimestamp) ) 39 | job_id = lsf.submit_to_LSF(options.queue,log,line.strip()) 40 | job_ids.append(job_id) 41 | logs.append(log) 42 | jobs_handle.close() 43 | -------------------------------------------------------------------------------- /bin/quality_hist.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | import sys 4 | import random 5 | 6 | from Bio import SeqIO 7 | 8 | import numpy as np 9 | import scipy as sp 10 | import scipy.stats 11 | 12 | import matplotlib as mpl 13 | mpl.use('agg') 14 | import matplotlib.pyplot as plt 15 | 16 | from pbs import wc 17 | 18 | input_file = sys.argv[1] 19 | output_file = sys.argv[2] 20 | 21 | num_lines = int(wc(input_file, '-l').split()[0]) 22 | assert(num_lines % 4 == 0) 23 | num_reads = num_lines / 4 24 | 25 | if num_reads > 10000000: 26 | idxs = set(sorted(random.sample(xrange(num_reads),10000000))) 27 | 28 | qualities = [] 29 | for (i,record) in enumerate(SeqIO.parse(input_file, 'fastq')): 30 | if num_reads <= 10000000 or i in idxs: 31 | qualities.append(record.letter_annotations['phred_quality']) 32 | 33 | if i % 10000 == 0: 34 | sys.stdout.write("%i " % i) 35 | sys.stdout.flush() 36 | 37 | qualities = np.array(qualities) 38 | 39 | positions = range(1, qualities.shape[1]+1) 40 | 41 | p5 = sp.stats.scoreatpercentile(qualities, 5) 42 | p25 = sp.stats.scoreatpercentile(qualities, 25) 43 | p50 = sp.stats.scoreatpercentile(qualities, 50) 44 | p75 = sp.stats.scoreatpercentile(qualities, 75) 45 | p95 = sp.stats.scoreatpercentile(qualities, 95) 46 | 47 | fig = plt.figure() 48 | ax = fig.add_subplot(111) 49 | ax.scatter(positions,p5, s=3, c='k', linewidths=0, zorder=2) 50 | ax.scatter(positions,p95, s=3, c='k', linewidths=0, zorder=2) 51 | for (pos, low, high) in zip(positions, p25, p75): 52 | ax.plot([pos, pos], [low, high], color='#bdbdbd', lw=1, zorder=1) 53 | ax.scatter(positions, p50, s=6, c='r', linewidths=0, zorder=3) 54 | ax.set_xlabel('position') 55 | ax.set_ylabel('phred score') 56 | ax.set_xlim([positions[0]-1, positions[-1]+1]) 57 | ax.set_ylim([0, 45]) 58 | fig.savefig(output_file) 59 | -------------------------------------------------------------------------------- /bin/sff2fastq_trimmed.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | from Bio import SeqIO 7 | 8 | argparser = argparse.ArgumentParser(description=None) 9 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('rb'),default=sys.stdin) 10 | argparser.add_argument('output_file',nargs='?',type=argparse.FileType('w'),default=sys.stdout) 11 | args = argparser.parse_args() 12 | 13 | for record in SeqIO.parse(args.input_file,'sff'): 14 | start = record.annotations['clip_qual_left'] 15 | end = record.annotations['clip_qual_right'] 16 | args.output_file.write( record[start:end].format('fastq') ) 17 | -------------------------------------------------------------------------------- /bin/streamgraph_html.py: -------------------------------------------------------------------------------- 1 | streamgraph_html = r""" 2 | 3 | 4 | Visualization 5 | 6 | 7 | 8 | 9 | 113 | 114 | 115 | 116 | """ -------------------------------------------------------------------------------- /bin/timeseries2json.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import sys 4 | import optparse 5 | import json 6 | 7 | import timeseries 8 | 9 | option_parser = optparse.OptionParser() 10 | # option_parser.add_option('-x','--xxx',dest='xxxx',type='int') 11 | (options,args) = option_parser.parse_args() 12 | 13 | if len(args) == 2: 14 | inhandle = open(args[0],'r') 15 | outhandle = open(args[1],'w') 16 | elif len(args) == 1: 17 | inhandle = open(args[0],'r') 18 | outhandle = sys.stdout 19 | elif len(args) == 0: 20 | inhandle = sys.stdin 21 | outhandle = sys.stdout 22 | 23 | data = timeseries.load_timeseries(inhandle) 24 | 25 | # eliminate numpy-ness of objects before JSON output 26 | np_matrix = data['matrix'] 27 | py_matrix = [] 28 | for row in np_matrix: 29 | py_matrix.append(list(row)) 30 | data['matrix'] = py_matrix 31 | data['labels'] = list(data['labels']) 32 | 33 | for label in data.keys(): 34 | if label == 'labels' or label == 'matrix': 35 | continue 36 | data[label] = list(data[label]) 37 | 38 | json.dump(data,outhandle) 39 | -------------------------------------------------------------------------------- /bin/timeseries2streamgraph.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import optparse 4 | import colorsys 5 | 6 | import numpy as np 7 | import matplotlib as mpl 8 | mpl.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | import scale 12 | import timeseries 13 | import streamgraph 14 | 15 | option_parser = optparse.OptionParser() 16 | option_parser.add_option('-f','--filter',type='choice',choices=['none','seen2','sum2','sum3'],default='none') 17 | (options,args) = option_parser.parse_args() 18 | 19 | if len(args) == 2: 20 | inhandle = open(args[0],'r') 21 | else: 22 | raise ValueError, "need input and output names" 23 | 24 | data = timeseries.load_timeseries(inhandle) 25 | matrix = data['matrix'] 26 | labels = np.asarray(data['labels']) 27 | times = data['times'] 28 | sums = data['sums'] 29 | 30 | streams = matrix / sums 31 | 32 | # determine colors for the streamgraph 33 | colors = [] 34 | time_idxs = np.arange(streams.shape[1]) 35 | onset_time = lambda stream: np.min(time_idxs[stream > 0]) 36 | weight = lambda stream: np.sum(stream) 37 | Hscale = scale.linear(range(len(times))).range(0,1-1./len(times)) 38 | Lscale = scale.root(streams.sum(axis=1)).range(0.8,0.5).power(4) 39 | for stream in streams: 40 | h = Hscale(onset_time(stream)) 41 | l = Lscale(weight(stream)) 42 | colors.append( colorsys.hls_to_rgb(h,l,1) + (1.,) ) 43 | colors = np.array(colors) 44 | 45 | # sort streamgraphs appropriately 46 | argsort_onset = streamgraph.argsort_onset(streams) 47 | streams = streams[argsort_onset] 48 | matrix = matrix[argsort_onset] 49 | colors = colors[argsort_onset] 50 | 51 | # argsort_inside_out = streamgraph.argsort_inside_out(streams) 52 | # streams = streams[argsort_inside_out] 53 | # colors = colors[argsort_inside_out] 54 | 55 | # filter out some clones 56 | if options.filter == 'none': 57 | filter_idxs = np.ones(streams.shape[0]) > 0 # all streams 58 | elif options.filter == 'seen2': 59 | filter_idxs = np.sum(streams > 0, axis=1) >= 2 # seen twice 60 | elif options.filter == 'sum2': 61 | filter_idxs = np.sum(matrix, axis=1) >= 2 # sum=2 62 | elif options.filter == 'sum3': 63 | filter_idxs = np.sum(matrix, axis=1) >= 3 # sum=3 64 | else: 65 | raise ValueError, "what filter do you want me to use?" 
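# filter_idxs is a boolean mask over the rows of `streams` (one row per clone);
# the plotting below hands only the kept rows to streamgraph.streamgraph(), with
# colors already assigned above by onset time (hue) and total weight (lightness)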
66 | 67 | fig = plt.figure(figsize=(24,16)) 68 | ax = fig.add_subplot(111) 69 | streamgraph.streamgraph(ax, streams[filter_idxs], x=times, colors=colors[filter_idxs]) 70 | streamgraph.format_streamgraph(ax) 71 | ax.xaxis.set_ticks(times) 72 | ax.autoscale_view() 73 | # fig.show() 74 | fig.savefig(args[1],dpi=120) 75 | -------------------------------------------------------------------------------- /blast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Bio.Blast import NCBIWWW 3 | from Bio.Blast import NCBIXML 4 | 5 | def number_genome_qblast_hits(seqreclist): 6 | fastastring = ''.join([rec.format('fasta') for rec in seqreclist]) 7 | results_handle = NCBIWWW.qblast('blastn','nr',fastastring,expect=1.,word_size=7,nucl_reward=1,nucl_penalty=-3,hitlist_size=1000) 8 | blast_records = NCBIXML.parse(results_handle) 9 | 10 | hits = [len(record.alignments) for record in blast_records] 11 | 12 | return hits 13 | 14 | def number_genome_qblast_protein_hits(sequence): 15 | results_handle = NCBIWWW.qblast('blastp','nr',sequence,expect=100,word_size=3,hitlist_size=1000) 16 | blast_records = NCBIXML.parse(results_handle) 17 | num_hits = sum([len(record.alignments) for record in blast_records]) 18 | return num_hits 19 | 20 | 21 | 22 | # def number_genome_qblast_hits(seqlist): 23 | # fastastring = '' 24 | # for (i,seq) in enumerate(seqlist): fastastring += '>seq%i\n%s\n' % (i,seq) 25 | # 26 | # results_handle = NCBIWWW.qblast('blastn','nr',fastastring,expect=0.1,word_size=7,nucl_reward=1,nucl_penalty=-3,hitlist_size=500) 27 | # blast_records = NCBIXML.parse(results_handle) 28 | # 29 | # total_hits = 0 30 | # for record in blast_records: total_hits += len(record.alignments) 31 | # 32 | # # print total_hits 33 | # # sys.stdout.flush() 34 | # 35 | # return total_hits 36 | -------------------------------------------------------------------------------- /blat.py: -------------------------------------------------------------------------------- 1 | # BLAT tools 2 | # based on Sri's code 3 | 4 | import sys 5 | import subprocess 6 | import os 7 | import signal 8 | import time 9 | 10 | import seqtools 11 | 12 | hg_idx = '~/genome/hg19.2bit' 13 | 14 | def start_gfServer(file2idx=hg_idx,tileSize=11,stepSize=2,minMatch=2,maxGap=4,repMatch=1000000,debug=False): 15 | params = (tileSize,stepSize,minMatch,maxGap,repMatch,file2idx) 16 | cmd = "gfServer start -tileSize=%i -stepSize=%i -minMatch=%i -maxGap=%i -repMatch=%i localhost 17779 %s" % params 17 | if debug: print "Command is:\n%s" % cmd 18 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 19 | time.sleep(660) 20 | print "Finished starting up BLAT server (hopefully)." 21 | return p 22 | 23 | def is_server_running(): 24 | p = subprocess.Popen('ps -A',shell=True,stdout=subprocess.PIPE) 25 | lines = p.stdout.readlines() 26 | for line in lines: 27 | if 'gfServer' in line: 28 | return True 29 | return False 30 | 31 | def stop_gfServer(p=None): 32 | if p != None: 33 | os.kill(p.pid,signal.SIGTERM) 34 | time.sleep(5) 35 | else: 36 | pids = [] 37 | p = subprocess.Popen('ps -A',shell=True,stdout=subprocess.PIPE) 38 | lines = p.stdout.readlines() 39 | for line in lines: 40 | if 'gfServer' in line: 41 | pids.append(int(line.split()[0])) 42 | for pid in pids: 43 | os.kill(pid,signal.SIGTERM) 44 | time.sleep(5) 45 | 46 | # HACK/BUG: for some reason gfClient is doubling the directory prefix. 
It works if 47 | # file2idx='/' 48 | # def search_sequences(seqs,file2idx=hg_idx,minScore=20,minIdentity=70,debug=False): 49 | def search_sequences(seqs,file2idx='/',minScore=15,minIdentity=70,debug=False): 50 | if not is_server_running(): 51 | raise RuntimeError, "BLAT server not running." 52 | 53 | # generate query 54 | if hasattr(seqs[0],'format'): 55 | query = ''.join([s.format('fasta') for s in seqs]) 56 | else: 57 | query = ''.join(['>query%i\n%s\n' % (i,s) for (i,s) in enumerate(seqs)]) 58 | 59 | # define and run command 60 | nibdir = os.path.dirname(file2idx) 61 | params = (minScore,minIdentity,nibdir) 62 | cmd = "gfClient -minScore=%i -minIdentity=%i -nohead localhost 17779 %s /dev/stdin /dev/stdout" % params 63 | if debug: print cmd 64 | p = subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE) 65 | p.stdin.write( query ) 66 | p.stdin.close() 67 | 68 | # process output 69 | num = 0 70 | for line in p.stdout: 71 | if debug: print line 72 | if line == "Output is in /dev/stdout\n": 73 | continue 74 | num += 1 75 | 76 | return num 77 | 78 | # HACK/BUG: for some reason gfClient is doubling the directory prefix. It works if 79 | # file2idx='/' 80 | def search_sequence(seq,file2idx='/',minScore=15,minIdentity=50,debug=False): 81 | return search_sequences([seq],file2idx,minScore,minIdentity,debug) 82 | -------------------------------------------------------------------------------- /countdata.py: -------------------------------------------------------------------------------- 1 | """ 2 | countdata.py 3 | 4 | Functions for stats and analysis of count data. 5 | 6 | """ 7 | 8 | import sys 9 | 10 | import numpy as np 11 | import scipy as sp 12 | # import scipy.stats 13 | 14 | # ============================================================================== 15 | 16 | # ====================== 17 | # = Count manipulation = 18 | # ====================== 19 | 20 | def sample2counts(sample, categories=0): 21 | """Return count vector from list of samples. 22 | 23 | Take vector of samples and return a vector of counts. The elts 24 | refer to indices in something that would ultimately map to the 25 | originating category (like from a multinomial). Therefore, if there 26 | are, say, 8 categories, then valid values in sample should be 0-7. 27 | If categories is not given, then i compute it from the highest value 28 | present in sample (+1). 29 | 30 | """ 31 | counts = np.bincount(sample) 32 | if (categories > 0) and (categories > len(counts)): 33 | counts = np.append( counts, np.zeros(categories-len(counts)) ) 34 | return counts 35 | 36 | def counts2sample(counts): 37 | """Computes a consistent sample from a vector of counts. 
38 | 39 | Takes a vector of counts and returns a vector of indices x 40 | such that len(x) = sum(c) and each elt of x is the index of 41 | a corresponding elt in c 42 | 43 | """ 44 | x = np.ones(np.sum(counts),dtype=np.int_) 45 | 46 | start_idx = 0 47 | end_idx = 0 48 | for i in xrange(len(counts)): 49 | start_idx = end_idx 50 | end_idx = end_idx + counts[i] 51 | x[start_idx:end_idx] = x[start_idx:end_idx] * i 52 | return x 53 | 54 | # ============================================================================== 55 | 56 | # ======================== 57 | # = Percentile functions = 58 | # ======================== 59 | 60 | def scoreatpercentile(values,rank): 61 | return sp.stats.scoreatpercentile(values,rank) 62 | 63 | def percentileofscore(values,score): 64 | values.sort() 65 | return values.searchsorted(score) / np.float_(len(values)) 66 | 67 | #The scipy version does some funny histogramming thing 68 | #def percentileofscore(values,score): 69 | # return stats.percentileofscore(values,score,kind='weak') 70 | 71 | # ============================================================================== 72 | 73 | # ============ 74 | # = q-values = 75 | # ============ 76 | 77 | def qvalues(p,lambd=np.arange(0,0.91,0.05),method='bootstrap',B=100,smoothlog = False,robust=False): 78 | """Compute q-values using Storey method from array of p-values. 79 | 80 | Adapted from his R software. 81 | 82 | """ 83 | # check validity of values 84 | p = np.array(p) 85 | if np.min(p)<0 or np.max(p)>1: 86 | raise Exception, "p-values not in valid range" 87 | 88 | m = len(p) 89 | 90 | pi0 = np.zeros(len(lambd)) 91 | 92 | for i in np.arange(len(lambd)): 93 | pi0[i] = np.mean(p >= lambd[i]) / (1-lambd[i]) 94 | 95 | if method == 'bootstrap': 96 | minpi0 = np.min(pi0) 97 | mse = np.zeros(len(lambd)) 98 | pi0_boot = np.zeros(len(lambd)) 99 | for i in np.arange( B ): 100 | p_boot = p[ np.random.randint(0,m,m) ] 101 | for j in np.arange( len(lambd) ): 102 | pi0_boot[j] = np.mean(p_boot >= lambd[j]) / (1-lambd[j]) 103 | mse += (pi0_boot - minpi0)**2 104 | pi0 = np.min(pi0[mse == np.min(mse)]) 105 | print pi0.shape 106 | pi0 = np.min(pi0,axis=1) 107 | elif method == 'smoother': 108 | # TODO 109 | print "Not implemented yet" 110 | return 111 | 112 | if pi0 <= 0: 113 | raise Exception, "The estimated pi0 <=0. May be problem with pvalues." 114 | 115 | # calculate estimated q-values 116 | u = np.argsort(p) 117 | v = qvalrank(p) 118 | 119 | qvalue = pi0*m*p/v 120 | if robust == True: 121 | qvalue = pi0*m*p/(v*(1-(1-p)**m)) 122 | 123 | qvalue[u[m-1]] = np.min( [qvalue[u[m-1]], 1] ) 124 | for i in np.arange(m-2,-1,-1): 125 | qvalue[u[i]] = np.min( [qvalue[u[i]], qvalue[u[i+1]], 1] ) 126 | 127 | return qvalue 128 | 129 | def qvalrank(x): 130 | idx = np.argsort(x) 131 | levels = np.unique(x) # sorted unique-d list 132 | bin = levels.searchsorted(x) 133 | tbl = np.bincount(bin) 134 | cs = np.cumsum(tbl) 135 | 136 | tbl = cs.repeat(tbl) 137 | tbl2 = np.zeros(len(tbl),np.int_) 138 | tbl2[idx] = tbl 139 | 140 | return tbl2 141 | 142 | # ============================================================================== 143 | 144 | # ==================== 145 | # = Compute p-values = 146 | # ==================== 147 | 148 | def pval_KalZtest(n1,N1,n2,N2): 149 | """Compute p-value using Kal Z-test for count data. 150 | 151 | Compute pval using Z-test, as published in 152 | Kal et al, 1999, Mol Biol Cell 10:1859. 
153 | 154 | Z = (p1-p2) / sqrt( p0 * (1-p0) * (1/N1 + 1/N2) ) 155 | where p1 = n1/N1, p2=n2/N2, and p0=(n1+n2)/(N1+N2) 156 | You reject if |Z| > Z_a/2 where a is sig lev. Here 157 | we return the p-value itself. 158 | 159 | """ 160 | if n1==0 and n2==0: 161 | return 1.0 162 | 163 | n1 = np.float_(n1) 164 | N1 = np.float_(N1) 165 | n2 = np.float_(n2) 166 | N2 = np.float_(N2) 167 | 168 | p0 = (n1+n2)/(N1+N2) 169 | p1 = n1/N1 170 | p2 = n2/N2 171 | 172 | Z = (p1-p2) / np.sqrt( p0 * (1-p0) * ((1/N1) + (1/N2)) ) 173 | 174 | pval = 2 * sp.stats.norm.cdf(-1*abs(Z)) 175 | 176 | return pval 177 | 178 | def pval_KalZtest_vec(n1,N1,n2,N2): 179 | assert n1.shape[0] == n2.shape[0] 180 | 181 | p0 = (n1+n2)/(float(N1)+N2) 182 | p1 = n1/float(N1) 183 | p2 = n2/float(N2) 184 | 185 | p0[(n1 == 0) & (n2 == 0)] = 0.5 186 | 187 | Z = (p1-p2) / np.sqrt( p0 * (1.-p0) * ((1./N1) + (1./N2)) ) 188 | 189 | pval = 2 * sp.stats.norm.cdf(-1*abs(Z)) 190 | pval[(n1 == 0) & (n2 == 0)] = -1. 191 | 192 | return pval 193 | 194 | def pval_logRatioMC(n1,N1,n2,N2): 195 | pass 196 | 197 | def pvals_logRatioMC(counts1, counts2, B=1e6, pseudocount=1, verbose=False): 198 | """Compute component-wise p-values of difference between two count vectors 199 | using Monte Carlo sampling of log ratios. 200 | 201 | Null hypothesis is that data is from same multinomial. Parameters estimated 202 | by combining both count vectors. Zeros are handled by adding pseudocount to 203 | each element. 204 | 205 | The test statistic is log Ratio, which is computed for each component. 206 | 207 | Two random count vectors are generated, and and component-wise log ratio 208 | is computed. For each component, it is recorded whether the abs random log 209 | ratio was greater than or less than the abs test statistic value. This is 210 | performed B times. The absolute value makes the test two-sided and symmetric. 211 | 212 | The achieved significance level (ASL) is returned for each component. 213 | 214 | """ 215 | if len(counts1) != len(counts2): raise ValueError, "Counts vectors have different lengths." 216 | 217 | counts1 = np.asarray(counts1, dtype=np.float) 218 | counts2 = np.asarray(counts2, dtype=np.float) 219 | 220 | total1 = int(np.round(np.sum(counts1))) 221 | total2 = int(np.round(np.sum(counts2))) 222 | 223 | countsMLE = counts1 + counts2 + pseudocount 224 | counts1 = counts1 + pseudocount # note: counts1 and counts2 are changed at this point 225 | counts2 = counts2 + pseudocount 226 | 227 | normcounts1 = counts1 / np.sum(counts1) 228 | normcounts2 = counts2 / np.sum(counts2) 229 | 230 | testabslogratios = np.abs(np.log10(normcounts2 / normcounts1)) 231 | 232 | probvec = countsMLE / np.sum(countsMLE) 233 | 234 | atleastasextreme = np.zeros(len(counts1)) 235 | 236 | for i in xrange(B): 237 | if verbose and i % 10 == 0: 238 | sys.stdout.write("%i " % i) 239 | sys.stdout.flush() 240 | 241 | randcounts1 = np.float_(np.random.multinomial(total1, probvec)) + pseudocount 242 | randcounts2 = np.float_(np.random.multinomial(total2, probvec)) + pseudocount 243 | 244 | normrandcounts1 = randcounts1 / np.sum(randcounts1) 245 | normrandcounts2 = randcounts2 / np.sum(randcounts2) 246 | 247 | randabslogratios = np.abs(np.log10(normrandcounts2 / normrandcounts1)) 248 | 249 | atleastasextreme += np.float_(randabslogratios >= testabslogratios) 250 | 251 | ASL = atleastasextreme / B 252 | 253 | return ASL 254 | 255 | def pvals_counts(counts1,counts2,method='KalZtest'): 256 | """Compute component-wise p-values of difference between two count vectors. 
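    For example (hypothetical count vectors from two samples over the same
    four categories):
        pvals = pvals_counts([120, 3, 0, 45], [95, 19, 2, 40], method='KalZtest')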
257 | 258 | method can be one of: 259 | KalZtest 260 | MonteCarlo 261 | 262 | """ 263 | if len(counts1) != len(counts2): raise ValueError, "Counts vectors have different lengths." 264 | 265 | pvals = np.zeros(len(counts1)) 266 | N1 = np.sum(counts1) 267 | N2 = np.sum(counts2) 268 | 269 | if method == 'KalZtest': 270 | for i in xrange(len(pvals)): 271 | pvals[i] = pval_KalZtest(counts1[i],N1,counts2[i],N2) 272 | elif method == 'MonteCarlo': 273 | pvals = pvals_logRatioMC(counts1,counts2,B=1e6,pseudocounts=1) 274 | else: 275 | raise Exception, method + " is not a recognized method for computing p-values." 276 | 277 | return pvals 278 | 279 | # ============================================================================== 280 | 281 | # ========================== 282 | # = Random data generation = 283 | # ========================== 284 | 285 | def gen_rand_count_vec(numComponents,numCounts,fracNull,probvecNull,probvecAlt): 286 | pass 287 | 288 | # ============================================================================== 289 | # ============================================================================== 290 | # ============================================================================== 291 | # ============================================================================== 292 | # ============================================================================== 293 | # ============================================================================== -------------------------------------------------------------------------------- /daemonize.py: -------------------------------------------------------------------------------- 1 | # from Python Cookbook, 2nd edition, Recipe 9.13 2 | import sys, os 3 | ''' Module to fork the current process as a daemon. 4 | NOTE: don't do any of this if your daemon gets started by inetd! inetd 5 | does all you need, including redirecting standard file descriptors; 6 | the chdir( ) and umask( ) steps are the only ones you may still want. 7 | ''' 8 | def daemonize (stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): 9 | ''' Fork the current process as a daemon, redirecting standard file 10 | descriptors (by default, redirects them to /dev/null). 11 | ''' 12 | # Perform first fork. 13 | try: 14 | pid = os.fork( ) 15 | if pid > 0: 16 | sys.exit(0) # Exit first parent. 17 | except OSError, e: 18 | sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror)) 19 | sys.exit(1) 20 | # Decouple from parent environment. 21 | os.chdir("/") 22 | os.umask(0) 23 | os.setsid( ) 24 | # Perform second fork. 25 | try: 26 | pid = os.fork( ) 27 | if pid > 0: 28 | sys.exit(0) # Exit second parent. 29 | except OSError, e: 30 | sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror)) 31 | sys.exit(1) 32 | # The process is now daemonized, redirect standard file descriptors. 
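    # (Flushing first avoids losing any output still buffered on the old
    # descriptors; dup2 then points fds 0/1/2 at the new files, so later
    # reads/writes on stdin/stdout/stderr go to them.)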
33 | for f in sys.stdout, sys.stderr: f.flush( ) 34 | si = file(stdin, 'r') 35 | so = file(stdout, 'a+') 36 | se = file(stderr, 'a+', 0) 37 | os.dup2(si.fileno( ), sys.stdin.fileno( )) 38 | os.dup2(so.fileno( ), sys.stdout.fileno( )) 39 | os.dup2(se.fileno( ), sys.stderr.fileno( )) 40 | 41 | # def _example_main ( ): 42 | # ''' Example main function: print a count & timestamp each second ''' 43 | # import time 44 | # sys.stdout.write('Daemon started with pid %d\n' % os.getpid( ) ) 45 | # sys.stdout.write('Daemon stdout output\n') 46 | # sys.stderr.write('Daemon stderr output\n') 47 | # c = 0 48 | # while True: 49 | # sys.stdout.write('%d: %s\n' % (c, time.ctime( ))) 50 | # sys.stdout.flush( ) 51 | # c = c + 1 52 | # time.sleep(1) 53 | # if _ _name_ _ == "_ _main_ _": 54 | # daemonize('/dev/null','/tmp/daemon.log','/tmp/daemon.log') 55 | # _example_main( ) 56 | -------------------------------------------------------------------------------- /degex.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # take a FASTA file of DNA sequences (short oligos) with IUPAC degeneracies 4 | # and ambiguities and expand all combinatorial possibilities into a new FASTA 5 | # file 6 | # 7 | # works by implementing a recursive depth first search 8 | 9 | IUPAC_vals = {'A': 'A', 10 | 'B': 'CGT', 11 | 'C': 'C', 12 | 'D': 'AGT', 13 | 'G': 'G', 14 | 'H': 'ACT', 15 | 'K': 'GT', 16 | 'M': 'AC', 17 | 'N': 'GATC', 18 | 'R': 'AG', 19 | 'S': 'CG', 20 | 'T': 'T', 21 | 'V': 'ACG', 22 | 'W': 'AT', 23 | 'X': 'GATC', 24 | 'Y': 'CT'} 25 | 26 | # ====================== 27 | # = Depth first search = 28 | # ====================== 29 | 30 | class dfs_node: 31 | def __init__(self, cum, rem): 32 | self.visited = False 33 | self.neighbors = [] 34 | self.cumul_seq = cum 35 | self.remain_seq = rem 36 | 37 | # to use: must supply: 38 | # 1. a list where the sequences will be pushed and 39 | # 2. 
a dfs_node with cumul_seq empty and remain_seq = IUPAC DNA sequence 40 | 41 | def dfs_expand_seq( curr_dfs_node, cum_list ): 42 | curr_dfs_node.visited = True 43 | 44 | # if we are not at the end of the tree yet 45 | if len(curr_dfs_node.remain_seq) > 0: 46 | # construct neighbors of current dfs_node based on remaining sequence 47 | for nucleotide in IUPAC_vals[ curr_dfs_node.remain_seq[0] ]: 48 | curr_dfs_node.neighbors.append( dfs_node(curr_dfs_node.cumul_seq + nucleotide, curr_dfs_node.remain_seq[1:]) ) 49 | 50 | # implement recursive DFS 51 | for neighbor in curr_dfs_node.neighbors: 52 | if neighbor.visited == False: 53 | dfs_expand_seq( neighbor, cum_list ) 54 | 55 | # we should only run this when there are no neighbors left 56 | elif len(curr_dfs_node.remain_seq) == 0: 57 | cum_list.append(curr_dfs_node.cumul_seq) 58 | 59 | def expand_seq(seq): 60 | expanded_list = [] 61 | start_node = dfs_node('',seq) 62 | dfs_expand_seq( start_node, expanded_list ) 63 | return expanded_list 64 | 65 | # ======== 66 | # = MAIN = 67 | # ======== 68 | 69 | if __name__ == '__main__': 70 | import sys 71 | 72 | from Bio import SeqIO 73 | 74 | if len(sys.argv) == 3: 75 | inhandle = open(sys.argv[1],'r') 76 | outhandle = open(sys.argv[2],'w') 77 | elif len(sys.argv) == 2: 78 | inhandle = open(sys.argv[1],'r') 79 | outhandle = sys.stdout 80 | elif len(sys.argv) == 1: 81 | inhandle = sys.stdin 82 | outhandle = sys.stdout 83 | 84 | for record in SeqIO.parse(inhandle,'fasta'): 85 | seq = record.seq.tostring().upper() 86 | expanded_seqs = expand_seq( seq ) 87 | for (i,s) in enumerate(expanded_seqs): 88 | outhandle.write(">%s|%i\n%s\n" % (record.description,i+1,s)) # write fasta output 89 | -------------------------------------------------------------------------------- /exonerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import subprocess 4 | 5 | import seqtools 6 | 7 | class ExonerateCommand(object): 8 | """Build command for exonerate""" 9 | 10 | options_list = [ 11 | 'query', 12 | 'target', 13 | 'querytype', 14 | 'targettype', 15 | 'querychunkid', 16 | 'querychunktotal', 17 | 'targetchunkid', 18 | 'targetchunktotal', 19 | 'verbose', 20 | 'exhaustive', 21 | 'bigseq', 22 | 'forcescan', 23 | 'saturatethreshold', 24 | 'customserver', 25 | 'fastasuffix', 26 | 'model', 27 | 'score', 28 | 'percent', 29 | 'showalignment', 30 | 'showsugar', 31 | 'showcigar', 32 | 'showvulgar', 33 | 'showquerygff', 34 | 'showtargetgff', 35 | # 'ryo', NOTE: this is left out as it requires special handling 36 | 'bestn', 37 | 'subopt', 38 | 'gappedextension', 39 | 'refine', 40 | 'refineboundary', 41 | 'dpmemory', 42 | 'compiled', 43 | 'terminalrangeint', 44 | 'terminalrangeext', 45 | 'joinrangeint', 46 | 'joinrangeext', 47 | 'spanrangeint', 48 | 'spanrangeext', 49 | 'extensionthreshold', 50 | 'singlepass', 51 | 'joinfilter', 52 | 'annotation', 53 | 'softmaskquery', 54 | 'softmasktarget', 55 | 'dnasubmat', 56 | 'proteinsubmat', 57 | 'fsmmemory', 58 | 'forcefsm', 59 | 'wordjump', 60 | 'gapopen', 61 | 'gapextend', 62 | 'codongapopen', 63 | 'codongapextend', 64 | 'minner', 65 | 'maxner', 66 | 'neropen', 67 | 'minintron', 68 | 'maxintron', 69 | 'intronpenalty', 70 | 'frameshift', 71 | 'useaatla', 72 | 'geneticcode', 73 | 'hspfilter', 74 | 'useworddropoff', 75 | 'seedrepeat', 76 | 'dnawordlen', 77 | 'proteinwordlen', 78 | 'codonnwordlen', 79 | 'dnahspdropoff', 80 | 'proteinhspdropoff', 81 | 'codonhspdropoff', 82 | 'dnahspthreshold', 83 | 'proteinhspthreshold', 84 | 
'codonhspthreshold', 85 | 'dnawordlimit', 86 | 'proteinwordlimit', 87 | 'codonwordlimit', 88 | 'geneseed', 89 | 'geneseedrepeat', 90 | 'alignmentwidth', 91 | 'forwardcoordinates', 92 | 'quality', 93 | 'splice3', 94 | 'splice5', 95 | 'forcegtag'] 96 | 97 | 98 | def __init__(self, *args, **kw): 99 | # register preset handlers 100 | self.register = { 101 | 'affine:local' : self.preset_affinelocal, 102 | 'affine:global' : self.preset_affineglobal, 103 | 'findend' : self.preset_findend, 104 | 'parsable' : self.preset_parsable, 105 | 'pretty' : self.preset_pretty, 106 | 'bestonly' : self.preset_bestonly, 107 | 'ungapped' : self.preset_ungapped 108 | } 109 | 110 | # these attributes must be handled special, and set manually at the start 111 | self.options = {} 112 | self.ryo = None 113 | 114 | # first execute any registered functions 115 | for a in args: 116 | self.register[a]() 117 | 118 | # check for ryo output and save it (needs special handling) 119 | if kw.has_key('ryo'): self.ryo = kw.pop('ryo') 120 | 121 | # then set all the manual options supplied 122 | self.options.update(kw) 123 | 124 | # set standard options in case they weren't given initially 125 | # they can still be overwritten 126 | self.softset_default() 127 | 128 | # return self 129 | 130 | def __setattr__(self,name,value): 131 | """Allows setting of options by acting on object attributes. 132 | 133 | For example: 134 | cmd = ExonerateCommand() 135 | cmd.querytype = 'dna' 136 | 137 | Catches the special cases of ryo and options. 138 | ryo needs to be set manually 139 | options shouldn't be overwritten, but lets you... 140 | """ 141 | if name in ExonerateCommand.options_list: 142 | self.options[name] = value 143 | else: 144 | object.__setattr__(self,name,value) 145 | 146 | def __getattr__(self,name): 147 | if name in ExonerateCommand.options_list: 148 | return self.options[name] 149 | else: 150 | raise AttributeError 151 | 152 | def build_command(self): 153 | self.cmd = 'exonerate' 154 | for (option,value) in self.options.iteritems(): 155 | self.cmd += ' --%s %s' % (option,value) 156 | 157 | # handle ryo output using raw string 158 | if self.ryo is not None: 159 | self.cmd += r' --%s "%s"' % ('ryo',self.ryo) 160 | 161 | return self.cmd 162 | 163 | def softset_default(self): 164 | """Conditionally override options to a reasonable default.""" 165 | if not self.options.has_key('model'): 166 | self.model = 'affine:local' 167 | if not self.options.has_key('querytype'): 168 | self.querytype = 'dna' 169 | if not self.options.has_key('targettype'): 170 | self.targettype = 'dna' 171 | 172 | def hardset_preset(self,*args): 173 | for a in args: 174 | register[a](self) 175 | 176 | def preset_affinelocal(self): 177 | self.model = 'affine:local' 178 | 179 | def preset_affineglobal(self): 180 | self.model = 'affine:global' 181 | self.exhaustive = True 182 | 183 | def preset_ungapped(self): 184 | self.model = 'ungapped' 185 | self.exhaustive = True 186 | 187 | def preset_findend(self): 188 | self.model = 'affine:overlap' 189 | self.exhaustive = True 190 | 191 | def preset_parsable(self): 192 | self.verbose = 0 193 | # self.showalignment = False 194 | # self.showvulgar = False 195 | self.ryo = r'aln_summary: %qi %ql %qab %qae %qS %ti %tl %tab %tae %tS %s %et %ei %pi\n' 196 | 197 | def preset_pretty(self): 198 | self.showalignment = True 199 | self.showvulgar = True 200 | self.showsugar = True 201 | 202 | def preset_bestonly(self): 203 | self.bestn = 1 204 | 205 | def run_exonerate(cmd,query=None,target=None): 206 | """Run exonerate using given 
ExonerateCommand object 207 | 208 | query and target must refer to files 209 | """ 210 | # check query and target are set properly 211 | if query is not None: cmd.query = query 212 | if target is not None: cmd.target = target 213 | try: 214 | cmd.query 215 | cmd.target 216 | except KeyError: 217 | print "cmd.query or cmd.target is not set" 218 | raise 219 | 220 | # submit process 221 | p = subprocess.Popen(cmd.build_command(),shell=True,stdout=subprocess.PIPE) 222 | aln = p.stdout.read() 223 | p.wait() 224 | return aln 225 | 226 | def run_exonerate2(cmd,query,target,queryname='query',targetname='target',debug=False): 227 | """Perform pairwise alignment using cmd ExonerateCommand object 228 | 229 | query and target are sequences 230 | """ 231 | # TODO: see if this can be implemented without writing to temporary files 232 | 233 | # write seqs to tempfiles 234 | (fdq,queryfile) = tempfile.mkstemp() 235 | (fdt,targetfile) = tempfile.mkstemp() 236 | iopq = open(queryfile,'w') 237 | iopt = open(targetfile,'w') 238 | print >>iopq, ">%s\n%s\n" % (queryname,query) 239 | print >>iopt, ">%s\n%s\n" % (targetname,target) 240 | iopq.close() 241 | iopt.close() 242 | os.close(fdq) 243 | os.close(fdt) 244 | 245 | try: 246 | # perform alignment 247 | cmd.query = queryfile 248 | cmd.target = targetfile 249 | aln = run_exonerate(cmd) 250 | finally: 251 | # clean up 252 | os.remove(queryfile) 253 | os.remove(targetfile) 254 | 255 | if debug: print aln 256 | 257 | return aln 258 | 259 | def iter_alnsummary(rawaln): 260 | """Return alnsummary line from rawaln.""" 261 | for line in rawaln.split('\n'): 262 | if line.startswith('aln_summary'): 263 | yield line 264 | 265 | def extract_alnsummary(rawaln): 266 | """Return alnsummary line from rawaln.""" 267 | return iter_alnsummary(rawaln).next() 268 | 269 | def iter_vulgar(rawaln): 270 | """Return vulgar line from rawaln.""" 271 | for line in rawaln.split('\n'): 272 | if line.startswith('vulgar'): 273 | yield line 274 | 275 | def extract_vulgar(rawaln): 276 | """Return vulgar line from rawaln.""" 277 | return iter_vulgar(rawaln).next() 278 | 279 | def iter_alnsummary_vulgar(rawaln): 280 | for (alnsummary,vulgar_commands) in zip(iter_alnsummary(rawaln),iter_vulgar(rawaln)): 281 | yield (alnsummary,vulgar_commands) 282 | 283 | def parse_alnsummary(rawalnsummary): 284 | """Parse alnsummary line from exonerate using 'parsable' preset. 285 | 286 | Takes an alnsummary line from an alignment that was generated from an ryo 287 | 'parsable' preset. 288 | """ 289 | # 'aln_summary: %qi %ql %qab %qae %qS %ti %tl %tab %tae %tS %s %et %ei %pi\n' 290 | data = rawalnsummary.split() 291 | 292 | aln = {} 293 | aln['query_id'] = data[1] 294 | aln['query_len'] = int(data[2]) 295 | aln['query_aln_begin'] = int(data[3]) 296 | aln['query_aln_end'] = int(data[4]) 297 | aln['query_strand'] = data[5] 298 | aln['target_id'] = data[6] 299 | aln['target_len'] = int(data[7]) 300 | aln['target_aln_begin'] = int(data[8]) 301 | aln['target_aln_end'] = int(data[9]) 302 | aln['target_strand'] = data[10] 303 | aln['score'] = int(data[11]) 304 | aln['equiv_total'] = int(data[12]) 305 | aln['equiv_id'] = int(data[13]) 306 | aln['percent_id'] = float(data[14]) 307 | 308 | return aln 309 | 310 | def parse_aln(rawaln): 311 | """Parse raw alignment from exonerate using 'parsable' preset. 312 | 313 | Takes a raw alignment and searches for an alnsummary line (generated from 314 | an ryo 'parsable' preset) and parses it. 
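
    A rough usage sketch (names from this module; the sequences are made-up
    placeholders, not data from a real run):

        cmd = ExonerateCommand('affine:local', 'parsable', 'bestonly')
        raw = run_exonerate2(cmd, 'ACGTACGTACGT', 'TTACGTACGTACGTAA')
        aln = parse_aln(raw)   # dict with query_id, score, percent_id, ...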
315 | """ 316 | for line in rawaln.split('\n'): 317 | if line.strip().startswith('aln_summary'): 318 | rawalnsummary = line.strip() 319 | break 320 | else: 321 | raise ValueError, "aln_summary line not found in raw aln:\n%s" % rawaln 322 | 323 | return parse_alnsummary(rawalnsummary) 324 | 325 | def parse_vulgar(rawvulgar): 326 | """Parse vulgar line 327 | 328 | Takes vulgar line from alignment output 329 | 330 | returns only the non-sugar part that allows you to build the aln 331 | """ 332 | data = rawvulgar.split()[10:] 333 | cmds = [] 334 | for i in range(0,len(data),3): 335 | cmds.append( (data[0],int(data[1]),int(data[2])) ) 336 | return cmds 337 | 338 | def build_aln(alnsummary,vulgar_commands,queryseq,targetseq): 339 | """Build full alignment from exonerate using 'parsable' preset and vulgar output""" 340 | 341 | queryname = alnsummary['query_id'] 342 | targetname = alnsummary['target_id'] 343 | 344 | # process strands. the position vars below will always progress 345 | # from 0->len(seq), so the seqs must be revcomped accordingly 346 | 347 | queryposition = alnsummary['query_aln_begin'] 348 | targetposition = alnsummary['target_aln_begin'] 349 | if alnsummary['query_strand'] == '-': 350 | queryseq = seqtools.reverse_complement(queryseq) 351 | queryposition = len(queryseq) - queryposition 352 | if alnsummary['target_strand'] == '-': 353 | targetseq = seqtools.reverse_complement(targetseq) 354 | targetposition = len(targetseq) - targetposition 355 | pad = abs(queryposition - targetposition) 356 | 357 | # build alignment 358 | queryaln = '' 359 | targetaln = '' 360 | 361 | # process necessary padding 362 | if queryposition > targetposition: 363 | targetaln = ' ' * pad 364 | else: 365 | queryaln = ' ' * pad 366 | 367 | # add pre-aln sequence 368 | queryaln += queryseq[0:queryposition] 369 | targetaln += targetseq[0:targetposition] 370 | 371 | # walk through alignment (from vulgar output) 372 | for cmd in vulgar_commands: 373 | if cmd[0] == 'M': 374 | assert(cmd[1]==cmd[2]) 375 | queryaln += queryseq[queryposition:queryposition+cmd[1]] 376 | targetaln += targetseq[targetposition:targetposition+cmd[2]] 377 | queryposition += cmd[1] 378 | targetposition += cmd[2] 379 | elif cmd[0] == 'G': 380 | assert( (cmd[1]==0) != (cmd[1]==0) ) # xor 381 | if cmd[1] == 0: 382 | queryaddendum = '-' * cmd[2] 383 | targetaddendum = targetseq[targetposition:targetposition+cmd[2]] 384 | elif cmd[2] == 0: 385 | queryaddendum = queryseq[queryposition:queryposition+cmd[1]] 386 | targetaddendum = '-' * cmd[1] 387 | queryaln += queryaddendum 388 | targetaln += targetaddendum 389 | queryposition += cmd[1] 390 | targetposition += cmd[2] 391 | else: 392 | raise ValueError, "I do not understand the vulgar command %s" % cmd[0] 393 | 394 | # add any post-aln sequence 395 | queryaln += queryseq[queryposition:] 396 | targetaln += targetseq[targetposition:] 397 | 398 | return (queryaln,targetaln) 399 | -------------------------------------------------------------------------------- /graphtools.py: -------------------------------------------------------------------------------- 1 | import pygraphviz as pgv 2 | 3 | import scale 4 | 5 | def load_immunitree_nodes(infile): 6 | G = pgv.AGraph(strict=True,directed=True) 7 | with open(infile,'r') as ip: 8 | ip.next() # burn header 9 | for line in ip: 10 | data = [d.strip() for d in line.split(',')] 11 | 12 | node = data[0] 13 | parent = data[1] 14 | size = int(data[2]) 15 | muts = len(data[-1].split('-')) 16 | 17 | G.add_node(node,xlabel="[%s] %i" % (node,size),size=size) 18 | 
if parent != '0': 19 | G.add_edge(parent,node,label=muts) 20 | 21 | return G 22 | 23 | def format_immunitree_graph(G): 24 | min_size = max(min([int(node.attr['size']) for node in G.nodes_iter()]),1) 25 | max_size = max([int(node.attr['size']) for node in G.nodes_iter()]) 26 | min_area = 0.3 27 | max_area = 1.3 28 | area_scale = scale.root(min_size,max_size).range(min_area,max_area).power(2) 29 | for node in G.nodes_iter(): 30 | node.attr['fixedsize'] = True 31 | if int(node.attr['size']) == 0: 32 | node.attr['shape'] = 'point' 33 | else: 34 | node.attr['shape'] = 'circle' 35 | node.attr['height'] = area_scale(int(node.attr['size'])) 36 | 37 | for edge in G.edges_iter(): 38 | pass 39 | 40 | G.graph_attr['forcelabels'] = True 41 | G.layout(prog='dot') 42 | -------------------------------------------------------------------------------- /lsf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | 5 | # =================== 6 | # = LSF Dispatching = 7 | # =================== 8 | 9 | def submit_to_LSF(queue, LSFopfile, duration, cmd_to_submit, mem_usage=None): 10 | # wrap command to submit in quotations 11 | cmd_to_submit = r"'%s'" % cmd_to_submit.strip(r'"') 12 | LSF_params = {'LSFoutput': LSFopfile, 13 | 'queue': queue, 14 | 'duration': duration} 15 | LSF_cmd = 'rbsub -q%(queue)s -W %(duration)s -o%(LSFoutput)s' % LSF_params 16 | if mem_usage != None: 17 | LSF_cmd += r' -R "rusage[mem=%d]"' % mem_usage 18 | cmd = ' '.join([LSF_cmd, cmd_to_submit]) 19 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 20 | #p.wait() 21 | return p.stdout.read().split('<')[1].split('>')[0] 22 | 23 | def parse_LSF_report(filename): 24 | jobID = -1 25 | finished = False 26 | succeeded = False 27 | 28 | ip = open(filename) 29 | for line in ip: 30 | if line.startswith('Subject:') and 'Job' in line: 31 | jobID = line.split()[2].rstrip(':') 32 | if 'Done' in line or 'Exited' in line: 33 | finished = True 34 | if 'TERM_REQUEUE_ADMIN' in line: # for when rbsub requeues 35 | finished = False 36 | if 'Successfully completed.' 
in line: 37 | succeeded = True 38 | ip.close() 39 | 40 | return (jobID,finished,succeeded) 41 | 42 | def wait_for_LSF_jobs(jobIDs,logfiles,interval=120): 43 | while len(jobIDs) > 0: 44 | time.sleep(interval) 45 | # parse logfiles to see which jobs finished in the interim 46 | for logfile in logfiles: 47 | if not os.path.exists(logfile): # (job not finished) 48 | continue 49 | (jobID,finished,succeeded) = parse_LSF_report(logfile) 50 | if jobID != -1 and finished and succeeded: 51 | jobIDs.remove(jobID) 52 | logfiles.remove(logfile) 53 | elif jobID != -1 and finished and not succeeded: 54 | raise ValueError, "Job %s failed" % jobID 55 | 56 | # DEPRECATED: USES bjobs TO TEST FOR JOB COMPLETION 57 | # def wait_for_LSF_jobs(PIDs,interval=30): 58 | # finished = False 59 | # while not finished: 60 | # time.sleep(interval) 61 | # p = subprocess.Popen('bjobs',shell=True,stdout=subprocess.PIPE) 62 | # #p.wait() 63 | # status = p.stdout.read().split('\n') 64 | # if status[0].split()[0] != 'JOBID': 65 | # finished = False 66 | # continue 67 | # runningprocesses = [line.split()[0] for line in status if line.split() != [] and line.split()[0] != 'JOBID'] 68 | # finished = True 69 | # for pid in PIDs: 70 | # if pid in runningprocesses: 71 | # finished = False -------------------------------------------------------------------------------- /mplextensions.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.stats 6 | 7 | import matplotlib as mpl 8 | import matplotlib.pyplot as plt 9 | 10 | def jitter(data, bins=100): 11 | data = np.asarray(data) 12 | (hist,edges) = np.histogram(data,bins=bins) 13 | hist = np.float_(hist) / max(hist) 14 | idxs = np.searchsorted(edges[:-2],data) 15 | return hist[idxs] 16 | 17 | def jitter_x(x,y,width=None,bins=100): 18 | x = np.asarray(x) 19 | y = np.asarray(y) 20 | 21 | x_argsort = np.argsort(x) 22 | x_uniq = sorted(list(set(x))) 23 | 24 | # find smallest interval between any two x-values 25 | if width == None: 26 | if len(x_uniq) == 1: 27 | width = 1. 28 | else: 29 | interval = min([x[x_argsort[i+1]]-x[x_argsort[i]] for i in xrange(len(x)-1)]) 30 | width = interval / 3. 31 | 32 | x_jit = [] 33 | y_jit = [] 34 | for val in x_uniq: 35 | idx = (x==val) 36 | scaling_factors = jitter(y[idx],bins=bins) 37 | for (x_val,y_val,scaling) in zip(x[idx],y[idx],scaling_factors): 38 | x_jit.append( x_val + width * scaling * random.choice([-1,1]) * np.random.uniform(0,1)) 39 | y_jit.append( y_val ) 40 | 41 | return (x_jit,y_jit) 42 | 43 | 44 | # def jitter_x(x,y,width=None): 45 | # x = np.asarray(x) 46 | # y = np.asarray(y) 47 | # 48 | # x_argsort = np.argsort(x) 49 | # x_uniq = sorted(list(set(x))) 50 | # 51 | # # find smallest interval between any two x-values 52 | # if width == None: 53 | # interval = min([x[x_argsort[i+1]]-x[x_argsort[i]] for i in xrange(len(x)-1)]) 54 | # width = interval / 3. 
55 | # 56 | # x_jit = [] 57 | # y_jit = [] 58 | # for val in x_uniq: 59 | # idx = (x==val) 60 | # kernel = sp.stats.kde.gaussian_kde(y[idx]) 61 | # kernel_max = max([kernel(v) for v in set(y[idx])]) 62 | # for (x_val,y_val) in zip(x[idx],y[idx]): 63 | # x_jit.append( x_val + np.random.uniform(-1,1) * width * kernel(y_val) / kernel_max) 64 | # y_jit.append( y_val ) 65 | # 66 | # return (x_jit,y_jit) 67 | 68 | 69 | class ConstWidthRectangle(mpl.patches.Patch): 70 | 71 | def __init__(self, x, y1, y2, w, **kwargs): 72 | self.x = x 73 | self.y1 = y1 74 | self.y2 = y2 75 | self.w = w 76 | mpl.patches.Patch.__init__(self,**kwargs) 77 | 78 | def get_path(self): 79 | return mpl.path.Path.unit_rectangle() 80 | 81 | def get_transform(self): 82 | box = np.array([[self.x,self.y1], 83 | [self.x,self.y2]]) 84 | box = self.axes.transData.transform(box) 85 | 86 | w = self.w * self.axes.bbox.width / 2.0 87 | 88 | box[0,0] -= w 89 | box[1,0] += w 90 | 91 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 92 | 93 | class ConstWidthLine(mpl.lines.Line2D): 94 | 95 | def __init__(self,x,y,w,**kwargs): 96 | self.x = x 97 | self.y = y 98 | self.w = w 99 | mpl.lines.Line2D.__init__(self,[0,1],[0,0],**kwargs) # init to unit line 100 | 101 | def get_transform(self): 102 | # define transform that takes unit horiz line seg 103 | # and places it in correct position using display 104 | # coords 105 | 106 | box = np.array([[self.x,self.y], 107 | [self.x,self.y+1]]) 108 | box = self.axes.transData.transform(box) 109 | 110 | w = self.w * self.axes.bbox.width / 2.0 111 | 112 | box[0,0] -= w 113 | box[1,0] += w 114 | 115 | #xdisp,ydisp = self.axes.transData.transform_point([self.x,self.y]) 116 | #xdisp -= w 117 | #xleft = xdisp - w 118 | #xright = xdisp + w 119 | 120 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 121 | #return mpl.transforms.Affine2D().scale(w,1).translate(xdisp,ydisp) 122 | 123 | def draw(self,renderer): 124 | # the ONLY purpose of redefining this function is to force the Line2D 125 | # object to execute recache(). 
Otherwise, certain changes in the scale 126 | # do not invalidate the Line2D object, and the transform will not be 127 | # recomputed (and so the Axes coords computed earlier will be obsolete) 128 | self.recache() 129 | return mpl.lines.Line2D.draw(self,renderer) 130 | 131 | 132 | class ConstHeightRectangle(mpl.patches.Patch): 133 | 134 | def __init__(self, x1, x2, y, h, **kwargs): 135 | self.x1 = x1 136 | self.x2 = x2 137 | self.y = y 138 | self.h = h 139 | mpl.patches.Patch.__init__(self,**kwargs) 140 | 141 | def get_path(self): 142 | return mpl.path.Path.unit_rectangle() 143 | 144 | def get_transform(self): 145 | box = np.array([[self.x1,self.y], 146 | [self.x2,self.y]]) 147 | box = self.axes.transData.transform(box) 148 | 149 | h = self.h * self.axes.bbox.height / 2.0 150 | 151 | box[0,1] -= h 152 | box[1,1] += h 153 | 154 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 155 | 156 | class ConstHeightLine(mpl.lines.Line2D): 157 | 158 | def __init__(self,x,y,h,**kwargs): 159 | self.x = x 160 | self.y = y 161 | self.h = h 162 | mpl.lines.Line2D.__init__(self,[0,0],[0,1],**kwargs) # init to unit line 163 | 164 | # self.x = x 165 | # self.y = y 166 | # self.w = w 167 | # mpl.lines.Line2D.__init__(self,[0,1],[0,0],**kwargs) # init to unit line 168 | 169 | def get_transform(self): 170 | # define transform that takes unit horiz line seg 171 | # and places it in correct position using display 172 | # coords 173 | 174 | box = np.array([[self.x,self.y], 175 | [self.x+1,self.y]]) 176 | box = self.axes.transData.transform(box) 177 | 178 | h = self.h * self.axes.bbox.height / 2.0 179 | 180 | box[0,1] -= h 181 | box[1,1] += h 182 | 183 | #xdisp,ydisp = self.axes.transData.transform_point([self.x,self.y]) 184 | #xdisp -= w 185 | #xleft = xdisp - w 186 | #xright = xdisp + w 187 | 188 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 189 | #return mpl.transforms.Affine2D().scale(w,1).translate(xdisp,ydisp) 190 | 191 | def draw(self,renderer): 192 | # the ONLY purpose of redefining this function is to force the Line2D 193 | # object to execute recache(). 
Otherwise, certain changes in the scale 194 | # do not invalidate the Line2D object, and the transform will not be 195 | # recomputed (and so the Axes coords computed earlier will be obsolete) 196 | self.recache() 197 | return mpl.lines.Line2D.draw(self,renderer) 198 | 199 | 200 | def boxplot(ax, x, positions=None, widths=None, vert=1): 201 | # adapted from matplotlib 202 | 203 | # convert x to a list of vectors 204 | if hasattr(x, 'shape'): 205 | if len(x.shape) == 1: 206 | if hasattr(x[0], 'shape'): 207 | x = list(x) 208 | else: 209 | x = [x,] 210 | elif len(x.shape) == 2: 211 | nr, nc = x.shape 212 | if nr == 1: 213 | x = [x] 214 | elif nc == 1: 215 | x = [x.ravel()] 216 | else: 217 | x = [x[:,i] for i in xrange(nc)] 218 | else: 219 | raise ValueError, "input x can have no more than 2 dimensions" 220 | if not hasattr(x[0], '__len__'): 221 | x = [x] 222 | col = len(x) 223 | 224 | # get some plot info 225 | if positions is None: 226 | positions = range(1, col + 1) 227 | if widths is None: 228 | widths = min(0.3/len(positions),0.05) 229 | if isinstance(widths, float) or isinstance(widths, int): 230 | widths = np.ones((col,), float) * widths 231 | 232 | # loop through columns, adding each to plot 233 | for i,pos in enumerate(positions): 234 | d = np.ravel(x[i]) 235 | row = len(d) 236 | if row==0: 237 | # no data, skip this position 238 | continue 239 | # get distrib info 240 | q1, med, q3 = mpl.mlab.prctile(d,[25,50,75]) 241 | dmax = np.max(d) 242 | dmin = np.min(d) 243 | 244 | line_color = '#074687' 245 | face_color = '#96B7EC' 246 | if vert == 1: 247 | medline = ConstWidthLine(pos,med,widths[i],color=line_color,zorder=3) 248 | box = ConstWidthRectangle(pos,q1,q3,widths[i],facecolor=face_color,edgecolor=line_color,zorder=2) 249 | vertline = mpl.lines.Line2D([pos,pos],[dmin,dmax],color=line_color,zorder=1) 250 | else: 251 | medline = ConstHeightLine(med,pos,widths[i],color=line_color,zorder=3) 252 | box = ConstHeightRectangle(q1,q3,pos,widths[i],facecolor=face_color,edgecolor=line_color,zorder=2) 253 | vertline = mpl.lines.Line2D([dmin,dmax],[pos,pos],color=line_color,zorder=1) 254 | 255 | ax.add_line(vertline) 256 | ax.add_patch(box) 257 | ax.add_line(medline) 258 | 259 | 260 | # define colormap for -1 to 1 (green-black-red) like gene expression 261 | _redgreencdict = {'red': [(0.0, 0.0, 0.0), 262 | (0.5, 0.0, 0.0), 263 | (1.0, 1.0, 0.0)], 264 | 265 | 'green':[(0.0, 0.0, 1.0), 266 | (0.5, 0.0, 0.0), 267 | (1.0, 0.0, 0.0)], 268 | 269 | 'blue': [(0.0, 0.0, 0.0), 270 | (0.5, 0.0, 0.0), 271 | (1.0, 0.0, 0.0)]} 272 | 273 | redgreen = mpl.colors.LinearSegmentedColormap('redgreen',_redgreencdict,256) 274 | redgreen.set_bad(color='w') 275 | 276 | 277 | def compute_log_view_lim(data): 278 | lo_lim = 10**np.floor(np.log10(np.min(data))) 279 | hi_lim = 10**np.ceil(np.log10(np.max(data))) 280 | return (lo_lim, hi_lim) 281 | 282 | def generate_counthist(counts, label, view_lim=[1e-6,1e0,1e0,1e5]): 283 | """Generate count size histogram. 
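    A usage sketch (made-up clone counts; the output file name is just an
    example):
        fig = generate_counthist({'cloneA': 120, 'cloneB': 3, 'cloneC': 1}, 'sample 1')
        fig.savefig('counthist.png')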
284 | 285 | counts -- dictionary of (key,count) pairs 286 | label -- for the legend 287 | """ 288 | max_size = max(counts.values()) 289 | num_chains = sum(counts.values()) 290 | sizes = np.arange(1,max_size+1) 291 | freqs = np.float_(sizes) / num_chains 292 | (hist,garbage) = np.histogram(counts.values(),bins=sizes) 293 | idxs = hist > 0 294 | 295 | fig = plt.figure() 296 | 297 | ax = fig.add_subplot(111) 298 | ax2 = ax.twiny() 299 | 300 | ax.spines['top'].set_position(('outward',5)) 301 | ax.spines['right'].set_visible(False) 302 | ax.spines['bottom'].set_position(('outward',5)) 303 | ax.spines['left'].set_position(('outward',5)) 304 | ax.xaxis.set_ticks_position('bottom') 305 | ax.yaxis.set_ticks_position('left') 306 | ax.plot(freqs[idxs],hist[idxs],marker='o',linestyle='None',color='#e31a1c',markeredgewidth=0,markersize=4,clip_on=False,label=label) 307 | ax.set_xscale('log') 308 | ax.set_yscale('log') 309 | ax.set_xlim(view_lim[:2]) 310 | ax.set_ylim(view_lim[2:]) 311 | 312 | ax2.spines['top'].set_position(('outward',5)) 313 | ax2.spines['right'].set_visible(False) 314 | ax2.spines['bottom'].set_visible(False) 315 | ax2.spines['left'].set_visible(False) 316 | ax2.xaxis.set_ticks_position('top') 317 | ax2.yaxis.set_ticks_position('none') 318 | ax2.set_xscale('log') 319 | ax2.set_xlim([view_lim[0]*num_chains,view_lim[1]*num_chains]) 320 | 321 | ax.set_xlabel('junction frequency (bottom) or count (top)') 322 | ax.set_ylabel('number of junctions') 323 | 324 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 325 | leg.get_frame().set_visible(False) 326 | 327 | return fig 328 | 329 | def generate_counthistline(counts, label, view_lim=[1e-6,1e0,1e0,1e5]): 330 | """Generate count size histogram. 331 | 332 | counts -- dictionary of (key,count) pairs 333 | label -- for the legend 334 | """ 335 | max_size = max(counts.values()) 336 | num_chains = sum(counts.values()) 337 | bins = np.logspace(0,np.log10(max_size),21) 338 | bins_freqs = np.float_(bins) / num_chains 339 | (hist,garbage) = np.histogram(counts.values(),bins=bins) 340 | 341 | fig = plt.figure() 342 | 343 | ax = fig.add_subplot(111) 344 | ax2 = ax.twiny() 345 | 346 | ax.spines['top'].set_position(('outward',5)) 347 | ax.spines['right'].set_visible(False) 348 | ax.spines['bottom'].set_position(('outward',5)) 349 | ax.spines['left'].set_position(('outward',5)) 350 | ax.xaxis.set_ticks_position('bottom') 351 | ax.yaxis.set_ticks_position('left') 352 | ax.plot(bins_freqs,list(hist)+[hist[-1]],color='#e31a1c',drawstyle='steps-post',clip_on=False,label=label) 353 | ax.set_xscale('log') 354 | ax.set_yscale('log') 355 | ax.set_xlim(view_lim[:2]) 356 | ax.set_ylim(view_lim[2:]) 357 | 358 | ax2.spines['top'].set_position(('outward',5)) 359 | ax2.spines['right'].set_visible(False) 360 | ax2.spines['bottom'].set_visible(False) 361 | ax2.spines['left'].set_visible(False) 362 | ax2.xaxis.set_ticks_position('top') 363 | ax2.yaxis.set_ticks_position('none') 364 | ax2.set_xscale('log') 365 | ax2.set_xlim([view_lim[0]*num_chains,view_lim[1]*num_chains]) 366 | 367 | ax.set_xlabel('junction frequency (bottom) or count (top)') 368 | ax.set_ylabel('number of junctions') 369 | 370 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 371 | leg.get_frame().set_visible(False) 372 | 373 | return fig 374 | 375 | def generate_rankaccum(counts,label,view_lim=[1e0,1e5,1e-6,1e0]): 376 | """Generate rankaccum curve. 
377 | 378 | counts -- dictionary of (key,count) pairs 379 | label -- for the legend 380 | """ 381 | num_chains = sum(counts.values()) 382 | freqs = np.float_(counts.values()) / num_chains 383 | 384 | fig = plt.figure() 385 | 386 | ax = fig.add_subplot(111) 387 | ax2 = ax.twinx() 388 | 389 | ax.spines['top'].set_visible(False) 390 | ax.spines['right'].set_position(('outward',5)) 391 | ax.spines['bottom'].set_position(('outward',5)) 392 | ax.spines['left'].set_position(('outward',5)) 393 | ax.xaxis.set_ticks_position('bottom') 394 | ax.yaxis.set_ticks_position('left') 395 | ax.plot(range(1,len(counts.values())+1),sorted(freqs,reverse=True),marker='o',linestyle='None',color='#377db8',markeredgewidth=0,markersize=4,clip_on=False,label=label) 396 | ax.set_xscale('log') 397 | ax.set_yscale('log') 398 | ax.set_xlim(view_lim[:2]) 399 | ax.set_ylim(view_lim[2:]) 400 | 401 | ax2.spines['top'].set_visible(False) 402 | ax2.spines['right'].set_position(('outward',5)) 403 | ax2.spines['bottom'].set_visible(False) 404 | ax2.spines['left'].set_visible(False) 405 | ax2.xaxis.set_ticks_position('none') 406 | ax2.yaxis.set_ticks_position('right') 407 | ax2.set_yscale('log') 408 | ax2.set_ylim([view_lim[2]*num_chains,view_lim[3]*num_chains]) 409 | 410 | ax.set_xlabel('rank') 411 | ax.set_ylabel('junction frequency (left) or count (right)') 412 | 413 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 414 | leg.get_frame().set_visible(False) 415 | 416 | return fig 417 | 418 | -------------------------------------------------------------------------------- /oligoTm.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from Bio.Seq import Seq 4 | from Bio.SeqRecord import SeqRecord 5 | 6 | def oligoTm(seqobj): 7 | """Computes the melting temp based on the NN model. 
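    Roughly: nearest-neighbor dH/dS sums plus initiation terms, a monovalent-
    salt correction on dS, and
        Tm = 1000*dH / (dS + R*ln(C_eff)) - 273.15
    minus a DMSO correction, as implemented below (C_eff stands for the
    effective primer concentration term used in the code).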
8 | 9 | (Originated from Kun Zhang) 10 | """ 11 | 12 | if isinstance(seqobj,SeqRecord): 13 | seq = seqobj.seq.tostring().upper() 14 | elif isinstance(seqobj,Seq): 15 | seq = seqobj.tostring().upper() 16 | elif isinstance(seqobj,str): 17 | seq = seqobj.upper() 18 | 19 | # set the default Tm parameters 20 | C_primer = 250.0 # nM 21 | C_Mg = 0.0 # mM #1.5 # 10mM intracellular concentration 22 | C_MonovalentIon = 50.0 #mM #10mM Tris-Cl in 9N Ligase 23 | C_dNTP = 0.0 #mM #0.8 #mM 24 | percentage_DMSO = 0 25 | percentage_annealed = 50 26 | 27 | percentage_annealed = percentage_annealed/100.0 28 | percentage_DMSO = percentage_DMSO/100.0 29 | 30 | #Some constants 31 | R = 1.987 32 | deltaH = dict() 33 | deltaS = dict() 34 | deltaH = { "AA": -7.6, "TT": -7.6, "AT": -7.2, "TA": -7.2, "CA": -8.5, "TG": -8.5, "GT": -8.4, "AC": -8.4,"CT": -7.8, "AG": -7.8, "GA": -8.2, "TC": -8.2,"CG": -10.6,"GC": -9.8, "GG": -8.0, "CC": -8.0, "A": 2.2, "T": 2.2, "G": 0.0, "C": 0.0} 35 | deltaS = { "AA": -21.3, "TT": -21.3, "AT": -20.4, "TA": -21.3, "CA": -22.7, "TG": -22.7, "GT": -22.4, "AC": -22.4, "CT": -21.0, "AG": -21.0, "GA": -22.2, "TC": -22.2,"CG": -27.2, "GC": -24.4, "GG": -19.9, "CC":-19.9, "A": 6.9, "T": 6.9, "G": 0.0, "C": 0.0} 36 | 37 | C_SodiumEquivalent = C_MonovalentIon + 120 * math.sqrt(C_Mg-C_dNTP) 38 | seqLength = len(seq) 39 | dH = 0.2 + deltaH[str(seq[0])] + deltaH[str(seq[len(seq)-1])] 40 | dS = -5.7 + deltaS[seq[0]] + deltaS[seq[len(seq)-1]] 41 | for i in range(0, seqLength - 1): 42 | dH += deltaH[str(seq[i:i+2])] 43 | dS += deltaS[seq[i:i+2]] 44 | dS = dS + 0.368 * seqLength * math.log(C_SodiumEquivalent/1000.0) 45 | #val = math.log(C_primer*(1-percentage_annealed)/percentage_annealed) 46 | Tm =(dH * 1000) / (dS + R * (math.log(C_primer*(1-percentage_annealed)/percentage_annealed)-21.4164)) - 273.15 - 0.75*percentage_DMSO 47 | return Tm 48 | 49 | oligo_Tm = oligoTm -------------------------------------------------------------------------------- /primers.py: -------------------------------------------------------------------------------- 1 | import oligoTm 2 | import unafold 3 | import seqtools 4 | 5 | def generate_candidates(seq,minlen=18,maxlen=30): 6 | candidates = [] 7 | for start in xrange(len(seq)): 8 | length = minlen 9 | while length <= maxlen and start+length <= len(seq): 10 | candidates.append( seq[start:start+length] ) 11 | length += 1 12 | return candidates 13 | 14 | def choose_PCR_primer(seq,target_Tm=62.): 15 | candidates = generate_candidates(seq) 16 | 17 | # filter for Tm 18 | candidates = filter(lambda s: abs(oligoTm.oligoTm(s) - target_Tm) <= 2, candidates) 19 | if len(candidates) == 0: 20 | raise ValueError, "No primer candidates meet Tm cutoffs" 21 | 22 | # filter for 0.4-0.6 GC content 23 | candidates = filter(lambda s: abs(seqtools.gc_content(s) - 0.5) <= 0.1,candidates) 24 | if len(candidates) == 0: 25 | raise ValueError, "No primer candidates meet GC content cutoffs" 26 | 27 | # rank on secondary structure minimization 28 | candidates.sort(key=unafold.hybrid_ss_min) 29 | 30 | return candidates[0] 31 | -------------------------------------------------------------------------------- /pyutils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import string 3 | import collections 4 | import contextlib 5 | 6 | @contextlib.contextmanager 7 | def as_handle(handleish, mode='r', **kwargs): 8 | """Open handleish as file. 
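    Works with either a path or an already-open handle, e.g. (hypothetical
    file name):
        with as_handle('reads.fasta') as fp:
            data = fp.read()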
9 | 10 | Stolen from Biopython 11 | """ 12 | if isinstance(handleish, basestring): 13 | with open(handleish, mode, **kwargs) as fp: 14 | yield fp 15 | else: 16 | yield handleish 17 | 18 | # for generating 'safe' filenames from identifiers 19 | cleanup_table = string.maketrans('/*|><+ ','_____p_') 20 | def cleanup_id(identifier): 21 | return identifier.translate(cleanup_table) 22 | 23 | 24 | class nesteddict(collections.defaultdict): 25 | """Nested dictionary structure. 26 | 27 | Based on Stack Overflow question 635483 28 | """ 29 | def __init__(self,default=None): 30 | if default == None: 31 | collections.defaultdict.__init__(self, nesteddict) 32 | else: 33 | collections.defaultdict.__init__(self, default) 34 | self.locked = False 35 | 36 | def lock(self): 37 | # self.default_factory = raiseKeyError 38 | self.default_factory = None 39 | self.locked = True 40 | for value in self.itervalues(): 41 | if isinstance(value, nesteddict): 42 | value.lock() 43 | 44 | def unlock(self): 45 | self.default_factory = nesteddict 46 | self.locked = False 47 | for value in self.itervalues(): 48 | if isinstance(value, nesteddict): 49 | value.unlock() 50 | 51 | def islocked(self): 52 | return self.locked 53 | 54 | def todict(self): 55 | raise NotImplementedError 56 | for (key,val) in self.iteritems(): 57 | if isinstance(val,nesteddict): 58 | val.todict() 59 | self[key] = dict(val) 60 | self = dict(self) 61 | 62 | @staticmethod 63 | def asdict(d): 64 | d = copy.deepcopy(d) 65 | for (key,val) in d.iteritems(): 66 | if isinstance(val,nesteddict): 67 | d[key] = nesteddict.asdict(val) 68 | return dict(d) 69 | 70 | def nested_setdefault(self,keylist,default): 71 | curr_dict = self 72 | for key in keylist[:-1]: 73 | curr_dict = curr_dict[key] 74 | key = keylist[-1] 75 | return curr_dict.setdefault(key,default) 76 | 77 | def nested_get(self,keylist,default): 78 | curr_dict = self 79 | for key in keylist[:-1]: 80 | curr_dict = curr_dict[key] 81 | key = keylist[-1] 82 | return curr_dict.get(key,default) 83 | 84 | def nested_assign(self,keylist,val): 85 | curr_dict = self 86 | for key in keylist[:-1]: 87 | curr_dict = curr_dict[key] 88 | key = keylist[-1] 89 | curr_dict[key] = val 90 | return self 91 | 92 | def walk(self): 93 | for (key,value) in self.iteritems(): 94 | if isinstance(value, nesteddict): 95 | for tup in value.walk(): 96 | yield (key,) + tup 97 | else: 98 | yield (key,value) 99 | 100 | # these functions below implement special cases of nesteddict, where the 101 | # deepest-level dict is of a particular type (e.g., int for counter, set 102 | # for uniq objects, etc.) 103 | # 104 | # These functions could be implemented with nested_setdefault and 105 | # nested_get, but would be less efficient since they would have to 106 | # traverse the dict structure more times. 
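    # For example (hypothetical keys), after
    #     d = nesteddict()
    #     d.nested_increment(('sampleA', 'IGHV1-2'), 3)
    #     d.nested_add(('sampleA', 'reads'), 'read_0001')
    # we expect d['sampleA']['IGHV1-2'] == 3 and
    # d['sampleA']['reads'] == set(['read_0001']).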
107 | 108 | def nested_increment(self,keylist,increment=1): 109 | curr_dict = self 110 | for key in keylist[:-1]: 111 | curr_dict = curr_dict[key] 112 | key = keylist[-1] 113 | curr_dict[key] = curr_dict.get(key,0) + increment 114 | 115 | def nested_add(self,keylist,obj): 116 | curr_dict = self 117 | for key in keylist[:-1]: 118 | curr_dict = curr_dict[key] 119 | key = keylist[-1] 120 | curr_dict.setdefault(key,set()).add(obj) 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | # class ModuleWrapper(object): 131 | # """Wrap a module to allow user-defined __getattr__ 132 | # 133 | # see http://stackoverflow.com/questions/2447353/getattr-on-a-module 134 | # """ 135 | # def __init__(self, module, usergetattr): 136 | # self.module = module 137 | # self.usergetattr = usergetattr 138 | # 139 | # def __getattr__(self, name): 140 | # return self.usergetattr(self,name) 141 | -------------------------------------------------------------------------------- /qPCR2melting.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | 6 | def qPCR2melting(inputfile,output_formats): 7 | outputbasename = os.path.splitext(os.path.basename(inputfile))[0] 8 | 9 | # Learn some things about the data: 10 | # How many curves are there? 11 | ip = open(inputfile,'r') 12 | for line in ip: 13 | if line.startswith('Step'): 14 | # Verify the fields in the line: 15 | fields = line.split(',') 16 | if fields[0] != 'Step' or fields[1] != 'Cycle' or fields[2] != 'Dye' or fields[3] != 'Temp.': 17 | raise ValueError, 'Expected line like: "Step,Cycle,Dye,Temp.,..."' 18 | curve_labels = fields[4:-1] # (skip the above four fields and last extra comma) 19 | break 20 | # What step is the melting at? 21 | for line in ip: # advance to data set characterization 22 | if line.strip() == 'Analysis Options': 23 | break 24 | for line in ip: 25 | if line.startswith("Step") and "Melting Curve" in line: 26 | line_id = line.split()[1].strip(':') 27 | break 28 | ip.close() 29 | 30 | # Create data structures 31 | temps = [] 32 | curves = [[] for curve in curve_labels] 33 | 34 | # Load the data 35 | ip = open(inputfile,'r') 36 | for line in ip: # advance to data 37 | if line.startswith('Step'): 38 | break 39 | for line in ip: 40 | if line.strip() == '': 41 | break 42 | if line.split(',')[0] == line_id: 43 | temps.append(float(line.split(',')[3])) 44 | data = map(float,line.split(',')[4:-1]) 45 | for (i,value) in enumerate(data): 46 | curves[i].append(value) 47 | 48 | # Make the plots 49 | fig = plt.figure() 50 | ax = fig.add_subplot(111) 51 | for (label,curve) in zip(curve_labels,curves): 52 | ax.plot(temps,curve,label=label) 53 | ax.legend(loc=2) 54 | ax.set_xlabel('Temperature') 55 | ax.set_ylabel('Fluorescence (a.u.)') 56 | for format in output_formats: 57 | fig.savefig(outputbasename+'.melting.'+format) 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | import optparse 62 | 63 | output_formats = set() 64 | def append_format(option,opt_str,value,parser): 65 | output_formats.add(opt_str.strip('-')) 66 | 67 | option_parser = optparse.OptionParser() 68 | option_parser.add_option('--png',action='callback',callback=append_format) 69 | option_parser.add_option('--pdf',action='callback',callback=append_format) 70 | option_parser.add_option('--eps',action='callback',callback=append_format) 71 | (options,args) = option_parser.parse_args() 72 | 73 | if len(args) != 1: 74 | raise ValueError, "Must give a single file as input." 
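    # Typical invocation (hypothetical export file from the qPCR instrument):
    #     python qPCR2melting.py --pdf --png run1_export.csv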
75 | 76 | output_formats = list(output_formats) 77 | if output_formats == []: 78 | output_formats.append('pdf') 79 | output_formats.append('png') 80 | inputfile = args[0] 81 | 82 | qPCR2melting(inputfile,output_formats) 83 | -------------------------------------------------------------------------------- /qPCR2quantitation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | 6 | def qPCR2quantitation(inputfile,output_formats): 7 | outputbasename = os.path.splitext(os.path.basename(inputfile))[0] 8 | 9 | # Learn some things about the data: 10 | # How many curves are there? 11 | ip = open(inputfile,'r') 12 | for line in ip: 13 | if line.startswith('Step'): 14 | # Verify the fields in the line: 15 | fields = line.split(',') 16 | if fields[0] != 'Step' or fields[1] != 'Cycle' or fields[2] != 'Dye' or fields[3] != 'Temp.': 17 | raise ValueError, 'Expected line like: "Step,Cycle,Dye,Temp.,..."' 18 | curve_labels = fields[4:-1] # (skip the above four fields and last extra comma) 19 | break 20 | # What step is the quantitation at? 21 | for line in ip: # advance to data set characterization 22 | if line.strip() == 'Analysis Options': 23 | break 24 | for line in ip: 25 | if line.startswith("Step") and "Quantitation" in line: 26 | line_id = line.split()[1].strip(':') 27 | break 28 | ip.close() 29 | 30 | # Create data structures 31 | cycles = [] 32 | curves = [[] for curve in curve_labels] 33 | 34 | # Load the data 35 | ip = open(inputfile,'r') 36 | for line in ip: # advance to data 37 | if line.startswith('Step'): 38 | break 39 | for line in ip: 40 | if line.strip() == '': 41 | break 42 | if line.split(',')[0] == line_id: 43 | cycles.append(int(line.split(',')[1])) 44 | data = map(float,line.split(',')[4:-1]) 45 | for (i,value) in enumerate(data): 46 | curves[i].append(value) 47 | 48 | # Make the plots 49 | fig = plt.figure() 50 | ax = fig.add_subplot(111) 51 | for (label,curve) in zip(curve_labels,curves): 52 | ax.plot(cycles,curve,label=label) 53 | ax.legend(loc=2) 54 | ax.set_xlabel('Cycles') 55 | ax.set_ylabel('Fluorescence (a.u.)') 56 | for format in output_formats: 57 | fig.savefig(outputbasename+'.quantitation.'+format) 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | import optparse 62 | 63 | output_formats = set() 64 | def append_format(option,opt_str,value,parser): 65 | output_formats.add(opt_str.strip('-')) 66 | 67 | option_parser = optparse.OptionParser() 68 | option_parser.add_option('--png',action='callback',callback=append_format) 69 | option_parser.add_option('--pdf',action='callback',callback=append_format) 70 | option_parser.add_option('--eps',action='callback',callback=append_format) 71 | (options,args) = option_parser.parse_args() 72 | 73 | if len(args) != 1: 74 | raise ValueError, "Must give a single file as input." 
75 | 76 | output_formats = list(output_formats) 77 | if output_formats == []: 78 | output_formats.append('pdf') 79 | output_formats.append('png') 80 | inputfile = args[0] 81 | 82 | qPCR2quantitation(inputfile,output_formats) 83 | -------------------------------------------------------------------------------- /sanger.py: -------------------------------------------------------------------------------- 1 | import exonerate 2 | 3 | standard_primers = { # 5' -> 3' 4 | 'M13R' : 'caggaaacagctatgac', 5 | 'M13F-20' : 'gtaaaacgacggccag', 6 | 'T3' : 'attaaccctcactaaaggga', 7 | 'T7' : 'taatacgactcactataggg' 8 | } 9 | 10 | standard_vectors = { 11 | 'pCR4-TOPO-left' : 'catgattacgccaagctcagaattaaccctcactaaagggactagtcctgcaggtttaaacgaattcgccctt', 12 | 'pCR4-TOPO-right' : 'aagggcgaattcgcggccgctaaattcaattcgccctatagtgagtcgtattacaattca', 13 | 'pCR4Blunt-TOPO-left' : 'catgattacgccaagctcagaattaaccctcactaaagggactagtcctgcaggtttaaacgaattcgccctt', 14 | 'pCR4Blunt-TOPO-right' : 'aagggcgaattcgcggccgctaaattcaattcgccctatagtgagtcgtattacaattca' 15 | } 16 | 17 | 18 | 19 | 20 | def trimleft(left,read): 21 | """Align 2 seqs, forcing alignment of right-end of left. 22 | 23 | ...RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 24 | ...LLLLLLLLLLLLLLLLLLLLLLLLLLL <- (forced aln here) 25 | 26 | Uses exonerate. 27 | """ 28 | # perform alignment 29 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 30 | rawaln = exonerate.run_exonerate2(cmd,left,read) 31 | if rawaln == '': return read 32 | aln = exonerate.parse_aln(rawaln) 33 | 34 | # check that the right-end of left was successfully placed 35 | if aln['query_len'] != aln['query_aln_end']: 36 | raise ValueError, "failed to align right-end of left sequence" 37 | 38 | # check that both strands are + orientation 39 | if aln['query_strand'] != '+': 40 | raise ValueError, "query strand has been reversed" 41 | if aln['target_strand'] != '+': 42 | raise ValueError, "target strand has been reversed" 43 | 44 | # return trimmed sequence 45 | return read[aln['target_aln_end']:] 46 | 47 | def trimright(right,read): 48 | """Align 2 seqs, forcing alignment of left-end of right. 49 | 50 | ...DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD... 51 | (forced aln here) -> RRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 52 | 53 | Uses exonerate. 54 | """ 55 | # perform alignment 56 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 57 | rawaln = exonerate.run_exonerate2(cmd,right,read) 58 | if rawaln == '': return read 59 | aln = exonerate.parse_aln(rawaln) 60 | 61 | # check that the left-end of right was successfully placed 62 | if aln['query_aln_begin'] != 0: 63 | raise ValueError, "failed to align left-end of right sequence" 64 | 65 | # check that both strands are + orientation 66 | if aln['query_strand'] != '+': 67 | raise ValueError, "query strand has been reversed" 68 | if aln['target_strand'] != '+': 69 | raise ValueError, "target strand has been reversed" 70 | 71 | # return trimmed sequence 72 | return read[:aln['target_aln_begin']] 73 | 74 | # =============== 75 | # = UNFINISHED: = 76 | # =============== 77 | 78 | def bidirectional_alignment(forward,reverse): 79 | """Align forward and reverse sequence of bidirectional Sanger reads. 80 | 81 | forward and reverse sequences must already be in the same 'sense' (i.e., 82 | reverse should be revcomped if necessary so that both strands in alignment 83 | are '+'). 84 | 85 | Forces alignment of both ends (right end of forward, and left end of 86 | reverse). 87 | 88 | ...FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 89 | RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 
90 | 91 | Uses exonerate. 92 | """ 93 | # perform alignment 94 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 95 | rawaln = exonerate.run_exonerate2(cmd,forward,reverse) 96 | aln = exonerate.parse_aln(rawaln) 97 | 98 | # check that both strands are + orientation 99 | if aln['query_strand'] != '+': 100 | raise ValueError, "query strand has been reversed" 101 | if aln['target_strand'] != '+': 102 | raise ValueError, "target strand has been reversed" 103 | 104 | # check that right end of forward and left end of reverse are placed 105 | if aln['query_len'] != aln['query_aln_end']: 106 | raise ValueError, "failed to align right-end of forward sequence" 107 | if aln['query_aln_begin'] != 0: 108 | raise ValueError, "failed to align left-end of right sequence" 109 | -------------------------------------------------------------------------------- /scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import types 3 | import numbers 4 | import bisect 5 | 6 | import numpy as np 7 | 8 | def is_iterable(x): 9 | try: 10 | iter(x) 11 | return True 12 | except TypeError: 13 | return False 14 | 15 | class quantitative(object): 16 | """Implement abstract quantitative scale.""" 17 | 18 | def __init__(self, *args): 19 | self._domain = [0,1] 20 | self._range = [0,1] 21 | 22 | self._transform = lambda x: x 23 | self._inverse = lambda y: y 24 | 25 | self.domain(*args) 26 | 27 | def _in_domain(self,x): 28 | return (x >= min(self._domain)) and (x <= max(self._domain)) 29 | 30 | def _in_range(self,y): 31 | return (y >= min(self._range)) and (y <= max(self._range)) 32 | 33 | def __call__(self,x): 34 | if not self._in_domain(x): 35 | raise ValueError, "outside domain" 36 | segment = bisect.bisect_right(self._domain,x) - 1 37 | if segment + 1 == len(self._domain): segment -= 1 # deal with extra endpoint (fully closed interval), e.g., [0,1) [1,2) [2,3] 38 | return (self._transform(x) - self._transform(self._domain[segment])) / (self._transform(self._domain[segment+1]) - self._transform(self._domain[segment])) * (self._range[segment+1] - self._range[segment]) + self._range[segment] 39 | 40 | def domain(self,*args): 41 | if len(args) == 0: 42 | return self._domain 43 | elif is_iterable(args[0]): # given array of data from which to determine domain 44 | if len(args[0]) < 2: raise ValueError, "domain specification needs at least two numbers" 45 | self._domain = [np.min(args[0]),np.max(args[0])] 46 | else: # given explicit values for piecewise domain 47 | if len(args) != len(set(args)): 48 | raise ValueError, "domain values must be unique" 49 | if list(args) != sorted(list(args)) and list(args)[::-1] != sorted(list(args)): # FIGURE THIS OUT 50 | raise ValueError, "domain values must be sorted" 51 | self._domain = args 52 | 53 | self._domain = map(float,self._domain) 54 | map(self._transform,self._domain) # test that transform is defined on domain 55 | 56 | return self 57 | 58 | def range(self,*args): 59 | if len(args) == 0: 60 | return self._range 61 | elif is_iterable(args[0]): # given array of data from which to determine range 62 | if len(args[0]) != len(self._domain): raise ValueError, "range specification needs at least two numbers" 63 | self._range = [np.min(args[0]),np.max(args[0])] 64 | else: # given explicit values for piecewise range 65 | if len(args) != len(set(args)): 66 | raise ValueError, "range values must be unique" 67 | if list(args) != sorted(list(args)) and list(args)[::-1] != sorted(list(args)): # FIGURE THIS OUT 68 | raise 
ValueError, "range values must be sorted" 69 | self._range = args 70 | 71 | if len(args) != len(self._domain): 72 | raise ValueError, "range specification must have same number of points as domain" 73 | 74 | return self 75 | 76 | def invert(self,y): 77 | if not self._in_range(x): 78 | raise ValueError, "outside range" 79 | segment = bisect.bisect_right(self._range,y) - 1 80 | if segment == len(self._range): segment -= 1 # deal with extra endpoint (fully closed interval), e.g., [0,1) [1,2) [2,3] 81 | return self._inverse((y - self._range[segment]) / (self._range[segment+1] - self._range[segment]) * (self._transform(self._domain[segment+1]) - self._transform(self._domain[segment])) + self._transform(self._domain[segment])) 82 | 83 | linear = quantitative 84 | 85 | class log(quantitative): 86 | """Implementation of log scale""" 87 | 88 | def __init__(self, *args): 89 | self._domain = [1,10] 90 | quantitative.__init__(self,*args) 91 | self.base(10) 92 | 93 | def base(self,*args): 94 | if len(args) == 0: 95 | return self._base 96 | else: 97 | self._base = args[0] 98 | self._logbase = math.log(self._base) 99 | self._transform = lambda x: math.log(x) / self._logbase 100 | self._inverse = lambda y: self._base ** y 101 | return self 102 | 103 | class root(quantitative): 104 | """root scale""" 105 | 106 | def __init__(self, *args): 107 | quantitative.__init__(self,*args) 108 | self.power(2) 109 | 110 | def power(self,*args): 111 | if len(args) == 0: 112 | return self._power 113 | else: 114 | self._power = args[0] 115 | self._transform = lambda x: x**(1./self._power) 116 | self._inverse = lambda y: y**self._power 117 | return self 118 | 119 | 120 | 121 | # class ordinal(object): 122 | # """Implementation for ordinal scale""" 123 | # 124 | # def __init__(self, *args): 125 | # Scale.__init__(self) 126 | # self._domain = [] 127 | # self._indices = {} 128 | # self._range = [] 129 | # self._band = 0 130 | # self.domain(*args) 131 | # return self 132 | # 133 | # def scale(self,x): 134 | # if x not in self._indices: 135 | # self._domain.append(x) 136 | # self._indices[x] = len(self._domain) - 1 137 | # return self._range[ self._indices[x] % len(self._range) ] 138 | # 139 | # def domain(self,*args): 140 | # if len(args) == 0: 141 | # return self._domain 142 | # 143 | # try: 144 | # iter(args[0]) # test for array type 145 | # array = args[0] 146 | # if len(args) > 1: 147 | # array = map(args[1],array) 148 | # except TypeError: 149 | # array = args 150 | # 151 | # self._domain = list(set(array)) 152 | # self._indices = pv.numerate(self._domain) 153 | # 154 | # return self 155 | # 156 | # def range(self,*args): 157 | # if len(args) == 0: 158 | # return self._range 159 | # 160 | # try: 161 | # iter(args[0]) # test for array type 162 | # array = args[0] 163 | # if len(args) > 1: 164 | # array = map(args[1],array) 165 | # except TypeError: 166 | # array = args 167 | # 168 | # if isinstance(array[0],types.StringType): 169 | # array = map(pv.color,array) 170 | # 171 | # self._range = array 172 | # 173 | # return self 174 | # 175 | # def split(self,_min,_max): 176 | # step = float(_max - _min) / length(self.domain()) 177 | # self._range = range(_min + step / 2., _max, step) 178 | # return self 179 | # 180 | # def splitFlush(self,_min,_max): 181 | # n = len(self.domain()) 182 | # step = float(_max - _min) / (n - 1) 183 | # if n == 1: 184 | # self._range = (_min + _max) / 2. 
185 | # else: 186 | # self._range = range(_min, _max + step / 2., step) 187 | # return self 188 | # 189 | # def splitBanded(self,_min,_max,band=1): 190 | # if band < 0: 191 | # n = len(self.domain()) 192 | # total = -band * n 193 | # remaining = _max - _min - total 194 | # padding = remaining / float(n + 1) 195 | # self._range = range(_min + padding, _max, padding - band) 196 | # self._band = -band 197 | # else: 198 | # step = float(_max - _min) / (len(self.domain()) + (1 - band)) 199 | # self._range = range(_min + step * (1 - band), _max, step) 200 | # self._band = step * band 201 | # return self 202 | # 203 | # def by(self,f): 204 | # raise NotImplementedError 205 | # 206 | # class quantile(Scale): 207 | # """quantile scale""" 208 | # 209 | # def __init__(self, *args): 210 | # Scale.__init__(self) 211 | # self._num_quantiles = -1 212 | # self._max_quantile_index = -1 213 | # self._quantile_boundaries = [] 214 | # self._domain = [] 215 | # self._y = linear() # the range 216 | # self.domain(*args) 217 | # return self 218 | # 219 | # def scale(self,x): 220 | # return self._y(max(0, min(self._max_quantile_index, bisect.bisect_right(self._quantile_boundaries, x) - 1)) / float(self._max_quantile_index)) 221 | # 222 | # def quantiles(self,*args): 223 | # if len(args) == 0: 224 | # return self._quantile_boundaries 225 | # 226 | # self._num_quantiles = int(args[0]) 227 | # 228 | # if self._num_quantiles < 0: 229 | # self._quantile_boundaries = [self._domain[0]] + self._domain 230 | # self._max_quantile_index = len(self._domain) - 1 231 | # else: 232 | # self._quantile_boundaries = [self._domain[0]] 233 | # for i in range(1,self._num_quantiles+1): 234 | # self._quantile_boundaries.append( self._domain[ int(float(i) * (len(self._domain) - 1) / self._num_quantiles) ] ) 235 | # self._max_quantile_index = self._num_quantiles - 1 236 | # 237 | # return self 238 | # 239 | # def domain(self,*args): 240 | # if len(args) == 0: 241 | # return self._domain 242 | # 243 | # try: 244 | # iter(args[0]) 245 | # array = args[0] 246 | # if len(args) > 1: 247 | # array = map(args[1],array) 248 | # except TypeError: 249 | # array = args 250 | # 251 | # self._domain = array 252 | # self._domain.sort() 253 | # self.quantiles(self._num_quantiles) 254 | # return self 255 | # 256 | # def range(self,*args): 257 | # if len(args) == 0: 258 | # return self._y.range() 259 | # 260 | # self._y.range(*args) 261 | # return self 262 | # 263 | # def by(self,f): 264 | # raise NotImplementedError 265 | # 266 | -------------------------------------------------------------------------------- /seqtools.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import string 4 | import random 5 | import itertools 6 | 7 | from Bio import Alphabet 8 | from Bio.Seq import Seq 9 | from Bio.SeqRecord import SeqRecord 10 | from Bio.SeqFeature import SeqFeature, FeatureLocation 11 | from Bio import pairwise2 12 | 13 | import numpy as np 14 | import scipy as sp 15 | import scipy.stats 16 | 17 | from jellyfish import hamming_distance 18 | 19 | import unafold 20 | from pyutils import as_handle 21 | 22 | random.seed() 23 | 24 | # ============================== 25 | # = General sequence utilities = 26 | # ============================== 27 | 28 | def substitute(seq,pos,sub): 29 | return seq[:pos] + sub + seq[pos+1:] 30 | 31 | 32 | complement_table = string.maketrans('ACGTRYSWKMBDHVN','TGCAYRSWMKVHDBN') 33 | 34 | def reverse(seq): 35 | return seq[::-1] 36 | 37 | 38 | def complement(seq): 39 
| return seq.upper().translate(complement_table) 40 | 41 | 42 | def reverse_complement(seq): 43 | """Compute reverse complement of sequence. 44 | 45 | Mindful of IUPAC ambiguities. 46 | Return all uppercase. 47 | """ 48 | return reverse(complement(seq)) 49 | # return seq.upper().translate(complement_table)[::-1] 50 | 51 | def translate(seq): 52 | return Seq(seq.replace('-','N'),Alphabet.DNAAlphabet()).translate().tostring() 53 | 54 | 55 | def gc_content(seq): 56 | gc = seq.lower().count('g') + seq.lower().count('c') 57 | return float(gc) / len(seq) 58 | 59 | 60 | def random_dna_seq(n): 61 | choice = random.choice 62 | return reduce(lambda cumul,garbage:cumul+choice('ACGT'),xrange(n),'') 63 | 64 | global_align = lambda seq1,seq2: pairwise2.align.globalms(seq1,seq2,0.5,-0.75,-2.,-1.5,one_alignment_only=True)[0] 65 | 66 | def percent_id(seq1,seq2): 67 | alignment = global_align(seq1,seq2) 68 | return (1. - hamming_distance(alignment[0],alignment[1]) / float(len(alignment[0]))) * 100. 69 | 70 | 71 | # barcode mapping fns 72 | def barcode_hamming(observed,barcodes): 73 | """Compute entropy of probabilistic barcode assignment. 74 | 75 | observed -- SeqRecord of the barcode 76 | barcodes -- list of barcode possibilities (python strings) 77 | """ 78 | obs_seq = observed.seq.tostring() 79 | distances = [(barcode,hamming_distance(obs_seq,barcode)) for barcode in barcodes] 80 | closest = min(distances,key=lambda p: p[1]) 81 | return closest # tuple of (barcode, distance) 82 | 83 | def barcode_probabilities(observed,barcodes): 84 | """Compute entropy of probabilistic barcode assignment. 85 | 86 | observed -- 'fastq' SeqRecord of the barcode 87 | barcodes -- list of barcode possibilities (python strings) 88 | """ 89 | obs_seq = np.array(list(observed.seq.tostring())) 90 | obs_qual = np.array(observed.letter_annotations['phred_quality']) 91 | barcodes = np.array([list(bc) for bc in barcodes]) 92 | 93 | choice = np.zeros(barcodes.shape, dtype=np.int) 94 | choice[barcodes == obs_seq] = 1 95 | choice[barcodes != obs_seq] = 2 96 | choice[:, obs_seq == 'N'] = 0 97 | 98 | N = np.zeros((1,barcodes.shape[1])) 99 | E = np.log1p(-np.power(10, -obs_qual / 10.)) 100 | D = -np.log(3) - (obs_qual / 10.) * np.log(3) 101 | 102 | B = np.exp(np.sum(np.choose(choice, [N,E,D]), axis=1)) 103 | return B / np.sum(B) 104 | 105 | def barcode_entropy(observed, barcodes): 106 | """Compute entropy of probabilistic barcode assignment. 
107 | 108 | observed -- 'fastq' SeqRecord of the barcode 109 | barcodes -- list of barcode possibilities (python strings) 110 | """ 111 | P = barcode_probabilities(observed, barcodes) 112 | return sp.stats.entropy(P) 113 | 114 | 115 | # for generating 'safe' filenames from identifiers 116 | cleanup_table = string.maketrans('/*|><+ ','_____p_') 117 | def cleanup_id(identifier): 118 | return identifier.translate(cleanup_table) 119 | 120 | 121 | def seqhist(seqlist): 122 | seqdict = dict() 123 | for seq in seqlist: 124 | seqdict[seq] = seqdict.get(seq,0) + 1 125 | return seqdict 126 | 127 | def seqmode(seqs): 128 | if isinstance(seqs,list): 129 | seqs = seqhist(seqs) 130 | return max(seqs.iterkeys(),key=lambda k: seqs[k]) 131 | 132 | def dimer_dG(seq1,seq2): 133 | """Compute a primer-dimer score using UNAFOLD hybrid_min""" 134 | scores = [] 135 | subseqs1 = [] 136 | subseqs2 = [] 137 | for i in xrange( min(len(seq1),len(seq2)) ): 138 | subseqs1.append( seq1[-i-1:] ) 139 | subseqs2.append( seq2[-i-1:] ) 140 | scores = unafold.hybrid_min_list(subseqs1,subseqs2,NA='DNA') 141 | return -min(scores) 142 | 143 | def dimer_overlap(seq1,seq2,weight_3=10): 144 | """Compute a primer-dimer score by counting overlaps 145 | 146 | weight_3 is the num of 3' bases to add extra weight to either primer 147 | """ 148 | # import pdb 149 | # pdb.set_trace() 150 | overlap_score = lambda s1,s2: sum(1 if c1.lower() == c2.lower() else -1 for c1, c2 in itertools.izip(s1,s2)) 151 | seq2rc = reverse_complement(seq1) 152 | scores = [] 153 | for i in xrange( min(len(seq1),len(seq2)) ): 154 | subseq1 = seq1[-i-1:] 155 | subseq2 = seq2rc[:i+1] 156 | score = 0 157 | if (i+1) <= 2*weight_3: 158 | score += overlap_score(subseq1,subseq2) * 2 159 | else: 160 | score += overlap_score(subseq1[:weight_3],subseq2[:weight_3]) * 2 161 | score += overlap_score(subseq1[weight_3:-weight_3],subseq2[weight_3:-weight_3]) 162 | score += overlap_score(subseq1[-weight_3:],subseq2[-weight_3:]) * 2 163 | scores.append(score) 164 | return max(scores) 165 | 166 | # ========================== 167 | # = Manual FASTA iteration = 168 | # ========================== 169 | 170 | # taken from biopython 171 | 172 | identity = string.maketrans('','') 173 | nonalpha = identity.translate(identity,string.ascii_letters) 174 | 175 | def FastaIterator(handleish,title2ids=lambda s: s): 176 | with as_handle(handleish,'r') as handle: 177 | while True: 178 | line = handle.readline() 179 | if line == '' : return 180 | if line[0] == '>': 181 | break 182 | 183 | while True: 184 | if line[0] != '>': 185 | raise ValueError("Records in Fasta files should start with '>' character") 186 | descr = title2ids(line[1:].rstrip()) 187 | fullline = '' 188 | line = handle.readline() 189 | while True: 190 | if not line : break 191 | if line[0] == '>': break 192 | fullline += line.translate(identity,nonalpha) 193 | line = handle.readline() 194 | 195 | yield (descr,fullline) 196 | 197 | if not line : return #StopIteration 198 | assert False, "Should not reach this line" 199 | 200 | 201 | # ============================ 202 | # = biopython-specific tools = 203 | # ============================ 204 | 205 | def make_SeqRecord(name,seq): 206 | return SeqRecord(Seq(seq),id=name,name=name,description=name) 207 | 208 | 209 | def get_string(seqobj): 210 | if isinstance(seqobj,SeqRecord): 211 | seq = seqobj.seq.tostring().upper() 212 | elif isinstance(seqobj,Seq): 213 | seq = seqobj.tostring().upper() 214 | elif isinstance(seqobj,str): 215 | seq = seqobj.upper() 216 | return seq 217 | 218 | 219 | 
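# --- Added illustrative usage of the general utilities above (a sketch, not part of the original
# module; 'reads.fasta' is a hypothetical file):
#
#   >>> reverse_complement('ACGTN')
#   'NACGT'
#   >>> seqmode(['AAA', 'AAA', 'CCC'])      # most abundant sequence in a list
#   'AAA'
#   >>> for (descr, seq) in FastaIterator('reads.fasta'):
#   ...     print descr, len(seq)           # sequences come back with newlines and non-letters stripped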
def get_features(feature_list,feature_type): 220 | target_features = [] 221 | for feature in feature_list: 222 | if feature.type == feature_type: 223 | target_features.append(feature) 224 | return target_features 225 | 226 | 227 | def advance_to_features(feature_iter,feature_types): 228 | # note, here feature_types is a list of possible stopping points 229 | for feature in feature_iter: 230 | if feature.type in feature_types: 231 | return feature 232 | raise ValueError, "didn't find %s in record" % feature_types 233 | 234 | 235 | def advance_to_feature(feature_iter,feature_type): 236 | return advance_to_features(feature_iter,[feature_type]) 237 | 238 | def map_feature( feature, coord_mapping, offset=0, erase=[] ): 239 | new_feature = copy.deepcopy(feature) 240 | new_start = coord_mapping[feature.location.start.position][-1] + offset 241 | new_end = coord_mapping[feature.location.end.position][0] + offset 242 | new_location = FeatureLocation(new_start,new_end) 243 | new_feature.location = new_location 244 | for qual in erase: 245 | new_feature.qualifiers.pop(qual,None) 246 | return new_feature 247 | 248 | def copy_features( record_from, record_to, coord_mapping, offset=0, erase=[], replace=False ): 249 | if replace: 250 | # index record_to features: 251 | feature_index = {} 252 | for (i,feature) in enumerate(record_to.features): 253 | feature_index.setdefault(feature.type,[]).append(i) 254 | 255 | feat_idx_to_delete = [] 256 | for feature in record_from.features: 257 | if replace: 258 | feat_idx_to_delete += feature_index.get(feature.type,[]) 259 | new_feature = map_feature( feature, coord_mapping, offset, erase ) 260 | record_to.features.append(new_feature) 261 | 262 | if replace: 263 | for idx in sorted(feat_idx_to_delete,reverse=True): 264 | record_to.features.pop(idx) 265 | 266 | def translate_features( record ): 267 | for feature in record.features: 268 | offset = int(feature.qualifiers.get('codon_start',[1])[0]) - 1 269 | feature.qualifiers['translation'] = feature.extract(record.seq)[offset:].translate() 270 | 271 | # SeqRecord <-> JSON-serializable 272 | 273 | def simplifySeq(seq): 274 | obj = {} 275 | obj['__Seq__'] = True 276 | obj['seq'] = seq.tostring() 277 | obj['alphabet'] = seq.alphabet.__repr__().rstrip(')').rstrip('(') 278 | return obj 279 | 280 | def complicateSeq(obj): 281 | if '__Seq__' not in obj: 282 | raise ValueError, "object must be converable to Bio.Seq" 283 | 284 | # Figure out which alphabet to use 285 | try: 286 | alphabet = Alphabet.__getattribute__(obj['alphabet'])() 287 | except AttributeError: 288 | pass 289 | try: 290 | alphabet = Alphabet.IUPAC.__getattribute__(obj['alphabet'])() 291 | except AttributeError: 292 | raise 293 | 294 | seq = Seq(obj['seq'],alphabet=alphabet) 295 | return seq 296 | 297 | def simplifySeqFeature(feature): 298 | obj = {} 299 | obj['__SeqFeature__'] = True 300 | obj['location'] = (feature.location.nofuzzy_start,feature.location.nofuzzy_end) 301 | obj['type'] = feature.type 302 | obj['strand'] = feature.strand 303 | obj['id'] = feature.id 304 | obj['qualifiers'] = feature.qualifiers 305 | return obj 306 | 307 | def complicateSeqFeature(obj): 308 | if '__SeqFeature__' not in obj: 309 | raise ValueError, "object must be converable to Bio.SeqFeature" 310 | location = FeatureLocation(*obj['location']) 311 | feature = SeqFeature(location=location,type=obj['type'],strand=obj['strand'],id=obj['id'],qualifiers=obj['qualifiers']) 312 | return feature 313 | 314 | def simplifySeqRecord(record): 315 | obj = {} 316 | obj['__SeqRecord__'] = True 
317 | obj['seq'] = simplifySeq(record.seq) 318 | obj['id'] = record.id 319 | obj['name'] = record.name 320 | obj['description'] = record.description 321 | obj['dbxrefs'] = record.dbxrefs 322 | obj['annotations'] = record.annotations 323 | obj['letter_annotations'] = record.letter_annotations # should work because it is actually a _RestrictedDict obj which subclasses dict 324 | obj['features'] = map(simplifySeqFeature,record.features) 325 | return obj 326 | 327 | def complicateSeqRecord(obj): 328 | if '__SeqRecord__' not in obj: 329 | raise ValueError, "object must be converable to Bio.SeqRecord" 330 | features = map(complicateSeqFeature,obj['features']) 331 | record = SeqRecord(seq=complicateSeq(obj['seq']),id=obj['id'],name=obj['name'],description=obj['description'],dbxrefs=obj['dbxrefs'],features=features,annotations=obj['annotations'],letter_annotations=obj['letter_annotations']) 332 | return record 333 | -------------------------------------------------------------------------------- /statstools.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.stats 6 | import scipy.spatial 7 | 8 | random.seed() 9 | # random.seed(1) 10 | 11 | np.random.seed() 12 | # np.random.seed(1) 13 | 14 | permutation = np.random.permutation 15 | randint = np.random.randint 16 | 17 | def random_read(seq,read_len): 18 | position = randint(0,len(seq)-read_len+1) 19 | return (position,seq[position:position+read_len]) 20 | 21 | def sample_with_replacement(population,len,choose=random.choice): 22 | """Sample from a population with replacement 23 | 24 | Taken from Python Cookbook, 2nd ed, recipe 18.3 25 | """ 26 | s = [] 27 | for i in xrange(len): 28 | s.append(choose(population)) 29 | return s 30 | 31 | def multinomial_sample(n,p): 32 | """Return sample variates from multinomial 33 | 34 | NOTE: the numpy/scipy multinomial function will return a vector same 35 | length as p with the number of observations of each type. This function 36 | will return a vector of length n that has the actual observations. 37 | 38 | n - number of experiments 39 | p - vector or parameters; must sum to 1 40 | """ 41 | if sum(p) != 1.: raise ValueError, "p must sum to 1" 42 | uniform_sample = np.random.uniform(size=n) 43 | p_cum = np.cumsum(p) 44 | return np.searchsorted(p_cum,uniform_sample,side='right') 45 | 46 | def permsamp(x, nperm, theta): 47 | '''sample nperm times from the permutation distribution of the data x (numpy) 48 | theta is the function that takes the data and computes the statistic 49 | it must know how the data is "encoded" in x 50 | 51 | returns a vector of nperm th_star values 52 | ''' 53 | N = len(x) 54 | 55 | def perm_iter(): 56 | for i in xrange(nperm): 57 | yield x[permutation(N)] 58 | 59 | th_star = np.asarray( map(theta,perm_iter()) ) 60 | 61 | return th_star 62 | 63 | def bootstrap(x, nboot, theta): 64 | '''return n bootstrap replications of theta from x''' 65 | 66 | N = len(x) 67 | 68 | def rand_iter(): 69 | for i in xrange(nboot): 70 | yield x[randint(0,N,N)] 71 | 72 | th_star = np.asarray( map(theta,rand_iter()) ) 73 | 74 | return th_star 75 | 76 | def sample2counts(sample, categories=0): 77 | """Return count vector from list of samples. 78 | 79 | Take vector of samples and return a vector of counts. The elts 80 | refer to indices in something that would ultimately map to the 81 | originating category (like from a multinomial). 
Therefore, if there 82 | are, say, 8 categories, then valid values in sample should be 0-7. 83 | If categories is not given, then i compute it from the highest value 84 | present in sample (+1). 85 | 86 | """ 87 | counts = np.bincount(sample) 88 | if (categories > 0) and (categories > len(counts)): 89 | counts = np.append( counts, np.zeros(categories-len(counts)) ) 90 | return counts 91 | 92 | def counts2sample(counts): 93 | """Computes a consistent sample from a vector of counts. 94 | 95 | Takes a vector of counts and returns a vector of indices x 96 | such that len(x) = sum(c) and each elt of x is the index of 97 | a corresponding elt in c 98 | 99 | """ 100 | x = np.ones(np.sum(counts),dtype=np.int_) 101 | 102 | start_idx = 0 103 | end_idx = 0 104 | for i in xrange(len(counts)): 105 | start_idx = end_idx 106 | end_idx = end_idx + counts[i] 107 | x[start_idx:end_idx] = x[start_idx:end_idx] * i 108 | return x 109 | 110 | def density2d(x,y): 111 | x = np.asarray(x).ravel() 112 | y = np.asarray(y).ravel() 113 | data = np.r_['0,2',x,y] 114 | kde = sp.stats.kde.gaussian_kde(data) 115 | return kde(data) 116 | 117 | def entropy_bootstrap(pk,size,N=1000): 118 | """Compute bootstrapped entropy values. 119 | 120 | pk is a multinomial vector (will be normalized) 121 | size is the number of objects to draw from a multinomial at each iter 122 | N is number of bootstrap replicates 123 | """ 124 | pk = np.asarray(pk,dtype=np.float) 125 | pk = pk / np.sum(pk) 126 | 127 | entropies = [] 128 | for i in xrange(N): 129 | entropies.append( sp.stats.entropy(np.random.multinomial(size,pk)) ) 130 | 131 | return entropies 132 | 133 | def entropy_bootstrap2(pk,N=1000,total=0): 134 | """Compute bootstrapped entropy values. 135 | 136 | pk is a count vector 137 | sum is the total number of objects to draw from 138 | N is number of bootstrap replicates 139 | """ 140 | n = sum(pk) 141 | if total == 0: 142 | total = n 143 | 144 | pk = list(pk) 145 | pk.append(total-n) 146 | pk = np.asarray(pk,dtype=np.float) 147 | pk = pk / np.sum(pk) 148 | 149 | entropies = [] 150 | for i in xrange(N): 151 | entropies.append( sp.stats.entropy(np.random.multinomial(n,pk)[:-1]) ) 152 | 153 | return entropies 154 | 155 | def silhouette(Y,T): 156 | """Emulate MATLAB silhouette fn for cluster quality. 
157 | 158 | Y -- condensed-form pairwise distance matrix 159 | T -- cluster assignments 160 | 161 | Based on StackOverflow #6644445 162 | """ 163 | n = len(T) # number of objects 164 | clusters = set(T) # the cluster labels 165 | 166 | X = sp.spatial.distance.squareform(Y) 167 | 168 | s = np.zeros(n) 169 | for i in xrange(n): 170 | incluster = T==T[i] 171 | incluster[i] = False 172 | if np.sum(incluster) == 0: 173 | continue 174 | 175 | outcluster = lambda j: T==j 176 | 177 | # incluster average dist 178 | a = np.mean( X[incluster,i] ) 179 | 180 | # min outcluster avg dist 181 | b = np.min([np.mean( X[outcluster(j),i] ) for j in (clusters-set([T[i]]))]) 182 | 183 | s[i] = (b - a) / np.max([a,b]) 184 | 185 | return s 186 | -------------------------------------------------------------------------------- /stitch.py: -------------------------------------------------------------------------------- 1 | from numpy import array, power, log, log10, log1p, choose, sum 2 | from Bio import SeqIO 3 | from itertools import izip 4 | from seqtools import reverse_complement 5 | from scipy.stats import entropy 6 | def stitch(record1, record2): 7 | seq1 = array([record1.seq.tostring()]) 8 | seq2 = array([reverse_complement(record2.seq.tostring())]) 9 | seq1.dtype = '|S1' 10 | seq2.dtype = '|S1' 11 | quals1 = array(record1.letter_annotations['phred_quality']) 12 | quals2 = array(record2.letter_annotations['phred_quality'][::-1]) 13 | 14 | log10p_consensus_1 = log1p(-power(10, -quals1 / 10.)) / log(10) 15 | log10p_consensus_2 = log1p(-power(10, -quals2 / 10.)) / log(10) 16 | log10p_error_1 = -log10(3) - (quals1 / 10.) 17 | log10p_error_2 = -log10(3) - (quals2 / 10.) 18 | 19 | min_overlap = 1 20 | max_overlap = max(len(record1), len(record2)) 21 | overlaps = {} 22 | for overlap in range(1, max_overlap): 23 | s1 = seq1[-overlap:] 24 | s2 = seq2[:overlap] 25 | q1 = quals1[-overlap:] 26 | q2 = quals2[:overlap] 27 | lpc1 = log10p_consensus_1[-overlap:] 28 | lpc2 = log10p_consensus_2[:overlap] 29 | lpe1 = log10p_error_1[-overlap:] 30 | lpe2 = log10p_error_2[:overlap] 31 | 32 | consensus = choose(q1 < q2, [s1, s2]) 33 | score = sum(choose(consensus == s1, [lpe1, lpc1])) + sum(choose(consensus == s2, [lpe2, lpc2])) + len(consensus) * log10(4) * 2 # last term is null hypothesis, p=1/4 34 | consensus.dtype = '|S%i' % len(consensus) 35 | overlaps[overlap] = (consensus[0],score) 36 | 37 | return overlaps 38 | 39 | import numpy as np 40 | 41 | if __name__ == '__main__': 42 | input_file1 = '/n/home00/laserson/data/MS_HIV_MiSeq_data_20120105/samples/HIV1.1.fastq' 43 | input_file2 = '/n/home00/laserson/data/MS_HIV_MiSeq_data_20120105/samples/HIV1.2.fastq' 44 | 45 | input_file1 = '/Users/laserson/Dropbox/stitcher/test.1.fastq' 46 | input_file2 = '/Users/laserson/Dropbox/stitcher/test.2.fastq' 47 | 48 | it = izip(SeqIO.parse(input_file1,'fastq'), SeqIO.parse(input_file2,'fastq')) 49 | (record1,record2) = it.next() 50 | overlaps = stitch(record1,record2) 51 | scores = [p[1] for p in overlaps.values()] 52 | (entropy(power(10,scores)), max(overlaps.items(),key=lambda i: i[1][1])) 53 | 54 | entropies = [] 55 | for (i,(rec1,rec2)) in enumerate(izip(SeqIO.parse(input_file1,'fastq'), SeqIO.parse(input_file2,'fastq'))): 56 | entropies.append(stitch(rec1,rec2)) 57 | if i == 1000: 58 | break -------------------------------------------------------------------------------- /streamgraph.py: -------------------------------------------------------------------------------- 1 | # Based on http://code.activestate.com/recipes/576633/ 2 | # which is based on: 3 | # Reference: 
'Stacked graphs- geometry & aesthetics' by Byron and Wattenberg 4 | # http://www.leebyron.com/else/streamgraph/download.php?file=stackedgraphs_byron_wattenberg.pdf 5 | 6 | import numpy as np 7 | import matplotlib as mpl 8 | # mpl.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | # baseline functions 12 | def baseline_symmetric(streams): 13 | """Symmetric baseline ('silhouette')""" 14 | g0 = -0.5 * np.sum(np.asarray(streams),axis=0) 15 | return g0 16 | 17 | def baseline_zero(streams): 18 | """Zero baseline""" 19 | return np.zeros(np.asarray(streams).shape[1]) 20 | 21 | def baseline_weighted_wiggle(streams): 22 | """Weighted-wiggle minimization 23 | 24 | NOTE: streams should already be ordered as desired 25 | """ 26 | streams = np.asarray(streams) 27 | 28 | # add a column of zeros on the left side of streams 29 | f = np.hstack( (np.zeros((streams.shape[0],1)),streams) ) 30 | df = np.diff(f) 31 | cum_sum_df = np.vstack( (np.zeros((1,df.shape[1])),np.cumsum(df,axis=0)) )[:-1,:] 32 | dg0 = (-1./np.sum(streams,axis=0)) * np.sum((0.5 * df + cum_sum_df) * streams,axis=0) 33 | g0 = np.cumsum(dg0) 34 | return g0 35 | 36 | # ordering functions 37 | def argsort_onset(streams): 38 | """Returns permutation indices (like argsort) for onset ordering.""" 39 | streams = np.asarray(streams) 40 | nonzero_idxs = [np.arange(streams.shape[1])[idxs] for idxs in (streams > 0)] 41 | onset_idxs = [np.min(nzi) if len(nzi) > 0 else streams.shape[1] for nzi in nonzero_idxs] 42 | return np.argsort(onset_idxs) 43 | 44 | def argsort_inside_out(streams): 45 | """Returns permutation indices (like argsort) for inside-out ordering.""" 46 | upper = [] 47 | lower = [] 48 | weight_up = 0 49 | weight_lo = 0 50 | for (i,stream) in enumerate(streams): 51 | if weight_up < weight_lo: 52 | upper.append(i) 53 | weight_up += np.sum(stream) 54 | else: 55 | lower.append(i) 56 | weight_lo += np.sum(stream) 57 | 58 | return upper[::-1] + lower 59 | 60 | def streamgraph(ax, streams, x=None, colors=None, baseline=baseline_weighted_wiggle, yoffset=0., whitebg=True): 61 | streams = np.asarray(streams) 62 | 63 | g0 = baseline(streams) + yoffset 64 | 65 | if x == None: 66 | x = range(streams.shape[1]) 67 | 68 | if colors == None: 69 | colors = map(mpl.cm.bone,np.random.uniform(size=streams.shape[0])) 70 | 71 | layers = [] 72 | g_lo = g0 73 | for stream in streams: 74 | g_hi = g_lo + stream 75 | verts_lo = zip(x,g_lo) 76 | verts_hi = zip(x[::-1],g_hi[::-1]) 77 | layer = verts_lo + verts_hi 78 | layers.append(layer) 79 | g_lo = g_hi 80 | 81 | polys = mpl.collections.PolyCollection(layers,facecolors=colors,linewidths=0, zorder=10) 82 | ax.add_collection(polys) 83 | 84 | # add an opaque white background to the streamgraph 85 | if whitebg == True: 86 | verts = np.asarray(zip(x,g0) + zip(x[::-1],g_hi[::-1])) 87 | bglayer = mpl.patches.Polygon(verts, closed=True, color='white', alpha=1, zorder=5) 88 | ax.add_patch(bglayer) 89 | 90 | return ax 91 | 92 | def format_streamgraph(ax): 93 | """Performs some common formatting operations for streamgraphs""" 94 | # kill the frame 95 | ax.spines['top'].set_visible(False) 96 | ax.spines['right'].set_visible(False) 97 | ax.spines['bottom'].set_visible(False) 98 | ax.spines['left'].set_visible(False) 99 | 100 | # set ticks 101 | ax.xaxis.set_ticks_position('bottom') 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /timeseries.py: -------------------------------------------------------------------------------- 1 | import numpy as 
np 2 | 3 | def load_timeseries(inhandle): 4 | """Load timeseries data from file. 5 | 6 | There may be 'control' lines that start with #, e.g.: 7 | #times 8 | #sums 9 | that are loaded into the returned data dictionary as: 10 | data['times'] = ... 11 | 12 | The control lines must contain a whitespace delimited list of numbers 13 | (float allowed) 14 | 15 | data['labels'] will contain all the labels (first entry of each timeseries 16 | row) in order 17 | 18 | data['matrix'] will contain the numpy array that has the actual data in it 19 | """ 20 | data = {} 21 | labels = [] 22 | matrix = [] 23 | for line in inhandle: 24 | if line.startswith('#'): 25 | tokens = line.split() 26 | label = tokens[0].lstrip('#') 27 | values = np.asarray(map(float,tokens[1:])) 28 | data[label] = values 29 | else: 30 | values = line.split() 31 | labels.append(values[0].strip()) 32 | matrix.append(map(int,values[1:])) 33 | data['labels'] = labels 34 | data['matrix'] = np.asarray(matrix) 35 | 36 | return data 37 | 38 | def write_timeseries(outhandle,**kw): 39 | """Write timeseries to file. 40 | 41 | Must provide labels and matrix. All other arguments must be lists of 42 | numbers (floats allowed) which get printed as "comments". They will be 43 | loaded by the load function as well. 44 | 45 | matrix must always be integer type. (If normalization is required, pass 46 | the proper column sums in as control line #sums). 47 | """ 48 | labels = kw.pop('labels') 49 | matrix = kw.pop('matrix') 50 | for (key,values) in kw.iteritems(): 51 | print >>outhandle, '#%s ' % key + ' '.join(map(str,values)) 52 | for (label,timeseries) in zip(labels,matrix): 53 | print >>outhandle, ' '.join(map(str,[label]+list(timeseries))) 54 | 55 | def normalized_timeseries(timeseries): 56 | return np.float_(timeseries) / timeseries.sum(axis=0) 57 | -------------------------------------------------------------------------------- /unafold.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import random 4 | import tempfile 5 | import subprocess 6 | 7 | def hybrid_ss_min(seq,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 8 | cmd = 'hybrid-ss-min --quiet --NA=%s --tmin=%f --tinc=%f --tmax=%f --sodium=%f --magnesium=%f %s' % (NA,tmin,tinc,tmax,sodium,magnesium,seq) 9 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 10 | p.wait() 11 | dG = float(p.stdout.read()) 12 | return dG 13 | 14 | def hybrid_min(seq1,seq2,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 15 | cmd = 'hybrid-min --quiet --NA=%s --tmin=%f --tinc=%f --tmax=%f --sodium=%f --magnesium=%f %s %s' % (NA,tmin,tinc,tmax,sodium,magnesium,seq1,seq2) 16 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 17 | p.wait() 18 | dG = float(p.stdout.read().split()[0]) 19 | return dG 20 | 21 | def hybrid_min_list(seqlist1,seqlist2,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 22 | # set up temporary files 23 | temp_out_prefix = 'temporary_hybrid_%i_%i' % (os.getpid(),random.randint(0,10000)) 24 | seqfile1 = tempfile.NamedTemporaryFile(mode='w',dir='.',prefix='hybrid_min_temp',suffix='.fasta') 25 | seqfile2 = tempfile.NamedTemporaryFile(mode='w',dir='.',prefix='hybrid_min_temp',suffix='.fasta') 26 | for (i,seq) in enumerate(seqlist1): print >>seqfile1, ">1_%i\n%s" % (i,seq) 27 | for (i,seq) in enumerate(seqlist2): print >>seqfile2, ">2_%i\n%s" % (i,seq) 28 | seqfile1.file.flush() 29 | seqfile2.file.flush() 30 | 31 | # set up and execute command 32 | cmd = 'hybrid-min --NA=%s --tmin=%f --tinc=%f 
--tmax=%f --sodium=%f --magnesium=%f --output=%s %s %s' % (NA,tmin,tinc,tmax,sodium,magnesium,temp_out_prefix,seqfile1.name,seqfile2.name) 33 | p = subprocess.Popen(cmd,shell=True) 34 | p.wait() 35 | 36 | # read results 37 | ip = open(temp_out_prefix+'.dG','r') 38 | dGs = [] 39 | for line in ip: 40 | if line.startswith('#'): continue 41 | dGs.append(float(line.split()[1])) 42 | 43 | # clean up output 44 | seqfile1.close() 45 | seqfile2.close() 46 | for filename in glob.glob(temp_out_prefix+'*'): os.remove(filename) 47 | 48 | return dGs 49 | --------------------------------------------------------------------------------
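A minimal usage sketch for the UNAFold wrappers above, assuming the hybrid-ss-min and hybrid-min executables are on the PATH; the primer sequences below are made up for illustration:

    import unafold
    from seqtools import dimer_dG

    fwd = 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT'     # hypothetical primer sequences
    rev = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'

    # minimum free energy of self-structure for a single oligo (more negative = more stable)
    dG_self = unafold.hybrid_ss_min(fwd, NA='DNA')

    # minimum free energy of the cross-hybridized duplex of the two oligos
    dG_duplex = unafold.hybrid_min(fwd, rev, NA='DNA')

    # 3'-anchored primer-dimer score that seqtools.dimer_dG builds on top of hybrid_min_list
    dimer_score = dimer_dG(fwd, rev)

    print dG_self, dG_duplex, dimer_score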